def make_and_run_on_device_benchmark(opts, train=True):
    """Build and run an on-device data-feeding benchmark.

    Streams the real dataset through an IPU infeed into a no-op loop body, so
    the measured time reflects host-to-device data throughput rather than any
    model compute.

    Args:
        opts: parsed command-line options; fields used here are batch_size,
            repeat_count, epochs and source_sequence_length.
        train: benchmark the training split if True, else the test split.
    """
    name = "training" if train else "test"
    logging.info(f"Creating the {name} benchmark for running with a device")
    graph = tf.Graph()
    with graph.as_default():
        ds, num_ds, *_ = make_dataset(opts, use_synthetic_data=False, training=train)
        # Convert dataset size from samples to batches.
        num_ds = num_ds // opts.batch_size
        infeed = ipu_infeed_queue.IPUInfeedQueue(ds)

        def empty_loop():
            # Body consumes the infeed but does no work: pure feed throughput.
            def empty_body(data_infeed):
                return tf.no_op()

            return ipu.loops.repeat(opts.repeat_count, empty_body, [], infeed)

        with ipu.scopes.ipu_scope("/device:IPU:0"):
            benchmark_op = ipu.ipu_compiler.compile(empty_loop, inputs=[])
    with tf.Session(graph=graph) as sess:
        # run a first un-monitored epoch to force compile
        sess.run(benchmark_op)
        times = []
        for _ in range(opts.epochs):
            # Each benchmark_op run consumes repeat_count batches.
            progress = tqdm.tqdm(range(num_ds // opts.repeat_count))
            for _ in progress:
                t0 = time.perf_counter()
                sess.run(benchmark_op)
                t1 = time.perf_counter()
                times.append(t1 - t0)
    avg_time = np.mean(times)
    # One run moves repeat_count * batch_size * source_sequence_length tokens.
    token_throughput = opts.source_sequence_length * opts.batch_size * opts.repeat_count / avg_time
    # NOTE(review): assumes 4 bytes per token (int32/float32) -- confirm dtype.
    bytes_throughput = token_throughput * 4 / (2**30)
    logging.info(f"On device throughput: {token_throughput:0.2f} tokens/s = {bytes_throughput:0.2f} GB/s")
def main(in_dataset_file, in_model_folder, in_result_file):
    """Tag a dataset with a saved model and write gold/predicted tags to JSON.

    Args:
        in_dataset_file: path to a JSON dataset with 'utterance' and 'tags' columns.
        in_model_folder: folder containing the saved model and vocabularies.
        in_result_file: output path for the JSON result frame.
    """
    dataset = pd.read_json(in_dataset_file)
    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = load(
            in_model_folder, sess)
        # Invert the label vocabulary: id -> label.
        # Fix: dict.iteritems() is Python 2 only; use items() (Python 3).
        rev_label_vocab = {
            label_id: label
            for label, label_id in label_vocab.items()
        }
        # Fix: print statement -> print() function (Python 3).
        print('Done loading')
        X, y = make_dataset(dataset, vocab, label_vocab, actual_config)
        y_pred = predict(model, (X, y), [rev_label_vocab], sess)
        # Re-chunk the flat prediction sequence back into one list of tags
        # per utterance, matching the length of each gold tag sequence.
        tags_predicted = []
        tag_idx = 0
        for tag_seq in dataset['tags']:
            tags_predicted.append(y_pred[tag_idx:tag_idx + len(tag_seq)])
            tag_idx += len(tag_seq)
        result = pd.DataFrame({
            'utterance': dataset['utterance'],
            'tags_gold': dataset['tags'],
            'tags_predicted': tags_predicted
        })
        result.to_json(in_result_file)
def make_loaders(opt):
    """makes training/val/test

    Builds dataset(s) from ``opt`` and wraps them in data loaders.

    Returns:
        ((train, valid, test), tokenizer) -- each loader may be None when the
        corresponding data/split is not configured; tokenizer is None when no
        training data path was given.
    """
    # Global (all-worker) batch sizes; negative sequence lengths are
    # per-worker values that are scaled up by world size.
    batch_size = opt.batch_size * opt.world_size
    eval_batch_size = opt.eval_batch_size * opt.world_size
    seq_length = opt.seq_length
    if seq_length < 0:
        seq_length = seq_length * opt.world_size
    eval_seq_length = opt.eval_seq_length
    if opt.eval_seq_length < 0:
        eval_seq_length = eval_seq_length * opt.world_size
    data_loader_args = {
        'num_workers': 0,
        'shuffle': opt.shuffle,
        'batch_size': batch_size,
        'pin_memory': True,
        'transpose': opt.transpose,
        'distributed': opt.world_size > 1,
        'rank': opt.rank,
        'world_size': opt.world_size,
        # In distributed mode every rank must see the same number of batches.
        'drop_last': opt.world_size > 1
    }
    if opt.data_set_type == 'L2R':
        loader_type = data_utils.ShardLoader
        data_loader_args.update({
            'seq_len': seq_length,
            'persist_state': opt.persist_state,
            'samples_per_shard': opt.samples_per_shard
        })
    else:
        loader_type = data_utils.DataLoader
    split = get_split(opt)
    data_set_args = {
        'path': opt.data,
        'seq_length': seq_length,
        'lazy': opt.lazy,
        'delim': opt.delim,
        'text_key': opt.text_key,
        'label_key': opt.label_key,
        'preprocess': opt.preprocess,
        'ds_type': opt.data_set_type,
        'split': split,
        'loose': opt.loose_json,
        'tokenizer_type': opt.tokenizer_type,
        'tokenizer_model_path': opt.tokenizer_path,
        'vocab_size': opt.vocab_size,
        'model_type': opt.tokenizer_model_type,
        'non_binary_cols': opt.non_binary_cols,
        'process_fn': opt.process_fn
    }
    eval_loader_args = copy.copy(data_loader_args)
    eval_set_args = copy.copy(data_set_args)
    # Evaluation sets are never split further.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their equivalent values in the arg dict
    if opt.eval_batch_size != 0:
        eval_loader_args['batch_size'] = eval_batch_size
    if opt.eval_seq_length != 0:
        eval_set_args['seq_length'] = eval_seq_length
        if opt.data_set_type == 'L2R':
            eval_loader_args['seq_len'] = eval_seq_length
    if opt.eval_text_key is not None:
        eval_set_args['text_key'] = opt.eval_text_key
    if opt.eval_label_key is not None:
        eval_set_args['label_key'] = opt.eval_label_key
    train = None
    valid = None
    test = None
    # BUGFIX: initialize tokenizer so the final return cannot raise
    # UnboundLocalError when opt.data is None.
    tokenizer = None
    if opt.data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if should_split(split):
            train, valid, test = train
        # Reuse the training tokenizer for the eval datasets.
        eval_set_args['tokenizer'] = tokenizer
    if opt.valid is not None:
        eval_set_args['path'] = opt.valid
        valid, _ = data_utils.make_dataset(**eval_set_args)
    if test is None and opt.test is not None:
        eval_set_args['path'] = opt.test
        test, _ = data_utils.make_dataset(**eval_set_args)
    # Wrap datasets with data loaders.
    if train is not None and opt.batch_size > 0:
        train = loader_type(train, **data_loader_args)
    if valid is not None:
        valid = loader_type(valid, **eval_loader_args)
    if test is not None:
        test = loader_type(test, **eval_loader_args)
    return (train, valid, test), tokenizer
# BUGFIX: removed a stray module-level duplicate of the throughput log line
# that referenced names only defined inside make_and_run_on_device_benchmark
# (it would raise NameError at import time).
if __name__ == "__main__":
    # Configure root logging before anything else emits records.
    logging.basicConfig(
        level=logging.getLevelName('INFO'),
        format='%(asctime)s %(name)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    # Parse options
    opts = parse_args()
    if not opts.on_device_only:
        logger.info("Creating training dataset, infeed queue and benchmark.")
        # Create training dataset and infeed queue
        train_set, num_train, *_ = make_dataset(opts,
                                                use_synthetic_data=False,
                                                training=True)
        # Convert sample count to batch count.
        num_train = num_train // opts.batch_size
        infeed_train_queue = ipu_infeed_queue.IPUInfeedQueue(train_set)
        # Benchmark it
        infeed_perf_train = dataset_benchmark.infeed_benchmark(
            infeed_queue=infeed_train_queue,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False)
        ds_perf_train = dataset_benchmark.dataset_benchmark(
            dataset=train_set,
            number_of_epochs=opts.epochs,
            elements_per_epochs=num_train,
            print_stats=False,
            apply_options=True)
def make_loaders(opt):
    """makes training/val/test

    Builds dataset(s) from ``opt`` and wraps them in DataLoaders.
    NOTE(review): this variant uses the string 'None' (not the None object)
    as the "unset" sentinel for data paths and eval keys -- presumably the
    argument parser defaults to 'None'; verify against the CLI definition.

    Returns:
        (train, valid, test) -- each may be None when not configured.
    """
    # Global (all-worker) batch sizes; negative sequence lengths are
    # per-worker values that are scaled up by world size.
    batch_size = opt.batch_size * opt.world_size
    eval_batch_size = opt.eval_batch_size * opt.world_size
    seq_length = opt.seq_length
    if seq_length < 0:
        seq_length = seq_length * opt.world_size
    eval_seq_length = opt.eval_seq_length
    if opt.eval_seq_length < 0:
        eval_seq_length = eval_seq_length * opt.world_size
    # TODO: fix data race in lazy loader
    # data_loader_args = {'num_workers': 10, 'shuffle': opt.shuffle, 'batch_size': batch_size,
    data_loader_args = {
        'num_workers': 1,
        'shuffle': opt.shuffle,
        'batch_size': batch_size,
        'pin_memory': True,
        'transpose': opt.transpose,
        'distributed': opt.world_size > 1,
        'rank': opt.rank,
        'world_size': opt.world_size,
        # In distributed mode every rank must see the same number of batches.
        'drop_last': opt.world_size > 1
    }
    split = get_split(opt)
    data_set_args = {
        'path': opt.data,
        'seq_length': seq_length,
        'lazy': opt.lazy,
        'text_key': opt.text_key,
        'label_key': opt.label_key,
        'preprocess': opt.preprocess,
        'persist_state': opt.persist_state,
        'delim': opt.delim,
        'num_shards': opt.num_shards,
        'ds_type': opt.data_set_type,
        'split': split,
        'loose': opt.loose_json
    }
    eval_loader_args = copy.copy(data_loader_args)
    eval_set_args = copy.copy(data_set_args)
    # Evaluation sets are never split further.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their equivalent values in the arg dict
    if opt.eval_batch_size != 0:
        eval_loader_args['batch_size'] = eval_batch_size
    if opt.eval_seq_length != 0:
        eval_set_args['seq_length'] = eval_seq_length
    if opt.eval_text_key != 'None':
        eval_set_args['text_key'] = opt.eval_text_key
    if opt.eval_label_key != 'None':
        eval_set_args['label_key'] = opt.eval_label_key
    train = None
    valid = None
    test = None
    if opt.data != 'None':
        train = data_utils.make_dataset(**data_set_args)
        if should_split(split):
            # A split dataset comes back as a (train, valid, test) triple.
            train, valid, test = train
    if opt.valid != 'None':
        eval_set_args['path'] = opt.valid
        valid = data_utils.make_dataset(**eval_set_args)
    if test is None and opt.test != 'None':
        eval_set_args['path'] = opt.test
        test = data_utils.make_dataset(**eval_set_args)
    # Wrap datasets with data loaders; eval loaders can override sequence
    # length and shard count for unsupervised datasets before wrapping.
    if train is not None and opt.batch_size > 0:
        train = data_utils.DataLoader(train, **data_loader_args)
    if valid is not None:
        if opt.data_set_type == 'unsupervised':
            if opt.eval_seq_length != 0:
                valid.set_seq_len(eval_seq_length)
            if opt.val_shards != 0:
                valid.set_num_shards(opt.val_shards)
        valid = data_utils.DataLoader(valid, **eval_loader_args)
    if test is not None:
        if opt.data_set_type == 'unsupervised':
            if opt.eval_seq_length != 0:
                test.set_seq_len(eval_seq_length)
            if opt.test_shards != 0:
                test.set_num_shards(opt.test_shards)
        test = data_utils.DataLoader(test, **eval_loader_args)
    return train, valid, test
def make_loaders(args, tokenizer):
    """makes training/val/test

    Builds dataset(s) from ``args`` using the supplied tokenizer and wraps
    them in data loaders, setting args.do_train/do_valid/do_test flags as a
    side effect.

    Returns:
        (train, valid, test) -- each may be None when not configured.
    """
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    if args.loader_scatter is not None:
        # Scattered loading requires the world size to divide evenly.
        assert world_size % args.loader_scatter == 0
    # Global (all-rank) batch sizes; negative sequence lengths are per-rank
    # values that are scaled up by world size.
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'filter_english': args.filter_english,
        'pre_tokenize': not args.no_pre_tokenize,
        'tokenizer': tokenizer,
        'save_splits': args.save_splits,
        'load_splits': args.load_splits,
        'save_test_data': args.save_test_data,
        'no_lazy_loader': args.no_lazy_loader,
        'loader_scatter': args.loader_scatter,
        'data_parallel_rank': mpu.get_data_parallel_rank(),
        "non_sentence_start": args.non_sentence_start,
        "half_lazy_loader": args.half_lazy_loader
    }
    eval_set_args = copy.copy(data_set_args)
    # Evaluation sets are never split further.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key
    # make datasets splits and tokenizer
    train, valid, test = None, None, None
    if args.train_data is not None:
        train = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            # A split dataset comes back as a (train, valid, test) triple.
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer
    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test = data_utils.make_dataset(**eval_set_args)
    # wrap datasets with data loader
    use_block = args.block_lm or args.encoder_decoder
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train,
                                 tokenizer,
                                 batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=args.shuffle,
                                 block_collate=use_block)
        args.do_train = True
    else:
        args.do_train = False
    # Fall back to the training batch size when no eval size was given.
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid,
                                 tokenizer,
                                 eval_batch_size,
                                 args.train_iters,
                                 args,
                                 shuffle=args.shuffle,
                                 block_collate=use_block)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        # Test runs exactly one pass over the data.
        test = make_data_loader(test,
                                tokenizer,
                                eval_batch_size,
                                len(test) // eval_batch_size + 1,
                                args,
                                shuffle=args.shuffle,
                                block_collate=use_block)
        args.do_test = True
    else:
        args.do_test = False
    return train, valid, test
def run_testing(opts, transformer):
    """Run the forward pass over the whole test set and report metrics.

    Restores weights from the latest (or a specific epoch's) checkpoint,
    runs the entire test dataset in a single session call on the IPU, and
    logs loss/perplexity/accuracy.

    Returns:
        The formatted metrics description string that was logged.
    """
    testing_graph = tf.Graph()
    with testing_graph.as_default():
        with tf.device("cpu"):
            logger.info("Creating test dataset")
            dataset, num_test, vocab = data_utils.make_dataset(
                opts, use_synthetic_data=opts.use_synthetic_data, training=False)
            # With pipelining, one device step consumes a full accumulation
            # group, so the effective (global) batch grows accordingly.
            batch_size = opts.batch_size
            if opts.pipeline:
                batch_size *= opts.gradient_accumulation_count
            batches_per_epoch = num_test // batch_size
            logger.info(f"Effective batch-size (global batch): {batch_size}")
            logger.info("Creating infeed and outfeed queues")
            test_infeed = IPUInfeedQueue(dataset, feed_name="test_infeed")
            test_outfeed = IPUOutfeedQueue(feed_name="test_outfeed")

        # Compile the forward pass for testing
        with scopes.ipu_scope("/device:IPU:0"):
            # Helper function
            def loop_builder(iterations, builder_func, infeed):
                return loops.repeat(iterations, builder_func, [], infeed)

            if opts.pipeline:
                # The pipelined forward_pass consumes the infeed itself,
                # so no explicit repeat loop is wrapped around it.
                logger.info("Creating pipelined test graph")
                test_loop = partial(forward_pass,
                                    opts,
                                    transformer,
                                    batches_per_epoch,
                                    False,
                                    test_outfeed,
                                    dense_queue=None,
                                    infeed=test_infeed)
            else:
                logger.info("Creating test graph")
                test_loop = partial(forward_pass, opts, transformer,
                                    batches_per_epoch, False, test_outfeed,
                                    None)
                test_loop = partial(loop_builder, batches_per_epoch,
                                    test_loop, test_infeed)
            test_loop = ipu_compiler.compile(test_loop, inputs=[])

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver()

        # Pick the checkpoint: latest by default, or a specific epoch.
        if opts.restore_epoch is None:
            checkpoint = tf.train.latest_checkpoint(opts.train_checkpoint_path)
        else:
            checkpoint = opts.train_checkpoint_path + "/model_" + str(
                opts.restore_epoch) + ".ckpt"

    with tf.Session(graph=testing_graph) as sess:
        # The sparsity will also be streamed from the checkpoint
        logger.info("Restoring weights")
        saver.restore(sess, checkpoint)
        sess.run(test_infeed.initializer)
        sess.run(metrics_initializer)
        # Run inference (whole dataset in one session call)
        logger.info("Testing...")
        dt = time.perf_counter()
        sess.run(test_loop)
        dt = time.perf_counter() - dt
        # Only the final outfeed entry is needed for whole-set metrics.
        session_outputs = sess.run(test_outfeed.dequeue())[-1]
        # Test set performance -- log progress.
        nll_loss = session_outputs['nll_loss'][-1]
        training_loss = session_outputs['training_loss'][-1]
        perplexity = session_outputs["perplexity"][-1]
        token_accuracy = session_outputs['token_accuracy'][-1]
        desc = (f"\nTraining loss : {training_loss:.4f}"
                f"\nXentropy loss : {nll_loss:.4f}"
                f"\nPerplexity : {perplexity:.3f}"
                f"\nToken accuracy: {token_accuracy:.2f}")
        logger.info(desc)
        if (opts.decode and opts.log_level == 'INFO'):
            # Decode one prediction/target pair for qualitative inspection.
            text_pred, text_target = data_utils.decode_prediction(
                prediction=session_outputs['predictions'][-1],
                target=session_outputs['target'][-1],
                vocab=vocab)
            logger.info(f"Target: {text_target}\n"
                        f"Prediction: {text_pred}\n")
        os.sys.stdout.flush()
    logger.info(f"Test complete.")
    return desc
def run_training(opts, transformer):
    """Train the sparse transformer on the IPU, with optional prune-and-grow.

    Builds the training graph (infeeds/outfeeds, compiled loop, metrics,
    summaries), then runs opts.nepochs epochs. Each IO step runs
    opts.repeat_count device iterations; between IO steps the host may
    prune and regrow the sparsity pattern and stream it back to the device.
    Checkpoints are saved after every epoch.
    """
    # Construct the training graph
    training_graph = tf.Graph()
    with training_graph.as_default():
        with tf.device("cpu"):
            dataset, num_train, vocab = data_utils.make_dataset(
                opts, use_synthetic_data=opts.use_synthetic_data, training=True)
        # Calculate dataset length
        batch_size = opts.batch_size
        if opts.pipeline:
            # One pipelined device step consumes a full accumulation group.
            batch_size *= opts.gradient_accumulation_count
        batches_per_epoch = num_train // batch_size
        io_steps_per_epoch = batches_per_epoch // opts.repeat_count
        total_io_steps = opts.nepochs * io_steps_per_epoch
        total_global_steps = opts.nepochs * io_steps_per_epoch * opts.repeat_count
        logger.info(f"Effective batch-size (global batch): {batch_size}, "
                    f"IO steps per epoch: {io_steps_per_epoch}, "
                    f"Total IO steps: {total_io_steps} "
                    f"Total global steps: {total_global_steps}")
        if opts.prune_ratio is not None and opts.prune_ratio > 0:
            # Compute the pruning ratio when the learning rate will reach a minimum
            lr_decay_steps = opts.cooldown_steps + opts.warmup_steps
            lr_min_epochs = lr_decay_steps / (io_steps_per_epoch * opts.repeat_count)
            remainining_prune_ratio = opts.prune_ratio * sparse_training.cosine_prune_function(
                lr_decay_steps, total_global_steps, opts.cosine_prune_schedule)
            logger.warn(
                f"\n\nThe learning rate schedule will reach a minimum after {lr_min_epochs:0.2f} epochs, "
                f"at which point the pruning ratio will be {remainining_prune_ratio:0.3f}\n\n"
            )
            logger.info(
                f"Cosine prune schedule options: {opts.cosine_prune_schedule}")
        logger.info("Creating infeed and outfeed queues")
        # Queues for streaming from host to device and back
        train_infeed = IPUInfeedQueue(dataset, feed_name="train_infeed")
        train_outfeed = IPUOutfeedQueue(feed_name="train_outfeed")
        prune_and_grow_outfeed = IPUOutfeedQueue(
            feed_name="prune_and_grow_outfeed")

        # Helper function
        def loop_builder(iterations, builder_func, infeed):
            return loops.repeat(iterations, builder_func, [], infeed)

        # Compile the forward and backward pass for training
        with scopes.ipu_scope("/device:IPU:0"):
            if opts.pipeline:
                # The pipelined forward_pass consumes the infeed itself,
                # so no explicit repeat loop is wrapped around it.
                logger.info("Creating pipelined training graph")
                train_loop = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed, train_infeed)
            else:
                logger.info("Creating training graph")
                train_body = partial(forward_pass, opts, transformer,
                                     opts.repeat_count, True, train_outfeed,
                                     prune_and_grow_outfeed)
                train_loop = partial(loop_builder, opts.repeat_count,
                                     train_body, train_infeed)
            train_loop = ipu_compiler.compile(train_loop, inputs=[])
            transformer.buildSparsityUpdateOps()

        # Metrics
        with tf.device("cpu"):
            metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                             scope="metrics")
            metrics_initializer = tf.variables_initializer(
                var_list=metrics_vars)
            saver = tf.train.Saver()

        # These ops are declared here so that the graph can be frozen afterwards
        global_initializer = tf.global_variables_initializer()
        train_outfeed_dequeue = train_outfeed.dequeue()
        if opts.prune_ratio is not None and opts.prune_ratio > 0:
            prune_and_grow_dequeue = prune_and_grow_outfeed.dequeue()
        utils.move_variable_initialization_to_cpu()

        # Tensorboard
        log_name = "logs/" + datetime.now().isoformat()
        summary_writer = tf.summary.FileWriter(logdir=os.path.join(
            opts.train_checkpoint_path, log_name),
                                               flush_secs=5)

    # Run the model:
    training_graph.finalize()  # no more new ops added from here on out
    with tf.Session(graph=training_graph) as sess:
        logger.info(f"Initializing training session")
        sess.run(global_initializer)
        sess.run(train_infeed.initializer)
        logger.info(f"Training...")
        progress = tqdm(range(opts.nepochs))
        for e in progress:
            # Reset streaming metrics at the start of every epoch.
            sess.run(metrics_initializer)
            for io_step in range(io_steps_per_epoch):
                # Train the model: one session call runs repeat_count
                # device iterations.
                step_start_time = time.perf_counter()
                sess.run(train_loop)
                ipu_train_time = time.perf_counter() - step_start_time
                session_outputs = sess.run(train_outfeed_dequeue)[-1]
                logger.debug(f"Train outputs: {session_outputs.keys()}")
                # Calculate avg throughput
                num_tokens = transformer.source_sequence_length * opts.repeat_count * batch_size
                throughput = num_tokens / ipu_train_time
                # Log progress - average stats over the last accumulation step only:
                start_point = -1 if not opts.pipeline else -opts.gradient_accumulation_count
                lr = np.mean(session_outputs["learning_rate"][start_point:])
                training_loss = np.mean(
                    session_outputs['training_loss'][start_point:])
                std_training_loss = np.std(
                    session_outputs['training_loss'][start_point:])
                nll_loss = np.mean(session_outputs['nll_loss'][start_point:])
                perplexity = np.mean(
                    session_outputs["perplexity"][start_point:])
                token_accuracy = np.mean(
                    session_outputs['token_accuracy'][start_point:])
                global_step = session_outputs['global_step'][start_point:][-1]
                logger.info(
                    f"\nEpoch {e}: io_step {io_step+1}/{io_steps_per_epoch}"
                    f"\nGlobal step: {global_step}/{total_global_steps}"
                    f"\nTraining loss : {training_loss:.4f}"
                    f"\nTraining loss standard deviation: {std_training_loss:.4f}"
                    f"\nXentropy loss : {nll_loss:.4f}"
                    f"\nPerplexity : {perplexity:.3f}"
                    f"\nToken accuracy: {token_accuracy:.2f}"
                    f"\nLearning rate: {lr:3.4e}"
                    f"\nThroughput {throughput:.1f} token/s")
                if opts.decode and logger.level <= logging.INFO:
                    # Decode one prediction/target pair for inspection;
                    # best-effort, a failure must not abort training.
                    try:
                        text_pred, text_target = data_utils.decode_prediction(
                            prediction=session_outputs['predictions'][-1],
                            target=session_outputs['target'][-1],
                            vocab=vocab)
                        logger.info(
                            f"\nTarget: {text_target}\n\nPrediction: {text_pred}\n"
                        )
                    except Exception as ex:
                        logger.warn(f"Decoding failed: {ex}")
                summary_value = [
                    tf.Summary.Value(tag="perplexity",
                                     simple_value=perplexity),
                    tf.Summary.Value(tag="training_loss",
                                     simple_value=training_loss),
                    tf.Summary.Value(tag="stddev_training_loss",
                                     simple_value=std_training_loss),
                    tf.Summary.Value(tag="xentropy_loss",
                                     simple_value=nll_loss),
                    tf.Summary.Value(tag="token_accuracy",
                                     simple_value=token_accuracy),
                    tf.Summary.Value(tag="learning_rate", simple_value=lr),
                    tf.Summary.Value(tag="throughput",
                                     simple_value=throughput),
                    tf.Summary.Value(tag="epoch", simple_value=e)
                ]
                # If we just completed the last io step we do not
                # prune and grow regardless, otherwise check the prune ratio:
                if io_step + 1 < io_steps_per_epoch and transformer.prune_ratio is not None and transformer.prune_ratio > 0:
                    # Retrieve p and g results from the conditional queue:
                    prune_and_grow_data = sess.run(prune_and_grow_dequeue)
                    for k in prune_and_grow_data:
                        # Keep only the latest entry for each key.
                        prune_and_grow_data[k] = prune_and_grow_data[k][-1]
                    logger.debug(
                        f"Prune and grow outputs: {prune_and_grow_data.keys()}"
                    )
                    prune_and_grow_time, cosine_schedule_factor = transformer.syncPruneAndRegrowOnHost(
                        opts.cosine_prune_schedule, global_step,
                        total_global_steps, prune_and_grow_data)
                    transformer.streamSparsityFromHostToDevice()
                    summary_value.extend([
                        tf.Summary.Value(tag="prune+grow_time",
                                         simple_value=prune_and_grow_time),
                        tf.Summary.Value(tag="cosine_schedule_factor",
                                         simple_value=cosine_schedule_factor)
                    ])
                    # Per-layer summaries of the dense gradient and of every
                    # optimizer slot variable.
                    for layer_name, sparse_layer in transformer.sparse_layers.items(
                    ):
                        values_var = sparse_layer.get_values_var()
                        grad_w_name = values_var.name.replace(
                            'nz_values:0', 'grad_w')
                        grad_w = np.array(prune_and_grow_data[grad_w_name])
                        if (opts.log_histograms):
                            histogram = tf_utils.make_histogram_proto(
                                grad_w, bins_count=opts.bins_count)
                            summary_value.extend([
                                tf.Summary.Value(tag=layer_name +
                                                 "/dense_grad_w",
                                                 histo=histogram)
                            ])
                        summary_value.extend([
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_stddev",
                                             simple_value=np.std(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_mean",
                                             simple_value=np.mean(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_min",
                                             simple_value=np.min(grad_w)),
                            tf.Summary.Value(tag=layer_name +
                                             "/dense_grad_w_max",
                                             simple_value=np.max(grad_w))
                        ])
                        for slot_name, slot in sparse_layer.get_slot_var_dict(
                        ).items():
                            slot_val = prune_and_grow_data[
                                slot.tf_variable.name]
                            if opts.log_histograms:
                                histogram = tf_utils.make_histogram_proto(
                                    slot_val, bins_count=opts.bins_count)
                                summary_value.extend([
                                    tf.Summary.Value(tag=slot_name,
                                                     histo=histogram)
                                ])
                            summary_value.extend([
                                tf.Summary.Value(
                                    tag=slot_name + "/stddev",
                                    simple_value=np.std(slot_val)),
                                tf.Summary.Value(
                                    tag=slot_name + "/mean",
                                    simple_value=np.mean(slot_val)),
                                tf.Summary.Value(
                                    tag=slot_name + "/min",
                                    simple_value=np.min(slot_val)),
                                tf.Summary.Value(tag=slot_name + "/max",
                                                 simple_value=np.max(slot_val))
                            ])
                # Log to tensorboard (outside any graph)
                summary = tf.Summary(value=summary_value)
                summary_writer.add_summary(summary, np.mean(global_step))
                if opts.use_wandb:
                    wandb.tensorflow.log(summary.SerializeToString())
                logger.info(
                    f"Total time for step {time.perf_counter() - step_start_time}"
                )
                logger.info(f"IPU train time for step {ipu_train_time}")
            logger.info(f"Saving model after epoch {e}")
            saver.save(
                sess,
                os.path.join(opts.train_checkpoint_path,
                             'model_' + str(e) + '.ckpt'))
            os.sys.stdout.flush()
        logger.info(f"Training complete.")
def make_loaders(args):
    """makes training/val/test

    Builds dataset(s) from ``args`` and wraps them in data loaders, setting
    args.do_train/do_valid/do_test flags as a side effect.

    Returns:
        ((train, valid, test), tokenizer) -- each loader may be None when not
        configured; tokenizer is None when no dataset was built at all.
    """
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    # Global (all-rank) batch sizes; negative sequence lengths are per-rank
    # values that are scaled up by world size.
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'local_rank': args.local_rank,
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'lazy': args.lazy_loader,
        'xl_style': args.transformer_xl,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'pre_tokenize': not args.not_pre_tokenize
    }
    eval_set_args = copy.copy(data_set_args)
    # Evaluation sets are never split further.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key
    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None
    # BUGFIX: initialize tokenizer so the final return cannot raise
    # UnboundLocalError when no data paths are configured at all.
    tokenizer = None
    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer
    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)
    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    # Fall back to the training batch size when no eval size was given.
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False
    return (train, valid, test), tokenizer
def train(n_epochs=10):
    """Train the BiAffine dependency parser on the WSJ CoNLL data.

    Trains for ``n_epochs`` epochs (internally subdivided into
    ``prints_per_epoch`` reporting chunks), evaluates on a held-out split
    after each chunk, logs to a CSV file, and saves the final model.
    """
    data_file = '../data/train-stanford-raw.conll'
    # if vocab_file is given (ie for pretrained wordvectors), use x2i and i2x from this file.
    # If not given, create new vocab file in ../data
    vocab_file = None
    log_folder = '../logs'
    model_folder = '../models'
    model_name = 'wsj_3'
    model_file = os.path.join(model_folder, model_name + '_{}.model')
    # Line-buffered CSV log (buffering=1) so progress is visible immediately.
    log_file = open(os.path.join(log_folder, model_name + '.csv'), 'w', 1)
    print('epoch,train_loss,val_loss,arc_acc,lab_acc', file=log_file)
    batch_size = 64
    prints_per_epoch = 10
    # Each "epoch" below is actually 1/prints_per_epoch of the data.
    n_epochs *= prints_per_epoch
    # load data
    print('loading data...')
    data, x2i, i2x = make_dataset(data_file)
    if not vocab_file:
        with open('../data/vocab_{}.pkl'.format(model_name), 'wb') as f:
            pickle.dump((x2i, i2x), f)
    # make train and val batch loaders
    train_data, val_data = split_train_test(data)
    print('# train sentences', len(train_data))
    print('# val sentences', len(val_data))
    train_loader = batch_loader(train_data, batch_size)
    val_loader = batch_loader(val_data, batch_size, shuffle=False)
    print('creating model...')
    # make model
    model = BiAffineParser(word_vocab_size=len(x2i['word']),
                           word_emb_dim=100,
                           pos_vocab_size=len(x2i['tag']),
                           pos_emb_dim=28,
                           emb_dropout=0.33,
                           lstm_hidden=512,
                           lstm_depth=3,
                           lstm_dropout=.33,
                           arc_hidden=256,
                           arc_depth=1,
                           arc_dropout=.33,
                           arc_activation='ReLU',
                           lab_hidden=128,
                           lab_depth=1,
                           lab_dropout=.33,
                           lab_activation='ReLU',
                           n_labels=len(x2i['label']))
    print(model)
    model.cuda()
    # Separate learning rates: label scorer trains more slowly.
    base_params, arc_params, lab_params = model.get_param_groups()
    opt = Adam([
        {
            'params': base_params,
            'lr': 2e-3
        },
        {
            'params': arc_params,
            'lr': 2e-3
        },
        {
            'params': lab_params,
            'lr': 1e-4
        },
    ], betas=[.9, .9])
    sched = ReduceLROnPlateau(opt,
                              threshold=1e-3,
                              patience=8,
                              factor=.4,
                              verbose=True)
    n_train_batches = int(len(train_data) / batch_size)
    n_val_batches = int(len(val_data) / batch_size)
    batches_per_epoch = int(n_train_batches / prints_per_epoch)
    for epoch in range(n_epochs):
        t0 = time.time()
        # Training
        train_loss = 0
        model.train()
        for i in range(batches_per_epoch):
            opt.zero_grad()
            # Load batch
            words, tags, arcs, lengths = next(train_loader)
            words = words.cuda()
            tags = tags.cuda()
            # Forward
            S_arc, S_lab = model(words, tags, lengths=lengths)
            # Calculate loss
            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            # NOTE(review): backward uses a .025 weight on lab_loss but the
            # reported train_loss sums both unweighted -- confirm intended.
            loss = arc_loss + .025 * lab_loss
            # NOTE(review): .data[0] is pre-0.4 PyTorch; modern torch needs
            # .item() -- confirm the targeted torch version.
            train_loss += arc_loss.data[0] + lab_loss.data[0]
            # Backward
            loss.backward()
            opt.step()
        train_loss /= batches_per_epoch
        # Evaluation
        # NOTE(review): runs without torch.no_grad(); gradients are unused
        # but memory usage is higher than necessary.
        val_loss = 0
        arc_acc = 0
        lab_acc = 0
        model.eval()
        for i in range(n_val_batches):
            words, tags, arcs, lengths = next(val_loader)
            words = words.cuda()
            tags = tags.cuda()
            S_arc, S_lab = model(words, tags, lengths=lengths)
            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            loss = arc_loss + lab_loss
            val_loss += arc_loss.data[0] + lab_loss.data[0]
            arc_acc += get_arc_accuracy(S_arc, arcs)
            lab_acc += get_label_accuracy(S_lab, arcs)
        val_loss /= n_val_batches
        arc_acc /= n_val_batches
        lab_acc /= n_val_batches
        epoch_time = time.time() - t0
        print(
            'epoch {:.1f}\t train_loss {:.3f}\t val_loss {:.3f}\t arc_acc {:.3f}\t lab_acc {:.3f}\t time {:.1f} sec'
            .format(epoch / prints_per_epoch, train_loss, val_loss, arc_acc,
                    lab_acc, epoch_time),
            end="\r")
        print('{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}'.format(
            epoch / prints_per_epoch, train_loss, val_loss, arc_acc, lab_acc),
              file=log_file)
        # Reduce LR when validation loss plateaus.
        sched.step(val_loss)
    print('Done!')
    # NOTE(review): the saved filename embeds the final val_loss -- confirm
    # this is the intended naming scheme.
    torch.save(model, model_file.format(val_loss))
    log_file.close()
def make_loaders(args):
    """makes training/val/test

    Builds dataset(s) from ``args`` and wraps them in data loaders.

    Returns:
        ((train, valid, test), tokenizer) -- each loader may be None when not
        configured; tokenizer is None when no training data path was given.
    """
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    # Global (all-worker) batch sizes; negative sequence lengths are
    # per-worker values that are scaled up by world size.
    batch_size = args.batch_size * args.world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * args.world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * args.world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * args.world_size
    split = get_split(args)
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'lazy': args.lazy_loader,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq
    }
    eval_set_args = copy.copy(data_set_args)
    # Evaluation sets are never split further.
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key
    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None
    # BUGFIX: initialize tokenizer so the final return cannot raise
    # UnboundLocalError when args.train_data is None.
    tokenizer = None
    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer
    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, _ = data_utils.make_dataset(**eval_set_args)
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, _ = data_utils.make_dataset(**eval_set_args)
    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
    # Fall back to the training batch size when no eval size was given.
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
    return (train, valid, test), tokenizer