def create_pipeline(dataset, batch_size=32, num_gpus=1, local_rank=0, mode='train'):
    data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
        dataset,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank)

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        nf.logger.warning("Batch_size is larger than the dataset size")
        nf.logger.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    nf.logger.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(input_ids=ids,
                                          token_type_ids=type_ids,
                                          attention_mask=input_mask)
    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
def create_pipeline(data_file, max_seq_length, batch_size=32, num_samples=-1,
                    shuffle=True, num_gpus=1, local_rank=0, mode='train'):
    nf.logger.info(f"Loading {mode} data...")
    data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
        path_to_data=data_file,
        tokenizer=tokenizer,
        mode=mode,
        max_seq_length=max_seq_length,
        num_samples=num_samples,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
        local_rank=local_rank)

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        nf.logger.warning("Batch_size is larger than the dataset size")
        nf.logger.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = int(data_size / (batch_size * num_gpus))
    nf.logger.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(input_ids=ids,
                                          token_type_ids=type_ids,
                                          attention_mask=input_mask)
    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    # Create the logging callback (train) or evaluation callback (eval)
    if mode == 'train':
        callback_fn = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss, logits],
            print_func=lambda x: str(np.round(x[0].item(), 3)),
            tb_writer=nf.tb_writer,
            get_tb_values=lambda x: [["loss", x[0]]],
            step_freq=100)
    elif mode == 'eval':
        callback_fn = nemo.core.EvaluatorCallback(
            eval_tensors=[logits, labels],
            user_iter_callback=lambda x, y: eval_iter_callback(
                x, y, data_layer),
            user_epochs_done_callback=lambda x: eval_epochs_done_callback(
                x, f'{nf.work_dir}/graphs'),
            tb_writer=nf.tb_writer,
            eval_step=steps_per_epoch)

    return loss, callback_fn, steps_per_epoch
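# A minimal usage sketch for this variant, assuming the argparse fields used
# later in this script (args.train_file, args.eval_file, args.max_seq_length,
# args.batch_size, args.local_rank, args.lr_policy, args.num_epochs, args.lr,
# args.lr_warmup_proportion, args.optimizer_kind) and the globals nf and
# get_lr_policy are defined:
train_loss, train_callback, steps_per_epoch = create_pipeline(
    data_file=args.train_file,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    local_rank=args.local_rank,
    mode='train')
_, eval_callback, _ = create_pipeline(
    data_file=args.eval_file,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    shuffle=False,
    local_rank=args.local_rank,
    mode='eval')

lr_policy_fn = get_lr_policy(args.lr_policy,
                             total_steps=args.num_epochs * steps_per_epoch,
                             warmup_ratio=args.lr_warmup_proportion)
nf.train(tensors_to_optimize=[train_loss],
         callbacks=[train_callback, eval_callback],
         lr_policy=lr_policy_fn,
         optimizer=args.optimizer_kind,
         optimization_params={'num_epochs': args.num_epochs, 'lr': args.lr})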
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'):
    nf.logger.info(f"Loading {mode} data...")
    data_file = getattr(data_desc, mode + '_file')
    shuffle = args.shuffle_data if mode == 'train' else False

    data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank)

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        nf.logger.warning("Batch_size is larger than the dataset size")
        nf.logger.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    nf.logger.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(input_ids=ids,
                                          token_type_ids=type_ids,
                                          attention_mask=input_mask)
    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
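# A minimal usage sketch for this variant, assuming args, data_desc, nf and the
# modules above are defined as in the rest of this script; the returned tensors
# feed SimpleLossLoggerCallback (train mode) and EvaluatorCallback (eval mode)
# as in the main script further below.
train_tensors, train_loss, steps_per_epoch, _ = create_pipeline(
    num_samples=args.num_samples, batch_size=args.batch_size, mode='train')
eval_tensors, _, _, eval_data_layer = create_pipeline(
    batch_size=args.batch_size, mode='eval')
# train_tensors is [loss, logits]; eval_tensors is [logits, labels]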
def sentence_classification(args):
    # TODO: construct name of experiment based on args
    """
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.num_epochs,
        args.weight_decay,
        args.optimizer)
    work_dir = name
    if args.work_dir:
        work_dir = os.path.join(args.work_dir, name)
    """
    # Instantiate neural modules
    nf = NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=args.work_dir,
        create_tb_writer=True,
        files_to_copy=[__file__],
        add_time_to_log_dir=True)

    # Pre-trained BERT
    tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model)

    if args.bert_checkpoint is None:
        bert = nemo_nlp.BERT(pretrained_model_name=args.pretrained_bert_model)
        # save bert config for inference after fine-tuning
        bert_config = bert.config.to_dict()
        with open(args.work_dir + '/' + args.pretrained_bert_model + '_config.json',
                  'w+') as json_file:
            json.dump(bert_config, json_file)
    else:
        if args.bert_config is not None:
            with open(args.bert_config) as json_file:
                bert_config = json.load(json_file)
        bert = nemo_nlp.BERT(**bert_config)
        bert.restore_from(args.bert_checkpoint)

    # MLP
    bert_hidden_size = bert.local_parameters['hidden_size']
    mlp = nemo_nlp.SequenceClassifier(
        hidden_size=bert_hidden_size,
        num_classes=args.num_classes,
        num_layers=args.num_layers,
        log_softmax=False,
        dropout=args.dropout)

    # TODO: save mlp/all model configs (bake in to Neural Module?)
    if args.mlp_checkpoint:
        mlp.restore_from(args.mlp_checkpoint)

    # Loss function for classification
    loss_fn = CrossEntropyLoss()

    # Data layers, pipelines, and callbacks
    callbacks = []  # callbacks depend on files present

    if args.train_file:
        if args.preproc:
            train_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.train_file,
                shuffle=True,
                num_samples=args.num_samples,  # lower for dev, -1 for all dataset
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            train_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.train_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=True,
                num_samples=args.num_samples,  # lower for dev, -1 for all dataset
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        train_logits, train_loss, steps_per_epoch, train_labels = create_pipeline(
            nf, train_data_layer, bert, mlp, loss_fn)

        train_callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[train_loss, train_logits],
            print_func=lambda x: nf.logger.info(
                f'Train loss: {str(np.round(x[0].item(), 3))}'),
            tb_writer=nf.tb_writer,
            get_tb_values=lambda x: [["train_loss", x[0]]],
            step_freq=steps_per_epoch)
        callbacks.append(train_callback)

        if args.num_checkpoints != 0:
            ckpt_callback = nemo.core.CheckpointCallback(
                folder=nf.checkpoint_dir,
                epoch_freq=args.save_epoch_freq,
                step_freq=args.save_step_freq,
                checkpoints_to_keep=args.num_checkpoints)
            callbacks.append(ckpt_callback)

    if args.eval_file:
        if args.preproc:
            eval_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.eval_file,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            eval_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.eval_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        eval_logits, eval_loss, _, eval_labels = create_pipeline(
            nf, eval_data_layer, bert, mlp, loss_fn)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[eval_logits, eval_labels],
            user_iter_callback=lambda x, y: eval_iter_callback(
                x, y, eval_data_layer),
            user_epochs_done_callback=lambda x: eval_epochs_done_callback(
                x, f'{nf.work_dir}/graphs'),
            tb_writer=nf.tb_writer,
            eval_step=steps_per_epoch)
        callbacks.append(eval_callback)

    if args.inference_file:
        if args.preproc:
            inference_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.inference_file,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            inference_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.inference_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        # TODO: Finish inference
        inference_callback = None

    # Training, eval and inference
    if args.train_file:
        lr_policy_fn = get_lr_policy(
            args.lr_policy,
            total_steps=args.num_epochs * steps_per_epoch,
            warmup_ratio=args.lr_warmup_proportion)

        nf.train(
            tensors_to_optimize=[train_loss],
            callbacks=callbacks,
            lr_policy=lr_policy_fn,
            optimizer=args.optimizer_kind,
            optimization_params={'num_epochs': args.num_epochs,
                                 'lr': args.lr})
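    # A sketch of one way the inference branch above could be completed,
    # assuming nf.infer(tensors=...) returns a list of per-batch values for
    # each requested tensor (as in other NeMo examples) and that torch is
    # imported; the names below are illustrative only.
    if args.inference_file:
        inference_ids, inference_type_ids, inference_mask, _ = inference_data_layer()
        inference_hidden = bert(input_ids=inference_ids,
                                token_type_ids=inference_type_ids,
                                attention_mask=inference_mask)
        inference_logits = mlp(hidden_states=inference_hidden)
        # Evaluate the logits tensor over the whole inference data layer
        evaluated = nf.infer(tensors=[inference_logits])
        # evaluated[0] is a list of per-batch logits; concatenate and take argmax
        predictions = torch.argmax(torch.cat(evaluated[0]), dim=-1)
        nf.logger.info(f"Inference predictions: {predictions.cpu().numpy()}")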