def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    self.tokenizer = tokenizer
    self.model = model
    self.device = next(model.parameters()).device
def train(self, model: transformers.PreTrainedModel, training_tasks: typing.List[Task], validation_tasks: typing.List[Task], num_epochs: int, batch_size: int, steps_per_epoch: int, prefetch_size: int, eval_batch_size: typing.Optional[int] = None, eval_batches: typing.Optional[int] = None, checkpoint_file: typing.Optional[str] = None) -> None: logging.info('Preparing kitchen sink with %d training tasks: %s', len(training_tasks), training_tasks) # Train the model & return its training history logging.info('Beginning training...') training_data, data_sizes = self.load_train_data(training_tasks, batch_size=batch_size, prefetch_size=prefetch_size) if validation_tasks: logging.info('Preparing kitchen sink with %d validation tasks: %s', len(validation_tasks), validation_tasks) validation_data = self.load_valid_data(validation_tasks, batch_size=eval_batch_size or batch_size, prefetch_size=prefetch_size, num_batches=eval_batches) else: validation_data = None logging.info('Preparing kitchen sink without validation') num_epochs += self.warmup_epochs optimizer, scheduler = get_optimizer(model, num_warmup_steps=self.warmup_epochs * steps_per_epoch, num_training_steps=num_epochs * steps_per_epoch) model.to(self.device) if self.use_amp: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level='O1') global_step = 0 tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() train_itr = tqdm.trange(0, num_epochs * steps_per_epoch, desc="Training", unit="batch") tasks = [task.dataset for task in training_tasks] mixing_rates = self.get_mixing_rate(tasks, data_sizes) total_task_steps = Counter({task: np.float32(0.) for task in tasks}) for epoch in range(1, num_epochs + 1): epoch_itr = tqdm.trange(0, steps_per_epoch, desc="Epoch %d" % epoch, leave=False, unit="batch") epoch_task_steps = Counter({task: np.float32(0.) for task in tasks}) running_task_losses = {task: np.float32(0.) for task in tasks} for step in epoch_itr: inputs, labels, _ = next(np.random.choice(training_data, p=mixing_rates)) step_loss = self._train_step(model, inputs, labels, optimizer) tr_loss += step_loss train_itr.update() task = inputs['task'][0].decode('UTF-8') epoch_task_steps[task] += 1 running_task_losses[task] += step_loss if (step + 1) % self.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps self.gradient_accumulation_steps >= steps_per_epoch == (step + 1)): if self.use_amp: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 total_tasks = sum(epoch_task_steps.values()) print('Epoch %d: Empirical Mixing Rates: %s' % ( epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100. / total_tasks) for task, rate in epoch_task_steps.items()) )) print('Epoch %d: Expected Mixing Rates: %s' % ( epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100.) 
for task, rate in zip(tasks, mixing_rates)) )) mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()] print('Epoch %d: Training Losses: %s' % ( epoch, '; '.join('{:s}: {:g}'.format(task, loss) for task, loss in zip(tasks, mixing_losses)) )) if epoch > self.warmup_epochs: total_task_steps += epoch_task_steps exploration_ratios = np.array([total_task_steps.get(task, np.float32(0)) / size for task, size in zip(tasks, data_sizes)]) print('Epoch %d: Exploration Ratios: %s' % ( epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, ratio * 100.) for task, ratio in zip(tasks, exploration_ratios)) )) if not self.mix_from_validation: avg_loss = np.nanmean(mixing_losses) mixing_losses = [er * loss + (1. - er) * avg_loss for er, loss in zip(exploration_ratios, np.nan_to_num(mixing_losses))] valid_steps = 0 running_valid_loss = 0. if validation_data: epoch_task_steps = {task: np.float32(0.) for task in tasks} running_task_losses = {task: np.float32(0.) for task in tasks} with torch.no_grad(): for step, (inputs, labels, _) in enumerate(validation_data.as_numpy_iterator(), 1): model.eval() # Run the forward pass valid_step_loss = model(**self.prepare_forward_inputs(model, inputs, labels))[0].item() running_valid_loss += valid_step_loss valid_task = inputs['task'][0].decode('UTF-8') if valid_task in tasks: epoch_task_steps[valid_task] += 1 running_task_losses[valid_task] += valid_step_loss valid_steps += 1 avg_val_loss = running_valid_loss / valid_steps # Save checkpoint if validation loss decreases and checkpoint dir has been provided if checkpoint_file: if epoch == 1: best_val_loss = avg_val_loss logging.info("Saving best model with initial validation loss {0})".format(best_val_loss)) self.save_model(model, "{0}_best".format(checkpoint_file)) else: if avg_val_loss < best_val_loss: best_val_loss = avg_val_loss logging.info( "Saving new best model with validation loss {0} (epoch {1})".format(best_val_loss, epoch)) self.save_model(model, "{0}_best".format(checkpoint_file)) print('Epoch {:d}: Validation Losses: {:s}'.format( epoch, '; '.join('{:s}: {:g}'.format(task, loss / epoch_task_steps[task]) for task, loss in running_task_losses.items()) )) if self.mix_from_validation: mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()] if epoch > self.warmup_epochs and self.dynamic_mixing: new_mixing_rates = self.get_mixing_rate( tasks=tasks, rates=mixing_losses, normalize=False, temperature=(1. / self.temperature) ) print('Epoch {:d}: Updating Mixing Rate: {:s}'.format( epoch, '; '.join( '{:s}: {:0>5.2f}%->{:0>5.2f}% (Δ={:0>5.2f})'.format( task, old_rate * 100., smooth_rate * 100., (smooth_rate-old_rate) * 100.) for task, old_rate, smooth_rate in zip(tasks, mixing_rates, new_mixing_rates)) )) mixing_rates = new_mixing_rates logging.debug('Mixing rates (shape=%s; |tasks|=%d): %s', mixing_rates.shape, len(tasks), mixing_rates) lr = scheduler.get_last_lr()[0] loss_scalar = (tr_loss - logging_loss) / steps_per_epoch logging_loss = tr_loss train_itr.write('Global step: %d, lr: %g, loss: %g, val_loss: %g' % ( global_step, lr, loss_scalar, running_valid_loss / valid_steps if valid_steps > 0 else np.NaN)) if not np.isfinite(loss_scalar): logging.info('Loss was NaN, ending training after %d epochs.', epoch) train_itr.close() return train_itr.close()
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    args.eval_batch_size = args.per_gpu_eval_batch_size
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            input_ids, attention, token_ids, child, head = batch[0], batch[1], batch[2], batch[3], batch[4]
            dep_labels, num_dependency, arcs, arc_labels = batch[5], batch[6], batch[7], batch[8]
            arc_label_lengths, sent_labels = batch[9], batch[10]
            inputs = {
                'input_ids': input_ids,
                'attention': attention,
                'token_ids': token_ids,
                'child': child,
                'head': head,
                'dep_labels': dep_labels,
                'arcs': arc_labels,
                'arc_label_lengths': arc_label_lengths,
                'device': args.device
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = dep_labels.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, dep_labels.detach().cpu().numpy(), axis=0)

    f_out = open(os.path.join(eval_output_dir, 'dev_out.txt'), 'w')
    k = 0
    for batch in eval_dataloader:
        for inp, arc_list in zip(batch[0], batch[8]):
            text = tokenizer.decode(inp)
            text = text.replace(tokenizer.pad_token, '').strip()
            f_out.write(text + '\n')
            for j, arc in enumerate(arc_list):
                arc_text = tokenizer.decode(arc)
                arc_text = arc_text.replace(tokenizer.pad_token, '').strip()
                if arc_text == '':  # for bert
                    break
                pred_temp = softmax([preds[k][j]])
                f_out.write(text + '\n')
                f_out.write(arc_text + '\n')
                f_out.write('gold:\t' + str(out_label_ids[k][j]) + '\n')
                f_out.write('pred:\t' + str(np.argmax(pred_temp)) + '\n')
                f_out.write(str(pred_temp[0][0]) + '\t' + str(pred_temp[0][1]) + '\n')
                f_out.write('\n')
            k += 1
    f_out.close()

    preds = preds.reshape(-1, 2)
    preds = softmax(preds)
    out_label_ids = out_label_ids.reshape(-1)
    preds = np.argmax(preds, axis=1)
    result = compute_metrics_intermediate(preds, out_label_ids)
    print(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("dep level %s = %s", key, str(result[key]))
            writer.write("dep level %s = %s\n" % (key, str(result[key])))
        writer.write('\n')
    return result
def freeze_all_tokens(model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    model.get_input_embeddings().weight.requires_grad = False
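# Usage sketch for freeze_all_tokens (the checkpoint name below is only an
# illustrative assumption; any PreTrainedModel / PreTrainedTokenizer pair works):
#
#     from transformers import AutoModelForMaskedLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
#     freeze_all_tokens(model, tokenizer)
#     assert model.get_input_embeddings().weight.requires_grad is False
#
# When the embeddings are frozen this way, pass only trainable parameters to the
# optimizer, e.g. AdamW(p for p in model.parameters() if p.requires_grad).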
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model.resize_token_embeddings(len(tokenizer)) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, 
"{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
def evaluate(args, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, run_batch_fn, desc="") -> Dict:
    if args.local_rank in [-1, 0]:
        eval_output_dir = args.output_dir
        os.makedirs(eval_output_dir, exist_ok=True)

    # eval_batch_size for selection must be 1 to handle variable number of candidates
    if args.task == "selection":
        args.eval_batch_size = 1
    else:
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=eval_dataset.collate_fn)

    # multi-gpu evaluate
    if args.n_gpu > 1 and (args.task != "selection" or eval_dataset.args.eval_all_snippets):
        if not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []
    for batch in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
        with torch.no_grad():
            loss, lm_logits, mc_logits, mc_labels = run_batch_fn(args, model, batch)
            if args.task == "detection":
                mc_logits = mc_logits.sigmoid()
            if args.task in ["selection", "detection"]:
                data_infos.append(batch[-1])
            all_preds.append(mc_logits.detach().cpu().numpy())
            all_labels.append(mc_labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if args.task.lower() == "generation":
        perplexity = torch.exp(torch.tensor(eval_loss))
        result = {"perplexity": perplexity, "loss": eval_loss}
    elif args.task.lower() == "selection":
        all_labels = np.array(all_labels).reshape(-1)
        all_pred_ids = np.array([np.argmax(logits) for logits in all_preds])
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        logger.info("Avg. # of candidates: %f", sum([len(arr[0]) for arr in all_preds]) / len(all_preds))
        result = {"loss": eval_loss, "accuracy": accuracy}
        if args.output_file:
            sorted_pred_ids = [np.argsort(logits.squeeze())[::-1] for logits in all_preds]
            write_selection_preds(eval_dataset.dataset_walker, args.output_file, data_infos, sorted_pred_ids, topk=5)
    elif args.task.lower() == "detection":
        all_labels = np.concatenate(all_labels)
        all_pred_ids = (np.concatenate(all_preds) > 0.5)
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        precision = sklearn.metrics.precision_score(all_labels, all_pred_ids)
        recall = sklearn.metrics.recall_score(all_labels, all_pred_ids)
        result = {"loss": eval_loss, "accuracy": accuracy, "precision": precision, "recall": recall}
        if args.output_file:
            write_detection_preds(eval_dataset.dataset_walker, args.output_file, data_infos, all_pred_ids)
    else:
        raise ValueError("args.task not in ['generation', 'selection', 'detection'], got %s" % args.task)

    if args.local_rank in [-1, 0]:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results %s *****" % desc)
            writer.write("***** Eval results %s *****\n" % desc)
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def evaluate(args, data_generator, tb_writer, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, global_step, prefix="") -> Dict:
    eval_output_dir = args.output_dir
    criterion = nn.BCEWithLogitsLoss()
    eval_dataset = data_generator.instance_a_valid_dataset()
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        targets = [b[2] for b in batch]
        inputs = [b[3] for b in batch]
        lens = [len(x) for x in inputs]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        tokens, features, targets = [torch.tensor(x) for x in [tokens, features, targets]]
        return tokens, features, targets, inputs, attention_mask, torch.tensor(lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    preds, labels = [], []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # evaluation loop
        tokens, features, targets, inputs, attention_mask, lens = batch
        tokens, features, targets, inputs, attention_mask, lens = [
            x.to(args.device) for x in [tokens, features, targets, inputs, attention_mask, lens]
        ]
        tokens, features, targets = [x.float() for x in [tokens, features, targets]]
        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            loss = criterion(logit, targets)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            labels.append(targets.long().detach().cpu().numpy())
            preds.append(pred)
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    labels = np.vstack(labels)
    preds = np.float64(np.vstack(preds))
    aucprs = []
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        _prauc = compute_prauc(preds[:, i], labels[:, i])
        _rce = compute_rce(preds[:, i], labels[:, i])
        aucprs.append(_prauc)
        print(engage + ":", _prauc, _rce)
        tb_writer.add_scalar('PRAUC/{}_val'.format(engage), _prauc, global_step)
        tb_writer.add_scalar('RCE/{}_val'.format(engage), _rce, global_step)
    print("Mean AUCPR : {}".format(sum(aucprs) / 4.0))
    tb_writer.add_scalar('PRAUC/mean', sum(aucprs) / 4.0, global_step)

    # Summarise the run so callers get the Dict promised by the signature.
    return {"loss": eval_loss / nb_eval_steps, "mean_prauc": sum(aucprs) / 4.0}
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        fact_token_ids, fact_embedding_ids = zip(*[get_inputs(seq, mask) for seq, mask, genre in examples])
        seqs = [seq for seq, mask, genre in examples]
        pad_seqs = pad_sequence(seqs, batch_first=True, padding_value=tokenizer.pad_token_id)
        pad_facts = pad_sequence(fact_token_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        pad_factsembeds = pad_sequence(fact_embedding_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        return list(zip(pad_facts, pad_factsembeds, pad_seqs))

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if args.mlm:
            inputs, labels = mask_tokens(batch, tokenizer, args)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        elif args.xlnet:
            with torch.no_grad():
                pad_facts, pad_factsembeds, pad_seqs = zip(*batch)
                tfacts = torch.stack(pad_facts).to(args.device)
                tfact_embeds = torch.stack(pad_factsembeds).to(args.device)
                facts_padding_masks = torch.where(tfacts == tokenizer.pad_token_id,
                                                  torch.ones_like(tfacts),
                                                  torch.zeros_like(tfacts)).to(args.device)
                tseqs = torch.stack(pad_seqs).to(args.device)
                tseqs_padding_masks = torch.where(tseqs == tokenizer.pad_token_id,
                                                  torch.ones_like(tseqs),
                                                  torch.zeros_like(tseqs)).to(args.device)
                perm_masks = get_perm_masks(torch.zeros_like(tseqs), order="L2R")
                target_mapping = get_target_mapping(torch.zeros_like(tseqs), device=args.device)
                outputs = model(input_ids=tseqs,
                                facts_tokens=tfacts,
                                facts_embeds=tfact_embeds,
                                input_mask=tseqs_padding_masks,
                                facts_input_mask=facts_padding_masks,
                                perm_mask=perm_masks,
                                target_mapping=target_mapping)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = (batch, batch)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    print(f"validation loss value at step is {eval_loss}")
    logger.info(f"validation loss value at step is {eval_loss}")
    perplexity = torch.exp(torch.tensor(eval_loss))
    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples): src_id,tgt_id,src_am,tgt_am=list(zip(*examples)) # print (src_id) src_id,tgt_id,src_am,tgt_am=torch.stack(src_id),torch.stack(tgt_id),torch.stack(src_am),torch.stack(tgt_am) # padding_value = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id # input_ids_src = pad_sequence(src_examples, batch_first=True, padding_value=padding_value) # input_ids_tgt = pad_sequence(tgt_examples, batch_first=True, padding_value=padding_value) # max_length = input_ids.shape[1] # attention_mask_src = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t # in src_examples]) # attention_mask_tgt = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t # in tgt_examples]) return src_id,tgt_id,src_am,tgt_am # def collate(examples: List[torch.Tensor]): # padding_value = 0 if tokenizer._pad_token is None else tokenizer.pad_token_id # input_ids=pad_sequence(examples, batch_first=True,padding_value=padding_value) # max_length = input_ids.shape[1] # attention_mask = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) for t # in examples]) # return input_ids, attention_mask # if tokenizer._pad_token is None: # max_length = input_ids.shape[1] # attention_mask = torch.stack( # [torch.cat([torch.ones(len(t), dtype=torch.long), torch.zeros(max_length - len(t), dtype=torch.long)]) # for t in examples]) # return pad_sequence(examples, batch_first=True) # # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! 
logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs_src,inputs_tgt,attention_mask_src,attention_mask_tgt=batch inputs_src,inputs_tgt = inputs_src.to(args.device),inputs_tgt.to(args.device) attention_mask_src,attention_mask_tgt=attention_mask_src.to(args.device),attention_mask_tgt.to(args.device) with torch.no_grad(): loss = model(input_ids_src=inputs_src, input_ids_tgt=inputs_tgt,attention_mask_src=attention_mask_src,attention_mask_tgt=attention_mask_tgt) eval_loss += loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps # perplexity = torch.exp(torch.tensor(eval_loss)) result = {"loss": eval_loss} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
def train(args, train_dataset, corrects, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = os.path.join(config.output_dir, 'runs', args.relation,
                           os.path.basename(args.output_dir) + '_' + current_time)
    tb_writer = SummaryWriter(log_dir=log_dir)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=args.batch_size,
                                  collate_fn=collate)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.epochs), desc="Epoch", disable=False)
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    results = evaluate(args, corrects, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar("{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    # print((tr_loss - logging_loss) / args.logging_steps)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

    tb_writer.close()
    return global_step, tr_loss / global_step
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if args.add_fake_english:
        add_shifted_input(eval_dataset.examples, args.special_token_indices, model.config.shift)
        # removing parallel data is not meaningful for evaluation
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    mycounter = 0
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if mycounter == 0 or mycounter == 68:
            pass  # no-op; presumably a leftover anchor for a debugger breakpoint
        if args.invert_order:
            invert(batch, model.config.shift)
        if args.language_specific_positions:
            if args.block_size > 256:
                raise ValueError("Language specific position embeddings can only be <256.")
            position_ids, segment_ids = get_language_specific_positions(batch, model.config.shift, args.block_size)
            position_ids = position_ids.to(args.device)
            segment_ids = segment_ids.to(args.device)
        else:
            position_ids, segment_ids = None, None
        inputs, labels = mask_tokens(batch, tokenizer, args, model) if args.mlm else (batch, batch)
        if args.shift_special_tokens:
            shift_special_tokens(inputs, model.config.shift, args.special_token_indices)
        mycounter += 1
        if mycounter < 5:
            logger.info("")
            logger.info("#" * 10 + " {} ".format(mycounter) + "#" * 10)
            logger.info("-" * 30 + " INPUTS")
            logger.info(inputs)
            logger.info("-" * 30 + " POSITIONS")
            logger.info(position_ids)
            logger.info("-" * 30 + " TOKENS")
            logger.info(segment_ids)
            logger.info("-" * 30 + " LABELS")
            logger.info(labels)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs,
                            masked_lm_labels=labels,
                            position_ids=position_ids,
                            token_type_ids=segment_ids) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    result = {"perplexity": perplexity}

    if args.eval_output_file is not None:
        output_eval_file = args.eval_output_file
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("{} {} {} {}\n".format(args.output_dir, args.seed, key, result[key]))
    else:
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    if args.wiki_dataset:
        collate_fn = functools.partial(collate_wiki, tokenizer)
    else:
        collate_fn = functools.partial(collate, tokenizer)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=args.eval_batch_size,
        collate_fn=collate_fn,
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        if args.eval_subsampling != 1.0 and random.random() >= args.eval_subsampling:
            continue
        if args.wiki_dataset:
            if args.mlm:
                raise RuntimeError("Can't do mlm for wiki dataset")
            tokens, loss_mask = batch
            inputs, labels = (tokens, tokens)
            loss_mask = loss_mask.to(args.device)
            loss_weights = (~loss_mask) + loss_mask * args.title_scale
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            outputs = model(inputs, labels=labels, loss_weights=loss_weights)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        else:
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            with torch.no_grad():
                outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))
    loss = torch.tensor(eval_loss)
    result = {"perplexity": perplexity, "loss": loss}

    if args.eval_creativity_blacklist:
        if not args.parsed_dictionary_dataset:
            raise RuntimeError("Evaluating creativity blacklist with non-parsed dictionary dataset")
        blacklist = datasets.Blacklist.load(args.eval_creativity_blacklist)
        print(
            f"Evaluating creativity over {args.num_eval_creativity} words with {args.eval_creativity_batch_size} batch size"
        )
        s = time.time()
        result.update(
            datasets.ParsedDictionaryDefinitionDataset.evaluate_creativity(
                tokenizer,
                model,
                blacklist,
                args.num_eval_creativity,
                args.eval_creativity_batch_size,
                max_length=args.block_size,
            ))
        print(f"Done evaluating creativity in {time.time() - s}s")

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
def export_pytorch(
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch model to an ONNX Intermediate Representation (IR)

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model to export.
        config ([`~onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """
    if issubclass(type(model), PreTrainedModel):
        import torch
        from torch.onnx import export as onnx_export

        logger.info(f"Using framework PyTorch: {torch.__version__}")
        with torch.no_grad():
            model.config.return_dict = True
            model.eval()

            # Check if we need to override certain configuration item
            if config.values_override is not None:
                logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
                for override_config_key, override_config_value in config.values_override.items():
                    logger.info(f"\t- {override_config_key} -> {override_config_value}")
                    setattr(model.config, override_config_key, override_config_value)

            # Ensure inputs match
            # TODO: Check when exporting QA we provide "is_pair=True"
            model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
            inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
            onnx_outputs = list(config.outputs.keys())

            if not inputs_match:
                raise ValueError("Model and config inputs doesn't match")

            config.patch_ops()

            # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
            # so we check the torch version for backwards compatibility
            if parse(torch.__version__) < parse("1.10"):
                # export can work with named args but the dict containing named args
                # has to be the last element of the args tuple.
                try:
                    onnx_export(
                        model,
                        (model_inputs,),
                        f=output.as_posix(),
                        input_names=list(config.inputs.keys()),
                        output_names=onnx_outputs,
                        dynamic_axes={
                            name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())
                        },
                        do_constant_folding=True,
                        use_external_data_format=config.use_external_data_format(model.num_parameters()),
                        enable_onnx_checker=True,
                        opset_version=opset,
                    )
                except RuntimeError as err:
                    message = str(err)
                    if (
                        message
                        == "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter."
                    ):
                        message = "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter or try with torch 1.10+."
                        raise RuntimeError(message)
                    else:
                        raise err
            else:
                onnx_export(
                    model,
                    (model_inputs,),
                    f=output.as_posix(),
                    input_names=list(config.inputs.keys()),
                    output_names=onnx_outputs,
                    dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
                    do_constant_folding=True,
                    opset_version=opset,
                )

            config.restore_ops()

    return matched_inputs, onnx_outputs
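# Usage sketch for export_pytorch, assuming the caller builds the OnnxConfig via
# FeaturesManager (the model name, "default" feature, and output path below are
# illustrative assumptions, not part of the function above):
#
#     from pathlib import Path
#     from transformers import AutoModel, AutoTokenizer
#     from transformers.onnx import FeaturesManager
#
#     model_name = "distilbert-base-uncased"  # any architecture with a registered OnnxConfig
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModel.from_pretrained(model_name)
#
#     # Look up the OnnxConfig constructor for the "default" feature and build it from the model config.
#     model_kind, onnx_config_ctor = FeaturesManager.check_supported_model_or_raise(model, feature="default")
#     onnx_config = onnx_config_ctor(model.config)
#
#     matched_inputs, onnx_outputs = export_pytorch(
#         tokenizer, model, onnx_config, opset=onnx_config.default_onnx_opset, output=Path("model.onnx")
#     )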
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch") set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, label_ids = batch labels = label_ids.repeat((inputs.shape[1], 1)).T masks = inputs.eq(tokenizer.mask_token_id) labels[~masks] = -100 inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to 
%s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break tb_writer.close() return global_step, tr_loss / global_step
def save_preds(args, data_generator, tb_writer, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, global_step, prefix="") -> None:
    # Writes one submission CSV per engagement type; nothing is returned.
    eval_output_dir = args.output_dir
    criterion = nn.BCEWithLogitsLoss()  # unused in this inference-only pass
    eval_dataset = data_generator.instance_a_lb_dataset()
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        tweet_ids = [b[3] for b in batch]
        user_ids = [b[4] for b in batch]
        inputs = [b[2] for b in batch]
        lens = [len(x) for x in inputs]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()
        tokens, features = [torch.tensor(x) for x in [tokens, features]]
        return tokens, features, tweet_ids, user_ids, inputs, attention_mask, torch.tensor(lens).unsqueeze(1)

    if args.use_bucket_iterator:
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        eval_sampler = BySequenceLengthSampler(eval_dataset,
                                               bucket_boundaries,
                                               batch_size=args.eval_batch_size,
                                               drop_last=False)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=1,
                                     batch_sampler=eval_sampler,
                                     collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size,
                                     collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    nb_eval_steps = 0
    model.eval()
    tweets, users, preds = [], [], []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # inference loop
        tokens, features, tweet_ids, user_ids, inputs, attention_mask, lens = batch
        tokens, features, inputs, attention_mask, lens = [
            x.to(args.device) for x in [tokens, features, inputs, attention_mask, lens]
        ]
        tokens, features = [x.float() for x in [tokens, features]]
        with torch.no_grad():
            logit = model(tokens, features, inputs, attention_mask, lens)
            pred = torch.sigmoid(logit).detach().cpu().numpy()
            tweets += tweet_ids
            users += user_ids
            preds.append(pred)
        nb_eval_steps += 1
        # if nb_eval_steps == 10:
        #     break

    tweets = np.array(tweets)
    users = np.array(users)
    preds = np.float64(np.vstack(preds))
    print(tweets.shape, users.shape, preds.shape)
    print(tweets[0:10])
    print(users[0:10])
    for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
        preds_i = preds[:, i]
        print(preds_i.shape)
        with open(args.test_inference_path + "submission_{}.csv".format(engage), "w") as f:
            for k in range(preds_i.shape[0]):
                f.write(str(tweets[k]) + "," + str(users[k]) + "," + str(preds_i[k]) + "\n")
        print("Saved to csv the predictions for task {}".format(engage))
def __init__(self, model: SentenceTransformer, decoder_name_or_path: str = None, tie_encoder_decoder: bool = True):
    """
    :param model: SentenceTransformer model
    :param decoder_name_or_path: Model name or path for initializing a decoder (compatible with Huggingface's Transformers)
    :param tie_encoder_decoder: whether to tie the trainable parameters of encoder and decoder
    """
    super(DenoisingAutoEncoderLoss, self).__init__()
    self.encoder = model  # This will be the final model used during the inference time.
    self.tokenizer_encoder = model.tokenizer

    encoder_name_or_path = model[0].auto_model.config._name_or_path
    if decoder_name_or_path is None:
        assert tie_encoder_decoder, "Must indicate the decoder_name_or_path argument when tie_encoder_decoder=False!"
    if tie_encoder_decoder:
        if decoder_name_or_path:
            logger.warning('When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.')
        decoder_name_or_path = encoder_name_or_path

    self.tokenizer_decoder = AutoTokenizer.from_pretrained(decoder_name_or_path)
    self.need_retokenization = not (type(self.tokenizer_encoder) == type(self.tokenizer_decoder))

    decoder_config = AutoConfig.from_pretrained(decoder_name_or_path)
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    kwargs_decoder = {'config': decoder_config}
    try:
        self.decoder = AutoModelForCausalLM.from_pretrained(decoder_name_or_path, **kwargs_decoder)
    except ValueError as e:
        logger.error(
            f'Model name or path "{decoder_name_or_path}" does not support being used as a decoder. Please make sure the decoder model has an "XXXLMHead" class.'
        )
        raise e
    assert model[0].auto_model.config.hidden_size == decoder_config.hidden_size, 'Hidden sizes do not match!'
    if self.tokenizer_decoder.pad_token is None:
        # Needed by GPT-2, etc.
        self.tokenizer_decoder.pad_token = self.tokenizer_decoder.eos_token
        self.decoder.config.pad_token_id = self.decoder.config.eos_token_id

    if len(AutoTokenizer.from_pretrained(encoder_name_or_path)) != len(self.tokenizer_encoder):
        logger.warning(
            'WARNING: The vocabulary of the encoder has been changed. One might need to change the decoder vocabulary, too.'
        )

    if tie_encoder_decoder:
        assert not self.need_retokenization, "The tokenizers should be the same when tie_encoder_decoder=True."
        if len(self.tokenizer_encoder) != len(self.tokenizer_decoder):  # The vocabulary has been changed.
            self.tokenizer_decoder = self.tokenizer_encoder
            self.decoder.resize_token_embeddings(len(self.tokenizer_decoder))
            logger.warning(
                'Since the encoder vocabulary has been changed and --tie_encoder_decoder=True, now the new vocabulary has also been used for the decoder.'
            )
        decoder_base_model_prefix = self.decoder.base_model_prefix
        PreTrainedModel._tie_encoder_decoder_weights(
            model[0].auto_model,
            self.decoder._modules[decoder_base_model_prefix],
            self.decoder.base_model_prefix)
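# Usage sketch for DenoisingAutoEncoderLoss in a TSDAE-style setup (the checkpoint
# name and sentences are placeholders; this assumes the sentence-transformers
# losses/datasets modules described above are available):
#
#     from torch.utils.data import DataLoader
#     from sentence_transformers import SentenceTransformer, datasets, losses
#
#     model = SentenceTransformer("bert-base-uncased")
#     train_sentences = ["The quick brown fox jumps over the lazy dog.", "A second unlabeled sentence."]
#
#     # DenoisingAutoEncoderDataset produces (noisy, original) sentence pairs.
#     train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
#     train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
#
#     # Reuse the encoder checkpoint as the decoder and tie their weights.
#     train_loss = losses.DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)
#     model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1,
#               weight_decay=0, scheduler="constantlr")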
def train(args, data, datasets, model: PreTrainedModel, original_model, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) train_datasets = datasets['train'] dev_datasets = datasets['dev'] train_dataloaders, train_example_num, train_distribution = create_dataloader( args, train_datasets, tokenizer, train=True) dev_dataloaders, dev_example_num, dev_distribution = create_dataloader( args, dev_datasets, tokenizer, train=False) train_iter_num = sum( [len(dataloader) for dataloader in train_dataloaders.values()]) dev_iter_num = sum( [len(dataloader) for dataloader in dev_dataloaders.values()]) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( train_iter_num // args.gradient_accumulation_steps) + 1 else: t_total = train_iter_num // args.gradient_accumulation_steps * args.num_train_epochs model = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model.resize_token_embeddings(len(tokenizer)) original_model = original_model.module if hasattr( original_model, "module" ) else original_model # Take care of distributed/parallel training original_model.resize_token_embeddings(len(tokenizer)) # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) original_model = torch.nn.DataParallel(original_model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) original_model = torch.nn.parallel.DistributedDataParallel( original_model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", train_example_num) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 best_loss = float('inf') best_step = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (train_iter_num // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( train_iter_num // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") model.zero_grad() original_model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) def inner_product(x, y): return torch.mean(torch.sum(y * x, 3)) def mean_square(x, y, idx): return torch.mean(torch.mean((y - x)**2, idx)) #return torch.mean(torch.sum((y - x) ** 2, 3)) def save_best_model(best_loss, best_step, dev_dataloaders): if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well eval_loss = evaluate(model, attributes_hiddens, dev_dataloaders) #eval_loss = evaluate(args, model, original_model, dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens) logger.info(" global_step = %s, evaluate loss = %s", global_step, eval_loss) tb_writer.add_scalar("eval_loss", eval_loss, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) if eval_loss < best_loss: best_loss = eval_loss best_step = global_step checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-best") os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) #_rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) logger.info(" best_step = %s, best loss = %s", best_step, best_loss) return best_loss, best_step def get_hiddens_of_model(input): model.zero_grad() if args.model_type 
== 'roberta': _, _, hiddens = model.roberta(input) elif args.model_type == 'bert': _, _, hiddens = model.bert(input) elif args.model_type == 'albert': _, _, hiddens = model.albert(input) elif args.model_type == 'dbert': _, hiddens = model.distilbert(input) elif args.model_type == 'electra': _, hiddens = model.electra(input) elif args.model_type == 'gpt2': _, _, hiddens = model.transformer(input) elif args.model_type == 'gpt': _, hiddens = model.transformer(input) return hiddens def attribute_vector_example(): attributes_hiddens = {f'attribute{i}': [] for i in range(2)} dataloaders, _, distribution = create_dataloader(args, train_datasets, tokenizer, train=True) for key in distribution: if key != 'neutral': inputs, labels = next(dataloaders[key]) inputs = inputs.to(args.device) hiddens = get_hiddens_of_model(inputs) hiddens = torch.stack(hiddens, 2) if labels.size(1) > 1: onehot = torch.eye(hiddens.size(1)) zeros = torch.zeros(1, onehot.size(0)) onehot = torch.cat((zeros, onehot), 0) onehot = onehot[labels] onehot = torch.sum(onehot, 1) onehot = onehot.view(hiddens.size(0), -1, 1, 1) else: onehot = torch.eye(hiddens.size(1))[labels].view( hiddens.size(0), -1, 1, 1) onehot = onehot.to(args.device) attributes_hiddens[key].append( torch.sum(hiddens * onehot, 1) / labels.size(1)) # neutralも含まれている attribute_size = len(data['train']['example']) for i in range(attribute_size - 1): attributes_hiddens[f'attribute{i}'] = torch.mean( torch.cat(attributes_hiddens[f'attribute{i}'], 0), 0).detach().unsqueeze(0) return attributes_hiddens def forward(attributes_hiddens, dataloaders, key): inputs = next(dataloaders[key]) if len(inputs) == 2: inputs, labels = inputs labels = labels.to(args.device) else: labels = None inputs = inputs.to(args.device) if args.model_type == 'roberta': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.roberta( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.roberta( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) elif args.model_type == 'bert': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.bert( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.bert( inputs) if args.token_loss: token_predicts = model.cls(final_layer_hiddens) token_original = original_model.cls( final_layer_original_hiddens) elif args.model_type == 'albert': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.albert( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.albert( inputs) if args.token_loss: token_predicts = model.classifier(final_layer_hiddens) token_original = original_model.classifier( final_layer_original_hiddens) elif args.model_type == 'dbert': final_layer_hiddens, all_layer_hiddens = model.distilbert(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.distilbert( inputs) if args.token_loss: token_predicts = model.classifier(final_layer_hiddens) token_original = original_model.classifier( final_layer_original_hiddens) elif args.model_type == 'electra': final_layer_hiddens, all_layer_hiddens = model.electra(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.electra( 
inputs) if args.token_loss: hiddens = model.generator_predictions(final_layer_hiddens) token_predicts = model.generator_lm_head(hiddens) original_hiddens = original_model.generator_predictions( final_layer_original_hiddens) token_original = original_model.generator_lm_head( original_hiddens) elif args.model_type == 'gpt2': final_layer_hiddens, first_token_hidden, all_layer_hiddens = model.transformer( inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, _, all_layer_original_hiddens = original_model.transformer( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) elif args.model_type == 'gpt': final_layer_hiddens, all_layer_hiddens = model.transformer(inputs) if 'neutral' != key: with torch.no_grad(): final_layer_original_hiddens, all_layer_original_hiddens = original_model.transformer( inputs) if args.token_loss: token_predicts = model.lm_head(final_layer_hiddens) token_original = original_model.lm_head( final_layer_original_hiddens) all_layer_hiddens = torch.stack(all_layer_hiddens, 2) if 'neutral' != key: all_original_hiddens = torch.stack(all_layer_original_hiddens, 2) all_original_hiddens = all_original_hiddens.detach() if args.token_loss: original_hiddens - original_hiddens.detach() token_original = token_original.detach() if args.debias_layer == 'all': target_layer_hiddens = all_layer_hiddens target_original_hiddens = all_layer_hiddens else: if args.debias_layer == 'first': idx = 0 elif args.debias_layer == 'last': idx = -1 target_layer_hiddens = all_layer_hiddens[:, :, idx] target_layer_hiddens = target_layer_hiddens.unsqueeze(2) if 'neutral' != key: target_original_hiddens = all_original_hiddens[:, :, idx] target_original_hiddens = target_original_hiddens.unsqueeze(2) else: attributes_hiddens = { key: value[:, idx, :].unsqueeze(1) for key, value in attributes_hiddens.items() } if args.loss_target == 'sentence' or labels is None: attributes_hiddens = { key: value.unsqueeze(1) for key, value in attributes_hiddens.items() } #elif args.loss_target == 'token' and key == 'neutral': elif args.loss_target == 'token': if labels.size(1) > 1: onehot = torch.eye(target_layer_hiddens.size(1)) zeros = torch.zeros(1, onehot.size(0)) onehot = torch.cat((zeros, onehot), 0) onehot = onehot[labels] onehot = torch.sum(onehot, 1) onehot = onehot.view(target_layer_hiddens.size(0), -1, 1, 1) else: onehot = torch.eye(target_layer_hiddens.size(1))[labels].view( target_layer_hiddens.size(0), -1, 1, 1) onehot = onehot.to(args.device) target_layer_hiddens = torch.sum(target_layer_hiddens * onehot, 1).unsqueeze(1) / labels.size(1) if 'neutral' != key: target_original_hiddens = torch.sum( target_original_hiddens * onehot, 1).unsqueeze(1) / labels.size(1) else: attributes_hiddens = { key: value.expand(target_layer_hiddens.size(0), 1, value.size(1), value.size(2)) for key, value in attributes_hiddens.items() } if 'neutral' == key: loss = 0 for attribute_hiddens in attributes_hiddens.values(): tmp_loss = criterion_ip(target_layer_hiddens, attribute_hiddens) if args.square_loss: tmp_loss = tmp_loss**2 tmp_loss *= alpha loss += tmp_loss else: #loss = criterion_ms(target_layer_hiddens, target_original_hiddens) loss = criterion_ms(all_layer_hiddens, all_original_hiddens, 3) if args.token_loss: loss += criterion_ms(token_predicts, token_original, 2) #loss += criterion_ms(hiddens, original_hiddens, 2) loss *= beta return loss #def evaluate(args, model: PreTrainedModel, original_model, 
dev_dataloaders, dev_example_num, dev_distribution, criterion_mse, criterion_ip, feminine_hiddens, masculine_hiddens, gender_hiddens, prefix="") -> Dict: def evaluate(model, attributes_hiddens, dev_dataloaders, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max( 1, args.n_gpu) # Note that DistributedSampler samples randomly # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", dev_example_num) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 model.eval() #criterion.eval() for key in tqdm(dev_distribution): with torch.no_grad(): loss = forward(attributes_hiddens, dev_dataloaders, key) eval_loss += loss.item() model.zero_grad() original_model.zero_grad() output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") ''' with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) logger.info(" Loss = %s", eval_loss) writer.write("Loss = %s\n" % (eval_loss)) ''' return eval_loss #criterion_ms = torch.nn.MSELoss() criterion_ms = mean_square #criterion.train() criterion_ip = inner_product original_model.eval() alpha, beta = args.weighted_loss alpha = float(alpha) beta = float(beta) train_loss = 0.0 for _ in train_iterator: random.shuffle(train_distribution) epoch_iterator = tqdm(train_distribution, desc="Iteration", disable=args.local_rank not in [-1, 0]) model.eval() with torch.no_grad(): attributes_hiddens = attribute_vector_example() for step, key in enumerate(epoch_iterator): model.train() # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue loss = forward(attributes_hiddens, train_dataloaders, key) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() train_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() original_model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logger.info(" global_step = %s, train loss = %s", global_step, train_loss) train_loss = 0.0 # Log metrics best_loss, best_step = save_best_model( best_loss, best_step, dev_dataloaders) dev_dataloaders, dev_example_num, dev_distribution = create_dataloader( args, dev_datasets, tokenizer, train=False) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break train_dataloaders, train_example_num, train_distribution = create_dataloader( args, train_datasets, tokenizer, train=True) if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break dev_dataloaders, dev_example_num, dev_distribution = create_dataloader( args, dev_datasets, tokenizer, train=False) best_loss, best_step = save_best_model(best_loss, best_step, 
        dev_dataloaders)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) evaluation_loss = dict() eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) # adjusting eval batch size according to the number of train epochs to # make it easier to plot with same length for train and eval also for # the eval loss plot to be adjusted and clear within the plot frame # args.eval_batch_size = int(len(eval_dataset) / args.num_train_epochs) # commenting the actual one args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) \ if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) \ if args.mlm else model(inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 # write for each batch eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) print('\n--------------------------') result = { "perplexity": perplexity, "eval_loss": eval_loss, "eval_steps": nb_eval_steps } output_eval_file = os.path.join(FINETUNE_DIR, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) print('----------------------------') return result
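# ---- Illustrative sketch (not part of the original code) ----
# The evaluation above reports perplexity as exp(mean token-level loss); the
# relationship in isolation (helper name is illustrative):
import torch

def perplexity_from_loss(mean_nll: float) -> float:
    # mean_nll: average negative log-likelihood per token, in nats
    return torch.exp(torch.tensor(mean_nll)).item()

# e.g. a mean eval loss of 2.0 nats corresponds to a perplexity of about 7.39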
def train(args, train_dataset, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, run_batch_fn_train, run_batch_fn_eval) -> Tuple[int, float]: if args.local_rank in [-1, 0]: log_dir = os.path.join("runs", args.exp_name) if args.exp_name else None tb_writer = SummaryWriter(log_dir) args.output_dir = log_dir args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=train_dataset.collate_fn) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! global_step = 0 model.zero_grad() train_iterator = trange(0, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # for reproducibility for _ in train_iterator: local_steps = 0 tr_loss = 0.0 epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() loss, _, _, _ = run_batch_fn_train(args, model, batch) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 local_steps += 1 epoch_iterator.set_postfix(Loss=tr_loss / local_steps) results = evaluate(args, eval_dataset, model, tokenizer, run_batch_fn_eval, desc=str(global_step)) if args.local_rank in [-1, 0]: for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", tr_loss / local_steps, global_step) checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training logger.info("Saving model checkpoint to %s", output_dir) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) 
            with open(os.path.join(output_dir, "params.json"), "w") as jsonfile:
                json.dump(args.params, jsonfile, indent=2, default=lambda x: str(x))
            logger.info("Saving model checkpoint to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / local_steps
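# ---- Illustrative sketch (not part of the original code) ----
# Checkpoints above are written with save_pretrained (after unwrapping
# DataParallel/DistributedDataParallel) and can be reloaded with from_pretrained.
# A minimal round trip; "gpt2" and "checkpoint-demo" are placeholder names.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained("checkpoint-demo")
tokenizer.save_pretrained("checkpoint-demo")

reloaded = AutoModelForCausalLM.from_pretrained("checkpoint-demo")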
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, train_dataset_second, DP_classifier) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) correct_sampler = SequentialSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second) correct_dataloader = DataLoader( train_dataset_second, sampler=correct_sampler, batch_size=args.train_batch_size, collate_fn=collate ) wrong_sampler = RandomSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second) wrong_dataloader = DataLoader( train_dataset_second, sampler=wrong_sampler, batch_size=args.train_batch_size, collate_fn=collate ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)] + [p for n, p in DP_classifier.named_parameters()], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) #scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! 
logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() DP_classifier.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) set_seed(args) # Added here for reproducibility zipped_data = zip(train_dataloader, correct_dataloader, wrong_dataloader) correct_mc_tensor = torch.ones(args.train_batch_size, dtype=torch.float) correct_mc_tensor = correct_mc_tensor.to(args.device) wrong_mc_tensor = torch.zeros(args.train_batch_size, dtype=torch.float) wrong_mc_tensor = wrong_mc_tensor.to(args.device) print(correct_mc_tensor) print(wrong_mc_tensor) accumulated_lm_loss = 0.0 accumulated_mc_loss = 0.0 for _ in train_iterator: train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate ) correct_sampler = SequentialSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second) correct_dataloader = DataLoader( train_dataset_second, sampler=correct_sampler, batch_size=args.train_batch_size, collate_fn=collate ) wrong_sampler = RandomSampler(train_dataset_second) if args.local_rank == -1 else DistributedSampler(train_dataset_second) wrong_dataloader = DataLoader( train_dataset_second, sampler=wrong_sampler, batch_size=args.train_batch_size, collate_fn=collate ) zipped_data = zip(train_dataloader, correct_dataloader, wrong_dataloader) epoch_iterator = tqdm(zipped_data, desc="Iteration", disable=args.local_rank not in [-1, 0], total=len(train_dataloader)) for step, zipped_batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 
continue model.train() DP_classifier.train() # unpack zipped_batch batch, correct_batch, wrong_batch = zipped_batch # First: original sentence inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) labels = inputs.clone() cls_pos = [] for curr in labels: for idx, tk in enumerate(curr): if tk == tokenizer.cls_token_id: curr[idx] = -100 cls_pos.append(idx) break inputs = inputs.to(args.device) labels = labels.to(args.device) outputs = model(inputs, lm_labels=labels) loss_lm_1 = outputs[0] hidden_1 = outputs[3] sentence_embed_1_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_1)] sentence_embed_1 = torch.cat(sentence_embed_1_pieces) # Second: correct next sentence correct_input = correct_batch correct_labels = correct_input.clone() cls_pos = [] for curr in correct_labels: for idx, tk in enumerate(curr): if tk == tokenizer.cls_token_id: curr[idx] = -100 cls_pos.append(idx) break correct_input = correct_input.to(args.device) correct_labels = correct_labels.to(args.device) outputs = model(correct_input, lm_labels=correct_labels) loss_lm_2 = outputs[0] hidden_2 = outputs[3] sentence_embed_2_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_2)] sentence_embed_2 = torch.cat(sentence_embed_2_pieces) # Get correct loss if random.randint(0, 1) == 1: outputs = DP_classifier(sentence_embed_1, sentence_embed_2, correct_mc_tensor) else: outputs = DP_classifier(sentence_embed_2, sentence_embed_1, correct_mc_tensor) loss_mc = outputs[0] # MC_LOSS SCALING SCALING = 0.05 loss_lm = loss_lm_1 + loss_lm_2 #loss = loss_lm loss_first = loss_lm + SCALING * loss_mc #print("loss_mc: ", loss_mc.item()) #print("loss_lm: ", loss_lm.item()) accumulated_lm_loss += loss_lm.item() / 2.0 accumulated_mc_loss += SCALING * loss_mc.item() # Second loss: wrong next sentence randomly sampled from training set wrong_input = wrong_batch wrong_labels = wrong_input.clone() cls_pos = [] for curr in wrong_labels: for idx, tk in enumerate(curr): if tk == tokenizer.cls_token_id: curr[idx] = -100 cls_pos.append(idx) break wrong_input = wrong_input.to(args.device) wrong_labels = wrong_labels.to(args.device) outputs = model(wrong_input, lm_labels=wrong_labels) loss_lm_3 = outputs[0] hidden_3 = outputs[3] sentence_embed_3_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_3)] sentence_embed_3 = torch.cat(sentence_embed_3_pieces) if random.randint(0, 1) == 1: outputs = DP_classifier(sentence_embed_1, sentence_embed_3, wrong_mc_tensor) else: outputs = DP_classifier(sentence_embed_3, sentence_embed_1, wrong_mc_tensor) loss_mc = outputs[0] #loss = loss_lm loss_second = loss_lm_3 + SCALING * loss_mc #print("loss_mc: ", loss_mc.item()) #print("loss_lm: ", loss_lm.item()) accumulated_mc_loss += SCALING * loss_mc.item() # Total loss loss = loss_first + loss_second SKIP_STEP = 50 if (step % SKIP_STEP == 0): print(' iter %d, avg. lm_loss %.2f, avg. mc_loss %.2f, avg. 
ppl %.2f ' % (step, accumulated_lm_loss / SKIP_STEP, accumulated_mc_loss / SKIP_STEP, math.exp(loss_lm.item() /2), ), file=sys.stderr) tb_writer.add_scalar("training_lm_loss", accumulated_lm_loss / SKIP_STEP, global_step) tb_writer.add_scalar("training_mc_loss", accumulated_mc_loss / SKIP_STEP, global_step) accumulated_lm_loss = 0.0 accumulated_mc_loss = 0.0 if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) torch.nn.utils.clip_grad_norm_(DP_classifier.parameters(), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) torch.nn.utils.clip_grad_norm_(DP_classifier.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() DP_classifier.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, DP_classifier) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(DP_classifier, os.path.join(output_dir, "DP_classifier.bin")) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
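# ---- Illustrative sketch (not part of the original code) ----
# The loop above accumulates gradients over several micro-batches before each
# optimizer step and clips the gradient norm. The same pattern in isolation;
# the function name is illustrative and the loader/optimizer/scheduler are
# assumed to be set up elsewhere.
import torch

def accumulation_loop(model, loader, optimizer, scheduler,
                      accumulation_steps=4, max_grad_norm=1.0):
    model.zero_grad()
    for step, batch in enumerate(loader):
        loss = model(**batch)[0]               # assumes the model returns (loss, ...)
        (loss / accumulation_steps).backward()  # scale so gradients average correctly
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()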
def export(tokenizer: PreTrainedTokenizer, model: PreTrainedModel, config: OnnxConfig, opset: int, output: Path) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch-backed pipeline to ONNX Intermediate Representation (IR).

    Args:
        tokenizer: Tokenizer used to generate the dummy inputs.
        model: Model to export.
        config: ONNX configuration describing the model's inputs and outputs.
        opset: ONNX opset version to export with.
        output: Path where the ONNX model will be written.

    Returns:
        The list of matched input names and the list of ONNX output names.
    """
    if not is_torch_available():
        raise ImportError("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export
    from ..file_utils import torch_version

    if not is_torch_onnx_dict_inputs_support_available():
        raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")

    logger.info(f"Using framework PyTorch: {torch.__version__}")
    with torch.no_grad():
        model.config.return_dict = True
        model.eval()

        # Check if we need to override certain configuration items
        if config.values_override is not None:
            logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
            for override_config_key, override_config_value in config.values_override.items():
                logger.info(f"\t- {override_config_key} -> {override_config_value}")
                setattr(model.config, override_config_key, override_config_value)

        # Ensure inputs match
        # TODO: Check when exporting QA we provide "is_pair=True"
        model_inputs = config.generate_dummy_inputs(tokenizer, framework=TensorType.PYTORCH)
        inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
        onnx_outputs = list(config.outputs.keys())

        if not inputs_match:
            raise ValueError("Model and config inputs don't match")

        config.patch_ops()

        # export can work with named args, but the dict containing named args has to be the last element of the args tuple
        export(
            model,
            (model_inputs,),
            f=output.as_posix(),
            input_names=list(config.inputs.keys()),
            output_names=onnx_outputs,
            dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())},
            do_constant_folding=True,
            use_external_data_format=config.use_external_data_format(model.num_parameters()),
            enable_onnx_checker=True,
            opset_version=opset,
        )

        config.restore_ops()

    return matched_inputs, onnx_outputs
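# ---- Illustrative sketch (not part of the original code) ----
# torch.onnx.export, as used above, traces the model with example inputs and
# writes the resulting graph to disk. A toy end-to-end call on a small module;
# the file name and axis names are placeholders.
import torch
from torch.onnx import export

toy = torch.nn.Sequential(torch.nn.Linear(8, 4), torch.nn.ReLU())
dummy_input = torch.randn(1, 8)
export(
    toy,
    (dummy_input,),
    f="toy.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    do_constant_folding=True,
    opset_version=12,
)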
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, DP_classifier, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True, doubling=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) labels = inputs.clone() for curr in labels: for idx, tk in enumerate(curr): if tk == tokenizer.cls_token_id: curr[idx] = -100 inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, lm_labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) result = {"perplexity": perplexity} ###### Evaluate NSP accuracy eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) eval_dataset_second = load_and_cache_examples(args, tokenizer, evaluate=True, second=True) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader( eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) eval_correct_sampler = SequentialSampler(eval_dataset_second) eval_correct_dataloader = DataLoader( eval_dataset_second, sampler=eval_correct_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) eval_wrong_sampler = RandomSampler(eval_dataset_second) eval_wrong_dataloader = DataLoader( eval_dataset_second, sampler=eval_wrong_sampler, batch_size=args.eval_batch_size, collate_fn=collate ) nb_eval_steps = 0 num_correctly_predicted = 0 num_wrongly_predicted = 0 for zipped_batch in tqdm(zip(eval_dataloader, eval_correct_dataloader, eval_wrong_dataloader), desc="Evaluating", total=len(eval_dataloader)): batch, correct_batch, wrong_batch = zipped_batch inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) second_input = None if_correct = False if random.randint(0, 1) == 1: second_input = correct_batch if_correct = True else: second_input = wrong_batch if_correct = False cls_pos = [] for curr in inputs: for idx, tk in enumerate(curr): if tk == tokenizer.cls_token_id: cls_pos.append(idx) break inputs = inputs.to(args.device) with torch.no_grad(): outputs = model(inputs) hidden_1 = outputs[2] sentence_embed_1_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_1)] sentence_embed_1 = torch.cat(sentence_embed_1_pieces) cls_pos = [] for curr in second_input: for idx, tk in enumerate(curr): if tk == 
tokenizer.cls_token_id: cls_pos.append(idx) break second_input = second_input.to(args.device) with torch.no_grad(): outputs = model(second_input) hidden_2 = outputs[2] sentence_embed_2_pieces = [hh[cls_pos[idx]].unsqueeze(0) for idx, hh in enumerate(hidden_2)] sentence_embed_2 = torch.cat(sentence_embed_2_pieces) with torch.no_grad(): if random.randint(0, 1) == 1: outputs = DP_classifier(sentence_embed_1, sentence_embed_2) else: outputs = DP_classifier(sentence_embed_2, sentence_embed_1) mc_logits = outputs[0].cpu() for jj in range(mc_logits.shape[0]): if (mc_logits[jj, 0] > 0) == if_correct: num_correctly_predicted += 1 else: num_wrongly_predicted += 1 nb_eval_steps += 1 total_predicted = num_correctly_predicted + num_wrongly_predicted accuracy = num_correctly_predicted / total_predicted result["accuracy"] = accuracy output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict: # Loop to handle MNLI double evaluation (matched, mis-matched) eval_output_dir = args.output_dir eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True) if args.local_rank in [-1, 0]: os.makedirs(eval_output_dir, exist_ok=True) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate) # multi-gpu evaluate if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Eval! logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(eval_dataset)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0.0 nb_eval_steps = 0 model.eval() for batch in tqdm(eval_dataloader, desc="Evaluating"): inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) with torch.no_grad(): outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) lm_loss = outputs[0] eval_loss += lm_loss.mean().item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) result = {"perplexity": perplexity} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(prefix)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) return result
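# ---- Illustrative sketch (not part of the original code) ----
# mask_tokens is an external helper in this codebase; the standard MLM recipe
# such helpers usually implement (mask 15% of tokens: 80% -> [MASK], 10% ->
# random token, 10% -> unchanged) looks roughly like this. The project's own
# helper may differ in details, so treat this only as an approximation.
import torch

def mask_tokens_sketch(inputs, tokenizer, mlm_probability=0.15):
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = torch.tensor(
        [tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
         for row in labels.tolist()], dtype=torch.bool)
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked positions

    # 80% of the masked positions become the [MASK] token
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% become a random token, the remaining 10% are left unchanged
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels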
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, tr_loss_sent, logging_loss, logging_loss_sent = 0.0, 0.0, 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch") set_seed(args) # Added here for reproducibility results = {} acc_prev = 0. 
preds = None labels = None for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(args.device) for t in batch) input_ids, attention, token_ids, child, head = batch[0], batch[ 1], batch[2], batch[3], batch[4] dep_labels, num_dependency, arcs, arc_labels = batch[5], batch[ 6], batch[7], batch[8] arc_label_lengths, sent_labels = batch[9], batch[10] inputs = { 'input_ids': input_ids, 'attention': attention, 'token_ids': token_ids, 'child': child, 'head': head, 'dep_labels': dep_labels, 'arcs': arc_labels, 'arc_label_lengths': arc_label_lengths, 'device': args.device } model.train() outputs = model(**inputs) loss = outputs[0] logits = outputs[1] tr_loss += loss.item() loss.backward() if preds is None: preds = logits.detach().cpu().numpy() labels = dep_labels.view(-1).cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) labels = np.append(labels, dep_labels.view(-1).cpu().numpy(), axis=0) if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.save_steps > 0 and global_step % args.save_steps == 0: logs = {} loss_scalar_dep = (tr_loss - logging_loss) / args.save_steps learning_rate_scalar = scheduler.get_lr()[0] logs["learning_rate"] = learning_rate_scalar logs["loss_dep"] = loss_scalar_dep logging_loss = tr_loss print(json.dumps({**logs, **{"step": global_step}})) logger.info(json.dumps({**logs, **{"step": global_step}})) preds = preds.reshape(-1, 2) preds = softmax(preds) preds = np.argmax(preds, axis=1) res_train = compute_metrics_intermediate(preds, labels) preds = None labels = None print(res_train) # Evaluation result = evaluate(args, model, tokenizer) results.update(result) save_checkpoints(args, args.output_dir, model, tokenizer) if result['acc'] > acc_prev: acc_prev = result['acc'] # Save model checkpoint best output_dir = os.path.join(args.output_dir, "model-best") save_checkpoints(args, output_dir, model, tokenizer) if 0 < args.max_steps < global_step: epoch_iterator.close() break if 0 < args.max_steps < global_step: train_iterator.close() break return global_step, tr_loss / global_step
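# ---- Illustrative sketch (not part of the original code) ----
# The optimizer setup used throughout this file excludes biases and LayerNorm
# weights from weight decay and pairs AdamW with a linear warmup/decay schedule.
# The pattern in isolation; the function name and hyperparameters are placeholders
# (AdamW is imported from transformers here, matching the older API these scripts use).
from transformers import AdamW, get_linear_schedule_with_warmup

def build_optimizer(model, lr=5e-5, weight_decay=0.01, warmup_steps=100, total_steps=1000):
    no_decay = ["bias", "LayerNorm.weight"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(grouped_parameters, lr=lr, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    return optimizer, scheduler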
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    set_seed(args)  # Added here for reproducibility

    if args.gpu == 0:
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        tb_writer = SummaryWriter(args.output_dir + '/runs/' + current_time)

    args.train_batch_size = args.per_gpu_train_batch_size

    def collate(examples: List[torch.Tensor]):
        # Pad each batch to the length of its longest sequence
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    if args.shuffle:
        logger.info(f"Shuffling the dataset for training, "
                    f"GPU: {args.gpu}, "
                    f"Rank: {args.rank}, "
                    f"Total: {args.world_size}")
    train_sampler = DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=args.rank,
        shuffle=args.shuffle,
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        shuffle=False,
        num_workers=0,
        batch_size=args.train_batch_size,
        collate_fn=collate,
        pin_memory=True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      # betas=(0.9, 0.98),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    if args.warmup_ratio > 0.:
        assert args.warmup_steps == 0
        args.warmup_steps = int(t_total * args.warmup_ratio)
    if args.gpu == 0:
        print("Optimizer: lr %f, total steps %d, warmup steps %d, eps %0.8f, betas %s" % (
            args.learning_rate, t_total, args.warmup_steps,
            optimizer.defaults['eps'], optimizer.defaults['betas']))
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0)
        from apex.parallel import DistributedDataParallel as DDP
        model = DDP(model)
    else:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps * args.world_size
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    # Check if continuing training from a checkpoint
    # if args.model_name_or_path and os.path.exists(args.model_name_or_path):
    #     try:
    #         # set global_step to global_step of last saved checkpoint from model path
    #         checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
    #         epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
    #         steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
    #         logger.info(" Continuing training from checkpoint, will skip to saved global_step")
    #         logger.info(" Continuing training from epoch %d", epochs_trained)
    #     except ValueError:
    #         logger.info(" Do not load model from %s, restart training" % args.model_name_or_path)

    # model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    # model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.gpu != 0
    )
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.gpu != 0)
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        # Support of accumulating gradients
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            # If some of the input is padded, then the attention mask is needed
            attention_mask = (inputs != tokenizer.pad_token_id)  # word_tokens --> 1, pad_token --> 0
            if attention_mask.all():
                attention_mask = None

            if epoch == 0 and step < 3 and args.gpu == 0:
                # Print a few raw batches for sanity checking
                print(inputs.shape)
                print(inputs[0])
                print(tokenizer.convert_ids_to_tokens(inputs[0].cpu().numpy()))
                print(labels[0])
                print(attention_mask)

            model.train()
            outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.max_grad_norm > 0.:
                    if args.fp16:
                        total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.gpu == 0 and args.logging_steps > 0 and (step + 1) % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    if args.fp16:
                        try:
                            from apex.amp import _amp_state
                            tb_writer.add_scalar("loss_scale", _amp_state.loss_scalers[0]._loss_scale, global_step)
                            tb_writer.add_scalar("scaled_loss", scaled_loss.item(), global_step)
                        except ImportError:
                            logger.warning("Cannot import apex.amp._amp_state; the loss scale will not be logged.")
                    if args.max_grad_norm > 0.:
                        # The gradient norm is only available when clipping is enabled
                        tb_writer.add_scalar("grad_norm", total_norm, global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

            if args.max_steps > 0 and global_step >= args.max_steps:
                break

        # Save a checkpoint and evaluate at the end of each epoch
        if args.gpu == 0:
            # Save checkpoints
            checkpoint_name = "checkpoint-epoch%04d" % epoch
            save_model(args, checkpoint_name, model, tokenizer, optimizer, scheduler)
            last_path = os.path.join(args.output_dir, 'checkpoint-last')
            # if os.path.exists(last_path):
            #     print(last_path)
            #     os.remove(last_path)
            # os.symlink(os.path.join(args.output_dir, checkpoint_name), last_path)

            # Evaluate the model
            logger.info(" Training loss of Epoch %d: %0.4f" % (epoch, tr_loss / (step + 1)))
            logger.info(" Evaluation Results of Epoch %d: " % epoch)
            results = evaluate(args, model, tokenizer)
            for key, value in results.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                logger.info("\t %s: %0.4f" % (key, value))
            output_eval_file = os.path.join(args.output_dir, checkpoint_name, "eval_results.json")
            with open(output_eval_file, 'w') as f:
                json.dump(results, f, sort_keys=True, indent=4)

        if args.max_steps > 0 and global_step >= args.max_steps:
            epoch_iterator.close()
            train_iterator.close()
            break

    if args.gpu == 0:
        tb_writer.close()

    # Return the global step count and the mean training loss of the final epoch,
    # matching the Tuple[int, float] annotation.
    return global_step, tr_loss / max(step + 1, 1)
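# ---------------------------------------------------------------------------
# The train() loop above assumes one process per GPU, with args.gpu, args.rank,
# args.world_size and args.device already set and the default process group
# already initialized (both DistributedSampler and DistributedDataParallel rely
# on it). Below is a minimal launch sketch under those assumptions; the worker
# name `run_worker`, the env-var rendezvous, and the `gpus_per_node` /
# `node_rank` / `num_nodes` fields on `args` are illustrative assumptions, not
# part of this repository.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run_worker(gpu, args):
    # One worker per local GPU; global rank = node offset + local GPU index.
    args.gpu = gpu
    args.rank = args.node_rank * args.gpus_per_node + gpu
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=args.rank, world_size=args.world_size)
    torch.cuda.set_device(gpu)
    args.device = torch.device("cuda", gpu)
    # Model, tokenizer, and dataset construction are omitted here; once built,
    # training would be started with:
    # train(args, train_dataset, model, tokenizer)


# Typical entry point (sketch):
# args.world_size = args.gpus_per_node * args.num_nodes
# mp.spawn(run_worker, nprocs=args.gpus_per_node, args=(args,))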
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    def cal_simple_nll(pred, gold):
        # Per-token negative log-likelihood: softmax over the vocabulary,
        # then log, then NLL against the gold token ids (no reduction).
        loss_fct = nn.NLLLoss(reduction="none")
        batch_size = gold.size(0)
        gold = gold.contiguous()
        norm = nn.Softmax(dim=1)
        pred = pred.contiguous().view(-1, pred.size(2))
        pred = norm(pred)
        pred_prob_t = pred.contiguous().view(batch_size, -1, pred.size(1)) + 1e-16
        pred_prob_t_log = torch.log(pred_prob_t)
        pred_prob_t_log = pred_prob_t_log.view(-1, pred_prob_t_log.size(2))
        loss = loss_fct(pred_prob_t_log, gold.view(-1))
        return loss

    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    # SequentialSampler keeps the evaluation order deterministic
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    # multi-gpu evaluate
    # if args.n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    reward_fct = nn.NLLLoss(reduction="none")

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        raw_text = batch.clone().detach().to(args.device)
        # Shift by one token: predict token t+1 from tokens up to t
        inputs = raw_text[:, :-1].contiguous()
        gold = raw_text[:, 1:].contiguous()
        masks = inputs.ne(0).type(torch.float)  # the pad token id is assumed to be 0 here
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
            lm_loss = cal_simple_nll(outputs[0], gold)
        # Average the per-token loss within each sequence, then accumulate per example
        lm_loss = lm_loss.contiguous().view(inputs.size(0), -1).mean(dim=-1)
        eval_loss += lm_loss.sum().item()
        nb_eval_steps += lm_loss.size(0)

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss)).item()

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
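# ---------------------------------------------------------------------------
# cal_simple_nll() above takes a softmax over the vocabulary, adds 1e-16, takes
# the log, and applies NLLLoss per token. Up to that smoothing term this is the
# standard per-token cross entropy on the shifted targets. The self-check below
# illustrates the equivalence; the tensor shapes are arbitrary assumptions and
# the snippet is not part of the evaluation pipeline.
import torch
import torch.nn.functional as F

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)        # stand-in for model output [B, T, V]
gold = torch.randint(0, vocab, (batch, seq_len))   # stand-in for shifted gold ids [B, T]

# Built-in per-token cross entropy on flattened logits/targets
ce = F.cross_entropy(logits.view(-1, vocab), gold.view(-1), reduction="none")

# What cal_simple_nll computes: softmax -> +1e-16 -> log -> per-token NLL
manual = F.nll_loss(
    torch.log(F.softmax(logits, dim=-1) + 1e-16).view(-1, vocab),
    gold.view(-1),
    reduction="none",
)

assert torch.allclose(ce, manual, atol=1e-4)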