def mask_heads(args, model, eval_dataloader):
    """Iteratively mask the least important attention heads.

    Follows Michel et al. (http://arxiv.org/abs/1905.10650): heads are ranked
    by the importance scores returned by ``compute_heads_importance`` and the
    lowest-ranked unmasked heads are zeroed in the mask, a fixed number per
    step, for as long as the metric stays at or above
    ``original_score * args.masking_threshold``.

    Args:
        args: Run configuration. Reads ``output_mode``, ``task_name``,
            ``metric_name``, ``masking_threshold``, ``masking_amount`` and
            ``output_dir``.
        model: Model passed through to ``compute_heads_importance``.
        eval_dataloader: Evaluation batches passed through to
            ``compute_heads_importance``.

    Returns:
        The most recent head mask whose score met the threshold (1 = kept,
        0 = masked). The same mask is saved as ``head_mask.npy`` in
        ``args.output_dir``.
    """
    _, head_importance, preds, labels = compute_heads_importance(
        args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f",
                original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
    # Number of heads masked per step: a fixed fraction of all heads, at least one.
    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))

    # Bound up front so a mask is still returned when the loop body never runs
    # (e.g. a negative original score combined with a threshold below 1).
    head_mask = new_head_mask.clone()
    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        head_mask = new_head_mask.clone()  # save current head mask

        # Stop when too few unmasked heads remain for another full step.
        # The sorted index list always has one entry per head, so its length
        # cannot be used for this check; count the heads still active instead.
        remaining_heads = int((head_mask != 0.0).sum().item())
        if remaining_heads <= num_to_mask:
            break

        # heads from least important to most - already-masked heads get an
        # infinite score so they sort last and are never selected again
        head_importance[head_mask == 0.0] = float("Inf")
        current_heads_to_mask = head_importance.view(-1).sort()[1][:num_to_mask]
        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))

        # mask heads
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
        _, head_importance, preds, labels = compute_heads_importance(
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
            "Masking: current score: %f, remaning heads %d (%.1f percents)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
        )

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
    return head_mask
def prune_heads(args, model, eval_dataloader, head_mask):
    """Prune the heads disabled in ``head_mask`` and compare against masking.

    Based on the head importance approach of Michel et al.
    (http://arxiv.org/abs/1905.10650). The model is first evaluated with the
    mask applied, then the masked heads are removed with
    ``model.prune_heads`` and the model is evaluated again without a mask.
    Parameter counts, scores and the timing ratio are logged.

    Args:
        args: Run configuration. Reads ``output_mode``, ``task_name`` and
            ``metric_name``.
        model: Model to prune; it is modified in place by
            ``model.prune_heads``.
        eval_dataloader: Evaluation batches passed through to
            ``compute_heads_importance``.
        head_mask: Tensor indexed by layer, then head; zero entries mark the
            heads to remove.
    """

    def _timed_score(mask):
        """Evaluate with ``mask`` applied; return (metric value, elapsed time)."""
        started = datetime.now()
        _, _, preds, labels = compute_heads_importance(
            args, model, eval_dataloader,
            compute_entropy=False, compute_importance=False, head_mask=mask)
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
        return score, datetime.now() - started

    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    score_masking, original_time = _timed_score(head_mask)
    original_num_params = sum(p.numel() for p in model.parameters())

    # ``nonzero()`` returns one row per hit; flatten it so every layer maps to
    # a plain list of int head indices instead of a list of 1-element lists.
    heads_to_prune = {
        layer: (1 - head_mask[layer].long()).nonzero().view(-1).tolist()
        for layer in range(len(head_mask))
    }
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    score_pruning, new_time = _timed_score(None)

    logger.info(
        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
        original_num_params,
        pruned_num_params,
        pruned_num_params / original_num_params * 100,
    )
    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
    # The label now matches the value that is actually computed.
    logger.info(
        "Pruning: speed ratio (original timing / new timing): %f percents",
        original_time / new_time * 100)
def mask_mlps(args, model, eval_dataloader):
    """Iteratively mask the least important MLP layers.

    One layer is masked per step, chosen as the unmasked layer with the
    lowest score from ``compute_mlps_importance``, for as long as the metric
    stays at or above ``original_score * args.masking_threshold``.

    Args:
        args: Run configuration. Reads ``output_mode``, ``task_name``,
            ``metric_name``, ``masking_threshold`` and ``output_dir``.
        model: Model passed through to ``compute_mlps_importance``.
        eval_dataloader: Evaluation batches passed through to
            ``compute_mlps_importance``.

    Returns:
        The most recent MLP mask whose score met the threshold (1 = kept,
        0 = masked). The same mask is saved as ``mlp_mask.npy`` in
        ``args.output_dir``.
    """
    # NOTE(review): the indexing below assumes mlp_importance is 1-D (one
    # score per layer) - confirm against compute_mlps_importance.
    mlp_importance, preds, labels = compute_mlps_importance(args, model, eval_dataloader)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f",
                original_score, original_score * args.masking_threshold)

    new_mlp_mask = torch.ones_like(mlp_importance)
    # Bound up front so a mask is still returned when the loop body never runs.
    mlp_mask = new_mlp_mask.clone()
    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
        mlp_mask = new_mlp_mask.clone()  # save current mlp mask

        # layers from least important to most - already-masked layers get an
        # infinite score so they sort last
        mlp_importance[mlp_mask == 0.0] = float("Inf")
        mlp_to_mask = mlp_importance.sort()[1][0]
        if mlp_importance[mlp_to_mask] == float("Inf"):
            # Every layer is already masked; nothing left to try.
            break

        new_mlp_mask[mlp_to_mask] = 0.0
        logger.info("MLP Layer to mask: %s", str(mlp_to_mask))
        print_1d_tensor(new_mlp_mask)

        # Compute metric and mlp importance again
        mlp_importance, preds, labels = compute_mlps_importance(
            args, model, eval_dataloader, mlp_mask=new_mlp_mask)
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
            "Masking: current score: %f, remaning layers %d (%.1f percents)",
            current_score,
            new_mlp_mask.sum(),
            new_mlp_mask.sum() / new_mlp_mask.numel() * 100,
        )

    logger.info("Final mlp mask")
    print_1d_tensor(mlp_mask)
    np.save(os.path.join(args.output_dir, "mlp_mask.npy"), mlp_mask.detach().cpu().numpy())
    return mlp_mask
def take_eval_steps(args, model, tokenizer, prune, prefix=""):
    """Evaluate the wrapped model on the dev split and return its metrics.

    Args:
        args: Run configuration. Reads ``task_name``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``local_rank``, ``device``
            and ``output_mode``; ``eval_batch_size`` is written back onto it.
        model: Wrapper whose ``.model`` attribute is the network to evaluate.
        tokenizer: Passed through to ``load_and_cache_examples``.
        prune: Unused here; kept for interface compatibility.
        prefix: Label inserted into the log banners.

    Returns:
        Dict with the output of ``compute_metrics`` for every evaluated task,
        merged together (later tasks overwrite equal keys).
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)

    # Unwrap exactly once. Doing this inside the task loop would unwrap the
    # already-unwrapped network again on the second MNLI pass.
    inner_model = model.model

    results = {}
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev')
        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
                                     collate_fn=collate_fn)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and joined once after the loop
        # instead of re-copying the growing array on every step.
        logit_chunks = []
        label_chunks = []
        inner_model.eval()
        pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
                # Only ids, attention mask and labels are fed; batch[2] is not used.
                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
                outputs = inner_model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs['labels'].detach().cpu().numpy())
            pbar(step)
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(logit_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the task's evaluation set and write the metrics.

    Args:
        args: Run configuration. Reads ``task_name``, ``output_dir``,
            ``local_rank``, ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``
            and ``model_type``; ``eval_batch_size`` is written back onto it.
        model: Model called with keyword inputs; its first two outputs are
            taken as (loss, logits).
        tokenizer: Passed through to ``load_and_cache_examples``.
        prefix: Sub-directory of the output directory that receives
            ``eval_results.txt``; also inserted into the log banners.

    Returns:
        Dict with the output of ``compute_metrics`` on the argmax predictions.
    """
    eval_task_names = (args.task_name,)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        # Only eval_output_dir itself is created above; a non-empty prefix
        # adds a sub-directory that must exist before the file is opened.
        results_dir = os.path.dirname(output_eval_file)
        if results_dir:
            os.makedirs(results_dir, exist_ok=True)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, examples, prefix="", write_output=True, out_file_suffix=""):
    """Evaluate ``model`` on ``examples`` and return the task metrics.

    When ``write_output`` is true the predictions are also handed to
    ``write_wrong_output`` together with ``prefix`` and ``out_file_suffix``.
    The returned dict merges the ``compute_metrics`` output of every
    evaluated task.
    """
    # MNLI is evaluated twice (matched, mis-matched), each with its own directory
    if args.task_name == "mnli":
        eval_task_names = ("mnli", "mnli-mm")
        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM')
    else:
        eval_task_names = (args.task_name,)
        eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = get_dataset(args, eval_task, examples, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_dataloader = DataLoader(
            eval_dataset,
            sampler=SequentialSampler(eval_dataset),
            batch_size=args.eval_batch_size,
        )

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3],
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    if args.model_type in ['bert', 'xlnet']:
                        inputs['token_type_ids'] = batch[2]
                    else:
                        inputs['token_type_ids'] = None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.concatenate((preds, batch_logits), axis=0)
                out_label_ids = np.concatenate((out_label_ids, batch_labels), axis=0)

        eval_loss = eval_loss / nb_eval_steps

        # Turn raw model outputs into final predictions
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        if write_output:
            write_wrong_output(args, examples, eval_task, prefix, preds, out_label_ids, out_file_suffix)

        results.update(compute_metrics(eval_task, preds, out_label_ids))

    return results
def evaluate(features_model, decoder_head, task_name, eval_dataset, output_mode, eval_batch_size=8, device='cuda'):
    """Evaluate ``decoder_head`` on ``eval_dataset`` and return the metrics.

    Features come from ``features_model(batch=batch)`` unless the
    ``GLUEMODEL`` environment variable is ``nontransformer``, in which case
    the first tensor of the batch is used directly. The last tensor of each
    batch is taken as the labels.
    """
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=SequentialSampler(eval_dataset),
        batch_size=eval_batch_size,
    )

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        decoder_head.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            use_raw_features = os.getenv('GLUEMODEL', 'transformer') == 'nontransformer'
            features = batch[0] if use_raw_features else features_model(batch=batch)
            labels = batch[-1]
            outputs = decoder_head(features, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        batch_logits = logits.detach().cpu().numpy()
        batch_labels = labels.detach().cpu().numpy()
        if preds is None:
            preds, out_label_ids = batch_logits, batch_labels
        else:
            preds = np.concatenate((preds, batch_logits), axis=0)
            out_label_ids = np.concatenate((out_label_ids, batch_labels), axis=0)

    eval_loss = eval_loss / nb_eval_steps

    # Turn raw model outputs into final predictions
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)

    result = compute_metrics(task_name, preds, out_label_ids)

    logger.info("***** Eval results *****")
    logger.info(" loss = %s", str(eval_loss))
    for metric_name in sorted(result.keys()):
        logger.info(" %s = %s", metric_name, str(result[metric_name]))

    return result
def evaluate(args, model, tokenizer, world, prefix="", custom_dev_file=None, wikiqa=False, paws=False):
    """Evaluate ``model`` with ``predict`` and write the metrics to disk.

    Args:
        args: Run configuration. Reads ``task_name``, ``output_dir``,
            ``local_rank``, ``per_gpu_eval_batch_size`` and ``n_gpu``;
            ``eval_batch_size`` is written back onto it.
        model: Model passed through to ``predict``.
        tokenizer: Passed through to ``load_and_cache_examples``.
        world: Passed through to ``load_and_cache_examples``.
        prefix: Sub-directory of ``args.output_dir`` that receives
            ``eval_results.txt``; also forwarded to ``predict``.
        custom_dev_file: Forwarded to ``load_and_cache_examples`` as
            ``custom_file``.
        wikiqa: When true, the output of ``compute_metrics_wikiqa`` is merged
            into the results.
        paws: When true, an ``average_precision`` entry is added; also
            forwarded to ``load_and_cache_examples``.

    Returns:
        Dict of metric name to value.
    """
    eval_task = args.task_name
    eval_output_dir = args.output_dir

    results = {}
    eval_dataset, eval_examples = load_and_cache_examples(
        args, eval_task, tokenizer, world,
        evaluate=True, custom_file=custom_dev_file, paws=paws)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    preds, preds_soft, out_label_ids = predict(args, model, eval_dataset, eval_dataloader, prefix=prefix)

    result = compute_metrics(eval_task, preds, out_label_ids)
    results.update(result)
    if wikiqa:
        results.update(compute_metrics_wikiqa(preds_soft, eval_examples))
    if paws:
        from sklearn.metrics import average_precision_score
        results['average_precision'] = average_precision_score(out_label_ids, preds_soft)

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # Only eval_output_dir itself is created above; a non-empty prefix adds a
    # sub-directory that must exist before the file is opened.
    results_dir = os.path.dirname(output_eval_file)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info(" %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` and write ``eval_results.txt`` to ``args.output_dir``.

    Predictions are the argmax of the logits and are scored with the "mnli"
    metrics. Returns the resulting metrics dict.
    """
    results = {}
    for eval_output_dir in (args.output_dir, ):
        eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=args.eval_batch_size)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                input_keys = ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
                inputs = {name: batch[position] for position, name in enumerate(input_keys)}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.concatenate((preds, batch_logits), axis=0)
                out_label_ids = np.concatenate((out_label_ids, batch_labels), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
        result = compute_metrics("mnli", preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for metric_name in sorted(result.keys()):
                metric_value = str(result[metric_name])
                logger.info(" %s = %s", metric_name, metric_value)
                writer.write("%s = %s\n" % (metric_name, metric_value))

    return results
def evaluate_onnxrt(args, model, tokenizer, eval_dataloader, benchmark=False):
    """Run an ONNX model with onnxruntime, for accuracy or for latency.

    Args:
        args: Run configuration. Reads ``task_name`` and ``eval_batch_size``.
        model: ONNX model object; it is serialized with
            ``SerializeToString`` to create the inference session.
        tokenizer: Unused here; kept for interface compatibility.
        eval_dataloader: Iterable of batches. The first three elements of
            each batch are fed to the first three session inputs, in order;
            the fourth element is used as the labels.
        benchmark: When true, only time ``session.run`` per batch.

    Returns:
        The list of per-batch latencies in seconds when ``benchmark`` is
        true, otherwise the ``"acc"`` entry of ``compute_metrics``.
    """
    session = onnxruntime.InferenceSession(model.SerializeToString(), None)
    output_mode = output_modes[args.task_name]

    eval_task = args.task_name
    results = {}

    # Eval!
    logger.info("***** Running evaluation *****")
    # NOTE(review): eval_dataset is not a parameter; it is resolved from the
    # enclosing module scope - confirm it is defined before this is called.
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    # The session's input names do not change between batches; look them up once.
    session_inputs = session.get_inputs()
    input_names = (session_inputs[0].name, session_inputs[1].name, session_inputs[2].name)

    preds = None
    out_label_ids = None
    latencies = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # onnxruntime takes numpy arrays; convert anything that is not one already
        batch = tuple(t if isinstance(t, np.ndarray) else t.detach().cpu().numpy() for t in batch)
        ort_inputs = {
            input_names[0]: batch[0],
            input_names[1]: batch[1],
            input_names[2]: batch[2],
        }
        if benchmark:
            start = time.time()
            _ = session.run(None, ort_inputs)
            latencies.append(time.time() - start)
        else:
            # NOTE(review): the reshape hard-codes two output classes.
            logits = np.reshape(session.run(None, ort_inputs)[0], (-1, 2))
            if preds is None:
                preds = logits
                out_label_ids = batch[3]
            else:
                preds = np.append(preds, logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch[3], axis=0)

    if benchmark:
        return latencies

    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)
    result = compute_metrics(eval_task, preds, out_label_ids)
    results.update(result)
    return results["acc"]
def compute_metrics_intermediate(preds, gold):
    """Score ``preds`` against ``gold`` with the "qqp" metrics.

    Pairs whose gold label equals -1 are dropped before scoring.
    """
    # Keep only the (prediction, label) pairs that carry a real label
    kept_pairs = [(p, g) for p, g in zip(preds, gold) if g != -1]
    preds_new = np.array([p for p, _ in kept_pairs])
    gold_new = np.array([g for _, g in kept_pairs])
    return compute_metrics('qqp', preds_new, gold_new)
def get_metrics(self, outputs, inputs):
    """Compute the task metrics for one set of model outputs.

    The first two entries of ``outputs`` are taken as (loss, logits) and
    ``inputs["labels"]`` as the gold labels. The returned dict is the output
    of ``compute_metrics`` with the loss added under ``"loss"``.
    """
    loss, logits = outputs[:2]
    predictions = logits.detach().cpu().numpy()
    gold_labels = inputs["labels"].detach().cpu().numpy()

    # Turn raw model outputs into final predictions
    output_mode = self.context.get_data_config()["output_mode"]
    if output_mode == "classification":
        predictions = np.argmax(predictions, axis=1)
    elif output_mode == "regression":
        predictions = np.squeeze(predictions)

    task_name = self.context.get_data_config().get("task").lower()
    results = compute_metrics(task_name, predictions, gold_labels)
    results["loss"] = loss
    return results
def _eval_end(self, outputs):
    """Aggregate per-step evaluation outputs into the final metrics.

    Each element of ``outputs`` provides ``"val_loss"``, ``"pred"`` and
    ``"target"``. Returns ``(ret, preds_list, out_label_list)``: ``ret`` holds
    the mean validation loss and the task metrics, plus the same dict again
    under ``"log"``; the two lists hold one empty list per example.
    """
    step_losses = [step["val_loss"] for step in outputs]
    val_loss_mean = torch.stack(step_losses).mean().detach().cpu().item()

    preds = np.concatenate([step["pred"] for step in outputs], axis=0)
    output_mode = self.hparams.glue_output_mode
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)

    out_label_ids = np.concatenate([step["target"] for step in outputs], axis=0)
    num_examples = out_label_ids.shape[0]
    out_label_list = [[] for _ in range(num_examples)]
    preds_list = [[] for _ in range(num_examples)]

    results = {"val_loss": val_loss_mean}
    results.update(compute_metrics(self.hparams.task, preds, out_label_ids))

    # Shallow copy so that adding "log" does not make results contain itself
    ret = dict(results)
    ret["log"] = results
    return ret, preds_list, out_label_list
def evaluate(args, model, tokenizer, prefix="", verbose=1):
    """Evaluate ``model`` and log per-example confidences and a sentiment summary.

    Besides the usual metrics, this records for every example the margin by
    which its top logit beats the next-best logit, prints the most confident
    examples per class, and logs a normalized per-class total of these
    margins as a "document sentiment".

    Side effects: sets the module globals ``verbosity``,
    ``stdout_verbose_every`` and ``verbose_outfile``; writes ``verbose.txt``
    and ``eval_results.txt``; may rebind the local ``model`` to a
    ``DataParallel`` wrapper.

    Returns:
        Dict with the merged ``compute_metrics`` output of every task.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm the placement of the `if conf > 8` and `outverbose(desc, ...)`
    # statements against the original file.
    global verbosity
    verbosity = verbose
    global stdout_verbose_every
    stdout_verbose_every = args.verbose_every
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        global verbose_outfile
        # NOTE(review): the file is opened before eval_output_dir is created
        # below and is never closed in this function.
        vf = os.path.join(eval_output_dir, 'verbose.txt')
        verbose_outfile = open(vf, 'w', encoding='utf-8')
        logger.info('writing logits etc to %s' % vf)
        eval_dataset, eval_examples = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
        logger.info(" #data=%s #examples=%s first=%s" % (len(eval_dataset), len(eval_examples), eval_examples[:1]))
        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)
        args.eval_batch_size = int(args.per_gpu_eval_batch_size * max(1, args.n_gpu))
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)
        start_time = timeit.default_timer()
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        i = 0  # running index into eval_examples across batches
        confs = None  # per class: list of (margin, example index, logits, example)
        dups = Counter()  # how often each example text has been seen
        nclasses = None
        for batch in tqdm(eval_dataloader, desc="Evaluating", mininterval=mininterval):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
            # t[0] pair with input
            inputs = batch_inputs(batch, args.model_type)
            with torch.no_grad():
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
                logs = logits.tolist()
                outverbose('%s\t%s' % (rounded(logs), inputs['labels'].tolist()), v=1, seq=nb_eval_steps)
                for l in logs:
                    if i >= len(eval_examples):
                        break
                    ex = eval_examples[i]
                    minl = min(l)
                    # (Re)initialize the per-class buckets on first use, or
                    # when a row has more logits than seen so far.
                    if nclasses is None or len(l) > nclasses:
                        if nclasses is not None:
                            logger.warn("# of classes differed: %s vs %s in %s; dropping old data" % (nclasses, len(l), l))
                        nclasses = len(l)
                        confs = [[] for x in l]
                    for j in range(nclasses):
                        # Temporarily replace l[j] with the row minimum so
                        # max(l) is the best logit among the other classes.
                        confj = l[j]
                        l[j] = minl
                        confmax = max(l)
                        l[j] = confj
                        # Margin of class j over the best other class;
                        # positive only when j has the top logit.
                        conf = l[j] - confmax
                        if conf > 0:
                            t = ex.texts()
                            dups[t] += 1
                            # Only the first occurrence of a text is recorded
                            if dups[t] == 1:
                                confs[j].append((conf, i, l, ex))
                            if conf > 8:
                                outverbose('%s %s %s %s' % (rounded(conf), j, t, ex.label), v=1)
                    i += 1
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
        outmax = 20  # number of top entries per class echoed to stdout
        docsentiment = [0 for x in range(nclasses)]
        nsents = i
        for j, c in enumerate(confs):
            # Largest margin first
            s = sorted(c, reverse=True)
            for topi, x in enumerate(s):
                conf, i, logit, example = x
                desc = '%s %s [#%s gold:%s] %s %s' % (rounded(logit), j, i, example.label, rounded(conf), example.texts())
                docsentiment[j] += conf
                if topi < outmax:
                    sys.stdout.write(desc + '\n')
                outverbose(desc, v=2, seq=topi)
        # Normalize the per-class margin totals so they sum to 1 (all zeros
        # when the total is not positive).
        scale = sum(docsentiment)
        scale = 1. / scale if scale > 0 else 0
        docsentimentraw = docsentiment
        docsentiment = [x * scale for x in docsentimentraw]
        # Class indices 0, 1, 2 are read as negative, positive, neutral.
        docneg = docsentiment[0]
        docpos = docsentiment[1]
        docneu = docsentiment[2] if len(docsentiment) >= 3 else 0
        docposneg = (docpos - docneg) * (1. - docneu)
        logger.info('document sentiment (%s sentences): unnormalized: %s; normalized: %s; net positive/negative: %.3f; ' % (nsents, rounded(docsentimentraw), rounded(docsentiment), docposneg))
        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        from transformers import glue_compute_metrics as compute_metrics
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        evalTime = timeit.default_timer() - start_time
        logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))
        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        # Non-float and integer-valued values print as-is; other floats use three decimals.
        def fmtfloat(x): return str(x) if (not isinstance(x, float) or x.is_integer()) else '%.3f' % x
        with open(output_eval_file, "w") as writer:
            skeys = sorted(result.keys())
            for key in skeys:
                writer.write("%s = %s\n" % (key, str(result[key])))
                logger.info("%s = %s"%(key, fmtfloat(result[key])))
        logger.info("***** Eval results {} *****: {}".format(prefix, " ".join("%s = %s"%(key, fmtfloat(result[key])) for key in skeys)))
    return results
def evaluate(args, model, tokenizer, prefix="", disable_logging=False):
    """Evaluate the model on an XLA device and gather results across workers.

    Args:
        args: Run configuration. Reads ``tensorboard_logdir``, ``task_name``,
            ``output_dir``, ``eval_batch_size``, ``device``, ``model_type``,
            ``output_mode`` and ``metrics_debug``.
        model: Model called with keyword inputs; its first two outputs are
            taken as (loss, logits).
        tokenizer: Passed through to ``load_and_cache_examples``.
        prefix: Sub-directory of the output directory that receives
            ``eval_results.txt``; also inserted into the log banners.
        disable_logging: Disables the progress bar when true.

    Returns:
        Dict with the merged ``compute_metrics`` output plus ``eval_loss``.
    """
    if xm.is_master_ordinal():
        # Only master writes to Tensorboard
        tb_writer = SummaryWriter(args.tensorboard_logdir)

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
        eval_sampler = get_sampler(eval_dataset)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, shuffle=False)
        eval_dataloader = pl.ParallelLoader(dataloader, [args.device]).per_device_loader(args.device)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(dataloader) * args.eval_batch_size)
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=disable_logging):
            model.eval()

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
                outputs = model(**inputs)
                batch_eval_loss, logits = outputs[:2]

                # Accumulated as returned by the model; .item() is only
                # called once on the averaged value below.
                eval_loss += batch_eval_loss
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        preds = xm.mesh_reduce("eval_preds", preds, np.concatenate)
        out_label_ids = xm.mesh_reduce("eval_out_label_ids", out_label_ids, np.concatenate)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        results["eval_loss"] = eval_loss.item()

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        if xm.is_master_ordinal():
            # Only eval_output_dir itself is created above; a non-empty prefix
            # adds a sub-directory that must exist before the file is opened.
            results_dir = os.path.dirname(output_eval_file)
            if results_dir:
                os.makedirs(results_dir, exist_ok=True)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results {} *****".format(prefix))
                for key in sorted(results.keys()):
                    logger.info(" %s = %s", key, str(results[key]))
                    writer.write("%s = %s\n" % (key, str(results[key])))
                    tb_writer.add_scalar(f"{eval_task}/{key}", results[key])

    if args.metrics_debug:
        # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
        xm.master_print(met.metrics_report())

    if xm.is_master_ordinal():
        tb_writer.close()

    return results
def evaluate(args, model, tokenizer, tokenizer_langs, prefix=""):
    """Evaluate ``model`` on one language and append the metrics to a file.

    The input ids of the evaluation set are shifted by the model's
    per-language offset before evaluation, and each batch is further adjusted
    by the ``modifications`` helpers according to the flags on ``args``.

    Args:
        args: Run configuration. Reads ``task_name``, ``eval_output_file``,
            ``language``, ``language_id``, ``per_gpu_eval_batch_size``,
            ``n_gpu``, ``device``, ``invert_order``,
            ``language_specific_positions``, ``block_size``,
            ``shift_special_tokens``, ``special_token_indices``,
            ``output_mode``, ``output_dir`` and ``seed``; ``eval_batch_size``
            is written back onto it.
        model: Model whose ``config.shifts`` holds the per-language offsets.
        tokenizer: Passed to the ``modifications`` helpers.
        tokenizer_langs: Per-language tokenizers, indexed like
            ``args.language_id``.
        prefix: Label inserted into the log banners.

    Returns:
        Dict with the output of ``compute_metrics``.

    Raises:
        ValueError: If language-specific positions are requested with a block
            size above 256, or if ``args.output_mode`` is not
            ``"classification"``.
    """
    eval_task_names = (args.task_name, )
    eval_outputs_dirs = (args.eval_output_file, )

    # Read the shifts before the model may be wrapped in DataParallel below:
    # the wrapper does not expose ``config``, so looking it up inside the
    # batch loop fails on multi-GPU runs.
    unwrapped_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    shifts = unwrapped_model.config.shifts
    language_index = args.language_id.index(args.language)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(
            args, eval_task, tokenizer_langs[language_index], evaluate=True)
        # now shift dataset (in place on the first tensor of the dataset)
        eval_dataset.tensors = list(eval_dataset.tensors)
        eval_dataset.tensors[0] += shifts[language_index]
        eval_dataset.tensors = tuple(eval_dataset.tensors)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                if args.invert_order:
                    modifications.invert_pairs(batch[0], shifts, args, tokenizer)
                if args.language_specific_positions:
                    if args.block_size > 256:
                        raise ValueError(
                            "Language specific posiiton embeddings can only be <256."
                        )
                    position_ids, segment_ids = modifications.get_language_specific_positions(
                        batch[0], shifts, args.block_size, tokenizer)
                    position_ids = position_ids.to(args.device)
                    segment_ids = segment_ids.to(args.device)
                else:
                    position_ids, segment_ids = None, None
                if args.shift_special_tokens:
                    modifications.shift_special_tokens(
                        batch[0], shifts, args.special_token_indices, tokenizer)
                else:
                    modifications.unshift_special_tokens(
                        batch[0], shifts, args.special_token_indices, tokenizer)
                # batch[2] is not used; token_type_ids come from segment_ids
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                    "position_ids": position_ids,
                    "token_type_ids": segment_ids
                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        else:
            raise ValueError("No other `output_mode` for XNLI.")
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        # Despite its name, eval_output_dir holds the path of the results
        # file (args.eval_output_file); results are appended to it.
        output_eval_file = eval_output_dir
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("{} {} {} {} {}\n".format(args.output_dir, args.language, args.seed, key,
                                                       str(result[key])))

    return results
def evaluate(args,
             model,
             tokenizer,
             checkpoint,
             evaluate_on_training=False,
             prefix=""):
    """Evaluate ``model`` and record predictions plus per-class loss statistics.

    For every evaluated task this writes two files under the task's output
    directory: ``<checkpoint>_<args.evaluation_set>_preds_gt.txt`` with one
    ``prediction<TAB> label`` line per example, and
    ``<prefix>/eval_results.txt`` with one ``key = value`` line per metric.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        checkpoint: Identifier used in the predictions file name.
        evaluate_on_training: When true, load the training split
            (``evaluate=False``) instead of the evaluation split.
        prefix: Sub-directory for ``eval_results.txt`` and log label.

    Returns:
        dict: Metrics from ``compute_metrics`` extended with ``neg_loss``,
        ``pos_loss``, ``total_loss``, ``neg_hist`` and ``pos_hist``.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    is_mnli = args.task_name == "mnli"
    eval_task_names = ("mnli", "mnli-mm") if is_mnli else (args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if is_mnli else (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(
            args, eval_task, tokenizer, evaluate=not evaluate_on_training)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        eval_loss_pos = []
        eval_loss_neg = []
        eval_loss_total = []  # one entry per batch
        sample_losses = []  # one entry per example (see below)
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            batch_logits = logits.detach().cpu().numpy()
            logit_chunks.append(batch_logits)
            label_chunks.append(inputs['labels'].detach().cpu().numpy())

            batch_loss = float(tmp_eval_loss.detach().cpu().numpy())
            eval_loss_total.append(batch_loss)
            # The loss is a single value per batch, while predictions are per
            # example. Repeat it for every example of the batch so the zip
            # below covers all examples; the original zipped the per-batch
            # list directly and silently dropped everything past the first
            # `number of batches` examples. With batch size 1 this is
            # identical to the original; with larger batches each example is
            # attributed the mean loss of its batch.
            sample_losses.extend([batch_loss] * len(batch_logits))

        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(logit_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        output_pred_file = os.path.join(
            eval_output_dir,
            '{0}_{1}_preds_gt.txt'.format(str(checkpoint),
                                          str(args.evaluation_set)))
        with open(output_pred_file, "w") as writer:
            for sample_loss, pr, gt in zip(sample_losses, preds,
                                           out_label_ids):
                # Label 0 goes to the negative bucket, anything else positive.
                if gt == 0:
                    eval_loss_neg.append(sample_loss)
                else:
                    eval_loss_pos.append(sample_loss)
                writer.write("%s\t %s\n" % (pr, gt))

        result = compute_metrics(eval_task, preds, out_label_ids)
        # Add per-class losses (np.mean of an empty bucket yields nan).
        result['neg_loss'] = np.mean(eval_loss_neg)
        result['pos_loss'] = np.mean(eval_loss_pos)
        result['total_loss'] = np.mean(eval_loss_total)
        result['neg_hist'] = np.array(eval_loss_neg)
        result['pos_hist'] = np.array(eval_loss_pos)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, label_list, prefix=""):
    """Evaluate ``model`` and write a tab-separated summary report.

    The report is written to ``<output dir>/<prefix>/<args.results_file>``
    and contains a header line, one line with accuracy, precision, recall,
    F1 and AUC, followed by the ``perclass``, ``confusion_matrix`` and
    ``perclassAcc`` entries of the metrics.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        label_list: Labels forwarded to ``compute_metrics``.
        prefix: Sub-directory for the report and label for log banners.

    Returns:
        dict: The value returned by ``compute_metrics``; it must contain a
        ``'results'`` mapping with the keys read below.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    is_mnli = args.task_name == "mnli"
    eval_task_names = ("mnli", "mnli-mm") if is_mnli else (args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         '-MM') if is_mnli else (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu eval; the guard prevents wrapping the model a second
        # time on the second MNLI task iteration.
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None
                outputs = model(**inputs)
                # Only the first two outputs are consumed. Slicing with [:3]
                # (as before) breaks the two-name unpacking as soon as the
                # model returns a third output.
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs['labels'].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(logit_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        task_result = compute_metrics(eval_task, preds, out_label_ids,
                                      label_list)
        results.update(task_result)

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        args.results_file)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(task_result.keys()):
                logger.info(" %s: \n%s", key, task_result[key])

            summary = task_result['results']
            scores = [
                summary['acc'], summary['prec'], summary['rec'],
                summary['f1'], summary['AUC']
            ]
            # Header columns match the value order: the second column is
            # precision (it was mislabelled "R" before).
            writer.write("acc\tP\tR\tF1\tAUC\n")
            writer.write("\t".join(str(score) for score in scores) + "\n")
            writer.write(summary['perclass'] + "\n")
            writer.write(summary['confusion_matrix'] + "\n")
            writer.write(summary['perclassAcc'] + "\n")

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Run a timed, loss-free evaluation pass and write the metrics to disk.

    The model is called positionally with the first three tensors of each
    batch; the second one is cast to half precision when
    ``args.data_type == 'fp16'``. Only the first model output (logits) is
    used; the fourth batch tensor supplies the labels.

    Args:
        args: Run configuration namespace.
        model: Callable model returning logits as its first output.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for ``eval_results.txt`` and log label.

    Returns:
        dict: Metrics returned by ``compute_metrics``.
    """
    # MNLI is evaluated twice (matched, mis-matched)
    if args.task_name == "mnli":
        task_names = ("mnli", "mnli-mm")
        output_dirs = (args.output_dir, args.output_dir + "-MM")
    else:
        task_names = (args.task_name,)
        output_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(task_names, output_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_dataloader = DataLoader(
            eval_dataset,
            sampler=SequentialSampler(eval_dataset),
            batch_size=args.eval_batch_size,
        )

        # multi-gpu eval (skip when the model is already wrapped)
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        use_half_mask = args.data_type == 'fp16'
        start_time = timeit.default_timer()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                second_input = batch[1].half() if use_half_mask else batch[1]
                outputs = model(batch[0], second_input, batch[2])
                logits = outputs[0]
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = batch[3].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        evalTime = timeit.default_timer() - start_time
        logger.info(
            " Evaluation for " + eval_task + " done in total %f secs (%f sec per example)",
            evalTime,
            evalTime / len(eval_dataset),
        )

        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                value_text = str(result[key])
                logger.info(" %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))

    return results
def eval_train_or_test(args,
                       model,
                       tokenizer,
                       prefix="",
                       eval=True,
                       epoch=None):
    """Evaluate ``model`` on the dev split (``eval=True``) or the train split.

    Besides returning the metrics this function:

    * prints predictions/labels (dev only) and the number of wrongly
      predicted examples whose gold label is 1 ("entailment") or 0
      ("contradiction");
    * when ``epoch`` is given, appends the mean loss and every metric value
      to the ``all_dev_*`` / ``all_train_*`` lists defined outside this
      function;
    * writes ``<dev|train>_eval_results[_epoch_<epoch>].txt`` and, for the
      dev split, a tab-separated ``preds_and_labels[_epoch_<epoch>].csv``.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for the results file and log label.
        eval: Evaluate the dev split when true, the train split otherwise.
        epoch: Optional epoch number used in file names and history lists.

    Returns:
        dict: Metrics returned by ``compute_metrics``.

    Raises:
        ValueError: If ``args.output_mode`` is not ``"classification"``.
    """
    eval_task_names = (args.task_name, )
    eval_outputs_dirs = (args.output_dir, )
    tord = "dev" if eval else "train"

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=eval)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu eval; never wrap an already wrapped model
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                if args.model_type != "distilbert":
                    # XLM and DistilBERT don't use segment_ids
                    inputs["token_type_ids"] = (
                        batch[2]
                        if args.model_type in ["bert", "custom"] else None)
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs["labels"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
        else:
            raise ValueError("No other `output_mode` for XNLI.")
        out_label_ids = np.concatenate(label_chunks, axis=0)

        result = compute_metrics(eval_task, preds, out_label_ids)
        if eval:
            print("predictions:", preds)
            print("output label ids:", out_label_ids)
        results.update(result)

        # Count the misclassified examples per gold label.
        wrong = preds != out_label_ids
        num_entailment_wrong = int(np.count_nonzero(wrong &
                                                    (out_label_ids == 1)))
        num_contradiction_wrong = int(
            np.count_nonzero(wrong & (out_label_ids == 0)))
        print("num entailment wrong:", num_entailment_wrong)
        print("num contradiction wrong:", num_contradiction_wrong)

        preds_labels = list(zip(preds, out_label_ids))

        if epoch is not None:
            # Histories live outside this function; only mutate them here.
            loss_history, acc_history = (
                (all_dev_losses, all_dev_acc) if eval else
                (all_train_losses, all_train_acc))
            loss_history.append(eval_loss)
            for key in sorted(result.keys()):
                acc_history.append(result[key])

        epoch_suffix = "" if epoch is None else "_epoch_" + str(epoch)
        output_eval_file = os.path.join(
            eval_output_dir, prefix,
            tord + "_eval_results" + epoch_suffix + ".txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** {} results {} for epoch {} *****".format(
                tord, prefix, "n/a" if epoch is None else epoch))
            logger.info("Loss=%s", str(eval_loss))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                # Each metric is written exactly once (the previous version
                # repeated this write).
                writer.write("%s = %s\n" % (key, str(result[key])))

        if eval:
            csv_path = os.path.join(
                eval_output_dir, "preds_and_labels" + epoch_suffix + ".csv")
            # newline="" lets the csv module control line endings itself.
            with open(csv_path, "wt", newline="") as out_file:
                tsv_writer = csv.writer(out_file, delimiter='\t')
                tsv_writer.writerows([["prediction", "label"]] +
                                     preds_labels)

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate a classification model and dump the metrics to a text file.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for ``eval_results.txt`` and log label.

    Returns:
        dict: Metrics returned by ``compute_metrics``.

    Raises:
        ValueError: If ``args.output_mode`` is not ``"classification"``.
    """
    task_names = (args.task_name, )
    output_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(task_names, output_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=SequentialSampler(eval_dataset),
                                     batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = dict(
                    input_ids=batch[0],
                    attention_mask=batch[1],
                    token_type_ids=batch[2],
                    labels=batch[3],
                )
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs["labels"].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode != "classification":
            raise ValueError("No other `output_mode` for XNLI.")
        preds = np.argmax(preds, axis=1)

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                value_text = str(result[key])
                logger.info(" %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` and export a per-id classification score.

    ``load_and_cache_examples`` returns the dataset together with ``id_map``,
    which maps the running example index to an id. The two logits of all
    examples sharing an id are averaged, and ``mean(logit[1]) -
    mean(logit[0])`` is written per id to
    ``<args.output_dir>/<prefix>/cls_score.json``. The metrics are appended
    to ``<output dir>/<prefix>/eval_results.txt``.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for the output files and log label.

    Returns:
        dict: Metrics returned by ``compute_metrics``.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    is_mnli = args.task_name == "mnli"
    eval_task_names = ("mnli", "mnli-mm") if is_mnli else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if is_mnli else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset, id_map = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval; the guard prevents wrapping the model a second
        # time on the second MNLI task iteration.
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        num_id = 0  # running example index into id_map
        logit_chunks = []
        label_chunks = []
        key_map = {}  # id -> [sum of logit 0, sum of logit 1]
        cnt_map = {}  # id -> number of examples seen for that id
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logits = logits.detach().cpu().numpy()
            for logit in logits:
                qas_id = id_map[num_id]
                if qas_id in key_map:
                    logit_list = key_map[qas_id]
                    logit_list[0] += logit[0]
                    logit_list[1] += logit[1]
                    cnt_map[qas_id] += 1
                else:
                    cnt_map[qas_id] = 1
                    key_map[qas_id] = [logit[0], logit[1]]
                num_id += 1

            logit_chunks.append(logits)
            label_chunks.append(inputs['labels'].detach().cpu().numpy())

        preds = np.concatenate(logit_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        print(len(preds))

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        final_map = {}
        for key, key_list in key_map.items():
            key_list[0] = key_list[0] / cnt_map[key]
            key_list[1] = key_list[1] / cnt_map[key]
            # Convert the numpy scalar to a built-in float: json cannot
            # serialize numpy.float32 values.
            final_map[key] = float(key_list[1] - key_list[0])

        # NOTE: written below args.output_dir (not eval_output_dir), so a
        # second task iteration overwrites the file of the first one.
        with open(os.path.join(args.output_dir, prefix, "cls_score.json"), "w") as writer:
            writer.write(json.dumps(final_map, indent=4) + "\n")

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            writer.write("***** Eval results %s *****\n" % (str(prefix)))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, prefix="", test_set=False):
    """Evaluate ``model`` on the dev split or, with ``test_set``, the test split.

    Test features are cached with ``torch.save`` under ``args.data_dir`` in a
    file named ``cached_test_<model>_<max_seq_length>_<task>`` and reloaded
    with ``torch.load`` on later calls. Metrics are written to
    ``eval_results.txt`` (dev) or ``test_results.txt`` (test) directly inside
    the task's output directory.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to the dataset loaders.
        prefix: Label inserted into the log banners.
        test_set: Evaluate the test split instead of the dev split.

    Returns:
        dict: Metrics returned by ``compute_metrics``.
    """
    model.eval()

    # MNLI is evaluated twice (matched, mis-matched)
    if args.task_name == "mnli":
        task_names = ("mnli", "mnli-mm")
        output_dirs = (args.output_dir, args.output_dir + '-MM')
    else:
        task_names = (args.task_name, )
        output_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(task_names, output_dirs):
        eval_dataset = None
        if not test_set:
            eval_dataset = load_and_cache_examples(args,
                                                   eval_task,
                                                   tokenizer,
                                                   evaluate=True)
        else:
            # Last non-empty path component of the model name/path.
            model_tag = [
                part for part in args.model_name_or_path.split('/') if part
            ][-1]
            cache_name = 'cached_{}_{}_{}_{}'.format(
                'test', model_tag, str(args.max_seq_length), str(eval_task))
            cached_features_file = os.path.join(args.data_dir, cache_name)
            if os.path.exists(cached_features_file):
                eval_dataset = torch.load(cached_features_file)
            else:
                eval_dataset = get_test_set(args, eval_task, tokenizer)
                torch.save(eval_dataset, cached_features_file)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    uses_segments = args.model_type in ['bert', 'xlnet']
                    inputs['token_type_ids'] = batch[
                        2] if uses_segments else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        results_name = "eval_results.txt" if not test_set else "test_results.txt"
        output_eval_file = os.path.join(eval_output_dir, results_name)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                value_text = str(result[key])
                logger.info(" %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))

    return results
def evaluate(args,
             model,
             tokenizer,
             prefix="",
             output_layer=-1,
             eval_highway=False):
    """Evaluate ``model``, optionally tracking at which layer each batch exits.

    With ``eval_highway`` the last element of the model output is used as the
    exit layer of the batch and counted. The ratio between the summed exit
    layers and ``number of batches * num_layers`` is logged as the expected
    saving. If additionally ``args.early_exit_entropy >= 0``, the counter,
    evaluation time, that ratio and ``get_wanted_result(result)`` are saved
    with ``np.save`` below ``args.plot_data_dir``.

    Args:
        args: Run configuration namespace.
        model: Model exposing ``num_layers`` whose first two outputs are
            ``(loss, logits)``; may already be wrapped in ``nn.DataParallel``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for ``eval_results.txt`` and log label.
        output_layer: When ``>= 0``, passed to the model as ``output_layer``.
        eval_highway: Enable exit-layer bookkeeping.

    Returns:
        dict: Metrics returned by ``compute_metrics``.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    is_mnli = args.task_name == "mnli"
    eval_task_names = ("mnli", "mnli-mm") if is_mnli else (args.task_name, )
    eval_outputs_dirs = (args.output_dir, args.output_dir +
                         "-MM") if is_mnli else (args.output_dir, )

    # nn.DataParallel does not forward attribute access to the wrapped
    # module, so read num_layers from the underlying model. Reading it from
    # `model` after wrapping (as before) fails on multi-GPU runs.
    base_model = model.module if isinstance(model, nn.DataParallel) else model
    num_layers = base_model.num_layers

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(
            eval_dataset) if args.local_rank == -1 else DistributedSampler(
                eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu eval; the guard prevents wrapping the model a second
        # time on the second MNLI task iteration.
        if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
            model = nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        logit_chunks = []
        label_chunks = []
        # Layers are counted from 1.
        exit_layer_counter = {(i + 1): 0 for i in range(num_layers)}
        st = time.time()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                if args.model_type != "distilbert":
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs["token_type_ids"] = (
                        batch[2]
                        if args.model_type in ["bert", "xlnet"] else None)
                if output_layer >= 0:
                    inputs["output_layer"] = output_layer
                outputs = model(**inputs)
                if eval_highway:
                    exit_layer_counter[outputs[-1]] += 1
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logit_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs["labels"].detach().cpu().numpy())

        eval_time = time.time() - st
        logger.info("Eval time: {}".format(eval_time))

        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(logit_chunks, axis=0)
        out_label_ids = np.concatenate(label_chunks, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        if eval_highway:
            logger.info("Exit layer counter: {}".format(exit_layer_counter))
            actual_cost = sum(
                layer * count for layer, count in exit_layer_counter.items())
            # One exit is counted per batch, hence the number of batches.
            full_cost = len(eval_dataloader) * num_layers
            logger.info("Expected saving: {}".format(actual_cost / full_cost))
            if args.early_exit_entropy >= 0:
                save_fname = (
                    args.plot_data_dir + "/" + args.model_name_or_path[2:] +
                    "/entropy_{}.npy".format(args.early_exit_entropy))
                os.makedirs(os.path.dirname(save_fname), exist_ok=True)
                print_result = get_wanted_result(result)
                np.save(
                    save_fname,
                    np.array([
                        exit_layer_counter, eval_time,
                        actual_cost / full_cost, print_result
                    ]))
                logger.info("Entropy={}\tResult={:.2f}".format(
                    args.early_exit_entropy, 100 * print_result))

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` and return the metrics without writing a results file.

    The task output directory is still created when missing, and
    ``args.eval_batch_size`` is set to ``args.per_gpu_eval_batch_size``.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Label inserted into the log banner.

    Returns:
        dict: Metrics returned by ``compute_metrics``.
    """
    # MNLI is evaluated twice (matched, mis-matched)
    if args.task_name == "mnli":
        task_names = ("mnli", "mnli-mm")
        output_dirs = (args.output_dir, args.output_dir + "-MM")
    else:
        task_names = (args.task_name, )
        output_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(task_names, output_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=SequentialSampler(eval_dataset),
                                     batch_size=args.eval_batch_size)

        logger.info(f"***** Running evaluation {prefix} *****")
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't
                    # use segment_ids
                    if args.model_type in ["bert", "xlnet", "albert"]:
                        inputs["token_type_ids"] = batch[2]
                    else:
                        inputs["token_type_ids"] = None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs["labels"].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        results.update(compute_metrics(eval_task, preds, out_label_ids))

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on one of three MNLI evaluation suites.

    For ``args.task_name == "mnli"`` the suite is chosen by
    ``args.separate_evals`` (per-label sets), then ``args.stress_test``
    (negation stress sets), otherwise the matched/mis-matched sets plus
    their negated variants. Any other task is evaluated once into
    ``args.output_dir``.

    Args:
        args: Run configuration namespace.
        model: Model whose first two outputs are ``(loss, logits)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for ``eval_results.txt`` and log label.

    Returns:
        tuple: ``(results, wandb_res)`` where ``results`` merges the metrics
        of all evaluated tasks and ``wandb_res`` maps each task name to its
        ``'acc'`` metric.
    """
    # Not referenced anywhere else in this function.
    labeldict = ["contradiction", "entailment", "neutral"]

    # Pick the MNLI task names and the matching output-dir suffixes.
    if args.separate_evals:
        mnli_tasks = ("mnli-contr", "mnli-neut", "mnli-entail")
        mnli_suffixes = ('-contr', '-neut', '-entail')
    elif args.stress_test:
        mnli_tasks = ("mnli_stress_neg_m", "mnli_stress_neg_mm")
        mnli_suffixes = ('_stress_neg', '_stress_neg_mm')
    else:
        mnli_tasks = ("mnli", "mnli-mm", "mnli-neg", "mnli-neg-mm")
        mnli_suffixes = ('', '-MM', 'Neg', 'Neg-MM')

    if args.task_name == "mnli":
        eval_task_names = mnli_tasks
        eval_outputs_dirs = tuple(args.output_dir + suffix
                                  for suffix in mnli_suffixes)
    else:
        eval_task_names = (args.task_name, )
        eval_outputs_dirs = (args.output_dir, )

    results = {}
    wandb_res = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_dataset)
        else:
            eval_sampler = DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    uses_segments = args.model_type in ['bert', 'xlnet']
                    inputs['token_type_ids'] = batch[
                        2] if uses_segments else None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            batch_logits = logits.detach().cpu().numpy()
            batch_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = batch_logits, batch_labels
            else:
                preds = np.append(preds, batch_logits, axis=0)
                out_label_ids = np.append(out_label_ids, batch_labels, axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        wandb_res[eval_task] = result['acc']

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                value_text = str(result[key])
                logger.info(" %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))

    return results, wandb_res
def evaluate(args, model, tokenizer, prefix=""):
    """Run evaluation for the configured task and append the metrics to a file.

    MNLI is evaluated on both "mnli" and "mnli-mm"; the accuracy of the
    second split is stored in the returned dict under ``"acc_mm"``. Every
    other task is evaluated once. Each split's metrics are appended to
    ``eval_results.txt`` inside that split's output directory.

    Args:
        args: Run configuration (``task_name``, ``output_dir``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``,
            ``model_type``, ``output_mode``); ``eval_batch_size`` is set here.
        model: Called as ``model(**inputs)``; the first two outputs are used
            as ``(loss, logits)``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Only used in the "Eval results" log banner.

    Returns:
        dict: Metrics gathered over all evaluated splits.
    """
    if args.task_name == "mnli":
        task_names = ("mnli", "mnli-mm")
        output_dirs = (args.output_dir, args.output_dir + 'MM')
    else:
        task_names = (args.task_name, )
        output_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(task_names, output_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=SequentialSampler(eval_dataset),
                                     batch_size=args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # Only BERT receives the segment ids.
                    if args.model_type == 'bert':
                        inputs['token_type_ids'] = batch[2]
                    else:
                        inputs['token_type_ids'] = None
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            step_logits = logits.detach().cpu().numpy()
            step_labels = inputs['labels'].detach().cpu().numpy()
            if preds is None:
                preds, out_label_ids = step_logits, step_labels
            else:
                preds = np.append(preds, step_logits, axis=0)
                out_label_ids = np.append(out_label_ids, step_labels, axis=0)

        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        result = compute_metrics(eval_task, preds, out_label_ids)
        if eval_task == 'mnli-mm':
            results['acc_mm'] = result['acc']
        else:
            results.update(result)

        # All results are written to the same file (append mode).
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                value_text = str(result[key])
                logger.info(" %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))
            # Blank line separates successive result blocks in the file.
            writer.write("\n")

    return results
def evaluate(args, model, tokenizer, prefix="", calibration=False):
    """Evaluate ``model`` and measure the forward-pass throughput.

    MNLI is evaluated on both "mnli" and "mnli-mm"; any other task is
    evaluated once. Only the ``model(**inputs)`` call is timed, and only for
    batches whose index is at least ``args.warmup``.

    Args:
        args: Run configuration. Reads ``task_name``,
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``device``,
            ``model_type``, ``output_mode``, ``mkldnn_eval``, ``do_bf16`` and
            ``warmup``; sets ``eval_batch_size``.
        model: Called as ``model(**inputs)``; the first two outputs are used
            as ``(loss, logits)``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Only used in log banners.
        calibration: When true, the batch size is fixed to 16 and the loop
            stops after enough batches to cover roughly 5% of the dataset.

    Returns:
        tuple: ``(results, perf)``. ``results`` merges the metric dicts of all
        evaluated tasks. ``perf`` is the throughput in samples/s of the last
        task for which it could be measured, or ``0.0`` if it never could.
    """
    import timeit

    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )

    results = {}
    # Bound up front so the final ``return`` cannot hit an unbound name when
    # no batch was timed.
    perf = 0.0
    for eval_task in eval_task_names:
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        # The calibration batch size must not be overwritten afterwards,
        # otherwise the ``calibration`` branch has no effect.
        if calibration:
            args.eval_batch_size = 16
        else:
            args.eval_batch_size = args.per_gpu_eval_batch_size * max(
                1, args.n_gpu)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        # Number of batches covering roughly 5% of the dataset.
        calibration_iterations = int(
            (len(eval_dataset) * 0.05 + args.eval_batch_size - 1) /
            args.eval_batch_size)

        # multi-gpu eval; the guard avoids wrapping again on the second task
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        batch_logits = []
        batch_labels = []

        if args.mkldnn_eval:
            from torch.utils import mkldnn as mkldnn_utils
            # NOTE(review): this runs once per eval task, so for MNLI the
            # model is converted twice - confirm to_mkldnn tolerates that.
            model = mkldnn_utils.to_mkldnn(model)
            print(model)

        total_time = 0.0
        timed_steps = 0
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            if calibration and nb_eval_steps >= calibration_iterations:
                break
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                if args.model_type != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids
                    inputs['token_type_ids'] = batch[2] if args.model_type in [
                        'bert', 'xlnet'
                    ] else None

                # Warm-up batches are excluded from the timing.
                timed = nb_eval_steps >= args.warmup
                if timed:
                    start = timeit.default_timer()
                outputs = model(**inputs)
                if timed:
                    total_time += timeit.default_timer() - start
                    timed_steps += 1

                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            logits_cpu = logits.detach().cpu()
            labels_cpu = inputs['labels'].detach().cpu()
            if args.do_bf16:
                # Cast to float32 before converting to NumPy.
                logits_cpu = logits_cpu.to(torch.float)
                labels_cpu = labels_cpu.to(torch.float)
            batch_logits.append(logits_cpu.numpy())
            batch_labels.append(labels_cpu.numpy())

        # Use the number of batches actually timed, so an early calibration
        # break or a run of exactly ``warmup`` batches cannot inflate the
        # figure or divide by zero.
        if timed_steps > 0 and total_time > 0:
            perf = timed_steps * args.eval_batch_size / total_time
            logger.info("***** perfformance {} samples/s *****".format(perf))
        else:
            logger.info(
                "*****no perfformance, please check dataset length and warmup number *****"
            )

        # Mean loss over batches (not part of the returned metrics).
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(batch_logits, axis=0)
        out_label_ids = np.concatenate(batch_labels, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))

    return results, perf
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` on the task's evaluation split(s) and save the metrics.

    MNLI is evaluated on both "mnli" and "mnli-mm", the latter written to
    ``args.output_dir + "-MM"``; any other task is evaluated once. Each
    split's metrics are written to ``<output dir>/<prefix>/eval_results.txt``.

    Args:
        args: Run configuration. Reads ``task_name``, ``output_dir``,
            ``local_rank``, ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``device``, ``model_type`` and ``output_mode``; sets
            ``eval_batch_size``.
        model: Called as ``model(**inputs)``; the first two outputs are used
            as ``(loss, logits)``. Wrapped in ``torch.nn.DataParallel`` when
            ``args.n_gpu > 1`` and it is not wrapped already.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for the results file; also shown in log banners.

    Returns:
        dict: Metrics merged over all evaluated splits (later splits
        overwrite equal keys).
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    if args.task_name == "mnli":
        eval_task_names = ("mnli", "mnli-mm")
        eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM")
    else:
        eval_task_names = (args.task_name, )
        eval_outputs_dirs = (args.output_dir, )

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args,
                                               eval_task,
                                               tokenizer,
                                               evaluate=True)

        if args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(
            1, args.n_gpu)
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and concatenated once at the end
        # instead of re-copying the growing array on every batch.
        batch_logits = []
        batch_labels = []
        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3]
                }
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type
                        in ["bert", "xlnet", "albert"] else None)
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            batch_logits.append(logits.detach().cpu().numpy())
            batch_labels.append(inputs["labels"].detach().cpu().numpy())

        # Mean loss over batches (not part of the returned metrics).
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(batch_logits, axis=0)
        out_label_ids = np.concatenate(batch_labels, axis=0)
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix,
                                        "eval_results.txt")
        # A non-empty prefix adds a sub-directory that may not exist yet
        # (e.g. under the "-MM" directory).
        output_eval_dir = os.path.dirname(output_eval_file)
        if output_eval_dir:
            os.makedirs(output_eval_dir, exist_ok=True)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def evaluate(args, model, tokenizer, prefix=""):
    """Evaluate ``model`` (optionally a masked model) and save the metrics.

    MNLI is evaluated on both "mnli" and "mnli-mm", the latter written to
    ``args.output_dir + "/MM"``; any other task is evaluated once. For model
    types containing ``"masked"`` a ``threshold`` input is passed to the
    model: ``args.final_threshold``, or, when ``args.global_topk`` is set, a
    value derived once per split from all ``mask_scores`` parameters. In
    classification mode the exponential of the mean prediction entropy is
    added to the written metrics as ``"eval_avg_entropy"``.

    Args:
        args: Run configuration. Reads ``task_name``, ``output_dir``,
            ``local_rank``, ``per_gpu_eval_batch_size``, ``n_gpu``,
            ``device``, ``model_type``, ``output_mode``, ``global_topk`` and
            ``final_threshold``; sets ``eval_batch_size``.
        model: Called as ``model(**inputs)``; the first two outputs are used
            as ``(loss, logits)``.
        tokenizer: Forwarded to ``load_and_cache_examples``.
        prefix: Sub-directory for the results file; also shown in log banners.

    Returns:
        dict: Metrics from ``compute_metrics`` merged over all evaluated
        splits. The entropy entry is written to the file and the log only,
        not to this dict.
    """
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    if args.task_name == "mnli":
        eval_task_names = ("mnli", "mnli-mm")
        eval_outputs_dirs = (args.output_dir, args.output_dir + "/MM")
    else:
        eval_task_names = (args.task_name,)
        eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info(" Num examples = %d", len(eval_dataset))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Per-batch arrays are collected and concatenated once at the end
        # instead of re-copying the growing array on every batch.
        batch_logits = []
        batch_labels = []

        # Global TopK: threshold is computed lazily on the first batch and
        # reused for the rest of this split.
        threshold_mem = None

        model.eval()
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                    )
                if "masked" in args.model_type:
                    inputs["threshold"] = args.final_threshold
                    if args.global_topk:
                        if threshold_mem is None:
                            concat = torch.cat(
                                [param.view(-1) for name, param in model.named_parameters() if "mask_scores" in name]
                            )
                            n = concat.numel()
                            kth = max(n - (int(n * args.final_threshold) + 1), 1)
                            threshold_mem = concat.kthvalue(kth).values.item()
                        inputs["threshold"] = threshold_mem
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            batch_logits.append(logits.detach().cpu().numpy())
            batch_labels.append(inputs["labels"].detach().cpu().numpy())

        # Mean loss over batches (not part of the returned metrics).
        eval_loss = eval_loss / nb_eval_steps
        preds = np.concatenate(batch_logits, axis=0)
        out_label_ids = np.concatenate(batch_labels, axis=0)

        # Must be bound for every output mode: it is tested below even when
        # no entropy was computed (e.g. regression).
        entropy = None
        if args.output_mode == "classification":
            from scipy.special import softmax, xlogy

            probs = softmax(preds, axis=-1)
            # xlogy(p, p) equals p * log(p) but yields 0 instead of NaN when
            # a probability underflows to exactly 0.
            entropy = np.exp((-xlogy(probs, probs)).sum(axis=-1).mean())
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
        if entropy is not None:
            result["eval_avg_entropy"] = entropy

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        # A non-empty prefix adds a sub-directory that may not exist yet.
        output_eval_dir = os.path.dirname(output_eval_file)
        if output_eval_dir:
            os.makedirs(output_eval_dir, exist_ok=True)
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results