def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("umlm",)
    eval_outputs_dirs = (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(eval_output_dir)

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model, device_ids=args.cuda_ids)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits and gold labels across batches for metric computation
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)

        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    return results
def test(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double testing (matched, mis-matched)
    test_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    test_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" \
        else (args.output_dir,)

    results = {}
    for test_task, test_output_dir in zip(test_task_names, test_outputs_dirs):
        test_output_dir = os.path.join(test_output_dir, prefix)
        test_dataset, guids = load_and_cache_examples(args, test_task, tokenizer, evaluate="test")

        if not os.path.exists(test_output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(test_output_dir)

        args.test_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
        test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.test_batch_size)

        # multi-gpu test
        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Test!
        logger.info("***** Running testing {} *****".format(prefix))
        logger.info("  Num examples = %d", len(test_dataset))
        logger.info("  Batch size = %d", args.test_batch_size)
        test_loss = 0.0
        nb_test_steps = 0
        preds = None
        out_label_ids = None
        preds_normal = []  # per-example softmax probabilities, written to test_probability.csv below
        for batch in tqdm(test_dataloader, desc="Testing"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_test_loss, logits = outputs[:2]

                # L2-normalize first, then apply softmax (disabled):
                # for item in torch.softmax(torch.nn.functional.normalize(logits, p=2, dim=1), dim=1).tolist():
                #     preds_normal.append(item)
                for item in torch.softmax(logits, dim=1).tolist():
                    preds_normal.append(item)

                test_loss += tmp_test_loss.mean().item()
            nb_test_steps += 1
            # Accumulate logits and gold labels across batches for metric computation
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        test_loss = test_loss / nb_test_steps
        if args.output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)

        processor = processors[args.task_name]()
        result = compute_metrics(test_task, preds, out_label_ids, processor.get_labels())
        results.update(result)

        output_test_file = os.path.join(test_output_dir, "test_report.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("%s = %s\n" % ("loss", str(test_loss)))

        if len(guids) == len(preds):
            if args.output_mode == "classification":
                # Map predicted label indices back to the original label strings
                preds = processor.label_index_to_label(preds)

                output_result_file = os.path.join(test_output_dir, "test_results.csv")
                with open(output_result_file, "w", encoding="utf-8", newline="") as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=",")
                    headers = ["id", "label"]
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index], preds[index]])
                    logger.info("Saved predictions to %s.", test_output_dir)

                output_result_file = os.path.join(test_output_dir, "test_probability.csv")
                with open(output_result_file, "w", encoding="utf-8", newline="") as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=",")
                    headers = ["id"] + processor.get_labels()
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index]] + preds_normal[index])
                    logger.info("Saved probabilities to %s.", test_output_dir)
            elif args.output_mode == "regression":
                output_result_file = os.path.join(test_output_dir, "test_regression.csv")
                with open(output_result_file, "w", encoding="utf-8", newline="") as result_file:
                    predict_file_writer = csv.writer(result_file, delimiter=",")
                    headers = ["id"] + processor.get_labels()
                    predict_file_writer.writerow(headers)
                    for index in range(len(guids)):
                        predict_file_writer.writerow([guids[index]] + preds_normal[index])
                    logger.info("Saved regression outputs to %s.", test_output_dir)
        else:
            raise ValueError(
                "The number of guids does not match the number of predictions: "
                "len(guids) = %s, len(preds) = %s" % (len(guids), len(preds))
            )

    return results
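# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original script): shows how the
# two functions above might be driven after fine-tuning, assuming this module
# does not already define its own entry point. The attribute names on `args`
# mirror the ones evaluate()/test() read; the checkpoint path "./output", the
# task name "umlm", and the batch sizes are placeholder assumptions, and
# load_and_cache_examples()/processors defined elsewhere in this module may
# require additional fields (e.g. data_dir, max_seq_length).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    import torch
    from transformers import BertForSequenceClassification, BertTokenizer

    args = SimpleNamespace(
        output_dir="./output",                # assumed directory of the fine-tuned checkpoint
        task_name="umlm",                     # assumed task key registered in `processors`
        model_type="bert",
        output_mode="classification",
        local_rank=-1,
        n_gpu=torch.cuda.device_count(),
        cuda_ids=None,
        per_gpu_eval_batch_size=8,
        per_gpu_test_batch_size=8,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )

    tokenizer = BertTokenizer.from_pretrained(args.output_dir)
    model = BertForSequenceClassification.from_pretrained(args.output_dir).to(args.device)

    eval_results = evaluate(args, model, tokenizer)
    test_results = test(args, model, tokenizer)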