def test(model, args, eval_examples, eval_features, device):
    """Run inference on the test set and write prediction / n-best files."""
    print("***** Eval *****")
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    output_prediction_file = os.path.join(args.checkpoint_dir,
                                          "predictions_test.json")
    output_nbest_file = output_prediction_file.replace('predictions', 'nbest')

    # Pack the pre-extracted features into tensors for batched inference.
    all_input_ids = torch.tensor([f['input_ids'] for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f['input_mask'] for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f['segment_ids'] for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_dataloader = DataLoader(eval_data, batch_size=args.n_batch,
                                 shuffle=False)

    model.eval()
    all_results = []
    print("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)
        # Collect per-feature start/end logits keyed by the feature's unique_id.
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature['unique_id'])
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    write_predictions(eval_examples, eval_features, all_results,
                      n_best_size=args.n_best,
                      max_answer_length=args.max_ans_length,
                      do_lower_case=True,
                      output_prediction_file=output_prediction_file,
                      output_nbest_file=output_nbest_file)

    tmp_result = get_eval(args.test_file, output_prediction_file)
    print(tmp_result)


def evaluate(model, args, eval_examples, eval_features, device, global_steps,
             best_f1, best_em, best_f1_em):
    """Evaluate on the dev set, log the result, and checkpoint on improvement."""
    print("***** Eval *****")
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    output_prediction_file = os.path.join(
        args.checkpoint_dir, "predictions_steps" + str(global_steps) + ".json")
    output_nbest_file = output_prediction_file.replace('predictions', 'nbest')

    # Pack the pre-extracted dev features into tensors for batched inference.
    all_input_ids = torch.tensor([f['input_ids'] for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f['input_mask'] for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f['segment_ids'] for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_dataloader = DataLoader(eval_data, batch_size=args.n_batch,
                                 shuffle=False)

    model.eval()
    all_results = []
    print("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature['unique_id'])
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    write_predictions(eval_examples, eval_features, all_results,
                      n_best_size=args.n_best,
                      max_answer_length=args.max_ans_length,
                      do_lower_case=True,
                      output_prediction_file=output_prediction_file,
                      output_nbest_file=output_nbest_file)

    tmp_result = get_eval(args.dev_file, output_prediction_file)
    tmp_result['STEP'] = global_steps
    with open(args.log_file, 'a') as aw:
        aw.write(json.dumps(tmp_result) + '\n')
    print(tmp_result)

    if float(tmp_result['F1']) > best_f1:
        best_f1 = float(tmp_result['F1'])
    if float(tmp_result['EM']) > best_em:
        best_em = float(tmp_result['EM'])
    # NOTE: best_f1_em is only compared here, never updated or returned, so the
    # checkpoint is rewritten whenever F1 + EM exceeds the value the caller
    # passes in.
    if float(tmp_result['F1']) + float(tmp_result['EM']) > best_f1_em:
        utils.torch_save_model(model, args.checkpoint_dir,
                               {'f1': float(tmp_result['F1']),
                                'em': float(tmp_result['EM'])},
                               max_save_num=1)
    model.train()
    return best_f1, best_em
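
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original script): the two
# functions above are normally driven from this file's training entry point.
# The call pattern below assumes `args`, `model`, and the example / feature
# lists have already been built by the surrounding data-preparation code;
# names such as `dev_examples` and `test_features` are placeholders.
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model.to(device)
#   best_f1, best_em = evaluate(model, args, dev_examples, dev_features,
#                               device, global_steps, best_f1, best_em,
#                               best_f1_em)
#   test(model, args, test_examples, test_features, device)
# ---------------------------------------------------------------------------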