Example #1
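Evaluates a trained QA model on a test set: the pre-tokenized features are batched through the model, per-feature start/end logits are collected as RawResults, n-best predictions are written under args.checkpoint_dir, and the official score from get_eval is printed.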
import collections
import os

import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# write_predictions and get_eval are project-local helpers (SQuAD-style
# n-best prediction writing and official scoring) assumed to be in scope.


def test(model, args, eval_examples, eval_features, device):
    print("***** Eval *****")
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    output_prediction_file = os.path.join(args.checkpoint_dir,
                                          "predictions_test.json")
    output_nbest_file = output_prediction_file.replace('predictions', 'nbest')

    all_input_ids = torch.tensor([f['input_ids'] for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f['input_mask'] for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f['segment_ids'] for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.n_batch,
                                 shuffle=False)

    model.eval()
    all_results = []
    print("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature['unique_id'])
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    write_predictions(eval_examples,
                      eval_features,
                      all_results,
                      n_best_size=args.n_best,
                      max_answer_length=args.max_ans_length,
                      do_lower_case=True,
                      output_prediction_file=output_prediction_file,
                      output_nbest_file=output_nbest_file)

    tmp_result = get_eval(args.test_file, output_prediction_file)
    print(tmp_result)
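A minimal sketch of the feature dicts this function expects; the keys match the ones read above, but every value below is a toy placeholder rather than real data:

# Illustrative only: real pipelines pad input_ids/input_mask/segment_ids
# to a fixed max_seq_length (typically 384 or 512); here it is 8.
eval_features = [{
    'unique_id': 1000000000,
    'input_ids':   [101, 2054, 2003, 2023, 102, 3231, 102, 0],  # padded wordpiece ids
    'input_mask':  [1,   1,    1,    1,    1,   1,    1,   0],  # 1 = real token, 0 = pad
    'segment_ids': [0,   0,    0,    0,    0,   1,    1,   0],  # 0 = question, 1 = passage
}]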
Example #2
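A fragment from inside a distributed training loop: after the per-epoch RawResults are collected, MPI rank 0 writes predictions for the dev set, evaluates them, appends the result to the log file, and updates the best F1/EM seen so far. Example #3 shows the same evaluation logic as a self-contained function.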
                                    all_results.append(
                                        RawResult(unique_id=unique_id,
                                                  start_logits=start_logits,
                                                  end_logits=end_logits))
                            if mpi_rank == 0:
                                output_prediction_file = os.path.join(
                                    args.checkpoint_dir,
                                    'prediction_epoch' + str(i) + '.json')
                                output_nbest_file = os.path.join(
                                    args.checkpoint_dir,
                                    'nbest_epoch' + str(i) + '.json')

                                write_predictions(
                                    dev_examples,
                                    dev_data,
                                    all_results,
                                    n_best_size=args.n_best,
                                    max_answer_length=args.max_ans_length,
                                    do_lower_case=True,
                                    output_prediction_file=output_prediction_file,
                                    output_nbest_file=output_nbest_file)
                                tmp_result = get_eval(args.dev_file,
                                                      output_prediction_file)
                                tmp_result['STEP'] = global_steps
                                print_rank0(tmp_result)
                                with open(args.log_file, 'a') as aw:
                                    aw.write(
                                        json.dumps(str(tmp_result)) + '\n')

                                if float(tmp_result['F1']) > best_f1:
                                    best_f1 = float(tmp_result['F1'])
                                if float(tmp_result['EM']) > best_em:
                                    best_em = float(tmp_result['EM'])
Example #3
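The self-contained evaluation routine used during training: it scores the dev set at a given training step, logs the result, tracks the best F1/EM, and checkpoints the model (via the project-local utils.torch_save_model) when the combined F1+EM improves. It relies on the same imports as Example #1, plus json.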
def evaluate(model, args, eval_examples, eval_features, device, global_steps,
             best_f1, best_em, best_f1_em):
    print("***** Eval *****")
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    output_prediction_file = os.path.join(
        args.checkpoint_dir, "predictions_steps" + str(global_steps) + ".json")
    output_nbest_file = output_prediction_file.replace('predictions', 'nbest')

    all_input_ids = torch.tensor([f['input_ids'] for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f['input_mask'] for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f['segment_ids'] for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_dataloader = DataLoader(eval_data,
                                 batch_size=args.n_batch,
                                 shuffle=False)

    model.eval()
    all_results = []
    print("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)

        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature['unique_id'])
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    write_predictions(eval_examples,
                      eval_features,
                      all_results,
                      n_best_size=args.n_best,
                      max_answer_length=args.max_ans_length,
                      do_lower_case=True,
                      output_prediction_file=output_prediction_file,
                      output_nbest_file=output_nbest_file)

    tmp_result = get_eval(args.dev_file, output_prediction_file)
    tmp_result['STEP'] = global_steps
    with open(args.log_file, 'a') as aw:
        aw.write(json.dumps(tmp_result) + '\n')
    print(tmp_result)

    if float(tmp_result['F1']) > best_f1:
        best_f1 = float(tmp_result['F1'])

    if float(tmp_result['EM']) > best_em:
        best_em = float(tmp_result['EM'])

    # Keep the running best combined F1+EM and checkpoint when it improves.
    if float(tmp_result['F1']) + float(tmp_result['EM']) > best_f1_em:
        best_f1_em = float(tmp_result['F1']) + float(tmp_result['EM'])
        utils.torch_save_model(model,
                               args.checkpoint_dir, {
                                   'f1': float(tmp_result['F1']),
                                   'em': float(tmp_result['EM'])
                               },
                               max_save_num=1)

    model.train()

    return best_f1, best_em, best_f1_em
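A minimal sketch of how evaluate() might be driven from a training loop, assuming the three-value return above; train_step and args.eval_steps are hypothetical names, not taken from the example:

best_f1, best_em, best_f1_em = 0.0, 0.0, 0.0
for global_steps, batch in enumerate(train_dataloader, start=1):
    train_step(model, batch)  # hypothetical forward/backward/optimizer step
    if global_steps % args.eval_steps == 0:  # args.eval_steps is an assumption
        best_f1, best_em, best_f1_em = evaluate(
            model, args, dev_examples, dev_features, device,
            global_steps, best_f1, best_em, best_f1_em)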