Example #1
    def _evaluate(self, dataset: DataLoader, features: InputFeatures):
        logger.info("***** Running inference *****")
#         logger.info(" Batch size: {}".format(dataset.batch_size))
#         logger.info("  Num examples = %d", len(dataset))
        eval_results = []
#         for batch in tqdm(dataset, desc="Evaluating"):
        for batch in dataset:
            self.model.eval()
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'token_type_ids': None if self.model_type == 'xlm' else batch[2]
                }
                example_indices = batch[3]
                if self.model_type in ['xlnet', 'xlm']:
                    inputs.update(
                        {'cls_index': batch[4], 'p_mask': batch[5]}
                    )
                outputs = self.model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                if self.model_type in ['xlnet', 'xlm']:
                    # XLNet uses a more complex post-processing procedure
                    result = RawResultExtended(
                        unique_id=unique_id,
                        start_top_log_probs=(outputs[0][i]).detach().cpu().tolist(),
                        start_top_index=(outputs[1][i]).detach().cpu().tolist(),
                        end_top_log_probs=(outputs[2][i]).detach().cpu().tolist(),
                        end_top_index=(outputs[3][i]).detach().cpu().tolist(),
                        cls_logits=(outputs[4][i]).detach().cpu().tolist())
                else:
                    result = RawResult(
                        unique_id=unique_id,
                        start_logits=(outputs[0][i]).detach().cpu().tolist(),
                        end_logits=(outputs[1][i]).detach().cpu().tolist())
                eval_results.append(result)
        return eval_results
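These examples build on helpers from the legacy transformers `run_squad` utilities (`RawResult`, `RawResultExtended`, `to_list`) that are never shown. Below is a minimal sketch of those helpers, assuming the conventional `utils_squad.py` definitions; note how `to_list` matches the explicit `.detach().cpu().tolist()` calls in Example #1.

import collections

import torch

# Plain containers for per-feature model outputs; field names match how the
# examples construct them.
RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

RawResultExtended = collections.namedtuple(
    "RawResultExtended",
    ["unique_id", "start_top_log_probs", "start_top_index",
     "end_top_log_probs", "end_top_index", "cls_logits"])


def to_list(tensor: torch.Tensor):
    # Detach from the graph, move to CPU and convert to a plain Python list.
    return tensor.detach().cpu().tolist()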
Example #2
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and (args.local_rank in [-1, 0] or
                                                args.no_distributed_training):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if (
        args.local_rank == -1
        or args.no_distributed_training) else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    return results
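As a rough illustration of how an `evaluate` function of this shape might be driven, the sketch below builds an `argparse.Namespace` with the attributes the function reads (plus the preprocessing flags a typical `load_and_cache_examples` needs). The model directory, file paths, and flag values are assumptions, not part of the original example.

import argparse

import torch
from transformers import BertForQuestionAnswering, BertTokenizer

model_dir = "path/to/finetuned-squad-model"  # hypothetical checkpoint directory
args = argparse.Namespace(
    model_type="bert",
    predict_file="data/dev-v1.1.json",  # assumed SQuAD dev file location
    output_dir="eval_output",
    local_rank=-1, no_distributed_training=True,
    n_gpu=1, per_gpu_eval_batch_size=8,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    max_seq_length=384, doc_stride=128, max_query_length=64,
    n_best_size=20, max_answer_length=30, do_lower_case=True,
    version_2_with_negative=False, null_score_diff_threshold=0.0,
    verbose_logging=False, overwrite_cache=False,
)

tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForQuestionAnswering.from_pretrained(model_dir).to(args.device)
results = evaluate(args, model, tokenizer, prefix="dev")
print(results)  # metrics returned by the official SQuAD evaluation script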
Example #3
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2]
        }
        example_indices = batch[3]

        outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
        eval_feature = features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        result = RawResult(unique_id=unique_id,
                           start_logits=to_list(outputs[0][i]),
                           end_logits=to_list(outputs[1][i]))
        all_results.append(result)

# Compute predictions
output_prediction_file = "predictions_.json"
output_nbest_file = "nbest_predictions_.json"

write_predictions(examples, features, all_results, 20, max_answer_length,
                  do_lower_case, output_prediction_file, output_nbest_file,
                  output_null_log_odds_file, False, False,
                  null_score_diff_threshold)

# Evaluate with the official SQuAD script
evaluate_options = EVAL_OPTS(data_file=dev_file,
                             pred_file=output_prediction_file,
                             na_prob_file=output_null_log_odds_file)
results = evaluate_on_squad(evaluate_options)
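write_predictions dumps a question-id to answer-text mapping into the predictions file. A small sketch of inspecting it afterwards, reusing the file name from the snippet above:

import json

with open(output_prediction_file) as f:   # "predictions_.json" written above
    predictions = json.load(f)            # dict: question id -> predicted answer text

for qid, answer in list(predictions.items())[:5]:
    print(qid, "->", answer)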
Example #4
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            if args.model_type in ["xlnet", "xlm"]:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]),
                )
            else:
                result = RawResult(
                    unique_id=unique_id,
                    start_logits=to_list(outputs[0][i]),
                    end_logits=to_list(outputs[1][i]),
                )

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))

    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ["xlnet", "xlm"]:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.predict_file,
            model.config.start_n_top,
            model.config.end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        write_predictions(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
        )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(
        data_file=args.predict_file,
        pred_file=output_prediction_file,
        na_prob_file=output_null_log_odds_file,
    )
    results = evaluate_on_squad(evaluate_options)
    return results
Example #5
def evaluate(args, model, tokenizer, prefix=""):
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.preprocess_only:
        return

    write_dir = args.output_dir if args.write_dir is None else args.write_dir
    if not os.path.exists(write_dir) and global_rank in [-1, 0]:
        os.makedirs(write_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)  # if global_rank == -1 else DistributedSampler(dataset)  # No distributed eval to eval on full dev set
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': None if args.model_type in ['xlm', 'roberta'] else batch[2]  # XLM doesn't use segment_ids
                      }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
                               'p_mask':    batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(unique_id            = unique_id,
                                           start_top_log_probs  = to_list(outputs[0][i]),
                                           start_top_index      = to_list(outputs[1][i]),
                                           end_top_log_probs    = to_list(outputs[2][i]),
                                           end_top_index        = to_list(outputs[3][i]),
                                           cls_logits           = to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id    = unique_id,
                                   start_logits = to_list(outputs[0][i]),
                                   end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(write_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(write_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(write_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions_extended(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.predict_file,
                        model.config.start_n_top, model.config.end_n_top,
                        args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        all_predictions, all_nbest_predictions, all_null_odds = write_predictions(
                        examples, features, all_results, args.n_best_size,
                        args.max_answer_length, args.do_lower_case, output_prediction_file,
                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                        args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    def filter_keys(qid_to_val, task_name):
        task_name = task_name.lower()
        assert task_name in {'hotpot', 'squad'}, 'task_name {} not implemented.'.format(task_name)
        return {qid: val for qid, val in qid_to_val.items() if len(qid.split('.')) == (2 if task_name == 'hotpot' else 1)}

    if len(filter_keys(all_predictions, 'squad')) == 0:
        results = {}  # No SQuAD data in evaluation set
    else:
        squad_output_prediction_file = os.path.join(write_dir, "squad_predictions_{}.json".format(prefix))
        with open(squad_output_prediction_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_predictions, 'squad'), indent=2))
        squad_output_nbest_file = os.path.join(write_dir, "squad_nbest_predictions_{}.json".format(prefix))
        with open(squad_output_nbest_file, 'w') as writer:
            writer.write(json.dumps(filter_keys(all_nbest_predictions, 'squad'), indent=2))
        if args.version_2_with_negative:
            squad_output_null_log_odds_file = os.path.join(write_dir, "squad_null_odds_{}.json".format(prefix))
            with open(squad_output_null_log_odds_file, 'w') as writer:
                writer.write(json.dumps(filter_keys(all_null_odds, 'squad'), indent=2))
        else:
            squad_output_null_log_odds_file = None
        predict_file_parts = args.predict_file.split('/')
        squad_predict_file = '/'.join(predict_file_parts[:-2] + ['squad', predict_file_parts[-1]])
        evaluate_options = EVAL_OPTS(data_file=squad_predict_file,
                                     pred_file=squad_output_prediction_file,
                                     na_prob_file=squad_output_null_log_odds_file)
        results = evaluate_on_squad(evaluate_options)

    # Check if HotpotQA answer file exists to do HotpotQA evaluation
    hotpot_answer_file_parts = args.predict_file.split('/')
    hotpot_answer_file_parts[-2] = 'hotpot-orig'
    hotpot_answer_file = '/'.join(hotpot_answer_file_parts)
    if (not args.no_answer_file) and (not os.path.exists(hotpot_answer_file)):
        with open(os.path.join(write_dir, "squad_results_{}.json".format(prefix)), "w") as writer:
            writer.write(json.dumps(results, indent=2, sort_keys=True))
        return results

    # Evaluate with official HotpotQA script
    nbest_predictions = filter_keys(all_nbest_predictions, 'hotpot')
    null_odds = filter_keys(all_null_odds, 'hotpot')

    qids = {single_hop_qid.split('.')[0] for single_hop_qid in nbest_predictions.keys()}
    pred_answers_and_sps = {'answer': {}, 'sp': {}}
    globally_normed_pred_answers_and_sps = {'answer': {}, 'sp': {}}
    pred_infos = {}
    globally_normed_pred_infos = {}
    max_num_paragraphs = 10
    for qid in qids:
        # Find paragraph with answer prediction
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            if (single_hop_qid in null_odds) and (null_odds[single_hop_qid] < min_null_odds):
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid in nbest_predictions:
                for nbest_prediction in nbest_predictions[single_hop_qid]:
                    if (len(nbest_prediction['text']) > 0) and (args.model_type not in ['xlnet', 'xlm']):
                        logit_sum = nbest_prediction['start_logit'] + nbest_prediction['end_logit'] - null_odds[single_hop_qid]
                        if logit_sum > max_logit_sum:
                            globally_normed_pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                            globally_normed_pred_infos[qid] = nbest_prediction
                            max_logit_sum = logit_sum

        # Find/store answer and supporting fact
        pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        globally_normed_pred_answers_and_sps['sp'][qid] = []  # NB: Dummy supporting fact for now
        for nbest_prediction in nbest_predictions[best_single_hop_qid]:
            if len(nbest_prediction['text']) > 0:
                pred_answers_and_sps['answer'][qid] = nbest_prediction['text']
                pred_infos[qid] = nbest_prediction
                break
        assert qid in pred_answers_and_sps['answer'], 'Error: No predicted answer found.'
        # assert qid in globally_normed_pred_answers_and_sps['answer'], 'Error: No globally normed predicted answer found.'

    hotpot_output_prediction_file = os.path.join(write_dir, "hotpot_predictions_{}.json".format(prefix))
    with open(hotpot_output_prediction_file, "w") as writer:
        writer.write(json.dumps(pred_answers_and_sps, indent=2))
    hotpot_results = evaluate_on_hotpot(hotpot_output_prediction_file, hotpot_answer_file) if not args.no_answer_file else {}
    with open(os.path.join(write_dir, "hotpot_predictions_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(pred_infos, indent=2))

    hotpot_output_prediction_gn_file = os.path.join(write_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file) \
        if ((not args.no_answer_file) and (args.model_type not in ['xlnet', 'xlm'])) else {}
    with open(os.path.join(write_dir, "hotpot_predictions_gn_info_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))

    hotpot_results = {k: v * 100. for k, v in hotpot_results.items()}
    hotpot_gn_results = {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}
    results = {'squad_' + k: v for k, v in results.items()}
    results.update(hotpot_results)
    results.update(hotpot_gn_results)
    with open(os.path.join(write_dir, "hotpot_results_{}.json".format(prefix)), "w") as writer:
        writer.write(json.dumps(results, indent=2, sort_keys=True))
    return results
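Example #5 separates HotpotQA predictions from SQuAD ones purely by the question-id format: single-hop HotpotQA ids are written as "<qid>.<paragraph_no>" while SQuAD ids contain no dot. A tiny illustration of the convention that filter_keys relies on, using made-up ids (the id strings and answers are hypothetical):

# Hypothetical prediction ids following the naming scheme above.
all_predictions = {
    "hotpotqa123.0": "yes",           # HotpotQA question, paragraph 0
    "hotpotqa123.3": "no",            # same question, paragraph 3
    "squad456": "Denver Broncos",     # plain SQuAD question
}

squad_only = {qid: v for qid, v in all_predictions.items() if len(qid.split('.')) == 1}
hotpot_only = {qid: v for qid, v in all_predictions.items() if len(qid.split('.')) == 2}
print(squad_only)   # {'squad456': 'Denver Broncos'}
print(hotpot_only)  # {'hotpotqa123.0': 'yes', 'hotpotqa123.3': 'no'}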
Example #6
def do_prediction(model_dir):
    # 1. Load a trained model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set

    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384

    eval_examples = read_squad_examples(input_file=dev_file, is_training=False, version_2_with_negative=False)

    tokenizer = BertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(
                examples=eval_examples,
                tokenizer=tokenizer,
                max_seq_length=max_seq_length,
                doc_stride=128,
                max_query_length=64,
                is_training=False)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

    # 3. Run inference on the test set

    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader):

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():       
            batch_start_logits, batch_end_logits = model(input_ids, input_mask, segment_ids)
                
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                                 start_logits=start_logits,
                                                 end_logits=end_logits))
            
    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")

    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                          30, True, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, True,
                          False, 0.0)
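Example #6 wraps the whole pipeline into one function, so invoking it only requires the directory of a fine-tuned BertForQuestionAnswering checkpoint. A minimal, assumed call:

if __name__ == "__main__":
    # Hypothetical directory holding pytorch_model.bin, config.json and vocab.txt
    # from a SQuAD fine-tuning run; adjust to your own checkpoint.
    do_prediction("models/bert-squad-finetuned")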
Example #7
def evaluate(args, model, tokenizer, checkpoint_id=None, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
    if args.eval_data_subset > 0:
        dataset = Subset(dataset, list(range(min(args.eval_data_subset, len(dataset)))))

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids':  batch[2],
                      }
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id    = unique_id,
                               start_logits = to_list(outputs[0][i]),
                               end_logits   = to_list(outputs[1][i]))
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    output_nbest_file = os.path.join(args.output_dir,
                                     "nbest_predictions_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds_{}.json".format(checkpoint_id if checkpoint_id is not None else prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(examples, features, all_results, args.n_best_size,
                    args.max_answer_length, args.do_lower_case, output_prediction_file,
                    output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                    args.version_2_with_negative, args.null_score_diff_threshold)

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)

    output_eval_file = os.path.join(args.output_dir,
                                    "eval_results_{}.txt".format(checkpoint_id) if checkpoint_id is not None
                                    else "eval_results.txt"
                                    )
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
            writer.write("%s = %s\n" % (key, str(results[key])))

    return results
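Because Example #7 accepts an optional checkpoint_id and names its output files after it, it lends itself to scoring a series of saved checkpoints. A hedged sketch of such a loop; the checkpoint directory layout (args.output_dir/checkpoint-<step>) is an assumption:

import glob
import os


def evaluate_all_checkpoints(args, tokenizer, model_class):
    """Assumed helper: run the evaluate() above over every saved checkpoint."""
    all_results = {}
    for ckpt in sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-*"))):
        checkpoint_id = ckpt.split("-")[-1]
        model = model_class.from_pretrained(ckpt).to(args.device)
        all_results[checkpoint_id] = evaluate(args, model, tokenizer,
                                              checkpoint_id=checkpoint_id)
    return all_results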
Example #8
def predict(args, model, tokenizer, prefix="test"):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    pred_dataloader = DataLoader(dataset,
                                 batch_size=args.pred_batch_size,
                                 shuffle=False)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.pred_batch_size)
    all_results = []
    for batch in tqdm(pred_dataloader, desc="Predicting"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': None if args.model_type == 'xlm' else
                batch[2]  # XLM doesn't use segment_ids
            }
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4], 'p_mask': batch[5]})
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            pred_feature = features[example_index.item()]
            unique_id = int(pred_feature.unique_id)
            if args.model_type in ['xlnet', 'xlm']:
                # XLNet uses a more complex post-processing procedure
                result = RawResultExtended(
                    unique_id=unique_id,
                    start_top_log_probs=to_list(outputs[0][i]),
                    start_top_index=to_list(outputs[1][i]),
                    end_top_log_probs=to_list(outputs[2][i]),
                    end_top_index=to_list(outputs[3][i]),
                    cls_logits=to_list(outputs[4][i]))
            else:
                result = RawResult(unique_id=unique_id,
                                   start_logits=to_list(outputs[0][i]),
                                   end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.save_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.save_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.save_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    if args.model_type in ['xlnet', 'xlm']:
        # XLNet uses a more complex post-processing procedure
        write_predictions_extended(
            examples, features, all_results, args.n_best_size,
            args.max_answer_length, output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.predict_file,
            model.config.start_n_top, model.config.end_n_top,
            args.version_2_with_negative, tokenizer, args.verbose_logging)
    else:
        write_predictions(examples, features, all_results, args.n_best_size,
                          args.max_answer_length, args.do_lower_case,
                          output_prediction_file, output_nbest_file,
                          output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative,
                          args.null_score_diff_threshold)

    output_pred_file = os.path.join(args.save_dir, 'submit.csv')
    convert_json_to_csv(output_nbest_file, output_pred_file,
                        args.max_answer_length)
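convert_json_to_csv is project-specific and not shown. Assuming it reads the n-best predictions file written above (question id mapped to a list of candidates with "text" and "probability" fields), a plausible minimal stand-in could look like this; the real helper may use different columns or filtering:

import csv
import json


def convert_json_to_csv(nbest_file, csv_file, max_answer_length):
    """Hypothetical re-implementation: keep the top n-best answer per question id."""
    with open(nbest_file) as f:
        nbest = json.load(f)  # qid -> list of {"text", "probability", "start_logit", "end_logit"}
    with open(csv_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "answer"])
        for qid, candidates in nbest.items():
            best_text = candidates[0]["text"] if candidates else ""
            # max_answer_length is accepted for signature parity only; answer spans
            # were already capped upstream by write_predictions.
            writer.writerow([qid, best_text])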
Example #9
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(
        dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    write_predictions(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        args.do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        args.verbose_logging,
        args.version_2_with_negative,
        args.null_score_diff_threshold,
    )

    # Evaluate with the official SQuAD script
    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
                                 pred_file=output_prediction_file,
                                 na_prob_file=output_null_log_odds_file)
    results = evaluate_on_squad(evaluate_options)
    return results
Example #10
def decode_ouput(outputs, features, examples, example_indices):
    """ Helps in decoding the model ouput to nbest probabilites and start , end word indexes """
    eval_feature = features[example_indices[0].item()]
    unique_id = int(eval_feature.unique_id)
    result = RawResult(unique_id=unique_id,
                       start_logits=to_list(outputs[0][0]),
                       end_logits=to_list(outputs[1][0]))
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])
    start_indexes = _get_best_indexes(to_list(outputs[0][0]), 20)
    end_indexes = _get_best_indexes(to_list(outputs[1][0]), 20)
    feature = eval_feature
    prelim_predictions = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # We could hypothetically create invalid predictions, e.g., predict
            # that the start of the span is in the question. We throw out all
            # invalid predictions.
            if start_index >= len(feature.tokens):
                continue
            if end_index >= len(feature.tokens):
                continue
            if start_index not in feature.token_to_orig_map:
                continue
            if end_index not in feature.token_to_orig_map:
                continue
            if not feature.token_is_max_context.get(start_index, False):
                continue
            if end_index < start_index:
                continue
            length = end_index - start_index + 1
            if length > 30:
                continue
            prelim_predictions.append(
                _PrelimPrediction(feature_index=0,
                                  start_index=start_index,
                                  end_index=end_index,
                                  start_logit=result.start_logits[start_index],
                                  end_logit=result.end_logits[end_index]))
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])
    seen_predictions = {}
    nbest = []
    for pred in prelim_predictions:
        if len(nbest) >= 30:
            break
        feature = features[pred.feature_index]
        if pred.start_index > 0:  # this is a non-null prediction
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = examples[0].doc_tokens[orig_doc_start:(orig_doc_end +
                                                                 1)]
            tok_text = " ".join(tok_tokens)

            # De-tokenize WordPieces that have been split off.
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

            final_text = get_final_text(tok_text, orig_text, True, True)
            if final_text in seen_predictions:
                continue

            seen_predictions[final_text] = True
        else:
            final_text = ""
            seen_predictions[final_text] = True

        nbest.append(
            _NbestPrediction(text=final_text,
                             start_logit=pred.start_logit,
                             end_logit=pred.end_logit))

    total_scores = []
    best_non_null_entry = None
    for entry in nbest:
        total_scores.append(entry.start_logit + entry.end_logit)
        if not best_non_null_entry:
            if entry.text:
                best_non_null_entry = entry

    probs = _compute_softmax(total_scores)

    nbest_json = []
    for (i, entry) in enumerate(nbest):
        output = collections.OrderedDict()
        output["text"] = entry.text
        output["probability"] = probs[i]
        output["start_logit"] = entry.start_logit
        output["end_logit"] = entry.end_logit
        nbest_json.append(output)
    return nbest_json, nbest_json[0]["text"]


# context = "Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24\u201310 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \"golden anniversary\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \"Super Bowl L\"), so that the logo could prominently feature the Arabic numerals 50."
# question = "Which NFL team represented the AFC at Super Bowl 50?"
# model_path = "pretrained/"
# model = load_model(model_path)
# inputs,features,examples,example_indices = feature_extract(context,question)
# ouputs = model(**inputs)
# nbest,best = decode_ouput(ouputs,features,examples,example_indices)
# print(best)
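decode_ouput also leans on _get_best_indexes, _compute_softmax and get_final_text from the same legacy SQuAD utilities. A sketch of the two small numeric helpers, assuming the conventional definitions:

import math


def _get_best_indexes(logits, n_best_size):
    # Indexes of the n_best_size largest logits, best first.
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
    return [index for index, _ in index_and_score[:n_best_size]]


def _compute_softmax(scores):
    # Numerically stable softmax over a plain Python list of scores.
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(score - max_score) for score in scores]
    total_sum = sum(exp_scores)
    return [score / total_sum for score in exp_scores]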