# Example 1
 def test_ngrams(self):
     """Verify the (bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r) tuple
     returned by get_metric_narrativeqa on known prediction/reference pairs."""
     cases = [
         (
             "David Thomas was eating an apple",
             ["David Thomas was eating an apple and fell to the ground"],
             (0.43, 0.43, 0.57, 0.75, 1.0, 0.6),
         ),
         (
             "David Thomas was eating an apple and fell to the ground",
             ["David Thomas was eating an apple", "he fell to the ground"],
             (0.55, 0.38, 0.92, 0.75, 0.6, 1.0),
         ),
         (
             "David Thomas was eating an apple and fell to the ground",
             ["David Thomas was eating an apple and fell to the ground"],
             (1.0, 1.0, 1.0, 1.0, 1.0, 1.0),
         ),
     ]
     for prediction, references, expected in cases:
         assert get_metric_narrativeqa(prediction, references) == expected
# Example 2
def evaluate_dataset(dataset_name, prediction, ground_truths, metrics):
    """Score one prediction against its gold answers and fold the result into *metrics*.

    Args:
        dataset_name: Which dataset's metric family to apply (e.g. "squad1",
            "drop", "narrativeqa"); see the tuples below for the full list.
        prediction: Model output; a single-element list is unwrapped to its
            first element before scoring.
        ground_truths: Iterable of gold-answer entries; only the first element
            of each entry is used as a reference.
        metrics: Accumulator passed through to the update_* helpers.

    Returns:
        The updated metrics accumulator.

    Raises:
        ValueError: If dataset_name is not one of the supported datasets.
    """
    extractive_squad = (
        "squad1",
        "squad2",
        "ropes",
        "newsqa",
        "duorc",
        "squad1_syn",
        "ropes_syn",
        "newsqa_syn",
        "duorc_syn",
    )
    extractive_drop = ("drop", "quoref", "drop_syn", "quoref_syn")

    # Validate up front so the failure carries the offending name in the
    # exception itself (the original printed it and raised a bare ValueError).
    if dataset_name not in extractive_squad + extractive_drop + ("narrativeqa",):
        raise ValueError("Incorrect dataset name: {0}".format(dataset_name))

    # Predictions may arrive wrapped in a single-element list; unwrap once here
    # (the original repeated this unwrap inside the narrativeqa branch).
    prediction = prediction[0] if isinstance(prediction, list) else prediction
    # Every metric consumes only the first gold answer of each entry.
    references = [truth[0] for truth in ground_truths]

    if dataset_name in extractive_squad:
        exact_match, f1 = get_metric_squad(prediction, references)
        metrics = update_extractive_metrics(metrics, dataset_name, exact_match, f1)
    elif dataset_name in extractive_drop:
        exact_match, f1 = get_metric_drop(prediction, references)
        metrics = update_extractive_metrics(metrics, dataset_name, exact_match, f1)
    else:  # narrativeqa
        bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r = get_metric_narrativeqa(
            prediction, references
        )
        metrics = update_abstractive_metrics(
            metrics, bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r
        )

    return metrics