def test_ngrams(self):
    """Verify narrativeqa n-gram metrics for partial- and full-overlap pairs.

    Each case is (prediction, references, expected metric tuple), where the
    expected tuple is (bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r) as
    returned by ``get_metric_narrativeqa``.
    """
    cases = [
        (
            "David Thomas was eating an apple",
            ["David Thomas was eating an apple and fell to the ground"],
            (0.43, 0.43, 0.57, 0.75, 1.0, 0.6),
        ),
        (
            "David Thomas was eating an apple and fell to the ground",
            ["David Thomas was eating an apple", "he fell to the ground"],
            (0.55, 0.38, 0.92, 0.75, 0.6, 1.0),
        ),
        (
            "David Thomas was eating an apple and fell to the ground",
            ["David Thomas was eating an apple and fell to the ground"],
            (1.0, 1.0, 1.0, 1.0, 1.0, 1.0),
        ),
    ]
    for prediction, references, expected in cases:
        assert get_metric_narrativeqa(prediction, references) == expected
def evaluate_dataset(dataset_name, prediction, ground_truths, metrics):
    """Compute the metrics appropriate for *dataset_name* and fold them into *metrics*.

    Parameters
    ----------
    dataset_name : str
        One of the supported dataset keys: squad-style extractive sets,
        drop-style extractive sets, or ``"narrativeqa"``.
    prediction : str or list
        Model prediction; if a list, only its first element is scored.
    ground_truths : list
        Reference answers; only each truth's first element is used.
    metrics : dict
        Running metrics accumulator passed through to the update helpers.

    Returns
    -------
    dict
        The updated metrics accumulator.

    Raises
    ------
    ValueError
        If *dataset_name* is not a recognized dataset.
    """
    # Unwrap a list-wrapped prediction exactly once, up front.  (The original
    # repeated this unwrap redundantly inside the narrativeqa branch.)
    prediction = prediction[0] if isinstance(prediction, list) else prediction

    squad_style = {
        "squad1", "squad2", "ropes", "newsqa", "duorc",
        "squad1_syn", "ropes_syn", "newsqa_syn", "duorc_syn",
    }
    drop_style = {"drop", "quoref", "drop_syn", "quoref_syn"}

    # Only each truth's first element is compared against the prediction.
    if dataset_name in squad_style:
        exact_match, f1 = get_metric_squad(prediction, [truth[0] for truth in ground_truths])
        metrics = update_extractive_metrics(metrics, dataset_name, exact_match, f1)
    elif dataset_name in drop_style:
        exact_match, f1 = get_metric_drop(prediction, [truth[0] for truth in ground_truths])
        metrics = update_extractive_metrics(metrics, dataset_name, exact_match, f1)
    elif dataset_name == "narrativeqa":
        references = [truth[0] for truth in ground_truths]
        bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r = get_metric_narrativeqa(
            prediction, references
        )
        metrics = update_abstractive_metrics(
            metrics, bleu1, bleu4, meteor, rouge_f, rouge_p, rouge_r
        )
    else:
        # Carry the diagnostic in the exception itself instead of printing
        # to stdout and raising a message-less ValueError.
        raise ValueError("Incorrect dataset name at :{0}".format(dataset_name))
    return metrics