def test_meteor(self):
    # This is a regression test, not necessarily a test for correctness
    metric = Meteor()
    expected_output = [
        {'METEOR': 0.1669124164573825},
        {'METEOR': 0.1948832291162732},
        {'METEOR': 0.20611797558912198},
        {'METEOR': 0.16514495147439157},
        {'METEOR': 0.17681198359839032},
        {'METEOR': 0.1711431947904745},
        {'METEOR': 0.21743329361081287},
        {'METEOR': 0.21450528119487908},
        {'METEOR': 0.16550404185166837},
        {'METEOR': 0.21164463762707278},
        {'METEOR': 0.20412130512572657},
        {'METEOR': 0.18243523574488876},
    ]
    super().assert_expected_output(metric, expected_output)
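# For context, a minimal sketch of the contract assumed for the base-class
# helper `assert_expected_output`: score the shared fixture inputs and compare
# each per-summary metric dict within a tolerance. The `summaries` and
# `references_list` parameters are hypothetical stand-ins for the fixtures the
# base class loads; this is an illustration, not the repository's code.
def _assert_expected_output_sketch(metric, summaries, references_list, expected_output):
    actual_output = metric.score_all(summaries, references_list)
    assert len(actual_output) == len(expected_output)
    for actual, expected in zip(actual_output, expected_output):
        # pytest.approx supports dicts of numbers, so each {'METEOR': ...}
        # pair is compared within the absolute tolerance.
        assert actual == pytest.approx(expected, abs=1e-5)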
def test_score_multi_all_order(self):
    """Tests to ensure the scoring returns the same results, no matter the order."""
    meteor = Meteor()
    duc2004 = load_references(_duc2004_file_path)
    centroid1 = load_summaries(_centroid_file_path)
    centroid2 = list(reversed(centroid1))  # Just create a second fake dataset

    summaries_list = list(zip(*[centroid1, centroid2]))
    metrics_lists1 = meteor.score_multi_all(summaries_list, duc2004)
    metrics_lists1 = list(zip(*metrics_lists1))

    summaries_list = list(zip(*[centroid2, centroid1]))
    metrics_lists2 = meteor.score_multi_all(summaries_list, duc2004)
    metrics_lists2 = list(zip(*metrics_lists2))
    metrics_lists2 = list(reversed(metrics_lists2))

    assert metrics_lists1 == metrics_lists2
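# A note on the shape contract assumed above: `score_multi_all` takes
# `summaries_list` grouped per input instance (one tuple of candidate
# summaries per instance) and returns one list of metric dicts per instance,
# so transposing with `zip` regroups the scores per system. The toy data
# below is hypothetical, for illustration only.
_per_instance = [[{'METEOR': 0.1}, {'METEOR': 0.2}],
                 [{'METEOR': 0.3}, {'METEOR': 0.4}]]
# zip(*rows) turns per-instance rows into per-system columns.
assert list(zip(*_per_instance)) == [({'METEOR': 0.1}, {'METEOR': 0.3}),
                                     ({'METEOR': 0.2}, {'METEOR': 0.4})]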
def test_chen2018(self):
    """
    Tests to ensure that Meteor returns the expected score on the Chen 2018
    data subset. I ran Meteor on the full data (~11k examples), which takes
    too long for a unit test. After confirming the numbers matched what is
    reported in the paper, I ran the code on just the subset, and this test
    ensures those numbers are returned.
    """
    gold_file_path = f'{FIXTURES_ROOT}/data/chen2018/gold.jsonl'
    model_file_path = f'{FIXTURES_ROOT}/data/chen2018/model.jsonl'

    gold = JsonlReader(gold_file_path).read()
    model = JsonlReader(model_file_path).read()

    gold = [[summary['summary']] for summary in gold]
    model = [summary['summary'] for summary in model]

    meteor = Meteor()
    score, _ = meteor.evaluate(model, gold)
    assert score['METEOR'] == pytest.approx(0.1828372, abs=1e-7)
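# For reference, a minimal sketch of what a JSONL reader such as `JsonlReader`
# is assumed to do: parse one JSON object per line of the file. This is an
# illustration under that assumption, not the repository's implementation.
import json

def _read_jsonl_sketch(file_path):
    with open(file_path, 'r') as f:
        # Each non-blank line holds one JSON object, e.g. {"summary": [...]}.
        return [json.loads(line) for line in f if line.strip()]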
def test_meteor(self): """Verify METEOR runs""" # These numbers aren't verified to be correct, but will catch if # the code changes meteor = Meteor() duc2004 = load_references(_duc2004_file_path) centroid = load_summaries(_centroid_file_path) system_score, summary_scores = meteor.evaluate(centroid, duc2004) assert system_score == pytest.approx({'METEOR': 0.16913291651219267}, abs=1e-4) assert summary_scores[:5] == pytest.approx( [{ 'METEOR': 0.19100187219134385 }, { 'METEOR': 0.155452410194115 }, { 'METEOR': 0.1840667990852698 }, { 'METEOR': 0.18111121583550865 }, { 'METEOR': 0.13502476502555888 }], abs=1e-5)
def test_meteor_order_invariant(self):
    metric = Meteor()
    self.assert_order_invariant(metric)
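# A minimal sketch of the property `assert_order_invariant` is assumed to
# check: permuting the inputs permutes the per-summary scores identically.
# The `score_all` call and the helper itself are assumptions for illustration.
import random

def _check_order_invariant_sketch(metric, summaries, references_list, seed=4):
    scores = metric.score_all(summaries, references_list)
    indices = list(range(len(summaries)))
    random.Random(seed).shuffle(indices)
    shuffled = metric.score_all([summaries[i] for i in indices],
                                [references_list[i] for i in indices])
    # Each score should follow its summary through the permutation.
    assert shuffled == [scores[i] for i in indices]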