def test_sum_qe(self):
    """
    Verify SumQE runs. These scores haven't been tested to be accurate, but
    the test will capture if anything changes with the metric.
    """
    sumqe = SumQE(environment_name='SumQE')
    centroid = load_summaries(_centroid_file_path)

    # It's quite slow, so we only run a few examples here
    centroid = centroid[:5]

    scores = sumqe.score_all(centroid)
    assert scores == pytest.approx([
        {
            "SumQE": {
                "Q1": 0.8985345363616943,
                "Q2": 0.9253203272819519,
                "Q3": 0.8012534379959106,
                "Q4": 0.871218204498291,
                "Q5": 0.6108772158622742
            }
        },
        {
            "SumQE": {
                "Q1": 0.7544711828231812,
                "Q2": 0.8587688207626343,
                "Q3": 0.9127543568611145,
                "Q4": 0.8986099362373352,
                "Q5": 0.623852014541626
            }
        },
        {
            "SumQE": {
                "Q1": 0.9851462244987488,
                "Q2": 0.8688598275184631,
                "Q3": 0.942189633846283,
                "Q4": 0.8591314554214478,
                "Q5": 0.5004895925521851
            }
        },
        {
            "SumQE": {
                "Q1": 0.3283337950706482,
                "Q2": 0.8776571750640869,
                "Q3": 0.8603634834289551,
                "Q4": 0.8669484853744507,
                "Q5": 0.44943714141845703
            }
        },
        {
            "SumQE": {
                "Q1": 0.6950153112411499,
                "Q2": 1.0309709310531616,
                "Q3": 0.6369255781173706,
                "Q4": 0.5551949143409729,
                "Q5": 0.49241942167282104
            }
        }
    ], abs=1e-4)
def test_multi_all(self):
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)
    classy04 = load_summaries(_classy04_file_path)
    classy11 = load_summaries(_classy11_file_path)

    rouge = Rouge(max_words=100)

    # Score all three systems jointly, then transpose the results back to
    # one list of metrics per system
    summaries_list = list(zip(*[centroid, classy04, classy11]))
    metrics_lists = rouge.score_multi_all(summaries_list, duc2004)
    metrics_lists = list(zip(*metrics_lists))
    metrics_list = [rouge.aggregate(metrics_list) for metrics_list in metrics_lists]

    # Scoring each system on its own should give identical aggregates
    expected_metrics_list = []
    for dataset in [centroid, classy04, classy11]:
        expected_metrics_list.append(rouge.aggregate(rouge.score_all(dataset, duc2004)))

    assert metrics_list == expected_metrics_list
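# Illustrative sketch only (not one of the original tests): test_multi_all above
# relies on zip(*...) to regroup per-system summary lists into per-instance tuples
# before calling score_multi_all, and then to transpose the per-instance results
# back into per-system lists before aggregation. The function and toy values below
# are hypothetical and independent of any metric.
def _zip_transpose_sketch():
    system_a = ['a1', 'a2', 'a3']  # summaries from a hypothetical system A
    system_b = ['b1', 'b2', 'b3']  # summaries from a hypothetical system B

    # Group the i-th summary of every system together (one tuple per instance)
    per_instance = list(zip(*[system_a, system_b]))
    assert per_instance == [('a1', 'b1'), ('a2', 'b2'), ('a3', 'b3')]

    # Transposing again recovers one tuple per system
    per_system = list(zip(*per_instance))
    assert per_system == [('a1', 'a2', 'a3'), ('b1', 'b2', 'b3')]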
def test_bewte_runs(self):
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)

    bewte = BEwTE()
    _, metrics_list = bewte.evaluate(centroid, duc2004)
    assert metrics_list[5]['BEwTE']['precision'] == pytest.approx(54.54545454, abs=1e-5)
    assert metrics_list[5]['BEwTE']['recall'] == pytest.approx(16.50488585, abs=1e-5)
    assert metrics_list[5]['BEwTE']['f1'] == pytest.approx(25.32239681, abs=1e-5)
def test_hong2014(self):
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)

    use_porter_stemmer = True
    remove_stopwords = False
    compute_rouge_l = True
    max_words = 100

    rouge = Rouge(max_ngram=2,
                  use_porter_stemmer=use_porter_stemmer,
                  remove_stopwords=remove_stopwords,
                  max_words=max_words,
                  compute_rouge_l=compute_rouge_l)
    python_rouge = PythonRouge(use_porter_stemmer=use_porter_stemmer,
                               remove_stopwords=remove_stopwords,
                               max_words=max_words,
                               compute_rouge_l=compute_rouge_l)

    expected_metrics, _ = rouge.evaluate(centroid, duc2004)
    actual_metrics, _ = python_rouge.evaluate(centroid, duc2004)

    assert math.isclose(expected_metrics['rouge-1']['precision'], actual_metrics['python-rouge-1']['precision'], abs_tol=1e-2)
    assert math.isclose(expected_metrics['rouge-1']['recall'], actual_metrics['python-rouge-1']['recall'], abs_tol=2e-2)
    assert math.isclose(expected_metrics['rouge-1']['f1'], actual_metrics['python-rouge-1']['f1'], abs_tol=2e-2)
    assert math.isclose(expected_metrics['rouge-2']['precision'], actual_metrics['python-rouge-2']['precision'], abs_tol=1e-2)
    assert math.isclose(expected_metrics['rouge-2']['recall'], actual_metrics['python-rouge-2']['recall'], abs_tol=1e-2)
    assert math.isclose(expected_metrics['rouge-2']['f1'], actual_metrics['python-rouge-2']['f1'], abs_tol=1e-2)

    # Rouge-L is a little further off, but still reasonably close enough that I'm not worried
    assert math.isclose(expected_metrics['rouge-l']['precision'], actual_metrics['python-rouge-l']['precision'], abs_tol=1e-1)
    assert math.isclose(expected_metrics['rouge-l']['recall'], actual_metrics['python-rouge-l']['recall'], abs_tol=1e-1)
    assert math.isclose(expected_metrics['rouge-l']['f1'], actual_metrics['python-rouge-l']['f1'], abs_tol=1e-1)
def test_score_multi_all_order(self):
    """Tests to ensure the scoring returns the same results, no matter the order."""
    moverscore = MoverScore()
    duc2004 = load_references(_duc2004_file_path)
    centroid1 = load_summaries(_centroid_file_path)
    centroid2 = list(reversed(centroid1))  # Just create a second fake dataset

    summaries_list = list(zip(*[centroid1, centroid2]))
    metrics_lists1 = moverscore.score_multi_all(summaries_list, duc2004)
    metrics_lists1 = list(zip(*metrics_lists1))

    summaries_list = list(zip(*[centroid2, centroid1]))
    metrics_lists2 = moverscore.score_multi_all(summaries_list, duc2004)
    metrics_lists2 = list(zip(*metrics_lists2))
    metrics_lists2 = list(reversed(metrics_lists2))

    assert metrics_lists1 == metrics_lists2
def test_mover_score_runs(self):
    # These numbers were not checked to be correct, but will detect if anything
    # changes in the code
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)

    moverscore = MoverScore()
    _, metrics_list = moverscore.evaluate(centroid, duc2004)
    assert metrics_list[0]['MoverScore'] == pytest.approx(0.24826391722135857, abs=1e-4)
    assert metrics_list[1]['MoverScore'] == pytest.approx(0.19464766520457838, abs=1e-4)
    assert metrics_list[2]['MoverScore'] == pytest.approx(0.26644948499030685, abs=1e-4)
    assert metrics_list[3]['MoverScore'] == pytest.approx(0.21231040174382498, abs=1e-4)
    assert metrics_list[4]['MoverScore'] == pytest.approx(0.15387569485290115, abs=1e-4)
def test_score_multi_all_order(self):
    """Tests to ensure the scoring returns the same results, no matter the order."""
    sumqe = SumQE(environment_name='SumQE')
    centroid1 = load_summaries(_centroid_file_path)

    # Just test a few because this is a slow metric
    centroid1 = centroid1[:5]
    centroid2 = list(reversed(centroid1))  # Just create a second fake dataset

    summaries_list = list(zip(*[centroid1, centroid2]))
    metrics_lists1 = sumqe.score_multi_all(summaries_list)
    metrics_lists1 = list(zip(*metrics_lists1))

    summaries_list = list(zip(*[centroid2, centroid1]))
    metrics_lists2 = sumqe.score_multi_all(summaries_list)
    metrics_lists2 = list(zip(*metrics_lists2))
    metrics_lists2 = list(reversed(metrics_lists2))

    assert metrics_lists1 == metrics_lists2
def test_meteor(self):
    """Verify METEOR runs"""
    # These numbers aren't verified to be correct, but will catch if
    # the code changes
    meteor = Meteor()
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)

    system_score, summary_scores = meteor.evaluate(centroid, duc2004)
    assert system_score == pytest.approx({'METEOR': 0.16913291651219267}, abs=1e-4)
    assert summary_scores[:5] == pytest.approx([
        {'METEOR': 0.19100187219134385},
        {'METEOR': 0.155452410194115},
        {'METEOR': 0.1840667990852698},
        {'METEOR': 0.18111121583550865},
        {'METEOR': 0.13502476502555888}
    ], abs=1e-5)
def test_hong2014(self):
    """
    Tests to ensure that the Rouge scores for the summaries from Hong et al. 2014
    (http://www.lrec-conf.org/proceedings/lrec2014/pdf/1093_Paper.pdf) do not change.
    The hard-coded scores are very close to the scores reported in the paper.
    """
    duc2004 = load_references(_duc2004_file_path)
    centroid = load_summaries(_centroid_file_path)
    classy04 = load_summaries(_classy04_file_path)
    classy11 = load_summaries(_classy11_file_path)
    dpp = load_summaries(_dpp_file_path)
    freq_sum = load_summaries(_freq_sum_file_path)
    greedy_kl = load_summaries(_greedy_kl_file_path)
    icsi_summ = load_summaries(_icsi_summ_file_path)
    lexrank = load_summaries(_lexrank_file_path)
    occams_v = load_summaries(_occams_v_file_path)
    reg_sum = load_summaries(_reg_sum_file_path)
    submodular = load_summaries(_submodular_file_path)
    ts_sum = load_summaries(_ts_sum_file_path)

    rouge = Rouge(max_words=100)

    # Reported: 36.41, 7.97, 1.21
    metrics, _ = rouge.evaluate(centroid, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 36.41, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.97, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.21, places=2)

    # Reported: 37.62, 8.96, 1.51
    metrics, _ = rouge.evaluate(classy04, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.61, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.96, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.51, places=2)

    # Reported: 37.22, 9.20, 1.48
    metrics, _ = rouge.evaluate(classy11, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.22, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.20, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.48, places=2)

    # Reported: 39.79, 9.62, 1.57
    metrics, _ = rouge.evaluate(dpp, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.79, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.62, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.57, places=2)

    # Reported: 35.30, 8.11, 1.00
    metrics, _ = rouge.evaluate(freq_sum, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.30, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.11, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.00, places=2)

    # Reported: 37.98, 8.53, 1.26
    metrics, _ = rouge.evaluate(greedy_kl, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.98, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.53, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.26, places=2)

    # Reported: 38.41, 9.78, 1.73
    metrics, _ = rouge.evaluate(icsi_summ, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.41, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.78, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.73, places=2)

    # Reported: 35.95, 7.47, 0.82
    metrics, _ = rouge.evaluate(lexrank, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.95, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.47, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 0.82, places=2)

    # Reported: 38.50, 9.76, 1.33
    metrics, _ = rouge.evaluate(occams_v, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.50, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.76, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.33, places=2)

    # Reported: 38.57, 9.75, 1.60
    metrics, _ = rouge.evaluate(reg_sum, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.56, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.75, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.60, places=2)

    # Reported: 39.18, 9.35, 1.39
    metrics, _ = rouge.evaluate(submodular, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.18, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.35, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.39, places=2)

    # Reported: 35.88, 8.15, 1.03
    metrics, _ = rouge.evaluate(ts_sum, duc2004)
    self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.88, places=2)
    self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.14, places=2)
    self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.03, places=2)