Example #1
    def test_sum_qe(self):
        """
        Verify that SumQE runs. These scores haven't been verified to be
        accurate, but the test will catch any change in the metric's output.
        """
        sumqe = SumQE(environment_name='SumQE')
        centroid = load_summaries(_centroid_file_path)

        # It's quite slow, so we only run a few examples here
        centroid = centroid[:5]
        scores = sumqe.score_all(centroid)
        assert scores == pytest.approx(
            [
                {
                    "SumQE": {
                        "Q1": 0.8985345363616943,
                        "Q2": 0.9253203272819519,
                        "Q3": 0.8012534379959106,
                        "Q4": 0.871218204498291,
                        "Q5": 0.6108772158622742
                    }
                },
                {
                    "SumQE": {
                        "Q1": 0.7544711828231812,
                        "Q2": 0.8587688207626343,
                        "Q3": 0.9127543568611145,
                        "Q4": 0.8986099362373352,
                        "Q5": 0.623852014541626
                    }
                },
                {
                    "SumQE": {
                        "Q1": 0.9851462244987488,
                        "Q2": 0.8688598275184631,
                        "Q3": 0.942189633846283,
                        "Q4": 0.8591314554214478,
                        "Q5": 0.5004895925521851
                    }
                },
                {
                    "SumQE": {
                        "Q1": 0.3283337950706482,
                        "Q2": 0.8776571750640869,
                        "Q3": 0.8603634834289551,
                        "Q4": 0.8669484853744507,
                        "Q5": 0.44943714141845703
                    }
                },
                {
                    "SumQE": {
                        "Q1": 0.6950153112411499,
                        "Q2": 1.0309709310531616,
                        "Q3": 0.6369255781173706,
                        "Q4": 0.5551949143409729,
                        "Q5": 0.49241942167282104
                    }
                }], abs=1e-4)
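As used in this test, score_all appears to return one nested dict per input summary, keyed first by the metric name and then by the five SumQE quality dimensions. A hedged shape check along those lines:

    # Hedged sketch: verify only the structure implied by the expected values
    # above (one dict per summary, keys Q1..Q5), not the exact scores.
    scores = sumqe.score_all(centroid)
    assert len(scores) == len(centroid)
    assert set(scores[0]['SumQE'].keys()) == {'Q1', 'Q2', 'Q3', 'Q4', 'Q5'}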

    def test_multi_all(self):
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)
        classy04 = load_summaries(_classy04_file_path)
        classy11 = load_summaries(_classy11_file_path)

        rouge = Rouge(max_words=100)

        summaries_list = list(zip(*[centroid, classy04, classy11]))
        metrics_lists = rouge.score_multi_all(summaries_list, duc2004)
        metrics_lists = list(zip(*metrics_lists))
        metrics_list = [rouge.aggregate(metrics_list) for metrics_list in metrics_lists]

        expected_metrics_list = []
        for dataset in [centroid, classy04, classy11]:
            expected_metrics_list.append(rouge.aggregate(rouge.score_all(dataset, duc2004)))

        assert metrics_list == expected_metrics_list
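The zip(*...) idiom above does double duty: it first groups the three systems' summaries per instance for score_multi_all, then transposes the per-instance results back into per-system lists. A minimal, metric-free illustration:

    # zip(*rows) transposes a list of rows into a list of columns; applying it
    # twice round-trips (up to tuples), which is what test_multi_all relies on.
    per_system = [['c-1', 'c-2'], ['04-1', '04-2'], ['11-1', '11-2']]
    per_instance = list(zip(*per_system))  # [('c-1', '04-1', '11-1'), ('c-2', '04-2', '11-2')]
    back = list(zip(*per_instance))
    assert back == [tuple(row) for row in per_system]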
Example #3
    def test_bewte_runs(self):
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)

        bewte = BEwTE()
        _, metrics_list = bewte.evaluate(centroid, duc2004)
        assert metrics_list[5]['BEwTE']['precision'] == pytest.approx(54.54545454,
                                                                      abs=1e-5)
        assert metrics_list[5]['BEwTE']['recall'] == pytest.approx(16.50488585,
                                                                   abs=1e-5)
        assert metrics_list[5]['BEwTE']['f1'] == pytest.approx(25.32239681, abs=1e-5)

    def test_hong2014(self):
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)

        use_porter_stemmer = True
        remove_stopwords = False
        compute_rouge_l = True
        max_words = 100
        rouge = Rouge(max_ngram=2,
                      use_porter_stemmer=use_porter_stemmer,
                      remove_stopwords=remove_stopwords,
                      max_words=max_words,
                      compute_rouge_l=compute_rouge_l)
        python_rouge = PythonRouge(use_porter_stemmer=use_porter_stemmer,
                                   remove_stopwords=remove_stopwords,
                                   max_words=max_words,
                                   compute_rouge_l=compute_rouge_l)
        expected_metrics, _ = rouge.evaluate(centroid, duc2004)
        actual_metrics, _ = python_rouge.evaluate(centroid, duc2004)
        assert math.isclose(expected_metrics['rouge-1']['precision'],
                            actual_metrics['python-rouge-1']['precision'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-1']['recall'],
                            actual_metrics['python-rouge-1']['recall'],
                            abs_tol=2e-2)
        assert math.isclose(expected_metrics['rouge-1']['f1'],
                            actual_metrics['python-rouge-1']['f1'],
                            abs_tol=2e-2)
        assert math.isclose(expected_metrics['rouge-2']['precision'],
                            actual_metrics['python-rouge-2']['precision'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-2']['recall'],
                            actual_metrics['python-rouge-2']['recall'],
                            abs_tol=1e-2)
        assert math.isclose(expected_metrics['rouge-2']['f1'],
                            actual_metrics['python-rouge-2']['f1'],
                            abs_tol=1e-2)
        # Rouge-L is a little further off, but still close enough that I'm not worried
        assert math.isclose(expected_metrics['rouge-l']['precision'],
                            actual_metrics['python-rouge-l']['precision'],
                            abs_tol=1e-1)
        assert math.isclose(expected_metrics['rouge-l']['recall'],
                            actual_metrics['python-rouge-l']['recall'],
                            abs_tol=1e-1)
        assert math.isclose(expected_metrics['rouge-l']['f1'],
                            actual_metrics['python-rouge-l']['f1'],
                            abs_tol=1e-1)
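The abs_tol arguments above set an absolute tolerance band: math.isclose(a, b) passes when |a - b| <= max(rel_tol * max(|a|, |b|), abs_tol), with rel_tol defaulting to 1e-9. A quick standalone check:

    import math

    # With abs_tol=1e-2 the absolute band dominates: values 0.005 apart pass,
    # values 0.015 apart fail (the default relative tolerance is far too small
    # to matter at this scale).
    assert math.isclose(0.105, 0.110, abs_tol=1e-2)
    assert not math.isclose(0.105, 0.120, abs_tol=1e-2)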
Example #5
    def test_score_multi_all_order(self):
        """Tests to ensure the scoring returns the same results, no matter the order."""
        moverscore = MoverScore()
        duc2004 = load_references(_duc2004_file_path)
        centroid1 = load_summaries(_centroid_file_path)
        centroid2 = list(reversed(centroid1))  # Just create a second fake dataset

        summaries_list = list(zip(*[centroid1, centroid2]))
        metrics_lists1 = moverscore.score_multi_all(summaries_list, duc2004)
        metrics_lists1 = list(zip(*metrics_lists1))

        summaries_list = list(zip(*[centroid2, centroid1]))
        metrics_lists2 = moverscore.score_multi_all(summaries_list, duc2004)
        metrics_lists2 = list(zip(*metrics_lists2))

        metrics_lists2 = list(reversed(metrics_lists2))
        assert metrics_lists1 == metrics_lists2
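The bookkeeping above (zip the two datasets into columns, score, transpose, reverse) is easiest to see with a stand-in scorer. A self-contained sketch of the same invariance check, using a hypothetical length-based metric in place of MoverScore:

    # Hypothetical order-independent scorer: score = summary length.
    def dummy_score_multi_all(summaries_list):
        return [[len(summary) for summary in group] for group in summaries_list]

    centroid1 = ['aaa', 'bb', 'c']
    centroid2 = list(reversed(centroid1))

    lists1 = list(zip(*dummy_score_multi_all(list(zip(centroid1, centroid2)))))
    lists2 = list(zip(*dummy_score_multi_all(list(zip(centroid2, centroid1)))))
    # Swapping the columns and then reversing the recovered per-system lists
    # must be a no-op if scoring is order-independent.
    assert lists1 == list(reversed(lists2))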
Example #6
    def test_mover_score_runs(self):
        # These numbers were not verified to be correct, but the test will
        # detect any change in the code
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)

        moverscore = MoverScore()
        _, metrics_list = moverscore.evaluate(centroid, duc2004)
        assert metrics_list[0]['MoverScore'] == pytest.approx(
            0.24826391722135857, abs=1e-4)
        assert metrics_list[1]['MoverScore'] == pytest.approx(
            0.19464766520457838, abs=1e-4)
        assert metrics_list[2]['MoverScore'] == pytest.approx(
            0.26644948499030685, abs=1e-4)
        assert metrics_list[3]['MoverScore'] == pytest.approx(
            0.21231040174382498, abs=1e-4)
        assert metrics_list[4]['MoverScore'] == pytest.approx(
            0.15387569485290115, abs=1e-4)
Example #7
    def test_score_multi_all_order(self):
        """Tests to ensure the scoring returns the same results, no matter the order."""
        sumqe = SumQE(environment_name='SumQE')
        centroid1 = load_summaries(_centroid_file_path)
        # Just test a few because this is a slow metric
        centroid1 = centroid1[:5]
        centroid2 = list(reversed(centroid1))  # Just create a second fake dataset

        summaries_list = list(zip(*[centroid1, centroid2]))
        metrics_lists1 = sumqe.score_multi_all(summaries_list)
        metrics_lists1 = list(zip(*metrics_lists1))

        summaries_list = list(zip(*[centroid2, centroid1]))
        metrics_lists2 = sumqe.score_multi_all(summaries_list)
        metrics_lists2 = list(zip(*metrics_lists2))

        metrics_lists2 = list(reversed(metrics_lists2))
        assert metrics_lists1 == metrics_lists2

    def test_meteor(self):
        """Verify METEOR runs"""
        # These numbers aren't verified to be correct, but the test will
        # catch any change in the code
        meteor = Meteor()
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)

        system_score, summary_scores = meteor.evaluate(centroid, duc2004)
        assert system_score == pytest.approx({'METEOR': 0.16913291651219267},
                                             abs=1e-4)
        assert summary_scores[:5] == pytest.approx(
            [
                {'METEOR': 0.19100187219134385},
                {'METEOR': 0.155452410194115},
                {'METEOR': 0.1840667990852698},
                {'METEOR': 0.18111121583550865},
                {'METEOR': 0.13502476502555888},
            ],
            abs=1e-5)
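The comparisons above rely on pytest.approx accepting containers of floats, not just scalars. Flat dicts and lists work across modern pytest versions (the deeply nested lists of dicts used in the SumQE test may need a newer pytest release). A standalone check:

    import pytest

    # approx compares dict values and list elements one by one; abs sets the
    # absolute tolerance applied to each element.
    assert {'METEOR': 0.1 + 0.2} == pytest.approx({'METEOR': 0.3}, abs=1e-12)
    assert [0.12345, 0.678] == pytest.approx([0.1234, 0.6781], abs=1e-3)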

    def test_hong2014(self):
        """
        Tests to ensure that the Rouge scores for the summaries from Hong et al. 2014
        (http://www.lrec-conf.org/proceedings/lrec2014/pdf/1093_Paper.pdf) do not
        change. The hard-coded scores are very close to the scores reported in the paper.
        """
        duc2004 = load_references(_duc2004_file_path)
        centroid = load_summaries(_centroid_file_path)
        classy04 = load_summaries(_classy04_file_path)
        classy11 = load_summaries(_classy11_file_path)
        dpp = load_summaries(_dpp_file_path)
        freq_sum = load_summaries(_freq_sum_file_path)
        greedy_kl = load_summaries(_greedy_kl_file_path)
        icsi_summ = load_summaries(_icsi_summ_file_path)
        lexrank = load_summaries(_lexrank_file_path)
        occams_v = load_summaries(_occams_v_file_path)
        reg_sum = load_summaries(_reg_sum_file_path)
        submodular = load_summaries(_submodular_file_path)
        ts_sum = load_summaries(_ts_sum_file_path)

        rouge = Rouge(max_words=100)

        # Reported: 36.41, 7.97, 1.21
        metrics, _ = rouge.evaluate(centroid, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 36.41, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.97, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.21, places=2)

        # Reported: 37.62, 8.96, 1.51
        metrics, _ = rouge.evaluate(classy04, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.61, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.96, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.51, places=2)

        # Reported: 37.22, 9.20, 1.48
        metrics, _ = rouge.evaluate(classy11, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.22, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.20, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.48, places=2)

        # Reported: 39.79, 9.62, 1.57
        metrics, _ = rouge.evaluate(dpp, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.79, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.62, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.57, places=2)

        # Reported: 35.30, 8.11, 1.00
        metrics, _ = rouge.evaluate(freq_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.30, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.11, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.00, places=2)

        # Reported: 37.98, 8.53, 1.26
        metrics, _ = rouge.evaluate(greedy_kl, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 37.98, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.53, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.26, places=2)

        # Reported: 38.41, 9.78, 1.73
        metrics, _ = rouge.evaluate(icsi_summ, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.41, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.78, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.73, places=2)

        # Reported: 35.95, 7.47, 0.82
        metrics, _ = rouge.evaluate(lexrank, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.95, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 7.47, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 0.82, places=2)

        # Reported: 38.50, 9.76, 1.33
        metrics, _ = rouge.evaluate(occams_v, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.50, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.76, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.33, places=2)

        # Reported: 38.57, 9.75, 1.60
        metrics, _ = rouge.evaluate(reg_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 38.56, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.75, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.60, places=2)

        # Reported: 39.18, 9.35, 1.39
        metrics, _ = rouge.evaluate(submodular, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 39.18, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 9.35, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.39, places=2)

        # Reported: 35.88, 8.15, 1.03
        metrics, _ = rouge.evaluate(ts_sum, duc2004)
        self.assertAlmostEqual(metrics['rouge-1']['recall'], 35.88, places=2)
        self.assertAlmostEqual(metrics['rouge-2']['recall'], 8.14, places=2)
        self.assertAlmostEqual(metrics['rouge-4']['recall'], 1.03, places=2)
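The twelve evaluate-then-assert blocks above are structurally identical, so the test could also be written table-driven. A hedged sketch (method name hypothetical, expected recalls copied from the assertions above, first three systems shown):

    def test_hong2014_table_driven(self):
        """Hedged sketch: table-driven variant of the checks above."""
        duc2004 = load_references(_duc2004_file_path)
        rouge = Rouge(max_words=100)
        # (summary file, expected ROUGE-1 / ROUGE-2 / ROUGE-4 recall)
        cases = [
            (_centroid_file_path, 36.41, 7.97, 1.21),
            (_classy04_file_path, 37.61, 8.96, 1.51),
            (_classy11_file_path, 37.22, 9.20, 1.48),
        ]
        for path, r1, r2, r4 in cases:
            metrics, _ = rouge.evaluate(load_summaries(path), duc2004)
            self.assertAlmostEqual(metrics['rouge-1']['recall'], r1, places=2)
            self.assertAlmostEqual(metrics['rouge-2']['recall'], r2, places=2)
            self.assertAlmostEqual(metrics['rouge-4']['recall'], r4, places=2)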