Example #1
    def score_multi_all(
        self, summaries_list: List[List[SummaryField]],
        references_list: List[ReferencesField]
    ) -> List[List[MetricsDict]]:
        # Just take the summaries themselves, not the fields
        summaries_list = [[field.summary for field in fields]
                          for fields in summaries_list]
        references_list = [field.references for field in references_list]

        with TemporaryDirectory() as temp_dir:
            self._save_summaries(temp_dir, summaries_list, references_list)

            self._run_step1(temp_dir)
            self._run_step2(temp_dir)
            self._run_step3(temp_dir)
            stdout = self._run_step4(temp_dir)

            # The original code scores a summary given multiple references in a weird
            # way: it multiplies the highest recall score by (num_references - 1),
            # adds that to the second-to-last score, and divides by num_references.
            # (See https://github.com/igorbrigadir/ROUGE-BEwTE/blob/f69a85556c889b805c89c5c71d7b77a983e75a05/src/main/java/bewte/BEwT_E.java#L419)
            # I don't understand this because it depends on the order in which the
            # summaries are processed. We instead compute the average over the references.
            metrics_lists = self._parse_stdout(stdout)
            return metrics_lists
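The averaging mentioned in the comment can be illustrated with a small, self-contained sketch. The helper names and the sample recall values are illustrative only (not part of sacrerouge or BEwTE), and the first function is just a literal reading of the formula described above, taking "the second to last score" as the penultimate element:

from typing import List


def bewte_style_aggregate(recalls: List[float]) -> float:
    # Literal reading of the original formula: weight the highest recall by
    # (num_references - 1), add the second-to-last score, divide by num_references.
    # The result depends on the order of `recalls`.
    num_references = len(recalls)
    return (max(recalls) * (num_references - 1) + recalls[-2]) / num_references


def average_aggregate(recalls: List[float]) -> float:
    # Order-independent alternative used here: the mean over all references.
    return sum(recalls) / len(recalls)


print(bewte_style_aggregate([0.4, 0.7, 0.5]))  # 0.7 for this ordering
print(average_aggregate([0.4, 0.7, 0.5]))      # 0.5333...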
Example #2
    def score_multi_all(self, summaries_list: List[List[SummaryType]],
                        references_list: List[List[ReferenceType]],
                        **kwargs) -> List[List[MetricsDict]]:
        summaries_list = self._flatten_summaries(summaries_list)
        references_list = self._flatten_summaries(references_list)

        logger.info(f'Serializing the summaries and references to a file')
        num_summaries = 0
        with TemporaryDirectory() as temp_dir:
            input_file = f'{temp_dir}/input.jsonl'
            output_file = f'{temp_dir}/output.jsonl'
            with JsonlWriter(input_file) as out:
                for summaries, references in zip(summaries_list,
                                                 references_list):
                    for summary in summaries:
                        out.write({
                            'summary': summary,
                            'references': references
                        })
                        num_summaries += 1
            logger.info(f'Wrote {num_summaries} (summary, references) pairs')

            commands = [f'cd {self.s3_root}/S3']
            if self.environment_name is not None:
                commands.append(f'source {os.environ["CONDA_INIT"]}')
                commands.append(f'conda activate {self.environment_name}')
            commands.append(
                f'python2.7 run_batch.py {input_file} {output_file} {self.embeddings_file} {self.model_dir}'
            )
            command = ' && '.join(commands)

            logger.info(f'Running command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            process.communicate()

            scores = JsonlReader(output_file).read()
            assert len(scores) == num_summaries

            metrics_list = []
            index = 0
            for summaries in summaries_list:
                metrics_list.append([])
                for _ in summaries:
                    metrics_list[-1].append(
                        MetricsDict({
                            's3': {
                                'pyr': scores[index]['pyr'],
                                'resp': scores[index]['resp'],
                            }
                        }))
                    index += 1
            return metrics_list
Example #3
    def test_correlate_reference(self):
        # We have to use the TAC 2008 data because the MultiLing data is too small for
        # the bootstrapping to work
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'stat-sig-test',
                '--metrics-jsonl-files', _metrics_file_path,
                '--dependent-metric', 'overall_responsiveness', '--metric-A',
                'rouge-1_jk_precision', '--metric-B', 'rouge-2_jk_recall',
                '--summarizer-type', 'all', '--hypothesis-test',
                'bootstrap-both', '--output-file',
                f'{temp_dir}/correlations.json', '--random-seed', '6',
                '--silent'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))

            assert correlations['dependent_metric'] == 'overall_responsiveness'
            assert correlations['metric_A'] == 'rouge-1_jk_precision'
            assert correlations['metric_B'] == 'rouge-2_jk_recall'
            assert correlations['summarizer_type'] == 'all'
            assert correlations['test_method'] == 'bootstrap-both'
            self.assertAlmostEqual(correlations['alpha'], 0.05, places=4)
            assert correlations['two_tailed'] is False
            assert correlations[
                'H0'] == 'r(rouge-1_jk_precision, overall_responsiveness) <= r(rouge-2_jk_recall, overall_responsiveness)'
            assert correlations[
                'H1'] == 'r(rouge-1_jk_precision, overall_responsiveness) > r(rouge-2_jk_recall, overall_responsiveness)'

            assert correlations['summary_level']['pearson']['pvalue'] == 0.829
            assert correlations['summary_level']['pearson'][
                'is_significant'] is False
            assert correlations['summary_level']['spearman']['pvalue'] == 0.938
            assert correlations['summary_level']['spearman'][
                'is_significant'] is False
            assert correlations['summary_level']['kendall']['pvalue'] == 0.929
            assert correlations['summary_level']['kendall'][
                'is_significant'] is False

            assert correlations['system_level']['pearson']['pvalue'] == 0.603
            assert correlations['system_level']['pearson'][
                'is_significant'] is False
            assert correlations['system_level']['spearman']['pvalue'] == 0.945
            assert correlations['system_level']['spearman'][
                'is_significant'] is False
            assert correlations['system_level']['kendall']['pvalue'] == 0.977
            assert correlations['system_level']['kendall'][
                'is_significant'] is False

            assert correlations['global']['pearson']['pvalue'] == 0.49
            assert correlations['global']['pearson']['is_significant'] is False
            assert correlations['global']['spearman']['pvalue'] == 0.831
            assert correlations['global']['spearman']['is_significant'] is False
            assert correlations['global']['kendall']['pvalue'] == 0.811
            assert correlations['global']['kendall']['is_significant'] is False
Example #4
    def test_correlation(self):
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', _metrics_file_path, '--metrics',
                'chaganty2018_overall', 'chaganty2018_rouge-1_recall',
                '--summarizer-type', 'peer', '--output-file',
                f'{temp_dir}/correlations.json', '--silent'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))
Example #5
    def test_numeric_metric(self):
        with TemporaryDirectory() as temp_dir:
            output_file = f'{temp_dir}/metrics.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'score',
                _numeric_config_file_path, output_file
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            metrics_list = JsonlReader(output_file, Metrics).read()

            assert len(metrics_list) == 5
            assert metrics_list[0].instance_id == 'D1'
            assert metrics_list[1].instance_id == 'D1'
            assert metrics_list[2].instance_id == 'D1'
            assert metrics_list[3].instance_id == 'D1'
            assert metrics_list[4].instance_id == 'D1'

            assert metrics_list[0].summarizer_id == '1'
            assert metrics_list[1].summarizer_id == '2'
            assert metrics_list[2].summarizer_id == 'A'
            assert metrics_list[3].summarizer_id == 'B'
            assert metrics_list[4].summarizer_id == 'C'

            assert metrics_list[0].summarizer_type == 'peer'
            assert metrics_list[1].summarizer_type == 'peer'
            assert metrics_list[2].summarizer_type == 'reference'
            assert metrics_list[3].summarizer_type == 'reference'
            assert metrics_list[4].summarizer_type == 'reference'

            # test: 1 * 10 + 1 * 100 + 1 * 1000 == 1110
            # test_jk = ((1 * 10 + 1 * 100) + (1 * 10 + 1 * 1000) + (1 * 100 + 1 * 1000)) / 3 == 740
            assert metrics_list[0].metrics == {'test': 1110, 'test_jk': 740}
            # test: 2 * 10 + 2 * 100 + 2 * 1000 == 2220
            # test_jk = ((2 * 10 + 2 * 100) + (2 * 10 + 2 * 1000) + (2 * 100 + 2 * 1000)) / 3 == 1480
            assert metrics_list[1].metrics == {
                'test': 2220,
                'test_jk': 1480
            }  # 2 * 10 + 2 * 100 + 2 * 1000
            # test_jk = 10 * 100 + 10 * 1000 == 11000
            assert metrics_list[2].metrics == {
                'test_jk': 11000
            }  # 10 * 100 + 10 * 1000
            # test_jk = 100 * 10 + 100 * 1000 == 101000
            assert metrics_list[3].metrics == {
                'test_jk': 101000
            }  # 100 * 10 + 100 * 1000
            # test_jk = 1000 * 10 + 1000 * 100 == 110000
            assert metrics_list[4].metrics == {
                'test_jk': 110000
            }  # 1000 * 10 + 1000 * 100
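The expected test_jk values follow the jackknifing arithmetic in the comments above: the toy "test" metric is recomputed with each reference held out, and the results are averaged. A minimal sketch of that calculation, with illustrative helper names (this is not sacrerouge code; the toy metric simply multiplies the summary's value by each reference's value and sums the products, as the comments show):

from itertools import combinations
from typing import List


def toy_test_metric(summary_value: int, reference_values: List[int]) -> int:
    # Sum of summary_value * reference_value over all references.
    return sum(summary_value * value for value in reference_values)


def jackknifed(summary_value: int, reference_values: List[int]) -> float:
    # Average the metric over every leave-one-out subset of the references.
    subsets = combinations(reference_values, len(reference_values) - 1)
    scores = [toy_test_metric(summary_value, list(subset)) for subset in subsets]
    return sum(scores) / len(scores)


print(toy_test_metric(1, [10, 100, 1000]))  # 1110
print(jackknifed(1, [10, 100, 1000]))       # 740.0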
Example #6
    def test_numeric_metric(self):
        with TemporaryDirectory() as temp_dir:
            macro_file = f'{temp_dir}/macro.json'
            micro_file = f'{temp_dir}/micro.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'evaluate', '--config',
                _numeric_config_file_path, '--macro-output-json', macro_file,
                '--micro-output-jsonl', micro_file, '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            macro_metrics = json.load(open(macro_file, 'r'))
            micro_metrics_list = JsonlReader(micro_file, Metrics).read()

            assert macro_metrics == {'metrics': {'test': 45066}}
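            # 45066 == (1110 + 2220 + 11000 + 101000 + 110000) / 5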

            assert len(micro_metrics_list) == 5
            assert micro_metrics_list[0].instance_id == 'D1'
            assert micro_metrics_list[1].instance_id == 'D1'
            assert micro_metrics_list[2].instance_id == 'D1'
            assert micro_metrics_list[3].instance_id == 'D1'
            assert micro_metrics_list[4].instance_id == 'D1'

            assert micro_metrics_list[0].summarizer_id == '1'
            assert micro_metrics_list[1].summarizer_id == '2'
            assert micro_metrics_list[2].summarizer_id == 'A'
            assert micro_metrics_list[3].summarizer_id == 'B'
            assert micro_metrics_list[4].summarizer_id == 'C'

            assert micro_metrics_list[0].summarizer_type == 'peer'
            assert micro_metrics_list[1].summarizer_type == 'peer'
            assert micro_metrics_list[2].summarizer_type == 'reference'
            assert micro_metrics_list[3].summarizer_type == 'reference'
            assert micro_metrics_list[4].summarizer_type == 'reference'

            assert micro_metrics_list[0].metrics == {
                'test': 1110
            }  # 1 * 10 + 1 * 100 + 1 * 1000
            assert micro_metrics_list[1].metrics == {
                'test': 2220
            }  # 2 * 10 + 2 * 100 + 2 * 1000
            assert micro_metrics_list[2].metrics == {
                'test': 11000
            }  # 10 * 100 + 10 * 1000
            assert micro_metrics_list[3].metrics == {
                'test': 101000
            }  # 100 * 10 + 100 * 1000
            assert micro_metrics_list[4].metrics == {
                'test': 110000
            }  # 1000 * 10 + 1000 * 100
Example #7
    def _run(self,
             summaries_list: List[List[SummaryType]],
             documents_list: List[List[str]]) -> Tuple[List[MetricsDict], List[List[MetricsDict]]]:
        with TemporaryDirectory() as temp_dir:
            mappings_file_path = f'{temp_dir}/mappings.txt'
            with open(mappings_file_path, 'w') as out:
                for i, (summaries, documents) in enumerate(zip(summaries_list, documents_list)):
                    document_dir = f'{temp_dir}/documents/{i}'
                    for j, document in enumerate(documents):
                        document_file_path = f'{document_dir}/{j}.txt'
                        self._save_summary_like(document, document_file_path)

                    for j, summary in enumerate(summaries):
                        summary_file_path = f'{temp_dir}/summaries/{i}-{j}.txt'
                        self._save_summary_like(summary, summary_file_path)

                        out.write(f'{i} {j} {document_dir} {summary_file_path}\n')

            config_file_path = f'{temp_dir}/config'
            with open(config_file_path, 'w') as out:
                perform_stemming = 'Y' if self.use_stemmer else 'N'
                out.write(f'performStemming = {perform_stemming}\n')

                remove_stopwords = 'Y' if self.remove_stopwords else 'N'
                out.write(f'removeStopWords = {remove_stopwords}\n')
                out.write(f'stopFilePath = {self.data_dir}/smart_common_words.txt\n')

                out.write(f'divergence = Y\n')
                out.write(f'frequencyFeatures = Y\n')
                out.write(f'cosineOverlap = Y\n')
                out.write(f'topicWordFeatures = Y\n')
                out.write(f'backgroundCorpusFreqCounts = {self.data_dir}/bgFreqCounts.unstemmed.txt\n')
                out.write(f'backgroundIdfUnstemmed = {self.data_dir}/bgIdfValues.unstemmed.txt\n')
                out.write(f'backgroundIdfStemmed = {self.data_dir}/bgIdfValues.stemmed.txt\n')

            command = [
                'java',
                '-cp', self.jar_path,
                'edu.upenn.seas.simetrix.InputBasedEvaluation',
                mappings_file_path,
                config_file_path
            ]

            logger.info(f'Running SIMetrix command: "{command}"')
            process = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()
            if stderr:
                raise Exception(f'SIMetrix failed with stderr: {stderr.decode()}')

            macro_results = self._parse_macro_file(f'{temp_dir}/mappings.txt.ieval.macro')
            micro_results = self._parse_micro_file(f'{temp_dir}/mappings.txt.ieval.micro')
            return macro_results, micro_results
Example #8
    def _run(
            self, summaries_list: List[List[SummaryType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            summaries_file = f'{temp_dir}/summaries.jsonl'
            predictions_file = f'{temp_dir}/predictions.json'

            # Save all of the summaries to a file
            with JsonlWriter(summaries_file) as out:
                for summaries in summaries_list:
                    for summary in summaries:
                        out.write({'summary': self._flatten_summary(summary)})

            commands = [f'cd {self.sum_qe_root}']
            if self.environment_name:
                commands += [f'source activate {self.environment_name}']
            commands += [
                ' '.join([
                    'python', '-m', 'src.BERT_experiments.predict',
                    summaries_file, self.model_file, predictions_file
                ])
            ]

            redirect = None if self.verbose else PIPE
            process = Popen(' && '.join(commands),
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            stdout, stderr = process.communicate()

            predictions = json.loads(open(predictions_file, 'r').read())

            index = 0
            metrics_lists = []
            for summaries in summaries_list:
                metrics_lists.append([])
                for summary in summaries:
                    preds = predictions[index]
                    metrics_lists[-1].append(
                        MetricsDict({
                            'SumQE': {
                                'Q1': preds[0],
                                'Q2': preds[1],
                                'Q3': preds[2],
                                'Q4': preds[3],
                                'Q5': preds[4]
                            }
                        }))
                    index += 1

            return metrics_lists
Example #9
    def _run(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[SummaryType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            files_tsv_path = f'{temp_dir}/files.tsv'
            with open(files_tsv_path, 'w') as out:
                for i, (summaries, references) in enumerate(
                        zip(summaries_list, references_list)):
                    reference_filenames = []
                    for j, reference in enumerate(references):
                        filename = f'{temp_dir}/references/{i}/{j}.txt'
                        self._save_summary(reference, filename)
                        reference_filenames.append(filename)

                    peer_filenames = []
                    for j, summary in enumerate(summaries):
                        filename = f'{temp_dir}/peers/{i}/{j}.txt'
                        self._save_summary(summary, filename)
                        peer_filenames.append(filename)

                    out.write(
                        f'{",".join(reference_filenames)}\t{",".join(peer_filenames)}\n'
                    )

            output_file = f'{temp_dir}/output.tsv'
            args = ' '.join([
                f'-files={files_tsv_path}', f'-output={output_file}',
                f'-minN={self.min_n}', f'-maxN={self.max_n}',
                f'-dwin={self.d_window}', f'-minScore={self.min_score}',
                f'-maxScore={self.max_score}'
            ])

            commands = [
                f'cd {self.autosummeng_root}',
                f'mvn exec:java@NPowERBatch -Dexec.args=\'{args}\''
            ]
            command = ' && '.join(commands)

            logger.info(f'Running AutoSummENG command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            stdout, stderr = process.communicate()

            return self._parse_output_file(output_file)
Example #10
    def test_plots(self):
        # Tests to ensure plot files exist
        with TemporaryDirectory() as temp_dir:
            system_plot_file = f'{temp_dir}/system.pdf'
            global_plot_file = f'{temp_dir}/global.pdf'
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent',
                '--system-level-output-plot', system_plot_file,
                '--global-output-plot', global_plot_file
            ]
            subprocess.run(command, check=True)
            assert os.path.exists(system_plot_file)
            assert os.path.exists(global_plot_file)
Example #11
    def _run(
        self, summaries_list: List[List[SummaryType]],
        references_list: List[List[SummaryType]]
    ) -> Tuple[MetricsDict, List[List[MetricsDict]]]:
        summaries_list = self._flatten_summaries(summaries_list)
        references_list = self._flatten_summaries(references_list)

        with TemporaryDirectory() as temp_dir:
            # As far as I can tell, the input only allows for one reference
            # per input, so we need to write an instance for every pair and then
            # aggregate the output
            summaries_file = f'{temp_dir}/summaries.txt'
            references_file = f'{temp_dir}/references.txt'
            index = 0
            tuple_to_indices = defaultdict(list)
            with open(summaries_file, 'w') as out_summaries:
                with open(references_file, 'w') as out_references:
                    for i, (summaries, references) in enumerate(
                            zip(summaries_list, references_list)):
                        for j, summary in enumerate(summaries):
                            for reference in references:
                                out_summaries.write(summary + '\n')
                                out_references.write(reference + '\n')
                                tuple_to_indices[(i, j)].append(index)
                                index += 1

            # Run meteor
            command = [
                'java', '-jar',
                f'{self.meteor_root}/meteor-1.5/meteor-1.5.jar',
                summaries_file, references_file, '-l', 'en', '-norm'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()
            if stderr:
                raise Exception(
                    f'Meteor failed with stderr: {stderr.decode()}')

            final_score, individual_scores = self._parse_meteor_stdout(
                stdout.decode())

            macro_metrics = MetricsDict({'METEOR': final_score})
            micro_metrics_list = self._aggregate_summary_scores(
                summaries_list, references_list, tuple_to_indices,
                individual_scores)
            return macro_metrics, micro_metrics_list
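Because Meteor is run on one (summary, reference) pair per line, the per-line scores have to be folded back into one score per summary. A minimal sketch of that aggregation, under the assumption that _aggregate_summary_scores averages the per-reference scores (the helper below is illustrative, not sacrerouge's implementation):

from typing import Dict, List, Tuple


def aggregate_scores(
    tuple_to_indices: Dict[Tuple[int, int], List[int]],
    individual_scores: List[float],
) -> Dict[Tuple[int, int], float]:
    # tuple_to_indices maps (instance index, summary index) to the line indices
    # that were written for that summary, one line per reference.
    aggregated = {}
    for key, indices in tuple_to_indices.items():
        pair_scores = [individual_scores[index] for index in indices]
        aggregated[key] = sum(pair_scores) / len(pair_scores)
    return aggregated


# Summary (0, 0) was paired with two references written at lines 0 and 1.
print(aggregate_scores({(0, 0): [0, 1]}, [0.30, 0.40]))  # {(0, 0): 0.35}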
Example #12
    def test_evaluate(self):
        with TemporaryDirectory() as temp_dir:
            macro_file = f'{temp_dir}/macro.json'
            micro_file = f'{temp_dir}/micro.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'evaluate', _config_file_path,
                macro_file, micro_file, '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            macro_metrics = json.load(open(macro_file, 'r'))
            micro_metrics_list = JsonlReader(micro_file, Metrics).read()

            self._check_macro(macro_metrics)
            self._check_micro_list(micro_metrics_list)
Example #13
    def score_multi_all(self, summaries_list: List[List[SummaryType]],
                        documents_list: List[List[DocumentType]],
                        **kwargs) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            input_dir = f'{temp_dir}/input'
            output_file = f'{temp_dir}/output.json'

            logger.info(f'Serializing data to {input_dir}')
            os.makedirs(input_dir)
            for i, (summaries,
                    documents) in enumerate(zip(summaries_list,
                                                documents_list)):
                instance_dir = f'{input_dir}/{i}'
                documents_dir = f'{instance_dir}/input_docs'
                summaries_dir = f'{instance_dir}/summaries'

                self._save_documents(documents, documents_dir)
                self._save_summaries(summaries, summaries_dir)

            commands = [f'cd {self.supert_root}']
            if self.environment_name is not None:
                commands.append(f'source {os.environ["CONDA_INIT"]}')
                commands.append(f'conda activate {self.environment_name}')
            commands.append(f'python run_batch.py {input_dir} {output_file}')
            command = ' && '.join(commands)

            logger.info(f'Running command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            process.communicate()

            logger.info(f'Loading output from {output_file}')
            output = json.loads(open(output_file, 'r').read())
            metrics_list = []
            for i, summaries in enumerate(summaries_list):
                metrics_list.append([])
                for j in range(len(summaries)):
                    score = output[str(i)][str(j)]
                    # SUPERT will output None if the summary was empty, so we replace that with a 0.0
                    if score is None:
                        score = 0.0
                    metrics_list[-1].append(MetricsDict({'supert': score}))
            return metrics_list
Example #14
    def test_command_line(self):
        # This is a regression test and does not test for accuracy
        with TemporaryDirectory() as temp_dir:
            with open(f'{temp_dir}/A.json', 'w') as out:
                out.write(json.dumps(self.correlations_A))
            with open(f'{temp_dir}/B.json', 'w') as out:
                out.write(json.dumps(self.correlations_B))

            command = [
                'python', '-m', 'sacrerouge', 'stat-sig-test',
                '--summary-level-correlations-A', f'{temp_dir}/A.json',
                '--summary-level-correlations-B', f'{temp_dir}/B.json',
                '--output-file', f'{temp_dir}/results.json', '--silent'
            ]
            subprocess.run(command, check=True)
            results = json.load(open(f'{temp_dir}/results.json', 'r'))
            self._verify_results(results)
Example #15
    def test_correlate_reference(self):
        # This is a regression test for the "correlate" command. It does not test if it's accurate
        # TODO This needs to be a better test. There are too few summarization systems to get interesting
        # correlations
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type',
                'reference', '--output-file', f'{temp_dir}/correlations.json',
                '--silent'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))

            assert correlations['metric1'] == 'rouge-1_jk_precision'
            assert correlations['metric2'] == 'grade'
            assert correlations['summarizer_type'] == 'reference'

            assert correlations['summary_level']['pearson'][
                'r'] == pytest.approx(0.3333, abs=1e-4)
            assert correlations['summary_level']['spearman'][
                'rho'] == pytest.approx(0.3333, abs=1e-4)
            assert correlations['summary_level']['kendall'][
                'tau'] == pytest.approx(0.3333, abs=1e-4)
            assert correlations['summary_level']['num_summary_groups'] == 3

            assert correlations['system_level']['pearson'][
                'r'] == pytest.approx(1.0, abs=1e-4)
            assert correlations['system_level']['spearman'][
                'rho'] == pytest.approx(1.0, abs=1e-4)
            assert correlations['system_level']['kendall'][
                'tau'] == pytest.approx(1.0, abs=1e-4)
            assert correlations['system_level']['num_summarizers'] == 2

            assert correlations['global']['pearson']['r'] == pytest.approx(
                0.6225273481541307, abs=1e-4)
            assert correlations['global']['spearman']['rho'] == pytest.approx(
                0.8285714285714287, abs=1e-4)
            assert correlations['global']['kendall']['tau'] == pytest.approx(
                0.7333333333333333, abs=1e-4)
            assert correlations['global']['num_summaries'] == 6
Example #16
    def test_evaluate(self):
        # This is a regression test and does not ensure correctness
        with TemporaryDirectory() as temp_dir:
            macro_file = f'{temp_dir}/macro.json'
            micro_file = f'{temp_dir}/micro.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'evaluate', '--config',
                _config_file_path, '--macro-output-json', macro_file,
                '--micro-output-jsonl', micro_file, '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            macro_metrics = json.load(open(macro_file, 'r'))
            micro_metrics_list = JsonlReader(micro_file, Metrics).read()

            self._check_macro(macro_metrics)
            self._check_micro_list(micro_metrics_list)
Example #17
    def test_all_summary_level_correlations(self):
        # This is a regression test for the "correlate" command. It does not test if it's accurate
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent'
            ]
            subprocess.run(command, check=True)

            # Check the original correlations
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))

            assert correlations['metric1'] == 'rouge-1_jk_precision'
            assert correlations['metric2'] == 'grade'
            assert correlations['summarizer_type'] == 'all'

            assert correlations['summary_level']['pearson'][
                'r'] == pytest.approx(0.4365526945989437, abs=1e-4)
            assert correlations['summary_level']['spearman'][
                'rho'] == pytest.approx(0.3720759220056127, abs=1e-4)
            assert correlations['summary_level']['kendall'][
                'tau'] == pytest.approx(0.1719691730561296, abs=1e-4)
            assert correlations['summary_level']['num_summary_groups'] == 3

            assert correlations['system_level']['pearson'][
                'r'] == pytest.approx(0.28732601225892834, abs=1e-4)
            assert correlations['system_level']['spearman'][
                'rho'] == pytest.approx(0.19999999999999998, abs=1e-4)
            assert correlations['system_level']['kendall'][
                'tau'] == pytest.approx(0.0, abs=1e-4)
            assert correlations['system_level']['num_summarizers'] == 4

            assert correlations['global']['pearson']['r'] == pytest.approx(
                0.34183806349510004, abs=1e-4)
            assert correlations['global']['spearman']['rho'] == pytest.approx(
                0.4035707976004214, abs=1e-4)
            assert correlations['global']['kendall']['tau'] == pytest.approx(
                0.28603877677367767, abs=1e-4)
            assert correlations['global']['num_summaries'] == 12
Example #18
    def test_evaluate_default(self):
        # I manually ran evaluate with these parameters and this method checks to make sure
        # those values are equal to the output here
        with TemporaryDirectory() as temp_dir:
            macro_file = f'{temp_dir}/macro.json'
            micro_file = f'{temp_dir}/micro.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'python-rouge', 'evaluate',
                macro_file, micro_file, '--dataset-reader', 'reference-based',
                '--input-files', MULTILING_SUMMARIES, '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            macro_metrics = json.load(open(macro_file, 'r'))
            micro_metrics_list = JsonlReader(micro_file, Metrics).read()

            self._check_macro_default(macro_metrics)
            self._check_micro_list_default(micro_metrics_list)
Example #19
    def test_dang_2008_table_6_example(self):
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', _metrics_file_path, '--metrics',
                'overall_responsiveness', 'linguistic_quality',
                '--summarizer-type', 'reference', '--output-file',
                f'{temp_dir}/correlations.json', '--silent'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))
            assert correlations['system_level']['spearman'][
                'rho'] == pytest.approx(0.910, 1e-2)
            assert correlations['system_level']['pearson'][
                'r'] == pytest.approx(0.778, 1e-2)
            assert correlations['system_level']['num_summarizers'] == 8

            # Kendall's tau is not reported in the paper, but this should break if
            # anything changes in the code
            assert correlations['system_level']['kendall'][
                'tau'] == pytest.approx(0.836, 1e-2)
Example #20
    def test_skip_calculations(self):
        # Ensures the flags to skip calculating specific correlations work
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent',
                '--skip-summary-level'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))
            assert 'summary_level' not in correlations

            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent',
                '--skip-system-level'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))
            assert 'system_level' not in correlations

            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent',
                '--skip-global'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))
            assert 'global' not in correlations
Example #21
    def test_correlate_peer(self):
        # This is a regression test for the "correlate" command. It does not test if it's accurate
        # TODO This needs to be a better test. There are too few summarization systems to get interesting
        # correlations
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_precision', 'grade', '--summarizer-type', 'peer',
                '--output-file', f'{temp_dir}/correlations.json', '--silent'
            ]
            subprocess.run(command, check=True)
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))

            assert correlations['summary_level']['pearson'][
                'r'] == pytest.approx(-0.3333, abs=1e-4)
            assert correlations['summary_level']['spearman'][
                'rho'] == pytest.approx(-0.3333, abs=1e-4)
            assert correlations['summary_level']['kendall'][
                'tau'] == pytest.approx(-0.3333, abs=1e-4)
            assert correlations['summary_level']['num_summary_groups'] == 3

            assert correlations['system_level']['pearson'][
                'r'] == pytest.approx(-1.0, abs=1e-4)
            assert correlations['system_level']['spearman'][
                'rho'] == pytest.approx(-1.0, abs=1e-4)
            assert correlations['system_level']['kendall'][
                'tau'] == pytest.approx(-1.0, abs=1e-4)
            assert correlations['system_level']['num_summarizers'] == 2

            assert correlations['global']['pearson']['r'] == pytest.approx(
                -0.33056857901135617, abs=1e-4)
            assert correlations['global']['spearman']['rho'] == pytest.approx(
                -0.3768511731740915, abs=1e-4)
            assert correlations['global']['kendall']['tau'] == pytest.approx(
                -0.27602622373694163, abs=1e-4)
            assert correlations['global']['num_summaries'] == 6
Example #22
    def score_multi_all(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            temp_dir = os.path.abspath(temp_dir)

            summaries_file = f'{temp_dir}/summaries.json'
            questions_file = f'{temp_dir}/questions.jsonl'
            metadata_file = f'{temp_dir}/metadata.json'
            answers_file = f'{temp_dir}/answers.jsonl'

            instance_id_to_reference_ids = self._save_summaries(
                summaries_file, summaries_list, references_list)
            metadata = self._run_preprocess(summaries_file, questions_file,
                                            metadata_file)
            ids_to_scores = self._run_answer_questions(questions_file,
                                                       answers_file)
            metrics_lists = self._get_metrics(summaries_list, references_list,
                                              instance_id_to_reference_ids,
                                              ids_to_scores, metadata)

            return metrics_lists
Example #23
    def _run(
            self, summaries_list: List[List[SummaryType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            summaries_file = f'{temp_dir}/summaries.jsonl'
            predictions_file = f'{temp_dir}/predictions.json'

            # Save all of the summaries to a file, keeping track of the indices
            # of the empty summaries
            empty_summaries = set()
            with JsonlWriter(summaries_file) as out:
                index = 0
                for summaries in summaries_list:
                    for summary in summaries:
                        summary = self._flatten_summary(summary)
                        if len(summary) > 0:
                            out.write({'summary': summary})
                        else:
                            empty_summaries.add(index)
                        index += 1

            commands = [
                f'cd {self.sum_qe_root}', ' '.join([
                    self.python_binary, '-m', 'src.BERT_experiments.predict',
                    summaries_file, self.model_file, predictions_file
                ])
            ]
            command = ' && '.join(commands)

            logger.info(f'Running SumQE command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            stdout, stderr = process.communicate()

            predictions = json.loads(open(predictions_file, 'r').read())

            index = 0
            output_index = 0
            metrics_lists = []
            for summaries in summaries_list:
                metrics_lists.append([])
                for _ in summaries:
                    if index in empty_summaries:
                        metrics_lists[-1].append(
                            MetricsDict({
                                'SumQE': {
                                    'Q1': 0.0,
                                    'Q2': 0.0,
                                    'Q3': 0.0,
                                    'Q4': 0.0,
                                    'Q5': 0.0
                                }
                            }))
                    else:
                        preds = predictions[output_index]
                        metrics_lists[-1].append(
                            MetricsDict({
                                'SumQE': {
                                    'Q1': preds[0],
                                    'Q2': preds[1],
                                    'Q3': preds[2],
                                    'Q4': preds[3],
                                    'Q5': preds[4]
                                }
                            }))
                        output_index += 1
                    index += 1

            return metrics_lists
Example #24
    def _run(
        self, summaries_list: List[List[SummaryType]],
        references_list: List[List[SummaryType]]
    ) -> Tuple[List[MetricsDict], List[List[MetricsDict]]]:
        with TemporaryDirectory() as temp_dir:
            summary_filenames_list = []
            reference_filenames_list = []

            for i, (summaries, references) in enumerate(
                    zip(summaries_list, references_list)):
                summary_filenames_list.append([])
                reference_filenames_list.append([])
                for j, summary in enumerate(summaries):
                    summary_filename = f'{i}/model.{j}.txt'
                    summary_filenames_list[-1].append(summary_filename)
                    self._save_summary(summary,
                                       f'{temp_dir}/{summary_filename}')

                for j, reference in enumerate(references):
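                    # chr(j + 65) maps j = 0, 1, 2, ... to 'A', 'B', 'C', ...,
                    # so references are named gold.A.txt, gold.B.txt, etc.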
                    symbol = chr(j + 65)
                    reference_filename = f'{i}/gold.{symbol}.txt'
                    reference_filenames_list[-1].append(reference_filename)
                    self._save_summary(reference,
                                       f'{temp_dir}/{reference_filename}')

            config_filename = f'{temp_dir}/config.xml'
            self._save_config_file(config_filename, summary_filenames_list,
                                   reference_filenames_list)

            command = [
                self.rouge_script_location, '-e', self.rouge_eval_home, '-n',
                str(self.max_ngram), '-a', '-c', '95', '-r', '1000', '-p',
                '0.5', '-t', '0', '-d'
            ]
            if self.use_porter_stemmer:
                command += ['-m']
            if self.remove_stopwords:
                command += ['-s']
            if self.max_bytes is not None:
                command += ['-b', str(self.max_bytes)]
            if self.max_words is not None:
                command += ['-l', str(self.max_words)]
            if not self.compute_rouge_l:
                command += ['-x']
            if self.skip_bigram_gap_length is not None:
                command += ['-2', str(self.skip_bigram_gap_length), '-u']
            if self.wlcs_weight is not None:
                command += ['-w', str(self.wlcs_weight)]
            if self.scoring_function == 'average':
                command += ['-f', 'A']
            elif self.scoring_function == 'max':
                command += ['-f', 'B']
            else:
                raise Exception(
                    f'Unrecognized scoring function: "{self.scoring_function}"'
                )
            command += [config_filename]

            # We used to fail if anything was written to stderr, but ROUGE writes
            # a warning if the number of peers per reference set is different, which
            # is expected in some situations for us (if we just have more summaries
            # to score for some reference sets than others). Therefore, we no longer fail
            # if stderr is not empty.
            logger.info(f'Running ROUGE command: "{" ".join(command)}"')
            process = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            macro_metrics_list, micro_metrics_lists = self._parse_rouge_stdout(
                stdout.decode())
            return macro_metrics_list, micro_metrics_lists
Example #25
    def test_all_summary_level_correlations(self):
        # This is a regression test for the "correlate" command. It does not test if it's accurate
        with TemporaryDirectory() as temp_dir:
            command = [
                'python', '-m', 'sacrerouge', 'correlate',
                '--metrics-jsonl-files', MULTILING_METRICS, '--metrics',
                'rouge-1_jk_precision', 'grade', '--summarizer-type', 'all',
                '--output-file', f'{temp_dir}/correlations.json', '--silent',
                '--summary-level-correlations-output',
                f'{temp_dir}/summary-level.json'
            ]
            subprocess.run(command, check=True)

            # Check the original correlations
            correlations = json.load(open(f'{temp_dir}/correlations.json',
                                          'r'))

            assert correlations['summary_level']['pearson'][
                'r'] == pytest.approx(0.4365526945989437, abs=1e-4)
            assert correlations['summary_level']['spearman'][
                'rho'] == pytest.approx(0.3720759220056127, abs=1e-4)
            assert correlations['summary_level']['kendall'][
                'tau'] == pytest.approx(0.1719691730561296, abs=1e-4)
            assert correlations['summary_level']['num_summary_groups'] == 3

            assert correlations['system_level']['pearson'][
                'r'] == pytest.approx(0.28732601225892834, abs=1e-4)
            assert correlations['system_level']['spearman'][
                'rho'] == pytest.approx(0.19999999999999998, abs=1e-4)
            assert correlations['system_level']['kendall'][
                'tau'] == pytest.approx(0.0, abs=1e-4)
            assert correlations['system_level']['num_summarizers'] == 4

            assert correlations['global']['pearson']['r'] == pytest.approx(
                0.34183806349510004, abs=1e-4)
            assert correlations['global']['spearman']['rho'] == pytest.approx(
                0.4035707976004214, abs=1e-4)
            assert correlations['global']['kendall']['tau'] == pytest.approx(
                0.28603877677367767, abs=1e-4)
            assert correlations['global']['num_summaries'] == 12

            # Check the individual summary-level correlations
            summary_level = json.load(
                open(f'{temp_dir}/summary-level.json', 'r'))
            assert len(summary_level['pearson']) == 3
            assert summary_level['pearson']['M000'] == pytest.approx(
                0.3216337604513384, abs=1e-4)
            assert summary_level['pearson']['M001'] == pytest.approx(
                0.38969747442783453, abs=1e-4)
            assert summary_level['pearson']['M002'] == pytest.approx(
                0.598326848917658, abs=1e-4)

            assert len(summary_level['spearman']) == 3
            assert summary_level['spearman']['M000'] == pytest.approx(
                0.6000000000000001, abs=1e-4)
            assert summary_level['spearman']['M001'] == pytest.approx(
                0.316227766016838, abs=1e-4)
            assert summary_level['spearman']['M002'] == pytest.approx(
                0.19999999999999998, abs=1e-4)

            assert len(summary_level['kendall']) == 3
            assert summary_level['kendall']['M000'] == pytest.approx(
                0.3333333333333334, abs=1e-4)
            assert summary_level['kendall']['M001'] == pytest.approx(
                0.18257418583505539, abs=1e-4)
            assert summary_level['kendall']['M002'] == pytest.approx(0.0,
                                                                     abs=1e-4)
Example #26
    def score_multi_all(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            # Save the summaries to a file. Each file has one summary per line.
            # For multiple references, each reference is used to evaluate the same
            # summary independently, so the system summary is repeated.
            candidate_file = f'{temp_dir}/candidates.txt'
            reference_file = f'{temp_dir}/references.txt'
            score_file = f'{temp_dir}/scores.txt'

            with open(candidate_file, 'w') as out_candidates:
                with open(reference_file, 'w') as out_references:
                    for summaries, references in zip(summaries_list,
                                                     references_list):
                        for summary in summaries:
                            for reference in references:
                                if isinstance(summary, list):
                                    out_candidates.write(' '.join(summary) +
                                                         '\n')
                                else:
                                    out_candidates.write(summary + '\n')

                                if isinstance(reference, list):
                                    out_references.write(' '.join(reference) +
                                                         '\n')
                                else:
                                    out_references.write(reference + '\n')

            # Run through BLEURT
            commands = [f'cd {self.bleurt_root}']
            if self.environment_name is not None:
                commands.append(f'source {os.environ["CONDA_INIT"]}')
                commands.append(f'conda activate {self.environment_name}')
            commands.append(f'python -m bleurt.score '
                            f'-candidate_file={candidate_file} '
                            f'-reference_file={reference_file} '
                            f'-bleurt_checkpoint={self.checkpoint_dir} '
                            f'-scores_file={score_file} '
                            f'-bleurt_batch_size={self.batch_size}')
            command = ' && '.join(commands)

            logger.info(f'Running command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            process.communicate()

            # Load the results
            scores = list(map(float,
                              open(score_file, 'r').read().splitlines()))
            metrics_lists = []
            index = 0
            for summaries, references in zip(summaries_list, references_list):
                metrics_lists.append([])
                for _ in summaries:
                    reference_scores = []
                    for _ in references:
                        reference_scores.append(scores[index])
                        index += 1
                    average = sum(reference_scores) / len(reference_scores)
                    max_ = max(reference_scores)
                    metrics_lists[-1].append(
                        MetricsDict(
                            {'bleurt': {
                                'average': average,
                                'max': max_
                            }}))
            return metrics_lists
Example #27
    def test_score(self):
        with TemporaryDirectory() as temp_dir:
            output_file = f'{temp_dir}/metrics.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'score',
                _config_file_path,
                output_file
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()

            instances = JsonlReader(_summaries_file_path).read()
            metrics_list = JsonlReader(output_file, Metrics).read()
            metrics_dicts = defaultdict(dict)

            assert len(instances) == len(metrics_list)
            for instance, metrics in zip(instances, metrics_list):
                assert metrics.instance_id == instance['instance_id']
                assert metrics.summarizer_id == instance['summarizer_id']
                assert metrics.summarizer_type == instance['summarizer_type']
                metrics_dicts[metrics.instance_id][metrics.summarizer_id] = metrics

                if metrics.summarizer_type == 'reference':
                    assert 'python-rouge-1_jk' in metrics.metrics
                    assert 'python-rouge-2_jk' in metrics.metrics
                else:
                    assert 'python-rouge-1' in metrics.metrics
                    assert 'python-rouge-2' in metrics.metrics
                    assert 'python-rouge-1_jk' in metrics.metrics
                    assert 'python-rouge-2_jk' in metrics.metrics

            # Test a couple of instances. I did not check to see if these are correct,
            # but the test will check if the results have changed
            assert metrics_dicts['d0801-A']['0'].metrics == {
                'python-rouge-1': {
                    'precision': 29.444444444444446,
                    'recall': 26.700251889168765,
                    'f1': 28.005284015852048
                },
                'python-rouge-2': {
                    'precision': 2.8089887640449436,
                    'recall': 2.5445292620865136,
                    'f1': 2.67022696929239
                },
                'python-rouge-1_jk': {
                    'precision': 29.444444444444443,
                    'recall': 26.719572295067344,
                    'f1': 28.015250464050713
                },
                'python-rouge-2_jk': {
                    'precision': 2.808988764044944,
                    'recall': 2.549772468714448,
                    'f1': 2.6730599647266313
                }
            }

            assert metrics_dicts['d0805-A']['B'].metrics == {
                'python-rouge-1_jk': {
                    'precision': 37.84722222222222,
                    'recall': 36.21262458471761,
                    'f1': 37.011884550084886
                },
                'python-rouge-2_jk': {
                    'precision': 9.12280701754386,
                    'recall': 8.724832214765101,
                    'f1': 8.919382504288166
                }
            }
Example #28
    def score_multi_all(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        # The original code for PyrEval processes exactly 1 pyramid at a time. Therefore, the whole pipeline needs
        # to be run once per item in `references_list`. Each execution of the pipeline will load the Stanford CoreNLP
        # models and run them over the text. Loading the models takes a lot of time, and the preprocessing of
        # the same summary may run multiple times (for instance in jackknifing).
        #
        # To save time, our implementation passes all of the unique peer and reference summaries through the
        # preprocessing step of the pipeline at once, then runs the analysis step per-pyramid afterward. This
        # significantly increases the speed of the processing.

        # Identify the unique summaries so less preprocessing needs to be done
        summaries_list = self._flatten_summaries(summaries_list)
        references_list = self._flatten_summaries(references_list)

        all_summaries, summary_to_index = self._index_summaries(
            summaries_list, references_list)

        with TemporaryDirectory() as temp_dir:
            # First, clear the PyrEval directory in case the last run was messed up
            self._clean_directories()

            # All of the summaries are saved in the "peers" folder, even if they are references. The PyrEval code
            # normally runs separate steps to process the peer and model directories, which is slower because it requires
            # loading the Stanford models twice, but the preprocessing is the same.
            self._save_summaries(all_summaries,
                                 f'{self.pyreval_root}/Raw/peers')

            self._run_through_preprocessing()

            # The PyrEval code will create an XML file for summary i called i.xml along with a directory of
            # additional data for that file. The directory names aren't consistent because they're created by
            # enumerating glob results, which are not guaranteed to be deterministically sorted (and we don't
            # want to rely on the assumption that they are). So we have to map each summary index to its directory
            file_index_to_dir = self._map_file_index_to_directory(
                f'{self.pyreval_root}/Preprocess/peer_summaries')

            # All of the preprocessed summaries are now moved out of the PyrEval directory to a temporary
            # directory (otherwise they would be picked up by the rest of the processing)
            self._move_summaries_to_temp_dir(temp_dir)

            # Remove any extra data which could interfere with processing
            self._clean_directories()

            # Now build the pyramids and score
            metrics_dict_lists = []
            for i, (summaries, references) in enumerate(
                    zip(summaries_list, references_list)):
                array_index_to_tgt_index = self._copy_summaries_for_processing(
                    summaries, summary_to_index, file_index_to_dir,
                    f'{temp_dir}/peers',
                    f'{self.pyreval_root}/Preprocess/peer_summaries', False,
                    True)
                self._copy_summaries_for_processing(
                    references, summary_to_index, file_index_to_dir,
                    f'{temp_dir}/peers',
                    f'{self.pyreval_root}/Preprocess/wise_crowd_summaries',
                    True, False)

                metrics_list = self._score_summaries(array_index_to_tgt_index)
                metrics_dict_lists.append(metrics_list)

                # Clean for the next iteration
                self._clean_directories()

            return metrics_dict_lists
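The `_index_summaries` helper used above is internal to the metric, but the deduplication idea it supports (preprocess each distinct summary exactly once and reuse the result across pyramids) can be sketched roughly as follows. The function name, key construction, and return types here are assumptions for illustration, not the actual PyrEval wrapper code:

def index_unique_summaries(summaries_list, references_list):
    """Assign each distinct summary a single index so it is preprocessed only once.

    Illustrative only: the real helper may key summaries differently,
    e.g. after flattening lists of sentences into a single string.
    """
    all_summaries = []
    summary_to_index = {}
    for group in list(summaries_list) + list(references_list):
        for summary in group:
            key = summary if isinstance(summary, str) else ' '.join(summary)
            if key not in summary_to_index:
                summary_to_index[key] = len(all_summaries)
                all_summaries.append(summary)
    return all_summaries, summary_to_index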
Example #29
    def test_score_default(self):
        with TemporaryDirectory() as temp_dir:
            # I manually ran the scoring function with these parameters, and this test makes sure
            # the output here matches those results
            output_file = f'{temp_dir}/metrics.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'python-rouge', 'score',
                '--output-jsonl', output_file, '--dataset-reader',
                'reference-based', '--input-files', MULTILING_SUMMARIES,
                '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            instances = JsonlReader(MULTILING_SUMMARIES).read()
            metrics_list = JsonlReader(output_file, Metrics).read()
            metrics_dicts = defaultdict(dict)

            assert len(instances) == len(metrics_list)
            for instance, metrics in zip(instances, metrics_list):
                assert metrics.instance_id == instance['instance_id']
                assert metrics.summarizer_id == instance['summarizer_id']
                assert metrics.summarizer_type == instance['summarizer_type']
                metrics_dicts[metrics.instance_id][
                    metrics.summarizer_id] = metrics

                if metrics.summarizer_type == 'reference':
                    assert 'python-rouge-1_jk' in metrics.metrics
                    assert 'python-rouge-2_jk' in metrics.metrics
                else:
                    assert 'python-rouge-1' in metrics.metrics
                    assert 'python-rouge-2' in metrics.metrics
                    assert 'python-rouge-1_jk' in metrics.metrics
                    assert 'python-rouge-2_jk' in metrics.metrics

            assert metrics_dicts['M000']['1'].metrics == {
                'python-rouge-1': {
                    'precision': 41.699867197875164,
                    'recall': 40.516129032258064,
                    'f1': 41.09947643979057
                },
                'python-rouge-2': {
                    'precision': 10.533333333333333,
                    'recall': 10.233160621761659,
                    'f1': 10.38107752956636
                },
                'python-rouge-1_jk': {
                    'precision': 41.699867197875164,
                    'recall': 40.514662613316766,
                    'f1': 41.098355761265616
                },
                'python-rouge-2_jk': {
                    'precision': 10.533333333333333,
                    'recall': 10.226158358122346,
                    'f1': 10.3773782079838
                }
            }

            assert metrics_dicts['M001']['B'].metrics == {
                'python-rouge-1_jk': {
                    'precision': 51.59362549800797,
                    'recall': 51.18577075098815,
                    'f1': 51.3888888888889
                },
                'python-rouge-2_jk': {
                    'precision': 20.4,
                    'recall': 20.238095238095237,
                    'f1': 20.318725099601597
                }
            }
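The expected values in these regression tests were captured from a reference run rather than derived by hand, and they compare floating-point numbers with exact equality. A small helper built on pytest.approx is one way to make such comparisons tolerant of harmless numerical differences; this is a sketch of an alternative test style, not how sacrerouge's own tests are written:

import pytest


def assert_metrics_close(actual, expected, rel=1e-9):
    """Recursively compare nested metric dicts with a relative tolerance."""
    assert actual.keys() == expected.keys()
    for key, expected_value in expected.items():
        if isinstance(expected_value, dict):
            assert_metrics_close(actual[key], expected_value, rel=rel)
        else:
            assert actual[key] == pytest.approx(expected_value, rel=rel)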
Example #30
    def test_score_arguments(self):
        with TemporaryDirectory() as temp_dir:
            # I manually ran the scoring function with these parameters, and this test makes sure
            # the output here matches those results
            output_file = f'{temp_dir}/metrics.jsonl'
            command = [
                'python', '-m', 'sacrerouge', 'python-rouge', 'score',
                '--output-jsonl', output_file, '--dataset-reader',
                'reference-based', '--input-files', MULTILING_SUMMARIES,
                '--ngram_orders', '[3]', '--use_porter_stemmer', 'false',
                '--remove_stopwords', 'true', '--compute_rouge_l', 'true',
                '--silent'
            ]

            process = Popen(command, stdout=PIPE, stderr=PIPE)
            process.communicate()

            instances = JsonlReader(MULTILING_SUMMARIES).read()
            metrics_list = JsonlReader(output_file, Metrics).read()
            metrics_dicts = defaultdict(dict)

            assert len(instances) == len(metrics_list)
            for instance, metrics in zip(instances, metrics_list):
                assert metrics.instance_id == instance['instance_id']
                assert metrics.summarizer_id == instance['summarizer_id']
                assert metrics.summarizer_type == instance['summarizer_type']
                metrics_dicts[metrics.instance_id][
                    metrics.summarizer_id] = metrics

                if metrics.summarizer_type == 'reference':
                    assert 'python-rouge-3_jk' in metrics.metrics
                    assert 'python-rouge-l_jk' in metrics.metrics
                else:
                    assert 'python-rouge-3' in metrics.metrics
                    assert 'python-rouge-l' in metrics.metrics
                    assert 'python-rouge-3_jk' in metrics.metrics
                    assert 'python-rouge-l_jk' in metrics.metrics

            assert metrics_dicts['M000']['1'].metrics == {
                'python-rouge-3': {
                    'precision': 3.0952380952380953,
                    'recall': 3.110047846889952,
                    'f1': 3.1026252983293556
                },
                'python-rouge-l': {
                    'precision': 20.657276995305164,
                    'recall': 20.754716981132077,
                    'f1': 20.705882352941174
                },
                'python-rouge-3_jk': {
                    'precision': 3.095238095238095,
                    'recall': 3.073768703921825,
                    'f1': 3.0843425372732653
                },
                'python-rouge-l_jk': {
                    'precision': 20.657276995305164,
                    'recall': 20.72789236755767,
                    'f1': 20.69018908745478
                }
            }

            assert metrics_dicts['M001']['B'].metrics == {
                'python-rouge-3_jk': {
                    'precision': 3.75,
                    'recall': 3.4615384615384617,
                    'f1': 3.6
                },
                'python-rouge-l_jk': {
                    'precision': 33.60655737704918,
                    'recall': 31.060606060606062,
                    'f1': 32.28346456692913
                }
            }
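The two tests above build nearly identical command lines and differ only in the extra flags passed to `python -m sacrerouge python-rouge score`. A small helper could factor out that shared structure; this sketch simply reuses the command shown in the tests, with the function name and parameters being hypothetical:

from subprocess import PIPE, Popen


def run_python_rouge_score(input_file, output_file, extra_args=None):
    """Run the sacrerouge python-rouge scoring command and wait for it to finish."""
    command = [
        'python', '-m', 'sacrerouge', 'python-rouge', 'score',
        '--output-jsonl', output_file,
        '--dataset-reader', 'reference-based',
        '--input-files', input_file,
        '--silent',
    ]
    command.extend(extra_args or [])
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    process.communicate()
    return process.returncode


# The second test's configuration could then be expressed as:
# run_python_rouge_score(MULTILING_SUMMARIES, output_file,
#                        ['--ngram_orders', '[3]', '--use_porter_stemmer', 'false',
#                         '--remove_stopwords', 'true', '--compute_rouge_l', 'true'])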