Example 1
    def test_pyramids(self):
        """Do some basic sanity tests on the files."""
        pyramids = JsonlReader(_pyramids_file_path, Pyramid).read()
        annotations = JsonlReader(_pyramid_annotations_file_path, PyramidAnnotation).read()

        instance_id_to_pyramid = {}
        for pyramid in pyramids:
            instance_id_to_pyramid[pyramid.instance_id] = pyramid

        instance_id_to_annotations = defaultdict(list)
        for annotation in annotations:
            instance_id_to_annotations[annotation.instance_id].append(annotation)

        assert instance_id_to_pyramid.keys() == instance_id_to_annotations.keys()
        for instance_id, pyramid in instance_id_to_pyramid.items():
            assert len(pyramid.summaries) == 4
            assert len(pyramid.summarizer_ids) == 4
            for reference in pyramid.summaries:
                assert len(reference) > 0

            scu_ids = set([scu.scu_id for scu in pyramid.scus])
            for annotation in instance_id_to_annotations[instance_id]:
                assert len(annotation.summary) > 0, (instance_id, annotation.summarizer_id)
                for scu in annotation.scus:
                    assert scu.scu_id in scu_ids, (scu.scu_id, scu_ids)
Example 2
    def read(self, documents_jsonl: str,
             summaries_jsonl: str) -> List[EvalInstance]:
        logger.info(f'Loading documents from {documents_jsonl}')
        documents_dict = {}
        with JsonlReader(documents_jsonl) as f:
            for data in f:
                instance_id = data['instance_id']
                if 'document' in data:
                    documents = [data['document']['text']]
                else:
                    documents = [
                        document['text'] for document in data['documents']
                    ]
                documents = flatten_documents(documents)
                documents_dict[instance_id] = DocumentsField(documents)
        logger.info(f'Loaded {len(documents_dict)} document sets')

        logger.info(f'Loading summaries from {summaries_jsonl}')
        instances = []
        with JsonlReader(summaries_jsonl) as f:
            for data in f:
                fields = {}
                fields['summary'] = SummaryField(data['summary']['text'])

                instance_id = data['instance_id']
                fields['documents'] = documents_dict[instance_id]
                fields = Fields(fields)

                instance = EvalInstance(data['instance_id'],
                                        data['summarizer_id'],
                                        data['summarizer_type'], fields)
                instances.append(instance)
        logger.info(f'Loaded {len(instances)} instances')
        return instances
Example 3
    def test_pyramid_score(self):
        # This is a regression test, not necessarily a test for correctness
        pyramids = {
            pyramid.instance_id: pyramid
            for pyramid in JsonlReader(_pyramid_file_path, Pyramid).read()
        }
        annotations = JsonlReader(_annotation_file_path,
                                  PyramidAnnotation).read()
        annotation_pyramids = [
            pyramids[annotation.instance_id] for annotation in annotations
        ]

        metric = PyramidScore()
        actual_output = metric.score_all(annotations, annotation_pyramids)[:5]
        expected_output = [{
            'modified_pyramid_score': 0.2413793103448276
        }, {
            'modified_pyramid_score': 0.0
        }, {
            'modified_pyramid_score': 0.06896551724137931
        }, {
            'modified_pyramid_score': 0.034482758620689655
        }, {
            'modified_pyramid_score': 0.1724137931034483
        }]
        for i, (expected,
                actual) in enumerate(zip(expected_output, actual_output)):
            assert actual.approx_equal(
                MetricsDict(expected), abs=1e-4
            ), f'Instance {i} not equal. Expected {expected}, actual {actual}'
Example 4
def main(args):
    random.seed(args.random_seed)

    summaries = JsonlReader(args.summaries_jsonl).read()
    metrics_list = JsonlReader(args.metrics_jsonl).read()

    instance_ids = sample_instances(summaries, args.num_instances)
    summarizer_ids1 = sample_summarizers(summaries, args.num_summaries)
    summarizer_ids2 = sample_next_summarizers(summaries, summarizer_ids1,
                                              16 - args.num_summaries)
    union = summarizer_ids1 | summarizer_ids2

    print(summarizer_ids1)
    print(summarizer_ids2)

    save(summaries, instance_ids, union, args.output_summaries_jsonl)
    save(metrics_list, instance_ids, union, args.output_metrics_jsonl)

    save(summaries, instance_ids, summarizer_ids1,
         args.output_summaries_jsonl_1)
    save(metrics_list, instance_ids, summarizer_ids1,
         args.output_metrics_jsonl_1)

    save(summaries, instance_ids, summarizer_ids2,
         args.output_summaries_jsonl_2)
    save(metrics_list, instance_ids, summarizer_ids2,
         args.output_metrics_jsonl_2)
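The sample_* and save helpers used above are not shown in this example. As a hedged sketch only (the name save comes from the call sites, but its body is assumed here), a version consistent with how it is called would keep the sampled (instance_id, summarizer_id) pairs and write them with JsonlWriter, assuming the summaries and metrics files contain dicts with 'instance_id' and 'summarizer_id' keys:

def save(instances, instance_ids, summarizer_ids, output_jsonl):
    # Assumed implementation: keep only the sampled (instance_id, summarizer_id)
    # pairs and write the survivors to the output file.
    with JsonlWriter(output_jsonl) as out:
        for instance in instances:
            if instance['instance_id'] in instance_ids and \
                    instance['summarizer_id'] in summarizer_ids:
                out.write(instance)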
Example 5
def main(args):
    expert_answers = JsonlReader(args.expert_answers_jsonl).read()
    model_answers = JsonlReader(args.model_answers_jsonl).read()

    expert_answers = filter_to_summarizer_type(expert_answers,
                                               args.summarizer_type)
    model_answers = filter_to_summarizer_type(model_answers,
                                              args.summarizer_type)

    annotations_dict = load_annotations(args.annotations_csv, expert_answers,
                                        model_answers)

    is_answerable_dict = get_ground_truth_answerable(expert_answers)

    process_expert_answers(expert_answers, annotations_dict,
                           is_answerable_dict)
    print()
    process_model_answers(model_answers, annotations_dict, is_answerable_dict,
                          args.output_right_for_wrong_reasons_csv)
    print()

    print('Saving expert')
    save_human_judgments(expert_answers, annotations_dict,
                         args.output_expert_jsonl)
    print('Saving model')
    save_human_judgments(model_answers, annotations_dict,
                         args.output_model_jsonl)
Example 6
    def read(self) -> List[EvalInstance]:
        pyramids = JsonlReader(self.pyramid_jsonl, Pyramid).read()
        annotations = JsonlReader(self.annotation_jsonl,
                                  PyramidAnnotation).read()

        # Enumerate the peers
        instance_id_to_pyramid = {
            pyramid.instance_id: pyramid
            for pyramid in pyramids
        }
        eval_instances = []
        for annotation in annotations:
            summary = PyramidAnnotationField(annotation)
            pyramid = instance_id_to_pyramid[annotation.instance_id]
            fields = Fields({'pyramid': PyramidField(pyramid)})

            eval_instances.append(
                EvalInstance(annotation.instance_id, annotation.summarizer_id,
                             annotation.summarizer_type, summary, fields))

        # Enumerate the references
        for pyramid in pyramids:
            for i, summary in enumerate(pyramid.summaries):
                annotation = pyramid.get_annotation(i)
                summary = PyramidAnnotationField(annotation)

                fields = Fields(
                    {'pyramid': PyramidField(pyramid.remove_summary(i))})

                eval_instances.append(
                    EvalInstance(annotation.instance_id,
                                 annotation.summarizer_id,
                                 annotation.summarizer_type, summary, fields))

        return eval_instances
Example 7
    def test_sanity_checks(self):
        file_paths = glob(f'{_mds_dir}/??.train.jsonl')
        assert len(file_paths) == 10  # 10 languages

        # Make sure each one has 3 instances with non-empty documents and summaries
        for file_path in file_paths:
            instances = JsonlReader(file_path).read()
            assert len(instances) == 3
            for instance in instances:
                assert len(instance['documents']) > 0
                for document in instance['documents']:
                    assert len(document['text']) > 0
                assert len(instance['summaries']) > 0
                for summary in instance['summaries']:
                    assert len(summary['text']) > 0

        # The test split currently does not have summaries
        file_paths = glob(f'{_mds_dir}/??.test.jsonl')
        assert len(file_paths) == 10  # 10 languages

        # Make sure each one has 7 instances with non-empty documents and no summaries
        for file_path in file_paths:
            instances = JsonlReader(file_path).read()
            assert len(instances) == 7
            for instance in instances:
                assert len(instance['documents']) > 0
                for document in instance['documents']:
                    assert len(document['text']) > 0
                assert 'summaries' not in instance
Example 8
    def setUpClass(cls) -> None:
        super().setUpClass()
        cls.instance_ids = []
        cls.summaries = []
        cls.references_list = []
        with JsonlReader(MULTILING_SUMMARIES) as f:
            for instance in f:
                cls.instance_ids.append(instance['instance_id'])
                cls.summaries.append(instance['summary']['text'])
                references = []
                for reference in instance['references']:
                    references.append(reference['text'])
                cls.references_list.append(references)

        # Load the documents, grouped by instance id, then put them into a list
        # parallel with the instances
        cls.documents_dict = {}
        with JsonlReader(MULTILING_DOCUMENTS) as f:
            for instance in f:
                cls.documents_dict[instance['instance_id']] = [
                    document['text'] for document in instance['documents']
                ]
        cls.documents_list = []
        for instance_id in cls.instance_ids:
            cls.documents_list.append(cls.documents_dict[instance_id])
Example 9
    def read(self, pyramid_jsonl: str,
             annotation_jsonl: str) -> List[EvalInstance]:
        logger.info(f'Loading Pyramids from {pyramid_jsonl}')
        pyramids = {}
        with JsonlReader(pyramid_jsonl, Pyramid) as f:
            for pyramid in f:
                pyramids[pyramid.instance_id] = pyramid
        logger.info(f'Loaded {len(pyramids)} pyramids')

        logger.info(f'Loading Pyramid annotations from {annotation_jsonl}')
        instances = []
        instance_ids = set()
        with JsonlReader(annotation_jsonl, PyramidAnnotation) as f:
            for annotation in f:
                fields = Fields({
                    'annotation':
                    PyramidAnnotationField(annotation),
                    'pyramid':
                    PyramidField(pyramids[annotation.instance_id])
                })

                instance = EvalInstance(annotation.instance_id,
                                        annotation.summarizer_id,
                                        annotation.summarizer_type, fields)
                instances.append(instance)

                instance_ids.add(annotation.instance_id)

            logger.info(f'Loaded {len(instances)} Pyramid annotations')

        if self.include_reference_annotations:
            logger.info(
                'Generating Pyramid annotations for the reference summaries')
            reference_instances = []
            for instance_id in instance_ids:
                pyramid = pyramids[instance_id]

                # We can only do this if more than one summary was used to construct the pyramid
                if len(pyramid.summarizer_ids) > 1:
                    for i in range(len(pyramid.summarizer_ids)):
                        annotation = pyramid.get_annotation(i)
                        reduced_pyramid = pyramid.remove_summary(i)
                        fields = Fields({
                            'annotation':
                            PyramidAnnotationField(annotation),
                            'pyramid':
                            PyramidField(reduced_pyramid)
                        })
                        instance = EvalInstance(annotation.instance_id,
                                                annotation.summarizer_id,
                                                annotation.summarizer_type,
                                                fields)
                        reference_instances.append(instance)
            logger.info(
                f'Generated {len(reference_instances)} reference summary annotations'
            )
            instances.extend(reference_instances)

        logger.info(f'Loaded a total of {len(instances)} instances')
        return instances
Example 10
def main(args):
    metrics_list = JsonlReader(args.input_file, Metrics).read()
    aggregated_metrics = aggregate_metrics(metrics_list)

    with JsonlWriter(args.output_file) as out:
        # I'm being lazy and just iterating twice because the aggregate method edited the metrics in place
        for metrics in JsonlReader(args.input_file, Metrics).read():
            metrics.metrics['overall_responsiveness'] = aggregated_metrics[
                metrics.summarizer_id]['overall_responsiveness']
            out.write(metrics)
Example 11
    def test_score_multi_all(self):
        instances = JsonlReader(_instances_file_path).read()
        summaries_lists, documents_lists, instance_id_to_index, summarizer_id_to_index = self._setup_multi_all(
            instances)

        simetrix = SIMetrix()
        metrics_lists = simetrix.score_multi_all(summaries_lists,
                                                 documents_lists)
        for expected_metrics in JsonlReader(_summary_metrics_file_path).read():
            instance_index = instance_id_to_index[
                expected_metrics['instance_id']]
            summarizer_index = summarizer_id_to_index[
                expected_metrics['summarizer_id']]
            actual_metrics = metrics_lists[instance_index][summarizer_index]
            assert actual_metrics == expected_metrics['metrics']
Example 12
    def test_pyramid_examples(self):
        """Pick some random examples and test them."""
        pyramids = JsonlReader(_pyramids_file_path, Pyramid).read()
        annotations = JsonlReader(_pyramid_annotations_file_path,
                                  PyramidAnnotation).read()

        instance_id_to_pyramid = {}
        for pyramid in pyramids:
            instance_id_to_pyramid[pyramid.instance_id] = pyramid

        instance_id_to_annotations = defaultdict(dict)
        for annotation in annotations:
            instance_id_to_annotations[annotation.instance_id][
                annotation.summarizer_id] = annotation

        pyramid = instance_id_to_pyramid['d1101-A']
        assert pyramid.summarizer_ids == ['A', 'B', 'C', 'D']
        annotation = instance_id_to_annotations['d1101-A']['1']
        label = 'Roberts entered the school'
        contrib_label = 'gunman inside their tiny one-room schoolhouse'
        assert len(annotation.scus) == 2
        assert annotation.scus[0].scu_id == 2
        assert len(annotation.scus[0].contributors) == 1
        assert annotation.scus[0].contributors[0].label == contrib_label
        assert len(annotation.scus[0].contributors[0].parts) == 1
        assert annotation.scus[0].contributors[0].parts[
            0].text == contrib_label
        start = annotation.scus[0].contributors[0].parts[0].start
        end = annotation.scus[0].contributors[0].parts[0].end
        assert annotation.summary[start:end] == contrib_label
        self._test_example(pyramid, annotation)

        # This is an example of a pyramid annotation that has SCUs that are
        # not present in the pyramid, so they are missing here.
        pyramid = instance_id_to_pyramid['d1112-B']
        assert pyramid.summarizer_ids == ['A', 'C', 'G', 'H']
        annotation = instance_id_to_annotations['d1112-B']['22']
        contrib_label = 'some jurors in the Metrolink train derailment case last month said they really didn\'t think Alvarez intended to kill anyone'
        part_label = 'some jurors in the Metrolink train derailment case last month said they really didn\'t think Alvarez intended to kill anyone'
        assert len(annotation.scus) == 1
        assert annotation.scus[0].scu_id == 41
        assert len(annotation.scus[0].contributors) == 1
        assert annotation.scus[0].contributors[0].label == contrib_label
        assert len(annotation.scus[0].contributors[0].parts) == 1
        assert annotation.scus[0].contributors[0].parts[0].text == part_label
        start = annotation.scus[0].contributors[0].parts[0].start
        end = annotation.scus[0].contributors[0].parts[0].end
        assert annotation.summary[start:end] == part_label
Example 13
def main(args):
    random.seed(4)

    instances = JsonlReader(args.input_jsonl).read()
    instance_id_to_instances = defaultdict(list)
    for instance in instances:
        instance_id_to_instances[instance['instance_id']].append(instance)
    instance_ids = list(sorted(instance_id_to_instances.keys()))

    # The points on the learning curve. Make sure that the number of inputs isn't larger
    # than the number of instances and that the maximum number of inputs is included.
    num_inputs_list = args.num_inputs
    while num_inputs_list[-1] >= len(instance_ids):
        num_inputs_list.pop()
    num_inputs_list.append(len(instance_ids))

    for num_inputs in tqdm(num_inputs_list):
        for sample_index in range(args.num_samples):
            with JsonlWriter(
                    f'{args.output_dir}/{num_inputs}/{sample_index}.jsonl'
            ) as out:
                random.shuffle(instance_ids)
                sample = list(sorted(instance_ids[:num_inputs]))
                for instance_id in sample:
                    for instance in instance_id_to_instances[instance_id]:
                        out.write(instance)
Example 14
def load_metrics(metrics_files: List[str]) -> List[Metrics]:
    logger.info(f'Loading metrics from {metrics_files}')
    metrics_list = []
    for metrics_file in metrics_files:
        metrics_list.extend(JsonlReader(metrics_file, Metrics).read())
    logger.info(f'Loaded {len(metrics_list)} metrics objects')
    return metrics_list
Example 15
    def _check_micro_list(self, micro_list: List[Metrics]) -> None:
        instances = JsonlReader(_summaries_file_path).read()

        assert len(micro_list) == len(instances)
        for micro, instance in zip(micro_list, instances):
            assert micro.instance_id == instance['instance_id']
            assert micro.summarizer_id == instance['summarizer_id']
            assert micro.summarizer_type == instance['summarizer_type']

        # Test a couple of cases. Again, I did not manually verify these values, but they
        # will catch any changes in behavior
        assert micro_list[1].metrics == {
            'python-rouge-1': {
                'precision': 30.412371134020617,
                'recall': 29.72292191435768,
                'f1': 30.06369426751592
            },
            'python-rouge-2': {
                'precision': 2.604166666666667,
                'recall': 2.5445292620865136,
                'f1': 2.5740025740025736
            }
        }
        assert micro_list[2848].metrics == {
            'python-rouge-1': {
                'precision': 29.207920792079207,
                'recall': 28.780487804878046,
                'f1': 28.99262899262899
            },
            'python-rouge-2': {
                'precision': 8.5,
                'recall': 8.374384236453201,
                'f1': 8.436724565756823
            }
        }
Example 16
    def test_fr_summary_level(self):
        # Test a few random metrics to ensure the data was parsed correctly
        metrics_list = JsonlReader(_fr_file_path, Metrics).read()
        metrics_dicts = self._convert_to_dicts(metrics_list)

        assert metrics_dicts['M004']['1']['rouge-1'][
            'precision'] == pytest.approx(42.791, abs=1e-3)
        assert metrics_dicts['M004']['1']['rouge-1'][
            'recall'] == pytest.approx(44.753, abs=1e-3)
        assert metrics_dicts['M004']['1']['rouge-1']['f1'] == pytest.approx(
            43.750, abs=1e-3)

        assert metrics_dicts['M009']['2']['rouge-2'][
            'precision'] == pytest.approx(17.770, abs=1e-3)
        assert metrics_dicts['M009']['2']['rouge-2'][
            'recall'] == pytest.approx(17.813, abs=1e-3)
        assert metrics_dicts['M009']['2']['rouge-2']['f1'] == pytest.approx(
            17.791, abs=1e-3)

        assert metrics_dicts['M002']['5']['MeMoG'] == pytest.approx(0.24885254,
                                                                    abs=1e-4)
        assert metrics_dicts['M004']['3']['MeMoG'] == pytest.approx(0.46445730,
                                                                    abs=1e-4)

        assert metrics_dicts['M000']['1']['grade'] == [3, 1, 3]
        assert metrics_dicts['M006']['4']['grade'] == [1, 3, 3]

        assert metrics_dicts['M000']['3']['length_aware_grade'] == [
            2.73, 2.73, 3.63
        ]
        assert metrics_dicts['M006']['4']['length_aware_grade'] == [
            1.00, 3.00, 3.00
        ]
Example 17
def main(args):
    aqs = load_answered_questions(args.expert_answers_jsonl)
    with JsonlWriter(args.output_jsonl) as out:
        for instance in JsonlReader(args.questions_jsonl).read():
            instance_id = instance['instance_id']
            summarizer_id = instance['summarizer_id']
            new_references = []
            for reference in instance['references']:
                reference_id = reference['summarizer_id']
                new_questions = []
                for question in reference['questions']:
                    prompt_id = question['prompt_id']
                    key = (instance_id, summarizer_id, reference_id, prompt_id)
                    if key in aqs:
                        new_questions.append(question)

                if len(reference['questions']) != len(new_questions):
                    print(
                        f'({instance_id}, {summarizer_id}, {reference_id}) only has {len(new_questions)} / {len(reference["questions"])} answered'
                    )

                if len(new_questions) > 0:
                    reference['questions'] = new_questions
                    new_references.append(reference)

            if len(instance['references']) != len(new_references):
                print(
                    f'({instance_id}, {summarizer_id}) only has {len(new_references)} / {len(instance["references"])} references'
                )

            if len(new_references) > 0:
                instance['references'] = new_references
                out.write(instance)
Example 18
def load_contributions(scores_jsonl: str) -> Dict[str, float]:
    metrics_dicts = []
    for metrics in JsonlReader(scores_jsonl, Metrics).read():
        if metrics.summarizer_type != 'peer':
            continue
        metrics_dicts.append(metrics.metrics)
    return sum(metrics_dicts) / len(metrics_dicts)
Example 19
    def _run_answer_questions(
            self, input_file: str,
            output_file: str) -> Dict[Tuple[str, str], Dict[str, float]]:
        logger.info('Running answering questions')
        commands = [f'cd {self.apes_root}/rc-cnn-dailymail']
        commands.append(f'source {os.environ["CONDA_INIT"]}')
        commands.append(f'conda activate {self.environment_name}')
        commands.append(
            f'python2.7 code/run_qa_model.py --input_file {input_file} --output_file {output_file} --train_path cnn_train.txt --dev_path cnn_dev.txt --glove_path glove.6B.100d.txt'
        )
        command = ' && '.join(commands)

        logger.info(f'Running command: "{command}"')
        redirect = None if self.verbose else PIPE
        process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
        process.communicate()

        ids_to_scores = {}
        with JsonlReader(output_file) as f:
            for data in f:
                summarizer_id = data['answering_doc']
                reference_id = data['questioning_doc']
                accuracy = data['acc']
                num_correct = data['num_correct']
                ids_to_scores[(summarizer_id, reference_id)] = {
                    'accuracy': accuracy,
                    'num_correct': num_correct
                }
        return ids_to_scores
Example 20
def load_metrics_dicts(file_path: str) -> Dict[str, Dict[str, MetricsDict]]:
    metrics_dicts = defaultdict(dict)
    with JsonlReader(file_path, Metrics) as f:
        for instance in f:
            metrics_dicts[instance.instance_id][
                instance.summarizer_id] = instance.metrics
    return metrics_dicts
Example 21
    def _check_micro_list_arguments(self, micro_list: List[Metrics]) -> None:
        instances = JsonlReader(MULTILING_SUMMARIES).read()

        assert len(micro_list) == len(instances)
        for micro, instance in zip(micro_list, instances):
            assert micro.instance_id == instance['instance_id']
            assert micro.summarizer_id == instance['summarizer_id']
            assert micro.summarizer_type == instance['summarizer_type']

        assert micro_list[0].metrics == {
            'python-rouge-3': {
                'precision': 3.0952380952380953,
                'recall': 3.110047846889952,
                'f1': 3.1026252983293556
            },
            'python-rouge-l': {
                'precision': 20.657276995305164,
                'recall': 20.754716981132077,
                'f1': 20.705882352941174
            }
        }
        assert micro_list[11].metrics == {
            'python-rouge-3': {
                'precision': 4.273504273504273,
                'recall': 3.816793893129771,
                'f1': 4.03225806451613
            },
            'python-rouge-l': {
                'precision': 28.15126050420168,
                'recall': 25.18796992481203,
                'f1': 26.58730158730159
            }
        }
Example 22
    def _check_micro_list(self, micro_list: List[Metrics]) -> None:
        instances = JsonlReader(MULTILING_SUMMARIES).read()

        assert len(micro_list) == len(instances)
        for micro, instance in zip(micro_list, instances):
            assert micro.instance_id == instance['instance_id']
            assert micro.summarizer_id == instance['summarizer_id']
            assert micro.summarizer_type == instance['summarizer_type']

        assert micro_list[0].metrics == {
            'python-rouge-1': {
                'precision': 41.699867197875164,
                'recall': 40.516129032258064,
                'f1': 41.09947643979057
            },
            'python-rouge-2': {
                'precision': 10.533333333333333,
                'recall': 10.233160621761659,
                'f1': 10.38107752956636
            }
        }
        assert micro_list[11].metrics == {
            'python-rouge-1': {
                'precision': 45.34412955465587,
                'recall': 44.71057884231537,
                'f1': 45.0251256281407
            },
            'python-rouge-2': {
                'precision': 15.24390243902439,
                'recall': 15.030060120240481,
                'f1': 15.136226034308779
            }
        }
Example 23
    def test_en_summary_level(self):
        # Test a few random metrics to ensure the data was parsed correctly
        metrics_list = JsonlReader(_en_file_path, Metrics).read()
        metrics_dicts = self._convert_to_dicts(metrics_list)

        assert metrics_dicts['M004']['1']['rouge-1'][
            'precision'] == pytest.approx(50.525, abs=1e-3)
        assert metrics_dicts['M004']['1']['rouge-1'][
            'recall'] == pytest.approx(50.459, abs=1e-3)
        assert metrics_dicts['M004']['1']['rouge-1']['f1'] == pytest.approx(
            50.492, abs=1e-3)

        assert metrics_dicts['M009']['2']['rouge-2'][
            'precision'] == pytest.approx(17.647, abs=1e-3)
        assert metrics_dicts['M009']['2']['rouge-2'][
            'recall'] == pytest.approx(18.072, abs=1e-3)
        assert metrics_dicts['M009']['2']['rouge-2']['f1'] == pytest.approx(
            17.857, abs=1e-3)

        assert metrics_dicts['M002']['5']['MeMoG'] == pytest.approx(0.32643265,
                                                                    abs=1e-4)
        assert metrics_dicts['M004']['3']['MeMoG'] == pytest.approx(0.57679360,
                                                                    abs=1e-4)

        assert metrics_dicts['M000']['1']['grade'] == [2, 3, 3]
        assert metrics_dicts['M006']['4']['grade'] == [1, 3, 1]

        assert metrics_dicts['M000']['3']['length_aware_grade'] == [
            2.88, 3.83, 3.83
        ]
        assert metrics_dicts['M006']['4']['length_aware_grade'] == [
            1.00, 3.00, 1.00
        ]
Example 24
    def test_task2_metrics(self):
        summary_level_metrics = JsonlReader(_task2_metrics_file_path,
                                            Metrics).read()
        summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

        # Check a few metrics to make sure they are equal to what's in the NIST files. Unlike
        # in "system_level_test.py", these metrics don't have aggregated values in the NIST data,
        # so we just check that a few of them were correctly parsed
        # updateEval/Pyramid/scoring/2007_modified_scores.txt
        assert summary_level_metrics['d0703-A']['39'][
            'modified_pyramid_score'] == pytest.approx(0.2857, 1e-2)
        assert summary_level_metrics['d0711-B']['45'][
            'modified_pyramid_score'] == pytest.approx(0.2353, 1e-2)
        assert summary_level_metrics['d0726-C']['51'][
            'modified_pyramid_score'] == pytest.approx(0.1739, 1e-2)

        assert summary_level_metrics['d0703-A']['39']['num_scus'] == 3
        assert summary_level_metrics['d0711-B']['45']['num_scus'] == 1
        assert summary_level_metrics['d0726-C']['51']['num_scus'] == 2

        assert summary_level_metrics['d0703-A']['39']['num_repetitions'] == 0
        assert summary_level_metrics['d0711-B']['45']['num_repetitions'] == 0
        assert summary_level_metrics['d0726-C']['51']['num_repetitions'] == 0

        assert summary_level_metrics['d0703-A']['39'][
            'content_responsiveness'] == 2
        assert summary_level_metrics['d0711-B']['45'][
            'content_responsiveness'] == 2
        assert summary_level_metrics['d0726-C']['51'][
            'content_responsiveness'] == 2
Example 25
def load_metrics(scores_jsonl: str, summarizer_type: str) -> List[MetricsDict]:
    metrics_list = []
    with JsonlReader(scores_jsonl, Metrics) as f:
        for metrics in f:
            if summarizer_type == 'all' or metrics.summarizer_type == summarizer_type:
                metrics_list.append(metrics.metrics)
    return metrics_list
Example 26
    def test_system_level(self):
        summary_level_metrics = JsonlReader(_metrics_file_path, Metrics).read()
        summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

        # Check a few metrics to make sure they are equal to what's in the NIST files
        # Pyramid/DUC2005/processed_pans.txt
        assert summary_level_metrics['d311']['14']['pyramid_score'] == pytest.approx(0.5521, 1e-2)
        assert summary_level_metrics['d345']['31']['pyramid_score'] == 0.0
        assert summary_level_metrics['d400']['A']['pyramid_score'] == pytest.approx(0.5094, 1e-2)

        assert summary_level_metrics['d311']['14']['modified_pyramid_score'] == pytest.approx(0.3869, 1e-2)
        assert summary_level_metrics['d345']['31']['modified_pyramid_score'] == 0.0
        assert summary_level_metrics['d400']['A']['modified_pyramid_score'] == pytest.approx(0.4576, 1e-2)

        assert summary_level_metrics['d311']['14']['num_scus'] == 10
        assert summary_level_metrics['d345']['31']['num_scus'] == 0
        assert summary_level_metrics['d400']['A']['num_scus'] == 23

        assert summary_level_metrics['d311']['14']['num_repetitions'] == 7
        assert summary_level_metrics['d345']['31']['num_repetitions'] == 0
        assert summary_level_metrics['d400']['A']['num_repetitions'] == 7

        assert summary_level_metrics['d311']['14']['responsiveness'] == 3
        assert summary_level_metrics['d345']['31']['responsiveness'] == 1
        assert summary_level_metrics['d400']['A']['responsiveness'] == 5
Example 27
    def test_system_level(self):
        summary_level_metrics = JsonlReader(_metrics_file_path, Metrics).read()
        summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

        # Check a few metrics to make sure they are equal to what's in the NIST files
        # DUC2006pyramiddata/scoring/2006_modified_scores.txt
        assert summary_level_metrics['d0603']['32'][
            'modified_pyramid_score'] == pytest.approx(0.1048, 1e-2)
        assert summary_level_metrics['d0616']['10'][
            'modified_pyramid_score'] == pytest.approx(0.3103, 1e-2)
        assert summary_level_metrics['d0628']['6'][
            'modified_pyramid_score'] == pytest.approx(0.0769, 1e-2)

        assert summary_level_metrics['d0603']['32']['num_scus'] == 6
        assert summary_level_metrics['d0616']['10']['num_scus'] == 7
        assert summary_level_metrics['d0628']['6']['num_scus'] == 2

        assert summary_level_metrics['d0603']['32']['num_repetitions'] == 0
        assert summary_level_metrics['d0616']['10']['num_repetitions'] == 0
        assert summary_level_metrics['d0628']['6']['num_repetitions'] == 0

        assert summary_level_metrics['d0603']['32'][
            'content_responsiveness'] == 3
        assert summary_level_metrics['d0616']['10'][
            'content_responsiveness'] == 4
        assert summary_level_metrics['d0628']['6'][
            'content_responsiveness'] == 2
Example 28
    def test_rouge(self):
        # Test the first several instances in the TAC 2008 data to ensure that
        # our computation of ROUGE matches the values released by NIST
        instances = ReferenceBasedDatasetReader().read(_summaries_file_path)
        metrics_list = JsonlReader(_metrics_file_path, Metrics).read()
        metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-su4', 'rouge-w-1.2']
        rouge = Rouge(max_ngram=4,
                      use_porter_stemmer=True,
                      remove_stopwords=False,
                      compute_rouge_l=True,
                      skip_bigram_gap_length=4,
                      wlcs_weight=1.2)

        peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
        reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

        num_to_check = 25
        actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
        for expected_metrics in peer_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric] == expected_metrics.metrics[metric]
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']

        actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
        for expected_metrics in reference_metrics[:num_to_check]:
            instance_id = expected_metrics.instance_id
            summarizer_id = expected_metrics.summarizer_id
            actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]

            for metric in metric_names:
                assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
Example 29
def load_peers(file_path: str):
    instance_to_peers = defaultdict(dict)
    with JsonlReader(file_path) as f:
        for instance in f:
            instance_to_peers[instance['instance_id']][
                instance['summarizer_id']] = instance
    return instance_to_peers
Example 30
def load_questions(file_path: str):
    questions = {}
    for instance in JsonlReader(file_path).read():
        for reference in instance['references']:
            for question in reference['questions']:
                questions[question['question_id']] = question
    return questions
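All of the examples above rely on the same small JSON Lines interface: JsonlReader(path) yields plain dicts, JsonlReader(path, SomeClass) yields deserialized objects, both support .read() and context-manager iteration, and JsonlWriter(path).write(obj) writes one object per line. The following is a minimal stand-in sketch, not the library's actual implementation, assuming uncompressed UTF-8 files and that typed records can be constructed as cls(**fields):

import json
import os
from typing import Any, Iterator, List, Optional, Type


class JsonlReader:
    # Minimal stand-in reader: each line of the file is one JSON object,
    # optionally deserialized into cls(**fields).
    def __init__(self, file_path: str, cls: Optional[Type] = None) -> None:
        self.file_path = file_path
        self.cls = cls

    def __enter__(self) -> 'JsonlReader':
        self._file = open(self.file_path, 'r', encoding='utf-8')
        return self

    def __exit__(self, *exc) -> None:
        self._file.close()

    def __iter__(self) -> Iterator[Any]:
        for line in self._file:
            data = json.loads(line)
            yield self.cls(**data) if self.cls is not None else data

    def read(self) -> List[Any]:
        # Convenience wrapper that loads the entire file into memory
        with self as f:
            return list(f)


class JsonlWriter:
    # Minimal stand-in writer: serializes one object per line.
    def __init__(self, file_path: str) -> None:
        self.file_path = file_path

    def __enter__(self) -> 'JsonlWriter':
        # Some examples write into nested output directories, so create parents
        dirname = os.path.dirname(self.file_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        self._file = open(self.file_path, 'w', encoding='utf-8')
        return self

    def __exit__(self, *exc) -> None:
        self._file.close()

    def write(self, obj: Any) -> None:
        # Dicts are written as-is; other objects fall back to their __dict__,
        # which is an assumption and may differ from the real serialization.
        data = obj if isinstance(obj, dict) else vars(obj)
        self._file.write(json.dumps(data) + '\n')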