Example #1
def evaluate_instances(
        instances: List[EvalInstance],
        metrics: List[Metric]) -> Tuple[MetricsDict, List[Metrics]]:
    macro = MetricsDict()
    micro_list = get_initial_micro_list(instances)

    for metric in metrics:
        # Prepare the input arguments
        summary_args = []
        for field in metric.required_summary_fields:
            summary_args.append(
                [instance.fields[field].to_input() for instance in instances])

        context_args = []
        for field in metric.required_context_fields:
            context_args.append(
                [instance.fields[field].to_input() for instance in instances])

        # Score all the summaries
        this_macro, this_micro_list = metric.evaluate(*summary_args,
                                                      *context_args)

        # Update the global metrics dictionaries
        macro.update(this_macro)
        for micro, this_micro in zip(micro_list, this_micro_list):
            micro.metrics.update(this_micro)

    return macro, micro_list
Example #2
    def _run(self,
             summary_index_to_scus: List[Set[int]],
             reference_index_to_scus_list: List[List[Set[int]]],
             matches_list: List[List[Tuple[int, int, float]]],
             precision_weights: List[float],
             recall_weights_list: List[List[float]]):

        standard_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0, 'scu_weight': 0, 'non_scu_weight': 0})
        scu_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0})
        non_scu_counts = MetricsDict({'weight': 0, 'summary_weight': 0, 'reference_weight': 0})

        for matches, reference_index_to_scus, recall_weights in zip(matches_list, reference_index_to_scus_list, recall_weights_list):
            # Filter the SCUs to just those which the summary and reference have in common
            valid_scus = self._get_scu_intersection(summary_index_to_scus, reference_index_to_scus)
            this_summary_index_to_scus = self._filter_index_to_scus(summary_index_to_scus, valid_scus)
            this_reference_index_to_scus = self._filter_index_to_scus(reference_index_to_scus, valid_scus)

            standard_counts += self.backend.calculate_standard_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)
            scu_counts += self.backend.calculate_scu_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)
            non_scu_counts += self.backend.calculate_non_scu_metric(this_summary_index_to_scus, this_reference_index_to_scus, precision_weights, recall_weights, matches)

        self._add_pr(standard_counts)
        self._add_pr(scu_counts)
        self._add_pr(non_scu_counts)
        return MetricsDict({
            f'{self.name}-standard': standard_counts,
            f'{self.name}-scu': scu_counts,
            f'{self.name}-non-scu': non_scu_counts
        })
Example #3
    def test_init_with_metrics_dict(self):
        a = MetricsDict({'k1': 1, 'k2': {'k3': [1, 2, 3]}})
        b = MetricsDict(a)

        b['k2']['k3'].append(4)
        assert a == {'k1': 1, 'k2': {'k3': [1, 2, 3]}}
        assert b == {'k1': 1, 'k2': {'k3': [1, 2, 3, 4]}}
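This test only passes if the MetricsDict constructor copies nested containers instead of aliasing them. The sketch below illustrates that behavior with a hypothetical NestedDict class (not the real MetricsDict implementation): nested dicts are re-wrapped recursively and lists are copied, so mutating the copy leaves the original untouched.

from typing import Any, Dict, Optional

class NestedDict(dict):
    """Hypothetical stand-in illustrating the copy-on-init behavior tested above."""

    def __init__(self, initial: Optional[Dict[str, Any]] = None) -> None:
        super().__init__()
        if initial is not None:
            for key, value in initial.items():
                self[key] = value

    def __setitem__(self, key: str, value: Any) -> None:
        if isinstance(value, dict):
            value = NestedDict(value)  # recursively re-wrap (and copy) nested dicts
        elif isinstance(value, list):
            value = list(value)  # copy lists so they are not shared between instances
        super().__setitem__(key, value)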
Example #4
    def test_add(self):
        a = MetricsDict({'k1': 1, 'k2': {'k3': 4}})
        b = MetricsDict({'k1': 2, 'k2': {'k3': 5}})
        c = MetricsDict({'k1': 3, 'k2': {'k3': 6}})

        assert a + b == {'k1': 3, 'k2': {'k3': 9}}
        assert a == {'k1': 1, 'k2': {'k3': 4}}
        assert b == {'k1': 2, 'k2': {'k3': 5}}

        assert sum([a, b, c]) == {'k1': 6, 'k2': {'k3': 15}}
        assert a == {'k1': 1, 'k2': {'k3': 4}}
        assert b == {'k1': 2, 'k2': {'k3': 5}}
        assert c == {'k1': 3, 'k2': {'k3': 6}}
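The element-wise addition shown here combines with division to average metrics across references, which is how Example #14 below computes sum(metrics) / len(metrics). A short usage sketch, assuming division is applied element-wise to every nested value:

per_reference = [
    MetricsDict({'rouge-1': {'recall': 40.0}}),
    MetricsDict({'rouge-1': {'recall': 50.0}}),
]
# sum() works because adding a MetricsDict to the int 0 start value is supported,
# and '/' is assumed to divide each nested value by the scalar.
average = sum(per_reference) / len(per_reference)
assert average == {'rouge-1': {'recall': 45.0}}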
Example #5
def _load_generic_scores(input_file: str):
    data = json.load(open(input_file, 'r'))
    instances = []
    metrics_list = []
    documents = {}
    for i, instance in enumerate(data):
        document = instance['text'].strip()
        summary = instance['summary'].strip()
        scores = instance['scores']
        summarizer_id = str(i)

        if document not in documents:
            documents[document] = str(len(documents))
        instance_id = documents[document]

        instances.append({
            'instance_id': instance_id,
            'summarizer_id': summarizer_id,
            'summarizer_type': 'peer',
            'summary': {
                'text': summary
            },
            'document': {
                'text': document
            }
        })
        metrics_list.append(
            Metrics(instance_id, summarizer_id, 'peer',
                    MetricsDict({'generic_quality': scores})))
    return instances, metrics_list
Example #6
    def select_matches(self,
                       summary_tokens: List[Token],
                       reference_tokens: List[Token],
                       matches: List[Tuple[int, int]],
                       intersection: int):
        common_matches = []
        for i, j in matches:
            summary_token = summary_tokens[i]
            reference_token = reference_tokens[j]
            if self.is_match(summary_token, reference_token):
                common_matches.append((i, j, 1.0))

        num_matches = calculate_maximum_matching(common_matches)
        num_summary_tokens = sum(1 for token in summary_tokens if self.is_candidate(token))
        num_reference_tokens = sum(1 for token in reference_tokens if self.is_candidate(token))

        precision = num_matches / num_summary_tokens * 100 if num_summary_tokens > 0 else 0
        recall = num_matches / num_reference_tokens * 100 if num_reference_tokens > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        contribution = num_matches / intersection * 100 if intersection > 0 else 0

        return MetricsDict({
            self.name: {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'contribution': contribution
            }
        })
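Example #7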
def write_table(metrics1: MetricsDict, metrics2: MetricsDict,
                difference: MetricsDict, rel_difference: MetricsDict,
                output_path: str) -> None:
    # Put all of the data into tuples and sort by the relative difference
    data = []
    for key in rel_difference.keys():
        if 'f1' in rel_difference[key]:
            value1 = metrics1[key]['f1']
            value2 = metrics2[key]['f1']
            data.append((key, value1, value2, difference[key]['f1'],
                         rel_difference[key]['f1']))
    data.sort(key=lambda t: -t[4])

    # Prepare the lines for writing
    lines = []
    for category, value1, value2, diff, rel_diff in data:
        lines.append(' & '.join([
            category, f'{value1:.1f}', f'{value2:.1f}', f'{diff:.1f}',
            f'{rel_diff:.1f}'
        ]) + ' \\\\')

    dirname = os.path.dirname(output_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(output_path, 'w') as out:
        out.write('\n'.join(lines))
Example #8
    def calculate_scu_metric(
            self, summary_index_to_scus: List[Set[int]],
            reference_index_to_scus: List[Set[int]],
            summary_weights: List[float], reference_weights: List[float],
            matches: List[Tuple[int, int, float]]) -> MetricsDict:
        summary_scu_to_indices = self._get_scu_to_indices(
            summary_index_to_scus)
        reference_scu_to_indices = self._get_scu_to_indices(
            reference_index_to_scus)

        all_matches = []
        for scu in summary_scu_to_indices.keys():
            summary_indices = summary_scu_to_indices[scu]
            reference_indices = reference_scu_to_indices[scu]
            scu_matches = self._get_matches(summary_indices, reference_indices,
                                            matches)
            all_matches.extend(scu_matches)

        intersection = calculate_maximum_matching(all_matches)
        return MetricsDict({
            'weight': intersection,
            'summary_weight': self._sum_scu_token_weight(summary_index_to_scus, summary_weights),
            'reference_weight': self._sum_scu_token_weight(reference_index_to_scus, reference_weights)
        })
Example #9
    def _parse_output_file(self, file_path: str) -> List[List[MetricsDict]]:
        metrics_dicts = defaultdict(dict)
        with open(file_path, 'r') as f:
            for i, line in enumerate(f):
                # Header
                if i == 0:
                    continue
                columns = line.split('\t')
                if len(columns) != 5:
                    raise Exception(f'Expected 5 columns: {line}')

                instance_index = int(columns[0])
                summarizer_index = int(columns[1])
                metrics_dicts[instance_index][summarizer_index] = MetricsDict({
                    'AutoSummENG': float(columns[2]),
                    'MeMoG': float(columns[3]),
                    'NPowER': float(columns[4])
                })

        metrics_lists = []
        for i in range(len(metrics_dicts)):
            metrics_lists.append([])
            for j in range(len(metrics_dicts[i])):
                metrics_lists[-1].append(metrics_dicts[i][j])
        return metrics_lists
Example #10
    def select_matches(self,
                       summary_tokens: List[Token],
                       reference_tokens: List[Token],
                       matches: List[Tuple[int, int]],
                       weights: List[float],
                       metric: str,
                       backend: Backend) -> Tuple[List[Tuple[int, int]], MetricsDict]:
        common_matches = []
        for i, j, weight in matches:
            summary_token = summary_tokens[i]
            reference_token = reference_tokens[j]
            if self.is_match(summary_token, reference_token):
                common_matches.append((i, j, weight))

        norm_weight = 0
        if metric == 'precision':
            tokens = summary_tokens
        else:
            tokens = reference_tokens
        for i, token in enumerate(tokens):
            if self.is_candidate(token):
                norm_weight += weights[i]

        matching_weight = backend.get_total_weight(common_matches)
        metrics = MetricsDict({
            self.name: {
                f'{metric}_weight': matching_weight,
                f'{metric}_norm_weight': norm_weight,
            }
        })

        return common_matches, metrics
Example #11
    def select_matches(self,
                       summary_tokens: List[Token],
                       reference_tokens: List[Token],
                       matches: List[Tuple[int, int]],
                       weights: List[float],
                       metric: str,
                       backend: Backend) -> Tuple[List[Tuple[int, int]], MetricsDict]:
        summary_tuples = self.get_tuples(summary_tokens)
        reference_tuples = self.get_tuples(reference_tokens)

        if isinstance(backend, BertScoreBackend):
            common_matches, total_weight = self._select_matches_bert(summary_tuples, reference_tuples, matches)
        else:
            common_matches, total_weight = self._select_matches_rouge(summary_tuples, reference_tuples, matches)

        if metric == 'precision':
            tuples = summary_tuples
        else:
            tuples = reference_tuples

        norm_weight = 0
        norm_indices = set()
        for tup in tuples:
            for i in tup.values():
                norm_indices.add(i)
        for i in norm_indices:
            norm_weight += weights[i]

        return common_matches, MetricsDict({
            self.name: {
                f'{metric}_weight': total_weight,
                f'{metric}_norm_weight': norm_weight,
            }
        })
Example #12
    def test_pyramid_score(self):
        # This is a regression test, not necessarily a test for correctness
        pyramids = {
            pyramid.instance_id: pyramid
            for pyramid in JsonlReader(_pyramid_file_path, Pyramid).read()
        }
        annotations = JsonlReader(_annotation_file_path,
                                  PyramidAnnotation).read()
        annotation_pyramids = [
            pyramids[annotation.instance_id] for annotation in annotations
        ]

        metric = PyramidScore()
        actual_output = metric.score_all(annotations, annotation_pyramids)[:5]
        expected_output = [{
            'modified_pyramid_score': 0.2413793103448276
        }, {
            'modified_pyramid_score': 0.0
        }, {
            'modified_pyramid_score': 0.06896551724137931
        }, {
            'modified_pyramid_score': 0.034482758620689655
        }, {
            'modified_pyramid_score': 0.1724137931034483
        }]
        for i, (expected,
                actual) in enumerate(zip(expected_output, actual_output)):
            assert actual.approx_equal(
                MetricsDict(expected), abs=1e-4
            ), f'Instance {i} not equal. Expected {expected}, actual {actual}'
Example #13
    def _run(self,
             summary: SummaryType,
             annotation: PyramidAnnotation,
             pyramid: Pyramid) -> MetricsDict:
        summary_all_scus_to_offsets = self._get_summary_scu_to_offsets(annotation)

        standard_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0, 'num_scu_matches': 0, 'num_non_scu_matches': 0})
        scu_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0})
        non_scu_counts = MetricsDict({'intersection': 0, 'num_summary_tokens': 0, 'num_reference_tokens': 0})

        total_common_scus = 0
        for i, reference in enumerate(pyramid.summaries):
            reference_all_scus_to_offsets = self._get_reference_scu_to_offsets(pyramid, i)
            valid_scus = self._get_scu_intersection(annotation, pyramid, i)
            total_common_scus += len(valid_scus)

            # Take only the SCUs which are common between the summary and reference
            summary_scus_to_offsets = self._filter_scu_to_offsets(summary_all_scus_to_offsets, valid_scus)
            reference_scus_to_offsets = self._filter_scu_to_offsets(reference_all_scus_to_offsets, valid_scus)

            # Tokenize each
            summary_tokens, summary_index_to_scus = self._tokenize(annotation.summary, summary_scus_to_offsets)
            reference_tokens, reference_index_to_scus = self._tokenize(reference, reference_scus_to_offsets)

            # Compute ROUGE
            standard_counts += self._compute_standard_rouge(summary_tokens, summary_index_to_scus,
                                                            reference_tokens, reference_index_to_scus)

            scu_counts += self._compute_scu_rouge(summary_tokens, summary_index_to_scus,
                                                  reference_tokens, reference_index_to_scus)

            non_scu_counts += self._compute_non_scu_rouge(summary_tokens, summary_index_to_scus,
                                                          reference_tokens, reference_index_to_scus)

        avg_common_scus = total_common_scus / len(pyramid.summaries)

        self._add_pr(standard_counts)
        self._add_pr(scu_counts)
        self._add_pr(non_scu_counts)
        return MetricsDict({
            'common_scus': avg_common_scus,
            'standard-rouge': standard_counts,
            'scu-rouge': scu_counts,
            'non-scu-rouge': non_scu_counts,
        })
Example #14
    def _score(
            self, answered_questions_list: List[List[AnsweredQuestion]]
    ) -> MetricsDict:
        # Average over references
        metrics = []
        for answered_questions in answered_questions_list:
            metrics.append(self._score_reference(answered_questions))
        final_metrics = sum(metrics) / len(metrics)
        return MetricsDict({'qa-eval': final_metrics})
Example #15
    def score_multi_all(self, summaries_list: List[List[SummaryType]],
                        references_list: List[List[ReferenceType]],
                        **kwargs) -> List[List[MetricsDict]]:
        summaries_list = self._flatten_summaries(summaries_list)
        references_list = self._flatten_summaries(references_list)

        logger.info(f'Serializing the summaries and references to a file')
        num_summaries = 0
        with TemporaryDirectory() as temp_dir:
            input_file = f'{temp_dir}/input.jsonl'
            output_file = f'{temp_dir}/output.jsonl'
            with JsonlWriter(input_file) as out:
                for summaries, references in zip(summaries_list,
                                                 references_list):
                    for summary in summaries:
                        out.write({
                            'summary': summary,
                            'references': references
                        })
                        num_summaries += 1
            logger.info(f'Wrote {num_summaries} (summary, references) pairs')

            commands = [f'cd {self.s3_root}/S3']
            if self.environment_name is not None:
                commands.append(f'source {os.environ["CONDA_INIT"]}')
                commands.append(f'conda activate {self.environment_name}')
            commands.append(
                f'python2.7 run_batch.py {input_file} {output_file} {self.embeddings_file} {self.model_dir}'
            )
            command = ' && '.join(commands)

            logger.info(f'Running command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            process.communicate()

            scores = JsonlReader(output_file).read()
            assert len(scores) == num_summaries

            metrics_list = []
            index = 0
            for summaries in summaries_list:
                metrics_list.append([])
                for _ in summaries:
                    metrics_list[-1].append(
                        MetricsDict({
                            's3': {
                                'pyr': scores[index]['pyr'],
                                'resp': scores[index]['resp'],
                            }
                        }))
                    index += 1
            return metrics_list
Example #16
    def score_multi_all(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[ReferenceType]]
    ) -> List[List[MetricsDict]]:
        summaries_list = [[
            self.preprocess_summary(summary) for summary in summaries
        ] for summaries in summaries_list]
        references_list = [[
            self.preprocess_summary(reference) for reference in references
        ] for references in references_list]

        metrics_lists = []
        for summaries, references in zip(summaries_list, references_list):
            metrics_list = [MetricsDict() for _ in summaries]

            for n in self.ngram_orders:
                reference_ngrams_list = [
                    self._count_ngrams(reference, n)
                    for reference in references
                ]

                for i, summary in enumerate(summaries):
                    total_reference_count = 0
                    total_summary_count = 0
                    total_intersection = 0

                    summary_ngrams = self._count_ngrams(summary, n)
                    for reference_ngrams in reference_ngrams_list:
                        reference_total, summary_total, intersection = self._calculate_intersection(
                            reference_ngrams, summary_ngrams)

                        total_reference_count += reference_total
                        total_summary_count += summary_total
                        total_intersection += intersection

                    precision, recall, f1 = self._calculate_pr_f1(
                        total_reference_count, total_summary_count,
                        total_intersection)
                    metrics_list[i][f'python-rouge-{n}'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1,
                    }

            if self.compute_rouge_l:
                for i, summary in enumerate(summaries):
                    precision, recall, f1 = self._calculate_rouge_l(
                        references, summary)
                    metrics_list[i]['python-rouge-l'] = {
                        'precision': precision,
                        'recall': recall,
                        'f1': f1
                    }

            metrics_lists.append(metrics_list)
        return metrics_lists
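Example #17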
def main(args):
    metrics1 = MetricsDict(
        json.loads(open(args.metrics_json1, 'r').read())['metrics'])
    metrics2 = MetricsDict(
        json.loads(open(args.metrics_json2, 'r').read())['metrics'])

    # If you calculate the differences based on the true values and then do rounding, the results
    # look a little weird in the table because the rounded values no longer make sense.
    # For example (26.61 - 21.95 = 4.66; The table would show 26.6 - 22.0 = 4.7). Therefore, we
    # first round the numbers, then calculate the differences. This shouldn't have any major
    # impact on the results, but it will avoid any confusion from the reader.
    round_metrics(metrics1, 1)
    round_metrics(metrics2, 1)

    difference = metrics2 - metrics1
    rel_difference = calculate_relative_difference(metrics1, difference)

    write_table(metrics1, metrics2, difference, rel_difference,
                args.output_tex)
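Example #18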
    def run(self, args):
        params = Params.from_file(args.config, args.overrides)
        dataset_reader = DatasetReader.from_params(
            params.pop('dataset_reader'))
        metrics = load_metrics(params)

        instances = dataset_reader.read()
        summaries = [instance.summary for instance in instances]

        macro = MetricsDict()
        micro_list = get_initial_micro_list(instances)

        for metric in metrics:
            # Prepare the extra input arguments
            eval_args = []
            for field in metric.required_fields:
                eval_args.append(
                    [instance.fields[field] for instance in instances])

            # Score all the summaries
            this_macro, this_micro_list = metric.evaluate(
                summaries, *eval_args)

            # Update the global metrics dictionaries
            macro.update(this_macro)
            for micro, this_micro in zip(micro_list, this_micro_list):
                micro.metrics.update(this_micro)

        dirname = os.path.dirname(args.macro_output_json)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        serialized_macro = jsons.dumps({'metrics': macro},
                                       jdkwargs={'indent': 2})
        with open(args.macro_output_json, 'w') as out:
            out.write(serialized_macro)
        if not args.silent:
            print(serialized_macro)

        with JsonlWriter(args.micro_output_jsonl) as out:
            for metrics_dict in micro_list:
                out.write(metrics_dict)
Example #19
    def _compute_standard_rouge(self,
                                summary_tokens: List[str],
                                summary_index_to_scus: List[Set[int]],
                                reference_tokens: List[str],
                                reference_index_to_scus: List[Set[int]]) -> MetricsDict:
        # This is the standard ROUGE calculation except the SCU-based matches are
        # given priority over non-SCU matches to maximize the percentage of the
        # ROUGE score the SCU matches contribute.
        summary_scu_to_indices = self._get_scu_to_indices(summary_index_to_scus)
        reference_scu_to_indices = self._get_scu_to_indices(reference_index_to_scus)

        all_matches = []
        for scu in summary_scu_to_indices.keys():
            summary_indices = summary_scu_to_indices[scu]
            reference_indices = reference_scu_to_indices[scu]
            matches = self._get_matches(summary_tokens, summary_indices,
                                        reference_tokens, reference_indices)
            all_matches.extend(matches)
        num_scu_matches, matching = calculate_maximum_matching(all_matches, return_matching=True)

        # Mark which tokens were matched and are therefore no longer eligible
        summary_matches = [False] * len(summary_tokens)
        references_matches = [False] * len(reference_tokens)
        for i, j in matching:
            summary_matches[i] = True
            references_matches[j] = True

        summary_indices = [i for i in range(len(summary_tokens)) if not summary_matches[i]]
        reference_indices = [i for i in range(len(reference_tokens)) if not references_matches[i]]
        matches = self._get_matches(summary_tokens, summary_indices,
                                    reference_tokens, reference_indices)
        num_non_scu_matches = calculate_maximum_matching(matches)

        intersection = num_scu_matches + num_non_scu_matches
        m = MetricsDict({
            'intersection': intersection,
            'num_summary_tokens': len(summary_tokens),
            'num_reference_tokens': len(reference_tokens),
            'num_scu_matches': num_scu_matches,
            'num_non_scu_matches': num_non_scu_matches,
        })
        return m
Example #20
    def _assert_expected_output(self, metric: Metric,
                                expected_output: List[MetricsDict], *args):
        """Ensures that the output from `score_all` is equal to the `expected_output`."""
        assert len(self.summaries) == len(expected_output)
        actual_output = metric.score_all(self.summaries, *args)
        assert len(actual_output) == len(expected_output)
        for i, (expected,
                actual) in enumerate(zip(expected_output, actual_output)):
            assert actual.approx_equal(
                MetricsDict(expected), abs=1e-4
            ), f'Instance {i} not equal. Expected {expected}, actual {actual}'
Example #21
        def _run(self,
                 summaries_list: List[List[SummaryType]],
                 references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
            summaries_list = [[flatten(summary) for summary in summaries] for summaries in summaries_list]
            references_list = [[flatten(reference) for reference in references] for references in references_list]

            # Create the candidate and reference lists for passing to the scoring function
            input_candidates = []
            input_references = []
            empty_inputs = set()
            for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
                for j, summary in enumerate(summaries):
                    if len(summary) == 0:
                        empty_inputs.add((i, j))
                    else:
                        input_candidates.append(summary)
                        input_references.append(references)

            # Score the summaries
            precisions, recalls, f1s = bert_score.score(
                input_candidates,
                input_references,
                model_type=self.model_type,
                num_layers=self.num_layers,
                idf=False,
                nthreads=self.nthreads,
                batch_size=self.batch_size,
                lang=self.lang,
                verbose=self.verbose
            )

            # Remap the scores to the summaries
            index = 0
            metrics_lists = []
            for i, summaries in enumerate(summaries_list):
                metrics_lists.append([])
                for j, summary in enumerate(summaries):
                    if (i, j) in empty_inputs:
                        precision, recall, f1 = 0.0, 0.0, 0.0
                    else:
                        precision = precisions[index].item()
                        recall = recalls[index].item()
                        f1 = f1s[index].item()
                        index += 1

                    metrics_lists[-1].append(MetricsDict({
                        'bertscore': {
                            'precision': precision,
                            'recall': recall,
                            'f1': f1,
                        }
                    }))

            return metrics_lists
Example #22
        def _run(
            self, summaries_list: List[List[SummaryType]],
            references_list: List[List[SummaryType]]
        ) -> List[List[MetricsDict]]:
            summaries_list = self._flatten_summaries(summaries_list)
            references_list = self._flatten_summaries(references_list)

            unique_summaries = self._get_unique_summaries(summaries_list)
            unique_references = self._get_unique_summaries(references_list)

            idf_dict_summaries = get_idf_dict(unique_summaries)
            idf_dict_references = get_idf_dict(unique_references)

            # Prepare the inputs into flat lists for faster processing. The
            # indices will keep track of which item the score belongs to
            indices = []
            input_summaries = []
            input_references = []
            for i, (summaries, references) in enumerate(
                    zip(summaries_list, references_list)):
                for j, summary in enumerate(summaries):
                    for reference in references:
                        indices.append((i, j))
                        input_summaries.append(summary)
                        input_references.append(reference)

            # Score all of the data
            scores = word_mover_score(input_references,
                                      input_summaries,
                                      idf_dict_references,
                                      idf_dict_summaries,
                                      self.stopwords,
                                      n_gram=1,
                                      remove_subwords=True,
                                      batch_size=48)

            # Compute the mean over the references
            indices_to_scores = defaultdict(list)
            for pair, score in zip(indices, scores):
                indices_to_scores[pair].append(score)

            indices_to_score = {}
            for pair, scores in indices_to_scores.items():
                indices_to_score[pair] = np.mean(scores)

            # Put back into lists
            metrics_dict_lists = []
            for i in range(len(summaries_list)):
                metrics_dict_lists.append([])
                for j in range(len(summaries_list[i])):
                    metrics_dict_lists[-1].append(
                        MetricsDict({'MoverScore': indices_to_score[(i, j)]}))
            return metrics_dict_lists
Example #23
    def select_matches(self,
                       summary_tokens: List[Token],
                       reference_tokens: List[Token],
                       matches: List[Tuple[int, int]],
                       intersection: int):
        summary_tuples = self.get_tuples(summary_tokens)
        reference_tuples = self.get_tuples(reference_tokens)
        matches = set(matches)

        # Figure out the list of tuple-level matches based on whether the tuples
        # match each other completely
        tuple_matches = []
        for s_i, summary_tuple in enumerate(summary_tuples):
            for r_j, reference_tuple in enumerate(reference_tuples):
                assert len(summary_tuple) == len(reference_tuple)
                # See if each component of these two tuples can be aligned
                matched = True
                for key, i in summary_tuple.items():
                    if key not in reference_tuple:
                        matched = False
                        break
                    j = reference_tuple[key]
                    if (i, j) not in matches:
                        matched = False
                        break

                if matched:
                    tuple_matches.append((s_i, r_j, len(summary_tuple)))

        # Calculate the weight of the matched tuples, only allowing each tuple to be matched once.
        # The tuples form an equivalence class, so it doesn't matter exactly what match we use. This is
        # equivalent to calculating the size of the maximum matching in a bipartite graph where the
        # two disjoint sets of vertices are the summary and reference tuples, and an edge exists between
        # them if they were matched
        total_weight = calculate_maximum_matching(tuple_matches)

        summary_tuples_weight = sum(len(tup) for tup in summary_tuples)
        reference_tuples_weight = sum(len(tup) for tup in reference_tuples)

        precision = total_weight / summary_tuples_weight * 100 if summary_tuples_weight > 0 else 0
        recall = total_weight / reference_tuples_weight * 100 if reference_tuples_weight > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        contribution = total_weight / intersection * 100 if intersection > 0 else 0

        return MetricsDict({
            self.name: {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'contribution': contribution
            }
        })
Example #24
    def _run(
            self, summaries_list: List[List[SummaryType]]
    ) -> List[List[MetricsDict]]:
        with TemporaryDirectory() as temp_dir:
            summaries_file = f'{temp_dir}/summaries.jsonl'
            predictions_file = f'{temp_dir}/predictions.json'

            # Save all of the summaries to a file
            with JsonlWriter(summaries_file) as out:
                for summaries in summaries_list:
                    for summary in summaries:
                        out.write({'summary': self._flatten_summary(summary)})

            commands = [f'cd {self.sum_qe_root}']
            if self.environment_name:
                commands += [f'source activate {self.environment_name}']
            commands += [
                ' '.join([
                    'python', '-m', 'src.BERT_experiments.predict',
                    summaries_file, self.model_file, predictions_file
                ])
            ]

            redirect = None if self.verbose else PIPE
            process = Popen(' && '.join(commands),
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            stdout, stderr = process.communicate()

            predictions = json.loads(open(predictions_file, 'r').read())

            index = 0
            metrics_lists = []
            for summaries in summaries_list:
                metrics_lists.append([])
                for summary in summaries:
                    preds = predictions[index]
                    metrics_lists[-1].append(
                        MetricsDict({
                            'SumQE': {
                                'Q1': preds[0],
                                'Q2': preds[1],
                                'Q3': preds[2],
                                'Q4': preds[3],
                                'Q5': preds[4]
                            }
                        }))
                    index += 1

            return metrics_lists
Example #25
    def score_multi_all(self,
                        summaries_list: List[List[SummaryType]],
                        references_list: List[List[ReferenceType]]) -> List[List[MetricsDict]]:
        metrics_dict_lists = []
        for summaries, references in zip(summaries_list, references_list):
            metrics_dict_lists.append([])
            for summary in summaries:
                summary_value = float(summary)
                total = 0
                for reference in references:
                    reference_value = float(reference)
                    total += summary_value * reference_value
                metrics_dict_lists[-1].append(MetricsDict({'test': total}))
        return metrics_dict_lists
Example #26
    def calculate_standard_metric(
            self, summary_index_to_scus: List[Set[int]],
            reference_index_to_scus: List[Set[int]],
            summary_weights: List[float], reference_weights: List[float],
            matches: List[Tuple[int, int, float]]) -> MetricsDict:
        # This is the standard ROUGE calculation except the SCU-based matches are
        # given priority over non-SCU matches to maximize the percentage of the
        # ROUGE score the SCU matches contribute.
        summary_scu_to_indices = self._get_scu_to_indices(
            summary_index_to_scus)
        reference_scu_to_indices = self._get_scu_to_indices(
            reference_index_to_scus)

        all_matches = []
        for scu in summary_scu_to_indices.keys():
            summary_indices = summary_scu_to_indices[scu]
            reference_indices = reference_scu_to_indices[scu]
            scu_matches = self._get_matches(summary_indices, reference_indices,
                                            matches)
            all_matches.extend(scu_matches)
        num_scu_matches, matching = calculate_maximum_matching(
            all_matches, return_matching=True)

        # Mark which tokens were matched and are therefore no longer eligible
        summary_matches = [False] * len(summary_index_to_scus)
        references_matches = [False] * len(reference_index_to_scus)
        for i, j in matching:
            summary_matches[i] = True
            references_matches[j] = True

        summary_indices = [
            i for i in range(len(summary_index_to_scus))
            if not summary_matches[i]
        ]
        reference_indices = [
            i for i in range(len(reference_index_to_scus))
            if not references_matches[i]
        ]
        non_scus_matches = self._get_matches(summary_indices,
                                             reference_indices, matches)
        num_non_scu_matches = calculate_maximum_matching(non_scus_matches)

        intersection = num_scu_matches + num_non_scu_matches
        return MetricsDict({
            'weight': intersection,
            'summary_weight': sum(summary_weights),
            'reference_weight': sum(reference_weights),
            'scu_weight': num_scu_matches,
            'non_scu_weight': num_non_scu_matches,
        })
Example #27
    def _score_summaries(
            self, array_index_to_tgt_index: List[int]) -> List[MetricsDict]:
        logger.info('Building pyramids and scoring peers')

        # Each step can be run by piping its ID into the pyreval.py program.
        #   4: pyramid
        #   5 -t: score (-t means to write the results to file)
        for args in ['4', '5 -t']:
            commands = [f'cd {self.pyreval_root}']
            if self.environment_name is not None:
                commands.append(f'source {os.environ["CONDA_INIT"]}')
                commands.append(f'conda activate {self.environment_name}')
            commands.append(f'echo {args} | python2.7 pyreval.py')
            command = ' && '.join(commands)

            logger.info(f'Running command: "{command}"')
            redirect = None if self.verbose else PIPE
            process = Popen(command,
                            stdout=redirect,
                            stderr=redirect,
                            shell=True)
            process.communicate()

        # Parse the results
        results_path = f'{self.pyreval_root}/results.csv'
        if not os.path.exists(results_path):
            raise Exception(
                f'PyrEval results file does not exist: "{results_path}"')

        # First line is the name of the pyramid
        # Second line is the header
        lines = open(results_path, 'r').read().splitlines()
        metrics_dicts = {}
        for line in lines[2:]:
            index, raw, quality, coverage, comprehensive = line.split(',')
            metrics_dicts[int(index)] = MetricsDict({
                'pyreval': {
                    'raw': int(raw),
                    'quality': float(quality),
                    'coverage': float(coverage),
                    'comprehensive': float(comprehensive),
                }
            })

        metrics_list = []
        for index in array_index_to_tgt_index:
            metrics_list.append(metrics_dicts[index])

        logger.info('Finished building pyramids and scoring peers')
        return metrics_list
Example #28
    def _aggregate_summary_scores(self,
                                  summaries_list: List[List[str]],
                                  references_list: List[List[str]],
                                  tuple_to_indices: Dict[Tuple[int, int], List[int]],
                                  individual_scores: List[float]) -> List[List[MetricsDict]]:
        metrics_lists = []
        for i, (summaries, references) in enumerate(zip(summaries_list, references_list)):
            metrics_lists.append([])
            for j, summary in enumerate(summaries):
                scores = [individual_scores[index] for index in tuple_to_indices[(i, j)]]
                metrics_lists[-1].append(MetricsDict({
                    'METEOR': sum(scores) / len(scores)
                }))
        return metrics_lists
Example #29
    def test_get_set_item(self):
        metrics = MetricsDict()

        metrics['a'] = 4
        assert metrics['a'] == 4

        metrics['b']['c'] = [1, 2]
        assert metrics['b']['c'] == [1, 2]
        assert isinstance(metrics['b'], MetricsDict)

        metrics['d'] = {'e': 4, 'f': {'g': 4}}
        assert metrics['d'] == {'e': 4, 'f': {'g': 4}}
        assert isinstance(metrics['d'], MetricsDict)
        assert isinstance(metrics['d']['f'], MetricsDict)
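Example #30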
    def calculate_non_scu_metric(self,
                                 summary_index_to_scus: List[Set[int]],
                                 reference_index_to_scus: List[Set[int]],
                                 summary_weights: List[float],
                                 reference_weights: List[float],
                                 matches: List[Tuple[int, int, float]]) -> MetricsDict:
        total_weight = 0
        for i, j, weight in matches:
            if len(summary_index_to_scus[i] & reference_index_to_scus[j]) == 0:
                total_weight += weight
        return MetricsDict({
            'weight': total_weight,
            'summary_weight': sum(summary_weights),
            'reference_weight': sum(reference_weights)
        })
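Several of the examples above accumulate 'weight', 'summary_weight', and 'reference_weight' counts and then pass them to self._add_pr, which is not shown on this page. A hypothetical equivalent, assuming it follows the precision/recall/F1 formulas used in Example #6, might look like this:

from sacrerouge.data import MetricsDict  # assumed import path


def add_pr(counts: MetricsDict) -> None:
    # Hypothetical helper: derive precision, recall, and F1 from the accumulated weights in place.
    precision = counts['weight'] / counts['summary_weight'] * 100 if counts['summary_weight'] > 0 else 0
    recall = counts['weight'] / counts['reference_weight'] * 100 if counts['reference_weight'] > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    counts['precision'] = precision
    counts['recall'] = recall
    counts['f1'] = f1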