Example #1
def text_eval(encoder,
              features_iter,
              model_dir,
              global_step,
              eval_tag,
              enable_logging,
              inputs_pattern="^inputs[0-9]*$",
              targets_key="targets",
              predictions_key="outputs",
              additional_keys=(),
              num_reserved=None):
  """Evaluates a set of text targets/predictions."""
  decode_fn = lambda x: ids2str(encoder, x, num_reserved)
  scorers_dict = {}
  scorers_dict[_ROUGE_METRIC] = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
  scorers_dict[_BLEU_METRIC] = bleu_scorer.BleuScorer()
  scorers_dict[_REPETITION_METRIC] = repetition_scorer.RepetitionScorer(
    ["regs1", "regs2", "regs3", "regsTCR"])
  scorers_dict[_LENGTH_METRIC] = length_scorer.LengthScorer(["word", "char"])
  aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

  with LogWriter(additional_keys, model_dir, global_step, eval_tag,
                 enable_logging) as log_writer:
    for i, features in enumerate(features_iter):
      inputs_list = []
      for k in sorted(features):
        if re.match(inputs_pattern, k):
          single_inputs = decode_matrix(decode_fn, features[k])
          if isinstance(single_inputs, list):
            inputs_list.extend(single_inputs)
          else:
            inputs_list.append(single_inputs)

      inputs = "\n".join(inputs_list)
      targets = decode_fn(features[targets_key])
      preds = decode_fn(features[predictions_key])
      text_dict = {
        "inputs": inputs_list,
        "targets": targets,
        "predictions": preds
      }

      for key in additional_keys:
        if key == "selected_ids":
          text_dict[key] = decode_selected_indices(decode_fn, features)
        else:
          text_dict[key] = decode_matrix(decode_fn, features[key])

      log_writer.write(text_dict, i)

      for key, scorer in scorers_dict.items():
        scores_i = scorer.score(targets, preds)
        aggregators_dict[key].add_scores(scores_i)

  aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
  length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
  _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict,
                    length_histograms)
  _write_aggregate_summaries(model_dir, global_step, eval_tag, aggregates_dict)
Example #2
    def test_epoch_end(self, outputs):
        """
        Called at the end of a testing epoch: `PyTorch Lightning Documentation <https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.html#pytorch_lightning.core.LightningModule.test_epoch_end>`__
        Finds the mean of all the metrics logged by :meth:`~abstractive.AbstractiveSummarizer.test_step`.
        """
        avg_generation_time = torch.stack(
            [x["generation_time"] for x in outputs]
        ).mean()

        rouge_scores_log = {}

        if self.hparams.test_use_pyrouge:
            test_rouge("tmp", "save_pred.txt", "save_gold.txt")
        else:
            aggregator = scoring.BootstrapAggregator()
            rouge_scores_list = [
                rouge_score_set
                for batch_list in outputs
                for rouge_score_set in batch_list["rouge_scores"]
            ]
            for score in rouge_scores_list:
                aggregator.add_scores(score)
            # The aggregator returns a dictionary with keys corresponding to the rouge metric
            # and values that are `AggregateScore` objects. Each `AggregateScore` object is a
            # named tuple with a low, mid, and high value. Each value is a `Score` object, which
            # is also a named tuple, that contains the precision, recall, and fmeasure values.
            # For more info see the source code: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
            rouge_result = aggregator.aggregate()

            for metric, value in rouge_result.items():
                rouge_scores_log[metric + "-precision"] = value.mid.precision
                rouge_scores_log[metric + "-recall"] = value.mid.recall
                rouge_scores_log[metric + "-fmeasure"] = value.mid.fmeasure

        # Write the saved predictions and targets to file
        if self.hparams.save_percentage:
            predictions = [
                x["prediction"] for x in outputs if x["prediction"] is not None
            ]
            targets = [x["target"] for x in outputs if x["target"] is not None]
            output_test_predictions_file = os.path.join(
                self.hparams.default_root_dir, "test_predictions.txt"
            )
            output_test_targets_file = os.path.join(
                self.hparams.default_root_dir, "test_targets.txt"
            )
            with open(output_test_predictions_file, "w+") as p_writer, open(
                output_test_targets_file, "w+"
            ) as t_writer:
                for prediction, target in zip(predictions, targets):
                    p_writer.writelines(s + "\n" for s in prediction)
                    t_writer.writelines(s + "\n" for s in target)

        # Generate logs
        tqdm_dict = {"generation_time": avg_generation_time}
        log = {**rouge_scores_log, **tqdm_dict}
        result = {"progress_bar": tqdm_dict, "log": log}
        return result
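The comment in the example above describes the shape of the aggregated result. A minimal standalone sketch of that structure (toy strings, not taken from the example) could look like this:

from rouge_score import rouge_scorer, scoring

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=True)
aggregator = scoring.BootstrapAggregator()

# Toy target/prediction pairs, purely illustrative.
pairs = [
    ("the cat sat on the mat", "the cat sat down on the mat"),
    ("a quick brown fox jumps", "a fast brown fox jumped"),
]
for target, prediction in pairs:
    aggregator.add_scores(scorer.score(target, prediction))

result = aggregator.aggregate()
for metric, aggregate in result.items():
    # aggregate.low / .mid / .high are Score(precision, recall, fmeasure) namedtuples.
    print(metric, round(aggregate.mid.fmeasure, 4),
          (round(aggregate.low.fmeasure, 4), round(aggregate.high.fmeasure, 4)))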
Example #3
 def _compute(self,
              preds,
              refs,
              rouge_types=None,
              use_agregator=True,
              use_stemmer=False):
     if rouge_types is None:
         rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
     scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                       use_stemmer=use_stemmer)
     if use_agregator:
         aggregator = scoring.BootstrapAggregator()
     else:
         scores = []
     for r, p in zip(refs, preds):
         score = scorer.score(r, p)
         if use_agregator:
             aggregator.add_scores(score)
         else:
             scores.append(score)
     if use_agregator:
         y = aggregator.aggregate()
     else:
         y = {}
         for k in scores[0]:
             y[k] = list(score[k] for score in scores)
     return y
Example #4
    def _compute(self,
                 predictions,
                 references,
                 rouge_types=None,
                 use_agregator=True,
                 use_stemmer=False):
        if rouge_types is None:
            rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                          use_stemmer=use_stemmer)
        if use_agregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_agregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_agregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = list(score[key] for score in scores)

        return result
Example #5
 def test_aggregate(self):
     np.random.seed(0)
     types = ["regs1", "regs2", "regs3", "regsLCR", "regsTCR"]
     rs = repetition_scorer.RepetitionScorer(repetition_types=types)
     aggregator = scoring.BootstrapAggregator()
     for text in ["a a a b c b", "a b a b c b", "a b a b c a b c"]:
         aggregator.add_scores(rs.score("", text))
     aggregates = aggregator.aggregate()
     self.assertAlmostEqual(aggregates["regs1"].low.prediction_ratio, 5 / 6)
     self.assertAlmostEqual(aggregates["regs1"].high.prediction_ratio, 1)
     self.assertAlmostEqual(aggregates["regs1"].mid.prediction_ratio,
                            (5 / 6 + 5 / 6 + 1) / 3)
     self.assertAlmostEqual(aggregates["regs2"].low.prediction_ratio, 2 / 5)
     self.assertAlmostEqual(aggregates["regs2"].high.prediction_ratio,
                            5 / 7)
     self.assertAlmostEqual(aggregates["regs2"].mid.prediction_ratio,
                            (2 / 5 + 2 / 5 + 5 / 7) / 3)
     self.assertAlmostEqual(aggregates["regs3"].low.prediction_ratio, 0)
     self.assertAlmostEqual(aggregates["regs3"].high.prediction_ratio,
                            2 / 6)
     self.assertAlmostEqual(aggregates["regs3"].mid.prediction_ratio,
                            (0 + 0 + 2 / 6) / 3)
     self.assertAlmostEqual(aggregates["regsLCR"].low.prediction_ratio,
                            3 / 6)
     self.assertAlmostEqual(aggregates["regsLCR"].high.prediction_ratio,
                            6 / 8)
     self.assertAlmostEqual(aggregates["regsLCR"].mid.prediction_ratio,
                            (3 / 6 + 4 / 6 + 6 / 8) / 3)
     self.assertAlmostEqual(aggregates["regsTCR"].low.prediction_ratio,
                            3 / 6)
     self.assertAlmostEqual(aggregates["regsTCR"].high.prediction_ratio, 1)
     self.assertAlmostEqual(aggregates["regsTCR"].mid.prediction_ratio,
                            (3 / 6 + 4 / 6 + 1) / 3)
Example #6
    def rouge(self, refs, preds):
        """
        Returns `t5` style ROUGE scores. See the related implementation:
        https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

        :param refs:
            A `list` of reference `strs`.
        :param preds:
            A `list` of predicted `strs`.
        """
        rouge_types = ["rouge1", "rouge2", "rougeLsum"]
        scorer = rouge_scorer.RougeScorer(rouge_types)

        # Add newlines between sentences to correctly compute `rougeLsum`.

        def _prepare_summary(summary):
            summary = summary.replace(" . ", ".\n")
            return summary

        # Accumulate confidence intervals.
        aggregator = scoring.BootstrapAggregator()
        for ref, pred in zip(refs, preds):
            ref = _prepare_summary(ref)
            pred = _prepare_summary(pred)
            aggregator.add_scores(scorer.score(ref, pred))
        result = aggregator.aggregate()
        return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
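As the comment in the method above notes, `rougeLsum` is computed over newline-separated sentences, which is why `_prepare_summary` inserts newlines. A small standalone illustration of that sensitivity (toy strings, unrelated to the class above):

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeLsum"])
target = "the cat sat . the dog barked ."
prediction = "the dog barked . the cat sat ."

# Without newlines the summary is scored as a single sentence (plain LCS).
flat = scorer.score(target, prediction)["rougeLsum"].fmeasure
# With newline-separated sentences, rougeLsum unions per-sentence LCS matches.
split = scorer.score(target.replace(" . ", ".\n"),
                     prediction.replace(" . ", ".\n"))["rougeLsum"].fmeasure
print(flat, split)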
Example #7
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:

    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for pred, tgt in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        scores = scorer.score(pred, tgt)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

    else:
        return aggregator._scores  # here we return defaultdict(list)
Example #8
    def _compute(self,
                 predictions,
                 references,
                 use_agregator=True,
                 use_stemmer=False):

        rouge_types = ["rougeL"]
        # Joining collapses the inputs into single strings, so the zip below
        # iterates over character pairs rather than prediction/reference pairs.
        predictions = " ".join([str(p) for p in predictions])
        references = " ".join([str(r) for r in references])

        scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                          use_stemmer=use_stemmer)
        if use_agregator:
            aggregator = scoring.BootstrapAggregator()
        else:
            scores = []

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            if use_agregator:
                aggregator.add_scores(score)
            else:
                scores.append(score)

        if use_agregator:
            result = aggregator.aggregate()
        else:
            result = {}
            for key in scores[0]:
                result[key] = list(score[key] for score in scores)

        return result
Example #9
def calculate_meteor(output_lns, reference_lns):
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = meteor_score.single_meteor_score(reference_ln, output_ln)
        aggregator.add_scores({'meteor': scores})

    result = aggregator.aggregate()
    return {k: round(v.mid * 100, 4) for k, v in result.items()}
Example #10
def _rouge_calculation(hypotheses,
                       references1,
                       references2=[],
                       metrics=['rougeLsum']):
    """Internal function for rouge scoring.

  If two references are provided, the hypotheses are scored against both and
  the higher aggregate F-measure is reported for each metric.

  Args:
    hypotheses: list of predicted long answers
    references1: list of references to score hypotheses against
    references2: optional list of references to score hypotheses against
    metrics: evaluation metric

  Returns:
    dictionary representation of rouge scores
  """

    if references2 == []:
        references2 = references1

    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    aggregator1 = scoring.BootstrapAggregator()
    aggregator2 = scoring.BootstrapAggregator()

    for i in range(len(hypotheses)):
        scores1 = scorer.score(references1[i], hypotheses[i])
        scores2 = scorer.score(references2[i], hypotheses[i])
        aggregator1.add_scores(scores1)
        aggregator2.add_scores(scores2)

    scores = {m: [] for m in metrics}

    for m in metrics:
        fmeasure1 = aggregator1.aggregate()[m].mid.fmeasure
        fmeasure2 = aggregator2.aggregate()[m].mid.fmeasure
        scores[m].append(max(fmeasure1, fmeasure2))

    for m in scores:
        scores[m] = 100 * sum(scores[m]) / len(scores[m])

    return scores
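A hypothetical call against the function above, with toy strings for illustration only:

hyps = ["the cat sat on the mat", "the dog barked at the mailman"]
refs_a = ["a cat was sitting on the mat", "a dog barked at the mail carrier"]
refs_b = ["the cat is on the mat", "the dog was barking"]

# Returns {metric: F-measure * 100}; for each metric the better of the two
# reference sets' aggregated mid F-measures is kept.
scores = _rouge_calculation(hyps, refs_a, refs_b, metrics=["rouge1", "rougeLsum"])
print(scores)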
Example #11
def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}
Example #12
def calculate_rouge(predicted_txts,
                    reference_txts,
                    rouge_keys=["rouge1", "rouge2", "rougeL"],
                    use_stemmer=True):
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for ref_text, pred_txt in zip(reference_txts, predicted_txts):
        scores = scorer.score(ref_text, pred_txt)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return result
Example #13
def calculate_rouge(output_lns, reference_lns, score_path):
    score_file = Path(score_path).open("w")
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"],
                                      use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    score_file.write(
        "ROUGE_1: \n{} \n\n ROUGE_2: \n{} \n\n ROUGE_L: \n{} \n\n".format(
            result["rouge1"], result["rouge2"], result["rougeL"]))
Example #14
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:
    """Calculate rouge using rouge_scorer package.

    Args:
        pred_lns: list of summaries generated by model
        tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
        use_stemmer: Bool indicating whether Porter stemmer should be used to
            strip word suffixes to improve matching.
        rouge_keys: which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
        return_precision_and_recall: (False) whether to also return precision and recall.
        bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True; if False,
            this function returns a collections.defaultdict mapping each metric to the list of per-observation scores.
        newline_sep: (default=True) whether to add a newline between sentences. This is essential for computing
            rougeLsum on multi-sentence summaries (CNN/DM dataset).

    Returns:
         Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys

    """
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for pred, tgt in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        # NOTE: scorer.score expects (target, prediction); pred and tgt are passed here in the opposite order
        scores = scorer.score(pred, tgt)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {
                k: round(v.mid.fmeasure * 100, 4)
                for k, v in result.items()
            }

    else:
        return aggregator._scores  # here we return defaultdict(list)
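A hypothetical call against the function above, assuming the module-level helpers it references (ROUGE_KEYS, add_newline_to_end_of_each_sentence, extract_rouge_mid_statistics) are available alongside it; the data is illustrative only:

preds = ["the cat sat on the mat.", "the dog barked loudly."]
targets = ["a cat was sitting on the mat.", "the dog barked."]

# Default mode: bootstrap-aggregated mid F1 per rouge key, scaled to 0-100.
print(calculate_rouge(preds, targets))

# Raw mode: a defaultdict(list) of per-example Score tuples, keyed by rouge type.
raw_scores = calculate_rouge(preds, targets, bootstrap_aggregation=False)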
Example #15
def evaluate_rouge_avg(hypotheses,
                       references,
                       type='f',
                       use_progress_bar=False):
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = {}
    scorer["rouge"] = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorer}

    if len(hypotheses) < len(references):
        print(
            "Warning number of papers in submission file is smaller than ground truth file",
            file=sys.stderr)
    # import pdb;pdb.set_trace()
    hypotheses = list(hypotheses)
    references = list(references)

    if not use_progress_bar:
        for j, hyp in enumerate(hypotheses):
            submission_summary = hyp.replace('<q>', ' ')
            for key, scorr in scorer.items():
                scores_i = scorr.score(references[j].strip(),
                                       submission_summary)
                aggregators_dict[key].add_scores(scores_i)

        aggregates_dict = {
            k: v.aggregate()
            for k, v in aggregators_dict.items()
        }
        out_avg_scores = {}
        for k, v in sorted(aggregates_dict["rouge"].items()):
            out_avg_scores[k] = v.mid.fmeasure
    else:
        for j, hyp in tqdm(enumerate(hypotheses), total=len(hypotheses)):
            submission_summary = hyp.replace('<q>', ' ')
            for key, scorr in scorer.items():
                scores_i = scorr.score(references[j].strip(),
                                       submission_summary)
                aggregators_dict[key].add_scores(scores_i)

        aggregates_dict = {
            k: v.aggregate()
            for k, v in aggregators_dict.items()
        }
        out_avg_scores = {}
        for k, v in sorted(aggregates_dict["rouge"].items()):
            out_avg_scores[k] = v.mid.fmeasure
    return out_avg_scores['rouge1'], out_avg_scores['rouge2'], out_avg_scores[
        'rougeL']
Example #16
def calculate_rouge(output_lns: List[str], reference_lns: List[str]) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=True)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

        # with open(out_dir + '/cands.txt', 'a+') as c, open(out_dir + '/refs.txt', 'a+') as r, \
        #     open(out_dir + '/scores.txt', 'a+') as s:
        #     c.write(output_ln + '\n')
        #     r.write(reference_ln + '\n')
        #     s.write(str(scores['rouge1'].fmeasure) + '\n')

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}
Example #17
def text_eval(preds_file,
              model_dir,
              global_step: int = 0,
              eval_tag: str = "",
              enable_logging: bool = True):
    """Evaluates a set of text targets/predictions."""
    scorers_dict = {
        _ROUGE_METRIC:
        rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL", "rougeLsum"],
                                 use_stemmer=True),
        _BLEU_METRIC:
        bleu_scorer.BleuScorer(),
        _REPETITION_METRIC:
        repetition_scorer.RepetitionScorer(
            ["regs1", "regs2", "regs3", "regsTCR"]),
        _LENGTH_METRIC:
        length_scorer.LengthScorer(["word", "char"])
    }
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

    with LogWriter((), model_dir, 0, "", enable_logging) as log_writer:
        with open(preds_file) as csv_file:
            reader = csv.DictReader(csv_file)
            for i, row in enumerate(reader):
                text_dict = {
                    "inputs": row['prompt'],
                    "targets": row['targets'],
                    "predictions": row['predictions']
                }

                log_writer.write(text_dict, i)

                for key, scorer in scorers_dict.items():
                    scores_i = scorer.score(row['targets'], row['predictions'])
                    aggregators_dict[key].add_scores(scores_i)

    aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
    length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
    _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict,
                      length_histograms)
    _write_aggregate_summaries(model_dir, global_step, eval_tag,
                               aggregates_dict)
Example #18
def calculate_rouge(output_lns: List[str],
                    reference_lns: List[str],
                    cleaned_up_tokenization_spaces=False,
                    use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    split_txt = ". " if cleaned_up_tokenization_spaces else " . "

    for reference_ln, output_ln in zip(reference_lns, output_lns):

        # rouge_score expects \n separated sentences within a summary
        reference_ln_formatted = " . \n".join(reference_ln.split(". "))
        output_ln_formatted = " . \n".join(output_ln.split(split_txt))

        scores = scorer.score(reference_ln_formatted, output_ln_formatted)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
Example #19
    def compute_rouge(predictions,
                      references,
                      rouge_types=None,
                      use_stemmer=True):
        if rouge_types is None:
            rouge_types = ["rouge1", "rouge2", "rougeLsum"]

        scorer = rouge_scorer.RougeScorer(
            rouge_types=rouge_types, use_stemmer=use_stemmer)
        aggregator = scoring.BootstrapAggregator()

        for ref, pred in zip(references, predictions):
            score = scorer.score(ref, pred)
            aggregator.add_scores(score)
        result = aggregator.aggregate()
        result = {
            key: round(value.mid.fmeasure * 100, 4)
            for key, value in result.items()
        }
        return result
Example #20
def rouge_scores(preds: List[List[torch.Tensor]],
                 targets: List[List[torch.Tensor]],
                 tokenizer,
                 use_stemmer=False,
                 use_aggregator=False):
    # largely copied from https://github.com/huggingface/nlp/blob/master/metrics/rouge/rouge.py#L84
    rouge_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types,
                                      use_stemmer=use_stemmer)
    refs, hyps = [], []
    for p, t in zip(preds, targets):
        assert len(p) == len(t)
        refs.extend(p)
        hyps.extend(t)

    if use_aggregator:
        aggregator = scoring.BootstrapAggregator()
        scores = None
    else:
        aggregator = None
        scores = []

    for ref, pred in zip(refs, hyps):
        if isinstance(ref, torch.Tensor):
            ref = tokenizer.decode(ref).lower()
        if isinstance(pred, torch.Tensor):
            pred = tokenizer.decode(pred).lower()
        score = scorer.score(ref, pred)
        if use_aggregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_aggregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
Example #21
def edit_rouge(targets, predictions):
    """Measures a variety of different ROUGE scores."""
    # We do not measure ROUGE-L for updates since LCS is likely entirely contained
    # in source.
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"])
    aggregator = scoring.BootstrapAggregator()

    for prediction, target in zip(predictions, targets):

        all_scores = {}

        target_additions = rendering_utils.extract_additions(
            source=target["normalized_inputs"],
            target=target["normalized_targets"],
        )
        target_additions = " ".join(target_additions)
        prediction_additions = rendering_utils.extract_additions(
            source=target["normalized_inputs"],
            target=prediction["normalized_targets"],
        )
        prediction_additions = " ".join(prediction_additions)

        addition_scores = scorer.score(
            target=target_additions,
            prediction=prediction_additions,
        )

        if target_additions.strip() or prediction_additions.strip():
            all_scores.update(
                {f"update_{k}": v
                 for k, v in addition_scores.items()})
        else:
            all_scores.update(
                {f"update_{k}": 100.0
                 for k, _ in addition_scores.items()})

        aggregator.add_scores(all_scores)

    result = aggregator.aggregate()
    return {key: value.mid.fmeasure * 100 for key, value in result.items()}
Example #22
def rouge_dict(gathered_dict,
               target_key='labels',
               prediction_key='predictions',
               score_keys=None):
    """Computes rouge score.
    Args:
      targets: list of strings
      predictions: list of strings
      score_keys: list of strings with the keys to compute.
    Returns:
      dict with score_key: rouge score across all targets and predictions
    """
    targets = gathered_dict[target_key]
    predictions = gathered_dict[prediction_key]

    if score_keys is None:
        score_keys = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(score_keys)
    aggregator = scoring.BootstrapAggregator()

    def _prepare_summary(summary):
        # Make sure the summary is not bytes-type
        # Add newlines between sentences so that rougeLsum is computed correctly.
        summary = summary.replace(" . ", " .\n")
        return summary

    for prediction, target in zip(predictions, targets):
        target = _prepare_summary(target)
        prediction = _prepare_summary(prediction)
        aggregator.add_scores(
            scorer.score(target=target, prediction=prediction))
    result = aggregator.aggregate()
    return {
        key: {
            'score': result[key].mid.fmeasure * 100,
            'count': len(targets)
        }
        for key in score_keys
    }
Example #23
def rouge(targets, predictions, score_keys=None):
    """Computes rouge score.

  Args:
    targets: list of strings
    predictions: list of strings
    score_keys: list of strings with the keys to compute.
  Returns:
    dict with score_key: rouge score across all targets and predictions
  """

    if score_keys is None:
        score_keys = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(score_keys)
    aggregator = scoring.BootstrapAggregator()

    def _prepare_summary(summary):
        # Make sure the summary is not bytes-type
        summary = tf.compat.as_text(summary)
        # Add newlines between sentences so that rougeLsum is computed correctly.
        summary = summary.replace(" . ", " .\n")
        return summary

    for prediction, target in zip(predictions, targets):
        target = _prepare_summary(target)
        prediction = _prepare_summary(prediction)
        aggregator.add_scores(
            scorer.score(target=target, prediction=prediction))
    result = aggregator.aggregate()
    for key in score_keys:
        logging.info(
            "%s = %.2f, 95%% confidence [%.2f, %.2f]",
            key,
            result[key].mid.fmeasure * 100,
            result[key].low.fmeasure * 100,
            result[key].high.fmeasure * 100,
        )
    return {key: result[key].mid.fmeasure * 100 for key in score_keys}
Example #24
    def compute(self, predictions, references):
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        rouge = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()
        # TODO expecting pretokenized data, do we want to imitate Rouge-155 tokenizer somehow?
        for refs, pred in zip(references.whitespace_tokenized, predictions.whitespace_tokenized):

            # ROUGE multi-ref jackknifing
            if len(refs) > 1:
                scores = []
                for ref in refs:
                    scores.append(rouge.score(ref, pred))

                # get best score for all leave-one-out sets
                best_scores = []
                for leave in range(len(refs)):
                    cur_scores = [s for s in scores]
                    del cur_scores[leave]
                    best_scores.append({rouge_type: max([s[rouge_type] for s in cur_scores],
                                                        key=lambda s: s.fmeasure)
                                        for rouge_type in rouge_types})

                # average the leave-one-out bests to produce the final score
                score = {rouge_type: scoring.Score(np.mean([b[rouge_type].precision for b in best_scores]),
                                                   np.mean([b[rouge_type].recall for b in best_scores]),
                                                   np.mean([b[rouge_type].fmeasure for b in best_scores]))
                         for rouge_type in rouge_types}
            else:
                score = rouge.score(refs[0], pred)
            aggregator.add_scores(score)

        result = aggregator.aggregate()
        # convert the named tuples to plain nested dicts
        result = {rouge_type: {vtype: dict(val._asdict()) for vtype, val in result[rouge_type]._asdict().items()}
                  for rouge_type in rouge_types}
        return result
Example #25
                sources.append(source)
                target_lists.append(targets)

        # Exact and SARI scores
        exact = score_lib.compute_exact_score(predictions, target_lists)
        sari, keep, addition, deletion = score_lib.compute_sari_scores(
            sources, predictions, target_lists)
        print(f'Exact score:     {100*exact:.3f}')
        print(f'SARI score:      {100*sari:.3f}')
        print(f' KEEP score:     {100*keep:.3f}')
        print(f' ADDITION score: {100*addition:.3f}')
        print(f' DELETION score: {100*deletion:.3f}')

        # ROUGE-L scores
        scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()
        scores = []
        for target, pred in zip(target_lists, predictions):
            aggregator.add_scores(scorer.score(target[0], pred))

        aggregates = aggregator.aggregate()

        print("\nROUGE scores:")
        print(
            "----------------------------------------------------------------")
        print("score_type\t\tlow\t\tmid\t\thigh")
        print(
            "----------------------------------------------------------------")
        for score_type, aggregate in sorted(aggregates.items()):
            print("%s-Recall:   \t%f\t%f\t%f" %
                  (score_type, aggregate.low.recall, aggregate.mid.recall,
Example #26
def calculate_metrics(model):
    _, _, dev = preprocess_QG()
    sentences = pd.DataFrame(dev, columns=['Complex', 'Simple'])
    sentences = sentences.groupby(
        ['Complex']).agg(lambda x: tuple(x)).applymap(list).reset_index()

    questions = []
    with open("/content/tgt-dev.txt", 'r') as f:
        lines = f.readlines()
        for l in lines:
            questions.append(l[:-1])

    contexts = []
    with open("/content/src-dev.txt", 'r') as f:
        lines = f.readlines()
        for l in lines:
            contexts.append(l[:-1])

    filename = "/content/val.source"
    sep = '<sep>'

    with open(filename, 'r') as f:
        lines = f.readlines()
        rouge_scores = []
        sari_scores = []
        results = []
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        aggregator = scoring.BootstrapAggregator()

        for i, line in enumerate(tqdm(lines)):
            if sep in line:
                line = line[:line.find(sep) + len(sep)]
                s = ""
                try:
                    s = run_model(line + sep)
                    s = s[s.find(sep) + len(sep):]
                except:
                    s = run_model(line + sep)
                results.append(s[0])
                ref_questions_idx = [
                    i for i, cont in enumerate(contexts)
                    if cont + sep == line[:-len(sep)]
                ]
                ref_questions = [questions[i] for i in ref_questions_idx]
                fm_ = [0]
                for r in ref_questions:
                    scores = scorer.score(s, r)
                    fm_.append([
                        round(v.fmeasure * 100, 4) for k, v in scores.items()
                    ][0])
                rouge_scores.append(max(fm_))
            else:
                s = run_model(line)[0]
                results.append(s)
                ref = list(sentences.loc[sentences['Complex'].str.contains(
                    line[:-len(sep)])]['Simple'])
                ref = [str(r[0]) for r in ref]
                sari_scores.append(SARIsent(line, s, ref))
            if i % 10 == 0 and i != 0:
                print('Current avg rouge {}, max = {}'.format(
                    np.mean(rouge_scores), np.max(rouge_scores)))
                k = np.argmax(rouge_scores) * 2
                print('Max rouge for context {} Result of the model == {}'.
                      format(lines[k], results[k]))
                print('\nCurrent avg sari {}, max = {}'.format(
                    np.mean(sari_scores), np.max(sari_scores)))
                k = np.argmax(sari_scores) * 2 + 1
                print(
                    'Max sari for context {} Result of the model == {}'.format(
                        lines[k], results[k]))
            if i > 1000:
                break