示例#1
0
    def _compute(
        self,
        predictions,
        references,
        normalized: bool = False,
        no_punct: bool = False,
        asian_support: bool = False,
        case_sensitive: bool = False,
    ):
        """
        Score ``predictions`` against ``references`` with ``TER`` and return the result as a dict.

        ``references`` holds one list of references per prediction. It is transposed here into
        one list per reference position before being handed to ``TER.corpus_score``.

        :param predictions: Hypotheses, passed unchanged to ``corpus_score``.
        :param references: One list of references per prediction; all lists must have equal length.
        :param normalized: Forwarded positionally to the ``TER`` constructor.
        :param no_punct: Forwarded positionally to the ``TER`` constructor.
        :param asian_support: Forwarded positionally to the ``TER`` constructor.
        :param case_sensitive: Forwarded positionally to the ``TER`` constructor.
        :return: Dict with the keys ``score``, ``num_edits`` and ``ref_length``.
        :raises ValueError: If the reference lists do not all have the same length.
        """
        # The first entry fixes the expected number of references per prediction.
        expected_count = len(references[0])
        for refs in references:
            if len(refs) != expected_count:
                raise ValueError(
                    "Sacrebleu requires the same number of references for each prediction"
                )

        # Transpose: stream k contains the k-th reference of every prediction.
        reference_streams = []
        for position in range(expected_count):
            reference_streams.append([refs[position] for refs in references])

        scorer = TER(normalized, no_punct, asian_support, case_sensitive)
        result = scorer.corpus_score(predictions, reference_streams)

        return {
            "score": result.score,
            "num_edits": result.num_edits,
            "ref_length": result.ref_length,
        }
    def create_scorer(self) -> None:
        """
        Build the scorer selected by ``self.utility_function_name``.

        Every branch sets ``self.scorer`` and ``self.cached_scorer``; all branches except the
        final fallback also store the scorer's arguments in ``self.args``.

        :return:
        """
        name = self.utility_function_name

        # Exact match is tested first: the generic "chrf" substring test below would
        # presumably also match this name - confirm against the constant's value.
        if name == UTILITY_SENTENCE_CHRF_2_PRECISION:
            self.args = argparse.Namespace(chrf_order=6,
                                           chrf_beta=2,
                                           chrf_whitespace=False,
                                           short=False)
            self.scorer = cached_metrics.CachedPrecisionCHRF(self.args)
            self.cached_scorer = True
            return

        if "chrf" in name:
            # beta is 1 for "...balanced" names, otherwise the integer after the last "-".
            beta = 1 if name.endswith("balanced") else int(name.split("-")[-1])

            self.args = argparse.Namespace(chrf_order=6,
                                           chrf_beta=beta,
                                           chrf_whitespace=False,
                                           short=False)
            self.scorer = cached_metrics.CachedCHRF(self.args)
            self.cached_scorer = True
            return

        if "bleu" in name:
            # The name's suffix selects the smoothing method and its value.
            if name.endswith("floor"):
                smoothing = ("floor", 0.01)
            elif name.endswith("exp"):
                smoothing = ("exp", None)
            elif name.endswith("add-k"):
                smoothing = ("add-k", 1)
            else:
                smoothing = ("none", None)
            method, value = smoothing

            self.args = argparse.Namespace(smooth_method=method,
                                           smooth_value=value,
                                           force=False,
                                           short=False,
                                           lc=False,
                                           tokenize=DEFAULT_TOKENIZER)
            self.scorer = cached_metrics.CachedBLEU(self.args)
            self.cached_scorer = True
            return

        if name == "sentence-ter":
            self.args = argparse.Namespace(normalized=False,
                                           no_punct=False,
                                           asian_support=False,
                                           case_sensitive=False)
            self.scorer = TER(self.args)
            self.cached_scorer = False
            return

        # Fallback for every other name: a METEOR scorer; "...balanced" names use alpha 0.5.
        alpha = 0.5 if name.endswith("balanced") else 0.85
        self.scorer = eval_meteor.MeteorScorer(meteor_alpha=alpha)
        self.cached_scorer = False
示例#3
0
    pages = "223--231",
}
@inproceedings{post-2018-call,
    title = "A Call for Clarity in Reporting {BLEU} Scores",
    author = "Post, Matt",
    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
    month = oct,
    year = "2018",
    address = "Belgium, Brussels",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W18-6319",
    pages = "186--191",
}
"""

# Human-readable description of the TER metric and of how the expected input layout differs
# from sacrebleu's. NOTE(review): presumably surfaced as the metric's description metadata -
# the consumer of this constant is not visible here; confirm.
_DESCRIPTION = """\
TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
here: https://github.com/jhclark/tercom.

The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534

See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
"""

_KWARGS_DESCRIPTION = """
Produces TER scores alongside the number of edits and reference length.
class MBR:
    """
    Pick, from a pool of sampled translations, the sample with the highest utility.

    The utility function is selected by name in :meth:`create_scorer`. A sample's utility is
    either its mean score against every sample in the pool, or its score against one actual
    reference (oracle mode); see :meth:`get_maximum_utility_sample`.
    """

    def __init__(self,
                 utility_function_name: str,
                 symmetric: bool = False) -> None:
        """
        Store the configuration and build the scorer immediately.

        :param utility_function_name: Name that selects the scorer built by ``create_scorer``.
        :param symmetric: If True, ``score`` combines the utility computed in both directions
                          (hyp vs. ref and ref vs. hyp) with a harmonic mean.
        """
        # Placeholders; all three are filled in (where applicable) by create_scorer().
        self.cached_scorer = None
        self.scorer = None
        self.args = None

        self.utility_function_name = utility_function_name
        self.symmetric = symmetric

        self.create_scorer()

    def create_scorer(self) -> None:
        """
        Build the scorer selected by ``self.utility_function_name``.

        Every branch sets ``self.scorer`` and ``self.cached_scorer``; all branches except the
        final fallback also store the scorer's arguments in ``self.args``.

        :return:
        :raises ValueError: From ``int()`` when a "chrf" name neither ends in "balanced" nor has
                            an integer after its last "-".
        """
        if self.utility_function_name == UTILITY_SENTENCE_CHRF_2_PRECISION:

            self.args = argparse.Namespace(chrf_order=6,
                                           chrf_beta=2,
                                           chrf_whitespace=False,
                                           short=False)

            self.scorer = cached_metrics.CachedPrecisionCHRF(self.args)
            self.cached_scorer = True

        elif "chrf" in self.utility_function_name:
            if self.utility_function_name.endswith("balanced"):
                chrf_beta = 1
            else:
                # The beta value is encoded as the integer after the last "-" of the name.
                chrf_beta = int(self.utility_function_name.split("-")[-1])

            self.args = argparse.Namespace(chrf_order=6,
                                           chrf_beta=chrf_beta,
                                           chrf_whitespace=False,
                                           short=False)

            self.scorer = cached_metrics.CachedCHRF(self.args)
            self.cached_scorer = True

        elif "bleu" in self.utility_function_name:
            # The name's suffix selects the smoothing method and its value.
            if self.utility_function_name.endswith("floor"):
                smooth_method, smooth_value = "floor", 0.01
            elif self.utility_function_name.endswith("exp"):
                smooth_method, smooth_value = "exp", None
            elif self.utility_function_name.endswith("add-k"):
                smooth_method, smooth_value = "add-k", 1
            else:
                smooth_method, smooth_value = "none", None

            self.args = argparse.Namespace(smooth_method=smooth_method,
                                           smooth_value=smooth_value,
                                           force=False,
                                           short=False,
                                           lc=False,
                                           tokenize=DEFAULT_TOKENIZER)

            self.scorer = cached_metrics.CachedBLEU(self.args)
            self.cached_scorer = True

        elif self.utility_function_name == "sentence-ter":

            self.args = argparse.Namespace(normalized=False,
                                           no_punct=False,
                                           asian_support=False,
                                           case_sensitive=False)
            self.scorer = TER(self.args)
            self.cached_scorer = False

        else:
            # Fallback for every other name: METEOR; "...balanced" names use alpha 0.5.
            if self.utility_function_name.endswith("balanced"):
                meteor_alpha = 0.5
            else:
                meteor_alpha = 0.85

            self.scorer = eval_meteor.MeteorScorer(meteor_alpha=meteor_alpha)
            self.cached_scorer = False

    def score_single(self, hyp: str, ref: str) -> float:
        """
        Computes a single score between two strings.

        :param hyp: Hypothesis string.
        :param ref: Reference string; passed to the scorer as a one-element list.
        :return: The ``score`` attribute of the scorer's ``sentence_score`` result.
        """
        return self.scorer.sentence_score(hyp, [ref]).score

    def score(self, hyp: str, ref: str) -> Union[float, np.ndarray]:
        """
        Score ``hyp`` against ``ref``, symmetrically if ``self.symmetric`` is set.

        :param hyp: Hypothesis string.
        :param ref: Reference string.
        :return: 0.0 if either input is empty or whitespace-only, otherwise the (symmetric)
                 score.
        """

        # no actual computation if one input is empty or whitespace-only

        if hyp.strip() == "" or ref.strip() == "":
            return 0.0

        if self.symmetric:
            return self.score_symmetric(hyp, ref)

        return self.score_single(hyp, ref)

    def score_symmetric(self, hyp: str, ref: str) -> Union[float, np.ndarray]:
        """
        Harmonic mean of the score in both directions (hyp vs. ref and ref vs. hyp).

        :param hyp: Hypothesis string.
        :param ref: Reference string.
        :return: 0.0 if either directional score is exactly zero, otherwise the harmonic mean
                 of the two directional scores.
        """
        forward = self.score_single(hyp, ref)
        backward = self.score_single(ref, hyp)

        # A harmonic mean is 0 as soon as one of its inputs is 0. Short-circuit rather than
        # delegating: older SciPy releases raise ValueError for zero inputs, and later ones
        # only reach 0 through an internal division by zero.
        if forward == 0 or backward == 0:
            return 0.0

        # harmonic mean of forward and backward values

        return stats.hmean([forward, backward])

    def get_maximum_utility_sample(
            self,
            samples: List[str],
            reference: Optional[str] = None) -> Tuple[str, float, List[float]]:
        """
        Return the sample with the highest utility.

        NOTE(review): selection always uses the maximum. For "sentence-ter" the score
        presumably grows with the number of edits, so the maximum may not be the best
        sample - confirm this is intended.

        :param samples: Sampled target translations for one single source input sentence
        :param reference: Actual reference translation to compare to samples (oracle mode).

        :return: The best-performing sample, its utility score and all utility scores.
        :raises ValueError: If ``samples`` is empty.
        """
        if len(samples) == 0:
            # np.argmax would raise ValueError as well, but with an opaque message.
            raise ValueError("Cannot select a maximum utility sample from an empty list of samples")

        if reference is None:
            # without reference, compute mean utility among pool of samples
            # (the pool includes the sample itself)
            average_utilities = [
                np.mean([self.score(sample, pseudo_reference) for pseudo_reference in samples])
                for sample in samples
            ]
        else:
            # with reference, operate in oracle mode and compare to the actual reference
            average_utilities = [self.score(sample, reference) for sample in samples]

        maximum_utility_index = int(np.argmax(average_utilities))

        return samples[maximum_utility_index], np.max(
            average_utilities), average_utilities

    def cache_info(self) -> None:
        """
        Log the scorer's cache statistics at debug level, if the scorer is a cached one.

        :return:
        """
        if self.cached_scorer:
            logging.debug("Scorer cache:")
            logging.debug(self.scorer.cache_info())

    def cache_clear(self) -> None:
        """
        Clear the scorer's cache, if the scorer is a cached one.

        :return:
        """
        if self.cached_scorer:
            self.scorer.cache_clear()