Example #1
File: test_bleu.py Project: DrDub/nltk
    def test_brevity_penalty(self):
        # Test case from brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.py
        references = [['a'] * 11, ['a'] * 8]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4)

        references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7]
        hypothesis = ['a'] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
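For reference, the 0.8669 in the first assertion is just the brevity-penalty formula BP = exp(1 - r/c), with closest reference length r = 8 and hypothesis length c = 7; a quick check:

import math

# r = 8 is closest to c = 7 (|8 - 7| < |11 - 7|); since c < r, BP = exp(1 - r/c).
print(math.exp(1 - 8 / 7))  # ~0.8669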
Example #2
File: bleu.py Project: netsafe/DeepVesnin
def bleu_advanced(y_true: List[Any], y_predicted: List[Any],
                  weights: Tuple = (1,), smoothing_function=SMOOTH.method1,
                  auto_reweigh=False, penalty=True) -> float:
    """Calculate BLEU score

    Parameters:
        y_true: list of reference tokens
        y_predicted: list of query tokens
        weights: n-gram weights
        smoothing_function: SmoothingFunction
        auto_reweigh: Option to re-normalize the weights uniformly
        penalty: either enable brevity penalty or not

    Return:
        BLEU score
    """

    bleu_measure = sentence_bleu([y_true], y_predicted, weights, smoothing_function, auto_reweigh)

    hyp_len = len(y_predicted)
    hyp_lengths = hyp_len
    ref_lengths = closest_ref_length([y_true], hyp_len)

    bpenalty = brevity_penalty(ref_lengths, hyp_lengths)

    # sentence_bleu already applies the brevity penalty, so return the score
    # as-is when the penalty is requested (or when bpenalty is 0, to avoid
    # division by zero); otherwise divide the penalty back out.
    if penalty is True or bpenalty == 0:
        return bleu_measure

    return bleu_measure / bpenalty
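A minimal usage sketch, assuming the module's own imports (typing, nltk.translate.bleu_score) are in scope and that SMOOTH is the module-level SmoothingFunction() instance implied by the default argument; the token lists are made up:

from nltk.translate.bleu_score import SmoothingFunction

SMOOTH = SmoothingFunction()  # assumption: the module-level alias used by the defaults above

ref = ['the', 'cat', 'sat', 'on', 'the', 'mat']
hyp = ['the', 'cat', 'sat']

print(bleu_advanced(ref, hyp, weights=(1,)))                 # ~0.3679 (with brevity penalty)
print(bleu_advanced(ref, hyp, weights=(1,), penalty=False))  # ~1.0 (penalty divided out)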
Example #3
File: python_bleu.py Project: RayXu14/xlib
def calculate_bleu(list_of_references, hypotheses):
    '''
    Corpus-level BLEU-1/2/3/4 with the brevity penalty divided out, plus the
    combined BLEU (with penalty), all on a 0-100 scale. (by Qiu)
    '''
    hyp_lengths, ref_lengths = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)
    try:
        bp = brevity_penalty(ref_lengths, hyp_lengths)
        # Dividing by bp removes the brevity penalty from the per-order
        # scores; bp == 0 (empty hypotheses) raises ZeroDivisionError,
        # which is handled below.
        bleu1 = corpus_bleu(
            list_of_references, hypotheses, weights=(1, 0, 0, 0)) / bp * 100
        bleu2 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 1, 0, 0)) / bp * 100
        bleu3 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 0, 1, 0)) / bp * 100
        bleu4 = corpus_bleu(
            list_of_references, hypotheses, weights=(0, 0, 0, 1)) / bp * 100
        bleu_all = corpus_bleu(list_of_references,
                               hypotheses,
                               weights=(0.25, 0.25, 0.25, 0.25)) * 100
    except Exception:
        bleu_all, bleu1, bleu2, bleu3, bleu4 = 0.0, 0.0, 0.0, 0.0, 0.0

    return bleu_all, bleu1, bleu2, bleu3, bleu4
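A toy corpus run (sentences made up for illustration); each hypothesis is paired with its own list of references:

list_of_references = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],
                      [['he', 'read', 'the', 'book']]]
hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['he', 'read', 'a', 'book']]

bleu_all, bleu1, bleu2, bleu3, bleu4 = calculate_bleu(list_of_references, hypotheses)
print(bleu_all, bleu1)  # 0-100 scale; bleu1-4 have the brevity penalty divided out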
Example #4
def corpus_bleu(list_of_references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None, auto_reweigh=False,
                emulate_multibleu=False):
    p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    if len(list_of_references) != len(hypotheses):
        print ("The number of hypotheses and their reference(s) should be the same")
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i, _ in enumerate(weights, start=1)]

    p_n_ = [xx.numerator / xx.denominator * 100 for xx in p_n]

    # Returns 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0: if there are no matching
    # unigrams, there won't be any higher-order matches either.
    if p_numerators[1] == 0:
        return (0, *(0, 0, 0, 0), 0, 0, 0)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis,
                             hyp_len=hyp_len, emulate_multibleu=emulate_multibleu)
    s = (w * math.log(p_i) for w, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s)) * 100
    final_bleu = round(s, 4) if emulate_multibleu else s
    return (final_bleu, *p_n_, bp, ref_lengths, hyp_lengths)
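Unlike nltk's corpus_bleu, this variant returns a tuple; a sketch of unpacking it (the names on the left are mine; toy data):

bleu, p1, p2, p3, p4, bp, ref_len, hyp_len = corpus_bleu(
    [[['the', 'cat', 'sat', 'on', 'the', 'mat']]],
    [['the', 'cat', 'sat', 'on', 'the', 'mat']])
print(bleu, (p1, p2, p3, p4), bp)  # score and per-order precisions on a 0-100 scale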
Example #5
def compute_bp(hypotheses, list_of_references):
    """Compute the corpus-level brevity penalty of hypotheses w.r.t. their references."""
    hyp_lengths, ref_lengths = 0, 0
    for references, hypothesis in zip(list_of_references, hypotheses):
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)
    return bp
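For example (toy data), a hypothesis two tokens shorter than its only reference:

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'sat', 'on']]
print(compute_bp(hyps, refs))  # exp(1 - 6/4) ~= 0.6065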
Example #6
def corpus_nist(list_of_references, hypotheses, n=5):
    """
    Calculate a single corpus-level NIST score (aka. system-level NIST) for all
    the hypotheses and their respective references.

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param n: highest n-gram order
    :type n: int
    """
    # Before proceeding to compute NIST, perform sanity checks.
    assert len(list_of_references) == len(
        hypotheses), "The number of hypotheses and their reference(s) should be the same"

    # Key = ngram order, and value = no. of ngram matches.
    p_numerators = Counter()
    # Key = ngram order, and value = no. of ngram in ref.
    p_denominators = Counter()
    # Key = ngram order, and value = no. of ngram in hyp.
    sysoutput_lengths = Counter()
    hyp_lengths, ref_lengths = 0, 0

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, n + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator
            # Adds the no. of ngrams in the hypothesis.
            sysoutput_lengths[i] += len(hypothesis) - (i - 1)

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = nist_length_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
           for i in range(1, n + 1)]

    # Eqn 2 in Doddington (2002):
    # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ]
    # (math.log below is the natural log, which rescales the score by a constant factor.)
    info = [0 if p_n[i].numerator == 0 or p_n[i + 1].numerator == 0  # Handles math domain and zero division errors.
            else math.log(p_n[i].numerator / p_n[i + 1].numerator)
            for i in range(len(p_n) - 1)]
    return sum(info_i / sysoutput_lengths[i] for i, info_i in enumerate(info, start=1)) * bp
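A quick smoke test (made-up data); note that this snippet is an approximation of NIST built on matched n-gram counts, not nltk's current information-weighted nist_score:

refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
print(corpus_nist(refs, hyps, n=2))  # log(6/5)/6 times the length penalty here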
Example #7
def custom_corpus_bleu(list_of_references,
                       hypotheses,
                       weights=(0.25, 0.25, 0.25, 0.25)):
    from collections import Counter
    from nltk.translate.bleu_score import modified_precision, closest_ref_length, brevity_penalty, Fraction
    import math

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(
        hypotheses
    ), "The number of hypotheses and their reference(s) should be the same"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns a zero score if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0: if there are no matching
    # unigrams, there won't be any higher-order matches either.
    if p_numerators[1] == 0:
        return 0, p_n, bp

    # The small epsilon keeps math.log from failing when a precision is 0.
    s = (w * math.log(p_i + 1e-12)
         for w, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp
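A sketch with toy data; the function returns (score, p_n, bp):

score, p_n, bp = custom_corpus_bleu(
    [[['the', 'cat', 'sat', 'on', 'the', 'mat']]],
    [['the', 'cat', 'sat', 'on', 'the', 'mat']])
print(score, bp)  # ~1.0, 1.0 for a verbatim match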
Example #8
def corpus_bleu(list_of_references,
                hypotheses,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=None,
                auto_reweigh=False,
                averaging_mode="geometric",
                no_length_penalty=False):
    """
    Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
    the hypotheses and their respective references.

    Instead of averaging the sentence-level BLEU scores (i.e. macro-average
    precision), the original BLEU metric (Papineni et al. 2002) accounts for
    the micro-average precision (i.e. summing the numerators and denominators
    for each hypothesis-reference(s) pair before the division).

    >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']

    >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    >>> hypotheses = [hyp1, hyp2]
    >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
    0.5920...

    The example below shows that corpus_bleu() differs from averaging
    sentence_bleu() over the hypotheses:

    >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
    >>> score2 = sentence_bleu([ref2a], hyp2)
    >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
    0.6223...

    :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses
    :type list_of_references: list(list(list(str)))
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: smoothing method to apply to the modified precisions
    :type smoothing_function: SmoothingFunction
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :param averaging_mode: "geometric" (standard BLEU) or "arithmetic"
    :type averaging_mode: str
    :param no_length_penalty: if True, skip the brevity penalty
    :type no_length_penalty: bool
    :return: The corpus-level BLEU score.
    :rtype: float
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the same")

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    if no_length_penalty and averaging_mode == 'geometric':
        bp = 1.0
    elif no_length_penalty and averaging_mode == 'arithmetic':
        bp = 0.0
    else:
        assert not no_length_penalty
        assert averaging_mode != 'arithmetic', 'Not sure how to apply the length penalty in arithmetic mode'
        bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0: if there are no matching
    # unigrams, there won't be any higher-order matches either.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_lengths)

    if averaging_mode == "geometric":
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
        s = bp * math.exp(math.fsum(s))
    elif averaging_mode == "arithmetic":
        s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
        s = math.fsum(s)

    return s
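A sketch of the extra knobs, reusing the docstring data above:

hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
        'interested', 'in', 'world', 'history']
ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
         'because', 'he', 'read', 'the', 'book']

print(corpus_bleu([[ref2a]], [hyp2]))                        # standard geometric BLEU
print(corpus_bleu([[ref2a]], [hyp2], no_length_penalty=True,
                  averaging_mode='arithmetic'))              # plain mean of the n-gram precisions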
Example #9
def modified_corpus_bleu(list_of_references,
                         hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None,
                         auto_reweigh=False):
    """
    modified from nltk.translate.bleu_score.corpus_bleu,
    returns 'multi-bleu.perl'-like intermediate results.
    Args:
        list_of_references:
        hypotheses:
        weights:
        smoothing_function:
        auto_reweigh:

    Returns:

    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), f"The number of hypotheses and their reference(s) should be " \
                                                       f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns a zero score if there are no matching n-grams.
    # We only need to check p_numerators[1] == 0: if there are no matching
    # unigrams, there won't be any higher-order matches either.
    if p_numerators[1] == 0:
        return 0, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_lengths)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
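Unpacking the 'multi-bleu.perl'-style tuple (toy data; the names on the left are mine):

bleu, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(
    [[['the', 'cat', 'sat', 'on', 'the', 'mat']]],
    [['the', 'cat', 'sat', 'on', 'the', 'mat']])
print(f'BLEU = {bleu:.4f}, BP = {bp:.3f}, ratio = {ratio:.3f}')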