Exemplo n.º 1
0
    def test_bleu(self):
        """Check that BLEU agrees across input forms and across languages.

        The same English sentence pair is scored once as raw strings and
        once as pre-split token lists; both scores must match. A Japanese
        sentence pair is then scored with a ``lang="ja"`` calculator and is
        expected to yield the same score as the English pair.
        """
        hypothesis = "I am waiting on the beach"
        reference = "He is walking on the beach"

        calculator = BLEUCalculator()
        score = calculator.bleu(hypothesis, reference)
        score_from_list = calculator.bleu(hypothesis.split(),
                                          [reference.split()])
        self.assertLess(abs(score - score_from_list), 1e-8)

        calculator_ja = BLEUCalculator(lang="ja")
        score_ja = calculator_ja.bleu("私はビーチで待ってる", "彼がベンチで待ってる")
        self.assertLess(abs(score - score_ja), 1e-8)
def eval_rouges(refrence_summary, model_summary):
    """Score a model summary against one reference with ROUGE and BLEU.

    Args:
        refrence_summary: The reference (gold) summary text.
        model_summary: The generated summary text to evaluate.

    Returns:
        A 5-tuple ``(rouge_1, rouge_2, rouge_l, rouge_be, bleu_score)``.
    """
    rouge = RougeCalculator(stopwords=True, lang="en")

    # ROUGE-1 is given the bare reference; every other metric receives it
    # wrapped in a one-element list.
    rouge_1 = rouge.rouge_n(summary=model_summary,
                            references=refrence_summary,
                            n=1)
    rouge_2 = rouge.rouge_n(summary=model_summary,
                            references=[refrence_summary],
                            n=2)
    rouge_l = rouge.rouge_l(summary=model_summary,
                            references=[refrence_summary])

    # You need spaCy to calculate ROUGE-BE
    rouge_be = rouge.rouge_be(summary=model_summary,
                              references=[refrence_summary])

    bleu_score = BLEUCalculator().bleu(summary=model_summary,
                                       references=[refrence_summary])

    return rouge_1, rouge_2, rouge_l, rouge_be, bleu_score
    def convert_excel_to_df_and_evaluate(self):
        """Load the Excel file into a DataFrame and add BLEU/ROUGE columns.

        Reads ``self.input_file``, scores every row with Japanese BLEU and
        ROUGE calculators, and stores the results in the new columns
        ``bleu``, ``rouge_1``, ``rouge_2`` and ``rouge_long``.

        Returns:
            The parsed DataFrame with the four score columns added.
        """
        input_df = pd.ExcelFile(self.input_file).parse()

        bleu_ja = BLEUCalculator(lang="ja")
        rouge = RougeCalculator(lang="ja")

        bleu_scores = []
        rouge_1_scores = []
        rouge_2_scores = []
        rouge_l_scores = []
        for _, row in input_df.iterrows():
            # 'ref_text' is passed as the summary and 'input_text' as the
            # reference for every metric.
            candidate = row['ref_text']
            reference = row['input_text']

            # BLEU
            bleu_scores.append(bleu_ja.bleu(candidate, reference))
            # ROUGE-N with n=1
            rouge_1_scores.append(
                rouge.rouge_n(summary=candidate, references=reference, n=1))
            # ROUGE-N with n=2
            rouge_2_scores.append(
                rouge.rouge_n(summary=candidate, references=reference, n=2))
            # ROUGE-L
            rouge_l_scores.append(
                rouge.rouge_l(summary=candidate, references=reference))

        input_df['bleu'] = bleu_scores
        input_df['rouge_1'] = rouge_1_scores
        input_df['rouge_2'] = rouge_2_scores
        input_df['rouge_long'] = rouge_l_scores
        return input_df
Exemplo n.º 4
0
def main(args):
    """Print the average sentence-level BLEU of system output vs. references.

    Args:
        args: Object exposing ``system_output`` and ``reference``, each
            passed to ``read_file``. Entry ``i`` of the system output is
            scored against entry ``i`` of the references.
    """
    system_out = read_file(args.system_output)
    reference_list = read_file(args.reference)
    bleu = BLEUCalculator()
    bleu_list = [
        bleu.bleu(summary=sentence, references=reference_list[position])
        for position, sentence in enumerate(system_out)
    ]
    print('SACRE_BLEU\t%.6f' % (np.average(bleu_list)))
Exemplo n.º 5
0
def evaluate_bleu(summary, references, lang="zh"):
    """Compute per-item and average BLEU for parallel summaries/references.

    Args:
        summary: Sequence of candidate summaries.
        references: Sequence of references, one entry per summary.
        lang: Language passed to ``BLEUCalculator``.

    Returns:
        A tuple ``(score_avg, scores)``: the arithmetic mean of the scores
        and the list of individual scores.

    Raises:
        AssertionError: If the two sequences differ in length.
        ZeroDivisionError: If both sequences are empty.
    """
    bleu_calc = BLEUCalculator(lang=lang)
    assert len(summary) == len(references), "number of summary and references should be equal"

    scores = [
        bleu_calc.bleu(candidate, refs)
        for candidate, refs in zip(summary, references)
    ]
    return sum(scores) / len(scores), scores
def cal_bleu(prediction_str, target_str):
    """Return the mean BLEU score over paired prediction/target sequences.

    Each token sequence is joined with single spaces and cut at the first
    ``'<eos>'`` marker before being scored.

    Args:
        prediction_str: Collection of predicted token sequences.
        target_str: Indexable collection of target token sequences; item
            ``i`` is scored against prediction ``i``.

    Returns:
        ``np.mean`` of the per-pair BLEU scores.
    """

    def cut_at_eos(text):
        # str.find returns -1 when the marker is absent. Slicing with that
        # value (minus one) used to chop the last two characters off
        # texts that contain no '<eos>' at all, so handle it explicitly.
        eos_index = text.find('<eos>')
        if eos_index < 0:
            return text
        # Drop the marker and everything after it, plus the joining space
        # in front of it. A marker at position 0 yields an empty string.
        return text[:eos_index].rstrip(' ')

    bleu = BLEUCalculator()
    total_bleu = []
    for index, prediction_tokens in enumerate(prediction_str):
        prediction_rel = cut_at_eos(' '.join(prediction_tokens))
        target_rel = cut_at_eos(' '.join(target_str[index]))
        total_bleu.append(bleu.bleu(prediction_rel, target_rel))
    return np.mean(total_bleu)
Exemplo n.º 7
0
class LanguageMetrics(object):
    """Static helpers that score a summary with shared BLEU/ROUGE calculators.

    Both calculators are created once, at class-definition time, and are
    shared by every call. Each public helper accepts either a single
    reference string or a list of references.
    """

    bleu = BLEUCalculator(tokenizer=SimpleTokenizer())
    rouge = RougeCalculator(stopwords=True,
                            lang="en",
                            tokenizer=SimpleTokenizer())

    @staticmethod
    def _computeScore(summary, refs, criteria):
        """Call ``criteria(summary=..., references=...)`` and return its result.

        A bare string passed as ``refs`` is wrapped in a one-element list.
        """
        if isinstance(refs, str):
            refs = [refs]
        score = criteria(summary=summary, references=refs)
        return score

    @staticmethod
    def blue_score(summary, refs):
        """Return the BLEU score of ``summary`` against ``refs``."""
        score = LanguageMetrics._computeScore(summary, refs,
                                              LanguageMetrics.bleu.bleu)
        return score

    # Correctly spelled alias; ``blue_score`` is kept for existing callers.
    bleu_score = blue_score

    @staticmethod
    def rouge_1_score(summary, refs):
        """Return the ROUGE-1 score of ``summary`` against ``refs``."""
        score = LanguageMetrics._computeScore(summary, refs,
                                              LanguageMetrics.rouge.rouge_1)
        return score

    @staticmethod
    def rouge_2_score(summary, refs):
        """Return the ROUGE-2 score of ``summary`` against ``refs``."""
        score = LanguageMetrics._computeScore(summary, refs,
                                              LanguageMetrics.rouge.rouge_2)
        return score

    @staticmethod
    def rouge_l_score(summary, refs):
        """Return the ROUGE-L score of ``summary`` against ``refs``."""
        score = LanguageMetrics._computeScore(summary, refs,
                                              LanguageMetrics.rouge.rouge_l)
        return score

    @staticmethod
    def rouge_be_score(summary, refs):
        """Return the ROUGE-BE score of ``summary`` against ``refs``."""
        score = LanguageMetrics._computeScore(summary, refs,
                                              LanguageMetrics.rouge.rouge_be)
        return score

    @staticmethod
    def rouge_n_score(summary, refs, n):
        """Return the ROUGE-N score of ``summary`` against ``refs``.

        Args:
            summary: Candidate summary.
            refs: A reference string or a list of references.
            n: N-gram size forwarded to ``rouge_n``.
        """
        # functools.partial takes the callable positionally; passing it as
        # ``func=...`` raises TypeError before any score is computed.
        rouge_n = partial(LanguageMetrics.rouge.rouge_n, n=n)
        score = LanguageMetrics._computeScore(summary, refs, rouge_n)
        return score
Exemplo n.º 8
0
def myeval(valid_x, valid_y, vocab, model):
    """Evaluate ``model`` on a fixed number of validation minibatches.

    For 63 minibatches the model is run for the loss, predictions are
    decoded with ``greedy``, and each prediction/reference pair is written
    to ``tmp/systems/<ckpt>.txt`` and ``tmp/models/ref_<ckpt>.txt`` and
    scored with ROUGE-1, ROUGE-2, ROUGE-L and BLEU. Averages and
    ``2 ** avg_loss`` (printed as perplexity) go to stdout. The model is
    put in eval mode at the start and back in train mode at the end.

    Args:
        valid_x: Source batch provider exposing ``next_batch()``.
        valid_y: Target batch provider exposing ``next_batch()``.
        vocab: Mapping from token string to token id.
        model: Model exposing ``eval()`` and ``train()``.

    Relies on the module-level names ``args`` and ``config``.
    """
    rouge = RougeCalculator(stopwords=True, lang="zh")
    bleu_ch = BLEUCalculator(lang="zh")

    model.eval()
    sum_rouge_1 = 0
    sum_rouge_2 = 0
    sum_rouge_L = 0
    score_ch = 0
    sum_loss = 0
    sample_count = 0
    limit = 63
    logging.info('Evaluating on %d minibatches...', limit)

    i2w = {index: word for word, index in vocab.items()}
    # Hoisted: these ids do not change while decoding.
    end_id = vocab[config.end_tok]
    pad_id = vocab[config.pad_tok]

    # NOTE(review): the first 9 characters of the checkpoint path are
    # dropped, presumably a fixed directory prefix -- confirm.
    ckpt_file = args.ckpt_file[9:]
    pred_path = os.path.join('tmp/systems', '%s.txt' % ckpt_file)
    ref_path = os.path.join('tmp/models', 'ref_%s.txt' % ckpt_file)

    # Context managers close both files even if scoring raises.
    with open(pred_path, "w") as fout_pred, open(ref_path, "w") as fout_y:
        for _ in range(limit):
            with torch.no_grad():
                loss = run_batch(valid_x, valid_y, model)
                sum_loss += loss
                _, x = valid_x.next_batch()
                pred = greedy(model, x, vocab)
                _, y = valid_y.next_batch()
                # NOTE(review): drops the first target column, presumably
                # a start-of-sequence token -- confirm.
                y = y[:, 1:].tolist()
                for idx, pred_ids in enumerate(pred):
                    pred_text = " ".join(
                        i2w[tok] for tok in pred_ids
                        if tok != end_id and tok != pad_id)
                    ref_text = " ".join(
                        i2w[tok] for tok in y[idx]
                        if tok != end_id and tok != pad_id)
                    fout_pred.write(pred_text + "\n")
                    fout_y.write(ref_text + "\n")
                    sum_rouge_1 += rouge.rouge_n(references=ref_text,
                                                 summary=pred_text,
                                                 n=1)
                    sum_rouge_2 += rouge.rouge_n(references=ref_text,
                                                 summary=pred_text,
                                                 n=2)
                    sum_rouge_L += rouge.rouge_l(references=ref_text,
                                                 summary=pred_text)
                    # Keyword arguments keep the prediction as the summary
                    # and the gold text as the reference, matching the
                    # ROUGE calls; the positional form had them swapped.
                    score_ch += bleu_ch.bleu(summary=pred_text,
                                             references=ref_text)
                    sample_count += 1

    # Divide by the number of pairs actually scored rather than assuming
    # every batch has the size of the last one. max() avoids a division
    # by zero when nothing was scored.
    denominator = max(sample_count, 1)
    avg_rouge_1 = sum_rouge_1 / denominator
    avg_rouge_2 = sum_rouge_2 / denominator
    avg_rouge_L = sum_rouge_L / denominator
    avg_bleu_ch = score_ch / denominator
    avg_loss = sum_loss / limit
    print("ROUGE_1 = ", avg_rouge_1)
    print("ROUGE_2 = ", avg_rouge_2)
    print("ROUGE_L = ", avg_rouge_L)
    print("BLEU = ", avg_bleu_ch)
    print("Perplexity = ", math.pow(2, avg_loss))
    model.train()
Exemplo n.º 9
0
 def __init__(self,
              metrics: List[str] = [
                  "rouge_1", "rouge_2", "rouge_l", "rouge_be", "bleu"
              ],
              lang: str = "en",
              stopwords: bool = True,
              stemming: bool = True,
              use_porter=True):
     """Set up the ROUGE and BLEU calculators and the metric list.

     Args:
         metrics: Metric names; stored as a new sorted list.
         lang: Language passed to the BLEU calculator only. The ROUGE
             calculator always uses an English setting.
         stopwords: Forwarded to ``RougeCalculator``.
         stemming: Forwarded to ``RougeCalculator``.
         use_porter: Selects ``"en-porter"`` instead of ``"en"`` as the
             ROUGE language.
     """
     rouge_lang = "en-porter" if use_porter else "en"
     self.rouge = RougeCalculator(stopwords=stopwords,
                                  stemming=stemming,
                                  lang=rouge_lang)
     self.bleu = BLEUCalculator(lang=lang)
     self.metrics = sorted(metrics)
Exemplo n.º 10
0
def computeSacreBleu(translation_path,
                     reference_path,
                     lang,
                     detokenize_trans=True,
                     detokenize_ref=False):
    """Compute, print and return BLEU for a translation file vs. a reference.

    Args:
        translation_path: Path passed to ``readSentences`` for the
            translations.
        reference_path: Path passed to ``readSentences`` for the references.
        lang: Language used for both ``BLEUCalculator`` and
            ``MosesDetokenizer``.
        detokenize_trans: Run the detokenizer over each translation.
        detokenize_ref: Run the detokenizer over each reference.

    Returns:
        The value returned by ``BLEUCalculator.bleu`` with
        ``score_only=True``.
    """
    bleu = BLEUCalculator(lang=lang)
    trans = readSentences(translation_path)
    reference = readSentences(reference_path)

    # The detokenizer is only built when at least one side needs it.
    if detokenize_trans or detokenize_ref:
        detok = MosesDetokenizer(lang)
        if detokenize_trans:
            trans = [detok([sentence]) for sentence in trans]
        if detokenize_ref:
            reference = [detok([sentence]) for sentence in reference]

    bleu_score = bleu.bleu(summary=trans,
                           references=[reference],
                           score_only=True)
    print(bleu_score)
    return bleu_score
Exemplo n.º 11
0
    def test_custom_lang(self):
        """Check that a user-defined language works for ROUGE and BLEU.

        The custom language tokenizes on ``"/"``; both metrics must return
        a score greater than zero for slash-separated inputs.
        """
        class Custom(BaseLang):
            def __init__(self):
                super().__init__("cs")

            def tokenize(self, text):
                return text.split("/")

        slash_lang = Custom()

        rouge_score = RougeCalculator(lang=slash_lang).rouge_n(
            summary="I/went/to/the/Mars/from/my/living/town.",
            references="I/went/to/Mars",
            n=1)

        bleu_score = BLEUCalculator(lang=slash_lang).bleu(
            "I/am/waiting/on/the/beach",
            "He/is/walking/on/the/beach")

        self.assertGreater(rouge_score, 0)
        self.assertGreater(bleu_score, 0)
Exemplo n.º 12
0
 def __init__(self):
     """Create the BLEU calculator (default settings) used by this instance."""
     self.bleu_calc = BLEUCalculator()
Exemplo n.º 13
0
def main(score_desc: ("ex: To calculate ROUGE-N, L, BE => 'r-nlb'"),
         use_file: ("read data from file", "flag", "f"),
         include_stopwords: ("don't ignore stop words", "flag", "in"),
         stemming: ("use stemming", "flag", "st"),
         word_limit: ("word limit count", "option", "wl") = -1,
         length_limit: ("sentence limit length", "option", "ll") = -1,
         alpha: ("alpha for f1-score", "option") = 0.5,
         language: ("language of the summary and references", "option", "la") = "en",
         *params):
    """Score a summary against references and print the result as JSON.

    NOTE(review): the tuple annotations look like CLI argument descriptors
    (help text, kind, abbreviation) consumed by an argument-parsing
    library -- confirm which one.

    Args:
        score_desc: ``"<type>-<kinds>"``. Type ``r`` selects ROUGE and
            ``b`` selects BLEU. For ROUGE, each character of ``<kinds>``
            picks a metric: ``n`` (ROUGE-1 and ROUGE-2), ``l`` (ROUGE-L),
            ``b`` (ROUGE-BE). Other characters are ignored.
        use_file: Treat the positional parameters as file paths.
        include_stopwords: Keep stop words (inverted into ``stopwords``).
        stemming: Forwarded to ``RougeCalculator``.
        word_limit: Forwarded to ``RougeCalculator``.
        length_limit: Forwarded to ``RougeCalculator``.
        alpha: Forwarded to the ROUGE scoring methods.
        language: Language forwarded to the calculator.
        *params: The summary followed by one or more references.

    Returns:
        None. Prints a message and returns early when fewer than two
        positional parameters are given.
    """
    # partition() yields an empty kinds string when there is no hyphen and
    # does not fail on descriptors that contain more than one hyphen.
    score_type, _, score_kinds = score_desc.lower().partition("-")

    if len(params) < 2:
        print("You have to specify at least one summary and reference.")
        return

    summary = params[0]
    references = list(params[1:])
    stopwords = not include_stopwords

    if use_file:
        generator = file_generator(summary, references)
    else:
        generator = sentence_to_generator(summary, references)

    scores = []
    keys = []
    if score_type == "r":
        scorer = RougeCalculator(stopwords=stopwords,
                                 stemming=stemming,
                                 word_limit=word_limit,
                                 length_limit=length_limit,
                                 lang=language)

        for s, rs in generator:
            score = {}
            for k in score_kinds:
                if k == "n":
                    score["ROUGE-1"] = scorer.rouge_1(s, rs, alpha)
                    score["ROUGE-2"] = scorer.rouge_2(s, rs, alpha)
                elif k == "l":
                    score["ROUGE-L"] = scorer.rouge_l(s, rs, alpha)
                elif k == "b":
                    score["ROUGE-BE"] = scorer.rouge_be(s, rs, "HMR", alpha)
            if not keys:
                keys = list(score.keys())
            scores.append(score)

    elif score_type == "b":
        scorer = BLEUCalculator(lang=language)
        for s, rs in generator:
            score = {}
            # NOTE(review): this prints ahead of the JSON document on
            # stdout; confirm it is intended and not leftover debugging.
            print(s, rs)
            score["BLEU"] = scorer.bleu(s, rs)
            if not keys:
                keys = list(score.keys())
            scores.append(score)

    # Metric names are taken from the first scored item.
    avgs = {k: mean([s[k] for s in scores]) for k in keys}

    result = {
        "options": {
            "stopwords": stopwords,
            "stemming": stemming,
            "word_limit": word_limit,
            "length_limit": length_limit,
            "alpha": alpha,
            "input-summary": summary,
            "input-references": references
        },
        "averages": avgs,
        "scores": scores
    }

    output = json.dumps(result, indent=2, ensure_ascii=False)
    print(output)
Exemplo n.º 14
0
                                                      document.ref_keywords)
    tokens: Set[str] = retrieve_lemmatized_tokens(document.lang,
                                                  document.keywords)
    tp = len(ref_tokens.intersection(tokens))
    fp = len(tokens) - tp
    fn = len(ref_tokens) - tp
    eps = 1e-6
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    res = {'precision': precision, 'recall': recall, 'f1': f1}
    return res


# Module-level calculators, created once at import time: an English ROUGE
# calculator with stopwords=True and a BLEU calculator with default settings.
rouge_en = RougeCalculator(stopwords=True, lang="en")
bleu = BLEUCalculator()


def evaluate_summary(document: DocumentForEval) -> Dict[str, float]:
    """Score a document's summary against its reference summary.

    The summary and reference sentences are each joined with newlines and
    scored with ROUGE-N for n = 1..4 using the module-level ``rouge_en``.

    Raises:
        ValueError: If ``document.lang`` is not ``'en'`` (case-insensitive).

    NOTE(review): the visible body ends after ROUGE-4 without a return
    statement; the remainder of the function appears to be missing here.
    """
    summary = '\n'.join(document.summary)
    ref_summary = '\n'.join(document.ref_summary)
    if document.lang.lower() != 'en':
        raise ValueError("Only English language is supported at the moment.")

    rouge_1 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=1)

    rouge_2 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=2)

    rouge_3 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=3)

    rouge_4 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=4)
Exemplo n.º 15
0
#Evaluation task ##################################

from sumeval.metrics.rouge import RougeCalculator

# Demo: score one fixed summary with several ROUGE variants and print them.
rouge = RougeCalculator(stopwords=True, lang="en")

# ROUGE-1 against a single reference given as a plain string.
rouge_1 = rouge.rouge_n(summary="I went to the Mars from my living town.",
                        references="I went to Mars",
                        n=1)

# ROUGE-2 against two references given as a list.
rouge_2 = rouge.rouge_n(summary="I went to the Mars from my living town.",
                        references=["I went to Mars", "It's my living town"],
                        n=2)

# ROUGE-L against the same two references.
rouge_l = rouge.rouge_l(summary="I went to the Mars from my living town.",
                        references=["I went to Mars", "It's my living town"])

# You need spaCy to calculate ROUGE-BE

rouge_be = rouge.rouge_be(summary="I went to the Mars from my living town.",
                          references=["I went to Mars", "It's my living town"])

# Format all four scores on one line, then turn each ", " separator into a
# newline so every metric is printed on its own line.
print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
    rouge_1, rouge_2, rouge_l, rouge_be).replace(", ", "\n"))

from sumeval.metrics.bleu import BLEUCalculator

# Demo: BLEU for one candidate sentence against one reference sentence,
# using a calculator with default settings.
bleu = BLEUCalculator()
score = bleu.bleu("I am waiting on the beach", "He is walking on the beach")

# NOTE(review): the name suggests a Japanese calculator, but it is built
# with lang="en" -- confirm which language is intended.
bleu_ja = BLEUCalculator(lang="en")