def convert_excel_to_df_and_evaluate(self):
    """Load the input Excel file and append BLEU/ROUGE score columns.

    The workbook at ``self.input_file`` is parsed with pandas' default
    sheet selection.  For every row, ``ref_text`` is passed to the
    calculators as the summary and ``input_text`` as the reference, using
    Japanese (``lang="ja"``) BLEU and ROUGE calculators.

    NOTE(review): the column names suggest ``ref_text`` might be the
    reference rather than the summary -- confirm the argument order with
    the data owner.  The order is kept exactly as it was.

    Returns:
        The parsed DataFrame with four extra columns: ``bleu``,
        ``rouge_1``, ``rouge_2`` and ``rouge_long``.
    """
    # Open the workbook as a context manager so its file handle is released
    # as soon as the sheet has been parsed.
    with pd.ExcelFile(self.input_file) as workbook:
        input_df = workbook.parse()

    bleu_ja = BLEUCalculator(lang="ja")
    rouge = RougeCalculator(lang="ja")

    output_dict = {
        'bleu': [],
        'rouge_1': [],
        'rouge_2': [],
        'rouge_long': [],
    }

    for _, row in input_df.iterrows():
        ref_text = row['ref_text']
        input_text = row['input_text']

        # BLEU score
        output_dict['bleu'].append(bleu_ja.bleu(ref_text, input_text))
        # ROUGE-1 score
        output_dict['rouge_1'].append(
            rouge.rouge_n(summary=ref_text, references=input_text, n=1))
        # ROUGE-2 score
        output_dict['rouge_2'].append(
            rouge.rouge_n(summary=ref_text, references=input_text, n=2))
        # ROUGE-L score
        output_dict['rouge_long'].append(
            rouge.rouge_l(summary=ref_text, references=input_text))
        # ROUGE-BE is not computed here.

    for column, values in output_dict.items():
        input_df[column] = values
    return input_df
def eval_rouges(refrence_summary, model_summary):
    """Score one model summary against one reference with ROUGE and BLEU.

    Args:
        refrence_summary: The reference text.
        model_summary: The generated text to evaluate.

    Returns:
        The tuple ``(rouge_1, rouge_2, rouge_l, rouge_be, bleu_score)``.
    """
    rouge_calc = RougeCalculator(stopwords=True, lang="en")

    # ROUGE-1 is given the bare reference; every other metric is given the
    # reference wrapped in a one-element list.
    unigram_score = rouge_calc.rouge_n(
        summary=model_summary, references=refrence_summary, n=1)
    bigram_score = rouge_calc.rouge_n(
        summary=model_summary, references=[refrence_summary], n=2)
    lcs_score = rouge_calc.rouge_l(
        summary=model_summary, references=[refrence_summary])

    # You need spaCy to calculate ROUGE-BE.
    basic_element_score = rouge_calc.rouge_be(
        summary=model_summary, references=[refrence_summary])

    bleu_score = BLEUCalculator().bleu(
        summary=model_summary, references=[refrence_summary])

    return (unigram_score, bigram_score, lcs_score, basic_element_score,
            bleu_score)
def main(args):
    """Print the average sentence-level BLEU of a system output file.

    ``args.system_output`` and ``args.reference`` are loaded with
    ``read_file``; the i-th system sentence is scored against the i-th
    reference entry, and the average of all scores is printed.
    """
    hypotheses = read_file(args.system_output)
    reference_list = read_file(args.reference)

    scorer = BLEUCalculator()
    # References are looked up by position, so a reference list shorter than
    # the system output raises IndexError.
    sentence_scores = [
        scorer.bleu(summary=sentence, references=reference_list[position])
        for position, sentence in enumerate(hypotheses)
    ]

    print('SACRE_BLEU\t%.6f' % (np.average(sentence_scores)))
def evaluate_bleu(summary, references, lang="zh"):
    """Compute per-pair BLEU scores and their arithmetic mean.

    Args:
        summary: Sequence of candidate texts.
        references: Sequence of reference entries, aligned with ``summary``.
        lang: Language passed to ``BLEUCalculator``.

    Returns:
        ``(average_score, scores)`` where ``scores`` holds one BLEU value
        per pair.

    Raises:
        AssertionError: If the two sequences differ in length.
        ZeroDivisionError: If both sequences are empty.
    """
    calculator = BLEUCalculator(lang=lang)
    assert len(summary) == len(references), "number of summary and references should be equal"

    pair_scores = [
        calculator.bleu(candidate, refs)
        for candidate, refs in zip(summary, references)
    ]
    mean_score = sum(pair_scores) / len(pair_scores)
    return mean_score, pair_scores
def cal_bleu(prediction_str, target_str):
    """Return the mean BLEU of predictions against their targets.

    Each entry of ``prediction_str`` / ``target_str`` is joined with single
    spaces, truncated at the first ``'<eos>'`` marker (if any) and then
    scored with a default ``BLEUCalculator``.

    Args:
        prediction_str: Indexable collection of predicted token sequences.
        target_str: Indexable collection of target token sequences; entry
            ``i`` is the reference for prediction ``i``.

    Returns:
        ``np.mean`` over the per-pair BLEU scores.

    Raises:
        IndexError: If ``target_str`` has fewer entries than
            ``prediction_str``.
    """

    def _clip_at_eos(tokens):
        """Join tokens and drop everything from the first '<eos>' onwards."""
        text = ' '.join(tokens)
        eos_index = text.find('<eos>')
        if eos_index < 0:
            # No marker present: keep the whole sentence.  Slicing with the
            # -1 returned by find() would silently cut real characters.
            return text
        # Also drop the separator that preceded the marker.
        return text[:eos_index].rstrip()

    bleu = BLEUCalculator()
    total_bleu = [
        bleu.bleu(_clip_at_eos(prediction), _clip_at_eos(target_str[index]))
        for index, prediction in enumerate(prediction_str)
    ]
    return np.mean(total_bleu)
def myeval(valid_x, valid_y, vocab, model):
    """Evaluate ``model`` on validation batches and print summary metrics.

    Runs a fixed number of minibatches.  For each one the loss is obtained
    from ``run_batch``, predictions are decoded with ``greedy`` and every
    prediction/target pair is written to disk and scored with ROUGE-1,
    ROUGE-2, ROUGE-L and BLEU (all with ``lang="zh"``).  Averages and
    ``2 ** average_loss`` (labelled "Perplexity") are printed at the end.

    The function depends on module-level ``args``, ``config``, ``run_batch``
    and ``greedy``.  The model is put in eval mode on entry and back in
    train mode on exit.

    Args:
        valid_x: Batch source exposing ``next_batch()`` for inputs.
        valid_y: Batch source exposing ``next_batch()`` for targets.
        vocab: Mapping from token string to token id.
        model: The model under evaluation.
    """
    rouge = RougeCalculator(stopwords=True, lang="zh")
    bleu_ch = BLEUCalculator(lang="zh")
    model.eval()

    limit = 63
    logging.info('Evaluating on %d minibatches...' % limit)

    # Inverse vocabulary: token id -> token string.
    i2w = {token_id: word for word, token_id in vocab.items()}
    end_id = vocab[config.end_tok]
    pad_id = vocab[config.pad_tok]

    # The first 9 characters of the checkpoint name are dropped
    # (presumably a directory prefix -- TODO confirm).
    ckpt_file = args.ckpt_file[9:]
    pred_path = os.path.join('tmp/systems', '%s.txt' % ckpt_file)
    ref_path = os.path.join('tmp/models', 'ref_%s.txt' % ckpt_file)

    sum_rouge_1 = 0
    sum_rouge_2 = 0
    sum_rouge_L = 0
    score_ch = 0
    sum_loss = 0
    num_samples = 0

    # ``with`` guarantees both files are closed even if scoring raises.
    with open(pred_path, "w") as fout_pred, open(ref_path, "w") as fout_y:
        for _batch in range(limit):
            with torch.no_grad():
                loss = run_batch(valid_x, valid_y, model)
                sum_loss += loss
                _, x = valid_x.next_batch()
                pred = greedy(model, x, vocab)
            _, y = valid_y.next_batch()
            # Drop the first target column
            # (presumably the start token -- TODO confirm).
            y = y[:, 1:].tolist()

            for idx in range(len(pred)):
                line_pred = [i2w[tok] for tok in pred[idx]
                             if tok != end_id and tok != pad_id]
                line_y = [i2w[tok] for tok in y[idx]
                          if tok != end_id and tok != pad_id]
                pred_text = " ".join(line_pred)
                ref_text = " ".join(line_y)

                fout_pred.write(pred_text + "\n")
                fout_y.write(ref_text + "\n")

                sum_rouge_1 += rouge.rouge_n(
                    references=ref_text, summary=pred_text, n=1)
                sum_rouge_2 += rouge.rouge_n(
                    references=ref_text, summary=pred_text, n=2)
                sum_rouge_L += rouge.rouge_l(
                    references=ref_text, summary=pred_text)
                # Keyword arguments keep BLEU consistent with ROUGE: the
                # prediction is the summary, the target is the reference.
                score_ch += bleu_ch.bleu(
                    summary=pred_text, references=ref_text)
                num_samples += 1

    # Average over the samples actually scored, so batches of different
    # sizes do not skew the result.
    avg_rouge_1 = sum_rouge_1 / num_samples
    avg_rouge_2 = sum_rouge_2 / num_samples
    avg_rouge_L = sum_rouge_L / num_samples
    avg_bleu_ch = score_ch / num_samples
    avg_loss = sum_loss / limit

    print("ROUGE_1 = ", avg_rouge_1)
    print("ROUGE_2 = ", avg_rouge_2)
    print("ROUGE_L = ", avg_rouge_L)
    print("BLEU = ", avg_bleu_ch)
    print("Perplexity = ", math.pow(2, avg_loss))
    model.train()
def __init__(self,
             metrics: List[str] = [
                 "rouge_1", "rouge_2", "rouge_l", "rouge_be", "bleu"
             ],
             lang: str = "en",
             stopwords: bool = True,
             stemming: bool = True,
             use_porter=True):
    """Set up the ROUGE and BLEU calculators.

    Args:
        metrics: Names of the metrics to report; stored sorted.
        lang: Language passed to the BLEU calculator only.
        stopwords: Forwarded to ``RougeCalculator``.
        stemming: Forwarded to ``RougeCalculator``.
        use_porter: Selects the ROUGE language ``"en-porter"`` when true,
            ``"en"`` otherwise.
    """
    # ROUGE is always configured for English; only the variant changes.
    rouge_lang = "en-porter" if use_porter else "en"
    self.rouge = RougeCalculator(stopwords=stopwords,
                                 stemming=stemming,
                                 lang=rouge_lang)
    self.bleu = BLEUCalculator(lang=lang)
    # sorted() returns a new list, so the default argument is never mutated.
    self.metrics = sorted(metrics)
def computeSacreBleu(translation_path, reference_path, lang, detokenize_trans=True, detokenize_ref=False):
    """Compute, print and return BLEU for a translation file.

    Args:
        translation_path: Path handed to ``readSentences`` for the system
            output.
        reference_path: Path handed to ``readSentences`` for the reference.
        lang: Language for ``BLEUCalculator`` and ``MosesDetokenizer``.
        detokenize_trans: Detokenize every translation sentence first.
        detokenize_ref: Detokenize every reference sentence first.

    Returns:
        The value returned by ``BLEUCalculator.bleu`` with
        ``score_only=True``.
    """
    scorer = BLEUCalculator(lang=lang)
    trans = readSentences(translation_path)
    reference = readSentences(reference_path)

    if detokenize_trans or detokenize_ref:
        # The detokenizer is only created when at least one side needs it.
        detok = MosesDetokenizer(lang)
        if detokenize_trans:
            # Each sentence is handed over as a one-element list.
            trans = [detok([sentence]) for sentence in trans]
        if detokenize_ref:
            reference = [detok([sentence]) for sentence in reference]

    bleu_score = scorer.bleu(summary=trans,
                             references=[reference],
                             score_only=True)
    print(bleu_score)
    return bleu_score
def test_custom_lang(self):
    # A user-defined BaseLang subclass must work with both calculators.

    class Custom(BaseLang):
        # Language registered as "cs" whose tokens are separated by "/".

        def __init__(self):
            super(Custom, self).__init__("cs")

        def tokenize(self, text):
            return text.split("/")

    slash_lang = Custom()

    rouge_score = RougeCalculator(lang=slash_lang).rouge_n(
        summary="I/went/to/the/Mars/from/my/living/town.",
        references="I/went/to/Mars",
        n=1)
    bleu_score = BLEUCalculator(lang=slash_lang).bleu(
        "I/am/waiting/on/the/beach",
        "He/is/walking/on/the/beach")

    self.assertGreater(rouge_score, 0)
    self.assertGreater(bleu_score, 0)
class LanguageMetrics(object):
    """Static helpers that score a summary against one or more references.

    A single BLEU calculator and a single English ROUGE calculator (with
    stop-word handling enabled) are created once at class-definition time
    and shared by every method.  All scoring methods accept ``refs`` either
    as one string or as a list of strings.
    """

    bleu = BLEUCalculator(tokenizer=SimpleTokenizer())
    rouge = RougeCalculator(stopwords=True, lang="en",
                            tokenizer=SimpleTokenizer())

    @staticmethod
    def _computeScore(summary, refs, criteria):
        """Call ``criteria(summary=..., references=...)`` with list refs."""
        if isinstance(refs, str):
            # A lone reference string is wrapped into a one-element list.
            refs = [refs]
        return criteria(summary=summary, references=refs)

    @staticmethod
    def bleu_score(summary, refs):
        """Return the BLEU score of ``summary`` against ``refs``."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.bleu.bleu)

    @staticmethod
    def blue_score(summary, refs):
        """Misspelled alias of ``bleu_score`` kept for existing callers."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.bleu.bleu)

    @staticmethod
    def rouge_1_score(summary, refs):
        """Return the ROUGE-1 score of ``summary`` against ``refs``."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.rouge.rouge_1)

    @staticmethod
    def rouge_2_score(summary, refs):
        """Return the ROUGE-2 score of ``summary`` against ``refs``."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.rouge.rouge_2)

    @staticmethod
    def rouge_l_score(summary, refs):
        """Return the ROUGE-L score of ``summary`` against ``refs``."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.rouge.rouge_l)

    @staticmethod
    def rouge_be_score(summary, refs):
        """Return the ROUGE-BE score of ``summary`` against ``refs``."""
        return LanguageMetrics._computeScore(
            summary, refs, LanguageMetrics.rouge.rouge_be)

    @staticmethod
    def rouge_n_score(summary, refs, n):
        """Return the ROUGE-N score of ``summary`` for the given ``n``."""
        # partial() requires the callable as its first positional argument;
        # passing it as ``func=`` raises TypeError.
        rouge_n = partial(LanguageMetrics.rouge.rouge_n, n=n)
        return LanguageMetrics._computeScore(summary, refs, rouge_n)
def test_bleu(self):
    # String input and pre-tokenized list input must give the same score.
    hypothesis = "I am waiting on the beach"
    reference = "He is walking on the beach"

    english = BLEUCalculator()
    score = english.bleu(hypothesis, reference)
    score_from_list = english.bleu(hypothesis.split(), [reference.split()])
    self.assertLess(abs(score - score_from_list), 1e-8)

    # The Japanese sentence pair is expected to score the same as the
    # English pair above.
    japanese = BLEUCalculator(lang="ja")
    score_ja = japanese.bleu("私はビーチで待ってる", "彼がベンチで待ってる")
    self.assertLess(abs(score - score_ja), 1e-8)
class SacreBleu():
    """Reward function that scores generated sequences with BLEU."""

    def __init__(self):
        """Create the BLEU calculator with its default settings."""
        self.bleu_calc = BLEUCalculator()

    def compute_reward(self, samples, sequence, model):
        """Return one BLEU-based reward per sample.

        Args:
            samples: Items exposing ``get_text`` and ``full_target_tokens``.
            sequence: Generated token sequences, aligned with ``samples``.
            model: Object whose ``vocab`` is used to render target text.

        Returns:
            A list with the BLEU score of each pair divided by 100.
        """
        # Everything from the first " EOS" onwards is discarded on both sides.
        references = [
            sample.get_text(sample.full_target_tokens,
                            model.vocab).split(" EOS")[0]
            for sample in samples
        ]
        summaries = [
            " ".join(str(token) for token in generated).split(" EOS")[0]
            for generated in sequence
        ]

        # Summaries are looked up by position, so fewer summaries than
        # references raises IndexError.
        return [
            self.bleu_calc.bleu(summaries[position], reference) / 100
            for position, reference in enumerate(references)
        ]
def main(score_desc: ("ex: To calculate ROUGE-N, L, BE => 'r-nlb'"),
         use_file: ("read data from file", "flag", "f"),
         include_stopwords: ("don't ignore stop words", "flag", "in"),
         stemming: ("use stemming", "flag", "st"),
         word_limit: ("word limit count", "option", "wl") = -1,
         length_limit: ("sentence limit length", "option", "ll") = -1,
         alpha: ("alpha for f1-score", "option") = 0.5,
         language: ("language of the texts", "option", "la") = "en",
         *params):
    """Command-line entry point: score a summary and print a JSON report.

    The parameter annotations are help/kind/abbreviation tuples
    (presumably consumed by a CLI parser such as plac -- confirm).

    Args:
        score_desc: Metric descriptor.  The part before the first ``-``
            selects the scorer (``r`` for ROUGE, ``b`` for BLEU); for ROUGE
            each following character selects a variant: ``n`` (ROUGE-1 and
            ROUGE-2), ``l`` (ROUGE-L), ``b`` (ROUGE-BE).  Other characters
            are ignored.
        use_file: Treat the positional arguments as file paths.
        include_stopwords: Keep stop words when computing ROUGE.
        stemming: Enable stemming for ROUGE.
        word_limit: Forwarded to ``RougeCalculator``.
        length_limit: Forwarded to ``RougeCalculator``.
        alpha: Forwarded to the ROUGE scoring methods.
        language: Language for the calculators.
        *params: The summary followed by one or more references.

    Prints a JSON document with the options used, the per-item scores and
    their averages.  Prints a usage hint and returns if fewer than two
    positional arguments are supplied.
    """
    # Split on the first dash only, so a descriptor such as "r-n-l" is
    # accepted instead of failing to unpack.
    score_type, _, score_kinds = score_desc.lower().partition("-")

    if len(params) < 2:
        print("You have to specify at least one summary and reference.")
        return

    summary = params[0]
    references = list(params[1:])
    stopwords = not include_stopwords

    if use_file:
        generator = file_generator(summary, references)
    else:
        generator = sentence_to_generator(summary, references)

    scores = []
    keys = []
    if score_type == "r":
        scorer = RougeCalculator(stopwords=stopwords,
                                 stemming=stemming,
                                 word_limit=word_limit,
                                 length_limit=length_limit,
                                 lang=language)
        for s, rs in generator:
            score = {}
            for k in score_kinds:
                if k == "n":
                    score["ROUGE-1"] = scorer.rouge_1(s, rs, alpha)
                    score["ROUGE-2"] = scorer.rouge_2(s, rs, alpha)
                elif k == "l":
                    score["ROUGE-L"] = scorer.rouge_l(s, rs, alpha)
                elif k == "b":
                    score["ROUGE-BE"] = scorer.rouge_be(s, rs, "HMR", alpha)
            # The first scored item defines which keys are averaged.
            if not keys:
                keys = list(score.keys())
            scores.append(score)
    elif score_type == "b":
        scorer = BLEUCalculator(lang=language)
        # Nothing is printed per pair, so stdout carries only the JSON report.
        for s, rs in generator:
            score = {"BLEU": scorer.bleu(s, rs)}
            if not keys:
                keys = list(score.keys())
            scores.append(score)

    avgs = {k: mean([s[k] for s in scores]) for k in keys}

    result = {
        "options": {
            "stopwords": stopwords,
            "stemming": stemming,
            "word_limit": word_limit,
            "length_limit": length_limit,
            "alpha": alpha,
            "input-summary": summary,
            "input-references": references
        },
        "averages": avgs,
        "scores": scores
    }

    output = json.dumps(result, indent=2, ensure_ascii=False)
    print(output)
document.ref_keywords) tokens: Set[str] = retrieve_lemmatized_tokens(document.lang, document.keywords) tp = len(ref_tokens.intersection(tokens)) fp = len(tokens) - tp fn = len(ref_tokens) - tp eps = 1e-6 precision = tp / (tp + fp + eps) recall = tp / (tp + fn + eps) f1 = 2 * (precision * recall) / (precision + recall + eps) res = {'precision': precision, 'recall': recall, 'f1': f1} return res rouge_en = RougeCalculator(stopwords=True, lang="en") bleu = BLEUCalculator() def evaluate_summary(document: DocumentForEval) -> Dict[str, float]: summary = '\n'.join(document.summary) ref_summary = '\n'.join(document.ref_summary) if document.lang.lower() != 'en': raise ValueError("Only English language is supported at the moment.") rouge_1 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=1) rouge_2 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=2) rouge_3 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=3) rouge_4 = rouge_en.rouge_n(summary=summary, references=ref_summary, n=4)
class SumEvaluator:
    """Evaluator class for generation.

    A wrapper class of sumeval library
    """

    def __init__(self,
                 metrics: List[str] = [
                     "rouge_1", "rouge_2", "rouge_l", "rouge_be", "bleu"
                 ],
                 lang: str = "en",
                 stopwords: bool = True,
                 stemming: bool = True,
                 use_porter=True):
        """Set up the ROUGE and BLEU calculators.

        Args:
            metrics: Names of the metrics to report; stored sorted.
            lang: Language passed to the BLEU calculator only.
            stopwords: Forwarded to ``RougeCalculator``.
            stemming: Forwarded to ``RougeCalculator``.
            use_porter: Selects the ROUGE language ``"en-porter"`` when
                true, ``"en"`` otherwise.
        """
        rouge_lang = "en-porter" if use_porter else "en"
        self.rouge = RougeCalculator(stopwords=stopwords,
                                     stemming=stemming,
                                     lang=rouge_lang)
        self.bleu = BLEUCalculator(lang=lang)
        self.metrics = sorted(metrics)

    def eval(self, true_gens: List[str], pred_gens: List[str]):
        """Score every prediction against its reference.

        Args:
            true_gens: Reference texts.
            pred_gens: Generated texts, aligned with ``true_gens``.

        Returns:
            A DataFrame with the columns ``pred``, ``true`` and one column
            per entry of ``self.metrics``.

        Raises:
            AssertionError: If the two lists differ in length.
        """
        assert len(true_gens) == len(pred_gens)

        # Listed in the same alphabetical order that sorted() gives
        # self.metrics, so the values line up with the column names.
        scorers = (
            # BLEU is divided by 100 to align its scale with ROUGE.
            ("bleu",
             lambda pred, true: self.bleu.bleu(pred, true) / 100.0),
            ("rouge_1",
             lambda pred, true: self.rouge.rouge_n(
                 summary=pred, references=[true], n=1)),
            ("rouge_2",
             lambda pred, true: self.rouge.rouge_n(
                 summary=pred, references=[true], n=2)),
            ("rouge_be",
             lambda pred, true: self.rouge.rouge_be(
                 summary=pred, references=[true])),
            ("rouge_l",
             lambda pred, true: self.rouge.rouge_l(
                 summary=pred, references=[true])),
        )
        active = [score for name, score in scorers if name in self.metrics]

        eval_list = [
            [pred_gen, true_gen] + [score(pred_gen, true_gen)
                                    for score in active]
            for true_gen, pred_gen in zip(true_gens, pred_gens)
        ]
        return pd.DataFrame(eval_list,
                            columns=["pred", "true"] + self.metrics)
# Evaluation task
##################################
from sumeval.metrics.rouge import RougeCalculator

# Inputs shared by the ROUGE demonstrations below.
SUMMARY_TEXT = "I went to the Mars from my living town."
REFERENCE_TEXTS = ["I went to Mars", "It's my living town"]

rouge = RougeCalculator(stopwords=True, lang="en")

# ROUGE-1 uses only the first reference, passed as a plain string.
rouge_1 = rouge.rouge_n(summary=SUMMARY_TEXT,
                        references=REFERENCE_TEXTS[0],
                        n=1)
rouge_2 = rouge.rouge_n(summary=SUMMARY_TEXT,
                        references=list(REFERENCE_TEXTS),
                        n=2)
rouge_l = rouge.rouge_l(summary=SUMMARY_TEXT,
                        references=list(REFERENCE_TEXTS))

# You need spaCy to calculate ROUGE-BE
rouge_be = rouge.rouge_be(summary=SUMMARY_TEXT,
                          references=list(REFERENCE_TEXTS))

# One metric per output line.
print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
    rouge_1, rouge_2, rouge_l, rouge_be).replace(", ", "\n"))

from sumeval.metrics.bleu import BLEUCalculator

bleu = BLEUCalculator()
score = bleu.bleu("I am waiting on the beach", "He is walking on the beach")

# NOTE(review): the name suggests a Japanese calculator, but it is created
# with lang="en" -- confirm which is intended.
bleu_ja = BLEUCalculator(lang="en")
nx_graph = nx.from_numpy_array(sim_mat) scores = nx.pagerank(nx_graph) ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True) # Specify number of sentences to form the summary sn = 2 generated_summary = '' # Generate summary for i in range(sn): generated_summary = ''.join(ranked_sentences[i][1]) # Evaluation rouge = RougeCalculator(stopwords=True, lang="en") bleu = BLEUCalculator() rouge_1_scores = [] rouge_1_scores.append( rouge.rouge_n(summary=generated_summary, references=summaries[0], n=1)) rouge_1_scores.append( rouge.rouge_n(summary=generated_summary, references=summaries[1], n=1)) rouge_1_scores.append( rouge.rouge_n(summary=generated_summary, references=summaries[2], n=1)) rouge_1_scores.append( rouge.rouge_n(summary=generated_summary, references=summaries[3], n=1)) rouge_1_scores.append(
# Get the dense tf-idf matrix for the document story_freq_term_matrix = count_vect.transform([doc]) story_tfidf_matrix = tfidf.transform(story_freq_term_matrix) story_dense = story_tfidf_matrix.todense() doc_matrix = story_dense.tolist()[0] # Get Top Ranking Sentences and join them as a summary top_sents = rank_sentences(doc, doc_matrix, feature_names, 2) summary = '.'.join([cleaned_document.split('.')[i] for i in [pair[0] for pair in top_sents]]) summary = ' '.join(summary.split()) print(summary) # Evaluation rouge = RougeCalculator(stopwords=True, lang="en") bleu = BLEUCalculator() rouge_1_scores = [] rouge_1_scores.append(rouge.rouge_n( summary=summary, references=summaries[0], n=1)) rouge_1_scores.append(rouge.rouge_n( summary=summary, references=summaries[1], n=1)) rouge_1_scores.append(rouge.rouge_n( summary=summary, references=summaries[2],
def __init__(self):
    """Create the BLEU calculator, with default settings, for this instance."""
    self.bleu_calc = BLEUCalculator()
import math from sumeval.metrics.rouge import RougeCalculator from sumeval.metrics.bleu import BLEUCalculator # load data data_number = 200 directory = r'D:\社交舆情实验\Experiment_for_Social_Network-master\Experiment_for_Social_Network-master\Experiment_1\2_automatic_summarization\data\cnn_stories_tokenized\\' stories = load_stories(directory, data_number) print('Loaded Stories %d' % len(stories)) ref = [] ans = [] bleu_score = 0 rouge_1 = 0 rouge_2 = 0 rouge_l = 0 rouge = RougeCalculator(stopwords=True, lang="en") bleu = BLEUCalculator() empty = 0 count_max_0 = 0 for index, story in enumerate(stories): ref.append(story['highlights']) txt = story['story'] sents = txt.split('\n') for i in range(len(sents)): if '' in sents: sents.remove('') else: break for i in range(len(sents)): trans = str.maketrans({key: None for key in string.punctuation}) sents[i] = sents[i].translate(trans) tokenized_sents = [nltk.word_tokenize(x) for x in sents]