import glob
import os

from fuzzywuzzy import fuzz  # assumption: `fuzz` comes from fuzzywuzzy (thefuzz exposes the same API)
from rouge_metric import PyRouge


def evaluation_metrics(summaries, hypotheses_list):
    rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                    rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)

    # Load the gold summaries and build one single-reference list per document.
    actual_summary_list = []
    references_list = []
    folder_path = "../BBCNewsSummary/Summaries/business"
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename, 'r') as f:
            text = f.read()
        references = []
        actual_summary_list.append(text)
        # Pre-process and tokenize the summaries as you like
        references.append(text.split())
        references_list.append(references)

    # Per-document score accumulators (undefined in the original snippet).
    fuzz_ratio = []
    recall_scores_list = []
    f_scores_list = []

    for i in range(len(summaries)):
        fuzz_ratio.append(fuzz.ratio(summaries[i], actual_summary_list[i]))
        # evaluate_tokenized expects batch-level lists, so each document is
        # wrapped as a single-element batch here.
        scores = rouge.evaluate_tokenized([hypotheses_list[i]], [references_list[i]])
        recall_scores_list.append(scores['rouge-1']['r'])
        f_scores_list.append(scores['rouge-1']['f'])
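# Hedged sanity-check sketch (not part of the snippet above): the same PyRouge
# configuration can also be exercised on plain, untokenized strings through
# evaluate(), which takes one hypothesis string per document and a list of
# reference strings per document. The toy sentences are illustrative only.
from rouge_metric import PyRouge

rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)
toy_hypotheses = ['the cat sat on the mat']
toy_references = [['the cat is on the mat', 'a cat sits on the mat']]
print(rouge.evaluate(toy_hypotheses, toy_references))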
from nltk.translate.bleu_score import corpus_bleu  # assumption: NLTK's corpus-level BLEU
from rouge_metric import PyRouge


def calc_metrics(refs, hyps, metric="all"):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1][-1]
    metrics["hyp_example"] = hyps[-1]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(refs, hyps)
    if metric in ("rouge", "all"):
        rouge = PyRouge(rouge_l=True, multi_ref_mode="best")
        scores = rouge.evaluate(hyps, refs)
        metrics.update(scores)
    return metrics
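# A minimal usage sketch for calc_metrics above (hypothetical toy data, ROUGE
# only, so the BLEU backend is not needed). refs holds one list of reference
# strings per example; hyps holds one hypothesis string per example.
toy_refs = [['the cat is on the mat', 'a cat sits on the mat'],
            ['it is a sunny day']]
toy_hyps = ['the cat sat on the mat',
            'it is sunny today']
result = calc_metrics(toy_refs, toy_hyps, metric="rouge")
print(result["count"], result["rouge-l"]["f"])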
def test_compare_multi_ref_summaries():
    for hyp, ref in load_multi_ref_summary_pairs():
        gt = PerlRouge(MAX_N, True, True, 1.2, True, True, 4, 'best').evaluate([hyp], [ref])
        out = PyRouge(MAX_N, True, True, 1.2, True, True, 4, 'best').evaluate([hyp], [ref])
        assert_close_rouge(out, gt)

        gt = PerlRouge(MAX_N, True, True, 1.2, True, True, 4, 'average').evaluate([hyp], [ref])
        out = PyRouge(MAX_N, True, True, 1.2, True, True, 4, 'average').evaluate([hyp], [ref])
        assert_close_rouge(out, gt)
from typing import List

from rouge_metric import PyRouge
from torch.nn import Module  # assumption: the original snippet imports its Module base class elsewhere


class RougeL(Module):
    def __init__(self):
        """
        Recall Oriented Understudy of Gisting Evaluation.
        Use 'rouge-metric' package as backend.
        """
        super().__init__()
        self.rouge = PyRouge(rouge_l=True)

    def forward(self, hypothesis: List[List[str]], references: List[List[List[str]]]) -> float:
        if len(hypothesis) != len(references):
            raise ValueError(
                f'Batch size of hypothesis and references are different '
                f'({len(hypothesis)} != {len(references)}).'
            )
        # Join the pre-tokenized inputs back into strings, since PyRouge.evaluate
        # expects one hypothesis string and a list of reference strings per example.
        hypothesis = [' '.join(hyp) for hyp in hypothesis]
        references = [[' '.join(ref) for ref in refs] for refs in references]
        scores = self.rouge.evaluate(hypotheses=hypothesis, multi_references=references)
        rouge_l_scores = scores['rouge-l']
        # 3 scores = Recall r, Precision p, FScore f
        # {'r': ..., 'p': ..., 'f': ...}
        f_score = rouge_l_scores['f']
        return f_score
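# A minimal usage sketch for the RougeL module above (hypothetical toy data,
# assuming the torch.nn.Module base class as imported above): a batch of one
# tokenized hypothesis against two tokenized references.
metric = RougeL()
toy_hypothesis = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
toy_references = [[['the', 'cat', 'is', 'on', 'the', 'mat'],
                   ['a', 'cat', 'sits', 'on', 'the', 'mat']]]
print(metric(toy_hypothesis, toy_references))  # ROUGE-L F-score as a float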
def test_compare_all_multi_ref_summaries():
    hyp, ref = load_all_multi_ref_summaries()
    avg = PyRouge(MAX_N, True, True, 1.2, True, True, 4, mode='average').evaluate(hyp, ref)
    indiv = PyRouge(MAX_N, True, True, 1.2, True, True, 4, mode='individual').evaluate(hyp, ref)

    for key, avg_score in avg.items():
        assert avg_score['f'] == py_rouge._f_score(avg_score['p'], avg_score['r'], 0.5)
        avg_score['p'] *= len(indiv)
        avg_score['r'] *= len(indiv)
        for case in indiv:
            avg_score['p'] -= case[key]['p']
            avg_score['r'] -= case[key]['r']

    for score in avg.values():
        assert isclose(score['p'], 0, abs_tol=1e-9)
        assert isclose(score['r'], 0, abs_tol=1e-9)
def rouge(hypotheses, references):
    """
    calculate the rouge score for each system
    :param hypotheses: dict type, hypotheses data
    :param references: dict type, references data
    :return rougeScore: dict type, including rouge-1, rouge-2, rouge-L
    """
    rougeScore = dict()
    # Pre-create one result list per system id (S_1 ... S_20).
    for ids in range(1, 21):
        systemId = 'S_' + str(ids)
        rougeScore[systemId] = list()

    # Transpose the references into the per-document multi-reference layout
    # that PyRouge expects.
    r_references = list(map(list, zip(*references)))
    rouge = PyRouge(rouge_n=2, rouge_l=True)
    for systemId in hypotheses:
        pred = hypotheses[systemId]
        scores = rouge.evaluate(pred, r_references)
        rougeScore[systemId].append(scores)
    return rougeScore
from typing import List

import torch
from torch import Tensor
from rouge_metric import PyRouge


class RougeL(Metric):  # Metric: framework base class, imported elsewhere in the original
    def __init__(self):
        """
        Recall Oriented Understudy of Gisting Evaluation.
        Use 'rouge-metric' package as backend.
        """
        super().__init__()
        self.rouge = PyRouge(rouge_l=True)

    def compute_score(self, hypotheses: List[str], references: List[List[str]]) -> Tensor:
        # PyRouge.evaluate takes the hypotheses first and the per-example
        # reference lists second.
        scores = self.rouge.evaluate(hypotheses, references)
        rouge_l_scores = scores['rouge-l']
        # 3 scores = Recall r, Precision p, FScore f
        # {'r': ..., 'p': ..., 'f': ...}
        f_score = rouge_l_scores['f']
        return torch.scalar_tensor(f_score)
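# A minimal sketch of what compute_score above does, using PyRouge directly on
# hypothetical toy data (the Metric base class is framework-specific, so the
# wrapper itself is not instantiated here).
import torch
from rouge_metric import PyRouge

toy_hypotheses = ['the cat sat on the mat']
toy_references = [['the cat is on the mat', 'a cat sits on the mat']]
scores = PyRouge(rouge_l=True).evaluate(toy_hypotheses, toy_references)
print(torch.scalar_tensor(scores['rouge-l']['f']))  # scalar tensor with the ROUGE-L F-score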
# Fragment of a larger evaluation script: tp, fp and fn are computed per example
# earlier in the script, f_scores collects the per-example F1 values, and `mean`
# is presumably statistics.mean.
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if precision == 0 and recall == 0:
    f1 = 0
else:
    f1 = 2 * (precision * recall) / (precision + recall)
f_scores.append(f1)
# print(precision, recall, f1)

print('F1-Score:', mean(f_scores))

# -------ROUGE---------
# Repeat each source sentence's reference list three times, presumably to match
# three generated hypotheses per source in preds_bleu_3.
references = []
for i in range(len(predictions_df)):
    for k in range(3):
        references.append(test_df['eng'].loc[
            test_df['de'] == predictions_df['de'].iloc[i]].reset_index(
                drop=True).to_list())

# Evaluate document-wise ROUGE scores
rouge = PyRouge(rouge_n=False, rouge_l=True, rouge_w=True, rouge_w_weight=1.2,
                rouge_s=True, rouge_su=True, skip_gap=4)
scores = rouge.evaluate(preds_bleu_3, references)
print(scores)