def evaluate_summarization(test_set: dict, vocabulary: dict, word_cnt, ref_summaries: dict,
                           elim_stopwords, lemmatization, bigrams):
    my_summaries = {}
    for key in test_set:
        my_summaries[key] = []
        for doc in test_set[key]:
            summary = []
            for sentence in doc:
                (result, prob) = predict_sentence_in_sum(vocabulary, sentence, word_cnt, 1,
                                                         elim_stopwords, lemmatization, bigrams)
                if result == "yes":
                    summary.append(sentence)
            my_summaries[key].append(summary)
    if bigrams:
        scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=False)
    else:
        scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=False)
    avg_p = 0
    avg_r = 0
    avg_f = 0
    total = 0
    for key in test_set:
        for mysum, ref in zip(my_summaries[key], ref_summaries[key]):
            if bigrams:
                score = scorer.score(" ".join(ref), " ".join(mysum))['rouge2']
            else:
                score = scorer.score(" ".join(ref), " ".join(mysum))['rouge1']
            avg_p += score.precision
            avg_r += score.recall
            avg_f += score.fmeasure
            total += 1
    return avg_p / total, avg_r / total, avg_f / total
def create_rouge_scorer(self):
    if self.config is None:
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=False)
        self.lemmatized_rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rouge3', 'rougeL'], use_stemmer=True)
    else:
        raise NotImplementedError(
            "Rouge Scorer has not been written yet to handle a config object.")
def get_rouge_scores(target_summaries_path, predicted_summaries_path, use_stem=False):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=use_stem)
    target_summary_files = [x for x in Path(target_summaries_path).rglob('*.summary')]
    rouge1 = []
    rouge2 = []
    rougel = []
    for target_summary_path in target_summary_files:
        basename = Path(target_summary_path).stem
        with open(target_summary_path, 'r') as f:
            target_summary = f.read()
        predicted_summary_path = f'{predicted_summaries_path}/{basename}.summary'
        if not os.path.exists(predicted_summary_path):
            continue
        with open(predicted_summary_path, 'r') as f:
            predicted_summary = f.read()
        output = scorer.score(target_summary, predicted_summary)
        rouge1 += [output['rouge1'].fmeasure]
        rouge2 += [output['rouge2'].fmeasure]
        rougel += [output['rougeL'].fmeasure]
    return (sum(rouge1) / len(rouge1) * 100,
            sum(rouge2) / len(rouge2) * 100,
            sum(rougel) / len(rougel) * 100)
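# Hypothetical usage of get_rouge_scores above (directory names are placeholders, not from
# the original repo): both directories are expected to hold matching "<basename>.summary" files.
# r1, r2, rl = get_rouge_scores("data/target_summaries", "data/predicted_summaries", use_stem=True)
# print(f"R1 {r1:.2f} / R2 {r2:.2f} / RL {rl:.2f}")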
def _compute(self, preds, refs, rouge_types=None, use_agregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []
    for r, p in zip(refs, preds):
        score = scorer.score(r, p)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)
    if use_agregator:
        y = aggregator.aggregate()
    else:
        y = {}
        for k in scores[0]:
            y[k] = list(score[k] for score in scores)
    return y
def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for tgt, pred in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        # RougeScorer.score expects (target, prediction)
        scores = scorer.score(tgt, pred)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
    else:
        return aggregator._scores  # here we return defaultdict(list)
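# Illustrative sketch (not part of the original sources): rougeLsum scores "\n"-separated
# sentences, which is why calculate_rouge above optionally inserts newlines before scoring.
from rouge_score import rouge_scorer

_demo_scorer = rouge_scorer.RougeScorer(["rougeLsum"], use_stemmer=True)
_tgt = "the cat sat on the mat. it purred loudly."
_pred = "the cat sat on the mat. it meowed."
# Without newlines the summary is treated as one long sentence; with newlines each
# sentence is matched separately before the summary-level aggregation.
print(_demo_scorer.score(_tgt, _pred)["rougeLsum"].fmeasure)
print(_demo_scorer.score(_tgt.replace(". ", ".\n"), _pred.replace(". ", ".\n"))["rougeLsum"].fmeasure)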
def validation_step(self, batch, batch_nb):
    for p in self.model.parameters():
        p.requires_grad = False

    outputs = self.forward(*batch)
    vloss = outputs[0]
    input_ids, output_ids = batch
    input_ids, attention_mask = self._prepare_input(input_ids)
    generated_ids = self.model.generate(input_ids=input_ids,
                                        attention_mask=attention_mask,
                                        use_cache=True,
                                        max_length=self.args.max_output_len,
                                        num_beams=1)
    generated_str = self.tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True)
    gold_str = self.tokenizer.batch_decode(output_ids.tolist(), skip_special_tokens=True)
    scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1', 'rouge2', 'rougeL', 'rougeLsum'],
                                      use_stemmer=False)
    rouge1 = rouge2 = rougel = rougelsum = 0.0
    for ref, pred in zip(gold_str, generated_str):
        score = scorer.score(ref, pred)
        rouge1 += score['rouge1'].fmeasure
        rouge2 += score['rouge2'].fmeasure
        rougel += score['rougeL'].fmeasure
        rougelsum += score['rougeLsum'].fmeasure
    rouge1 /= len(generated_str)
    rouge2 /= len(generated_str)
    rougel /= len(generated_str)
    rougelsum /= len(generated_str)
    return {'vloss': vloss,
            'rouge1': vloss.new_zeros(1) + rouge1,
            'rouge2': vloss.new_zeros(1) + rouge2,
            'rougeL': vloss.new_zeros(1) + rougel,
            'rougeLsum': vloss.new_zeros(1) + rougelsum}
def evaluate_all(model_type: str):
    gold_dir = "output/" + model_type + "/gold/"
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
    all_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}
    for file in glob.glob(gold_dir + "*.txt"):
        gold = " ".join(open(file, 'r').readlines())
        model = " ".join(open(file.replace("gold", "model"), "r").readlines())
        scores = scorer.score(gold, model)
        for type, score in scores.items():
            all_scores[type].append(score)
    all_averages = {}
    for type in all_scores.keys():
        all_averages[type] = {}
        all_averages[type]["f_mean"] = np.mean([score.fmeasure for score in all_scores[type]])
        all_averages[type]["f_sd"] = np.std([score.fmeasure for score in all_scores[type]])
        all_averages[type]["recall_mean"] = np.mean([score.recall for score in all_scores[type]])
        all_averages[type]["recall_sd"] = np.std([score.recall for score in all_scores[type]])
        all_averages[type]["precision_mean"] = np.mean([score.precision for score in all_scores[type]])
        all_averages[type]["precision_sd"] = np.std([score.precision for score in all_scores[type]])
    print(all_averages)
    return all_scores, all_averages
def compute_rouge_data(path="model_output.txt", scoring="rouge1"):
    scorer = rouge_scorer.RougeScorer([scoring])
    fscore = []
    precision = []
    recall = []
    with open(path, "r") as f:
        testing_results = json.load(f)
    for i, element in enumerate(testing_results):
        scores = scorer.score(TOKENIZER.DecodeIds(element['prediction']),
                              TOKENIZER.DecodeIds(element['reference']))
        precision.append(scores[scoring][0])
        recall.append(scores[scoring][1])
        fscore.append(scores[scoring][2])
    return {
        'Max': {
            'Recall': (max(recall), np.argmax(recall)),
            'Precision': (max(precision), np.argmax(precision)),
            'Fscore': (max(fscore), np.argmax(fscore))
        },
        'Min': {
            'Recall': (min(recall), np.argmin(recall)),
            'Precision': (min(precision), np.argmin(precision)),
            'Fscore': (min(fscore), np.argmin(fscore))
        }
    }
def rouge_evauation(y_true, y_pred, tokenizer):
    rouge1_precision = []
    rouge1_recall = []
    rouge1_f = []
    rouge2_precision = []
    rouge2_recall = []
    rouge2_f = []
    rougeL_precision = []
    rougeL_recall = []
    rougeL_f = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    for i, j in zip(y_true, y_pred):
        y_t = tokenizer.decode(rem_zero(i))
        y_p = tokenizer.decode(rem_zero(j))
        scores = scorer.score(y_t, y_p)
        rouge1_precision.append(scores['rouge1'].precision)
        rouge1_recall.append(scores['rouge1'].recall)
        rouge1_f.append(scores['rouge1'].fmeasure)
        rouge2_precision.append(scores['rouge2'].precision)
        rouge2_recall.append(scores['rouge2'].recall)
        rouge2_f.append(scores['rouge2'].fmeasure)
        rougeL_precision.append(scores['rougeL'].precision)
        rougeL_recall.append(scores['rougeL'].recall)
        rougeL_f.append(scores['rougeL'].fmeasure)
    return (scores, rouge1_precision, rouge1_recall, rouge1_f,
            rouge2_precision, rouge2_recall, rouge2_f,
            rougeL_precision, rougeL_recall, rougeL_f)
def rouge(self, refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs: A `list` of reference `strs`.
    :param preds: A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
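# Minimal sketch (added for illustration): the BootstrapAggregator used above returns, for
# each rouge type, an AggregateScore with low/mid/high Score tuples, which is why the
# function reads result[type].mid.fmeasure.
from rouge_score import rouge_scorer, scoring

_scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeLsum"])
_agg = scoring.BootstrapAggregator()
_agg.add_scores(_scorer.score("the cat sat on the mat", "the cat was on the mat"))
_agg.add_scores(_scorer.score("a quick brown fox", "the quick brown fox"))
_result = _agg.aggregate()
print(_result["rouge1"].low.fmeasure, _result["rouge1"].mid.fmeasure, _result["rouge1"].high.fmeasure)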
def bbc_dataset_rouge():
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    test_data = get_bbc_dataset_files()
    generic_baseline, generic_improved, generic_improved_redundancy_removal = None, None, None
    for v in test_data:
        text = v['full']
        summary = v['summary']
        scores = scorer.score(summarize(text, ratio=0.4), summary)
        generic_baseline = update_score(generic_baseline, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=False), summary)
        generic_improved = update_score(generic_improved, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=True), summary)
        generic_improved_redundancy_removal = update_score(
            generic_improved_redundancy_removal, scores)
    total_news = len(test_data)
    return {
        'generic_baseline': get_score_avg(generic_baseline, total_news),
        'generic_improved_redundancy_removal':
            get_score_avg(generic_improved_redundancy_removal, total_news),
        'generic_improved': get_score_avg(generic_improved, total_news)
    }
def calc_rouge_scores(pred_summaries, gold_summaries, keys=['rouge1', 'rougeL'], use_stemmer=True):
    # Calculate rouge scores
    scorer = rouge_scorer.RougeScorer(keys, use_stemmer=use_stemmer)
    n = len(pred_summaries)
    scores = [scorer.score(pred_summaries[j], gold_summaries[j]) for j in range(n)]

    dict_scores = {}
    for key in keys:
        dict_scores.update({key: {}})
    for key in keys:
        precision_list = [scores[j][key][0] for j in range(len(scores))]
        recall_list = [scores[j][key][1] for j in range(len(scores))]
        f1_list = [scores[j][key][2] for j in range(len(scores))]
        precision = np.mean(precision_list)
        recall = np.mean(recall_list)
        f1 = np.mean(f1_list)
        dict_results = {'recall': recall, 'precision': precision, 'f1': f1}
        dict_scores[key] = dict_results
    return dict_scores
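# Hypothetical usage of calc_rouge_scores above (assumes the function and numpy are in scope);
# it returns the mean precision/recall/f1 per requested key, e.g.
# {'rouge1': {'recall': ..., 'precision': ..., 'f1': ...}, 'rougeL': {...}}.
_pred_summaries = ["the cat sat on the mat", "a dog barked at night"]
_gold_summaries = ["the cat was on the mat", "the dog barked all night"]
print(calc_rouge_scores(_pred_summaries, _gold_summaries, keys=['rouge1', 'rougeL']))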
def cal_rouge_score(filename1, filename2):
    f1 = open(filename1, 'r')
    f2 = open(filename2, 'r')
    summary = f1.readlines()
    reference = f2.readlines()
    """
    for i in range(len(summary)):
        summary[i] = re.sub('[%s]' % re.escape(string.punctuation), '', summary[i])
        reference[i] = re.sub('[%s]' % re.escape(string.punctuation), '', reference[i])
    """
    for i in range(len(summary)):
        summary[i] = summary[i].strip().replace('<t>', '').replace('</t>', '').strip()
        reference[i] = reference[i].strip().replace('<t>', '').replace('</t>', '').strip()
    print(len(summary))
    print(len(reference))
    # summary = summary[1:1000]
    # reference = reference[1:1000]
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = {"rouge1": [], "rouge2": [], "rougeL": [], "rougeLsum": []}
    for i in range(len(summary)):
        s = scorer.score(summary[i], reference[i])
        if i % 1000 == 0:
            print(i)
        for k, v in s.items():
            scores[k].append(s[k].fmeasure)
    for k, v in scores.items():
        scores[k] = sum(v) / len(v)
    return scores
def test_rouge(temp_dir, cand, ref):
    candidates = [line.strip() for line in open(cand, encoding='utf-8')]
    references = [line.strip() for line in open(ref, encoding='utf-8')]
    print("test number", len(candidates))
    # print(len(references))
    assert len(candidates) == len(references)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
    results = {
        'rouge1': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rouge2': {'precision': 0, 'recall': 0, 'fmeasure': 0},
        'rougeL': {'precision': 0, 'recall': 0, 'fmeasure': 0}
    }
    cnt = len(candidates)
    for i in range(cnt):
        # print(candidates[i])
        # print(references[i])
        scores = scorer.score(candidates[i], references[i])
        results['rouge1']['precision'] += scores['rouge1'].precision
        results['rouge1']['recall'] += scores['rouge1'].recall
        results['rouge1']['fmeasure'] += scores['rouge1'].fmeasure
        results['rouge2']['precision'] += scores['rouge2'].precision
        results['rouge2']['recall'] += scores['rouge2'].recall
        results['rouge2']['fmeasure'] += scores['rouge2'].fmeasure
        results['rougeL']['precision'] += scores['rougeL'].precision
        results['rougeL']['recall'] += scores['rougeL'].recall
        results['rougeL']['fmeasure'] += scores['rougeL'].fmeasure
    for k, v in results.items():
        for key, value in v.items():
            results[k][key] = value / cnt
    return results
def _compute(self, predictions, references, rouge_types=None, use_agregator=True, use_stemmer=False):
    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_agregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)

    return result
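# Sketch (not from the original metric code): the two output shapes produced by _compute above.
# With use_agregator=True each rouge type maps to an AggregateScore (low/mid/high); with
# use_agregator=False each type maps to a list of per-pair Score tuples.
from rouge_score import rouge_scorer, scoring

_refs = ["hello there", "general kenobi you are a bold one"]
_preds = ["hello there", "general kenobi"]
_scorer = rouge_scorer.RougeScorer(rouge_types=["rouge1", "rougeL"])
_per_pair = [_scorer.score(r, p) for r, p in zip(_refs, _preds)]
_listed = {k: [s[k] for s in _per_pair] for k in _per_pair[0]}  # use_agregator=False shape
_agg = scoring.BootstrapAggregator()
for _s in _per_pair:
    _agg.add_scores(_s)
print(_listed["rouge1"], _agg.aggregate()["rouge1"].mid)  # use_agregator=True shape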
def evaluate_on_rouge_scores(targets: List, preds: List) -> Dict:
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = [scorer.score(target, pred) for target, pred in zip(targets, preds)]
    rouge1_f1, rouge2_f1, rougeL_f1 = 0.0, 0.0, 0.0
    # TODO: Very ugly - refactor needed
    for score in scores:
        for k, v in score.items():
            if k == "rouge1":
                rouge1_f1 += v.fmeasure
            if k == "rouge2":
                rouge2_f1 += v.fmeasure
            if k == "rougeL":
                rougeL_f1 += v.fmeasure
    eval_dict = {
        "rouge1": rouge1_f1 / len(scores),
        "rouge2": rouge2_f1 / len(scores),
        "rougeL": rougeL_f1 / len(scores),
    }
    return eval_dict
def _compute(self, predictions, references, use_agregator=True, use_stemmer=False):
    rouge_types = ["rougeL"]
    # Join the token/id sequences into single strings and score them as one pair; without
    # the wrapping lists, zipping two plain strings would iterate character by character.
    predictions = [" ".join([str(p) for p in predictions])]
    references = [" ".join([str(r) for r in references])]
    scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer)
    if use_agregator:
        aggregator = scoring.BootstrapAggregator()
    else:
        scores = []

    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        if use_agregator:
            aggregator.add_scores(score)
        else:
            scores.append(score)

    if use_agregator:
        result = aggregator.aggregate()
    else:
        result = {}
        for key in scores[0]:
            result[key] = list(score[key] for score in scores)
    return result
def text_eval(encoder, features_iter, model_dir, global_step, eval_tag,
              enable_logging, inputs_pattern="^inputs[0-9]*$",
              targets_key="targets", predictions_key="outputs",
              additional_keys=(), num_reserved=None):
    """Evaluates a set of text targets/predictions."""
    decode_fn = lambda x: ids2str(encoder, x, num_reserved)
    scorers_dict = {}
    scorers_dict[_ROUGE_METRIC] = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
    scorers_dict[_BLEU_METRIC] = bleu_scorer.BleuScorer()
    scorers_dict[_REPETITION_METRIC] = repetition_scorer.RepetitionScorer(
        ["regs1", "regs2", "regs3", "regsTCR"])
    scorers_dict[_LENGTH_METRIC] = length_scorer.LengthScorer(["word", "char"])
    aggregators_dict = {k: scoring.BootstrapAggregator() for k in scorers_dict}

    with LogWriter(additional_keys, model_dir, global_step, eval_tag,
                   enable_logging) as log_writer:
        for i, features in enumerate(features_iter):
            inputs_list = []
            for k in sorted(features):
                if re.match(inputs_pattern, k):
                    single_inputs = decode_matrix(decode_fn, features[k])
                    if isinstance(single_inputs, list):
                        inputs_list.extend(single_inputs)
                    else:
                        inputs_list.append(single_inputs)
            inputs = "\n".join(inputs_list)
            targets = decode_fn(features[targets_key])
            preds = decode_fn(features[predictions_key])
            text_dict = {
                "inputs": inputs_list,
                "targets": targets,
                "predictions": preds
            }
            for key in additional_keys:
                if key == "selected_ids":
                    text_dict[key] = decode_selected_indices(decode_fn, features)
                else:
                    text_dict[key] = decode_matrix(decode_fn, features[key])
            log_writer.write(text_dict, i)
            for key, scorer in scorers_dict.items():
                scores_i = scorer.score(targets, preds)
                aggregators_dict[key].add_scores(scores_i)

    aggregates_dict = {k: v.aggregate() for k, v in aggregators_dict.items()}
    length_histograms = scorers_dict[_LENGTH_METRIC].histograms(as_string=True)
    _write_aggregates(model_dir, global_step, eval_tag, aggregates_dict, length_histograms)
    _write_aggregate_summaries(model_dir, global_step, eval_tag, aggregates_dict)
def run_rouge(self):
    '''
    Computes the ROUGE score between the set of hypothesis and reference summaries.
    '''
    print('\n===== ROUGE =====\n')
    rouge = rouge_scorer.RougeScorer(self.rouge_metrics, use_stemmer=True)
    for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
        self.load_summs(hyps_path, refs_path)
        hyps, refs = self.hyps, self.refs
        start_time = time.time()
        scores = []
        for i, (c, r) in tqdm(enumerate(zip(hyps, refs))):
            # Split sentences onto separate lines, then normalize both texts before scoring.
            c = c.replace('. ', '\n')
            r = r.replace('. ', '\n')
            hyp = text_normalization(c)
            ref = text_normalization(r)
            rouge_scores = rouge.score(ref, hyp)
            scores.append([rouge_scores[m].fmeasure for m in self.rouge_metrics])
        self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path, ROUGE_METRICS] = scores
        self.save_temp_csv()
def simple_rouge(generated_answers, possible_aswers_list):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rouge3'], use_stemmer=True)
    rouge_1_list = []
    rouge_2_list = []
    rouge_3_list = []
    for ind, elem in enumerate(generated_answers):
        if elem != "We can't recognize objects for comparision":
            generated_answer = generated_answers[ind]
            scores = [
                scorer.score(generated_answer, answ)
                for answ in possible_aswers_list[ind]
            ]
            sorted_scores_1 = sorted(scores, key=lambda x: x['rouge1'].fmeasure, reverse=True)
            sorted_scores_2 = sorted(scores, key=lambda x: x['rouge2'].fmeasure, reverse=True)
            sorted_scores_3 = sorted(scores, key=lambda x: x['rouge3'].fmeasure, reverse=True)
            rouge_1_list.append(sorted_scores_1[0]['rouge1'])
            rouge_2_list.append(sorted_scores_2[0]['rouge2'])
            rouge_3_list.append(sorted_scores_3[0]['rouge3'])
    return {
        'rouge1': rouge_1_list,
        'rouge2': rouge_2_list,
        'rouge3': rouge_3_list
    }
def __init__(
        self,
        model_name_or_path: str = 'albert-large-uncased',  # './vocab.txt'
        datasets_loader: str = 'race',  # 'RACELocalLoader.py'
        task_name: str = 'all',
        max_seq_length: int = 512,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        num_workers: int = 8,
        num_preprocess_processes: int = 8,
        use_sentence_selection: bool = True,
        best_k_sentences: int = 5,
        **kwargs):
    super().__init__()
    self.model_name_or_path = model_name_or_path
    self.dataset_loader = datasets_loader
    self.task_name = task_name
    self.max_seq_length = max_seq_length
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size
    self.num_workers = num_workers
    self.num_preprocess_processes = num_preprocess_processes
    self.use_sentence_selection = use_sentence_selection
    self.best_k_sentences = best_k_sentences

    self.tokenizer = AlbertTokenizerFast.from_pretrained(
        self.model_name_or_path, use_fast=True, do_lower_case=True)
    self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
    self.dataset = None
def __init__(self, metric):
    self.metric = metric
    if self.metric == 'rouge':
        # TODO: does this package do lower case, normalization
        self.scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold_file")
    parser.add_argument("--predictions_file")
    parser.add_argument("--output_file", required=False)
    args = parser.parse_args()

    # Load the Rouge metric
    metric = rouge_scorer.RougeScorer(['rougeL'])

    # Load the gold and predictions file
    gold_lines = {l['id']: l for l in jsonlines.open(args.gold_file)}
    pred_lines = {l['id']: l for l in jsonlines.open(args.predictions_file)}

    scores = []
    for id in gold_lines:
        if id not in pred_lines:
            print(f'Could not find a generated summary for ID {id}. '
                  f'Assigning a score of 0 for this instance.')
            scores.append(0)
        else:
            score = metric.score(
                gold_lines[id]['summary'],
                pred_lines[id]['generated_summary'])['rougeL'].fmeasure
            scores.append(score)
            pred_lines[id]['rougeL'] = score

    print(f'Rouge-L score: {sum(scores)/len(scores)*100:.2f}')

    # If an output_file is provided, we write out the instance-wise
    # rouge scores to file
    if args.output_file:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            for l in pred_lines.values():
                f.write(json.dumps(l, ensure_ascii=False) + '\n')
def compute_scores(self):
    self.preprocess()
    scorer = rouge_scorer.RougeScorer([self.rouge_type[0]], use_stemmer=True)
    # For each prediction, keep the best score against the three available references.
    self._scores = list(map(
        lambda p, x, y, z: max(
            scorer.score(p, x)[self.rouge_type[0]][self.rouge_type[1]],
            scorer.score(p, y)[self.rouge_type[0]][self.rouge_type[1]],
            scorer.score(p, z)[self.rouge_type[0]][self.rouge_type[1]]),
        self._predictions, self._references[0], self._references[1], self._references[2]))
def __init__(self, model_name="t5-small", **config_kw):
    self.config = T5ModelConfig(**config_kw)
    self.tokenizer = transformers.T5Tokenizer.from_pretrained(model_name)
    self.model = transformers.TFT5ForConditionalGeneration.from_pretrained(
        model_name, output_hidden_states=True, output_attentions=True)
    # TODO(gehrmann): temp solution for ROUGE.
    self._scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
def computeROUGE(originalTextList, paraphraseTextList):
    outputScores = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    for originalSent, paraphraseSent in zip(originalTextList, paraphraseTextList):
        scores = scorer.score(originalSent, paraphraseSent)
        outputScores.append(scores['rougeL'][0])  # index 0 is the ROUGE-L precision
    return outputScores
def calc_rouge_score(orig_text: str, gen_text: str):
    """
    :param orig_text: original speech as a string
    :param gen_text: generated speech as a string
    :return: rouge score between the two speeches (f1 measure)
    """
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    scores = scorer.score(orig_text, gen_text)
    return scores["rouge1"][2]
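# Illustrative note (not from the original sources): each entry returned by RougeScorer.score
# is a Score named tuple (precision, recall, fmeasure), so scores["rouge1"][2] above is the
# same value as scores["rouge1"].fmeasure.
from rouge_score import rouge_scorer

_s = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True).score("an original speech", "a generated speech")
assert _s["rouge1"][2] == _s["rouge1"].fmeasure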
def __init__(self, sequence_transform_fn, batch_size):
    self.scorer = rouge_scorer.RougeScorer(
        [f'rouge{i}' for i in ROUGE_VARIATIONS],
        use_stemmer=True,
    )
    self.sequence_transform_fn = sequence_transform_fn
    self.batch_size = batch_size
    self.target_batches = []
    self.prediction_batches = []
def rouge(references, pred):
    rouge_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_names, use_stemmer=True)
    scores = {rouge_name: 0 for rouge_name in rouge_names}
    for reference in references:
        ref_scores = scorer.score(reference, pred)
        for rouge_name, cur_score in scores.items():
            scores[rouge_name] = max(scores[rouge_name], ref_scores[rouge_name].fmeasure)
    return scores
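# Hypothetical call to the multi-reference rouge() above (assumes the function is in scope):
# each metric is the maximum f-measure over all provided references, a common
# multi-reference convention.
_references = ["the cat sat on the mat", "a cat was sitting on the mat"]
print(rouge(_references, "the cat is on the mat"))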
def calculate_rouge(output_lns: List[str], reference_lns: List[str], use_stemmer=True) -> Dict:
    scorer = rouge_scorer.RougeScorer(ROUGE_KEYS, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()

    for reference_ln, output_ln in zip(reference_lns, output_lns):
        scores = scorer.score(reference_ln, output_ln)
        aggregator.add_scores(scores)

    result = aggregator.aggregate()
    return {k: v.mid.fmeasure for k, v in result.items()}