def find_rouge(file1, file2):
    with open(file1, 'r') as myfile:
        text1 = myfile.read()
    with open(file2, 'r') as myfile:
        text2 = myfile.read()
    rouge = Rouge()
    scores = rouge.get_scores(text1, text2)
    print(scores)
    return scores
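# Usage sketch for find_rouge above. The file names are hypothetical; each
# file is assumed to hold one summary as plain text, and Rouge comes from
# the `rouge` package.
from rouge import Rouge

scores = find_rouge('hyp.txt', 'ref.txt')
print(scores[0]['rouge-l']['f'])  # get_scores returns one dict per hyp/ref pair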
def decode(self):
    start = time.time()
    counter = 0
    batch_generator = self.dataset.batches
    while True:
        try:
            batch = next(batch_generator)
            # Run beam search to get the best Hypothesis
            best_summary = self.beam_search(batch)
            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = self.dataset.vocab.outputids2words(
                output_ids,
                (batch.art_oovs[0] if self.args.pointer_gen else None))
            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(opt.EOS)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass  # no [STOP] token found; keep all decoded words
            hypothesis = ' '.join(decoded_words)
            self.hypotheses.append(hypothesis)
            self.references.append(batch.original_abstracts[0])  # single_pass
            counter += 1
            if counter % 10 == 0:
                print('Beam Search %d example in %d sec' % (counter, time.time() - start))
                start = time.time()
        except StopIteration:
            print('StopIteration, Beam Search end. Writing to file:', self._rouge_ref_dir)
            break
    self.write_for_rouge()
    rouge = Rouge()
    # get_scores expects hypotheses first, then references
    scores = rouge.get_scores(self.hypotheses, self.references, avg=True)
    return scores
def calculate_metric(hyp, ref, context, effective_length=1024):
    # ===== Calculate rouge ========
    with open('../result/rouge.txt', 'a') as f_result:
        rouge = Rouge()
        print(len(hyp))
        print(len(ref))
        # Drop pairs where either side is too short for ROUGE to handle
        hyp, ref = zip(*[(x, y) for x, y in zip(hyp, ref)
                         if len(x) > 3 and len(y) > 3])
        print(len(hyp))
        hyp = [x[:effective_length] for x in hyp]
        ref = [x[:effective_length] for x in ref]
        scores = rouge.get_scores(hyp, ref, avg=True)
        print("ROUGE", scores)
        import time
        f_result.write(time.asctime() + '\n')
        f_result.write(args.model_dir + '\t' + str(effective_length) + '\n')
        f_result.write(str(scores))
        f_result.write('\n')
    # ==== dump output ====
    print("#ref{} #hyp{}".format(len(ref), len(hyp)))
    with open("../data_processed/output_" + args.model_dir +
              'p{}k{}'.format(args.top_p, args.top_k), 'wb') as f_output:
        # zip objects are not picklable in Python 3, so materialize a list first
        pickle.dump(list(zip(hyp, ref, context)), f_output)
def __init__(self, summaries, references):
    # global config
    cfg = Config()
    # class variables
    self.f1 = []
    self.p1 = []
    self.r1 = []
    self.f2 = []
    self.p2 = []
    self.r2 = []
    self.f_l = []
    self.p_l = []
    self.r_l = []
    # summaries and references
    self.summaries = summaries
    self.references = references
    # global results
    self.results = []
    self.stddev = []
    # folder destination paths
    self.testResultPath = cfg.testResultPath
    self.avgResultPath = cfg.avgResultPath
    self.stdResultPath = cfg.stdResultPath
    self.rouge = Rouge()
def cosine(texts, ref):
    # Computes similarity as the normalized dot product of X and Y
    vec = TfidfVectorizer(tokenizer=textblob_tokenizer,
                          stop_words='english',
                          use_idf=True)
    matrix = vec.fit_transform(texts)
    cosine_similarities = cosine_similarity(matrix[0:1], matrix).flatten()
    nb_sentences_in_base_summary = len(ref.split('.'))
    cosine_similarities = list(cosine_similarities)
    cos_results = []
    for i in range(0, nb_sentences_in_base_summary):
        n = cosine_similarities.index(max(cosine_similarities))
        cos_results.append(texts[n])
        # Mask the selected sentence instead of deleting it, so the indices
        # into `texts` stay aligned with `cosine_similarities`
        cosine_similarities[n] = -1.0
    res = ' '.join(cos_results)
    r = Rouge()
    rouge = r.get_scores(res, ref)
    return gen_serie('Cosine Similarity', rouge, res)
def reward_function(self, decoded_sents, original_sents):
    rouge = Rouge()
    try:
        scores = rouge.get_scores(decoded_sents, original_sents)
    except Exception:
        self.logger.info("Rouge failed for multi sentence evaluation.. Finding exact pair")
        scores = []
        for i in range(len(decoded_sents)):
            try:
                score = rouge.get_scores(decoded_sents[i], original_sents[i])
            except Exception:
                # logger.info takes printf-style format arguments, not print-style varargs
                self.logger.info("Error occurred at:")
                self.logger.info("decoded_sents: %s", decoded_sents[i])
                self.logger.info("original_sents: %s", original_sents[i])
                score = [{"rouge-l": {"f": 0.0}}]
            scores.append(score[0])
    rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
    avg_rouge_l_f1 = sum(rouge_l_f1) / len(rouge_l_f1)
    rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
    return rouge_l_f1, scores, avg_rouge_l_f1
def scores(k, index, generate_text, model, trump_tweets):
    initial_sentence = trump_tweets[index][:k]
    length = len(trump_tweets[index])
    hyp = generate_text(model, initial_sentence, length, ' ')
    ref = trump_tweets[index]
    ref = " ".join(ref)
    print("Generated sentence:", hyp)
    print()
    print("Reference sentence:", ref)
    print()
    print("-" * 74)
    print()
    rouge = Rouge()
    r_scores = rouge.get_scores(hyp, ref)
    print(str(k) + " initial words from #" + str(index) + " sentences -- rouge scores:")
    for key, v in r_scores[0].items():
        print(str(key), v)
    # sentence_bleu expects a list of tokenized references and a tokenized hypothesis
    b_scores = sentence_bleu([ref.split()], hyp.split())
    print()
    print("-" * 74)
    print()
    print(str(k) + " initial words from #" + str(index) + " sentences -- BLEU scores:")
    print(b_scores)
    print()
    print("#" * 74)
    print()
def no_bertscore(self):
    rouge_scorer = Rouge()

    def r1_score(hypothesis: str, reference: str):
        scores_rouge = rouge_scorer.get_scores(hypothesis, reference)[0]
        return scores_rouge["rouge-1"]["f"]

    def r2_score(hypothesis: str, reference: str):
        scores_rouge = rouge_scorer.get_scores(hypothesis, reference)[0]
        return scores_rouge["rouge-2"]["f"]

    def rl_score(hypothesis: str, reference: str):
        scores_rouge = rouge_scorer.get_scores(hypothesis, reference)[0]
        return scores_rouge["rouge-l"]["f"]

    self.scores = [r1_score, r2_score, rl_score]
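# Hypothetical usage sketch for the scorer closures built above; `evaluator`
# stands in for whatever object no_bertscore() is a method of.
evaluator.no_bertscore()
for score_fn in evaluator.scores:
    print(score_fn.__name__,
          score_fn("the cat sat on the mat", "a cat was sitting on the mat"))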
class Evaluator(keras.callbacks.Callback):
    """Evaluate the model and save the best weights."""
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, epoch, logs=None):
        metrics = self.evaluate(valid_data)  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            model.save_weights('./best_model.weights')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for title, content in tqdm(data):
            total += 1
            pred_title = autotitle.generate(content, topk)
            print()
            print(title)
            print(pred_title)
            print(content)
            print('')
            title = ' '.join(title).lower()
            pred_title = ' '.join(pred_title).lower()
            if pred_title.strip():
                scores = self.rouge.get_scores(hyps=pred_title, refs=title)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[title.split(' ')],
                                      hypothesis=pred_title.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }
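# Minimal sketch of how a callback like the one above is typically registered;
# `model`, `train_generator`, and the epoch count are hypothetical and assumed
# to be defined elsewhere, as in the snippet.
evaluator = Evaluator()
model.fit(train_generator, epochs=10, callbacks=[evaluator])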
def score(ref, hypo):
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr"),
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
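# Usage sketch for the scorer loop above, assuming Bleu/Meteor/Rouge/Cider are
# the pycocoevalcap classes, whose compute_score expects dicts mapping an id
# to a list of sentences. The ids and sentences here are illustrative only.
ref = {'0': ['the cat sat on the mat'], '1': ['a dog barked at the mailman']}
hypo = {'0': ['the cat is on the mat'], '1': ['a dog barked loudly']}
print(score(ref, hypo))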
def print_rouge_scores(pred_path, true_path):
    get_rouge_scores = Rouge().get_scores
    with open(pred_path, 'r') as f:
        summaries = f.readlines()
    with open(true_path, 'r') as f:
        ground_truth = f.readlines()
    assert len(summaries) == len(ground_truth)

    # Character-level scores: compare which granularity yields the higher ROUGE
    all_scores = []
    for i in range(len(summaries)):
        hyps = ' '.join(list(summaries[i]))
        refs = ' '.join(list(ground_truth[i]))
        rouge_scores = get_rouge_scores(hyps, refs)[0]
        r1f = rouge_scores["rouge-1"]["f"]
        r2f = rouge_scores["rouge-2"]["f"]
        rlf = rouge_scores["rouge-l"]["f"]
        temp = r1f * 0.2 + r2f * 0.4 + rlf * 0.4
        all_scores.append([temp, r1f, r2f, rlf])
    rouge_based_on_zi = np.mean(np.array(all_scores), axis=0).tolist()

    # Word-level scores using jieba segmentation
    all_scores = []
    for i in range(len(summaries)):
        hyps = ' '.join(jieba.cut(summaries[i]))
        refs = ' '.join(jieba.cut(ground_truth[i]))
        rouge_scores = get_rouge_scores(hyps, refs)[0]
        r1f = rouge_scores["rouge-1"]["f"]
        r2f = rouge_scores["rouge-2"]["f"]
        rlf = rouge_scores["rouge-l"]["f"]
        temp = r1f * 0.2 + r2f * 0.4 + rlf * 0.4
        all_scores.append([temp, r1f, r2f, rlf])
    rouge_based_on_ci = np.mean(np.array(all_scores), axis=0).tolist()

    return rouge_based_on_zi + rouge_based_on_ci
class Evaluate(keras.callbacks.Callback):
    def __init__(self):
        self.lowest = 3.5
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, epoch, logs=None):
        rad = random.randint(0, len(train_data) - 1000)
        metrics = self.evaluate(train_data[rad:rad + 1000])  # evaluate the model
        if metrics['bleu'] > self.best_bleu and logs['loss'] <= self.lowest:
            self.best_bleu = metrics['bleu']
            self.lowest = logs['loss']
            model.save_weights(
                os.path.join('/home/' + ser + '/STC3/result/',
                             str(self.lowest)[:5] + '.weights'))  # save the best weights
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)
        just_show(self.lowest)  # show sample outputs

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for ques, answ, ids in tqdm(data):
            total += 1
            answ = ''.join(answ.split(' '))
            pred_answ = ''.join(autotitle.generate(ques, ids))
            scores = self.rouge.get_scores(hyps=pred_answ, refs=answ)
            rouge_1 += scores[0]['rouge-1']['f']
            rouge_2 += scores[0]['rouge-2']['f']
            rouge_l += scores[0]['rouge-l']['f']
            bleu += sentence_bleu(references=[list(answ)],
                                  hypothesis=list(pred_answ),
                                  smoothing_function=self.smooth)
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }
def rougeScoreExcludeStopWords(hyp_str, ref_str):
    """
    Return the ROUGE scores of the two sentences after stopword removal.
    :param hyp_str: sentence from the source text
    :param ref_str: PIO sentence from the systematic review (SR)
    :return: ROUGE scores
    """
    rouge = Rouge()
    hyp_str = " ".join([
        word for word in hyp_str.translate(
            str.maketrans('', '', string.punctuation)).split()
        if word not in stopwords.words('english')
    ])
    # Rouge raises on empty input, so substitute a placeholder token
    if not hyp_str.strip():
        hyp_str = hyp_str + '*'
    ref_str = " ".join([
        word for word in ref_str.translate(
            str.maketrans('', '', string.punctuation)).split()
        if word not in stopwords.words('english')
    ])
    if not ref_str.strip():
        ref_str = ref_str + '*'
    return rouge.get_scores(hyp_str, ref_str)
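# Usage sketch for the stopword-stripped scorer above; requires running
# nltk.download('stopwords') once beforehand. The example sentences are
# illustrative only.
from rouge import Rouge
from nltk.corpus import stopwords
import string

print(rougeScoreExcludeStopWords(
    "The patients were treated with aspirin.",
    "Aspirin was given to the patients."))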
def score(ref, hypo): """ ref, dictionary of reference sentences (id, sentence) hypo, dictionary of hypothesis sentences (id, sentence) score, dictionary of scores """ scorers = [ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), (Rouge(), "ROUGE_L"), ] final_scores = {} for scorer, method in scorers: score, scores = scorer.compute_score(ref, hypo) if type(score) == list: for m, s in zip(method, score): final_scores[m] = s else: final_scores[method] = score return final_scores
class Evaluator(keras.callbacks.Callback): """模型评测与保存 """ def __init__(self): self.rouge = Rouge() self.smooth = SmoothingFunction().method1 self.best_bleu = 0. def on_epoch_end(self, epoch, logs=None): metrics = self.evaluate(valid_data) # 评测模型 if metrics['bleu'] > self.best_bleu: self.best_bleu = metrics['bleu'] # model.save_weights('/home/jiangweiwei/pretrained-unilm-Chinese/output/webqa/best_model.weights') # 保存模型 metrics['best_bleu'] = self.best_bleu print('valid_data:', metrics) show() def evaluate(self, data, topk=1): total = 0 rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 for question, passage, answer in tqdm(data): total += 1 answer = ' '.join(answer).lower() pred_answer = ' '.join(autotitle.generate(question, passage, topk)).lower() if pred_answer.strip(): scores = self.rouge.get_scores(hyps=pred_answer, refs=answer) rouge_1 += scores[0]['rouge-1']['f'] rouge_2 += scores[0]['rouge-2']['f'] rouge_l += scores[0]['rouge-l']['f'] bleu += sentence_bleu(references=[answer.split(' ')], hypothesis=pred_answer.split(' '), smoothing_function=self.smooth) rouge_1 /= total rouge_2 /= total rouge_l /= total bleu /= total return { 'rouge-1': rouge_1, 'rouge-2': rouge_2, 'rouge-l': rouge_l, 'bleu': bleu, }
def evaluate(eval_file, answer_dict):
    from rouge import Rouge
    f1 = exact_match = rouge_l_ = total = 0
    rouge = Rouge()
    for key, value in answer_dict.items():
        total += 1
        ground_truths = eval_file[key]["answers"]
        prediction = value
        exact_match += metric_max_over_ground_truths(exact_match_score,
                                                     prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
        rouge_l_ += rouge_l(rouge, prediction, ground_truths[0])
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    rouge_l_ = 100.0 * rouge_l_ / total
    return {'exact_match': exact_match, 'f1': f1, 'rouge-l': rouge_l_}
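# The rouge_l helper used above (and in the variant near the end of this
# section) is not shown anywhere in these snippets. A minimal sketch of what
# it plausibly computes, guarding the empty-string case that makes
# Rouge.get_scores raise ValueError; this is a hypothetical reconstruction.
def rouge_l(rouge, prediction, ground_truth):
    if not prediction.strip() or not ground_truth.strip():
        return 0.0
    return rouge.get_scores(prediction, ground_truth)[0]['rouge-l']['f']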
class CalculateRouge(chainer.training.Extension):
    trigger = 1, 'epoch'
    priority = chainer.training.PRIORITY_WRITER

    def __init__(self, model, test_data, key, batch=100, device=-1, max_length=100):
        self.model = model
        self.test_data = test_data
        self.key = key
        self.batch = batch
        self.device = device
        self.max_length = max_length
        self.rouge = Rouge()

    def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
                references.extend(
                    [' '.join(map(str, t.tolist())) for t in targets])
                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources
                ]
                ys = [
                    ' '.join(map(str, y.tolist()))
                    for y in self.model.translate(sources, self.max_length)
                ]
                hypotheses.extend(ys)
        scores = self.rouge.get_scores(hypotheses, references, avg=True)
        rouge_l = scores["rouge-l"]
        chainer.report({self.key[0]: rouge_l["p"]})
        chainer.report({self.key[1]: rouge_l["r"]})
        chainer.report({self.key[2]: rouge_l["f"]})
def get_rouge(results):
    # Read the reference summaries
    seg_test_report = pd.read_csv(test_seg_path, header=None).iloc[:, 5].tolist()
    seg_test_report = [
        ' '.join(str(token) for token in str(line).split())
        for line in seg_test_report
    ]
    rouge_scores = Rouge().get_scores(results, seg_test_report, avg=True)
    print_rouge = json.dumps(rouge_scores, indent=2)
    with open(os.path.join(os.path.dirname(test_seg_path), 'results.csv'),
              'w', encoding='utf8') as f:
        json.dump(list(zip(results, seg_test_report)), f, indent=2, ensure_ascii=False)
    print('*' * 8 + ' rouge score ' + '*' * 8)
    print(print_rouge)
def compare_summarizers(data, summarizers):
    # Construct a ROUGE-1 F metric function
    compute_rouge = Rouge(metrics=["rouge-1"], stats=["f"])

    def get_score(reference, hypothesis):
        """
        Compute the ROUGE-1 F score.
        :param reference: true summary
        :param hypothesis: predicted summary
        :return: the value of ROUGE-1 F
        """
        return compute_rouge.get_scores(hypothesis, reference)[0]["rouge-1"]["f"]

    # Compare summarizers on part of the validation dataset.
    # The dataset is a list of dicts, each with two keys: "document" and "summary".
    validation = deepcopy(data["validation"])
    if args.validation_size is None:
        validation_size = len(validation)
    else:
        validation_size = args.validation_size
    # NB: always shuffle the data!
    random.shuffle(validation)

    # A document is a text of news articles separated by the special token "|||||".
    # For proper sentence segmentation we need to clean up the data.
    def clean_document(text):
        return "\n".join(text.split("|||||"))

    print("Compute scores on the validation dataset")
    scores = defaultdict(list)
    for i in tqdm(range(validation_size)):
        document = clean_document(validation[i]["document"])
        true_summary = validation[i]["summary"]
        for summarizer_name, summarizer in summarizers.items():
            summary = summarizer(document)
            scores[summarizer_name].append(get_score(true_summary, summary))
    for summarizer_name in summarizers:
        print("Score of '{}' is {}".format(summarizer_name,
                                           np.mean(scores[summarizer_name])))
class RougeMetric(Metric):
    def __init__(self, output_transform=lambda x: x, batch_size=lambda x: len(x), **kwargs):
        self._stats = kwargs.get("stats", Rouge.DEFAULT_STATS)
        self._metrics = kwargs.get("metrics", Rouge.DEFAULT_METRICS)
        self._batch_size = batch_size
        self._count = 0
        self._total_stats = {}
        super(RougeMetric, self).__init__(output_transform)
        self.rouge = Rouge(**kwargs)

    def update(self, output):
        try:
            rouge_res = self.rouge.get_scores(output[0], output[1], avg=True)
        except ValueError:
            return
        # Only count updates that actually produced a score, so the averages
        # in compute() are not diluted by failed pairs
        self._count += 1
        for metric, metric_val in rouge_res.items():
            for stat, val in metric_val.items():
                self._total_stats[metric][stat] += val

    def reset(self):
        self._total_stats = {
            metric: {stat: 0 for stat in self._stats}
            for metric in self._metrics
        }
        self._count = 0

    def compute(self):
        for metric, metric_val in self._total_stats.items():
            for stat, val in metric_val.items():
                self._total_stats[metric][stat] /= self._count
        return self._total_stats

    def __str__(self):
        representations = [
            "{}-{}: {}".format(m, s, self._total_stats[m][s]).title()
            for m in self._metrics for s in self._stats
        ]
        return "\n".join(representations)
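# Manual-driving sketch for the metric above, assuming `Metric` is
# ignite.metrics.Metric (its usual home). Normally the metric would be
# attached to an ignite Engine; here the reset/update/compute cycle is
# driven by hand with illustrative sentence pairs.
metric = RougeMetric()
metric.reset()
metric.update(("the cat sat on the mat", "a cat was on the mat"))
metric.update(("a dog barked", "the dog barked loudly"))
print(metric.compute())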
def evalLead3(args):
    data = Dataset(path=args.data_path)
    Rouge_list, Rouge155_list = [], []
    Rouge155_obj = Rouge155(stem=True, tmp='./tmp2')
    for batch_iter, valid_batch in tqdm(enumerate(data.gen_train_minibatch()),
                                        total=data.test_size):
        if not (batch_iter % 100 == 0):
            continue
        doc, sums, doc_len, sums_len = valid_batch
        selected_indexs = range(min(doc.size(0), 1))
        doc_matrix = doc.data.numpy()
        doc_len_arr = doc_len.data.numpy()
        golden_summ_matrix = sums[0].data.numpy()
        golden_summ_len_arr = sums_len[0].data.numpy()
        doc_arr = []
        for i in range(np.shape(doc_matrix)[0]):
            temp_sent = " ".join([data.itow[x] for x in doc_matrix[i]][:doc_len_arr[i]])
            doc_arr.append(temp_sent)
        golden_summ_arr = []
        for i in range(np.shape(golden_summ_matrix)[0]):
            temp_sent = " ".join([data.itow[x] for x in golden_summ_matrix[i]][:golden_summ_len_arr[i]])
            golden_summ_arr.append(temp_sent)
        summ_matrix = torch.stack([doc[x] for x in selected_indexs]).data.numpy()
        summ_len_arr = torch.stack([doc_len[x] for x in selected_indexs]).data.numpy()
        summ_arr = []
        for i in range(np.shape(summ_matrix)[0]):
            temp_sent = " ".join([data.itow[x] for x in summ_matrix[i]][:summ_len_arr[i]])
            summ_arr.append(temp_sent)
        score_Rouge = Rouge().get_scores(" ".join(summ_arr), " ".join(golden_summ_arr))
        Rouge_list.append(score_Rouge[0]['rouge-l']['f'])
        print(Rouge_list[-1])
    print('=' * 60)
    print(np.mean(Rouge_list))
class Evaluate(keras.callbacks.Callback):
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.

    def on_epoch_end(self, epoch, logs=None):
        metrics = self.evaluate(valid_data)  # evaluate the model
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            model.save_weights('./best_model.baseline_e7_newpro_gp.weights')  # save the model
        metrics['best_bleu'] = self.best_bleu
        print('valid_data:', metrics)

    def evaluate(self, data, topk=1):
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for text, question, answer in tqdm(data):
            total += 1
            question = ' '.join(question).lower()
            pred_question = ' '.join(autotitle.generate(text, answer, topk)).lower()
            if pred_question.strip():
                scores = self.rouge.get_scores(hyps=pred_question, refs=question)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[question.split(' ')],
                                      hypothesis=pred_question.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }
class Evaluate(keras.callbacks.Callback):
    def __init__(self, val_data_path, topk):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.data = pd.read_csv(val_data_path, sep='\t', header=None)
        self.lowest = 1e10
        self.topk = topk

    def on_epoch_end(self, epoch, logs=None):
        just_show()
        total = 0
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for a, b in self.data.iterrows():
            total += 1
            generated_title = gen_sent(b[1], self.topk)
            real_title = b[0]
            real_title = " ".join(real_title)
            generated_title = " ".join(generated_title)
            scores = self.rouge.get_scores(generated_title, real_title)
            rouge_1 += scores[0]['rouge-1']['f']
            rouge_2 += scores[0]['rouge-2']['f']
            rouge_l += scores[0]['rouge-l']['f']
            bleu += sentence_bleu(references=[real_title.split(' ')],
                                  hypothesis=generated_title.split(' '),
                                  smoothing_function=self.smooth)
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge-1': rouge_1,
            'rouge-2': rouge_2,
            'rouge-l': rouge_l,
            'bleu': bleu,
        }
class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.0

    def on_epoch_end(self, epoch, logs=None):
        metrics = self.evaluate(valid_data)
        if metrics['bleu'] > self.best_bleu:
            self.best_bleu = metrics['bleu']
            model.save_weights('best_vis_model_epoch_' + str(epoch) + '.weights')
        metrics['best_bleu'] = self.best_bleu
        print('valid_data: ', metrics)

    def evaluate(self, data, topk=1):
        total = 0  # start at zero so the averages divide by the true sample count
        rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0
        for target, bland, image_path in tqdm(data):
            total += 1
            target = ' '.join(target).lower()
            pred_target = ' '.join(seq2seq_model.generate(bland, image_path, topk)).lower()
            if pred_target.strip():
                scores = self.rouge.get_scores(hyps=pred_target, refs=target)
                rouge_1 += scores[0]['rouge-1']['f']
                rouge_2 += scores[0]['rouge-2']['f']
                rouge_l += scores[0]['rouge-l']['f']
                bleu += sentence_bleu(references=[target.split(' ')],
                                      hypothesis=pred_target.split(' '),
                                      smoothing_function=self.smooth)
        rouge_1 /= total
        rouge_2 /= total
        rouge_l /= total
        bleu /= total
        return {
            'rouge_1': rouge_1,
            'rouge_2': rouge_2,
            'rouge_l': rouge_l,
            'bleu': bleu
        }
class RougeEvaluator(BaseTextEvaluator):
    """
    :class:`RougeEvaluator` Evaluate the ROUGE score between the actual and the ground-truth text.
    """

    def __init__(self, metric: str = 'rouge-1', stat: str = 'r', *args, **kwargs):
        """metric: can be rouge-1, rouge-2 or rouge-l
        stat: can be r for recall, p for precision and f for f1
        """
        super().__init__(*args, **kwargs)
        self._metric = metric.lower()
        self.stat = stat.lower()

    def post_init(self):
        super().post_init()
        from rouge import Rouge
        self.rouge = Rouge(metrics=[self._metric], stats=[self.stat])

    def evaluate(self, actual: str, desired: str) -> float:
        if (not len(actual)) or (not len(desired)):
            return 0.0
        return float(self.rouge.get_scores(actual, desired)[0][self._metric][self.stat])
class RougeScorer(object):
    """
    Compute the ROUGE score of strings.

    >>> rouge_scorer = RougeScorer()
    >>> rouge_scorer.add_string(ref='The dog bit the man.', hyp='The dog bit the man.')
    >>> score = rouge_scorer.score()
    >>> score
    {'rouge-1': {'f': 1.0, 'p': 1.0, 'r': 1.0}, 'rouge-2': {'f': 1.0, 'p': 1.0, 'r': 1.0}, 'rouge-l': {'f': 1.0, 'p': 1.0, 'r': 1.0}}
    """

    def __init__(self, precision=2):
        from rouge import Rouge
        self.rouge = Rouge()
        self._precision = precision
        self.reset()

    def reset(self):
        self.refs = []
        self.hyps = []

    def add_string(self, ref, hyp):
        self.refs.append(ref)
        self.hyps.append(hyp)

    def add_strings(self, refs, hyps):
        self.refs.extend(refs)
        self.hyps.extend(hyps)

    def score(self, avg=True):
        assert len(self.hyps) == len(self.refs) and len(self.refs) > 0
        performance = self.rouge.get_scores(hyps=self.hyps, refs=self.refs, avg=avg)
        return {
            name: {
                avg_name: round(avg_value, self._precision)
                for avg_name, avg_value in value.items()
            }
            for name, value in performance.items()
        }
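# Follow-up to the doctest above, exercising the batch entry point of the
# same class; the sentence pairs are illustrative only.
scorer = RougeScorer(precision=4)
scorer.add_strings(
    refs=['the cat sat on the mat', 'a dog barked loudly'],
    hyps=['the cat is on the mat', 'the dog barked'])
print(scorer.score(avg=True))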
def add_oracle_summary_to_records(records, max_sentences=30, lower=True, nrows=1000):
    rouge = Rouge()
    for i, record in enumerate(records):
        if i >= nrows:
            break
        sentences = record["sentences"]
        summary = record["title"]
        summary = summary.lower() if lower else summary
        sentences = sentences[:max_sentences]
        oracle_summary, sentences_indicies = build_oracle_summary_greedy(
            sentences, summary,
            calc_score=lambda x, y: calc_single_score(x, y, rouge),
            lower=lower)
        record["sentences"] = sentences
        record["oracle_sentences"] = list(sentences_indicies)
        record["oracle_summary"] = oracle_summary
    return records[:nrows]
def evaluate(eval_file, answer_dict):
    f1 = rouge_l_ = exact_match = total = 0
    from rouge import Rouge
    rouge = Rouge()
    # Convert eval_file keys to the format of answer_dict keys
    # (see utils -> convert_tokens and the last few lines of main.py -> test)
    remapped_eval_file = {}
    for key, value in eval_file.items():
        uuid = eval_file[key]["uuid"]
        remapped_eval_file[str(uuid)] = eval_file[key]["answers"][0]
    a = remapped_eval_file.keys()
    b = [str(i) for i in answer_dict.keys()]
    print(len(list(set(a).intersection(b))))
    for key, value in answer_dict.items():
        total += 1
        ground_truths = remapped_eval_file[str(key)]
        prediction = value
        exact_match += metric_max_over_ground_truths(exact_match_score,
                                                     prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
        rouge_l_ += rouge_l(rouge, prediction, ground_truths)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    rouge_l_ = 100.0 * rouge_l_ / total
    return {'exact_match': exact_match, 'f1': f1, 'rouge-l': rouge_l_}
def rouge_12L(sen: str, ref: str, ev: rouge.Rouge = None, R1=True, R2=True, RL=True):
    if not ev:
        ev = rouge.Rouge()
    metrics = [0, 0, 0]
    try:
        metrics = ev.get_scores(sen, ref)
        metrics = [
            metrics[0]['rouge-1']['f'],
            metrics[0]['rouge-2']['f'],
            metrics[0]['rouge-l']['f'],
        ]
    except ValueError:
        pass
    rm = []
    if R1:
        rm.append(metrics[0])
    if R2:
        rm.append(metrics[1])
    if RL:
        rm.append(metrics[2])
    if len(rm) == 1:
        rm = rm[0]
    return rm
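# Usage sketch for rouge_12L above; reusing one rouge.Rouge instance across
# calls avoids reconstructing the scorer inside a loop. Sentences are
# illustrative only.
import rouge

ev = rouge.Rouge()
print(rouge_12L("the cat sat on the mat", "a cat was on the mat", ev=ev))
print(rouge_12L("the cat sat", "a cat sat", ev=ev, R2=False, RL=False))  # single float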
def __init__(self, reward_name: str, reward_type: str, error_penalty: float = 0,
             default_reward: float = 0, discount_factor: float = 1.0,
             is_terminal=False, is_stochastic=False):
    """Helper class for reward shaping in ExtractiveEnv.

    Currently, this helper class supports these reward-shaping use cases:

    1. Constant reward type (e.g. all episode rewards are ROUGE-2 F).
    2. Scheduled reward (e.g. the first 1000 episodes use ROUGE-1 F, the next 1000 episodes ROUGE-2 F).
       Requires updating the env's reward helper during training with a stable-baselines callback,
       e.g. env.reward_helper = RewardHelper(**new_params)
    3. Stochastic reward types.
    4. Terminal rewards (only returned at end-of-episode) vs. intermediate rewards (returned on every action).

    :param reward_name: ROUGE algorithm ('rouge-1', 'rouge-2', 'rouge-l', 'average')
    :param reward_type: ROUGE type ('f', 'r', 'p'), i.e. F1, Recall, Precision
    :param error_penalty: Reward <= 0 to penalize invalid actions (already selected or out-of-range sentences)
    :param default_reward: Reward <= 0 to penalize slow episode terminations when is_terminal is True
    :param is_terminal: Whether to return the reward only on episode termination
    :param is_stochastic: Whether to return a random ROUGE algorithm/type score
    """
    assert reward_name in ['rouge-1', 'rouge-2', 'rouge-l', 'average']
    assert reward_type in ['f', 'r', 'p']
    self.reward_name = reward_name
    self.reward_type = reward_type
    self.error_penalty = error_penalty
    self.default_reward = default_reward
    self.discount_factor = discount_factor
    self.is_terminal = is_terminal
    self.is_stochastic = is_stochastic
    self.reward_calculator = Rouge().get_scores
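# Construction sketch for the RewardHelper initializer above (the docstring
# names the class as RewardHelper); the parameter values and sentences are
# hypothetical.
helper = RewardHelper(reward_name='rouge-2', reward_type='f',
                      error_penalty=-0.1, is_terminal=True)
scores = helper.reward_calculator('the cat sat on the mat', 'a cat was on the mat')
print(scores[0][helper.reward_name][helper.reward_type])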
def eval(gt_text, arg_text, non_arg_text=None):
    if non_arg_text:
        length_arg = len(arg_text)
        length_no_arg = len(non_arg_text)
        fpr_values = []
        for arg_length in [220, 330, 440]:
            ratio_arg = arg_length / length_arg
            ratio_no_arg = (660 - arg_length) / length_no_arg
            if ratio_arg > 0.3:
                summary_arg = extractive_summary(arg_text, min(ratio_arg, 1))
            else:
                summary_arg = extractive_summary(arg_text, ratio_arg, 20, 200)
            summary_no_arg = extractive_summary(non_arg_text, ratio_no_arg, 20, 200)
            summary = summary_no_arg + summary_arg
            rouge = Rouge()
            score = rouge.get_scores(summary, gt_text)
            print(summary)
            print(score[0]['rouge-1'])
            sco = score[0]['rouge-1']
            fpr_values.append(sco['f'])
            fpr_values.append(sco['p'])
            fpr_values.append(sco['r'])
        return fpr_values
    else:
        summary = arg_text
        # Alternative: summarize instead of using the full argumentative text
        # length = len(arg_text)
        # ratio = 665 / length
        # if ratio > 0.3:
        #     summary = extractive_summary(arg_text, min(ratio, 1))
        # else:
        #     summary = extractive_summary(arg_text, ratio, 20, 200)
        print(summary)
        rouge = Rouge()
        score = rouge.get_scores(summary, gt_text)
        print(score)
        sco = score[0]['rouge-1']
        return sco['f'], sco['p'], sco['r']
def get_rouge(pred, ref):
    rouge = Rouge()
    scores = rouge.get_scores(pred, ref)
    print(scores[0])
    return scores[0]
r.system_filename_pattern = r'tmp.(\d+).txt'
r.model_filename_pattern = 'tmp.[A-Z].#ID#.txt'
output = r.convert_and_evaluate()
print(output)
output_dict = r.output_to_dict(output)

##################################################################
## Second approach: pure-Python implementation
from rouge import Rouge
from pprint import pprint

##################################################################
## Score 1 sentence
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of stories you saw on cnn student news"
reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teacher or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of events in the news"
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
pprint(scores)
# [{'rouge-1': {'f': 0.49411764217577864,
#               'p': 0.5833333333333334,
#               'r': 0.42857142857142855},
#   'rouge-2': {'f': 0.23423422957552154,
#               'p': 0.3170731707317073,
#               'r': 0.18571428571428572},
#   'rouge-l': {'f': 0.42751590030718895,
#               'p': 0.5277777777777778,
#               'r': 0.3877551020408163}}]
print(scores[0]['rouge-l']['f'])  # 0.42751590030718895

##################################################################
## Score multiple sentences
hyps = ['i am jiaruipeng', 'hello world', 'ni hao']
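# The multi-sentence example above breaks off after `hyps`. A minimal sketch
# of how it presumably continues; the `refs` sentences are hypothetical.
refs = ['i am a person', 'hello there world', 'ni hao ma']
scores = rouge.get_scores(hyps, refs)                 # one score dict per pair
avg_scores = rouge.get_scores(hyps, refs, avg=True)   # averaged over all pairs
pprint(avg_scores)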
_iter = 435864
dec_path = (args.beam_dir + args.mode + '_iter_' + str(_iter) + '_beam_size_' +
            str(args.beam_size) + '/' + 'rouge_dec_dir/' + '*.txt')
print(dec_path)
print('decode:', len(glob.glob(dec_path)))
# Sort the glob results so hypotheses and references pair up by file name
hyps = [' '.join(open(f).readlines()) for f in sorted(glob.glob(dec_path))]
print('hyps:', len(hyps))
print()
print('hyps first 10 lines:')
print('\n'.join(hyps[:10]))
print()
print('hyps last 10 lines:')
print('\n'.join(hyps[-10:]))
print()
if args.mode == 'final':
    with open('result.txt', 'w') as f:
        for line in hyps:
            f.write(line.replace("\n", "\\n") + '\n')
else:
    ref_path = (args.beam_dir + args.mode + '_iter_' + str(_iter) + '_beam_size_' +
                str(args.beam_size) + '/' + 'rouge_ref_dir/' + '*.txt')
    print('reference:', len(glob.glob(ref_path)))
    refs = [open(f).readline() for f in sorted(glob.glob(ref_path))]
    print('refs:')
    print('\n'.join(refs[:10]))
    rouge = Rouge()
    scores = rouge.get_scores(hyps, refs, avg=True)
    print(scores)