def fast_rouge(self, step):
    self.logger.info("Calculating Rouge")
    gold_path = self.args.result_path + '.%d.gold' % step
    can_path = self.args.result_path + '.%d.candidate' % step
    if self.args.dataset in ["DUC2006", "DUC2007"]:
        ## give only one reference
        data = []
        with open(gold_path, 'r') as f:
            for line in f.read().splitlines():
                data.append(line.strip())
        data = [d.split("<x>")[0].strip() for d in data]
        with open(gold_path, 'w') as f:
            f.write("\n".join(data))
            f.flush()
        print(8 * "=" + "DEBUG TEST FOR DUC" + 8 * "=")
        print(f"reference sample: {data[0]}")
    files_rouge = FilesRouge(can_path, gold_path)
    scores = files_rouge.get_scores(avg=True)
    rouges = {}
    rouges["rouge_l_f_score"] = scores["rouge-l"]["f"]
    rouges["rouge_2_f_score"] = scores["rouge-2"]["f"]
    rouges["rouge_1_f_score"] = scores["rouge-1"]["f"]
    self.logger.info(rouges)
    return rouges
def compute_metrics(pred):
    labels = output_dir + '/labels.txt'
    preds = output_dir + '/preds.txt'
    lf = open(labels, 'w')
    pf = open(preds, 'w')
    labels_list = []
    preds_list = []
    for label, pred in zip(pred.label_ids, pred.predictions):
        de_label = tokenizer.decode(label)
        de_pred = tokenizer.decode(pred.argmax(-1))
        lf.write(de_label + '\n')
        pf.write(de_pred + '\n')
        labels_list.append([de_label])
        preds_list.append(de_pred)
    lf.close()
    pf.close()
    bleu_score = corpus_bleu(labels_list, preds_list)
    files_rouge = FilesRouge()
    rouge_score = files_rouge.get_scores(preds, labels, avg=True)
    return {
        'bleu score: ': bleu_score * 100,
        'rouge-l: ': rouge_score['rouge-l']['f'] * 100
    }
def print_out_rouge_score(predicted_path, expected_path):
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores(predicted_path, expected_path, avg=True)
    print("ROUGE scores ", scores)
    return 0
def main():
    import argparse
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a', '--avg', help="Average mode", action='store_true')
    parser.add_argument('hypothesis', type=str, help='Text or file path')
    parser.add_argument('reference', type=str, help='Text or file path')
    args = parser.parse_args()

    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert os.path.isfile(hyp)
        assert os.path.isfile(ref)

        files_rouge = FilesRouge(hyp, ref)
        scores = files_rouge.get_scores(avg=args.avg)

        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert type(hyp) == str
        assert type(ref) == str

        rouge = Rouge()
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

        print(json.dumps(scores, indent=2))
def validate(self, data_iter, step, attn_debug=False):
    self.model.eval()
    gold_path = self.args.result_path + 'step.%d.gold_temp' % step
    pred_path = self.args.result_path + 'step.%d.pred_temp' % step
    gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
    pred_out_file = codecs.open(pred_path, 'w', 'utf-8')

    ct = 0
    ext_acc_num = 0
    ext_pred_num = 0
    ext_gold_num = 0
    with torch.no_grad():
        for batch in data_iter:
            output_data, tgt_data, ext_pred, ext_gold = self.translate_batch(batch)
            translations = self.from_batch_dev(output_data, tgt_data)
            for idx in range(len(translations)):
                if ct % 100 == 0:
                    print("Processing %d" % ct)
                pred_summ, gold_data = translations[idx]
                # ext f1 calculate
                acc_num = len(ext_pred[idx] + ext_gold[idx]) - len(set(ext_pred[idx] + ext_gold[idx]))
                pred_num = len(ext_pred[idx])
                gold_num = len(ext_gold[idx])
                ext_acc_num += acc_num
                ext_pred_num += pred_num
                ext_gold_num += gold_num
                pred_out_file.write(pred_summ + '\n')
                gold_out_file.write(gold_data + '\n')
                ct += 1
            pred_out_file.flush()
            gold_out_file.flush()
    pred_out_file.close()
    gold_out_file.close()

    if step != -1:
        pred_bleu = test_bleu(pred_path, gold_path)
        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)
        f1, p, r = test_f1(ext_acc_num, ext_pred_num, ext_gold_num)
        self.logger.info('Ext Sent Score at step %d: \n>> P/R/F1: %.2f/%.2f/%.2f' %
                         (step, p * 100, r * 100, f1 * 100))
        self.logger.info('Gold Length at step %d: %.2f' %
                         (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d: \n%s\n' %
                         (step, rouge_results_to_str(pred_rouges)))
        rouge_results = (pred_rouges["rouge-1"]['f'], pred_rouges["rouge-l"]['f'])
        return rouge_results
def using_155_processed_data(input_dir):
    fpout_ref = open("tmp.ref", "w")
    fpout_hyp = open("tmp.hyp", "w")
    for i in range(0, 50):
        for line in open(input_dir + "/model/ref." + str(i) + ".txt"):
            line = line.strip()
            if line.startswith("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>"):
                line = line.replace("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>", "").replace("</a>", "")
                fpout_ref.write(line + "\n")
        # made consistent with the reference path above (leading slash)
        for line in open(input_dir + "/system/cand." + str(i) + ".txt"):
            line = line.strip()
            if line.startswith("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>"):
                line = line.replace("<a name=\"1\">[1]</a> <a href=\"#1\" id=1>", "").replace("</a>", "")
                fpout_hyp.write(line + "\n")
    fpout_ref.close()
    fpout_hyp.close()

    # get rouge
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores("tmp.hyp", "tmp.ref", avg=True)
    for item in scores:
        print(item, scores[item])
def compare_summaries():
    """
    Compares the src summaries with the tgt summaries and prints the ROUGE scores.
    """
    files_rouge = FilesRouge('summaries.src.txt', 'summaries.tgt.txt')
    rouge_scores = files_rouge.get_scores(avg=True)
    print_rouge_scores(rouge_scores)
def getRouge(hyp_path, ref_path):
    files_rouge = FilesRouge()
    rouge = Rouge()
    # files_rouge = Rouge155()
    # or
    # scores = rouge.get_scores(hyp_path, ref_path, avg=True)
    scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
    pprint(scores)
def main():
    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
    parser.add_argument('-f', '--file', help="File mode", action='store_true')
    parser.add_argument('-a', '--avg', help="Average mode", action='store_true')
    parser.add_argument('--ignore_empty', action='store_true',
                        help="Ignore empty hypothesis")
    parser.add_argument('hypothesis', type=str, help='Text or file path')
    parser.add_argument('reference', type=str, help='Text or file path')
    parser.add_argument("--metrics", nargs="+", type=str.upper,
                        choices=METRICS_CHOICES.keys(),
                        help="Metrics to use (default=all)")
    parser.add_argument("--stats", nargs="+", type=str.upper,
                        choices=STATS_CHOICES,
                        help="Stats to use (default=all)")
    args = parser.parse_args()

    metrics = args.metrics
    stats = args.stats

    if metrics is not None:
        metrics = [METRICS_CHOICES[m] for m in args.metrics]

    if args.file:
        hyp, ref = args.hypothesis, args.reference
        assert os.path.isfile(hyp)
        assert os.path.isfile(ref)

        files_rouge = FilesRouge(metrics, stats)
        scores = files_rouge.get_scores(hyp, ref, avg=args.avg,
                                        ignore_empty=args.ignore_empty)

        print(json.dumps(scores, indent=2))
    else:
        hyp, ref = args.hypothesis, args.reference
        assert type(hyp) == str
        assert type(ref) == str

        rouge = Rouge(metrics, stats)
        scores = rouge.get_scores(hyp, ref, avg=args.avg)

        print(json.dumps(scores, indent=2))
def get_rouge(path):
    hyp_names = sorted(os.listdir(path),
                       key=lambda x: int(x[x.find('_') + 1: x.find('.txt')]))
    ref_names = sorted(os.listdir(ref_path),
                       key=lambda x: int(x[x.find('_') + 1: x.find('.txt')]))
    all_scores = []
    for hyp_name, ref_name in zip(hyp_names, ref_names):
        files_rouge = FilesRouge(os.path.join(path, hyp_name),
                                 os.path.join(ref_path, ref_name))
        scores = files_rouge.get_scores()
        avg_score = {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}
        for x in avg_score:
            avg_score[x] = {'p': sum([score[x]['p'] for score in scores]) / len(scores),
                            'r': sum([score[x]['r'] for score in scores]) / len(scores),
                            'f': sum([score[x]['f'] for score in scores]) / len(scores)}
        all_scores.append(avg_score)
    return all_scores
def call(self, y_true=None, y_pred=None, arguments=None):
    ref_sents = []
    for tgt_path in self.tgt_paths_after_pre_process:
        with open(tgt_path, "r", encoding='utf8') as tgt_f:
            ref_sents.extend(tgt_f.readlines())
    ref_sents = [sent.strip() for sent in ref_sents]
    with open(self.ref_path, "w", encoding="utf-8") as in_f:
        for ref_sent in ref_sents:
            in_f.write(ref_sent)
            in_f.write("\n")
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores(self.hyp_path, self.ref_path, avg=True)
    return self.get_scores_output(scores)
def validate(self, data_iter, step, attn_debug=False):
    self.model.eval()
    gold_path = self.args.result_path + '.step.%d.gold_temp' % step
    pred_path = self.args.result_path + '.step.%d.pred_temp' % step
    gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
    pred_out_file = codecs.open(pred_path, 'w', 'utf-8')

    # pred_results, gold_results = [], []
    ct = 0
    with torch.no_grad():
        for batch in data_iter:
            doc_data, summ_data = self.translate_batch(batch)
            translations = self.from_batch_dev(batch, doc_data)
            for idx in range(len(translations)):
                if ct % 100 == 0:
                    print("Processing %d" % ct)
                doc_short_context = translations[idx][1]
                gold_data = summ_data[idx]
                pred_out_file.write(doc_short_context + '\n')
                gold_out_file.write(gold_data + '\n')
                ct += 1
            pred_out_file.flush()
            gold_out_file.flush()
    pred_out_file.close()
    gold_out_file.close()

    if step != -1:
        pred_bleu = test_bleu(pred_path, gold_path)
        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)
        self.logger.info('Gold Length at step %d: %.2f' %
                         (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d: \n%s\n' %
                         (step, rouge_results_to_str(pred_rouges)))
        rouge_results = (pred_rouges["rouge-1"]['f'], pred_rouges["rouge-l"]['f'])
        return rouge_results
def calculate_scores():
    hyp_fn, ref_fn = 'tmp.%s.src' % mode, 'tmp.%s.tgt' % mode
    write_token_id_arrays_to_text_file(hypotheses, os.path.join(model_dir, hyp_fn), tokenizer)
    write_token_id_arrays_to_text_file(references, os.path.join(model_dir, ref_fn), tokenizer)

    hyp_fn, ref_fn = os.path.join(model_dir, hyp_fn), os.path.join(model_dir, ref_fn)

    files_rouge = FilesRouge(hyp_fn, ref_fn)
    rouge_scores = files_rouge.get_scores(avg=True)

    bleu_score = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)

    return rouge_scores, bleu_score
def evaluate(hyp, ref):
    with open(hyp, 'r') as r:
        hypothesis = r.readlines()
    res = {k: [" ".join(v.strip().lower().split())]
           for k, v in enumerate(hypothesis)}
    with open(ref, 'r') as r:
        references = r.readlines()
    gts = {k: [v.strip().lower()] for k, v in enumerate(references)}

    # BLEU expects the same {id: [caption]} dicts as Meteor/Cider, not file paths.
    score_Bleu, scores_Bleu = Bleu().compute_score(gts, res)
    print("Bleu_4: " + str(score_Bleu))

    score_Meteor, scores_Meteor = Meteor().compute_score(gts, res)
    print("Meteor: " + str(score_Meteor))

    files_rouge = FilesRouge(hyp, ref)
    scores = files_rouge.get_scores(avg=True)
    print('Rouge: ' + str(scores))

    score_Cider, scores_Cider = Cider().compute_score(gts, res)
    print("Cider: " + str(score_Cider))
def _report_rouge(result_file, golden_file):
    # 1
    # import sys
    # sys.path.append('../Lib/site-packages/rouge')
    # from rouge_wrap import RougeWrapper
    # r = RougeWrapper()
    # results = r.evaluate_for_pair_files(golden_file, result_file)

    # 2
    from rouge import Rouge
    # with open("pred.txt", "r") as f1, open("data/task1_ref0.txt", "r") as f2:
    #     rouge = Rouge()
    #     for l1, l2 in zip(f1, f2):
    #         scores = rouge.get_scores(l1, l2, avg=True)
    # print(scores)

    # 3
    from rouge import FilesRouge
    r = FilesRouge(result_file, golden_file)
    results = r.get_scores(avg=True)
    for k, v in results.items():
        if 'f' not in v:
            continue
        print(k, v)
def get_file_rouge(ref_dir, hyp_dir):
    ref_map = {}
    hyp_map = {}
    for filename in os.listdir(ref_dir):
        file_id = filename.split(".")[0]
        with open(ref_dir + filename, 'r') as file:
            ref_map[file_id] = file.read().strip()
    for filename in os.listdir(hyp_dir):
        file_id = filename.split(".")[0]
        with open(hyp_dir + filename, 'r') as file:
            hyp_map[file_id] = file.read().strip()

    fpout_ref = open("tmp.ref", "w")
    fpout_hyp = open("tmp.hyp", "w")
    for file_id in ref_map:
        fpout_ref.write(ref_map[file_id] + "\n")
        fpout_hyp.write(hyp_map[file_id] + "\n")
    fpout_ref.close()
    fpout_hyp.close()

    # get rouge
    files_rouge = FilesRouge()
    scores = files_rouge.get_scores("tmp.hyp", "tmp.ref", avg=True)
    for item in scores:
        print(item, scores[item])
def translate(self, data_iter, step, attn_debug=False):
    self.model.eval()
    output_path = self.args.result_path + '.%d.output' % step
    output_file = codecs.open(output_path, 'w', 'utf-8')
    gold_path = self.args.result_path + '.%d.gold_test' % step
    pred_path = self.args.result_path + '.%d.pred_test' % step
    gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
    pred_out_file = codecs.open(pred_path, 'w', 'utf-8')

    # pred_results, gold_results = [], []
    ct = 0
    ext_acc_num = 0
    ext_pred_num = 0
    ext_gold_num = 0

    with torch.no_grad():
        rouge = Rouge()
        for batch in data_iter:
            output_data, tgt_data, ext_pred, ext_gold = self.translate_batch(batch)
            translations = self.from_batch_test(batch, output_data, tgt_data)

            for idx in range(len(translations)):
                origin_sent, pred_summ, gold_data = translations[idx]
                if ct % 100 == 0:
                    print("Processing %d" % ct)
                output_file.write("ID : %d\n" % ct)
                output_file.write("ORIGIN : \n " + origin_sent.replace('<S>', '\n ') + "\n")
                output_file.write("GOLD : " + gold_data.strip() + "\n")
                output_file.write("DOC_GEN : " + pred_summ.strip() + "\n")
                rouge_score = rouge.get_scores(pred_summ, gold_data)
                bleu_score = sentence_bleu(
                    [gold_data.split()], pred_summ.split(),
                    smoothing_function=SmoothingFunction().method1)
                output_file.write(
                    "DOC_GEN bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                # ext f1 calculate
                acc_num = len(ext_pred[idx] + ext_gold[idx]) - len(set(ext_pred[idx] + ext_gold[idx]))
                pred_num = len(ext_pred[idx])
                gold_num = len(ext_gold[idx])
                ext_acc_num += acc_num
                ext_pred_num += pred_num
                ext_gold_num += gold_num
                f1, p, r = test_f1(acc_num, pred_num, gold_num)
                output_file.write(
                    "EXT_GOLD: [" + ','.join([str(i) for i in sorted(ext_gold[idx])]) + "]\n")
                output_file.write(
                    "EXT_PRED: [" + ','.join([str(i) for i in sorted(ext_pred[idx])]) + "]\n")
                output_file.write("EXT_SCORE P/R/F1: %.4f/%.4f/%.4f\n\n" % (p, r, f1))

                pred_out_file.write(pred_summ.strip() + '\n')
                gold_out_file.write(gold_data.strip() + '\n')
                ct += 1
            pred_out_file.flush()
            gold_out_file.flush()
            output_file.flush()

    pred_out_file.close()
    gold_out_file.close()
    output_file.close()

    if step != -1:
        pred_bleu = test_bleu(pred_path, gold_path)
        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)
        f1, p, r = test_f1(ext_acc_num, ext_pred_num, ext_gold_num)
        self.logger.info('Ext Sent Score at step %d: \n>> P/R/F1: %.2f/%.2f/%.2f' %
                         (step, p * 100, r * 100, f1 * 100))
        self.logger.info('Gold Length at step %d: %.2f' %
                         (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d: \n%s' %
                         (step, rouge_results_to_str(pred_rouges)))
def translate(self, data_iter, step, attn_debug=False):
    self.model.eval()
    output_path = self.args.result_path + '.%d.output' % step
    output_file = codecs.open(output_path, 'w', 'utf-8')
    gold_path = self.args.result_path + '.%d.gold_test' % step
    pred_path = self.args.result_path + '.%d.pred_test' % step
    ex_single_path = self.args.result_path + '.%d.ex_test' % step + ".short"
    ex_context_path = self.args.result_path + '.%d.ex_test' % step + ".long"
    gold_out_file = codecs.open(gold_path, 'w', 'utf-8')
    pred_out_file = codecs.open(pred_path, 'w', 'utf-8')
    short_ex_out_file = codecs.open(ex_single_path, 'w', 'utf-8')
    long_ex_out_file = codecs.open(ex_context_path, 'w', 'utf-8')

    # pred_results, gold_results = [], []
    ct = 0

    with torch.no_grad():
        rouge = Rouge()
        for batch in data_iter:
            doc_data, summ_data = self.translate_batch(batch)
            translations = self.from_batch_test(batch, doc_data)

            for idx in range(len(translations)):
                origin_sent, doc_extract, context_doc_extract, \
                    doc_pred, lead = translations[idx]
                if ct % 100 == 0:
                    print("Processing %d" % ct)
                output_file.write("ID : %d\n" % ct)
                output_file.write("ORIGIN : " + origin_sent.replace('<S>', '\n ') + "\n")
                gold_data = summ_data[idx]
                output_file.write("GOLD : " + gold_data + "\n")
                output_file.write("LEAD : " + lead + "\n")
                output_file.write("DOC_EX : " + doc_extract.strip() + "\n")
                output_file.write("DOC_CONT: " + context_doc_extract.strip() + "\n")
                output_file.write("DOC_GEN : " + doc_pred.strip() + "\n")

                gold_list = gold_data.strip().split()

                lead_list = lead.strip().replace("[unused2]", "").replace("[unused3]", "").split()
                rouge_score = rouge.get_scores(lead, gold_data)
                bleu_score = sentence_bleu(
                    [gold_list], lead_list,
                    smoothing_function=SmoothingFunction().method1)
                output_file.write(
                    "LEAD bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                doc_extract_list = doc_extract.strip().replace("[unused2]", "").replace("[unused3]", "").split()
                rouge_score = rouge.get_scores(doc_extract, gold_data)
                bleu_score = sentence_bleu(
                    [gold_list], doc_extract_list,
                    smoothing_function=SmoothingFunction().method1)
                output_file.write(
                    "DOC_EX bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                doc_context_list = context_doc_extract.strip().replace("[unused2]", "").replace("[unused3]", "").split()
                rouge_score = rouge.get_scores(context_doc_extract, gold_data)
                bleu_score = sentence_bleu(
                    [gold_list], doc_context_list,
                    smoothing_function=SmoothingFunction().method1)
                output_file.write(
                    "DOC_CONT bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                doc_long_list = doc_pred.strip().replace("[unused2]", "").replace("[unused3]", "").split()
                rouge_score = rouge.get_scores(doc_pred, gold_data)
                bleu_score = sentence_bleu(
                    [gold_list], doc_long_list,
                    smoothing_function=SmoothingFunction().method1)
                output_file.write(
                    "DOC_GEN bleu & rouge-f 1/2/l: %.4f & %.4f/%.4f/%.4f\n\n" %
                    (bleu_score, rouge_score[0]["rouge-1"]["f"],
                     rouge_score[0]["rouge-2"]["f"],
                     rouge_score[0]["rouge-l"]["f"]))

                short_ex_out_file.write(doc_extract.strip().replace("[unused2]", "").replace("[unused3]", "") + '\n')
                long_ex_out_file.write(context_doc_extract.strip().replace("[unused2]", "").replace("[unused3]", "") + '\n')
                pred_out_file.write(doc_pred.strip().replace("[unused2]", "").replace("[unused3]", "") + '\n')
                gold_out_file.write(gold_data.strip() + '\n')
                ct += 1
            pred_out_file.flush()
            short_ex_out_file.flush()
            long_ex_out_file.flush()
            gold_out_file.flush()
            output_file.flush()

    pred_out_file.close()
    short_ex_out_file.close()
    long_ex_out_file.close()
    gold_out_file.close()
    output_file.close()

    if step != -1:
        ex_short_bleu = test_bleu(gold_path, ex_single_path)
        ex_long_bleu = test_bleu(gold_path, ex_context_path)
        pred_bleu = test_bleu(gold_path, pred_path)

        file_rouge = FilesRouge(hyp_path=ex_single_path, ref_path=gold_path)
        ex_short_rouges = file_rouge.get_scores(avg=True)
        file_rouge = FilesRouge(hyp_path=ex_context_path, ref_path=gold_path)
        ex_long_rouges = file_rouge.get_scores(avg=True)
        file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
        pred_rouges = file_rouge.get_scores(avg=True)

        self.logger.info('Gold Length at step %d: %.2f\n' %
                         (step, test_length(gold_path, gold_path, ratio=False)))
        self.logger.info('Short Extraction Length ratio at step %d: %.2f' %
                         (step, test_length(ex_single_path, gold_path)))
        self.logger.info('Short Extraction Bleu at step %d: %.2f' %
                         (step, ex_short_bleu * 100))
        self.logger.info('Short Extraction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(ex_short_rouges)))
        self.logger.info('Long Extraction Length ratio at step %d: %.2f' %
                         (step, test_length(ex_context_path, gold_path)))
        self.logger.info('Long Extraction Bleu at step %d: %.2f' %
                         (step, ex_long_bleu * 100))
        self.logger.info('Long Extraction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(ex_long_rouges)))
        self.logger.info('Prediction Length ratio at step %d: %.2f' %
                         (step, test_length(pred_path, gold_path)))
        self.logger.info('Prediction Bleu at step %d: %.2f' %
                         (step, pred_bleu * 100))
        self.logger.info('Prediction Rouges at step %d \n%s' %
                         (step, rouge_results_to_str(pred_rouges)))
from rouge import FilesRouge
import sys


def prepare_results(metric, p, r, f):
    # The metric name is passed in explicitly instead of relying on the
    # loop variable below being visible as a global.
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
        metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


# Script arguments
REFERENCE_PATH = sys.argv[1]
HYPOTHESIS_PATH = sys.argv[2]

files_rouge = FilesRouge(HYPOTHESIS_PATH, REFERENCE_PATH)
scores = files_rouge.get_scores(avg=True)

for metric, results in sorted(scores.items(), key=lambda x: x[0]):
    print(prepare_results(metric, results['p'], results['r'], results['f']))
print()
# print(scores)
def report_rouge(ref_path, dec_path):
    print("Now starting ROUGE eval...")
    files_rouge = FilesRouge(dec_path, ref_path)
    scores = files_rouge.get_scores(avg=True)
    logging(str(scores))
def evaluate(self, release=False):
    eval_start_time = time.time()
    if self.eval_queue is None:
        self.eval_queue = androidreader.Reader(
            subtoken_to_index=self.subtoken_to_index,
            node_to_index=self.node_to_index,
            target_to_index=self.target_to_index,
            config=self.config,
            is_evaluating=True)
        reader_output = self.eval_queue.get_output()
        self.eval_predicted_indices_op, self.eval_topk_values, _, _, self.method_embedding = \
            self.build_test_graph(reader_output)
        self.eval_true_target_strings_op = reader_output[androidreader.TARGET_STRING_KEY]
        self.eval_tag_key_op = reader_output[androidreader.TARGET_TAG_KEY]
        self.saver = tf.train.Saver(max_to_keep=10)

    if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
        self.initialize_session_variables(self.sess)
        self.load_model(self.sess)
        if release:
            release_name = self.config.LOAD_PATH + '.release'
            print('Releasing model, output model: %s' % release_name)
            self.saver.save(self.sess, release_name)
            shutil.copyfile(src=self.config.LOAD_PATH + '.dict',
                            dst=release_name + '.dict')
            return None

    model_dirname = os.path.dirname(self.config.SAVE_PATH if self.config.SAVE_PATH else self.config.LOAD_PATH)
    ref_file_name = model_dirname + '/ref.txt'
    predicted_file_name = model_dirname + '/pred.txt'
    embedding_file_name = model_dirname + '/embedding.txt'
    if not os.path.exists(model_dirname):
        os.makedirs(model_dirname)
    # print("itern decoder size is " + str(self.config.DECODER_SIZE))

    with open(model_dirname + '/log.txt', 'w') as output_file, \
            open(ref_file_name, 'w') as ref_file, \
            open(predicted_file_name, 'w') as pred_file, \
            open(embedding_file_name, "w") as embedding_file:
        num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
            else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
        total_predictions = 0
        total_prediction_batches = 0
        true_positive, false_positive, false_negative = 0, 0, 0
        self.eval_queue.reset(self.sess)
        start_time = time.time()

        try:
            while True:
                predicted_indices, true_target_strings, top_values, method_embeddings, tag = self.sess.run(
                    [
                        self.eval_predicted_indices_op,
                        self.eval_true_target_strings_op,
                        self.eval_topk_values,
                        self.method_embedding,
                        self.eval_tag_key_op
                    ],
                )
                # print(tag.shape)
                # print(tag[0])
                # print(method_embeddings.shape)
                # print("0,0 " + str(method_embeddings[0, 0]))
                # print("MAX_LINE,MAX_COLUMN " + str(method_embeddings[method_embeddings.shape[0] - 1, method_embeddings.shape[1] - 1]))
                # print(true_target_strings)
                true_target_strings = Common.binary_to_string_list(true_target_strings)
                ref_file.write('\n'.join([
                    name.replace(Common.internal_delimiter, ' ')
                    for name in true_target_strings
                ]) + '\n')

                if self.config.BEAM_WIDTH > 0:
                    # predicted indices: (batch, time, beam_width)
                    predicted_strings = [[[self.index_to_target[i] for i in timestep]
                                          for timestep in example]
                                         for example in predicted_indices]
                    predicted_strings = [list(map(list, zip(*example)))
                                         for example in predicted_strings]  # (batch, top-k, target_length)
                    pred_file.write('\n'.join([
                        ' '.join(Common.filter_impossible_names(words))
                        for words in predicted_strings[0]
                    ]) + '\n')
                else:
                    predicted_strings = [[self.index_to_target[i] for i in example]
                                         for example in predicted_indices]
                    pred_file.write('\n'.join([
                        ' '.join(Common.filter_impossible_names(words))
                        for words in predicted_strings
                    ]) + '\n')

                num_correct_predictions = self.update_correct_predictions(
                    num_correct_predictions, output_file,
                    zip(true_target_strings, predicted_strings))
                true_positive, false_positive, false_negative = self.update_per_subtoken_statistics(
                    zip(true_target_strings, predicted_strings),
                    true_positive, false_positive, false_negative)

                total_predictions += len(true_target_strings)
                total_prediction_batches += 1
                if total_prediction_batches % self.num_batches_to_log == 0:
                    elapsed = time.time() - start_time
                    self.trace_evaluation(output_file, num_correct_predictions,
                                          total_predictions, elapsed)

                embedding_file.write('\n'.join([
                    Common.binary_to_string(tag[i]) + ',' + ','.join([
                        str(method_embeddings[i, j])
                        for j in range(method_embeddings.shape[1])
                    ]) for i in range(method_embeddings.shape[0])
                ]) + '\n')
        except tf.errors.OutOfRangeError:
            pass

        print('Done testing, epoch reached')
        output_file.write(str(num_correct_predictions / total_predictions) + '\n')
        # Common.compute_bleu(ref_file_name, predicted_file_name)

    elapsed = int(time.time() - eval_start_time)
    precision, recall, f1 = self.calculate_results(true_positive, false_positive, false_negative)
    try:
        files_rouge = FilesRouge()
        rouge = files_rouge.get_scores(hyp_path=predicted_file_name,
                                       ref_path=ref_file_name,
                                       avg=True, ignore_empty=True)
    except ValueError:
        rouge = 0
    print("Evaluation time: %sh%sm%ss" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
    return num_correct_predictions / total_predictions, \
        precision, recall, f1, rouge
def __call__(self, ref_path, hyp_path):
    from rouge import FilesRouge  # pylint: disable=import-outside-toplevel
    files_rouge = FilesRouge(hyp_path, ref_path)
    rouge_scores = files_rouge.get_scores(avg=True)
    return {name: rouge_scores[name]["f"] for name in self.scores_name}
def baseline(args, cal_lead=False, cal_oracle=False):
    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, args.test_batch_ex_size,
                                       'cpu', shuffle=False, is_test=True)
    if cal_lead:
        mode = "lead"
    else:
        mode = "oracle"
    rouge = Rouge()
    pred_path = '%s.%s.pred' % (args.result_path, mode)
    gold_path = '%s.%s.gold' % (args.result_path, mode)
    save_pred = open(pred_path, 'w', encoding='utf-8')
    save_gold = open(gold_path, 'w', encoding='utf-8')

    with torch.no_grad():
        for batch in test_iter:
            summaries = batch.summ_txt
            origin_sents = batch.original_str
            ex_segs = batch.ex_segs
            ex_segs = [sum(ex_segs[:i]) for i in range(len(ex_segs) + 1)]

            for idx in range(len(summaries)):
                summary = summaries[idx]
                txt = origin_sents[ex_segs[idx]:ex_segs[idx + 1]]
                if cal_oracle:
                    selected = []
                    max_rouge = 0.
                    while len(selected) < args.ranking_max_k:
                        cur_max_rouge = max_rouge
                        cur_id = -1
                        for i in range(len(txt)):
                            if i in selected:
                                continue
                            c = selected + [i]
                            temp_txt = " ".join([txt[j] for j in c])
                            rouge_score = rouge.get_scores(temp_txt, summary)
                            rouge_1 = rouge_score[0]["rouge-1"]["f"]
                            rouge_l = rouge_score[0]["rouge-l"]["f"]
                            rouge_score = rouge_1 + rouge_l
                            if rouge_score > cur_max_rouge:
                                cur_max_rouge = rouge_score
                                cur_id = i
                        if cur_id == -1:
                            break
                        selected.append(cur_id)
                        max_rouge = cur_max_rouge
                    pred_txt = " ".join([txt[j] for j in selected])
                else:
                    k = min(max(len(txt) // (2 * args.win_size + 1), 1), args.ranking_max_k)
                    pred_txt = " ".join(txt[:k])
                save_gold.write(summary + "\n")
                save_pred.write(pred_txt + "\n")
    save_gold.flush()
    save_pred.flush()
    save_gold.close()
    save_pred.close()

    length = test_length(pred_path, gold_path)
    bleu = test_bleu(pred_path, gold_path)
    file_rouge = FilesRouge(hyp_path=pred_path, ref_path=gold_path)
    pred_rouges = file_rouge.get_scores(avg=True)
    logger.info('Length ratio:\n%s' % str(length))
    logger.info('Bleu:\n%.2f' % (bleu * 100))
    logger.info('Rouges:\n%s' % rouge_results_to_str(pred_rouges))
def rouge_score(filename_gold, filename_result):
    files_rouge = FilesRouge(filename_result, filename_gold)
    scores = files_rouge.get_scores(avg=True)
    return scores
scores = rouge.get_scores(hyps, refs, avg=True)
pprint(scores)
# {'rouge-1': {'f': 0.41111110617777785,
#              'p': 0.4444444444444444,
#              'r': 0.38888888888888884},
#  'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
#  'rouge-l': {'f': 0.40158730158694217,
#              'p': 0.4444444444444444,
#              'r': 0.38888888888888884}}
print(scores['rouge-l']['f'])  # 0.40158730158694217

##################################################################
## Score two files (line by line)
# Given two files hyp_path, ref_path, with the same number (n) of lines,
# calculate the score for each of these lines, or the average over the whole file.
from rouge import FilesRouge

files_rouge = FilesRouge(hyp_path, ref_path)
scores = files_rouge.get_scores()

##################################################################
## Rouge for Chinese
from rouge import Rouge
from pprint import pprint

rouge = Rouge()

## for sentence
s_1 = '我是好人'
s_2 = '我是坏人'
score = rouge.get_scores(list(s_1), list(s_2))
print(score)

## for list
data_1 = ['塔台管制员 空难 融合 目视 飞机',
          '光电 研究 材料 生长 传感器',
          '亚稳纳米材料生长 生长 纳米材料 纳米 研究']
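# The list example above stops after defining data_1. Below is a minimal sketch
# of how it might continue; data_2 and the character-joining step are
# assumptions for illustration only, not part of the original snippet.
data_2 = ['管制员 目视 飞机', '光电 材料 传感器', '纳米材料 生长 研究']  # hypothetical references
hyps = [' '.join(list(d.replace(' ', ''))) for d in data_1]  # space-separate characters
refs = [' '.join(list(d.replace(' ', ''))) for d in data_2]  # so word-level ROUGE can tokenize
scores = rouge.get_scores(hyps, refs, avg=True)
pprint(scores)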
def score(self, labels_file, predictions_path):
    from rouge import FilesRouge
    files_rouge = FilesRouge(predictions_path, labels_file)
    rouge_scores = files_rouge.get_scores(avg=True)
    return {k: v["f"] for k, v in six.iteritems(rouge_scores)}
# model parameters
parser.add_argument('--vocab_len', type=int, default=12269, help='')
parser.add_argument('--inredient_dim', type=int, default=3925, help='')
parser.add_argument('--word_dim', type=int, default=256, help='')
parser.add_argument('--sentEnd_hiddens', type=int, default=512, help='')
parser.add_argument('--sentEnd_nlayers', type=int, default=1, help='')
parser.add_argument('--recipe_inDim', type=int, default=1024, help='')
parser.add_argument('--recipe_hiddens', type=int, default=1024, help='')
parser.add_argument('--recipe_nlayers', type=int, default=1, help='')
parser.add_argument('--sentDec_inDim', type=int, default=1024, help='')
parser.add_argument('--sentDec_hiddens', type=int, default=512, help='')
parser.add_argument('--sentDec_nlayers', type=int, default=1, help='')
parser.add_argument('--sentences_sorted', type=int, default=1, help='')

args = parser.parse_args()

# Load dictionary mapping index to vocab string (decoded)
with open('data/index_to_vocab.json', 'r') as vocabFile:
    index_to_vocab = json.load(vocabFile)

# Generate results files with one recipe per line
generate(args, saved_model_folder, epochs, split, results_path, index_to_vocab, video)

# Calculate rouge score
files_rouge = FilesRouge()
outputs_path = os.path.join(results_path, 'outputs_{}.txt'.format(split))
ref_path = os.path.join(results_path, 'gt_{}.txt'.format(split))
scores = files_rouge.get_scores(outputs_path, ref_path, avg=True)
print("ROUGE scores ", scores)
def evaluate(self):
    if not self.model:
        print('Model is not initialized')
        exit(-1)

    print("Testing...")
    eval_start_time = time.time()

    if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
        model_dirname = os.path.dirname(self.config.LOAD_PATH)
    elif self.config.MODEL_PATH:
        model_dirname = os.path.dirname(self.config.MODEL_PATH)
    else:
        model_dirname = None
        print('Model directory is missing')
        exit(-1)

    ref_file_name = os.path.join(model_dirname, 'ref.txt')
    predicted_file_name = os.path.join(model_dirname, 'pred.txt')
    if not os.path.exists(model_dirname):
        os.makedirs(model_dirname)
    log_file_name = os.path.join(model_dirname, 'log.txt')

    with open(log_file_name, 'w') as output_file, \
            open(ref_file_name, 'w') as ref_file, \
            open(predicted_file_name, 'w') as pred_file:
        num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
            else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
        total_predictions = 0
        total_prediction_batches = 0
        true_positive, false_positive, false_negative = 0, 0, 0
        dataset = self.test_dataset_reader.get_dataset()
        start_time = time.time()

        for input_tensors in dataset:
            true_target_strings = input_tensors[reader.TARGET_STRING_KEY]
            batched_contexts = self.model.run_encoder(input_tensors, is_training=False)
            outputs, final_states = self.model.run_decoder(batched_contexts, input_tensors,
                                                           is_training=False)
            if self.config.BEAM_WIDTH > 0:
                predicted_indices = outputs.predicted_ids
            else:
                predicted_indices = outputs.sample_id

            true_target_strings = Common.binary_to_string_list(true_target_strings.numpy())
            ref_file.write('\n'.join([
                name.replace(Common.internal_delimiter, ' ')
                for name in true_target_strings
            ]) + '\n')

            if self.config.BEAM_WIDTH > 0:
                # predicted indices: (batch, time, beam_width)
                predicted_strings = [[[self.index_to_target[i] for i in timestep]
                                      for timestep in example]
                                     for example in predicted_indices.numpy()]
                predicted_strings = [list(map(list, zip(*example)))
                                     for example in predicted_strings]  # (batch, top-k, target_length)
                pred_file.write('\n'.join([
                    ' '.join(Common.filter_impossible_names(words))
                    for words in predicted_strings[0]
                ]) + '\n')
            else:
                predicted_strings = [[self.index_to_target[i] for i in example]
                                     for example in predicted_indices.numpy()]
                pred_file.write('\n'.join([
                    ' '.join(Common.filter_impossible_names(words))
                    for words in predicted_strings
                ]) + '\n')

            num_correct_predictions = update_correct_predictions(
                self.config.BEAM_WIDTH, num_correct_predictions, output_file,
                zip(true_target_strings, predicted_strings))
            true_positive, false_positive, false_negative = update_per_subtoken_statistics(
                self.config.BEAM_WIDTH,
                zip(true_target_strings, predicted_strings),
                true_positive, false_positive, false_negative)
            total_predictions += len(true_target_strings)
            total_prediction_batches += 1
            if total_prediction_batches % self.num_batches_to_log == 0:
                elapsed = time.time() - start_time
                trace_evaluation(output_file, num_correct_predictions,
                                 total_predictions, elapsed)

        print('Done testing, epoch reached', flush=True)
        output_file.write(str(num_correct_predictions / total_predictions) + '\n')

    elapsed = int(time.time() - eval_start_time)
    precision, recall, f1 = calculate_results(true_positive, false_positive, false_negative)
    files_rouge = FilesRouge(predicted_file_name, ref_file_name)
    rouge = files_rouge.get_scores(avg=True, ignore_empty=True)
    print("Evaluation time: %sh%sm%ss" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
    return num_correct_predictions / total_predictions, precision, recall, f1, rouge
from rouge import FilesRouge
import os
import os.path

hyp_path = "C:\\Users\\Admin\\Desktop\\virtual\\predicted_post.txt"
ref_path = "C:\\Users\\Admin\\Desktop\\virtual\\answer_true_post.txt"

files_rouge = FilesRouge()
scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
print(scores)
def __call__(self, ref_path, hyp_path):
    scorer = FilesRouge(metrics=list(self.scores_name))
    rouge_scores = scorer.get_scores(hyp_path, ref_path, avg=True)
    return {name: rouge_scores[name]["f"] for name in self.scores_name}
def _compute_file_rouge(ref_path, hyp_path):
    """hyp_path: predict file"""
    files_rouge = FilesRouge(hyp_path, ref_path)
    scores = files_rouge.get_scores(avg=True)
    return scores
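# Minimal usage sketch for the helper above; the file names are hypothetical
# and not taken from the original snippet. Both files are expected to hold one
# summary per line, so line i of the hypothesis file is scored against line i
# of the reference file (FilesRouge is assumed to be imported as in the other
# examples in this collection).
if __name__ == "__main__":
    scores = _compute_file_rouge("refs.txt", "hyps.txt")
    print(scores["rouge-l"]["f"])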