def get_metrics(self, ref_dir, dec_dir):
    """Compute ROUGE and BLEU over paired decoded/reference .txt files and save them."""
    dec_files = sorted(glob.glob(os.path.join(dec_dir, '*.txt')))
    ref_files = sorted(glob.glob(os.path.join(ref_dir, '*.txt')))
    # Check the file lists before zipping; zip would otherwise silently truncate.
    if len(dec_files) != len(ref_files):
        raise ValueError("Hypotheses and references don't have equal lengths")
    reference = []
    decoded = []
    for dec_path, ref_path in zip(dec_files, ref_files):
        dec_tex = ''
        ref_tex = ''
        with open(dec_path) as fin:
            for line in fin:
                dec_tex = dec_tex + line.strip()
        if len(dec_tex) == 0:
            dec_tex = ' '  # avoid empty hypothesis strings
        with open(ref_path) as fin:
            for line in fin:
                ref_tex = ref_tex + line
        reference.append(ref_tex)
        decoded.append(dec_tex)
    rouge_dict = rouge.rouge(decoded, reference)
    file_path = os.path.join(self._decode_dir, 'results.txt')
    with open(file_path, 'w') as f:
        for key in rouge_dict:
            print("%s\t%f" % (key, rouge_dict[key]), file=f)
        bleu_score = bleu.moses_multi_bleu(decoded, reference)
        print("%s\t%f" % ('bleu', bleu_score), file=f)
    tf.logging.info("BLEU, ROUGE values saved to results.txt")
def moses_bl_rouge(p, l):
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print('Moses BLEU: %f\n'
          'ROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\n'
          'ROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\n'
          'ROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
          % (bl,
             x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
             x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
             x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
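# --- Usage sketch (assumption: the project-local metric modules used above are
# importable as `rouge` and `bleu`, i.e. rouge.rouge(hyps, refs) returns a dict
# keyed like 'rouge_1/f_score' and bleu.moses_multi_bleu(hyps, refs) returns a float).
# Both take parallel lists of plain-text sentences.
hyps = ['the cat sat on the mat', 'a quick brown fox']
refs = ['the cat is on the mat', 'the quick brown fox jumps']
moses_bl_rouge(hyps, refs)  # prints Moses BLEU plus ROUGE-1/2/L precision, recall, F1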
def np_rouge(val, ref, start, end):
    def trim_seq(seq, start, end):
        seq = seq[list(seq).index(start) + 1:] if start in seq else seq
        seq = seq[:list(seq).index(end)] if end in seq else seq
        return np.trim_zeros(seq, 'b')

    val, ref = list(val), list(ref)
    for i in range(len(val)):
        val[i] = " ".join(str(c) for c in trim_seq(val[i], start, end))
        ref[i] = " ".join(str(c) for c in trim_seq(ref[i], start, end))
    return rouge(val, ref)
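# --- Worked example for np_rouge (sketch; assumes `rouge` here is the project-local
# rouge(hypotheses, references) function used above, and that id sequences are padded
# with trailing zeros). With start=2 and end=3, each sequence is trimmed independently:
#   [2, 9, 4, 7, 3, 0, 0] -> drop everything up to and including start 2, cut at end 3,
#   trim trailing zeros -> "9 4 7"
val_ids = np.array([[2, 9, 4, 7, 3, 0, 0]])
ref_ids = np.array([[2, 9, 4, 6, 3, 0, 0]])
scores = np_rouge(val_ids, ref_ids, start=2, end=3)  # dict with 'rouge_l/f_score', etc.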
def _rouge(ref_file, summarization_file, mode="brief"):
    """Compute ROUGE scores, handling BPE."""
    results = {}
    references = []
    role_tokens = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            ref, role = process_dialogue_infer(line.rstrip(), get_role_token=True)
            references.append(ref)
            role_tokens.append(role)

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file, "rb")) as fh:
        for line in fh:
            hypotheses.append(line)

    rouge_score_map = rouge.rouge(hypotheses, references)
    results["all"] = 100 * rouge_score_map["rouge_l/f_score"]
    if mode == "brief":
        return results["all"]

    # Per-role ROUGE-L: score only the (hypothesis, reference) pairs for each role.
    for role in ROLE_TOKENS:
        _sub_ref_texts = []
        _sub_hypos = []
        for _r, _t, _role in zip(references, hypotheses, role_tokens):
            if _role == role:
                _sub_ref_texts.append(_r)
                _sub_hypos.append(_t)
        rouge_score_map = rouge.rouge(_sub_hypos, _sub_ref_texts)
        results[role] = 100 * rouge_score_map["rouge_l/f_score"]
    return results
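# --- Usage sketch for the role-aware _rouge (assumptions: ROLE_TOKENS and
# process_dialogue_infer come from the surrounding module; file paths are illustrative):
#   overall = _rouge('refs.infer.txt', 'hyps.txt')               # mode="brief": single
#                                                                # ROUGE-L F1 on a 0-100 scale
#   per_role = _rouge('refs.infer.txt', 'hyps.txt', mode='all')  # dict: {'all': ..., <role>: ...}
#                                                                # one ROUGE-L F1 per speaker role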
def _rouge(ref_file, summarization_file):
    """Compute ROUGE scores, handling BPE."""
    references = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(ref_file, "rb")) as fh:
        for line in fh:
            references.append(process_dialogue_infer(line))

    hypotheses = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(summarization_file, "rb")) as fh:
        for line in fh:
            hypotheses.append(line)

    rouge_score_map = rouge.rouge(hypotheses, references)
    return 100 * rouge_score_map["rouge_l/f_score"]
def evaluate(dataset_f, predictions_f, all_metrics=False, save_dir=""):
    with open(dataset_f) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(predictions_f) as prediction_file:
        predictions = json.load(prediction_file)

    gt = []
    pred = []
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            # Needs a title lookup in case of dev-v1.1.json: skip articles without predictions.
            if str(article['title']) not in predictions:
                continue
            for qa in paragraph['qas']:
                total += 1
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                if str(qa['id']) not in predictions:
                    prediction = ""
                else:
                    prediction = predictions[str(qa['id'])]
                if prediction == "":
                    prediction = 'n_a'
                gt.append(ground_truths[0])
                pred.append(prediction)
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    if all_metrics:
        rouge_dict = rouge(pred, gt)
        file_path = os.path.join(save_dir, 'results.txt')
        with open(file_path, 'w') as f:
            for key in rouge_dict:
                print("%s\t%f" % (key, rouge_dict[key]), file=f)
            bleu_score = moses_multi_bleu(pred, gt)
            print("%s\t%f" % ('bleu', bleu_score), file=f)
            print("%s\t%f" % ('f1', f1), file=f)
            print("%s\t%f" % ('exact_match', exact_match), file=f)

    return exact_match, f1
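# --- Input-shape sketch for evaluate() (assumptions: the standard SQuAD helpers
# metric_max_over_ground_truths / exact_match_score / f1_score are in scope, and
# rouge / moses_multi_bleu are the same project-local metrics used above).
# dataset_f is SQuAD-style JSON; predictions_f maps question ids to answer strings,
# plus a title key to satisfy the dev-v1.1 lookup above.
toy_dataset = {'data': [{'title': 'Example',
                         'paragraphs': [{'qas': [{'id': 'q1',
                                                  'answers': [{'text': 'blue'}]}]}]}]}
toy_predictions = {'Example': 'seen', 'q1': 'blue'}
with open('toy_dataset.json', 'w') as fp:
    json.dump(toy_dataset, fp)
with open('toy_predictions.json', 'w') as fp:
    json.dump(toy_predictions, fp)
em, f1_val = evaluate('toy_dataset.json', 'toy_predictions.json')  # -> (100.0, 100.0)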
def target_based_np_rouge(val, ref, start, end, tz=True):
    def trim_seqs(val, ref, start, end):
        # Trim both sequences from the position of the start token in the *reference*.
        start_idx = list(ref).index(start) + 1 if start in ref else 0
        val = val[start_idx:]
        ref = ref[start_idx:]
        val = val[:list(val).index(end)] if end in val else val
        ref = ref[:list(ref).index(end)] if end in ref else ref
        if tz:
            val = np.trim_zeros(val, 'b')
            ref = np.trim_zeros(ref, 'b')
        return val, ref

    val, ref = list(val), list(ref)
    for i in range(len(val)):
        sval, sref = trim_seqs(val[i], ref[i], start, end)
        sval = " ".join(str(c) for c in sval)
        sref = " ".join(str(c) for c in sref)
        val[i] = sval
        ref[i] = sref
    return rouge(val, ref)
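# --- Contrast with np_rouge above (sketch; same rouge() assumption as that example):
# here the start index is taken from the reference and applied to both sequences, so
# hypothesis and reference stay aligned even if the hypothesis never emitted the start
# token. With start=2, end=3:
#   ref [5, 2, 9, 4, 3, 0] -> "9 4"
#   val [5, 8, 9, 7, 3, 0] -> "9 7"   (also cut at the position of 2 in ref)
scores = target_based_np_rouge(np.array([[5, 8, 9, 7, 3, 0]]),
                               np.array([[5, 2, 9, 4, 3, 0]]),
                               start=2, end=3)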
def calculate_metrics_results(results: dict):
    for data in results.keys():
        references = []
        translations = []
        references_rouge = []
        translations_rouge = []
        wer_total = 0.0
        meteor_total = 0.0
        for video in results[data].keys():
            translation = results[data][video]["prediction_sentence"]
            if '</s>' in translation:
                translation.remove('</s>')
            translation = " ".join(translation)
            reference = results[data][video]["target_sentence"]
            if '</s>' in reference:
                reference.remove('</s>')
            reference = " ".join(reference)
            wer_total += jiwer.wer(truth=reference, hypothesis=translation)
            meteor_total += single_meteor_score(reference, translation)
            translations.append(translation.split(" "))
            translations_rouge.append(translation)
            references.append([reference.split(" ")])
            references_rouge.append(reference)
        print(len(references))  # sanity check: number of scored examples
        rouge_score_map = rouge.rouge(translations_rouge, references_rouge)
        print(data + ' rouge: ' + str(100 * rouge_score_map["rouge_l/f_score"]))
        print(data + ' WER: ' + str((wer_total / len(references)) * 100))
        print(data + ' Meteor: ' + str((meteor_total / len(references)) * 100))
        for max_ in range(1, 5):
            bleu_score, _, _, _, _, _ = bleu.compute_bleu(
                references, translations, max_order=max_)
            print(data + ' bleu: ' + str(max_) + " " + str(bleu_score * 100))
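# --- Expected input shape for calculate_metrics_results (sketch; the nesting is
# inferred from the lookups above: split -> video id -> token lists ending in '</s>',
# and jiwer, nltk's single_meteor_score and the local rouge/bleu modules are assumed
# importable under the versions the function was written against).
toy_results = {
    'dev': {
        'video_0001': {
            'prediction_sentence': ['the', 'weather', 'is', 'nice', '</s>'],
            'target_sentence': ['the', 'weather', 'is', 'good', '</s>'],
        },
    },
}
calculate_metrics_results(toy_results)  # prints ROUGE-L, WER, METEOR and BLEU-1..4 per split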
def evaluate(infer, ref, inferred_spans, ref_spans):
    bl = cal_bleu(infer, ref)
    x = rouge.rouge(infer, ref)
    f, e, total = f1.evaluate(inferred_spans, ref_spans)
    return (bl,
            x['rouge_1/f_score'] * 100,
            x['rouge_2/f_score'] * 100,
            x['rouge_l/f_score'] * 100,
            f, e, total)
def cal_rouge(infer, ref):
    x = rouge.rouge(infer, ref)
    return (x['rouge_1/f_score'] * 100,
            x['rouge_2/f_score'] * 100,
            x['rouge_l/f_score'] * 100)