"""
Computes the BLEU, ROUGE
using the COCO metrics scripts
"""
from bleu import Bleu
from rouge import Rouge
import glob
def load_textfiles(references, hypothesis):
    """Turn parallel lists of sentences into the {id: [sentence]} dicts the scorers expect."""
    hypo = {idx: [line.strip()] for (idx, line) in enumerate(hypothesis)}
    # take out newlines before creating the dictionary; each id maps to a single reference
    refs = {idx: [line.strip()] for (idx, line) in enumerate(references)}
    # sanity check that we have the same number of references as hypotheses
    if len(hypo) != len(refs):
        raise ValueError("There is a sentence count mismatch between the inputs")
    return refs, hypo
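
# For example (hypothetical sentences, for illustration only),
# load_textfiles(['a cat sat on the mat'], ['a cat sits on the mat'])
# returns ({0: ['a cat sat on the mat']}, {0: ['a cat sits on the mat']}).

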
def score(ref, hypo):
    """
    ref, dictionary of reference sentences (id, sentence)
    hypo, dictionary of hypothesis sentences (id, sentence)
    score, dictionary of scores
    """
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Rouge(), "ROUGE_L"),
    ]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
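
# For example (hypothetical sentences), score({0: ['a cat sat on the mat']},
# {0: ['a cat sits on the mat']}) returns a dict with the keys
# 'Bleu_1' .. 'Bleu_4' and 'ROUGE_L'; the values depend on the underlying scorers.

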
def evaluate(refs, hyps):
    """Score each (reference, hypothesis) pair and print and return the averaged metrics."""
    BLEU_1 = 0.
    BLEU_2 = 0.
    BLEU_3 = 0.
    BLEU_4 = 0.
    ROUGE_L = 0.
    num_files = 0
    print('Start evaluating BLEU and ROUGE')
    for reference, hypothesis in zip(refs, hyps):
        num_files += 1
        ref, hypo = load_textfiles([reference], [hypothesis])
        score_map = score(ref, hypo)
        print(score_map)
        BLEU_1 += score_map['Bleu_1']
        BLEU_2 += score_map['Bleu_2']
        BLEU_3 += score_map['Bleu_3']
        BLEU_4 += score_map['Bleu_4']
        ROUGE_L += score_map['ROUGE_L']
    print('\nAverage over all pairs:')
    print('Bleu - 1gram:', BLEU_1 / num_files)
    print('Bleu - 2gram:', BLEU_2 / num_files)
    print('Bleu - 3gram:', BLEU_3 / num_files)
    print('Bleu - 4gram:', BLEU_4 / num_files)
    print('Rouge:', ROUGE_L / num_files)
    return BLEU_1 / num_files, BLEU_2 / num_files, BLEU_3 / num_files, BLEU_4 / num_files, ROUGE_L / num_files
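
# A minimal usage sketch (hypothetical sentences, not part of the original script):
# each reference/hypothesis pair is scored on its own and the averages are printed.
#
# refs = ['the cat sat on the mat', 'a dog barked at the mailman']
# hyps = ['a cat is sitting on the mat', 'the dog barked at a mailman']
# bleu1, bleu2, bleu3, bleu4, rouge_l = evaluate(refs, hyps)

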
# if __name__ == '__main__':
#     # Feed in the directories where the hypothesis summaries and true summaries are stored
#     hyp_file = glob.glob('hypothesis/*')
#     ref_file = glob.glob('reference/*')
#     hyp_file = sorted(hyp_file, key=lambda x: x.split('.')[0][-1])
#     ref_file = sorted(ref_file, key=lambda x: x.split('.')[0][-1])
#
#     BLEU_1 = 0.
#     BLEU_2 = 0.
#     BLEU_3 = 0.
#     BLEU_4 = 0.
#     ROUGE_L = 0.
#     num_files = 0
#     for reference_file, hypothesis_file in zip(ref_file, hyp_file):
#         num_files += 1
#         print(reference_file, hypothesis_file)
#
#         with open(reference_file) as rf:
#             reference = rf.readlines()
#
#         with open(hypothesis_file) as hf:
#             hypothesis = hf.readlines()
#
#         print(reference)
#         print(hypothesis)
#
#         ref, hypo = load_textfiles(reference, hypothesis)
#         score_map = score(ref, hypo)
#         BLEU_1 += score_map['Bleu_1']
#         BLEU_2 += score_map['Bleu_2']
#         BLEU_3 += score_map['Bleu_3']
#         BLEU_4 += score_map['Bleu_4']
#         ROUGE_L += score_map['ROUGE_L']
#
#     print('\nAverage Metric Score for All Review Summary Pairs:')
#     print('Bleu - 1gram:', BLEU_1 / num_files)
#     print('Bleu - 2gram:', BLEU_2 / num_files)
#     print('Bleu - 3gram:', BLEU_3 / num_files)
#     print('Bleu - 4gram:', BLEU_4 / num_files)
#     print('Rouge:', ROUGE_L / num_files)