-
Notifications
You must be signed in to change notification settings - Fork 2
/
evaluate_multiple.py
104 lines (93 loc) · 3.39 KB
/
evaluate_multiple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/python
# -*- coding: utf-8 -*-
from sklearn import cluster
import numpy as np
import nltk
import os
from nltk.corpus import stopwords
from clustering import kcluster
from nltk.stem.snowball import EnglishStemmer
from svd import svd
import feature_extract
import raketr
import re
#import rake
import sys
# Add an extra directory to NLTK's data search path (nltk.data.path).
# NOTE(review): this is a hard-coded, machine-specific absolute path; confirm
# it exists on the machine running the evaluation, or provide the NLTK data
# location another way (e.g. the NLTK_DATA environment variable).
nltk.data.path.append('/home/jocelyn/usb/nltk_data')
def get_stemmed_keywords(keywords):
    """Return the stemmed form of every keyword phrase in ``keywords``.

    Each phrase is split on whitespace, every word is run through NLTK's
    ``EnglishStemmer``, and the stems are joined back together with single
    spaces and encoded as ASCII. The result has one entry per input phrase,
    in the same order.
    """
    stem = EnglishStemmer().stem
    stemmed_phrases = []
    for phrase in keywords:
        # Stem word by word so multi-word phrases keep their word boundaries.
        stems = [stem(token) for token in phrase.split()]
        stemmed_phrases.append(' '.join(stems).encode('ascii'))
    return stemmed_phrases
def main():
    """Evaluate one keyword-extraction method on the SemEval-2010 test files.

    Usage: ``evaluate_multiple.py <method>`` where ``<method>`` is one of
    ``svd``, ``raketr`` or ``cluster``.

    The files in ``data/maui-semeval2010-test/`` are visited in sorted order.
    A file whose name ends in ``key`` is read as the manually assigned
    keywords (one per line, stemmed before use); a file whose name ends in
    ``txt`` is a document whose extracted keywords are stemmed and compared
    against the most recently loaded manual keywords. Per-document precision
    and recall are averaged over all documents and printed together with the
    resulting F-measure.

    Exits with status 1 when the method argument is missing or not
    recognised, or when a document is reached while no manual keywords are
    loaded.
    """
    semeval_dir = 'data/maui-semeval2010-test/'
    # ignored due to issue on Mac or empty keyfile
    skipped_files = {'H-5.key', 'C-86.key', 'H-5.txt', 'C-86.txt'}

    # Map each accepted method name to a call taking the document text.
    extractors = {
        'svd': lambda text: svd(text, 1, False),
        'raketr': lambda text: raketr.main(text, False),
        'cluster': lambda text: kcluster(text, 6, 15, False),
    }

    # Validate the command line once, before touching any data file, so a
    # missing or misspelled method fails immediately with a usage message.
    method = sys.argv[1] if len(sys.argv) > 1 else None
    if method not in extractors:
        print('methods accepted: svd raketr cluster')
        sys.exit(1)
    extract = extractors[method]

    manual_keywords = []
    last_key_file = None
    total_precision = 0.0
    total_recall = 0.0
    total_docs = 0

    for filename in sorted(os.listdir(semeval_dir)):
        if filename in skipped_files:
            continue
        path = os.path.join(semeval_dir, filename)

        if filename.endswith('key'):
            last_key_file = filename
            with open(path, 'r') as key_file:
                key_lines = key_file.read().splitlines()
            # Raises on non-ASCII key lines rather than silently mismatching.
            key_lines = [word.encode('ascii') for word in key_lines]
            manual_keywords = get_stemmed_keywords(key_lines)

        elif filename.endswith('txt'):
            total_docs += 1
            print(filename)

            # Without manual keywords recall is undefined; stop before
            # spending time on the extraction itself.
            if not manual_keywords:
                print(filename)
                print(last_key_file)
                print('^^^^ issue with this file ^^^^')
                sys.exit(1)

            # Single context-managed open: the handle is always closed.
            with open(path, 'r') as doc_file:
                content = doc_file.read()

            keywords = extract(content)
            print(keywords)
            print('-' * 100)

            # Build the lookup set once per document, not once per keyword.
            manual_set = set(manual_keywords)
            stemmed_keywords = get_stemmed_keywords(keywords)
            correct = sum(1 for keyword in stemmed_keywords
                          if keyword in manual_set)

            # An extractor that returns nothing contributes zero precision
            # instead of raising ZeroDivisionError.
            num_extracted = len(keywords)
            if num_extracted:
                total_precision += correct / float(num_extracted)
            total_recall += correct / float(len(manual_keywords))

    # Average over the documents seen; stay at 0.0 when there were none.
    if total_docs:
        total_precision /= total_docs
        total_recall /= total_docs

    # The F-measure is defined as 0 when precision and recall are both 0.
    precision_plus_recall = total_precision + total_recall
    if precision_plus_recall:
        total_fmeasure = round(
            2 * total_precision * total_recall / precision_plus_recall, 5)
    else:
        total_fmeasure = 0.0

    print('total docs: ' + str(total_docs))
    print('total precision: ' + str(total_precision))
    print('total recall: ' + str(total_recall))
    print('total fmeasure: ' + str(total_fmeasure))
# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    main()