/
Evaluation.py
176 lines (149 loc) · 5.95 KB
/
Evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import pickle
from nltk.corpus.reader.xmldocs import XMLCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import numpy as np
import pickle
from textstat.textstat import textstat
from bs4 import BeautifulSoup
import HTMLParser
from pattern.en import parse
from nltk.util import bigrams
def gender_feature(text, feature_vect):
    """
    Extract the gender features.

    :param text: raw text of one document
    :param feature_vect: sequence whose first 29 entries are the bag-of-words
        terms and whose remaining entries are the bigrams to look for
    :return: a dictionary which maps each feature to its computed value:
        'sentence_length_variation', one boolean per bag-of-words term,
        one boolean per bigram and one count per retained POS tag
    """
    # map_tag is not imported at file level; bring it into scope here so the
    # POS features below do not fail with a NameError.
    from nltk.tag import map_tag

    # sentence length and vocab features
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    sentences = sent_tokenize(lowered)
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    # bag_of_word features (set built once for constant-time membership tests)
    token_set = set(tokens)
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in token_set

    # bigrams features (bigrams computed once instead of once per feature)
    bigram_set = set(bigrams(tokens))
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigram_set

    # POS tagging features
    # NOTE(review): parse() is already asked for the UNIVERSAL tagset and the
    # resulting tags are then mapped from 'en-ptb' to 'universal' again --
    # confirm this double mapping is intended.
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1]))
                              for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word
                             if tag[1] in POS_tag)

    # Merge with update() rather than dict(a, **b): the bigram keys are not
    # strings, and non-string keyword names are rejected by Python 3.
    # Later groups override earlier ones, as before.
    features = {'sentence_length_variation': words_per_sent.std()}
    features.update(bag_dict)
    features.update(bigram_dict)
    features.update(freq_POS)
    return features
def age_feature(text, feature_vect):
    """
    Extract age features.

    :param text: raw text of one document
    :param feature_vect: contains a bag of words
    :return: a dictionary which contains, for every word of feature_vect, a
        'contains(<word>)' boolean telling whether the word occurs among the
        lower-cased tokens of the text, plus the 'FRE' (Flesch reading ease)
        and 'FKGL' (Flesch-Kincaid grade) scores computed by textstat
    """
    # Build the vocabulary set once; rebuilding it for every word made the
    # loop quadratic for no benefit.
    vocabulary = set(word_tokenize(text.lower()))
    features = {}
    for word in feature_vect:
        features['contains(%s)' % word] = word in vocabulary
    # The readability scores are added last so they win on a key clash.
    features['FRE'] = textstat.flesch_reading_ease(text)
    features['FKGL'] = textstat.flesch_kincaid_grade(text)
    return features
def create_feature_vect(file_name):
    """
    Load, for one class, the list of elements needed by the feature
    extractor function, and echo it to standard output.

    :param file_name: path of the pickle file containing the desired list
    :return: feature list
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that come from a trusted source.
    # NOTE(review): the file is opened in the default (text) mode, as before;
    # presumably the pickles were written in text mode too -- confirm before
    # switching to 'rb'.
    # The with-block guarantees the handle is closed even if loading fails.
    with open(file_name) as fp2:
        feature_vect = pickle.load(fp2)
    print(feature_vect)
    return feature_vect
def fetch_text(doc):
    """
    Build the plain text of a document from its parsed XML tree.

    The "count" attribute of ``doc[0]`` gives the number of children of
    ``doc[0]`` to read; their texts are joined with single spaces (children
    without text are skipped) and the result is reduced to its text content
    with ``BeautifulSoup(...).get_text()``.

    :param doc: parsed XML document
    :return: processed text, or "" when BeautifulSoup raises HTMLParseError
    """
    container = doc[0]
    pieces = []
    for idx in range(int(container.attrib["count"])):
        piece = container[idx].text
        if piece is not None:
            pieces.append(piece)
    joined = " ".join(pieces)
    try:
        return BeautifulSoup(joined).get_text()
    except HTMLParser.HTMLParseError:
        return ""
def extract_true_pred(file):
    """
    Extract the true gender and age from the truth-en.txt file provided with
    the test corpus.

    Each useful line is split on ":::"; the first field names the document,
    the second field and the first three characters of the third field are
    stored as its labels.

    :param file: path of the truth-en.txt file
    :return: dictionary keyed by xml file name (first field followed by
        "_en_XXX_XXX.xml") whose values are two-element lists
        [second field, first three characters of third field]
    """
    true_pred = {}
    # The with-block closes the file even if a line cannot be processed.
    with open(file) as f:
        for line in f:
            fields = line.split(":::")
            if len(fields) < 3:
                # Blank or malformed line (e.g. a trailing newline at the end
                # of the file): nothing to record.
                continue
            true_pred[fields[0] + "_en_XXX_XXX.xml"] = [fields[1], fields[2][:3]]
    return true_pred
def test_set(corpus_dir, feature_extrator, vect_path, i):
    """
    Read and process the test set, extracting the features of each document.

    :param corpus_dir: path of the test set
    :param feature_extrator: function that extracts features; called as
        feature_extrator(text, feature_vect)
    :param vect_path: path of the pickle file handed to create_feature_vect
    :param i: index of the class in the true_pred dictionary values; if 0 it
        refers to the gender, else it refers to the age
    :return: list of (features, true label) pairs, one per document whose
        text is non-empty and has at least one sentence
    """
    vect = create_feature_vect(vect_path)
    reader = XMLCorpusReader(corpus_dir, '.*')
    file_ids = reader.fileids()
    samples = []
    # The truth file path is corpus_dir without its last two characters,
    # followed by "truth-en.txt".
    truth = extract_true_pred(corpus_dir[:-2] + "truth-en.txt")
    for xml_name in file_ids:
        tree = reader.xml(xml_name)
        print(tree[0].attrib["count"])
        txt = fetch_text(tree)
        # Skip documents with no sentence or no text at all.
        if textstat.sentence_count(txt) == 0 or txt == "":
            continue
        labelled = (feature_extrator(txt, vect), truth[xml_name][i])
        samples.append(labelled)
    return samples
def evaluation(test_feature_set, classifier, classes):
    """
    Evaluate a classifier with a feature test list; for each class in classes
    it prints the precision, recall and F-measure, then prints the confusion
    matrix twice (plain, and sorted by count with percentages).

    :param test_feature_set: iterable of (features, true label) pairs
    :param classifier: trained classifier exposing classify(features)
    :param classes: list containing the classes of an attribute (gender or age)
    :return: None
    """
    import collections
    import nltk
    import nltk.metrics

    # Indices of the samples per true label / per predicted label.
    # (Named so they do not shadow the module-level test_set function.)
    reference_sets = collections.defaultdict(set)
    observed_sets = collections.defaultdict(set)
    # Parallel label sequences feeding the confusion matrix.
    reference_labels = []
    observed_labels = []
    for i, (feats, label) in enumerate(test_feature_set):
        observed = classifier.classify(feats)
        reference_sets[label].add(i)
        observed_sets[observed].add(i)
        reference_labels.append(label)
        observed_labels.append(observed)

    # Single-argument print(...) emits the same text under Python 2 and 3.
    for cls in classes:
        reference = reference_sets[cls]
        predicted = observed_sets[cls]
        print('%s  precision: %s' % (cls, nltk.metrics.precision(reference, predicted)))
        print('%s  recall: %s' % (cls, nltk.metrics.recall(reference, predicted)))
        print('%s  F-measure: %s' % (cls, nltk.metrics.f_measure(reference, predicted)))

    cm = nltk.ConfusionMatrix(reference_labels, observed_labels)
    print(cm)
    # Newer NLTK releases expose the formatter as pretty_format; fall back to
    # the older pp name when it is not available.
    pretty = getattr(cm, 'pretty_format', None) or cm.pp
    print(pretty(sort_by_count=True, show_percents=True, truncate=9))
# Evaluate the gender classifier on the test corpus.
# To evaluate the age classifier instead, change the file names and the
# arguments below and uncomment the matching call of the evaluation function.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted
# files.
with open("gender_classifier.pickle") as f:
    classifier = pickle.load(f)
test_feature_set = test_set("pan13-test-corpus1\\en", gender_feature, "gender_words.txt", 0)
# The with-block closes (and therefore flushes) the output file; a bare
# `f.close` without parentheses never actually closes it.
with open("test_feature_set.pickle", 'w') as f:
    pickle.dump(test_feature_set, f)
print(nltk.classify.accuracy(classifier, test_feature_set))
#evaluation(test_feature_set, classifier, ['10s', '20s', '30s'])
evaluation(test_feature_set, classifier, ['female', 'male'])