/
test.py
139 lines (103 loc) · 3.43 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
'''
Created on Sep 10, 2015
@author: Mustafa
'''
from classifiers import TransparentLogisticRegression
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from time import time
from scipy.sparse.construct import diags
from utils import load_imdb
# Dataset location used when no path is supplied on the command line.
DEFAULT_IMDB_DIR = "C:\\Users\\Mustafa\\Desktop\\aclImdb"


def split_word_evidences(ind_evi):
    '''
    Split per-feature evidences into negative and positive feature indices.

    ind_evi: 1-D array-like of numbers; entry j is the evidence of feature j.

    Returns a pair (negative, positive) of index arrays. `negative` holds the
    indices whose evidence is < 0, most negative first; `positive` holds the
    indices whose evidence is > 0, most positive first. Features with an
    evidence of exactly zero appear in neither.
    '''
    ind_evi = np.asarray(ind_evi)
    order = np.argsort(ind_evi)
    # After an ascending sort the negatives form a prefix of `order` and the
    # positives form a suffix, so counting them is enough to slice them out.
    n_negative = np.count_nonzero(ind_evi < 0)
    n_positive = np.count_nonzero(ind_evi > 0)
    return order[:n_negative], order[::-1][:n_positive]


def format_word_evidences(feature_names, ind_evi):
    '''
    Build the "Negative words" / "Positive words" listing for one document.

    feature_names: sequence mapping a feature index to its printable name.
    ind_evi: 1-D array-like with the evidence of every feature.

    Returns a list of strings, one per output line; joining them with "\\n"
    gives the text to print. The positive header starts with a newline so a
    blank line separates the two sections.
    '''
    ind_evi = np.asarray(ind_evi)
    negative, positive = split_word_evidences(ind_evi)
    lines = ["Negative words"]
    lines.extend("%s %s" % (feature_names[j], ind_evi[j]) for j in negative)
    lines.append("\nPositive words")
    lines.extend("%s %s" % (feature_names[j], ind_evi[j]) for j in positive)
    return lines


def print_duration(message, t0):
    '''
    Print `message` (a %-format taking the elapsed seconds since `t0`)
    surrounded by blank lines.
    '''
    # print("") rather than print(): under Python 2 the latter prints "()".
    print("")
    print(message % (time() - t0))
    print("")


def main(data_dir=DEFAULT_IMDB_DIR):
    '''
    Train a TransparentLogisticRegression on the IMDB data found in `data_dir`
    and print, for three test documents, the words that contribute negative
    and positive evidence.

    The three documents are the one with the lowest total evidence, the one
    with the highest total evidence, and the one with the least total
    absolute evidence.
    '''
    # Load the data
    print("Loading the data")
    t0 = time()
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    X_train, y_train, X_test, _, _, test_corpus = load_imdb(
        data_dir, shuffle=True, vectorizer=vect)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    print_duration("Loading took %0.2fs.", t0)

    print("Fitting the classifier")
    t0 = time()
    clf = TransparentLogisticRegression(penalty='l1', C=0.1)
    clf.fit(X_train, y_train)
    print_duration("Fitting took %0.2fs.", t0)

    print("Predicting the evidences")
    t0 = time()
    neg_evi, pos_evi = clf.predict_evidences(X_test)
    print_duration("Predicting evidences took %0.2fs.", t0)

    print("Predicting the probs")
    t0 = time()
    probs = clf.predict_proba(X_test)
    print_duration("Predicting probs took %0.2fs.", t0)

    total_evi = neg_evi + pos_evi
    # NOTE(review): presumably neg_evi is non-positive, which is why only it
    # is passed through abs() -- confirm against predict_evidences.
    total_abs_evi = abs(neg_evi) + pos_evi
    evi_sorted = np.argsort(total_evi)
    abs_evi_sorted = np.argsort(total_abs_evi)

    # Diagonal matrix of the model coefficients: multiplying a document row
    # by it scales each feature value by its coefficient.
    # NOTE(review): assumes X_test is a scipy sparse matrix, for which `*`
    # is the matrix product -- confirm against load_imdb.
    coef_diags = diags(clf.coef_[0], 0)

    reports = (
        ("Most negative", evi_sorted[0]),
        ("Most positive", evi_sorted[-1]),
        ("Least total absolute evidence", abs_evi_sorted[0]),
    )
    for title, i in reports:
        print("")
        print(title)
        print("")
        print("%s %s %s %s" % (total_evi[i], neg_evi[i], pos_evi[i], probs[i]))
        print(test_corpus[i])
        ind_evi = (X_test[i] * coef_diags).toarray()[0]
        print("\n".join(format_word_evidences(feature_names, ind_evi)))


if __name__ == '__main__':
    import sys

    # An optional first argument overrides the default dataset directory.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_IMDB_DIR)