#!/usr/bin/env python
# coding: utf-8
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import EnglishStemmer


class Classifier:
    HAM = 0
    SPAM = 1

    def __init__(self, ham_list, spam_list, f_train=None):
        self.ham_list = ham_list    # file list of ham emails
        self.spam_list = spam_list  # file list of spam emails
        self.N_HAM = np.size(ham_list)    # total number of ham emails
        self.N_SPAM = np.size(spam_list)  # total number of spam emails
        self.N = np.asarray([self.N_HAM, self.N_SPAM])
        self.label = np.asarray([self.HAM] * self.N_HAM + [self.SPAM] * self.N_SPAM)
        # container for the vocabulary list
        self.vocab = None
        self.nvocab = 0
        # fraction of emails used for training, 0.8 by default
        self.f_train = f_train if f_train is not None else 0.8
        # [number of training docs in the ham folder, number in the spam folder]
        self.N_TRAINING = np.asarray([int(np.floor(self.N_HAM * self.f_train)),
                                      int(np.floor(self.N_SPAM * self.f_train))])
        self.N_TESTING = self.N - self.N_TRAINING
        self.training_X, self.training_label, self.testing_X, self.testing_label = self.vectorize(self.f_train)
        self.result = None

    # def nwords(self, X):
    #     '''return the number of distinct words in input matrix X by counting
    #     the non-empty columns in X
    #     parameters:
    #         X: 2d numpy array'''
    #     return np.count_nonzero(X.sum(axis=0))

    def vectorize(self, f_train=None):
        '''build the bag-of-words matrices and label arrays for the training
        and testing sets'''
        start_time = time.time()
        if f_train is not None:
            self.f_train = f_train  # otherwise keep the current fraction (0.8 by default)
        # [number of ham in training, number of spam in training]
        self.N_TRAINING = np.asarray([int(np.floor(self.N_HAM * self.f_train)),
                                      int(np.floor(self.N_SPAM * self.f_train))])
        # [number of ham in testing, number of spam in testing]
        self.N_TESTING = self.N - self.N_TRAINING
        print('vectorizing the emails...')
        print('%s %% of emails are used for training...' % (self.f_train * 100))
        # first pass: tokenize the training emails to collect candidate words
        # (roughly: words of four or more characters containing a letter, kept
        # only if they appear in at least 5 and at most 95% of the documents)
        pre = CountVectorizer(input='filename', decode_error='ignore',
                              token_pattern=u'(?ui)\\b\\w*[a-z]+\\w{3,}\\b',
                              max_df=0.95, min_df=5)
        # only the fitted vocabulary is needed from this pass
        pre.fit(self.ham_list[:self.N_TRAINING[self.HAM]]
                + self.spam_list[:self.N_TRAINING[self.SPAM]])
        # get the vocabulary list from the training data
        # (use pre.get_feature_names() on scikit-learn older than 1.0)
        prevocab = pre.get_feature_names_out()
        # word stemming; note that the vectorizers below count raw (unstemmed)
        # tokens, so stemmed forms that differ from every raw token never match
        stemmer = EnglishStemmer()
        stemmed = [stemmer.stem(w) for w in prevocab]
        self.vocab = list(set(stemmed))
        self.nvocab = np.size(self.vocab)
        # training data vectorized with our vocabulary
        training = CountVectorizer(input='filename', decode_error='ignore', vocabulary=self.vocab)
        self.training_X = training.fit_transform(self.ham_list[:self.N_TRAINING[self.HAM]]
                                                 + self.spam_list[:self.N_TRAINING[self.SPAM]]).toarray()
        # testing data vectorized with our vocabulary
        # (assumes f_train < 1, so that both testing slices are non-empty)
        testing = CountVectorizer(input='filename', decode_error='ignore', vocabulary=self.vocab)
        self.testing_X = testing.fit_transform(self.ham_list[-self.N_TESTING[self.HAM]:]
                                               + self.spam_list[-self.N_TESTING[self.SPAM]:]).toarray()
        # create the label arrays
        self.training_label = np.asarray([self.HAM] * self.N_TRAINING[self.HAM]
                                         + [self.SPAM] * self.N_TRAINING[self.SPAM])
        self.testing_label = np.asarray([self.HAM] * self.N_TESTING[self.HAM]
                                        + [self.SPAM] * self.N_TESTING[self.SPAM])
        print('vectorizing done! it took %.2f s' % (time.time() - start_time))
        return self.training_X, self.training_label, self.testing_X, self.testing_label

    def get_training(self):
        '''return the input matrix and labels for the training set
        Output:
            training_X, training_label'''
        return self.training_X, self.training_label

    def get_testing(self):
        '''return the input matrix and labels for the testing set
        Output:
            testing_X, testing_label'''
        return self.testing_X, self.testing_label

    def get_ham(self, X, label):
        '''return the rows of X labeled as ham'''
        return X[np.where(label == self.HAM)]

    def get_spam(self, X, label):
        '''return the rows of X labeled as spam'''
        return X[np.where(label == self.SPAM)]

    def accuracy(self, result=None):
        '''return the fraction of correct predictions on the testing set;
        note that nearest_neighbor() stores a list of three label arrays,
        so pass one of them explicitly, e.g. accuracy(result[0])'''
        if result is None:
            if self.result is None:
                print('there are no results!')
                return np.nan
            result = self.result
        # number of correct predictions / total testing cases
        return np.mean(result == self.testing_label)

    def naive_bayes(self, f_train=None):
        '''classify the testing emails with the multinomial naive Bayes model from
        https://web.stanford.edu/class/cs124/lec/naivebayes.pdf
        decision rule for an email with word counts x:
            label = argmax_c [ log P(c) + sum_w x_w * log P(w|c) ]'''
        if f_train is not None:
            # re-vectorize the data
            self.vectorize(f_train)

        def get_prior():
            '''the prior for the naive Bayes method, which is
            [fraction of ham emails in the training set,
             fraction of spam emails in the training set]'''
            return self.N_TRAINING / self.N_TRAINING.sum()

        def get_conditionals():
            '''the word conditionals P(w|c) for the naive Bayes method, with smoothing'''
            # split the training data by label
            training_ham = self.get_ham(self.training_X, self.training_label)
            training_spam = self.get_spam(self.training_X, self.training_label)
            # conditionals with Laplace smoothing
            con_ham = (training_ham.sum(axis=0) + 1) / (training_ham.sum() + self.nvocab)
            con_spam = (training_spam.sum(axis=0) + 1) / (training_spam.sum() + self.nvocab)
            return np.asarray([con_ham, con_spam])

        print('classifying the testing emails...')
        start_time = time.time()
        prior = get_prior()
        conditionals = get_conditionals()
        # start applying labels to our testing data!
        self.result = np.empty(self.N_TESTING.sum())  # the results of our classifier
        for i in np.arange(self.N_TESTING.sum()):
            # use log likelihoods to avoid numerical underflow
            loglike_ham = np.dot(np.log(conditionals[self.HAM]), self.testing_X[i]) + np.log(prior[self.HAM])
            loglike_spam = np.dot(np.log(conditionals[self.SPAM]), self.testing_X[i]) + np.log(prior[self.SPAM])
            self.result[i] = self.HAM if loglike_ham > loglike_spam else self.SPAM
        print('testing took %.2f s' % (time.time() - start_time))
        return self.result
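
    # A minimal cross-check sketch (not part of the original pipeline):
    # scikit-learn's MultinomialNB implements the same multinomial model with
    # Laplace smoothing, so something like the following should score similarly:
    #
    #     from sklearn.naive_bayes import MultinomialNB
    #     nb = MultinomialNB(alpha=1.0)  # alpha=1.0 is Laplace (add-one) smoothing
    #     nb.fit(self.training_X, self.training_label)
    #     nb.score(self.testing_X, self.testing_label)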

    def nearest_neighbor(self, f_train=None):
        '''classify each testing email with the label of its nearest training
        email under the L1, L2 and L-infinity distances; returns a list of
        three label arrays, one per metric'''
        if f_train is not None:
            # re-vectorize the data
            self.vectorize(f_train)
        print('running classifier...')
        start_time = time.time()

        def calculate_l1_distance(train_row, test_row):
            diff_row = np.absolute(np.subtract(train_row, test_row))  # element-wise |difference|
            return np.sum(diff_row)  # sum of the absolute differences

        def calculate_l2_distance(train_row, test_row):
            diff_row = np.square(np.subtract(train_row, test_row))
            return np.sqrt(np.sum(diff_row))

        def calculate_linf_distance(train_row, test_row):
            diff_row = np.absolute(np.subtract(train_row, test_row))
            return np.amax(diff_row)

        n_test, n_train = len(self.testing_X), len(self.training_X)
        predicted_label_l1 = np.empty(n_test, dtype=int)
        predicted_label_l2 = np.empty(n_test, dtype=int)
        predicted_label_linf = np.empty(n_test, dtype=int)
        for i, test_row in enumerate(self.testing_X):
            # distances from this testing email to every training email
            # (float arrays: the L2 distance is generally not an integer)
            row_distance_l1 = np.empty(n_train)
            row_distance_l2 = np.empty(n_train)
            row_distance_linf = np.empty(n_train)
            for j, train_row in enumerate(self.training_X):
                row_distance_l1[j] = calculate_l1_distance(train_row, test_row)
                row_distance_l2[j] = calculate_l2_distance(train_row, test_row)
                row_distance_linf[j] = calculate_linf_distance(train_row, test_row)
            # index of the nearest training email; argmin breaks ties in favor
            # of the lowest index, i.e. of ham (ham comes first in training_X)
            predicted_label_l1[i] = self.training_label[np.argmin(row_distance_l1)]
            predicted_label_l2[i] = self.training_label[np.argmin(row_distance_l2)]
            predicted_label_linf[i] = self.training_label[np.argmin(row_distance_linf)]
        self.result = [predicted_label_l1, predicted_label_l2, predicted_label_linf]
        print('testing took %.2f s' % (time.time() - start_time))
        return self.result
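
    # A minimal vectorized alternative (a sketch, not used above): all pairwise
    # L2 distances can be computed at once with numpy broadcasting, avoiding
    # the double Python loop:
    #
    #     diff = testing_X[:, None, :] - training_X[None, :, :]
    #     d_l2 = np.sqrt(np.square(diff).sum(axis=2))  # shape (n_test, n_train)
    #     labels = training_label[d_l2.argmin(axis=1)]
    #
    # (memory grows as n_test * n_train * nvocab, so this suits modest corpora)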

    def decision_tree(self, f_train=None):
        '''classify the testing emails with a decision tree: candidate split
        words are those whose mean frequencies differ most between ham and
        spam, and each threshold is chosen by minimizing the Gini index'''
        if f_train is not None:
            # re-vectorize the data
            self.vectorize(f_train)
        SPLIT = 30  # number of candidate thresholds tried per split
        training_ham = self.get_ham(self.training_X, self.training_label)
        training_spam = self.get_spam(self.training_X, self.training_label)
        hmean = training_ham.mean(axis=0)
        smean = training_spam.mean(axis=0)
        freq_diff = abs(hmean - smean)  # difference of each word's frequency in ham and spam
        arg_fdiff = np.flip(np.argsort(freq_diff))  # word indexes, by frequency difference in descending order
        # keep only the words whose frequencies actually differ; these are the
        # first count_nonzero entries since arg_fdiff is in descending order
        arg_list = list(arg_fdiff[:np.count_nonzero(freq_diff > 0)])

        class Node:
            def __init__(self, idx):
                self.idx = idx      # column (word) index this node splits on
                self.value = None   # split threshold
                self.left = None    # subtree or label for counts <= value
                self.right = None   # subtree or label for counts > value

        def get_idx():
            if arg_list:
                return arg_list.pop(0)
            print('Reached maximum number of nodes')
            return None

        def majority_label(labels):
            '''the more common label among labels (spam wins ties)'''
            if np.count_nonzero(labels == self.HAM) / np.size(labels) > 0.5:
                return self.HAM
            return self.SPAM

        def build_tree(rows):
            if np.size(rows) == 1:
                return int(self.training_label[rows[0]])
            idx = get_idx()
            if idx is None:  # no candidate words left: fall back to a leaf
                return majority_label(self.training_label[rows])
            new_N = Node(idx)
            node_col = self.training_X[:, new_N.idx][rows]  # this word's counts on these rows
            node_labels = self.training_label[rows]
            new_N.value = Gini_min(node_col, node_labels)  # the threshold with the lowest Gini index
            l_rows = rows[np.where(node_col <= new_N.value)]  # cut rows by new_N.value
            r_rows = rows[np.where(node_col > new_N.value)]
            if np.size(l_rows) == 0:
                # one side has no data: label the populated side by majority
                # vote and the empty side with the opposite label
                new_N.right = majority_label(self.training_label[r_rows])
                new_N.left = self.SPAM if new_N.right == self.HAM else self.HAM
            elif np.size(r_rows) == 0:
                new_N.left = majority_label(self.training_label[l_rows])
                new_N.right = self.SPAM if new_N.left == self.HAM else self.HAM
            else:
                new_N.left = build_tree(l_rows)
                new_N.right = build_tree(r_rows)
            return new_N

        def Gini_min(col, c):
            '''return the split value with the minimum Gini index, given an
            array of word frequencies and the associated labels c'''
            c_max = np.max(col)  # the largest frequency in the column
            c_min = np.min(col)  # the smallest frequency in the column
            if c_max == c_min:   # if the maximum equals the minimum, all values are equal
                return c_max     # that is the value to split on
            split_value = np.linspace(c_min, c_max, num=SPLIT)  # the candidate thresholds
            gini_idx = Gini_index(split_value, col, c)  # Gini index at each threshold
            return split_value[gini_idx.argmin()]  # the threshold with the minimum Gini index

        # calculate the Gini index for a split dataset
        def Gini_index(cut, col, c):
            lc = c[np.where(col <= cut)]  # labels on the left
            rc = c[np.where(col > cut)]   # labels on the right
            len_left, len_right = len(lc), len(rc)
            len_total = len_left + len_right
            unc = 0.0  # weighted uncertainty
            if len_left > 0:
                l_p_ham = np.count_nonzero(lc == self.HAM) / np.size(lc)
                l_p_spam = np.count_nonzero(lc == self.SPAM) / np.size(lc)
                unc += (1 - (l_p_ham**2 + l_p_spam**2)) * len_left / len_total
            if len_right > 0:
                r_p_ham = np.count_nonzero(rc == self.HAM) / np.size(rc)
                r_p_spam = np.count_nonzero(rc == self.SPAM) / np.size(rc)
                unc += (1 - (r_p_ham**2 + r_p_spam**2)) * len_right / len_total
            return unc
        Gini_index = np.vectorize(Gini_index, excluded=[1, 2])

        def classify(case, N):
            # leaves are plain labels, internal nodes are Node instances
            if N == self.HAM:
                return self.HAM
            elif N == self.SPAM:
                return self.SPAM
            elif case[N.idx] <= N.value:  # same <= convention as build_tree
                return classify(case, N.left)
            else:
                return classify(case, N.right)

        root = build_tree(np.arange(self.training_X.shape[0]))
        self.result = np.empty(self.N_TESTING.sum())
        for i in np.arange(np.size(self.testing_label)):
            self.result[i] = classify(self.testing_X[i], root)
        return self.result
# How to use the classifier:
# Example:
# test = Classifier(HAM_LIST, SPAM_LIST) # initialize
# result = test.naive_bayes()
# test.accuracy()
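
# A runnable sketch of the example above; the 'ham/' and 'spam/' folder names
# and the '*.txt' pattern are assumptions: point the globs at your own email
# folders.
if __name__ == '__main__':
    import glob
    HAM_LIST = sorted(glob.glob('ham/*.txt'))    # hypothetical ham folder
    SPAM_LIST = sorted(glob.glob('spam/*.txt'))  # hypothetical spam folder
    test = Classifier(HAM_LIST, SPAM_LIST)       # vectorizes with f_train = 0.8
    test.naive_bayes()
    print('naive Bayes accuracy: %.3f' % test.accuracy())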