-
Notifications
You must be signed in to change notification settings - Fork 0
/
BayesClassifier.py
executable file
·158 lines (125 loc) · 4.88 KB
/
BayesClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# Name: Parker Woodworth and Will Potter
# Date: 03/21/2013
# Description: Our awesome BayesClassifier (improved)
#
#
import math, os, pickle
from DataReader import *
class BayesClassifier:
    """Naive Bayes sentiment classifier.

    Counts word occurrences per document label during training, then
    classifies new text by picking the label with the highest
    log-probability: log p(label) + sum over words of log p(word|label).
    """

    def __init__(self):
        '''This method initializes the Naive Bayes classifier'''
        # label -> {word -> occurrence count} over the training documents
        self.word_counts = {}
        # label -> number of training documents carrying that label
        self.docs = {}
        # label -> total number of word tokens seen for that label
        # (denominator of p(word|label) in classify())
        self.word_sums = {}
        # total number of training documents across all labels
        self.total_docs = 0

    def train(self, dataFile):
        '''Trains the Naive Bayes Sentiment Classifier.

        Reads (label, word-list) pairs from DataReader until exhausted,
        accumulates per-label document and word counts, then pickles the
        model to dataFile + ".pickle".

        Fixes vs. original: the first read was outside the try, so an
        empty data file raised StopIteration uncaught and the model was
        never finalized or saved.
        '''
        dr = DataReader(dataFile)
        try:
            while True:
                # next(dr) calls the reader's next()/__next__ and raises
                # StopIteration when the data is exhausted.
                label, data = next(dr)
                self.total_docs += 1
                self.docs[label] = self.docs.get(label, 0) + 1
                counts = self.word_counts.setdefault(label, {})
                for word in data:
                    counts[word] = counts.get(word, 0) + 1
        except StopIteration:
            pass
        # Finalize: total word tokens per label.
        for label, counts in self.word_counts.items():
            self.word_sums[label] = sum(counts.values())
        self.save(dataFile + ".pickle")

    def classify(self, sText):
        '''Given a target string sText, this function returns the most likely document
        class to which the target string belongs (i.e., positive or negative ),
        as a (label, log_probability) pair.
        '''
        words = tokenize(sText)
        probs = {}
        for label, counts in self.word_counts.items():
            # Start with the prior, p(label).
            score = math.log(float(self.docs[label]) / float(self.total_docs))
            denom = float(self.word_sums[label])
            for word in words:
                if word in counts:
                    score += math.log(float(counts[word]) / denom)
                else:
                    # Ad-hoc fixed penalty for unseen words (kept from the
                    # original; not true Laplace smoothing).
                    score += math.log(.05)
            probs[label] = score
        if not probs:
            # Untrained classifier: preserve the original's (False, False).
            return False, False
        # Fix: the original used prob_label == False as the "unset"
        # sentinel, which misfires when a label is itself falsy.
        best_label = max(probs, key=probs.get)
        return best_label, probs[best_label]

    def test(self, dataName, logFilename):
        ''' Tests against dataName and logs to logFilename.

        Writes per-label accuracy (fraction of that label's documents
        classified correctly) to logFilename, one "label value" line
        each, and returns the overall accuracy as a float.

        Fixes vs. original: while(label, data) tested a tuple, which is
        always truthy; the log file leaked on an early exception; an
        empty data set raised ZeroDivisionError.
        '''
        dr = DataReader(dataName)
        correct = 0
        total = 0
        found_counts = {}   # label -> correctly classified count
        actual_counts = {}  # label -> documents actually seen
        with open(logFilename, 'w') as log:
            try:
                while True:
                    label, data = next(dr)
                    actual_counts[label] = actual_counts.get(label, 0) + 1
                    total += 1
                    # Original joined words with a trailing space per word.
                    text = "".join(w + " " for w in data)
                    bayes_label, _ = self.classify(text)
                    if bayes_label == label:
                        found_counts[bayes_label] = found_counts.get(bayes_label, 0) + 1
                        correct += 1
            except StopIteration:
                pass
            for label, seen in actual_counts.items():
                hits = found_counts.get(label)
                if hits is None:
                    log.write(label + " 0\n")
                else:
                    log.write(label + " " + str(float(hits) / seen) + "\n")
        if total == 0:
            return 0.0
        return float(correct) / total

    def save(self, sFilename):
        '''Save the learned data during training to a file using pickle.

        Fix: pickle streams are binary — the file must be opened in
        "wb" (text mode breaks on Python 3 and risks newline-mangling
        corruption elsewhere). The with-block guarantees the handle is
        closed even if a dump fails.
        '''
        with open(sFilename, "wb") as f:
            p = pickle.Pickler(f)
            p.dump(self.word_counts)
            p.dump(self.docs)
            p.dump(self.word_sums)
            p.dump(self.total_docs)

    def load(self, sFilename):
        '''Given a file name of stored data, load and return the object stored in the file.

        Fix: opened in "rb" to match save(). NOTE: pickle can execute
        arbitrary code — only load model files you wrote yourself.
        '''
        with open(sFilename, "rb") as f:
            u = pickle.Unpickler(f)
            self.word_counts = u.load()
            self.docs = u.load()
            self.word_sums = u.load()
            self.total_docs = u.load()