-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
93 lines (67 loc) · 2.9 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, NuSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score
import random
class Classifier:
    """Bag-of-words classifier separating objective from subjective text.

    On construction the training documents are turned into term counts by
    a ``CountVectorizer`` (English stop-word list, ``min_df=3``) and then
    re-weighted by a ``TfidfTransformer``.  No model is fitted until one of
    ``multinomialNB``, ``linearSVC`` or ``nuSVC`` is called; ``predict``,
    ``accurracy`` and ``f1`` require that one of them has been called first
    (``self.classifier`` is ``None`` until then).
    """

    # Numeric class labels used for the training data.
    OBJECTIVE = 0
    SUBJECTIVE = 1

    def __init__(self, objective_data, subjective_data):
        """Build the shuffled training set and its TF-IDF features.

        Args:
            objective_data: sequence of objective training documents
                (strings); it is concatenated with ``subjective_data``
                using ``+``.
            subjective_data: sequence of subjective training documents.
        """
        self.objective_data = objective_data
        self.subjective_data = subjective_data

        text = objective_data + subjective_data
        labels = ([self.OBJECTIVE] * len(objective_data)
                  + [self.SUBJECTIVE] * len(subjective_data))

        # Text and labels must be permuted together: the models are fitted
        # on ``self.frequencies`` (derived from ``self.text``) against
        # ``self.labels``, so both have to end up in the same order.
        self.text, self.labels = self._shuffle_together(text, labels)
        # Backward-compatible alias for the attribute name that previously
        # held the shuffled labels.
        self.label = self.labels

        # Count vectorizer and TF-IDF transformer shared by every model;
        # the specific classifier is chosen later.
        self.count_vectorizer = CountVectorizer(stop_words="english", min_df=3)
        self.counts = self.count_vectorizer.fit_transform(self.text)
        self.classifier = None
        self.tf_transformer = TfidfTransformer(use_idf=True)
        self.frequencies = self.tf_transformer.fit_transform(self.counts)

    @staticmethod
    def _shuffle_together(text, labels):
        """Return ``(text, labels)`` as lists shuffled by one permutation."""
        # list(...) is required because zip() is a one-shot iterator on
        # Python 3 and random.shuffle needs a mutable sequence.
        pairs = list(zip(text, labels))
        random.shuffle(pairs)
        shuffled_text = [document for document, _ in pairs]
        shuffled_labels = [label for _, label in pairs]
        return shuffled_text, shuffled_labels

    def multinomialNB(self):
        """Fit a ``MultinomialNB(alpha=.001)`` model on the training set."""
        self.classifier = MultinomialNB(alpha=.001)
        self.classifier.fit(self.frequencies, self.labels)

    def predict(self, examples):
        """Return the fitted model's predictions for ``examples``.

        ``examples`` goes through the same count vectorizer and TF-IDF
        transformer that were fitted on the training data.
        """
        example_counts = self.count_vectorizer.transform(examples)
        example_tf = self.tf_transformer.transform(example_counts)
        return self.classifier.predict(example_tf)

    def linearSVC(self):
        """Fit a ``LinearSVC`` with default parameters on the training set."""
        self.classifier = LinearSVC()
        self.classifier.fit(self.frequencies, self.labels)

    def nuSVC(self):
        """Fit a ``NuSVC`` with default parameters on the training set."""
        self.classifier = NuSVC()
        self.classifier.fit(self.frequencies, self.labels)

    def accurracy(self, text, labels):
        """Return the fraction of ``text`` whose prediction equals its label.

        Args:
            text: documents to classify.
            labels: expected label for each document, indexable by position.

        Returns:
            Number of correct predictions divided by the number of
            predictions, as a float.
        """
        prediction = self.predict(text)
        correct = sum(
            1 for index, predicted in enumerate(prediction)
            if predicted == labels[index]
        )
        return correct / float(len(prediction))

    # Correctly spelled alias; the original misspelled name stays available.
    accuracy = accurracy

    def f1(self, text, actual):
        """Return ``f1_score(actual, predictions)`` for ``text``.

        The score is computed with ``f1_score``'s default settings.
        """
        prediction = self.predict(text)
        return f1_score(actual, prediction)
def _read_lines(path):
    """Return every line of the text file at ``path``, then close it."""
    with open(path, "r") as handle:
        return handle.readlines()


# Training data: each line of a file is passed on as one document.
objective_text = _read_lines("data/objective_train.data")
subjective_text = _read_lines("data/subjective_train.data")

# create testing data
objective_test = _read_lines("data/objective_test.data")
subjective_test = _read_lines("data/subjective_test.data")
test_data = objective_test + subjective_test
# Expected labels in the same order as test_data: 0 = objective, 1 = subjective.
labels = [0] * len(objective_test) + [1] * len(subjective_test)

c = Classifier(objective_text, subjective_text)

# Single-argument print(...) emits the same output on Python 2 and Python 3.
c.linearSVC()
print("Linear SVC accuracy: %f" % c.accurracy(test_data, labels))
print("Linear SVC F1: %f" % c.f1(test_data, labels))

c.multinomialNB()
print("Multinomial accuracy: %f" % c.accurracy(test_data, labels))
print("Multinomial F1: %f" % c.f1(test_data, labels))

# NuSVC evaluation is left disabled, as in the original script.
# c.nuSVC()
# print("Nu-Support SVM accuracy: %f" % c.accurracy(test_data, labels))
# print("Nu-Support SVM F1: %f" % c.f1(test_data, labels))