-
Notifications
You must be signed in to change notification settings - Fork 0
/
clf.py
46 lines (38 loc) · 1.5 KB
/
clf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import os
# Directory that contains this script; the review corpus is looked up next to it.
path = os.path.dirname(os.path.realpath(__file__))
# Load the movie reviews from file, restricted to the "pos" and "neg" categories.
# os.path.join builds the corpus path with the platform's separator instead of
# a hard-coded "/".
data = datasets.load_files(
    os.path.join(path, "reviews"),
    shuffle=False,
    categories=["pos", "neg"],
)
# Class label index of every loaded review
y = data.target
# Vectorize the movie reviews using our 8 words
vect = CountVectorizer(
    vocabulary=[
        "awful",
        "bad",
        "boring",
        "dull",
        "effective",
        "enjoyable",
        "great",
        "hilarious",
    ]
)
# One row per review, one column per vocabulary word (occurrence counts),
# converted from the sparse result to a dense array.
X = vect.fit_transform(data.data)
X = X.toarray()
# Bernoulli naive Bayes evaluated with shuffled 10-fold cross-validation.
# `binarize` is a numeric threshold, not an on/off flag: feature values
# strictly above it become 1, everything else 0. Passing True is treated as
# 1.0, which mapped a word occurring exactly once to "absent"; 0.0 encodes
# plain presence/absence of each word.
clf = BernoulliNB(binarize=0.0)
kf = KFold(n_splits=10, shuffle=True)
# Perform cross-validation, accumulating the accuracy on each held-out fold
score = 0
for train, test in kf.split(X, y):
    # fit() re-estimates the model from scratch for every fold. partial_fit()
    # kept the statistics of earlier folds, so later test folds had already
    # been used for training and the reported accuracy was inflated.
    clf.fit(X[train], y[train])
    score += clf.score(X[test], y[test])
# Calculate average prediction accuracy over the number of folds actually run
score = score / kf.get_n_splits()
print("Bernoulli Average Score: {0:.5f}".format(score))
# Multinomial naive Bayes evaluated with shuffled 10-fold cross-validation
# on the raw word counts.
clf = MultinomialNB()
kf = KFold(n_splits=10, shuffle=True)
# Perform cross-validation, accumulating the accuracy on each held-out fold
score = 0
for train, test in kf.split(X, y):
    # fit() re-estimates the model from scratch for every fold. partial_fit()
    # kept the statistics of earlier folds, so later test folds had already
    # been used for training and the reported accuracy was inflated.
    clf.fit(X[train], y[train])
    score += clf.score(X[test], y[test])
# Calculate average prediction accuracy over the number of folds actually run
score = score / kf.get_n_splits()
print("MultinomialNB Average Score: {0:.5f}".format(score))