-
Notifications
You must be signed in to change notification settings - Fork 0
/
naivebayes.py
executable file
·119 lines (107 loc) · 5.16 KB
/
naivebayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#! /usr/bin/env python3
__author__ = 'flire'
import sys
import numpy
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from optparse import OptionParser
from tweetsParser import *
def extract_features(filename, sentiment, count):
# result = []
# for tweet in tweetsParser(filename, sentiment, count):
# result += [' '.join(tweet.lemmatized)] # вот тут хардкод
# return result
for tweet in tweetsParser(filename, sentiment, count):
yield " ".join(tweet.lemmatized)
def main(pos_tweets_filename, neg_tweets_filename, tweets_count=float("inf"), pos_label="pos",
neg_label="neg", chunk_size = 10000, features_used=(1,1)):
machine = BernoulliNB()
vectorizer = HashingVectorizer(ngram_range=features_used)
chunk_features = []
# pos_features = extract_features(pos_tweets_filename, pos_label, tweets_count)
# neg_features = extract_features(neg_tweets_filename, neg_label, tweets_count)
# features = vectorizer.fit_transform(pos_features + neg_features)
# labels = [pos_label, ] * len(pos_features) + [neg_label,] * len(neg_features)
# machine.fit(features, labels)
# return (machine, vectorizer, features.toarray())
for pos_features in extract_features(pos_tweets_filename, pos_label, tweets_count):
chunk_features.append(pos_features)
if len(chunk_features) >= chunk_size:
features = vectorizer.fit_transform(chunk_features)
machine.partial_fit(features, [pos_label, ]*len(chunk_features), classes=[pos_label, neg_label])
chunk_features = []
if len(chunk_features)!=0:
features = vectorizer.fit_transform(chunk_features)
machine.partial_fit(features, [pos_label, ]*len(chunk_features), classes=[pos_label, neg_label])
chunk_features = []
for neg_features in extract_features(neg_tweets_filename, neg_label, tweets_count):
chunk_features.append(neg_features)
if len(chunk_features) >= chunk_size:
features = vectorizer.fit_transform(chunk_features)
machine.partial_fit(features, [neg_label, ]*len(chunk_features), classes=[pos_label, neg_label])
chunk_features = []
if len(chunk_features)!=0:
features = vectorizer.fit_transform(chunk_features)
machine.partial_fit(features, [neg_label, ]*len(chunk_features), classes=[pos_label, neg_label])
chunk_features = []
return (machine, vectorizer)
def count_preciseness(machine, vectorizer, source, label):
matched = 0
chunk = []
for tweet in tweetsParser(source, "undef"):
line = " ".join(tweet.lemmatized)
chunk.append(line)
tr = vectorizer.transform(chunk)
result = machine.predict(tr)
for res in result:
# print(res)
if res == label:
matched+=1
return (len(result), matched)
if __name__ == "__main__":
# vectorizer = CountVectorizer(ngram_range=(1,2))
# q = ['qqq www eee', 'qqq eee ttt']
# f = vectorizer.fit_transform(q)
# print(f)
# print(vectorizer.get_feature_names())
# w = ['ttt www eee zzz']
# print(vectorizer.transform(w))
# exit()
parser = OptionParser()
parser.add_option("-c", "--count", type='int', dest="count", default=float("inf"))
parser.add_option("--mystempath", dest="mystempath", default="./mystem")
parser.add_option("-p", "--postweets", dest="postweetsfilename")
parser.add_option("-n", "--negtweets", dest="negtweetsfilename")
parser.add_option("--checkpos", dest="checkposfilename")
parser.add_option("--checkneg", dest="checknegfilename")
parser.add_option("--chunksize", type='int', dest = "chunksize", default=10000)
parser.add_option("--features", dest="features", default="unigrams")
(options, args) = parser.parse_args()
features_used = (1,1)
if options.features == "bigrams":
features_used = (2,2)
elif options.features == "unigrams_bigrams":
features_used = (1,2)
print("learning...")
machine, vectorizer = main(options.postweetsfilename,
options.negtweetsfilename,
options.count,
pos_label="pos",
neg_label="neg",
chunk_size=options.chunksize,
features_used=features_used)
print("checking pos...")
allpos, matchpos = count_preciseness(machine, vectorizer, options.checkposfilename, "pos")
print("checking neg...")
allneg, matchneg = count_preciseness(machine, vectorizer, options.checknegfilename, "neg")
print(matchpos , '/', allpos)
print(matchneg , '/', allneg)
print((matchpos + matchneg) / (allpos + allneg))
# for tweet in tweetsParser(options.postweetsfilename, "undef", 10):
# line = " ".join(tweet.lemmatized)
# tr = vectorizer.transform([line,])
# print(line + " -> " + str(machine.predict(tr)) + "\n")
# for tweet in tweetsParser(options.negtweetsfilename, "undef", 10):
# line = " ".join(tweet.lemmatized)
# tr = vectorizer.transform([line,])
# print(line + " -> " + str(machine.predict(tr)) + "\n")