forked from fallingcoconut/cs175b
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
434 lines (336 loc) · 14.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
__author__ = 'Michael'
"""
CS 175 UCI AI Project Class by Michael and Kelvin.
This module contains all the classes and functions used to represent the data, train the classifier, and test the
classifier.
"""
import time
import glob
import random
from nltk.corpus import stopwords
stopwordslist = stopwords.words('english')
from TwitterAPI import TwitterAPI
from TwitterAPI import TwitterRestPager
from datetime import datetime
import pickle
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import ClassificationDataSet
from pybrain.structure.modules import SoftmaxLayer
from pybrain.utilities import percentError
# Twitter API credentials; pull_tweets passes all four to TwitterAPI.
# NOTE(review): these secrets are committed in source control. They should presumably be rotated and
# loaded from the environment or an untracked config file instead -- confirm with the owners.
consumer_key = 'XKHtqeaKlCR9n4BG3MI5cwftj'
consumer_secret = 'NcQXp5pStiFqJnPn3DjJZZyHRaNb7lV01lcOfZ4t42QfoA08pQ'
access_token_key = '168559355-Qi1ARhn8IkDls2MlFYZtzX0fXhNEkX51DEKK0ni1'
access_token_secret = 'BV7mPSXRnXel0frvjsMUh5R2ePmlDriyXj7nGI5P0nZ5d'
class Tweet:
    """
    Small class to hold the tweet information.

    Equality and hashing are both based on ``text`` only, so a set of Tweet objects keeps a single
    entry per distinct tweet text.
    """

    def __init__(self):
        # All three fields start empty; pull_tweets fills them in.
        self.hashtags = None  # list of hashtag strings attached to the tweet
        self.text = None  # tweet body
        self.target = None  # hashtag (tone) the tweet was searched for

    def __eq__(self, other):
        # Comparing against a non-Tweet must not blow up on a missing ``text`` attribute;
        # returning NotImplemented lets Python fall back to its default (unequal) result.
        if not isinstance(other, Tweet):
            return NotImplemented
        return self.text == other.text

    def __hash__(self):
        return hash(self.text)
def pull_tweets(tweets: int, hashtag: str) -> datetime:
    """
    Pulls specified number of tweets and writes them into file as Pickled python sets.

    The output file is ``data/<start time>.txt`` and holds one pickled set of Tweet objects
    (pickle protocol 2). Tweets are de-duplicated through the set, i.e. by their text.

    :param tweets: number of tweets to pull
    :param hashtag: hashtag (without the leading '#') to search for; also stored as each tweet's target
    :return: the datetime at which the pull started (the same value used in the output file name)
    """
    start_time = datetime.now()
    print(start_time)
    api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)
    tweet_set = set()
    # The file is opened before any tweets are pulled (as before), but through a context manager so
    # the handle is released even if the API iteration raises part-way through.
    with open('data/{}{}'.format(str(start_time), '.txt'), 'wb+') as data_file:
        pager = TwitterRestPager(api, 'search/tweets', {'q': '#{}'.format(hashtag), 'count': 100, 'lang': 'en'})
        for item in pager.get_iterator():
            if len(tweet_set) >= tweets:
                break
            if 'text' in item:
                tweet = Tweet()
                tweet.hashtags = [tag['text'] for tag in item['entities']['hashtags']]
                # Newlines are flattened so that every tweet is a single line of text.
                tweet.text = item['text'].replace('\n', ' ')
                tweet.target = hashtag
                if tweet not in tweet_set:
                    tweet_set.add(tweet)
                    print(tweet.hashtags, tweet.text, tweet.target)
                    print(len(tweet_set))
            elif 'message' in item and item.get('code') == 88:
                # Code 88 is reported below as "rate limit exceeded"; wait before asking again.
                # .get() avoids a KeyError for message items that carry no 'code' field.
                print('SUSPEND, RATE LIMIT EXCEEDED: %s\n' % item['message'])
                time.sleep(16 * 60)
        pickle.dump(tweet_set, data_file, 2)
    print(datetime.now() - start_time)
    return start_time
def read_data(filename: str) -> set:
    """
    Helper function for the read_all_data function. Reads one file and returns the set contained in it.

    :param filename: file to read from
    :return: the unpickled contents of the file (a set of Tweet objects for files written by
             pull_tweets), or an empty set when the file is empty
    """
    # NOTE: pickle.load can execute arbitrary code while loading; only point this at files that
    # this program wrote itself.
    with open(filename, 'rb') as f:
        try:
            return pickle.load(f)
        except EOFError:
            # An empty file contains no pickle at all; treat it as "no tweets" rather than failing.
            return set()
# Cache of every tweet loaded from the data files; filled on the first read_all_data call that finds data.
all_data = set()


def read_all_data(tone=None):
    """
    Reads all the data contained in all the files we have dumped so far. Returns a set that corresponds to the
    tweets that have the parameter tone.

    For a given tone, every occurrence of the tone word (case-insensitive) is removed from the text of
    the returned tweets. The returned tweets are copies: the cached tweets are left untouched, because
    Tweet hashes are derived from the text and editing the text of an object that already sits in a
    set would leave it stored under a stale hash.

    :param tone: the target tone (hashtag) that you want the tweets to have.
    :return: with a tone, a new set of cleaned Tweet copies whose target equals tone; without a tone,
             the module-level cache of all tweets itself
    """
    global all_data
    import copy
    import re

    # retrieve all data from stored files once
    if not all_data:
        all_data = set()
        for filename in glob.glob("data/*.txt"):
            all_data.update(read_data(filename))
    if not tone:
        return all_data

    tone_replace = re.compile(re.escape(tone), re.IGNORECASE)
    target_data = set()
    for tweet in all_data:
        if tweet.target == tone:
            cleaned = copy.copy(tweet)
            cleaned.text = tone_replace.sub('', tweet.text)
            # The copy is hashed after its text is final, so the result set stays consistent.
            target_data.add(cleaned)
    return target_data
# Remove stop words
from sklearn.feature_extraction.text import CountVectorizer
# Lazily filled on first use so the default tokenizer and the stop-word set are built only once
# instead of once per tokenized document.
_stop_word_tokenizer_cache = {}


def remove_stop_word_tokenizer(s):
    """
    Custom tokenizer: splits s with CountVectorizer's default tokenizer and drops every token whose
    lower-cased form is in stopwordslist.

    :param s: the text to tokenize
    :return: list of the remaining tokens, in their original order and original case
    """
    if not _stop_word_tokenizer_cache:
        _stop_word_tokenizer_cache['tokenize'] = CountVectorizer().build_tokenizer()
        # A frozenset gives constant-time membership tests; the stop words come as a list.
        _stop_word_tokenizer_cache['stop_words'] = frozenset(stopwordslist)
    tokenize = _stop_word_tokenizer_cache['tokenize']
    stop_words = _stop_word_tokenizer_cache['stop_words']
    return [w for w in tokenize(s) if w.lower() not in stop_words]
# Shared feature-extraction objects: the train* functions fit them and predict reuses them,
# so predictions always go through whichever vocabulary was fitted last.
count_vect = CountVectorizer(tokenizer=remove_stop_word_tokenizer)
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False, norm='l2')
def trainNaiveBayes(data: list, targets: list):
    """
    Fits a Multinomial Naive Bayes classifier (SciKitLearn) on tf-idf features of the tweets.

    Side effect: refits the module-level count_vect and tfidf_transformer on data.

    :param data: list of tweets
    :param targets: list of associated targets for each tweet
    :return: the fitted MultinomialNB predictor
    """
    from sklearn.naive_bayes import MultinomialNB

    # Raw term counts first, then tf-idf weights computed from those counts.
    term_counts = count_vect.fit_transform(data)
    tfidf_features = tfidf_transformer.fit_transform(term_counts)

    classifier = MultinomialNB()
    return classifier.fit(tfidf_features, targets)
def trainLogisticRegression(data: list, targets: list):
    """
    Trains a Logistic Regression classifier with the SciKitLearn modules

    Side effect: refits the module-level count_vect and tfidf_transformer on data.

    :param data: list of tweets
    :param targets: list of associated targets for each tweet
    :return: Predictor
    """
    # Import from the public package path; the private ``sklearn.linear_model.logistic`` module is
    # not importable in newer scikit-learn releases.
    from sklearn.linear_model import LogisticRegression

    X_tweet_counts = count_vect.fit_transform(data)
    # Compute tfidf feature values and store in X_train_tfidf
    X_train_tfidf = tfidf_transformer.fit_transform(X_tweet_counts)
    # train a Logistic Regression Classifier
    return LogisticRegression().fit(X_train_tfidf, targets)
def trainSVM(data: list, targets: list):
    """
    Fits a support vector classifier (SciKitLearn svm.SVC with default settings) on tf-idf features.

    Side effect: refits the module-level count_vect and tfidf_transformer on data.

    :param data: list of tweets
    :param targets: list of associated targets for each tweet
    :return: the fitted SVC predictor
    """
    from sklearn import svm

    # Raw term counts first, then tf-idf weights computed from those counts.
    term_counts = count_vect.fit_transform(data)
    tfidf_features = tfidf_transformer.fit_transform(term_counts)

    classifier = svm.SVC()
    return classifier.fit(tfidf_features, targets)
def predict(predictor, test_data):
    """
    Runs predictor over every tweet in test_data.

    The tweets are converted with the already-fitted module-level count_vect and tfidf_transformer
    (transform only, no refit), so a train function must have been called beforehand.

    :param predictor: classifier trained from one of our train functions
    :param test_data: Tweets that we want to predict with
    :return: An iterable that contains all the results of the predictions
    """
    term_counts = count_vect.transform(test_data)
    tfidf_features = tfidf_transformer.transform(term_counts)
    return predictor.predict(tfidf_features)
def test(positive: list, negative: list, seed: int, trainingFunction, use_f):
    """
    Trains and tests our classifiers.
    1. Combine test data
    2. Shuffle test data and verification targets with same seed
    3. Train data with 75% of data and retrieve predictor
    4. Use predictor on other 25% of the data
    5. Retrieve the number of predictions that are correct
    6. Report the percentage correct as well as F-Score

    :param positive: positive tone tweets of the category
    :param negative: negative tone tweets of the category
    :param seed: the random generator seed used on both training and targets
    :param trainingFunction: which type classifier to train; None selects trainNaiveBayes
    :param use_f: accepted for the callers' sake but not used inside this function
    :return: predictor that we trained
    """
    all_test_data = positive + negative
    # creates a list of target values. Positive entries will be "1" and negative entries will be "0"
    targets = [1] * len(positive) + [0] * len(negative)

    # Both lists have the same length, so re-seeding before each shuffle applies the same
    # permutation to both and keeps every tweet aligned with its target.
    random.seed(seed)
    random.shuffle(all_test_data)
    random.seed(seed)
    random.shuffle(targets)

    split = int(.75 * len(all_test_data))
    training_data, test_data = all_test_data[:split], all_test_data[split:]
    training_targets, test_targets = targets[:split], targets[split:]

    if trainingFunction is None:
        trainingFunction = trainNaiveBayes
    predictor = trainingFunction(training_data, training_targets)

    predicted = predict(predictor, test_data)
    successes = sum(1 for prediction, target in zip(predicted, test_targets) if prediction == target)
    accuracy = successes / len(test_data)
    # accuracy and the F1 score are fractions in [0, 1]: show accuracy as a real percentage and
    # the F1 score as a plain number instead of tacking '%' onto a fraction.
    print("Best test error accuracy: {:.2%}".format(accuracy))
    # NOTE(review): with average='micro' the F1 score should coincide with the accuracy above -- confirm
    # whether a different average was intended.
    print("Best test error f1 score: {:.4f}".format(f1_score(test_targets, predicted, average='micro')))
    print("Confusion Matrix:")
    print(confusion_matrix(test_targets, predicted))
    return predictor
def testNN(positive: list, negative: list, seed: int):
    """
    Shuffles the combined positive/negative tweets and hands them to trainNN.

    :param positive: positive tone tweets (labelled 1)
    :param negative: negative tone tweets (labelled 0)
    :param seed: random seed used for both shuffles so tweets and labels stay aligned
    :return: whatever trainNN returns
    """
    documents = positive + negative
    # Positive entries are labelled "1" and negative entries "0".
    labels = [1] * len(positive) + [0] * len(negative)

    # Same seed before each shuffle => same permutation for labels and documents.
    random.seed(seed)
    random.shuffle(labels)
    random.seed(seed)
    random.shuffle(documents)

    return trainNN(documents, labels, seed)
def trainNN(data: list, targets: list, seed):
    """
    Trains a pybrain feed-forward network on tf-idf features and reports its best test error.

    The first 75% of the samples are used for training and the rest for testing. The network is
    trained one epoch at a time for 20 epochs; the epoch with the lowest test error is remembered
    and summarised at the end.

    Side effect: refits the module-level count_vect and tfidf_transformer on data.

    :param data: list of tweets
    :param targets: list of class labels, one per tweet
    :param seed: not used inside this function
    :return: the lowest test error seen, as returned by percentError
    """
    term_counts = count_vect.fit_transform(data)
    features = tfidf_transformer.fit_transform(term_counts).toarray()

    feature_split = int(.75 * len(features))
    target_split = int(.75 * len(targets))
    train_rows, test_rows = features[:feature_split], features[feature_split:]
    train_labels, test_labels = targets[:target_split], targets[target_split:]

    width = len(features[0])
    trainingds = ClassificationDataSet(width, 1, nb_classes=2)
    testds = ClassificationDataSet(width, 1, nb_classes=2)
    for position, row in enumerate(train_rows):
        trainingds.addSample(row, train_labels[position])
    for position, row in enumerate(test_rows):
        testds.addSample(row, test_labels[position])
    trainingds._convertToOneOfMany()
    testds._convertToOneOfMany()

    # Three hidden layers of 10 units each, softmax output.
    net = buildNetwork(trainingds.indim, 10, 10, 10, trainingds.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(net, dataset=trainingds, learningrate=.65, momentum=.1)

    # Start from a high sentinel so that the first epoch below 99.9 is recorded as the best so far.
    besttrain = 99.9
    besttest = 99.9
    bestresults = []
    bestclass = []
    for _ in range(20):
        trainer.trainEpochs(1)
        trainresult = percentError(trainer.testOnClassData(), trainingds['class'])
        test_predictions = trainer.testOnClassData(dataset=testds)
        testresult = percentError(test_predictions, testds['class'])
        if testresult < besttest:
            besttest, besttrain = testresult, trainresult
            bestresults = test_predictions
            bestclass = testds['class']
        print("epoch: %2d" % trainer.totalepochs)
        print("train error: %2.2f%%" % trainresult)
        print("test error: %2.2f%%" % testresult)

    print("Best test error accuracy: {:.2f}%".format(besttest))
    print("Best test error f1 score: {:.4f}%".format(f1_score(bestclass, bestresults, average='macro')))
    print("Confusion Matrix:")
    print(confusion_matrix(bestclass, bestresults))
    return besttest
def test_category(name: str, pos_data, neg_data, use_f):
    """
    Tests a given category by running our test function on the Naive Bayes and Logistic Regression
    classifiers.

    :param name: category name, used in the printed headings
    :param pos_data: test positive data (objects with a ``text`` attribute)
    :param neg_data: test negative data (objects with a ``text`` attribute)
    :param use_f: To use 50/50 pos negative (False) or to use an F-score to balance it out
    :return: list with one single-element list per classifier, each holding the trained predictor
             (Naive Bayes first, then Logistic Regression)
    """
    # IF we don't want to use an F_score to correct for dataset length,
    # this ensures that the length of the two datasets are the same, so that there's a 50% chance of being right by default
    # (with use_f the longer length is used, so neither list is actually truncated)
    length = max(len(pos_data), len(neg_data)) if use_f else min(len(pos_data), len(neg_data))

    # Build the text lists once; test() concatenates them into a new list, so the same inputs can be
    # handed to both classifiers.
    pos_texts = [w.text for w in pos_data[:length]]
    neg_texts = [w.text for w in neg_data[:length]]

    results = []
    print("\n\nCategory: {}".format(name))
    print("-" * len(name))

    # The neural network run (testNN) is intentionally left out here.
    print("{}NB".format(name))
    results.append([test(pos_texts, neg_texts, 1, trainNaiveBayes, use_f)])

    print("\n{}LR".format(name))
    results.append([test(pos_texts, neg_texts, 1, trainLogisticRegression, use_f)])
    return results
def dataset_statistics(dataset):
    """
    Prints simple statistics for a collection of tweets: the average text length in characters and
    the shape / number of non-zero entries of its term-count matrix.

    Side effect: refits the module-level count_vect on this dataset.

    :param dataset: iterable of objects with a ``text`` attribute
    :return: None
    """
    data = [w.text for w in dataset]
    if not data:
        # Nothing to average and nothing to vectorize; avoid dividing by zero.
        print("Dataset is empty; no statistics to report")
        return
    character_count = sum(len(text) for text in data)
    avg_length = character_count / len(data)
    print("Average length of document is {} characters".format(avg_length))
    tweet_counts = count_vect.fit_transform(data)
    num_non_zero = tweet_counts.nnz
    print('Dimensions of X_tweet_counts are', tweet_counts.shape)
    print('Number of non-zero elements in x_tweet_counts:', num_non_zero)
def tweet_puller():
    """
    Downloads a fresh batch of tweets for each tone hashtag, one pull_tweets call per hashtag.

    :return: None
    """
    # (number of tweets, hashtag) in the order they are pulled
    pulls = (
        (5000, 'sad'),
        (10000, 'courage'),
        (10000, 'scared'),
        (10000, 'sarcasm'),
        (10000, 'serious'),
        (10000, 'relaxed'),
        (10000, 'stressed'),
    )
    for amount, tag in pulls:
        pull_tweets(amount, tag)
def test_driver():
    """
    Loads the stored tweets for every tone, prints dataset statistics and per-tone counts, then
    trains and tests the classifiers on each pair of opposing tones.

    :return: None
    """
    # label -> tweets, loaded in this order.
    # NOTE(review): 'fearful' reads the tone 'scary' while tweet_puller pulls 'scared' -- confirm
    # which hashtag the stored data actually uses.
    corpora = {
        'happy': list(read_all_data('happy')),
        'sad': list(read_all_data('sad')),
        'fearful': list(read_all_data('scary')),
        'courageous': list(read_all_data('courage')),
        'sarcastic': list(read_all_data('sarcasm')),
        'sincere': list(read_all_data('serious')),
        'relaxed': list(read_all_data('relaxed')),
        'stressed': list(read_all_data('stressed')),
    }

    dataset_statistics(corpora['happy'])
    print({label: len(tweets) for label, tweets in corpora.items()})

    # (category name, positive label, negative label)
    categories = (
        ('HappySad', 'happy', 'sad'),
        ('Courage', 'courageous', 'fearful'),
        ('Sarcasm', 'sarcastic', 'sincere'),
        ('Stress', 'relaxed', 'stressed'),
    )
    for category, positive_label, negative_label in categories:
        test_category(category, corpora[positive_label], corpora[negative_label], True)
# Script entry point: runs the training/testing experiments on the tweets already stored on disk.
if __name__ == '__main__':
    # Downloading new tweets is disabled; re-enable the next line to pull a fresh batch first.
    # tweet_puller()
    test_driver()