/
machine_learning.py
227 lines (140 loc) · 6.92 KB
/
machine_learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import pandas as pd
import json
import pprint
import numpy as np
import os
import string
import unicodedata
import re
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#analyser = SentimentIntensityAnalyzer()
import nltk
# from nltk.tokenize.t import ToktokTokenizer
# tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
from gensim import corpora, models
from keras import layers, models, optimizers
from sklearn.decomposition import LatentDirichletAllocation
from yellowbrick.classifier import ClassificationReport
# Pandas display settings: show full, untruncated column contents when printing.
# NOTE(review): the bare `pd.options.display.max_rows` below was a no-op in the
# original (it reads the option without setting anything); presumably a setting
# was intended — left unset here to avoid changing display behavior.
pd.options.display.max_rows
# BUG FIX: max_colwidth=-1 is deprecated (and rejected by pandas >= 1.0);
# None is the supported way to disable column-width truncation.
pd.set_option('display.max_colwidth', None)
#preprocessing pipeline
#Pipeline models features like word count, tfidf, word density, word embeddings (GloVe)
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from keras.preprocessing import text, sequence
from sklearn.metrics import f1_score, classification_report
from preprocessing import PreprocessingPipeline
class LoadDataframe:
    """Load the tweet CSV and the label CSV and join them into one dataframe."""

    def __init__(self, file1, file2):
        # file1: tweet data keyed by its 7th column (the tweet ID);
        # file2: headerless CSV mapping ID -> class label.
        self.file1 = file1
        self.file2 = file2
        self.df = pd.DataFrame()

    def load(self):
        """Return a dataframe with columns ['data', 'label'], inner-joined on ID.

        Also stores the result on self.df as a side effect.
        """
        # keep_default_na=False so literal strings like "NA" in tweets survive.
        tweets = pd.read_csv(self.file1, encoding="ISO-8859-1",
                             index_col=6, keep_default_na=False)
        labels = pd.read_csv(self.file2, index_col=0, header=None)
        labels.index.name = 'ID'
        labels = labels.rename(columns={1: 'Class'})
        # Cast the label index to str so it matches the tweet file's ID index.
        labels.index = labels.index.astype(str)
        # Inner join on the shared ID index; keep only text + class columns.
        merged = pd.merge(tweets, labels, how='inner',
                          left_index=True, right_index=True)
        merged = merged.dropna()
        merged = merged[['Tweets', 'Class']]
        merged.columns = ['data', 'label']
        self.df = merged
        return self.df
class Feature:
    """Build train/validation feature matrices: word counts, word tf-idf,
    and bigram/trigram tf-idf."""

    def __init__(self, df):
        """Split df into train/validation sets and integer-encode the labels.

        df must contain a 'data' column (text) and a 'label' column.
        """
        self.df = df
        self.train_x, self.valid_x, self.train_y, self.valid_y = \
            model_selection.train_test_split(df['data'], df['label'])
        # BUG FIX: the original called encoder.fit_transform on BOTH splits.
        # fit_transform re-learns the class ordering from whatever labels are
        # present in each split, so validation labels could be mapped to
        # different integers than training labels.  Fit once on the full
        # label column, then transform each split with the same mapping.
        encoder = preprocessing.LabelEncoder()
        encoder.fit(df['label'])
        self.train_y = encoder.transform(self.train_y)
        self.valid_y = encoder.transform(self.valid_y)

    def count_vectorizer(self):
        """Return (train, valid) sparse word-count matrices."""
        # token_pattern=r'\w{1,}' keeps 1-char tokens; ignores punctuation.
        count_vector = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
        # Fit the vocabulary on the whole corpus so both splits share it.
        count_vector.fit(self.df['data'])
        train_x_count = count_vector.transform(self.train_x)
        valid_x_count = count_vector.transform(self.valid_x)
        return train_x_count, valid_x_count

    def word_tf_idf(self):
        """Return (train, valid) word-level tf-idf matrices (top 5000 terms)."""
        tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                     max_features=5000)
        tfidf_vect.fit(self.df['data'])
        xtrain_tfidf = tfidf_vect.transform(self.train_x)
        xvalid_tfidf = tfidf_vect.transform(self.valid_x)
        return xtrain_tfidf, xvalid_tfidf

    def ngram_tdidf(self):
        """Return (train, valid) 2/3-gram tf-idf matrices (top 5000 terms)."""
        tfidf_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                      ngram_range=(2, 3), max_features=5000)
        tfidf_ngram.fit(self.df['data'])
        xtrain_tfidf_ngram = tfidf_ngram.transform(self.train_x)
        xvalid_tfidf_ngram = tfidf_ngram.transform(self.valid_x)
        return xtrain_tfidf_ngram, xvalid_tfidf_ngram
class MachineLearning:
    """Grid-search and evaluate SVM, logistic regression, and naive Bayes
    on each of the three feature sets built by Feature."""

    def __init__(self, df):
        feature_instance = Feature(df)
        self.train_x_count, self.valid_x_count = feature_instance.count_vectorizer()
        self.xtrain_tfidf, self.xvalid_tfidf = feature_instance.word_tf_idf()
        self.xtrain_tfidf_ngram, self.xvalid_tfidf_ngram = feature_instance.ngram_tdidf()
        self.train_y, self.valid_y = feature_instance.train_y, feature_instance.valid_y

    def train_model(self, classifier, feature_vector_train, label,
                    feature_vector_valid, is_neural_net=False):
        """Fit classifier on the training features and return
        (accuracy, per-class f1 array) on the validation set.

        BUG FIX: the original passed (prediction, valid_y) to f1_score and
        classification_report; scikit-learn expects y_true FIRST, so the
        printed precision/recall were swapped.  Accuracy is symmetric and
        was unaffected.
        """
        classifier.fit(feature_vector_train, label)
        prediction = classifier.predict(feature_vector_valid)
        if is_neural_net:
            # Neural nets return per-class scores; take the argmax class index.
            prediction = prediction.argmax(axis=-1)
        print(classification_report(self.valid_y, prediction))
        return (metrics.accuracy_score(self.valid_y, prediction),
                f1_score(self.valid_y, prediction, average=None))

    def support_vector_machine(self):
        """Grid-search an SVM over kernel/C on each feature set.

        Returns the GridSearchCV wrapper (last fitted on the n-gram tf-idf).
        """
        from sklearn import svm
        from sklearn.model_selection import GridSearchCV
        parameters = {'kernel': ('linear', 'rbf'),
                      'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        svc = svm.SVC(gamma="scale")
        svc = GridSearchCV(svc, parameters, cv=5)
        accuracy, f1 = self.train_model(svc, self.train_x_count, self.train_y, self.valid_x_count)
        print("SVM (Count Vectors)", accuracy, f1)
        accuracy, f1 = self.train_model(svc, self.xtrain_tfidf, self.train_y, self.xvalid_tfidf)
        print("SVM (TF-IDF)", accuracy, f1)
        accuracy, f1 = self.train_model(svc, self.xtrain_tfidf_ngram, self.train_y, self.xvalid_tfidf_ngram)
        print("SVM (TDIDF-ngram)", accuracy, f1)
        # NOTE: each train_model call refits the grid search, so best_params_
        # reflects only the last (n-gram tf-idf) fit.
        print("Best parameters! -", svc.best_params_)
        return svc

    def logistic_regression(self):
        """Grid-search logistic regression over penalty/C on each feature set.

        Returns the GridSearchCV wrapper (last fitted on the n-gram tf-idf).
        """
        from sklearn.model_selection import GridSearchCV
        parameters = {'penalty': ['l1', 'l2'],
                      'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        # BUG FIX: the 'l1' penalty in the grid requires the liblinear solver;
        # the modern default (lbfgs, since sklearn 0.22) rejects it and the
        # grid search would error out.  liblinear was the original default.
        clf_log = linear_model.LogisticRegression(solver='liblinear')
        clf_log = GridSearchCV(clf_log, parameters, cv=5)
        accuracy, f1 = self.train_model(clf_log, self.train_x_count, self.train_y, self.valid_x_count)
        print("Logistic Regression (Count Vectors)", accuracy, f1)
        accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf, self.train_y, self.xvalid_tfidf)
        # BUG FIX: label was "Logistic Regression)" (typo, missing feature name).
        print("Logistic Regression (TF-IDF)", accuracy, f1)
        accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf_ngram, self.train_y, self.xvalid_tfidf_ngram)
        print("Logistic Regression (TDIDF-ngram)", accuracy, f1)
        print("Best parameters! -", clf_log.best_params_)
        return clf_log

    def naive_bayes(self):
        """Grid-search multinomial naive Bayes over alpha on each feature set.

        Returns the GridSearchCV wrapper (last fitted on the n-gram tf-idf).
        """
        from sklearn.model_selection import GridSearchCV
        parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        clf_nb = naive_bayes.MultinomialNB()
        clf_nb = GridSearchCV(clf_nb, parameters, cv=5)
        accuracy, f1 = self.train_model(clf_nb, self.train_x_count, self.train_y, self.valid_x_count)
        # BUG FIX: these labels said "Logistic Regression" (copy-paste error).
        print("Naive Bayes (Count Vectors)", accuracy, f1)
        accuracy, f1 = self.train_model(clf_nb, self.xtrain_tfidf, self.train_y, self.xvalid_tfidf)
        print("Naive Bayes (TF-IDF)", accuracy, f1)
        accuracy, f1 = self.train_model(clf_nb, self.xtrain_tfidf_ngram, self.train_y, self.xvalid_tfidf_ngram)
        print("Naive Bayes (TDIDF-ngram)", accuracy, f1)
        print("Best parameters! -", clf_nb.best_params_)
        return clf_nb
if __name__ == '__main__':
    # Load and join the raw CSVs, normalize the tweet text, then train models.
    loader = LoadDataframe("hatespeech.csv", "NAACL_SRW_2016.csv")
    df = loader.load()
    pipeline = PreprocessingPipeline()
    df['data'] = df['data'].apply(lambda tweet: pipeline.normalize(tweet))
    machine_learning = MachineLearning(df)
    machine_learning.support_vector_machine()
    machine_learning.logistic_regression()
    print(df.head())