def get_glove_embedding(self):
    g = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = g.read_embedding()
    s = SentenceToIndices(word_to_idx)
    x_train_indices, max_len = s.map_sentence_list(self.x_all)
    if max_len % 2 != 0:
        max_len = max_len + 1
    p = PadSentences(max_len)
    x_train_pad = p.pad_list(x_train_indices)
    # TRIM tweets to remove noisy data
    trim_size = max_len
    trim = TrimSentences(trim_size)
    x_train_pad = trim.trim_list(x_train_pad)
    return x_train_pad, max_len, g
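# --- Usage sketch (not from the repo): how a caller might consume get_glove_embedding(). ---
# `trainer` is assumed to be an instance of the class that defines the method above, with
# `embedding_filename` pointing at a 50-d GloVe text file and `x_all` holding the raw tweet
# strings (both attribute names are taken from the method body; everything else is hypothetical).
def _example_get_glove_embedding_usage(trainer):
    x_pad, max_len, g = trainer.get_glove_embedding()
    X = np.array(x_pad)  # (num_tweets, max_len) index matrix, ready for an Embedding layer
    print(X.shape, max_len)
    return X, g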
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=50):
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                All.append(r)
            i = i + 1
    np.random.shuffle(All)
    ones_count = 0
    two_count = 0
    zero_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        # if (label == 2):
        #     label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {0: class_weight_val[0],
                               1: class_weight_val[1],
                               2: class_weight_val[2]}
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.80)
    X_train_sentences = X_all
    Y_train = Y_all
    # Divide after conversions
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    #trim_size = 33
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("two count: ", two_count)
    Y_train_old = Y_train
    Y_train = to_categorical(Y_train, num_classes=3)
    # Divide the data
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    Y_test = Y_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("data divided on value: ", limit)
    print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
    print("lengths X_test, Y_test: ", len(X_test), len(Y_test))
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense3Layer(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense4Layer(trim_size, G)
    #NN = TweetSentimentCNN(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    NN = TweetSentiment1D(trim_size, G)
    #NN = TweetSentiment1DRev(trim_size, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)
    #NN.build(first_layer_units=max_len, second_layer_units=max_len, relu_dense_layer=16, dense_layer_units=3,
    #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.0003, decay=0.001)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer=adam, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    #callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    #class_weight = None
    #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback],
    #                 class_weight=class_weight_dictionary)
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=64,
                     class_weight=class_weight_dictionary, validation_split=0.2)
    print("Model trained")
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1 -> precision, recall, F1: ", prec_1, recall_1, f1_1)
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad",
                 "i love colombia",
                 "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    # take the argmax per tweet (axis=1) so each sample gets its own predicted class
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
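# --- calculate_cm_metrics() is called above but not defined in this excerpt. The sketch below ---
# --- is a plausible minimal version with the same interface (five return values, an unused ---
# --- label-prefix argument mirroring the '' passed above): one-vs-rest precision, recall, F1, ---
# --- and specificity for one class of the confusion matrix (rows = true, columns = predicted). ---
# --- It is an assumption about, not a copy of, the repo's own helper. ---
def calculate_cm_metrics_sketch(cm, label_prefix='', positive_class=1):
    cm = np.asarray(cm, dtype=float)
    tp = cm[positive_class, positive_class]    # true positives for the chosen class
    fp = cm[:, positive_class].sum() - tp      # other classes predicted as the chosen class
    fn = cm[positive_class, :].sum() - tp      # chosen class predicted as something else
    tn = cm.sum() - tp - fp - fn
    precision_v = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall_v = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1_v = (2 * precision_v * recall_v / (precision_v + recall_v)
            if (precision_v + recall_v) > 0 else 0.0)
    specificity_v = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return precision_v, recall_v, f1_v, specificity_v, cm.sum()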
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    Zeros = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        Ones = []
        for r in csv_file:
            if i != 0:
                label = int(r[1])
                #if label == 0:
                #    Zeros.append(r)
                All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    print("len(All): ", len(All))
    np.random.shuffle(All)
    ones_count = 0
    for r in All:
        tweet = r[0].strip()
        label = int(r[1])
        if label == 2:
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    #trim_size = 45
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays; build a reversed copy of every sequence for the second channel
    X_train_reverse = []
    for X in X_train_pad:
        t = X[::-1]
        X_train_reverse.append(t)
    X_train = np.array(X_train_pad)
    X_train_reverse = np.array(X_train_reverse)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    #Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    NN = TweetSentiment2DCNN2Channel(trim_size, G)
    #NN = TweetSentimentInception(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    #sgd = SGD(lr=0.05)
    NN.compile(optimizer=rmsprop, loss="binary_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    class_weight = None
    history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32,
                     callbacks=[callback], validation_split=0.20, class_weight=class_weight)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad",
                 "i love colombia",
                 "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Reverse = []
    for r in X_Predict_Final:
        t = r[::-1]
        X_Predict_Reverse.append(t)
    X_Predict_Final = np.array(X_Predict_Final)
    X_Predict_Reverse = np.array(X_Predict_Reverse)
    Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
    Preds = ((Preds >= 0.5) * 1).flatten()
    print("Predict: ", Preds)
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
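# --- Hypothetical invocation (not from the repo): the trainer class name and constructor ---
# --- signature below are assumptions; only the process() arguments match the method above, ---
# --- and the file paths are placeholders. ---
def _example_two_channel_train_run():
    trainer = TweetSentimentTrainer(labeled_tweets_filename="data/labeled_tweets.csv",
                                    embedding_filename="data/glove.6B.50d.txt")
    trainer.process("cnn2ch_model.json", "cnn2ch_model.h5", plot=True, epochs=100)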
def main(model_file, model_weights, labeled_tweets, embedding_filename):
    # load json and create model
    json_file = open(model_file, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    # evaluate loaded model on test data
    loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                label = int(r[1])
                if (label == 1) or (label == 2):
                    if ones_count <= 13000:
                        All.append(r)
                        ones_count += 1
                else:
                    All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    ones_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 2:
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    # X_train_sentences = X_all[0: limit]
    # Y_train = Y_all[0: limit]
    # X_test_sentences = X_all[limit:]
    # Y_test = Y_all[limit:]
    # print("Data Divided")
    # Get embedding
    # G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    # Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()
    with open("data/alltweetsanderrors.csv", "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        i = 0
        err_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            if Preds[i] != label:
                err_count += 1
                condition = 0
            else:
                condition = 1
            error_pred = []
            error_pred.append(tweet)
            error_pred.append(label)
            error_pred.append(Preds[i])
            error_pred.append(condition)
            csv_writer.writerow(error_pred)
            i += 1
    print("All tweets: ", i)
    print("Error count: ", err_count)
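# --- Example driver for main() above; the file paths are placeholders, not paths from the repo. ---
if __name__ == "__main__":
    main(model_file="model.json",                     # JSON architecture written by save_model()
         model_weights="model.h5",                    # weights written by save_model()
         labeled_tweets="data/labeled_tweets.csv",    # CSV of (tweet, label) rows, as read above
         embedding_filename="data/glove.6B.50d.txt")  # 50-d GloVe vectors, as read above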
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    np.random.shuffle(All)
    for r in All:
        tweet = r[0]
        label = int(r[1])  # labels arrive as strings from the CSV; cast so the targets are numeric
        if label == 2:
            label = 0
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    #G = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = 40
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    #Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    NN = TweetSentiment2DCNNv4(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=3, first_dropout=0.01, second_dropout=0.01, padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer='adam', loss="binary_crossentropy",
               metrics=['accuracy', f1, precision, recall])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32,
                     callbacks=[callback], validation_split=0.4)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad",
                 "i love colombia",
                 "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", NN.predict(X_Predict_Final))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                tweet = r[0]
                label = int(r[1])
                if label == 2:
                    label = 0
                X_all.append(tweet)
                Y_all.append(label)
            i = i + 1
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    G = GloveEmbedding(self.embedding_filename)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # Trim
    trim_size = 33
    max_len = trim_size
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    print("Train data converted to numpy arrays")
    NN = TweetSentiment2LSTM2Dense(max_len, G)
    print("model created")
    NN.build(first_layer_units=max_len, second_layer_units=max_len, relu_dense_layer=16,
             dense_layer_units=1, first_layer_dropout=0, second_layer_dropout=0)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.01, momentum=0.9, decay=0.1, nesterov=False)
    rmsprop = RMSprop(decay=0.001)
    NN.compile(optimizer=rmsprop, loss="binary_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    class_weight = {1: (1 - 0.63), 0: 0.63}
    history = NN.fit(X_train, Y_train, epochs=epochs, callbacks=[callback],
                     validation_split=0.2, class_weight=class_weight)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad",
                 "i love colombia every day of the week dude",
                 "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", NN.predict(X_Predict_Final))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")
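# --- The compile() calls above pass custom metric functions (precision, recall, f1, fprate) ---
# --- that are imported elsewhere in the repo and not shown in this excerpt. The sketches ---
# --- below show how such batch-wise metrics are commonly written with the Keras backend; ---
# --- they are assumptions about, not copies of, the repo's own definitions. ---
from keras import backend as K

def precision_sketch(y_true, y_pred):
    # true positives / predicted positives, computed per batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_pos / (pred_pos + K.epsilon())

def recall_sketch(y_true, y_pred):
    # true positives / actual positives, computed per batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_pos / (actual_pos + K.epsilon())

def f1_sketch(y_true, y_pred):
    # harmonic mean of the two sketches above
    p = precision_sketch(y_true, y_pred)
    r = recall_sketch(y_true, y_pred)
    return 2 * p * r / (p + r + K.epsilon())

def fprate_sketch(y_true, y_pred):
    # false positives / actual negatives, computed per batch
    y_hat = K.round(K.clip(y_pred, 0, 1))
    false_pos = K.sum(K.round(K.clip((1 - y_true) * y_hat, 0, 1)))
    actual_neg = K.sum(K.round(K.clip(1 - y_true, 0, 1)))
    return false_pos / (actual_neg + K.epsilon())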