def set_trained_data(data, NN, max_len=72):
    """Classify the text column of ``data`` with an already-built model.

    Args:
        data: Iterable of rows; the text to classify is read from ``row[1]``.
        NN: Object exposing ``predict``; it is called once with the padded and
            trimmed index matrix.
        max_len: Fixed length every index sequence is padded and trimmed to.
            Defaults to 72, the value that used to be hard-coded here
            (presumably the input length of the trained model -- TODO confirm
            against the training script).

    Returns:
        Tuple ``(new_data, final)``: the list of extracted texts and the
        ``np.argmax(..., axis=1)`` of the model output.
    """
    new_data = [row[1] for row in data]

    G = GloveEmbedding("data/glove.6B.50d.txt", dimensions=50)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()

    # The longest-sentence length reported by the mapper is intentionally
    # ignored: the sequence length is dictated by ``max_len``.
    S = SentenceToIndices(word_to_idx)
    X_Predict_Idx, _ = S.map_sentence_list(new_data)
    print("Max Len", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Final = np.array(X_Predict_Final)

    X_Prediction = NN.predict(X_Predict_Final)
    final = np.argmax(X_Prediction, axis=1)
    return new_data, final
def main():
    """Smoke-test sentence -> indices / embedding-matrix mapping on a few strings."""
    glove = GloveEmbedding("glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = glove.read_embedding()

    text = "I love New York and music locon".lower()
    print("Sentence: ", text)

    to_indices = SentenceToIndices(word_to_idx)
    indices = to_indices.map_sentence(text)
    print("Sentence to indices: ", indices)
    print("Padded: ", PadSentences(10).pad(indices))

    to_embedding = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    matrix = to_embedding.map_sentence(text, max_len=10)
    print("Matrix: ", matrix)
    print("Matrix.shape: ", matrix.shape)
    print("Embedding i: ", embedding[word_to_idx["i"]])

    # Mixed-language samples, lower-cased before mapping.
    samples = [
        raw.lower()
        for raw in ("I esta malo", "Love la musica salsa.", "Uff, q mal te va nene")
    ]
    mapped, mlen = to_indices.map_sentence_list(samples)
    print("mlen: ", mlen)
    for row in mapped:
        print(row)
def main():
    """Build the basic tweet-similarity model, print its summary and plot it."""
    glove = GloveEmbedding("data/glove.6B.50d.txt")
    # The returned lookup tables are not used here; the call itself is kept
    # exactly as before.
    _word_to_idx, _idx_to_word, _embedding = glove.read_embedding()

    model = TweetSimilaryBasic(72, glove, 5, 3)
    model.build()
    model.summary()
    model.plot("data/model3")
def main():
    """Load the GloVe file and print a handful of lookups as a sanity check."""
    glove = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = glove.read_embedding()

    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])

    hello_vector = embedding[word_to_idx["hello"]]
    print("embedding hello: ", hello_vector)
    print("e.shape: ", hello_vector.shape)

    # The unknown-word token is looked up under its lower-case key.
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])
def main():
    """Exercise the sentence mappers and the matrix similarity helpers, printing results."""
    glove = GloveEmbedding("../test/data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = glove.read_embedding()
    print("Length dictionary: ", len(word_to_idx))

    # --- sentence -> indices / embedding matrices ---------------------------
    text = "The flu is making me sad".lower()
    print("Sentence: ", text)

    to_indices = SentenceToIndices(word_to_idx)
    indices = to_indices.map_sentence(text)
    print("Sentence to indices: ", indices)
    print("Padded: ", PadSentences(10).pad(indices))

    to_embedding = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    # max_len is the character count of the sentence, exactly as before.
    matrix1 = to_embedding.map_sentence(text, max_len=len(text))
    text2 = "The flu is making me sad".lower()
    matrix2 = to_embedding.map_sentence(text2, max_len=len(text2))

    print("Matrix 1: ", matrix1)
    print("Matrix.shape: ", matrix1.shape)
    print("\n Matrix 2: ", matrix2)
    print("Matrix.shape: ", matrix2.shape)
    print("\n Self Similarity: ", matrix_cosine_similary(matrix1, matrix1))

    # --- two column vectors -------------------------------------------------
    col_a = np.array([-1, 40, 0.04]).reshape((3, 1))
    col_b = np.array([100, 2, 3]).reshape((3, 1))
    print("M1: \n ", col_a)
    print("M2: \n", col_b)
    similarity = matrix_cosine_similary(col_a, col_b)
    print("SimM: \n", similarity)
    print("D: ", distance_similarity_matrix(similarity))

    # --- a matrix against itself and against a slightly perturbed copy ------
    base = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]])
    perturbed = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]])

    similarity = matrix_cosine_similary(base, base)
    print("SimM: \n", similarity)
    print("D: ", distance_similarity_matrix(similarity))

    similarity = matrix_cosine_similary(base, perturbed)
    print("\nSimM: \n", similarity)
    upper = np.triu(similarity)
    distance = distance_similarity_matrix(similarity)
    print("D: ", distance)
    print("Up: ", upper)
    print("sum Up: ", np.sum(upper))
    print("up I: ", np.triu(np.ones(upper.shape)))
    print("sum I: ", np.sum(np.triu(np.ones(upper.shape))))
def get_glove(glove_dims):
    """Return a sentence-to-embedding mapper for the requested GloVe size.

    Args:
        glove_dims: Embedding dimensionality; 50, 200 and 300 are supported.

    Returns:
        A ``SentenceToEmbeddingWithEPSILON`` built from the matching
        embedding file.

    Raises:
        SystemExit: With a non-zero status when ``glove_dims`` is not
            supported. The status used to be 0, which reported success to
            the calling shell on an error path.
    """
    # Supported dimensionalities and the embedding file backing each one.
    glove_files = {
        50: "../test/data/glove.twitter.27B.50d.txt",
        200: "../test/data/glove.twitter.27B.200d.txt",
        300: "../test/data/glove.840B.300d.txt",
    }
    filename = glove_files.get(glove_dims)
    if filename is None:
        print("Wrong Number of dimensions")
        # Raised directly rather than through the ``exit`` helper, which only
        # exists when the ``site`` module has been loaded.
        raise SystemExit(1)

    G = GloveEmbedding(filename=filename, dimensions=glove_dims)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    return SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
def get_glove_embedding(self):
    """Map ``self.x_all`` to padded/trimmed index sequences using GloVe.

    Returns:
        Tuple ``(x_train_pad, max_len, g)``: the padded and trimmed index
        lists, the (even) sequence length used, and the ``GloveEmbedding``
        object that was loaded.
    """
    glove = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, _idx_to_word, _embedding = glove.read_embedding()

    indices, longest = SentenceToIndices(word_to_idx).map_sentence_list(self.x_all)
    # Round an odd length up to the next even number.
    if longest % 2:
        longest += 1

    padder = PadSentences(longest)
    padded = padder.pad_list(indices)

    # Trim tweets to the same length to remove noisy data.
    trimmer = TrimSentences(longest)
    padded = trimmer.trim_list(padded)
    return padded, longest, glove
def getGlove():
    """Return a sentence-to-embedding mapper backed by the Twitter GloVe file.

    Returns:
        A ``SentenceToEmbeddingWithEPSILON`` built from the lookup tables
        read out of ``../test/data/glove.twitter.27B.50d.txt``.
    """
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    # The index mapper that used to be constructed here was never used, so it
    # has been dropped.
    return SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
i = 0 csv_file = csv.reader(f, delimiter=',') for r in csv_file: if i != 0: tweet = r[0] label = r[1] X_all.append(tweet) Y_all.append(label) i = i + 1 print("Data Ingested") num_data = len(X_all) limit = math.ceil(num_data * 0.60) X_train_sentences = X_all Y_train = Y_all G = GloveEmbedding(embedding_filename) word_to_idx, idx_to_word, embedding = G.read_embedding() S = SentenceToIndices(word_to_idx) X_train_indices, max_len = S.map_sentence_list(X_train_sentences) print("Train data mappend to indices") P = PadSentences(max_len) X_train_pad = P.pad_list(X_train_indices) print("Train data padded") # convert to numPY arrays X_train = np.array(X_train_pad) Y_train = np.array(Y_train) Y_train = to_categorical(Y_train, num_classes=3) print("Train data convert to numpy arrays") model = KerasClassifier(build_fn=create_model(G, max_len)) print("Model created") # define the grid search parameters batch_size = [10, 20, 40, 60, 80, 100]
def set_prediction(pretain):
    """Score every row of ``pretain`` against a fixed premise sentence.

    Args:
        pretain: 2-D array-like indexed as ``pretain[:, 0]`` (hypothesis text)
            and ``pretain[:, 1]`` (auxiliary label passed to ``binarize_aux``).

    Returns:
        Whatever ``model2.predict`` returns for the four model inputs
        (premise indices, hypothesis indices, premise aux, hypothesis aux).
    """
    data = pretain[:, 0]
    n_rows = len(data)

    # Load the model and print its summary.
    model2 = load_model('trained/model1_50d_stoplemma_10e_new_prod.h5',
                        custom_objects={'tf': tf})
    model2.summary()

    # Load the embedding vocabulary.
    G = GloveEmbedding("data/glove.twitter.27B.50d.txt", dimensions=50)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)

    # Fixed premise. An earlier alternative sentence was assigned and then
    # immediately overwritten; that dead store has been removed.
    premise = ("when ebola struck the doctors stepped up to the plate and the "
               "rest of us sat and watched them do their stuff to all "
               "engineers and environmentalists this is our time to step up "
               "and find answers to these consequences of our failure to "
               "coexist with nature")
    premise = remove_stopwords(premise)
    premise = lemmatizer_spacy(premise)
    x_premise = remove_stopwords(premise)
    # Repeat the premise once per hypothesis row.
    x_premise = np.full((n_rows), x_premise)

    # Hypotheses only get stop-word removal (no lemmatisation).
    x_hypothesis = np.array([remove_stopwords(row) for row in data])

    X_one_indices, max_len1 = map_to_idx(S, x_premise)
    X_two_indices, max_len2 = map_to_idx(S, x_hypothesis)
    print("len: ", max_len1, max_len2)
    # Sequence length is fixed rather than derived from the data
    # (presumably the trained model's input length -- TODO confirm).
    max_len = 44
    print("max_len_final: ", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)
    # NOTE(review): only the hypothesis side is trimmed; the premise side is
    # padded but never trimmed, exactly as before.
    X_one_train = np.array(P.pad_list(X_one_indices))
    X_two_train = P.pad_list(X_two_indices)
    X_two_train = np.array(Trim.trim_list(X_two_train))

    # The premise-side result is not used below; the call is kept in case
    # set_disease has side effects (not visible from here).
    set_disease(x_premise)
    X_two_aux_disease = set_disease(x_hypothesis)

    # Constant auxiliary vector for the premise, one copy per row.
    X_one_aux_train = np.array([[0, 0, 1, 0, 0, 1] for _ in range(n_rows)])

    X_two_aux_label = pretain[:, 1]
    X_two_aux_train = binarize_aux(X_two_aux_disease, X_two_aux_label).tolist()
    # Drop the element at index 6 of each binarised row.
    for row in X_two_aux_train:
        del row[6]
    X_two_aux_train = np.array(X_two_aux_train)

    print("one_aux: ", X_one_aux_train.shape)
    print(X_one_aux_train[:5])
    print("two_aux: ", X_two_aux_train.shape)
    print(X_two_aux_train[:5])

    # Weights come from a different file than the architecture loaded above.
    model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    X_Prediction = model2.predict(
        [X_one_train, X_two_train, X_one_aux_train, X_two_aux_train])
    return X_Prediction
def main():
    """Print embedding lookups and pairwise distances between sample tweets.

    The seven hand-copied comparison stanzas were folded into one loop over a
    list of tweet pairs; the printed output is unchanged.
    """
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()

    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])
    e = embedding[word_to_idx["hello"]]
    print("embedding hello: ", e)
    print("e.shape: ", e.shape)
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])

    # Printed label -> vocabulary word ("ise" is the label used for "is").
    # All lookups are done before anything is printed, as before.
    labelled_words = [("you", "you"), ("he", "he"), ("ise", "is"),
                      ("crazy", "crazy"), ("nuts", "nuts")]
    vectors = [(label, embedding[word_to_idx[word]])
               for label, word in labelled_words]
    for label, vector in vectors:
        print("embedding of " + label + ": ", vector)

    tweets = [
        "You are crazy",
        "You are nuts",
        "He is crazy",
        "You are lazy",
        "You are crazy man",
        "Yes You are crazy",
        "The fast train",
    ]
    mapper = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    # Every tweet is lower-cased and mapped with max_len 4.
    embs = [mapper.map_sentence(tweet.lower(), 4) for tweet in tweets]

    # 1-based tweet numbers, in the order the comparisons are reported.
    pairs = [(1, 2), (1, 3), (2, 3), (1, 4), (1, 5), (1, 6), (1, 7)]
    for a, b in pairs:
        left, right = embs[a - 1], embs[b - 1]
        print("Distance tweet{} vs tweet{}: ".format(a, b))
        print("Frobenious: ", sim.Frobenius_Distance(left, right))
        print("Cos Tri: ", sim.TriUL_sim(left, right))

    print("Embedding tweet1: ")
    print(embs[0])
    print("Embedding tweet6: ")
    print(embs[5])
def process(self, json_filename, h5_filename, plot=False, epochs=100):
    """Train the 2-LSTM/2-dense softmax model on all labeled tweets and save it.

    Args:
        json_filename: Passed to ``NN.save_model`` as its first argument.
        h5_filename: Passed to ``NN.save_model`` as its second argument.
        plot: When true, ``self.plot`` is called with the training history.
        epochs: Number of training epochs handed to ``NN.fit``.

    Reads ``self.labeled_tweets_filename`` (CSV, first row skipped, column 0
    is the tweet and column 1 the label) and ``self.embedding_filename``.
    """
    np.random.seed(11)

    # --- ingest -------------------------------------------------------------
    X_all = []
    Y_all = []
    # newline="" is what the csv module asks for, so quoted fields with
    # embedded line breaks are parsed correctly.
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1",
              newline="") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the first row (header)
        for r in csv_file:
            X_all.append(r[0])
            Y_all.append(r[1])
    print("Data Ingested")

    # The whole data set is used for training; Keras holds out 20% through
    # validation_split below.
    X_train_sentences = X_all

    # --- embedding / index mapping -------------------------------------------
    G = GloveEmbedding(self.embedding_filename)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mappend to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")

    # Convert to numpy arrays; labels become one-hot rows over 3 classes.
    X_train = np.array(X_train_pad)
    Y_train = to_categorical(np.array(Y_all), num_classes=3)
    print("Train data convert to numpy arrays")

    # --- model ----------------------------------------------------------------
    NN = TweetSentiment2LSTM2DenseSM(max_len, G)
    print("model created")
    # No kernel regulariser: the l2(0.001) that used to be built here was
    # immediately overwritten with None.
    NN.build(first_layer_units=max_len, second_layer_units=max_len,
             relu_dense_layer=5, dense_layer_units=3,
             first_layer_dropout=0.3, second_layer_dropout=0.6, l2=None)
    print("model built")
    NN.summary()
    rmsprop = RMSprop(decay=0.003)
    NN.compile(optimizer=rmsprop, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")

    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    # Fixed per-class weights for labels 0, 1 and 2.
    w_dict = {0: 0.31, 1: 0.63, 2: 0.06}
    history = NN.fit(X_train, Y_train, epochs=epochs, callbacks=[callback],
                     validation_split=0.2, class_weight=w_dict)
    print("Model trained")

    # --- quick manual check on a few hand-written sentences -------------------
    X_Predict = [
        "my zika is bad",
        "i love colombia",
        "my has been tested for ebola",
        "there is a diarrhea outbreak in the city",
    ]
    X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
    for i, s in enumerate(X_Predict_Idx):
        print(str(i) + ": ", s)
    print(X_Predict)
    # Padded with the training padder so the length matches the model input.
    X_Predict_Final = np.array(P.pad_list(X_Predict_Idx))
    print("Predict: ", NN.predict(X_Predict_Final))

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Ploting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename):
    """Train the binary 2-LSTM/2-dense model on a 60/40 split and save it.

    Args:
        json_filename: Passed to ``NN.save_model`` as its first argument.
        h5_filename: Passed to ``NN.save_model`` as its second argument.

    Reads ``self.labeled_tweets_filename`` (CSV, first row skipped, column 0
    is the tweet and column 1 the label) and ``self.embedding_filename``.
    """
    np.random.seed(11)

    # --- ingest -------------------------------------------------------------
    X_all = []
    Y_all = []
    # newline="" is what the csv module asks for, so quoted fields with
    # embedded line breaks are parsed correctly.
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1",
              newline="") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the first row (header)
        for r in csv_file:
            X_all.append(r[0])
            Y_all.append(r[1])
    print("Data Ingested")

    # --- split: first 60% (rounded up) trains, the rest tests -----------------
    limit = math.ceil(len(X_all) * 0.60)
    X_train_sentences = X_all[0:limit]
    Y_train = Y_all[0:limit]
    X_test_sentences = X_all[limit:]
    Y_test = Y_all[limit:]
    print("Data Divided")

    # --- embedding / index mapping -------------------------------------------
    G = GloveEmbedding(self.embedding_filename)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mappend to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")

    # Convert to numpy arrays.
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    print("Train data convert to numpy arrays")

    # --- model ----------------------------------------------------------------
    NN = TweetSentiment2LSTM2Dense(max_len, G)
    print("model created")
    NN.build(first_layer_units=128, dense_layer_units=1,
             first_layer_dropout=0, second_layer_dropout=0)
    print("model built")
    NN.summary()
    NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'],
               optimizer='rmsprop')
    print("model compiled")

    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
    print("Model trained")

    # --- evaluation -----------------------------------------------------------
    # The test set's own longest length is discarded so that ``max_len`` keeps
    # meaning "the model's input length".
    # NOTE(review): test sentences are padded with the training padder but not
    # trimmed; a test sentence longer than the training maximum would not fit
    # the model input -- confirm against PadSentences.
    X_test_indices, _ = S.map_sentence_list(X_test_sentences)
    print("Test data mapped")
    X_test_pad = P.pad_list(X_test_indices)
    print("Test data padded")
    X_test = np.array(X_test_pad)
    Y_test = np.array(Y_test)
    print("Test data converted to numpy arrays")
    loss, acc = NN.evaluate(X_test, Y_test)
    print("accuracy: ", acc, ", loss: ", loss)

    # --- quick manual check on a few hand-written sentences -------------------
    X_Predict = [
        "my zika is bad",
        "i love colombia",
        "my has been tested for ebola",
        "there is a diarrhea outbreak in the city",
    ]
    X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
    for i, s in enumerate(X_Predict_Idx):
        print(str(i) + ": ", s)
    print(X_Predict)
    X_Predict_Final = np.array(P.pad_list(X_Predict_Idx))
    print("Predict: ", NN.predict(X_Predict_Final))

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    """Train the inception CNN on pre-computed sentence matrices, report and save.

    Args:
        json_filename: Passed to ``NN.save_model`` as its first argument.
        h5_filename: Passed to ``NN.save_model`` as its second argument.
        plot: When true, ``self.plot`` is called with the training history.
        epochs: Number of training epochs handed to ``NN.fit``.
        vect_dimensions: Currently unused; the embedding is loaded with a
            fixed ``dimensions=100``.

    Fixes compared with the previous revision:
      * the sample-prediction tail used ``S``, ``P`` and ``Trim``, none of
        which existed in this function, so it raised ``NameError`` before the
        model was saved; samples are now embedded and zero-padded to the
        training input shape;
      * ``np.argmax`` on the sample predictions is taken per row (``axis=1``);
      * "ones count" no longer includes label 2;
      * ``compute_class_weight`` is called with keyword arguments.

    NOTE(review): the rows are shuffled without a fixed seed while the
    features come from the pre-computed ``data/array.npy``; whether the
    labels still line up with that file cannot be verified from here.
    """
    # --- ingest -------------------------------------------------------------
    # newline="" is what the csv module asks for, so quoted fields with
    # embedded line breaks are parsed correctly.
    with open(self.labeled_tweets_filename, "r", encoding="utf-8",
              newline="") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the first row (header)
        All = list(csv_file)
    np.random.shuffle(All)

    X_all = []
    Y_all = []
    zero_count = ones_count = two_count = 0
    for r in All:
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        X_all.append(r[0])
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))

    # Keyword form works on both old and current scikit-learn releases.
    class_weight_val = class_weight.compute_class_weight(
        'balanced', classes=np.unique(Y_all), y=Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {
        0: class_weight_val[0],
        1: class_weight_val[1],
        2: class_weight_val[2],
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")

    # The first 80% (rounded up) trains, the rest tests.
    limit = math.ceil(len(X_all) * 0.80)

    G = GloveEmbedding(self.embedding_filename, dimensions=100)
    word_to_idx, idx_to_word, embedding = G.read_embedding()

    # Sentence matrices are loaded from disk instead of being rebuilt.
    X_train = np.load("data/array.npy")
    print("----------------------------------->" + str(X_train.shape))

    Y_train_old = np.array(Y_all)
    print("ones count: ", ones_count)
    print("zeros count: ", zero_count)
    print("two count: ", two_count)
    Y_train = to_categorical(Y_train_old, num_classes=3)

    # --- split ----------------------------------------------------------------
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("----------------------------------->" + str(X_train.shape))
    print(
        "Entiendo que esto es la data que utiliza para hacer le training ",
        len(X_train), len(X_train[0]), len(X_train[0][0]), " Y_train ",
        len(Y_train))
    print("Train data convert to numpy arrays")

    # --- model ----------------------------------------------------------------
    NN = KerasInceptionCNN(0, G)
    print("model created")
    NN.build(filters=11, first_dropout=0, second_dropout=0.05,
             padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    NN.compile(optimizer='adam', loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")

    print("Begin training")
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=3,
                     class_weight=class_weight_dictionary)
    print("Model trained")

    # --- evaluation on the held-out part --------------------------------------
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    # Integer labels (not one-hot) are what the confusion matrix needs.
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

    # --- quick manual check on a few hand-written sentences -------------------
    X_Predict = [
        "my zika is bad",
        "i love colombia",
        "my has been tested for ebola",
        "there is a diarrhea outbreak in the city",
    ]
    print(X_Predict)
    # Samples are embedded the same way the commented-out generator of
    # array.npy did: one row per word, zero rows appended up to the model's
    # sequence length (assumes map_sentence returns a (words, dims) matrix --
    # TODO confirm).
    S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    sample_shape = X_train.shape[1:]
    X_Predict_Final = np.zeros((len(X_Predict),) + sample_shape)
    for i, sentence in enumerate(X_Predict):
        m = np.asarray(S.map_sentence(sentence))[:sample_shape[0]]
        print(str(i) + ": ", m.shape)
        if m.shape[0]:
            X_Predict_Final[i, :m.shape[0]] = m
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Ploting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    """Embed every tweet as a 75x100 matrix, train the inception CNN, report and save.

    Args:
        json_filename: Passed to ``NN.save_model`` as its first argument.
        h5_filename: Passed to ``NN.save_model`` as its second argument.
        plot: When true, ``self.plot`` is called with the training history.
        epochs: Number of training epochs handed to ``NN.fit``.
        vect_dimensions: Currently unused; the embedding is loaded with a
            fixed ``dimensions=100``.

    Fixes compared with the previous revision:
      * the sample-prediction tail used ``P`` and ``Trim``, which were never
        defined here, so it raised ``NameError`` before the model was saved;
        samples now go through the same embedding/padding as the training data;
      * padding is done in one step instead of one ``np.vstack`` per missing
        row, and sentences longer than 75 rows are truncated so the batch
        stays rectangular;
      * ``np.argmax`` on the sample predictions is taken per row (``axis=1``);
      * "ones count" no longer includes label 2;
      * ``compute_class_weight`` is called with keyword arguments.
    """
    seq_len = 75    # rows (words) per sentence matrix
    emb_dims = 100  # columns; matches dimensions=100 below

    # --- ingest -------------------------------------------------------------
    # newline="" is what the csv module asks for, so quoted fields with
    # embedded line breaks are parsed correctly.
    with open(self.labeled_tweets_filename, "r", encoding="utf-8",
              newline="") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the first row (header)
        All = list(csv_file)
    np.random.shuffle(All)

    X_all = []
    Y_all = []
    zero_count = ones_count = two_count = 0
    for r in All:
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            two_count += 1
        X_all.append(r[0])
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))

    # Keyword form works on both old and current scikit-learn releases.
    class_weight_val = class_weight.compute_class_weight(
        'balanced', classes=np.unique(Y_all), y=Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {
        0: class_weight_val[0],
        1: class_weight_val[1],
        2: class_weight_val[2],
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")

    # The first 80% (rounded up) trains, the rest tests.
    limit = math.ceil(len(X_all) * 0.80)

    # --- embedding ------------------------------------------------------------
    G = GloveEmbedding(self.embedding_filename, dimensions=100)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("hello", embedding[47])
    print("hello", embedding[9876])
    S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)

    def embed(sentence):
        # One row per word, zero rows appended (or extra rows cut) so every
        # sentence becomes a (seq_len, emb_dims) matrix.
        m = np.asarray(S.map_sentence(sentence))[:seq_len]
        padded = np.zeros((seq_len, emb_dims))
        if m.shape[0]:
            padded[:m.shape[0]] = m
        return padded

    edata = [embed(sentence) for sentence in X_all]
    print("hello", edata[1])
    print("len", len(edata[1]), " ", len(edata[1][1]), " ")

    # Convert to numpy arrays.
    X_train = np.array(edata)
    Y_train_old = np.array(Y_all)
    print("ones count: ", ones_count)
    print("zeros count: ", zero_count)
    print("two count: ", two_count)
    Y_train = to_categorical(Y_train_old, num_classes=3)

    # --- split ----------------------------------------------------------------
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print(
        "Entiendo que esto es la data que utiliza para hacer le training ",
        X_train)
    print("Train data convert to numpy arrays")

    # --- model ----------------------------------------------------------------
    NN = KerasInceptionCNN(0, G)
    print("model created")
    NN.build(filters=11, first_dropout=0, second_dropout=0.05,
             padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    NN.compile(optimizer='adam', loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")

    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32,
                     callbacks=[callback],
                     class_weight=class_weight_dictionary)
    print("Model trained")

    # --- evaluation on the held-out part --------------------------------------
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    # Integer labels (not one-hot) are what the confusion matrix needs.
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

    # --- quick manual check on a few hand-written sentences -------------------
    X_Predict = [
        "my zika is bad",
        "i love colombia",
        "my has been tested for ebola",
        "there is a diarrhea outbreak in the city",
    ]
    print(X_Predict)
    X_Predict_Final = np.array([embed(sentence) for sentence in X_Predict])
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Ploting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs = 100, vect_dimensions = 100):
    """Train the two-channel (forward/reversed) 2D CNN as a binary classifier and save it.

    Args:
        json_filename: Passed to ``NN.save_model`` as its first argument.
        h5_filename: Passed to ``NN.save_model`` as its second argument.
        plot: When true, ``self.plot`` is called with the training history.
        epochs: Number of training epochs handed to ``NN.fit``.
        vect_dimensions: Currently unused; the embedding is loaded with a
            fixed ``dimensions=50``.

    Label 2 is folded into label 0, so the task is binary. The model receives
    each index sequence twice: as-is and reversed.
    """
    np.random.seed(11)

    # --- ingest -------------------------------------------------------------
    # newline="" is what the csv module asks for, so quoted fields with
    # embedded line breaks are parsed correctly.
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1",
              newline="") as f:
        csv_file = csv.reader(f, delimiter = ',')
        next(csv_file, None)  # skip the first row (header)
        All = list(csv_file)
    print("len(All): ", len(All))
    np.random.shuffle(All)

    X_all = []
    Y_all = []
    for r in All:
        label = int(r[1])
        if label == 2:
            label = 0
        X_all.append(r[0].strip())
        Y_all.append(label)
    print("Data Ingested")

    # --- embedding / index mapping -------------------------------------------
    G = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_all)
    print("Train data mappend to indices")
    # Round an odd length up to the next even number.
    if max_len % 2 != 0:
        max_len = max_len + 1

    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # Trim to the same length that was used for padding.
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])

    # Convert to numpy arrays; the second channel is every sequence reversed.
    X_train = np.array(X_train_pad)
    X_train_reverse = np.array([X[::-1] for X in X_train_pad])
    Y_train = np.array(Y_all)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("Train data convert to numpy arrays")

    # --- model ----------------------------------------------------------------
    NN = TweetSentiment2DCNN2Channel(trim_size, G)
    print("model created")
    NN.build(filters=11, first_dropout=0, second_dropout=0.1,
             padding='valid', dense_units=32)
    print("model built")
    NN.summary()
    rmsprop = RMSprop(decay=0.003)
    NN.compile(optimizer=rmsprop, loss="binary_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")

    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    # No class weighting is applied.
    history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs,
                     batch_size=32, callbacks=[callback],
                     validation_split=0.20, class_weight=None)
    print("Model trained")

    # --- quick manual check on a few hand-written sentences -------------------
    X_Predict = ["my zika is bad", "i love colombia",
                 "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
    for i, s in enumerate(X_Predict_Idx):
        print(str(i)+ ": ", s)
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Reverse = np.array([r[::-1] for r in X_Predict_Final])
    X_Predict_Final = np.array(X_Predict_Final)
    Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
    # Threshold the outputs at 0.5 into 0/1 labels.
    Preds = ((Preds >= 0.5)*1).flatten()
    print("Predict: ", Preds)

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Ploting")
        self.plot(history)
    print("Done!")
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=50):
    """Train the three-class 1D tweet model, evaluate a hold-out slice and save it.

    Reads ``self.labeled_tweets_filename`` as CSV (first row skipped as a
    header; column 0 is the tweet text, column 1 an integer label), shuffles
    the rows, converts the tweets to padded/trimmed index sequences using the
    GloVe embedding at ``self.embedding_filename`` and trains a
    ``TweetSentiment1D`` network with balanced class weights.  The last 20% of
    the shuffled rows are held out, scored with a confusion matrix and passed
    to ``ErrorAnalysis.store_errors`` (written to ``errorcnn.csv``).

    Args:
        json_filename: forwarded to ``NN.save_model`` together with
            ``h5_filename``.
        h5_filename: forwarded to ``NN.save_model``.
        plot: when true, ``self.plot(history)`` is called after saving.
        epochs: number of epochs passed to ``NN.fit``.
        vect_dimensions: ``dimensions`` argument for ``GloveEmbedding``.

    Raises:
        ValueError: if the label file contains no data rows.
    """
    # --- Ingest -----------------------------------------------------------
    with open(self.labeled_tweets_filename, "r") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the header row
        All = list(csv_file)
    if not All:
        raise ValueError(
            "no labelled tweets found in " + str(self.labeled_tweets_filename))

    # Shuffle before splitting so the hold-out slice is a random sample.
    np.random.shuffle(All)

    X_all = []
    Y_all = []
    zero_count = 0
    ones_count = 0
    two_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if label == 0:
            zero_count += 1
        elif label == 1:
            ones_count += 1
        else:
            # Every label other than 0 or 1 is tallied here.
            two_count += 1
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))

    # Keyword arguments: newer scikit-learn releases make ``classes`` and
    # ``y`` keyword-only, and the keyword form is accepted by older ones too.
    classes = np.unique(Y_all)
    class_weight_val = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=Y_all)
    print("classes: ", classes)
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    # Pair each class that is actually present with its weight instead of
    # indexing positions 0..2, which fails when a class is missing.
    class_weight_dictionary = {
        int(c): float(w) for c, w in zip(classes, class_weight_val)
    }
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")

    # Index at which the data is divided: first 80% train, remainder test.
    limit = math.ceil(len(X_all) * 0.80)

    # --- Text -> padded index matrix --------------------------------------
    G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    word_to_idx, _, _ = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_all)
    print("Train data mappend to indices")
    # Force an even sequence length.
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])

    X_matrix = np.array(X_train_pad)
    Y_int = np.array(Y_all)
    # Report the per-label tallies gathered during ingestion; counting the
    # non-zero entries would lump labels 1 and 2 together.
    print("ones count: ", ones_count)
    print("zeros count: ", zero_count)
    print("two count: ", two_count)
    Y_onehot = to_categorical(Y_int, num_classes=3)

    # --- Train / test division (done after conversion) ---------------------
    X_test_text = X_all[limit:]
    X_test = X_matrix[limit:]
    Y_test = Y_int[limit:]  # integer labels, as needed by confusion_matrix
    X_train = X_matrix[0:limit]
    Y_train = Y_onehot[0:limit]
    print("data divided on value: ", limit)
    print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
    print("lengths X_test, Y_test: ", len(X_test), len(Y_test))
    print("Train data convert to numpy arrays")

    # --- Model -------------------------------------------------------------
    NN = TweetSentiment1D(trim_size, G)
    print("model created")
    NN.build(filters=11, first_dropout=0, second_dropout=0.05,
             padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    adam = Adam(lr=0.0003, decay=0.001)
    NN.compile(optimizer=adam, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=64,
                     class_weight=class_weight_dictionary,
                     validation_split=0.2)
    print("Model trained")

    # --- Hold-out evaluation -----------------------------------------------
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

    # --- Smoke test on a few hand-written sentences --------------------------
    X_Predict = [
        "my zika is bad",
        "i love colombia",
        "my has been tested for ebola",
        "there is a diarrhea outbreak in the city",
    ]
    X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
    for i, s in enumerate(X_Predict_Idx):
        print(str(i) + ": ", s)
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Final = np.array(X_Predict_Final)
    # axis=1 gives one class per sentence; without it argmax returns a single
    # index into the flattened probability matrix.
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))

    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Ploting")
        self.plot(history)
    print("Done!")
def main(model_file, model_weights, labeled_tweets, embedding_filename,
         output_filename="data/alltweetsanderrors.csv"):
    """Score every labelled tweet with a saved binary model and dump the results.

    Loads a Keras model from its JSON description and weight file, converts
    the tweets in ``labeled_tweets`` to padded index sequences with the GloVe
    embedding at ``embedding_filename``, predicts a 0/1 class for each one
    (threshold 0.5) and writes one CSV row per tweet:
    ``tweet, label, prediction, condition`` where ``condition`` is 1 when the
    prediction matches the label and 0 otherwise.

    Args:
        model_file: path of the JSON file read and given to ``model_from_json``.
        model_weights: path passed to ``load_weights``.
        labeled_tweets: CSV file (ISO-8859-1); the first row is skipped,
            column 0 is the tweet text and column 1 an integer label.
        embedding_filename: file handed to ``GloveEmbedding`` (50 dimensions).
        output_filename: destination CSV; defaults to the path that was
            previously hard-coded.
    """
    # Load the architecture; the context manager closes the file even if
    # reading fails.
    with open(model_file, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")

    loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                         metrics=['accuracy'])

    # Ingest the labelled rows.  Rows labelled 1 or 2 are only kept while the
    # running count satisfies ``<= 13000``; all other rows are always kept.
    All = []
    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)  # skip the header row
        ones_count = 0
        for r in csv_file:
            label = int(r[1])
            if label == 1 or label == 2:
                if ones_count <= 13000:
                    All.append(r)
                    ones_count += 1
            else:
                All.append(r)

    # Label 2 is folded into 0 so the targets are binary.
    X_all = [r[0] for r in All]
    Y_all = [0 if int(r[1]) == 2 else int(r[1]) for r in All]
    print("Data Ingested")

    # Text -> padded/trimmed index matrix.
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, _, _ = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_all)
    print("Train data mappend to indices")
    # Force an even sequence length.
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    Trim = TrimSentences(max_len)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])

    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_all)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("Train data convert to numpy arrays")

    # Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()

    # newline="" is what the csv module expects for files it writes;
    # otherwise extra blank rows appear on platforms that translate "\n".
    err_count = 0
    with open(output_filename, "w", newline="") as f:
        csv_writer = csv.writer(f, delimiter=",")
        for i, (tweet, label) in enumerate(zip(X_all, Y_all)):
            prediction = Preds[i]
            if prediction != label:
                err_count += 1
                condition = 0
            else:
                condition = 1
            csv_writer.writerow([tweet, label, prediction, condition])
    print("All tweets: ", len(X_all))
    print("Error count: ", err_count)