def main(): G = GloveEmbedding("glove.6B.50d.txt") word_to_idx, idx_to_word, embedding = G.read_embedding() #print("locon: ", word_to_idx["locon"]) s = "I love New York and music locon" s = s.lower() print("Sentence: ", s) S = SentenceToIndices(word_to_idx) sentence = S.map_sentence(s) print("Sentence to indices: ", sentence) print("Padded: ", PadSentences(10).pad(sentence)) SE = SentenceToEmbedding(word_to_idx, idx_to_word, embedding) matrix = SE.map_sentence(s, max_len=10) print("Matrix: ", matrix) print("Matrix.shape: ", matrix.shape) print("Embedding i: ", embedding[word_to_idx["i"]]) sentences = [] sentences.append("I esta malo".lower()) sentences.append("Love la musica salsa.".lower()) sentences.append("Uff, q mal te va nene".lower()) mapped, mlen = S.map_sentence_list(sentences) print("mlen: ", mlen) for s in mapped: print(s)
def get_glove_embedding(self):
    g = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = g.read_embedding()
    s = SentenceToIndices(word_to_idx)
    x_train_indices, max_len = s.map_sentence_list(self.x_all)
    if max_len % 2 != 0:
        max_len = max_len + 1
    p = PadSentences(max_len)
    x_train_pad = p.pad_list(x_train_indices)
    # TRIM tweets to remove noisy data
    trim_size = max_len
    trim = TrimSentences(trim_size)
    x_train_pad = trim.trim_list(x_train_pad)
    return x_train_pad, max_len, g

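# TrimSentences is likewise not shown in this listing; it is assumed to cut
# every index list down to a fixed length. A minimal hypothetical sketch:
class TrimSentencesSketch:
    def __init__(self, trim_size):
        self.trim_size = trim_size

    def trim(self, indices):
        # Keep only the first trim_size indices.
        return indices[:self.trim_size]

    def trim_list(self, indices_list):
        return [self.trim(i) for i in indices_list]
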
def main(): G = GloveEmbedding("../test/data/glove.6B.50d.txt") word_to_idx, idx_to_word, embedding = G.read_embedding() #print("locon: ", word_to_idx["locon"]) print("Length dictionary: ", len(word_to_idx)) #s = "I love New York and music locon" s = "The flu is making me sad" s = s.lower() print("Sentence: ", s) S = SentenceToIndices(word_to_idx) sentence = S.map_sentence(s) print("Sentence to indices: ", sentence) print("Padded: ", PadSentences(10).pad(sentence)) SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding) matrix1 = SE.map_sentence(s, max_len=len(s)) s2 = "The flu is making me sad".lower() matrix2 = SE.map_sentence(s2, max_len=len(s2)) print("Matrix 1: ", matrix1) print("Matrix.shape: ", matrix1.shape) print("\n Matrix 2: ", matrix2) print("Matrix.shape: ", matrix2.shape) print("\n Self Similarity: ", matrix_cosine_similary(matrix1, matrix1)) M1 = np.array([-1, 40, 0.04]).reshape((3, 1)) M2 = np.array([100, 2, 3]).reshape((3, 1)) print("M1: \n ", M1) print("M2: \n", M2) SimM = matrix_cosine_similary(M1, M2) print("SimM: \n", SimM) D = distance_similarity_matrix(SimM) print("D: ", D) M3 = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]]) M4 = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]]) SimM = matrix_cosine_similary(M3, M3) print("SimM: \n", SimM) D = distance_similarity_matrix(SimM) print("D: ", D) SimM = matrix_cosine_similary(M3, M4) print("\nSimM: \n", SimM) Up = np.triu(SimM) D = distance_similarity_matrix(SimM) print("D: ", D) print("Up: ", Up) print("sum Up: ", np.sum(Up)) print("up I: ", np.triu(np.ones(Up.shape))) print("sum I: ", np.sum(np.triu(np.ones(Up.shape))))
def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=50):
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                All.append(r)
            i = i + 1
    np.random.shuffle(All)
    ones_count = 0
    two_count = 0
    zero_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if (label == 0):
            zero_count += 1
        elif (label == 1):
            ones_count += 1
        else:
            two_count += 1
        # if (label == 2):
        #     label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("len(Y_all): ", len(Y_all))
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("classes: ", np.unique(Y_all))
    print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
    print("class weight_val: ", class_weight_val)
    class_weight_dictionary = {0: class_weight_val[0], 1: class_weight_val[1], 2: class_weight_val[2]}
    print("dict: ", class_weight_dictionary)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.80)
    X_train_sentences = X_all
    Y_train = Y_all
    # Divide after conversions
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    #trim_size = 33
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("two count: ", two_count)
    Y_train_old = Y_train
    Y_train = to_categorical(Y_train, num_classes=3)
    # Divide the data
    X_test_text = X_all[limit:]
    X_test = X_train[limit:]
    Y_test = Y_train[limit:]
    X_train = X_train[0:limit]
    Y_train = Y_train[0:limit]
    print("data divided on value: ", limit)
    print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
    print("lengths X_test, Y_test: ", len(X_test), len(Y_test))
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense3Layer(trim_size, G)
    #NN = TweetSentiment2LSTM2Dense4Layer(trim_size, G)
    #NN = TweetSentimentCNN(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    NN = TweetSentiment1D(trim_size, G)
    #NN = TweetSentiment1DRev(trim_size, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=11, first_dropout=0, second_dropout=0.05, padding='valid', dense_units=16)
    #NN.build(first_layer_units=max_len, second_layer_units=max_len, relu_dense_layer=16, dense_layer_units=3,
    #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.0003, decay=0.001)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer=adam, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    #callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    #class_weight = None
    #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback], class_weight=class_weight_dictionary)
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=64,
                     class_weight=class_weight_dictionary, validation_split=0.2)
    print("Model trained")
    print("Predicting")
    print("len(X_test): ", len(X_test))
    preds = NN.predict(X_test)
    print("len(preds): ", len(preds))
    print("type preds: ", type(preds))
    print("preds before: ", preds)
    preds = np.argmax(preds, axis=1)
    print("preds: ", preds)
    print("len(preds): ", len(preds))
    Y_test = Y_train_old[limit:]
    print("Y test: ", Y_test)
    c_matrix = confusion_matrix(Y_test, preds)
    print("matrix: ", c_matrix)
    print("Storing Errors: ")
    ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
    print("Errors stored")
    print("Confusion matrix: ")
    prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
    print("C1-> precision, recall, F1: ", prec_1, recall_1, f1_1)
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")

def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    Zeros = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        Ones = []
        for r in csv_file:
            if i != 0:
                label = int(r[1])
                #if label == 0:
                #    Zeros.append(r)
                All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    print("len(All): ", len(All))
    np.random.shuffle(All)
    ones_count = 0
    for r in All:
        tweet = r[0].strip()
        label = int(r[1])
        if (label == 2):
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    #trim_size = 45
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train_reverse = []
    for X in X_train_pad:
        t = X[::-1]
        X_train_reverse.append(t)
    X_train = np.array(X_train_pad)
    X_train_reverse = np.array(X_train_reverse)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    #Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    NN = TweetSentiment2DCNN2Channel(trim_size, G)
    #NN = TweetSentimentInception(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    #sgd = SGD(lr=0.05)
    NN.compile(optimizer=rmsprop, loss="binary_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    #class_weight = {0: 0.67, 1: 0.33}
    class_weight = None
    history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32,
                     callbacks=[callback], validation_split=0.20, class_weight=class_weight)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Reverse = []
    for r in X_Predict_Final:
        t = r[::-1]
        X_Predict_Reverse.append(t)
    X_Predict_Final = np.array(X_Predict_Final)
    X_Predict_Reverse = np.array(X_Predict_Reverse)
    Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
    Preds = ((Preds >= 0.5) * 1).flatten()
    print("Predict: ", Preds)
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")

def process(self, json_filename, h5_filename, prod_json_file, prod_h5_filename,
            plot=False, epochs=100, vect_dimensions=50):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []  # array with every row in the file
    # Load all rows into the array All
    #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    with open(self.labeled_tweets_filename, "r") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            All.append(r)
    # Shuffle the rows randomly to prevent order dependencies between runs
    np.random.shuffle(All)
    X_one_All = []
    X_two_All = []
    X_three_All = []
    X_one_aux_All = []
    X_two_aux_All = []
    X_three_aux_All = []
    X_one_aux_disease = []
    X_two_aux_disease = []
    X_three_aux_disease = []
    X_one_aux_label = []
    X_two_aux_label = []
    X_three_aux_label = []
    Y_t1_t2_relevance = []
    Y_t1_t3_relevance = []
    for r in All:
        # Collect tweets
        X_one_All.append(r[0].lower().strip())
        X_two_All.append(r[3].lower().strip())
        X_three_All.append(r[6].lower().strip())
        # Collect auxiliary info
        #X_one_aux_All.append(r[1:3])
        #X_two_aux_All.append(r[4:6])
        #X_three_aux_All.append(r[7:9])
        X_one_aux_disease.append(r[1])
        X_one_aux_label.append(r[2])
        X_two_aux_disease.append(r[4])
        X_two_aux_label.append(r[5])
        X_three_aux_disease.append(r[7])
        X_three_aux_label.append(r[8])
        # Collect Y's
        Y_t1_t2_relevance.append(r[9])
        Y_t1_t3_relevance.append(r[10])
        Y_all.append(r[11])
    # Convert the data to a form the NN can understand
    num_data = len(All)
    #print("All: ", All)
    print("All: ")
    for r in All:
        print(r)
    print("All.len: ", len(All))
    print("X_one_All: ")
    for r in X_one_All:
        print(r)
    G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_one_indices, max_len1 = self.map_to_idx(S, X_one_All)
    X_two_indices, max_len2 = self.map_to_idx(S, X_two_All)
    X_three_indices, max_len3 = self.map_to_idx(S, X_three_All)
    #print("X_one_indices: ", X_one_indices)
    print("X_one_indices: ")
    for r in X_one_indices:
        print(r)
    print("max_len1 : ", max_len1)
    for r in X_two_indices:
        print(r)
    print("max_len2 : ", max_len2)
    for r in X_three_indices:
        print(r)
    print("max_len3 : ", max_len3)
    # get the max len over all three tweet columns
    max_len = max(max_len1, max_len2)
    max_len = max(max_len, max_len3)
    if max_len % 2 != 0:
        max_len = max_len + 1
    # now pad the 3 sentences with 0 to make them all the same size
    P = PadSentences(max_len)
    X_one_train = P.pad_list(X_one_indices)
    X_two_train = P.pad_list(X_two_indices)
    X_three_train = P.pad_list(X_three_indices)
    # now turn the sentences into np.array
    X_one_train = np.array(X_one_train)
    X_two_train = np.array(X_two_train)
    X_three_train = np.array(X_three_train)
    # change the disease type and the label to categorical
    X_one_aux_train = self.binarize_aux(X_one_aux_disease, X_one_aux_label)
    print('X_one_aux_train.shape: ', X_one_aux_train.shape)
    X_two_aux_train = self.binarize_aux(X_two_aux_disease, X_two_aux_label)
    X_three_aux_train = self.binarize_aux(X_three_aux_disease, X_three_aux_label)
    # Create the NN
    labels_dim = 2
    diases_dim = 4
    #NN = TweetSimilaryBasic(max_sentence_len=max_len, embedding_builder=G, labels_dim=labels_dim, diases_dim=diases_dim)
    #NN = TweetSimilaryBasicBiDirectional(max_sentence_len=max_len, embedding_builder=G, labels_dim=labels_dim, diases_dim=diases_dim)
    NN = TweetSimilaryConvInception(max_sentence_len=max_len, embedding_builder=G,
                                    labels_dim=labels_dim, diases_dim=diases_dim)
    # Build the NN
    NN.build()
    # Summary
    NN.summary()
    # Compile the NN
    #NN.compile(optimizer='rmsprop', loss=['mean_squared_error', 'mean_squared_error', 'binary_crossentropy'],
    #           metrics=['mse', 'mse', 'acc'], loss_weight=[1., 1., 1.0])
    NN.compile(optimizer='rmsprop',
               loss={'R1': 'mean_squared_error', 'R2': 'mean_squared_error', 'FINAL': 'binary_crossentropy'},
               metrics={'R1': 'mse', 'R2': 'mse', 'FINAL': 'acc'},
               loss_weights={'R1': 0.25, 'R2': 0.25, 'FINAL': 10})
    Y_t1_t2_relevance = np.array(Y_t1_t2_relevance)
    Y_t1_t3_relevance = np.array(Y_t1_t3_relevance)
    Y_all = np.array(Y_all)
    class_weight_val = class_weight.compute_class_weight('balanced', np.unique(Y_all), Y_all)
    print("type(class_weight_val): ", type(class_weight_val))
    print("class_weight_val", class_weight_val)
    final_class_weight_val = {'R1': None, 'R2': None, 'FINAL': class_weight_val}
    print("final_class_weight_val: ", final_class_weight_val)
    history = NN.fit(X=[X_one_train, X_two_train, X_three_train,
                        X_one_aux_train, X_two_aux_train, X_three_aux_train],
                     Y=[Y_t1_t2_relevance, Y_t1_t3_relevance, Y_all],
                     epochs=epochs, validation_split=0.20,
                     class_weight=final_class_weight_val)
    # Save the model
    NN.save_model_data(json_filename, h5_filename, prod_json_file, prod_h5_filename)
    NN.plot_stats(history)
    #print(history)
    #print(history.history.keys())
    print("Done!")

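# map_to_idx and binarize_aux are helper methods whose definitions are not in
# this listing. A minimal hypothetical sketch of what they are assumed to do
# (the actual helpers may encode diseases and labels differently):
import numpy as np
from sklearn.preprocessing import LabelBinarizer

def map_to_idx_sketch(S, sentences):
    # Map a list of sentences to index lists via SentenceToIndices,
    # returning (index_lists, max_len) as the callers above expect.
    return S.map_sentence_list(sentences)

def binarize_aux_sketch(diseases, labels):
    # One-hot encode the disease names and the labels, then concatenate them
    # into a single auxiliary feature matrix (one row per tweet).
    d = LabelBinarizer().fit_transform(diseases)
    l = LabelBinarizer().fit_transform(labels)
    return np.concatenate([d, l], axis=1)
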
                tweet = r[0]
                label = r[1]
                X_all.append(tweet)
                Y_all.append(label)
            i = i + 1
    print("Data Ingested")
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    G = GloveEmbedding(embedding_filename)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    model = KerasClassifier(build_fn=create_model(G, max_len))
    print("Model created")
    # define the grid search parameters
    batch_size = [10, 20, 40, 60, 80, 100]
    epochs = [10, 50, 100]
    param_grid = dict(batch_size=batch_size, epochs=epochs)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
    print(type(grid))

def set_prediction(pretain):
    data = pretain[:, 0]
    # Load model
    model2 = load_model('trained/model1_50d_stoplemma_10e_new_prod.h5', custom_objects={'tf': tf})
    # summarize model
    model2.summary()
    # Load data
    G = GloveEmbedding("data/glove.twitter.27B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    premise = "same busy and just over the flu so feeling great"
    premise = "when ebola struck the doctors stepped up to the plate and the rest of us sat and watched them do their stuff to all engineers and environmentalists this is our time to step up and find answers to these consequences of our failure to coexist with nature"
    premise = remove_stopwords(premise)
    premise = lemmatizer_spacy(premise)
    x_premise = remove_stopwords(premise)
    x_premise = np.full((len(data)), x_premise)
    x_hypothesis = []
    for row in data:
        #row = row.replace("’", "'")
        #row = fix_text_format(row)
        row = remove_stopwords(row)
        #row = lemmatizer_spacy(row)
        #row = remove_stopwords(row)
        x_hypothesis.append(row)
    x_hypothesis = np.array(x_hypothesis)
    X_one_indices, max_len1 = map_to_idx(S, x_premise)
    X_two_indices, max_len2 = map_to_idx(S, x_hypothesis)
    print("len: ", max_len1, max_len2)
    #max_len = max(max_len1, max_len2)
    max_len = 44
    print("max_len_final: ", max_len)
    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)
    X_one_train = P.pad_list(X_one_indices)
    X_two_train = P.pad_list(X_two_indices)
    #X_one_train = Trim.trim_list(X_one_indices)
    X_two_train = Trim.trim_list(X_two_train)
    X_one_train = np.array(X_one_train)
    X_two_train = np.array(X_two_train)
    X_one_aux_disease = set_disease(x_premise)
    X_two_aux_disease = set_disease(x_hypothesis)
    new_dis = []
    for _ in range(len(data)):
        new_dis.append([0, 0, 1, 0, 0, 1])
    X_one_aux_train = new_dis
    X_two_aux_label = pretain[:, 1]
    #X_one_aux_train = binarize_aux(s4, X_one_aux_label)
    X_two_aux_train = binarize_aux(X_two_aux_disease, X_two_aux_label)
    #new_two = []
    #for row in range(len(data)):
    #    new_two.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6]])
    X_two_aux_train = X_two_aux_train.tolist()
    for row in X_two_aux_train:
        del row[6]
    X_one_aux_train = np.array(X_one_aux_train)
    X_two_aux_train = np.array(X_two_aux_train)
    print("one_aux: ", np.array(X_one_aux_train).shape)
    print(X_one_aux_train[:5])
    print("two_aux: ", np.array(X_two_aux_train).shape)
    print(X_two_aux_train[:5])
    model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    #model2.compile(optimizer='rmsprop', loss={'R1': 'mean_squared_error'}, metrics={'R1': 'mse'}, loss_weights={'R1': 0.25})
    #model2.compile(optimizer='rmsprop')
    #model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    X_Prediction = model2.predict([X_one_train, X_two_train, X_one_aux_train, X_two_aux_train])
    return X_Prediction

def main(model_file, model_weights, labeled_tweets, embedding_filename):
    # load json and create model
    json_file = open(model_file, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    # evaluate loaded model on test data
    loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0
        for r in csv_file:
            if i != 0:
                label = int(r[1])
                if (label == 1) or (label == 2):
                    if ones_count <= 13000:
                        All.append(r)
                        ones_count += 1
                else:
                    All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    ones_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if (label == 2):
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count += 1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    # X_train_sentences = X_all[0: limit]
    # Y_train = Y_all[0: limit]
    # X_test_sentences = X_all[limit:]
    # Y_test = Y_all[limit:]
    # print("Data Divided")
    # Get embedding
    # G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    # Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()
    with open("data/alltweetsanderrors.csv", "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        i = 0
        err_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            if Preds[i] != label:
                err_count += 1
                condition = 0
            else:
                condition = 1
            error_pred = []
            error_pred.append(tweet)
            error_pred.append(label)
            error_pred.append(Preds[i])
            error_pred.append(condition)
            csv_writer.writerow(error_pred)
            i += 1
    print("All tweets: ", i)
    print("Error count: ", err_count)

def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    All = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1
    np.random.shuffle(All)
    for r in All:
        tweet = r[0]
        label = r[1]
        if int(label) == 2:
            label = '0'
        X_all.append(tweet)
        Y_all.append(label)
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    #G = GloveEmbedding(self.embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = 40
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    #Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    #NN = TweetSentiment2DCNN(trim_size, G)
    NN = TweetSentiment2DCNNv4(trim_size, G)
    #print("Build GRU")
    #NN = TweetSentimentGRUSM(max_len, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    #kernel_regularizer = None
    NN.build(filters=3, first_dropout=0.01, second_dropout=0.01, padding='valid', dense_units=16)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    sgd = SGD(lr=0.05)
    NN.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy', f1, precision, recall])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32,
                     callbacks=[callback], validation_split=0.4)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", NN.predict(X_Predict_Final))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")

def process(self, json_filename, h5_filename, plot=False, epochs=100):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                tweet = r[0]
                label = r[1]
                X_all.append(tweet)
                Y_all.append(label)
            i = i + 1
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    #X_train_sentences = X_all[0: limit]
    #Y_train = Y_all[0: limit]
    #X_test_sentences = X_all[limit:]
    #Y_test = Y_all[limit:]
    #print("Data Divided")
    # Get embedding
    G = GloveEmbedding(self.embedding_filename)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    NN = TweetSentiment2LSTM2DenseSM(max_len, G)
    print("model created")
    kernel_regularizer = l2(0.001)
    kernel_regularizer = None
    NN.build(first_layer_units=max_len, second_layer_units=max_len, relu_dense_layer=5,
             dense_layer_units=3, first_layer_dropout=0.3, second_layer_dropout=0.6,
             l2=kernel_regularizer)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.001, momentum=0.09, decay=0.001, nesterov=True)
    rmsprop = RMSprop(decay=0.003)
    adam = Adam(lr=0.1, decay=0.05)
    NN.compile(optimizer=rmsprop, loss="categorical_crossentropy",
               metrics=['accuracy', precision, recall, f1, fprate])
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    w_dict = {0: 0.31, 1: 0.63, 2: 0.06}
    history = NN.fit(X_train, Y_train, epochs=epochs, callbacks=[callback],
                     validation_split=0.2, class_weight=w_dict)
    print("Model trained")
    # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    # print("Test data mapped")
    # X_test_pad = P.pad_list(X_test_indices)
    # print("Test data padded")
    # X_test = np.array(X_test_pad)
    # Y_test = np.array(Y_test)
    # print("Test data converted to numpy arrays")
    # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
    # print("accuracy: ", acc)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", NN.predict(X_Predict_Final))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    if plot:
        print("Plotting")
        self.plot(history)
    print("Done!")

def process(self, json_filename, h5_filename):
    np.random.seed(11)
    # open the file with tweets
    X_all = []
    Y_all = []
    with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        for r in csv_file:
            if i != 0:
                tweet = r[0]
                label = r[1]
                X_all.append(tweet)
                Y_all.append(label)
            i = i + 1
    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    # divide the data into X_train, Y_train, X_test, Y_test
    X_train_sentences = X_all[0:limit]
    Y_train = Y_all[0:limit]
    X_test_sentences = X_all[limit:]
    Y_test = Y_all[limit:]
    print("Data Divided")
    # Get embedding
    G = GloveEmbedding(self.embedding_filename)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # Trim
    #trim_size = 40
    #Trim = TrimSentences(trim_size)
    #X_train_pad = Trim.trim_list(X_train_pad)
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    print("Train data converted to numpy arrays")
    NN = TweetSentiment2LSTM2Dense(max_len, G)
    #NN = TweetSentiment2LSTM2Dense(trim_size, G)
    print("model created")
    NN.build(first_layer_units=128, dense_layer_units=1, first_layer_dropout=0, second_layer_dropout=0)
    print("model built")
    NN.summary()
    sgd = SGD(lr=0.3, momentum=0.001, decay=0.01, nesterov=False)
    adam = Adam(lr=0.03)
    #NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'], optimizer=adam)
    NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'], optimizer='rmsprop')
    print("model compiled")
    print("Begin training")
    callback = TensorBoard(log_dir="/tmp/logs")
    NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
    print("Model trained")
    X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
    print("Test data mapped")
    X_test_pad = P.pad_list(X_test_indices)
    print("Test data padded")
    X_test = np.array(X_test_pad)
    Y_test = np.array(Y_test)
    print("Test data converted to numpy arrays")
    loss, acc = NN.evaluate(X_test, Y_test)
    print("accuracy: ", acc, ", loss: ", loss)
    T = "I have a bad case of vomit"
    X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola",
                 "there is a diarrhea outbreak in the city"]
    X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
    i = 0
    for s in X_Predict_Idx:
        print(str(i) + ": ", s)
        i = i + 1
    print(X_Predict)
    X_Predict_Final = P.pad_list(X_Predict_Idx)
    #X_Predict_Final = Trim.trim_list(X_Predict_Final)
    #X_Predict = [X_Predict]
    X_Predict_Final = np.array(X_Predict_Final)
    print("Predict: ", NN.predict(X_Predict_Final))
    print("Storing model and weights")
    NN.save_model(json_filename, h5_filename)
    print("Done!")