Example #1
def main():
    G = GloveEmbedding("glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    s = "I love New York and music locon"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    matrix = SE.map_sentence(s, max_len=10)
    print("Matrix: ", matrix)
    print("Matrix.shape: ", matrix.shape)
    print("Embedding i: ", embedding[word_to_idx["i"]])

    sentences = []
    sentences.append("I esta malo".lower())
    sentences.append("Love la musica salsa.".lower())
    sentences.append("Uff, q mal te va nene".lower())
    mapped, mlen = S.map_sentence_list(sentences)
    print("mlen: ", mlen)
    for s in mapped:
        print(s)
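
For reference, the example above treats SentenceToIndices.map_sentence as a plain word-to-index lookup. A minimal sketch of that assumed behavior (whitespace tokenization plus a fallback index for out-of-vocabulary tokens such as "locon"; the project's actual class may differ):

def map_sentence_sketch(sentence, word_to_idx, unk_idx=0):
    # hypothetical helper: lower-case, split on whitespace, look up each token,
    # and fall back to unk_idx for words missing from the GloVe vocabulary
    return [word_to_idx.get(token, unk_idx) for token in sentence.lower().split()]
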
Example #2
    def get_glove_embedding(self):
        g = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = g.read_embedding()
        s = SentenceToIndices(word_to_idx)
        x_train_indices, max_len = s.map_sentence_list(self.x_all)

        if max_len % 2 != 0:
            max_len = max_len + 1

        p = PadSentences(max_len)
        x_train_pad = p.pad_list(x_train_indices)

        # TRIM Tweets to remove noisy data
        trim_size = max_len
        trim = TrimSentences(trim_size)
        x_train_pad = trim.trim_list(x_train_pad)

        return x_train_pad, max_len, g
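
get_glove_embedding rounds max_len up to an even number, zero-pads every index list to that length, and then trims to the same size (a no-op here, but meaningful when trim_size is set below max_len). A rough sketch of the padding and trimming behavior assumed from PadSentences and TrimSentences (not the project's actual implementation):

def pad_list_sketch(index_lists, max_len, pad_value=0):
    # hypothetical: right-pad each index list with pad_value up to max_len
    return [ids + [pad_value] * (max_len - len(ids)) for ids in index_lists]

def trim_list_sketch(index_lists, trim_size):
    # hypothetical: keep only the first trim_size indices of each list
    return [ids[:trim_size] for ids in index_lists]
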
Example #3
def main():
    G = GloveEmbedding("../test/data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    print("Length dictionary: ", len(word_to_idx))
    #s = "I love New York and music locon"
    s = "The flu is making me sad"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    matrix1 = SE.map_sentence(s, max_len=len(s))

    s2 = "The flu is making me sad".lower()
    matrix2 = SE.map_sentence(s2, max_len=len(s2))

    print("Matrix 1: ", matrix1)
    print("Matrix.shape: ", matrix1.shape)
    print("\n Matrix 2: ", matrix2)
    print("Matrix.shape: ", matrix2.shape)

    print("\n Self Similarity: ", matrix_cosine_similary(matrix1, matrix1))

    M1 = np.array([-1, 40, 0.04]).reshape((3, 1))
    M2 = np.array([100, 2, 3]).reshape((3, 1))
    print("M1: \n ", M1)
    print("M2: \n", M2)
    SimM = matrix_cosine_similary(M1, M2)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    M3 = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]])
    M4 = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]])

    SimM = matrix_cosine_similary(M3, M3)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    SimM = matrix_cosine_similary(M3, M4)
    print("\nSimM: \n", SimM)
    Up = np.triu(SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)
    print("Up: ", Up)
    print("sum Up: ", np.sum(Up))
    print("up I: ", np.triu(np.ones(Up.shape)))
    print("sum I: ", np.sum(np.triu(np.ones(Up.shape))))
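
matrix_cosine_similary and distance_similarity_matrix are not shown in this example. One plausible reading, consistent with the np.triu bookkeeping above but written here only as an assumption, is column-wise cosine similarity plus a distance measuring how far the upper triangle falls short of an all-ones matrix:

import numpy as np

def matrix_cosine_similary_sketch(M1, M2):
    # hypothetical: pairwise cosine similarity between the columns of M1 and M2
    norms1 = np.linalg.norm(M1, axis=0, keepdims=True)  # shape (1, n1)
    norms2 = np.linalg.norm(M2, axis=0, keepdims=True)  # shape (1, n2)
    return (M1.T @ M2) / (norms1.T * norms2)

def distance_similarity_matrix_sketch(sim):
    # hypothetical: gap between the upper triangle of sim and a perfect all-ones similarity
    return np.sum(np.triu(np.ones(sim.shape))) - np.sum(np.triu(sim))
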
Example #4
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=50):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            # if (label == 2):
            #     label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all
        # Divide after conversions
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")

        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)

        G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 33
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to numpy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print("data divided on value: ", limit)
        print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data converted to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense3Layer(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense4Layer(trim_size, G)
        #NN = TweetSentimentCNN(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)
        NN = TweetSentiment1D(trim_size, G)
        #NN = TweetSentiment1DRev(trim_size, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        #NN.build(first_layer_units = max_len, second_layer_units = max_len, relu_dense_layer=16, dense_layer_units = 3,
        #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.0003, decay=0.001)
        sgd = SGD(lr=0.05)
        NN.compile(optimizer=adam,
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])

        print("model compiled")
        print("Begin training")
        #callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback], class_weight=class_weight_dictionary)
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=64,
                         class_weight=class_weight_dictionary,
                         validation_split=0.2)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", len(X_test))
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> precision, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Plotting")
            self.plot(history)
        print("Done!")
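
calculate_cm_metrics is defined elsewhere in the project. A hedged sketch of what a helper like this typically computes, treating one class of the 3x3 confusion matrix as positive in a one-vs-rest fashion (the function name, signature, and extra return values here are assumptions):

import numpy as np

def calculate_cm_metrics_sketch(cm, tag, positive_class=1):
    # hypothetical: one-vs-rest precision, recall, F1 and specificity for one class
    cm = np.asarray(cm, dtype=float)
    tp = cm[positive_class, positive_class]
    fn = cm[positive_class, :].sum() - tp
    fp = cm[:, positive_class].sum() - tp
    tn = cm.sum() - tp - fn - fp
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) > 0 else 0.0)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return precision, recall, f1, specificity, tag
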
Example #5
    def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []
        Zeros = []
        with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0
            Ones = []
            for r in csv_file:
                if i != 0:
                    label = int(r[1])
                    #if label == 0:
                    #  Zeros.append(r)
                    All.append(r)
                    # tweet = r[0]
                    # label = r[1]
                    # X_all.append(tweet)
                    # Y_all.append(label)
                i = i + 1

        print("len(All): ", len(All))
        np.random.shuffle(All)

        ones_count = 0
        for r in All:
            tweet = r[0].strip()
            label = int(r[1])
            if (label == 2):
                label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        X_train_sentences = X_all
        Y_train = Y_all
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")
        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 45
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to numpy arrays
        X_train_reverse = []
        for X in X_train_pad:
            t = X[::-1]
            X_train_reverse.append(t)
        X_train = np.array(X_train_pad)
        X_train_reverse = np.array(X_train_reverse)
        Y_train = np.array(Y_train)

        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        #Y_train = to_categorical(Y_train, num_classes=3)
        print("Train data converted to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        NN = TweetSentiment2DCNN2Channel(trim_size, G)
        #NN = TweetSentimentInception(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        #sgd = SGD(lr=0.05)
        NN.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        class_weight = None
        history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32, callbacks=[callback], validation_split=0.20, class_weight=class_weight)
        print("Model trained")
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola", "there is a diarrhea outbreak in the city"]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Reverse = []
        for r in X_Predict_Final:
            t = r[::-1]
            X_Predict_Reverse.append(t)

        X_Predict_Final = np.array(X_Predict_Final)
        X_Predict_Reverse = np.array(X_Predict_Reverse)
        Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
        Preds = ((Preds >= 0.5)*1).flatten()
        print("Predict: ", Preds)
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Plotting")
            self.plot(history)
        print("Done!")
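
The reversed channel built with the Python loops above can also be produced with a single NumPy slice once the padded lists are an array; a small equivalent sketch:

import numpy as np

X_train_pad = [[3, 7, 12, 0], [5, 2, 9, 0]]   # two already padded index sequences
X_train = np.array(X_train_pad)               # shape (2, 4)
X_train_reverse = X_train[:, ::-1]            # reverse every row in one vectorized step
print(X_train_reverse)                        # [[ 0 12  7  3] [ 0  9  2  5]]
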
Example #6
    def process(self,
                json_filename,
                h5_filename,
                prod_json_file,
                prod_h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=50):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []  # array with every row in the file

        # Load all rows into array All
        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r") as f:

            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                All.append(r)

        # Shuffle the rows randomly to prevent order dependencies between runs
        np.random.shuffle(All)

        X_one_All = []
        X_two_All = []
        X_three_All = []

        X_one_aux_All = []
        X_two_aux_All = []
        X_three_aux_All = []

        X_one_aux_disease = []
        X_two_aux_disease = []
        X_three_aux_disease = []

        X_one_aux_label = []
        X_two_aux_label = []
        X_three_aux_label = []

        Y_t1_t2_relevance = []
        Y_t1_t3_relevance = []

        for r in All:
            # Collect_Tweets
            X_one_All.append(r[0].lower().strip())
            X_two_All.append(r[3].lower().strip())
            X_three_All.append(r[6].lower().strip())

            #Collect Aux Info
            #X_one_aux_All.append(r[1:3])
            #X_two_aux_All.append(r[4:6])
            #X_three_aux_All.append(r[7:9])
            X_one_aux_disease.append(r[1])
            X_one_aux_label.append(r[2])

            X_two_aux_disease.append(r[4])
            X_two_aux_label.append(r[5])

            X_three_aux_disease.append(r[7])
            X_three_aux_label.append(r[8])

            # Collect Y's
            Y_t1_t2_relevance.append(r[9])
            Y_t1_t3_relevance.append(r[10])
            Y_all.append(r[11])

        # Convert the data to a form the NN can understand
        num_data = len(All)

        #print("All: ", All)
        print("All: ")
        for r in All:
            print(r)
        print("All.len: ", len(All))

        print("X_one_All: ")
        for r in X_one_All:
            print(r)

        G = Word2VecEmbedding(self.embedding_filename,
                              dimensions=vect_dimensions)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_one_indices, max_len1 = self.map_to_idx(S, X_one_All)

        X_two_indices, max_len2 = self.map_to_idx(S, X_two_All)
        X_three_indices, max_len3 = self.map_to_idx(S, X_three_All)

        #print("X_one_indices: ", X_one_indices)
        print("X_one_indices: ")
        for r in X_one_indices:
            print(r)
        print("max_len1 : ", max_len1)

        for r in X_two_indices:
            print(r)
        print("max_len2 : ", max_len2)
        for r in X_three_indices:
            print(r)
        print("max_len3 : ", max_len3)

        # get max len of all
        max_len = max(max_len1, max_len2)
        max_len = max(max_len, max_len3)
        if max_len % 2 != 0:
            max_len = max_len + 1

        # now pad the 3 sentences with 0 to make them all the same size
        P = PadSentences(max_len)
        X_one_train = P.pad_list(X_one_indices)
        X_two_train = P.pad_list(X_two_indices)
        X_three_train = P.pad_list(X_three_indices)

        # now convert the sentences to np.array
        X_one_train = np.array(X_one_train)
        X_two_train = np.array(X_two_train)
        X_three_train = np.array(X_three_train)

        # change to categorical the disease type and the label
        X_one_aux_train = self.binarize_aux(X_one_aux_disease, X_one_aux_label)
        print('X_one_aux_train.shape: ', X_one_aux_train.shape)
        X_two_aux_train = self.binarize_aux(X_two_aux_disease, X_two_aux_label)
        X_three_aux_train = self.binarize_aux(X_three_aux_disease,
                                              X_three_aux_label)

        # Create the NN
        labels_dim = 2
        diases_dim = 4
        #NN = TweetSimilaryBasic(max_sentence_len=max_len, embedding_builder=G, labels_dim = labels_dim, diases_dim = diases_dim)
        #NN = TweetSimilaryBasicBiDirectional(max_sentence_len=max_len, embedding_builder=G, labels_dim = labels_dim, diases_dim = diases_dim)
        NN = TweetSimilaryConvInception(max_sentence_len=max_len,
                                        embedding_builder=G,
                                        labels_dim=labels_dim,
                                        diases_dim=diases_dim)

        # Build the NN
        NN.build()
        # Summary
        NN.summary()
        # Compile the NN
        #NN.compile(optimizer='rmsprop', loss=['mean_squared_error','mean_squared_error', 'binary_crossentropy'],
        #           metrics=['mse', 'mse','acc'], loss_weight=[ 1., 1., 1.0])

        NN.compile(optimizer='rmsprop',
                   loss={
                       'R1': 'mean_squared_error',
                       'R2': 'mean_squared_error',
                       'FINAL': 'binary_crossentropy'
                   },
                   metrics={
                       'R1': 'mse',
                       'R2': 'mse',
                       'FINAL': 'acc'
                   },
                   loss_weights={
                       'R1': 0.25,
                       'R2': 0.25,
                       'FINAL': 10
                   })

        Y_t1_t2_relevance = np.array(Y_t1_t2_relevance)
        Y_t1_t3_relevance = np.array(Y_t1_t3_relevance)
        Y_all = np.array(Y_all)
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("type(class_weight_val): ", type(class_weight_val))
        print("class_weight_val", class_weight_val)
        final_class_weight_val = {
            'R1': None,
            'R2': None,
            'FINAL': class_weight_val
        }
        print("final_class_weight_val: ", final_class_weight_val)
        history = NN.fit(X=[
            X_one_train, X_two_train, X_three_train, X_one_aux_train,
            X_two_aux_train, X_three_aux_train
        ],
                         Y=[Y_t1_t2_relevance, Y_t1_t3_relevance, Y_all],
                         epochs=epochs,
                         validation_split=0.20,
                         class_weight=final_class_weight_val)

        # Save the model
        NN.save_model_data(json_filename, h5_filename, prod_json_file,
                           prod_h5_filename)
        NN.plot_stats(history)
        #print(history)
        #print(history.history.keys())
        print("Done!")
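
The dictionaries passed to compile() and the class-weight dictionary are keyed by output-layer names, so the network built by TweetSimilaryConvInception must expose outputs named R1, R2 and FINAL. A minimal, stand-alone illustration of that naming convention in the Keras functional API (the tiny architecture below is only a placeholder, not the project's model):

from tensorflow.keras import layers, models

inputs = layers.Input(shape=(20,), name="pair_features")
hidden = layers.Dense(32, activation="relu")(inputs)
r1 = layers.Dense(1, name="R1")(hidden)                              # tweet1-tweet2 relevance
r2 = layers.Dense(1, name="R2")(hidden)                              # tweet1-tweet3 relevance
final = layers.Dense(1, activation="sigmoid", name="FINAL")(hidden)  # overall label

model = models.Model(inputs=inputs, outputs=[r1, r2, final])
model.compile(optimizer="rmsprop",
              loss={"R1": "mean_squared_error",
                    "R2": "mean_squared_error",
                    "FINAL": "binary_crossentropy"},
              metrics={"R1": "mse", "R2": "mse", "FINAL": "acc"},
              loss_weights={"R1": 0.25, "R2": 0.25, "FINAL": 10.0})
model.summary()
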
Example #7
            tweet = r[0]
            label = r[1]
            X_all.append(tweet)
            Y_all.append(label)
        i = i + 1
print("Data Ingested")
num_data = len(X_all)
limit = math.ceil(num_data * 0.60)
X_train_sentences = X_all
Y_train = Y_all
G = GloveEmbedding(embedding_filename)
word_to_idx, idx_to_word, embedding = G.read_embedding()
S = SentenceToIndices(word_to_idx)
X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
print("Train data mapped to indices")
P = PadSentences(max_len)
X_train_pad = P.pad_list(X_train_indices)
print("Train data padded")
# convert to numpy arrays
X_train = np.array(X_train_pad)
Y_train = np.array(Y_train)
Y_train = to_categorical(Y_train, num_classes=3)
print("Train data converted to numpy arrays")
model = KerasClassifier(build_fn=lambda: create_model(G, max_len))  # build_fn expects a callable that returns a compiled model
print("Model created")
# define the grid search parameters
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
print(type(grid))
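
The snippet ends after building the GridSearchCV object; actually running the search and reporting the best configuration would look roughly like this (standard scikit-learn usage, not part of the original example):

grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for mean_score, params in zip(grid_result.cv_results_["mean_test_score"],
                              grid_result.cv_results_["params"]):
    print(mean_score, params)
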
Example #8
def set_prediction(pretain):
    data = pretain[:, 0]
    #Load Model
    model2 = load_model('trained/model1_50d_stoplemma_10e_new_prod.h5',
                        custom_objects={'tf': tf})
    # summarize model.
    model2.summary()
    # Load data
    G = GloveEmbedding("data/glove.twitter.27B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    premise = "same busy and just over the flu so feeling great"
    premise = "when ebola struck the doctors stepped up to the plate and the rest of us sat and watched them do their stuff to all engineers and environmentalists this is our time to step up and find answers to these consequences of our failure to coexist with nature"
    premise = remove_stopwords(premise)
    premise = lemmatizer_spacy(premise)
    x_premise = remove_stopwords(premise)
    x_premise = np.full((len(data)), x_premise)

    x_hypothesis = []

    for row in data:
        #row = row.replace("’", "'")
        #row = fix_text_format(row)
        row = remove_stopwords(row)
        #row = lemmatizer_spacy(row)
        #row = remove_stopwords(row)
        x_hypothesis.append(row)
    x_hypothesis = np.array(x_hypothesis)

    X_one_indices, max_len1 = map_to_idx(S, x_premise)
    X_two_indices, max_len2 = map_to_idx(S, x_hypothesis)
    print("len: ", max_len1, max_len2)
    #max_len = max(max_len1, max_len2)
    max_len = 44
    print("max_len_final: ", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_one_train = P.pad_list(X_one_indices)
    X_two_train = P.pad_list(X_two_indices)
    #X_one_train = Trim.trim_list(X_one_indices)
    X_two_train = Trim.trim_list(X_two_train)

    X_one_train = np.array(X_one_train)
    X_two_train = np.array(X_two_train)

    X_one_aux_disease = set_disease(x_premise)
    X_two_aux_disease = set_disease(x_hypothesis)

    new_dis = []
    for _ in range(len(data)):
        new_dis.append([0, 0, 1, 0, 0, 1])

    X_one_aux_train = new_dis

    X_two_aux_label = pretain[:, 1]

    #X_one_aux_train = binarize_aux(s4, X_one_aux_label)
    X_two_aux_train = binarize_aux(X_two_aux_disease, X_two_aux_label)
    #new_two = []
    #for row in range(len(data)):
    #    new_two.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6]])

    X_two_aux_train = X_two_aux_train.tolist()
    for row in X_two_aux_train:
        del row[6]

    X_one_aux_train = np.array(X_one_aux_train)
    X_two_aux_train = np.array(X_two_aux_train)
    print("one_aux: ", np.array(X_one_aux_train).shape)
    print(X_one_aux_train[:5])
    print("two_aux: ", np.array(X_two_aux_train).shape)
    print(X_two_aux_train[:5])
    model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    #model2.compile(optimizer='rmsprop',loss={'R1': 'mean_squared_error'},metrics={'R1': 'mse'}, loss_weights={'R1': 0.25})
    #model2.compile(optimizer='rmsprop')
    #model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    X_Prediction = model2.predict(
        [X_one_train, X_two_train, X_one_aux_train, X_two_aux_train])

    return X_Prediction
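
set_prediction appears to expect pretain as a two-column array: the tweet text in column 0 and an auxiliary label consumed by binarize_aux in column 1. A hypothetical call (the rows and label encoding below are illustrative assumptions, and the trained model and embedding files referenced inside the function must exist):

import numpy as np

pretain = np.array([
    ["just got over the flu feeling much better now", "1"],
    ["ebola outbreak reported in the region today", "0"],
])
predictions = set_prediction(pretain)
print(predictions)
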
Example #9
def main(model_file, model_weights, labeled_tweets, embedding_filename):
    # load json and create model
    json_file = open(model_file, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    # evaluate loaded model on test data
    loaded_model.compile(loss='binary_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])

    # open the file with tweets
    X_all = []
    Y_all = []
    All = []

    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0

        for r in csv_file:
            if i != 0:
                label = int(r[1])
                if (label == 1) or (label == 2):
                    if ones_count <= 13000:
                        All.append(r)
                        ones_count += 1
                else:
                    All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1

    ones_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if (label == 2):
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count +=1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)

    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    # X_train_sentences = X_all[0: limit]
    # Y_train = Y_all[0: limit]
    # X_test_sentences = X_all[limit:]
    # Y_test = Y_all[limit:]
    # print("Data Divided")
    # Get embedding
    # G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mapped to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1

    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to numpy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    # Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data converted to numpy arrays")
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()
    with open("data/alltweetsanderrors.csv", "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        i = 0
        err_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            if Preds[i] != label:
                err_count += 1
                condition = 0
            else:
                condition = 1

            error_pred = []
            error_pred.append(tweet)
            error_pred.append(label)
            error_pred.append(Preds[i])
            error_pred.append(condition)
            csv_writer.writerow(error_pred)
            i += 1
        print("All tweets: ", i)
        print("Error count: ", err_count)
Example #10
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            for r in csv_file:
                if i != 0:
                    All.append(r)
                    # tweet = r[0]
                    # label = r[1]
                    # X_all.append(tweet)
                    # Y_all.append(label)
                i = i + 1
        np.random.shuffle(All)

        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            X_all.append(tweet)
            Y_all.append(label)
        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        X_train_sentences = X_all
        Y_train = Y_all
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")
        # Get embedding
        G = Word2VecEmbedding(self.embedding_filename,
                              dimensions=vect_dimensions)
        #G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = 40
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to numpy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        #Y_train = to_categorical(Y_train, num_classes=3)
        print("Train data converted to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        NN = TweetSentiment2DCNNv4(trim_size, G)

        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=3,
                 first_dropout=0.01,
                 second_dropout=0.01,
                 padding='valid',
                 dense_units=16)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)
        NN.compile(optimizer='adam',
                   loss="binary_crossentropy",
                   metrics=['accuracy', f1, precision, recall])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=32,
                         callbacks=[callback],
                         validation_split=0.4)
        print("Model trained")
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Plotting")
            self.plot(history)
        print("Done!")
Example #11
 def process(self, json_filename, h5_filename, plot=False, epochs=100):
     np.random.seed(11)
     # open the file with tweets
     X_all = []
     Y_all = []
     with open(self.labeled_tweets_filename, "r",
               encoding="ISO-8859-1") as f:
         i = 0
         csv_file = csv.reader(f, delimiter=',')
         for r in csv_file:
             if i != 0:
                 tweet = r[0]
                 label = r[1]
                 X_all.append(tweet)
                 Y_all.append(label)
             i = i + 1
     print("Data Ingested")
     # divide the data into training and test
     num_data = len(X_all)
     limit = math.ceil(num_data * 0.60)
     X_train_sentences = X_all
     Y_train = Y_all
     # divide the data into X_train, Y_train, X_test, Y_test
     #X_train_sentences = X_all[0: limit]
     #Y_train = Y_all[0: limit]
     #X_test_sentences = X_all[limit:]
     #Y_test = Y_all[limit:]
     #print("Data Divided")
     # Get embedding
     G = GloveEmbedding(self.embedding_filename)
     word_to_idx, idx_to_word, embedding = G.read_embedding()
     S = SentenceToIndices(word_to_idx)
     X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
     print("Train data mapped to indices")
     P = PadSentences(max_len)
     X_train_pad = P.pad_list(X_train_indices)
     print("Train data padded")
     # convert to numpy arrays
     X_train = np.array(X_train_pad)
     Y_train = np.array(Y_train)
     Y_train = to_categorical(Y_train, num_classes=3)
     print("Train data converted to numpy arrays")
     NN = TweetSentiment2LSTM2DenseSM(max_len, G)
     print("model created")
     kernel_regularizer = l2(0.001)
     kernel_regularizer = None
     NN.build(first_layer_units=max_len,
              second_layer_units=max_len,
              relu_dense_layer=5,
              dense_layer_units=3,
              first_layer_dropout=0.3,
              second_layer_dropout=0.6,
              l2=kernel_regularizer)
     print("model built")
     NN.summary()
     sgd = SGD(lr=0.001, momentum=0.09, decay=0.001, nesterov=True)
     rmsprop = RMSprop(decay=0.003)
     adam = Adam(lr=0.1, decay=0.05)
     NN.compile(optimizer=rmsprop,
                loss="categorical_crossentropy",
                metrics=['accuracy', precision, recall, f1, fprate])
     print("model compiled")
     print("Begin training")
     callback = TensorBoard(log_dir="/tmp/logs")
     w_dict = {0: 0.31, 1: 0.63, 2: 0.06}
     history = NN.fit(X_train,
                      Y_train,
                      epochs=epochs,
                      callbacks=[callback],
                      validation_split=0.2,
                      class_weight=w_dict)
     print("Model trained")
     # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
     # print("Test data mapped")
     # X_test_pad = P.pad_list(X_test_indices)
     # print("Test data padded")
     # X_test = np.array(X_test_pad)
     # Y_test = np.array(Y_test)
     # print("Test data converted to numpy arrays")
     # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
     # print("accuracy: ", acc)
     T = "I have a bad case of vomit"
     X_Predict = [
         "my zika is bad", "i love colombia",
         "my has been tested for ebola",
         "there is a diarrhea outbreak in the city"
     ]
     X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
     i = 0
     for s in X_Predict_Idx:
         print(str(i) + ": ", s)
         i = i + 1
     print(X_Predict)
     X_Predict_Final = P.pad_list(X_Predict_Idx)
     #X_Predict = [X_Predict]
     X_Predict_Final = np.array(X_Predict_Final)
     print("Predict: ", NN.predict(X_Predict_Final))
     print("Storing model and weights")
     NN.save_model(json_filename, h5_filename)
     if plot:
         print("Plotting")
         self.plot(history)
     print("Done!")
Example #12
    def process(self, json_filename, h5_filename):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            for r in csv_file:
                if i != 0:
                    tweet = r[0]
                    label = r[1]
                    X_all.append(tweet)
                    Y_all.append(label)
                i = i + 1
        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        # divide the data into X_train, Y_train, X_test, Y_test
        X_train_sentences = X_all[0:limit]
        Y_train = Y_all[0:limit]
        X_test_sentences = X_all[limit:]
        Y_test = Y_all[limit:]
        print("Data Divided")
        # Get embedding
        G = GloveEmbedding(self.embedding_filename)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # Trim
        #trim_size = 40
        #Trim = TrimSentences(trim_size)
        #X_train_pad = Trim.trim_list(X_train_pad)
        # convert to numpy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        print("Train data converted to numpy arrays")
        NN = TweetSentiment2LSTM2Dense(max_len, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)

        print("model created")
        NN.build(first_layer_units=128,
                 dense_layer_units=1,
                 first_layer_dropout=0,
                 second_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.3, momentum=0.001, decay=0.01, nesterov=False)
        adam = Adam(lr=0.03)
        #NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'], optimizer=adam)
        NN.compile(loss="binary_crossentropy",
                   metrics=['binary_accuracy'],
                   optimizer='rmsprop')

        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
        print("Model trained")
        X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        print("Test data mapped")
        X_test_pad = P.pad_list(X_test_indices)
        print("Test data padded")
        X_test = np.array(X_test_pad)
        Y_test = np.array(Y_test)
        print("Test data converted to numpy arrays")
        loss, acc = NN.evaluate(X_test, Y_test)
        print("accuracy: ", acc, ", loss: ", loss)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        #X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        print("Done!")