Пример #1
0
def main():
    """Exercise the embedding helpers on a few hard-coded sentences.

    Loads the GloVe file ``glove.6B.50d.txt``, maps one lower-cased sentence
    to indices, pads it, maps it to an embedding matrix, and finally maps a
    small list of sentences to indices. Every intermediate result is printed.
    """
    glove = GloveEmbedding("glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = glove.read_embedding()

    # Single-sentence round trip: text -> indices -> padded indices.
    text = "I love New York and music locon".lower()
    print("Sentence: ", text)
    indexer = SentenceToIndices(word_to_idx)
    indices = indexer.map_sentence(text)
    print("Sentence to indices: ", indices)
    padded = PadSentences(10).pad(indices)
    print("Padded: ", padded)

    # Same sentence mapped straight to its embedding matrix.
    embedder = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    matrix = embedder.map_sentence(text, max_len=10)
    print("Matrix: ", matrix)
    print("Matrix.shape: ", matrix.shape)
    print("Embedding i: ", embedding[word_to_idx["i"]])

    # Batch mapping of several sentences at once.
    raw_sentences = (
        "I esta malo",
        "Love la musica salsa.",
        "Uff, q mal te va nene",
    )
    sentences = [raw.lower() for raw in raw_sentences]
    mapped, mlen = indexer.map_sentence_list(sentences)
    print("mlen: ", mlen)
    for row in mapped:
        print(row)
Пример #2
0
def set_trained_data(data, NN, embedding_path="data/glove.6B.50d.txt",
                     max_len=72):
    """Predict a class for the text in column 1 of every row of ``data``.

    Args:
        data: iterable of rows; ``row[1]`` is taken as the sentence.
        NN: object exposing ``predict``; its output is reduced with
            ``np.argmax(..., axis=1)``.
        embedding_path: GloVe file handed to ``GloveEmbedding`` (loaded with
            ``dimensions=50``). Defaults to the previously hard-coded path.
        max_len: length every index sequence is padded and trimmed to.
            Defaults to the previously hard-coded value of 72.

    Returns:
        A tuple ``(sentences, predicted_classes)``.
    """
    new_data = [row[1] for row in data]

    G = GloveEmbedding(embedding_path, dimensions=50)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)

    # The longest-sentence length reported by the mapper is deliberately
    # ignored: sequences are forced to the fixed ``max_len`` instead.
    X_Predict_Idx, _ = S.map_sentence_list(new_data)

    print("Max Len", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Final = np.array(X_Predict_Final)
    X_Prediction = NN.predict(X_Predict_Final)
    final = np.argmax(X_Prediction, axis=1)
    return new_data, final
Пример #3
0
    def get_glove_embedding(self):
        """Map ``self.x_all`` to padded, trimmed index sequences.

        Returns:
            A tuple ``(sequences, max_len, glove)`` where ``max_len`` is the
            length reported by ``map_sentence_list`` rounded up to an even
            number, and ``glove`` is the ``GloveEmbedding`` instance built
            from ``self.embedding_filename``.
        """
        glove = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, _idx_to_word, _embedding = glove.read_embedding()
        indexer = SentenceToIndices(word_to_idx)
        indexed, max_len = indexer.map_sentence_list(self.x_all)

        # Bump an odd length up to the next even one.
        if max_len % 2:
            max_len += 1

        padded = PadSentences(max_len).pad_list(indexed)

        # Trim tweets to the same length to remove noisy data.
        trimmed = TrimSentences(max_len).trim_list(padded)

        return trimmed, max_len, glove
Пример #4
0
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=50):
        """Train, evaluate and store a three-class tweet classifier.

        Reads the labelled CSV at ``self.labeled_tweets_filename`` (the first
        row is skipped as a header; column 0 is the tweet, column 1 an
        integer label), shuffles the rows, maps the tweets to padded index
        sequences with the GloVe embedding at ``self.embedding_filename``,
        trains a ``TweetSentiment1D`` model on the first 80% of the rows and
        reports a confusion matrix for the remaining 20%.

        Args:
            json_filename: forwarded to ``NN.save_model``.
            h5_filename: forwarded to ``NN.save_model``.
            plot: when true, ``self.plot(history)`` is called after training.
            epochs: number of epochs handed to ``NN.fit``.
            vect_dimensions: ``dimensions`` argument for ``GloveEmbedding``.
        """
        # Load every data row; the first CSV row is a header.
        with open(self.labeled_tweets_filename, "r") as f:
            csv_file = csv.reader(f, delimiter=',')
            next(csv_file, None)
            All = list(csv_file)

        np.random.shuffle(All)

        X_all = []
        Y_all = []
        zero_count = 0
        ones_count = 0
        two_count = 0
        for r in All:
            label = int(r[1])
            if label == 0:
                zero_count += 1
            elif label == 1:
                ones_count += 1
            else:
                # Anything that is neither 0 nor 1 is tallied as class 2.
                two_count += 1
            X_all.append(r[0])
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        classes = np.unique(Y_all)
        # Keyword arguments: recent scikit-learn releases reject positional
        # ``classes``/``y`` here.
        class_weight_val = class_weight.compute_class_weight(
            'balanced', classes=classes, y=Y_all)
        print("classes: ", classes)
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        # Pair each weight with the class it was computed for rather than
        # assuming exactly the labels 0, 1 and 2 are present.
        class_weight_dictionary = {
            int(cls): weight
            for cls, weight in zip(classes, class_weight_val)
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # The 80/20 split is applied after conversion to arrays (see below).
        limit = math.ceil(len(X_all) * 0.80)

        # Map sentences to embedding indices.
        G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        word_to_idx, _idx_to_word, _embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_all)
        print("Train data mappend to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        trim_size = max_len
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])

        # Convert to numpy arrays.
        X_data = np.array(X_train_pad)
        Y_labels = np.array(Y_all)
        # Report the per-class tallies gathered above; counting non-zero
        # labels would lump classes 1 and 2 together.
        print("ones count: ", ones_count)
        print("zeros count: ", zero_count)
        print("two count: ", two_count)
        Y_onehot = to_categorical(Y_labels, num_classes=3)

        # Divide the data: one-hot targets for training, integer labels for
        # the confusion matrix on the held-out part.
        X_test_text = X_all[limit:]
        X_test = X_data[limit:]
        Y_test = Y_labels[limit:]
        X_train = X_data[0:limit]
        Y_train = Y_onehot[0:limit]
        print("data divided on value: ", limit)
        print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        NN = TweetSentiment1D(trim_size, G)

        print("model created")
        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)
        print("model built")
        NN.summary()
        adam = Adam(lr=0.0003, decay=0.001)
        NN.compile(optimizer=adam,
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])

        print("model compiled")
        print("Begin training")
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=64,
                         class_weight=class_weight_dictionary,
                         validation_split=0.2)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", len(X_test))
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, _spec_1, _t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        # Sanity-check the trained model on a few hand-written sentences.
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
        for i, s in enumerate(X_Predict_Idx):
            print(str(i) + ": ", s)
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        X_Predict_Final = np.array(X_Predict_Final)
        # axis=1 yields one predicted class per sentence; without it argmax
        # returns a single index into the flattened probability matrix.
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final), axis=1))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Пример #5
0
    def process(self, json_filename, h5_filename, plot=False, epochs = 100, vect_dimensions = 100):
        """Train and store a binary two-channel CNN tweet classifier.

        Reads the labelled CSV at ``self.labeled_tweets_filename`` (first row
        skipped as a header; column 0 is the tweet, column 1 an integer
        label), shuffles it with a fixed seed, folds label 2 into label 0,
        and trains ``TweetSentiment2DCNN2Channel`` on each index sequence
        together with its reversed copy.

        Args:
            json_filename: forwarded to ``NN.save_model``.
            h5_filename: forwarded to ``NN.save_model``.
            plot: when true, ``self.plot(history)`` is called after training.
            epochs: number of epochs handed to ``NN.fit``.
            vect_dimensions: currently unused; the embedding is always loaded
                with ``dimensions=50``.
        """
        np.random.seed(11)

        # Load every data row; the first CSV row is a header.
        with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
            csv_file = csv.reader(f, delimiter=',')
            next(csv_file, None)
            All = list(csv_file)

        print("len(All): ", len(All))
        np.random.shuffle(All)

        X_all = []
        Y_all = []
        for r in All:
            label = int(r[1])
            # Collapse the three-way labelling into a binary one: 2 -> 0.
            if label == 2:
                label = 0
            X_all.append(r[0].strip())
            Y_all.append(label)

        print("Data Ingested")

        # Map sentences to embedding indices.
        G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, _idx_to_word, _embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_all)
        print("Train data mappend to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        trim_size = max_len
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])

        # Second input channel: every sequence in reverse order.
        X_train_reverse = np.array([seq[::-1] for seq in X_train_pad])
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_all)

        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("Train data convert to numpy arrays")
        NN = TweetSentiment2DCNN2Channel(trim_size, G)

        print("model created")
        NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
        print("model built")
        NN.summary()
        rmsprop = RMSprop(decay=0.003)
        NN.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32, callbacks=[callback], validation_split=0.20, class_weight=None)
        print("Model trained")

        # Sanity-check the trained model on a few hand-written sentences.
        X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola", "there is a diarrhea outbreak in the city"]
        X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
        for i, s in enumerate(X_Predict_Idx):
            print(str(i)+ ": ", s)
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        X_Predict_Reverse = np.array([seq[::-1] for seq in X_Predict_Final])
        X_Predict_Final = np.array(X_Predict_Final)
        Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
        # Threshold the model outputs at 0.5 to obtain 0/1 classes.
        Preds = ((Preds >= 0.5)*1).flatten()
        print("Predict: ", Preds)
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Пример #6
0
    # NOTE(review): the statement that opens the CSV and initialises `i`,
    # `X_all` and `Y_all` lies outside this excerpt.
    # Collect tweet text (column 0) and label (column 1) from every row
    # except the first one (i == 0), which is skipped.
    for r in csv_file:
        if i != 0:
            tweet = r[0]
            label = r[1]
            X_all.append(tweet)
            Y_all.append(label)
        i = i + 1
print("Data Ingested")
num_data = len(X_all)
# `limit` is computed but not used in the visible part of the script.
limit = math.ceil(num_data * 0.60)
# The whole data set is used for training; no split is made here.
X_train_sentences = X_all
Y_train = Y_all
G = GloveEmbedding(embedding_filename)
word_to_idx, idx_to_word, embedding = G.read_embedding()
S = SentenceToIndices(word_to_idx)
# Map every sentence to a list of indices; max_len is also returned by the
# mapper and is used as the padding length below.
X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
print("Train data mappend to indices")
P = PadSentences(max_len)
X_train_pad = P.pad_list(X_train_indices)
print("Train data padded")
# Convert to numpy arrays and one-hot encode the labels into 3 classes.
X_train = np.array(X_train_pad)
Y_train = np.array(Y_train)
Y_train = to_categorical(Y_train, num_classes=3)
print("Train data convert to numpy arrays")
# NOTE(review): build_fn receives the *result* of create_model(G, max_len);
# confirm that create_model returns a callable (or a model, if the wrapper
# version in use accepts one).
model = KerasClassifier(build_fn=create_model(G, max_len))
print("Model created")
# Define the grid-search parameters: every batch size is combined with every
# epoch count.
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
Пример #7
0
def main(model_file, model_weights, labeled_tweets, embedding_filename):
    """Run a stored binary model over labelled tweets and log every result.

    Loads the model architecture from ``model_file`` (JSON) and its weights
    from ``model_weights``, predicts a 0/1 class for the tweets in
    ``labeled_tweets`` and writes one row per tweet
    (tweet, label, prediction, 1 if correct else 0) to
    ``data/alltweetsanderrors.csv``.

    Args:
        model_file: path of the JSON model description.
        model_weights: path handed to ``load_weights``.
        labeled_tweets: CSV path; first row is skipped as a header, column 0
            is the tweet and column 1 an integer label.
        embedding_filename: GloVe file for ``GloveEmbedding``
            (``dimensions=50``).
    """
    # Load the architecture; the context manager closes the file even if
    # reading fails.
    with open(model_file, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    loaded_model.compile(loss='binary_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])

    # Read the labelled tweets, capping how many rows labelled 1 or 2 are
    # kept; rows with any other label are always kept.
    All = []
    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        csv_file = csv.reader(f, delimiter=',')
        next(csv_file, None)
        ones_count = 0
        for r in csv_file:
            if int(r[1]) in (1, 2):
                if ones_count <= 13000:
                    All.append(r)
                    ones_count += 1
            else:
                All.append(r)

    X_all = []
    Y_all = []
    for r in All:
        label = int(r[1])
        # Collapse the three-way labelling into a binary one: 2 -> 0.
        if label == 2:
            label = 0
        X_all.append(r[0])
        Y_all.append(label)

    print("Data Ingested")

    # Map sentences to embedding indices.
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, _idx_to_word, _embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_all)
    print("Train data mappend to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1

    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    Trim = TrimSentences(max_len)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])

    # Convert to numpy arrays.
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_all)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    print("Train data convert to numpy arrays")

    Preds = loaded_model.predict(X_train)
    # Threshold the model outputs at 0.5 to obtain 0/1 classes.
    Preds = ((Preds >= 0.5) * 1).flatten()

    # newline="" is what the csv module expects for files it writes to;
    # without it rows gain an extra blank line on Windows.
    with open("data/alltweetsanderrors.csv", "w", newline="") as f:
        csv_writer = csv.writer(f, delimiter=",")
        err_count = 0
        for tweet, label, pred in zip(X_all, Y_all, Preds):
            if pred != label:
                err_count += 1
                condition = 0
            else:
                condition = 1
            csv_writer.writerow([tweet, label, pred, condition])
        print("All tweets: ", len(X_all))
        print("Error count: ", err_count)
Пример #8
0
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        """Train and store a binary CNN tweet classifier on Word2Vec vectors.

        Reads the labelled CSV at ``self.labeled_tweets_filename`` (first row
        skipped as a header; column 0 is the tweet, column 1 an integer
        label), shuffles it with a fixed seed, folds label 2 into label 0 and
        trains ``TweetSentiment2DCNNv4`` on sequences trimmed to 40 tokens.

        Args:
            json_filename: forwarded to ``NN.save_model``.
            h5_filename: forwarded to ``NN.save_model``.
            plot: when true, ``self.plot(history)`` is called after training.
            epochs: number of epochs handed to ``NN.fit``.
            vect_dimensions: ``dimensions`` argument for ``Word2VecEmbedding``.
        """
        np.random.seed(11)

        # Load every data row; the first CSV row is a header.
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            csv_file = csv.reader(f, delimiter=',')
            next(csv_file, None)
            All = list(csv_file)
        np.random.shuffle(All)

        X_all = []
        Y_all = []
        for r in All:
            # Keep labels numeric so the target array has an integer dtype
            # instead of a string dtype.
            label = int(r[1])
            if label == 2:
                label = 0
            X_all.append(r[0])
            Y_all.append(label)
        print("Data Ingested")

        # Map sentences to embedding indices.
        G = Word2VecEmbedding(self.embedding_filename,
                              dimensions=vect_dimensions)
        word_to_idx, _idx_to_word, _embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_all)
        print("Train data mappend to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # NOTE(review): sequences are padded to max_len and then trimmed to a
        # fixed 40; confirm the data always yields max_len >= 40, otherwise
        # the model input length and the data width may disagree.
        trim_size = 40
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])

        # Convert to numpy arrays.
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_all)
        print("Train data convert to numpy arrays")
        NN = TweetSentiment2DCNNv4(trim_size, G)

        print("model created")
        NN.build(filters=3,
                 first_dropout=0.01,
                 second_dropout=0.01,
                 padding='valid',
                 dense_units=16)
        print("model built")
        NN.summary()
        NN.compile(optimizer='adam',
                   loss="binary_crossentropy",
                   metrics=['accuracy', f1, precision, recall])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=32,
                         callbacks=[callback],
                         validation_split=0.4)
        print("Model trained")

        # Sanity-check the trained model on a few hand-written sentences.
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
        for i, s in enumerate(X_Predict_Idx):
            print(str(i) + ": ", s)
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Пример #9
0
 def process(self, json_filename, h5_filename, plot=False, epochs=100):
     """Train and store a three-class two-layer LSTM tweet classifier.

     Reads tweets (column 0) and labels (column 1) from the CSV at
     ``self.labeled_tweets_filename``, skipping its first row, maps the
     tweets to padded index sequences with the GloVe embedding at
     ``self.embedding_filename`` and fits ``TweetSentiment2LSTM2DenseSM``
     on one-hot labels. The model is saved via ``NN.save_model`` and the
     training history is plotted when ``plot`` is true.
     """
     np.random.seed(11)

     # Ingest the labelled tweets; row 0 is skipped.
     tweets = []
     labels = []
     with open(self.labeled_tweets_filename, "r",
               encoding="ISO-8859-1") as handle:
         reader = csv.reader(handle, delimiter=',')
         for row_number, row in enumerate(reader):
             if row_number == 0:
                 continue
             tweets.append(row[0])
             labels.append(row[1])
     print("Data Ingested")

     # Computed as in the original; no split is actually performed and the
     # whole data set is used for training.
     _limit = math.ceil(len(tweets) * 0.60)

     # Sentences -> index sequences -> padded sequences.
     glove = GloveEmbedding(self.embedding_filename)
     word_to_idx, _idx_to_word, _embedding = glove.read_embedding()
     indexer = SentenceToIndices(word_to_idx)
     train_indices, max_len = indexer.map_sentence_list(tweets)
     print("Train data mappend to indices")
     padder = PadSentences(max_len)
     train_padded = padder.pad_list(train_indices)
     print("Train data padded")

     # Numpy conversion and one-hot encoding of the three classes.
     X_train = np.array(train_padded)
     Y_train = to_categorical(np.array(labels), num_classes=3)
     print("Train data convert to numpy arrays")

     NN = TweetSentiment2LSTM2DenseSM(max_len, glove)
     print("model created")
     # A regularizer is constructed and then discarded: the model is built
     # with l2=None.
     _discarded_regularizer = l2(0.001)
     build_options = dict(
         first_layer_units=max_len,
         second_layer_units=max_len,
         relu_dense_layer=5,
         dense_layer_units=3,
         first_layer_dropout=0.3,
         second_layer_dropout=0.6,
         l2=None,
     )
     NN.build(**build_options)
     print("model built")
     NN.summary()

     # Only the RMSprop instance is passed to compile; the other two
     # optimizers are constructed but not used.
     _unused_sgd = SGD(lr=0.001, momentum=0.09, decay=0.001, nesterov=True)
     rmsprop = RMSprop(decay=0.003)
     _unused_adam = Adam(lr=0.1, decay=0.05)
     tracked_metrics = ['accuracy', precision, recall, f1, fprate]
     NN.compile(optimizer=rmsprop,
                loss="categorical_crossentropy",
                metrics=tracked_metrics)
     print("model compiled")

     print("Begin training")
     tensorboard = TensorBoard(log_dir="/tmp/logs")
     history = NN.fit(X_train,
                      Y_train,
                      epochs=epochs,
                      callbacks=[tensorboard],
                      validation_split=0.2,
                      class_weight={0: 0.31, 1: 0.63, 2: 0.06})
     print("Model trained")

     # Try the trained model on a handful of hand-written sentences.
     X_Predict = [
         "my zika is bad", "i love colombia",
         "my has been tested for ebola",
         "there is a diarrhea outbreak in the city"
     ]
     predict_indices, _max_len2 = indexer.map_sentence_list(X_Predict)
     for position, sequence in enumerate(predict_indices):
         print(str(position) + ": ", sequence)
     print(X_Predict)
     predict_input = np.array(padder.pad_list(predict_indices))
     print("Predict: ", NN.predict(predict_input))

     print("Storing model and weights")
     NN.save_model(json_filename, h5_filename)
     if plot:
         print("Ploting")
         self.plot(history)
     print("Done!")
Пример #10
0
    def process(self, json_filename, h5_filename):
        """Train, evaluate and store a binary LSTM tweet classifier.

        Reads tweets (column 0) and integer labels (column 1) from the CSV at
        ``self.labeled_tweets_filename``, skipping its first row. The first
        60% of the rows (in file order) train ``TweetSentiment2LSTM2Dense``
        for 5 epochs; the remaining 40% are used for ``NN.evaluate``.

        Args:
            json_filename: forwarded to ``NN.save_model``.
            h5_filename: forwarded to ``NN.save_model``.
        """
        np.random.seed(11)

        # Ingest the labelled tweets; the first CSV row is a header.
        X_all = []
        Y_all = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            csv_file = csv.reader(f, delimiter=',')
            next(csv_file, None)
            for r in csv_file:
                X_all.append(r[0])
                # Keep labels numeric so the target arrays have an integer
                # dtype instead of a string dtype.
                Y_all.append(int(r[1]))
        print("Data Ingested")

        # Divide the data into X_train, Y_train, X_test, Y_test.
        limit = math.ceil(len(X_all) * 0.60)
        X_train_sentences = X_all[0:limit]
        Y_train = Y_all[0:limit]
        X_test_sentences = X_all[limit:]
        Y_test = Y_all[limit:]
        print("Data Divided")

        # Map training sentences to embedding indices and pad them.
        G = GloveEmbedding(self.embedding_filename)
        word_to_idx, _idx_to_word, _embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")

        # Convert to numpy arrays.
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        print("Train data convert to numpy arrays")
        NN = TweetSentiment2LSTM2Dense(max_len, G)

        print("model created")
        NN.build(first_layer_units=128,
                 dense_layer_units=1,
                 first_layer_dropout=0,
                 second_layer_dropout=0)
        print("model built")
        NN.summary()
        NN.compile(loss="binary_crossentropy",
                   metrics=['binary_accuracy'],
                   optimizer='rmsprop')

        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
        print("Model trained")

        # The test set's own longest-sentence length is discarded so that the
        # training ``max_len`` (the model's input length) is not overwritten.
        # NOTE(review): test sentences are padded with the training-length
        # padder and never trimmed; confirm PadSentences copes with inputs
        # longer than max_len.
        X_test_indices, _ = S.map_sentence_list(X_test_sentences)
        print("Test data mapped")
        X_test_pad = P.pad_list(X_test_indices)
        print("Test data padded")
        X_test = np.array(X_test_pad)
        Y_test = np.array(Y_test)
        print("Test data converted to numpy arrays")
        loss, acc = NN.evaluate(X_test, Y_test)
        print("accuracy: ", acc, ", loss: ", loss)

        # Sanity-check the trained model on a few hand-written sentences.
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, _ = S.map_sentence_list(X_Predict)
        for i, s in enumerate(X_Predict_Idx):
            print(str(i) + ": ", s)
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        print("Done!")