Example No. 1
from classifier.preprocessing import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split


def data():
    docs, y, links = preprocessing("../crawler/clcjsondata.txt").process()
    # integer-encode the documents: each word becomes its vocabulary index
    t = Tokenizer()
    t.fit_on_texts(docs)
    X = t.texts_to_sequences(docs)
    # expose the vocabulary size to the model-building code;
    # +1 because Keras reserves index 0 for padding
    global vocab_size
    vocab_size = len(t.word_index) + 1
    # one-hot encode the 10 class labels for categorical cross-entropy
    one_hot_y = to_categorical(y, num_classes=10)
    # hold out 20% for test, then 20% of the remainder (16% overall) for validation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        one_hot_y,
                                                        test_size=0.2,
                                                        random_state=777)
    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=777)
    # truncate and pad input sequences
    max_sequence_length = 18
    X_train = sequence.pad_sequences(X_train,
                                     maxlen=max_sequence_length,
                                     padding="post")
    X_test = sequence.pad_sequences(X_test,
                                    maxlen=max_sequence_length,
                                    padding="post")
    X_val = sequence.pad_sequences(X_val,
                                   maxlen=max_sequence_length,
                                   padding="post")
    return X_train, X_val, X_test, y_train, y_val, y_test
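A minimal usage sketch, not part of the original example: the unpacking order follows the return signature of data() above, and the printed shapes follow from max_sequence_length=18 and num_classes=10.

X_train, X_val, X_test, y_train, y_val, y_test = data()
print(X_train.shape, y_train.shape)  # (n_samples, 18), (n_samples, 10)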
Example No. 2
from classifier.preprocessing import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical


def data():
    paras, y, links = preprocessing("../crawler/clcjsondata.txt").process()
    # TF-IDF features instead of integer sequences; X is a SciPy sparse matrix
    v = TfidfVectorizer()
    X = v.fit_transform(paras)
    # one-hot encode the 10 class labels
    y = to_categorical(y, num_classes=10)

    # hold out 20% for test, then 20% of the remainder for validation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=777)

    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=777)

    return X_train, X_val, X_test, y_train, y_val, y_test
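A hedged note, not in the original: TfidfVectorizer returns a SciPy sparse matrix, which Keras Dense layers do not accept directly, so the splits need densifying first. A minimal sketch of how this data() could feed a feed-forward classifier (the layer sizes here are illustrative, not from the source):

from keras.models import Sequential
from keras.layers import Dense

X_train, X_val, X_test, y_train, y_val, y_test = data()
X_train, X_val = X_train.toarray(), X_val.toarray()  # densify for Keras

model = Sequential()
model.add(Dense(256, activation="relu", input_shape=(X_train.shape[1],)))
model.add(Dense(10, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop",
              metrics=["accuracy"])
model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=10, batch_size=32)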
Example No. 3
def create_model(dropout_rate, embedding_vector_length, max_sequence_length,
                 optimizer, vocab_size):
    # the original snippet starts mid-function; this header is reconstructed
    # (the name create_model is hypothetical) from the parameters used below
    # and the keys of the params grid in the __main__ block
    model = Sequential()
    # map each word index to an embedding_vector_length-dimensional vector
    model.add(Embedding(vocab_size, embedding_vector_length,
                        input_length=max_sequence_length))
    # model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='tanh'))
    # model.add(MaxPooling1D(pool_size=2))
    # model.add(Flatten())

    # single LSTM layer; recurrent_dropout regularizes the recurrent state
    model.add(LSTM(100, recurrent_dropout=dropout_rate))
    # model.add(LSTM(units=200,activation='relu'))
    # 10-way softmax output, one unit per class
    model.add(Dense(units=10, activation="softmax", kernel_initializer="uniform"))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model
if __name__ == "__main__":
    # preprocess data
    pt = preprocessing("../crawler/clcjsondata.txt")
    docs, labels, y = pt.process()
    y_onehot = to_categorical(y, num_classes=10)
    t = Tokenizer()
    t.fit_on_texts(docs)
    X = t.texts_to_sequences(docs)
    vocab_size = len(t.word_index) + 1
    X_trn, X_tst, y_trn, y_tst = train_test_split(X, y_onehot,
                                                  random_state=32, test_size=0.2)
    # truncate and pad sequences to a fixed length
    max_sequence_length = 40
    X_trn = sequence.pad_sequences(X_trn, maxlen=max_sequence_length, padding="post")
    X_tst = sequence.pad_sequences(X_tst, maxlen=max_sequence_length, padding="post")
    # hyperparameter grid for model selection
    params = dict(
        dropout_rate=[0.5],
        embedding_vector_length=[400, 600],
        max_sequence_length=[max_sequence_length],
        optimizer=["rmsprop"],
        vocab_size=[vocab_size],
        batch_size=[32, 64],
        epochs=[32])
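The snippet ends after building the grid; a plausible continuation (a sketch, not the original code) wires params into scikit-learn's grid search through the Keras wrapper:

    from keras.wrappers.scikit_learn import KerasClassifier
    from sklearn.model_selection import GridSearchCV

    # exhaustively evaluate every combination in params with 3-fold CV
    clf = KerasClassifier(build_fn=create_model, verbose=1)
    grid = GridSearchCV(estimator=clf, param_grid=params, cv=3)
    result = grid.fit(X_trn, y_trn)
    print(result.best_score_, result.best_params_)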
Example No. 4
from classifier.preprocessing import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.optimizers import SGD
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.preprocessing.text import one_hot, Tokenizer
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
# pre-process data
docs, y, links = preprocessing("../crawler/clcjsondata.txt").process()
# integer encode documents
t = Tokenizer()
t.fit_on_texts(docs)
X = t.texts_to_sequences(docs)
one_hot_y = to_categorical(y, num_classes=10)
X_trn, X_tst, y_trn, y_tst = train_test_split(X,
                                              one_hot_y,
                                              test_size=0.2,
                                              random_state=32)

# truncate and pad input sequences
max_sequence_length = 18
X_trn = sequence.pad_sequences(X_trn,
                               maxlen=max_sequence_length,
                               padding="post")