Example #1
File: 16-7.py Project: DemonZeros/1book
def do_rnn(x_train,x_test,y_train,y_test):
    global n_words
    # Data preprocessing
    # Sequence padding
    print "GET n_words embedding %d" % n_words


    #x_train = pad_sequences(x_train, maxlen=100, value=0.)
    #x_test = pad_sequences(x_test, maxlen=100, value=0.)
    # Converting labels to binary vectors
    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 100,n_words])
    net = tflearn.lstm(net, 10,  return_seq=True)
    net = tflearn.lstm(net, 10, )
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,name="output",
                             loss='categorical_crossentropy')

    # Training

    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(x_train, y_train, validation_set=(x_test, y_test), show_metric=True,
             batch_size=32,run_id="maidou")
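Note that this first variant declares input_data(shape=[None, 100, n_words]) and has no embedding layer, so the commented-out pad_sequences calls are not enough on their own: the caller must supply 3-D, already-vectorised sequences. A minimal sketch of that conversion, assuming padded integer sequences and a hypothetical helper name:

import numpy as np

def to_one_hot_sequences(seqs, seq_len, n_words):
    # Build the (samples, seq_len, n_words) tensor expected by the
    # input_data layer above; seqs holds padded integer word indices.
    out = np.zeros((len(seqs), seq_len, n_words), dtype=np.float32)
    for row, seq in enumerate(seqs):
        for col, idx in enumerate(seq[:seq_len]):
            out[row, col, int(idx)] = 1.0
    return out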
Example #2
def do_rnn(x,y):
    global max_document_length
    print "RNN"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=10,run_id="webshell",n_epoch=5)

    y_predict_list=model.predict(testX)
    y_predict=[]
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)
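The x passed to do_rnn here is a matrix of word indices built elsewhere in the project. A hypothetical preparation step, assuming tflearn's VocabularyProcessor and placeholder data (the real corpus comes from webshell and normal script files):

import numpy as np
from tflearn.data_utils import VocabularyProcessor

max_document_length = 100                                # global consumed by do_rnn above
texts = ["eval base64_decode foo", "echo hello world"]   # placeholder corpus
labels = [1, 0]                                          # 1 = webshell, 0 = normal

vp = VocabularyProcessor(max_document_length)
x = np.array(list(vp.fit_transform(texts)))
y = np.array(labels)
# do_rnn(x, y)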
Example #3
File: 16-3.py Project: DemonZeros/1book
def do_rnn(trainX, testX, trainY, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    print "GET n_words embedding %d" % n_words


    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, MAX_DOCUMENT_LENGTH])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training



    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")
Example #4
def do_rnn(trainX, testX, trainY, testY):
    max_document_length=64
    y_test=testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0,tensorboard_dir="dga_log")
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="dga",n_epoch=1)

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)
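The per-row thresholding loop above can also be written with argmax: for a two-class softmax output, column 0 is above 0.5 exactly when it is the larger of the two probabilities. A vectorised equivalent:

import numpy as np

y_predict = np.argmax(y_predict_list, axis=1).tolist()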
Example #5
def do_cnn_doc2vec(trainX, testX, trainY, testY):
    global max_features
    print "CNN and doc2vec"

    #trainX = pad_sequences(trainX, maxlen=max_features, value=0.)
    #testX = pad_sequences(testX, maxlen=max_features, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_features], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128,validate_indices=False)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100,run_id="review")
Example #6
File: 17-2.py Project: DemonZeros/1book
def  do_cnn(trainX, trainY,testX, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words+1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch = 20, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #7
def do_cnn(x,y):
    global max_document_length
    print "CNN and tf"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    #if not os.path.exists(pkl_file):
        # Training
    model.fit(trainX, trainY,
                  n_epoch=5, shuffle=True, validation_set=0.1,
                  show_metric=True, batch_size=100,run_id="webshell")
    #    model.save(pkl_file)
    #else:
    #    model.load(pkl_file)

    y_predict_list=model.predict(testX)
    #y_predict = list(model.predict(testX,as_iterable=True))

    y_predict=[]
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)
    print 'y_predict_list:'
    print y_predict_list
    print 'y_predict:'
    print  y_predict
    #print  y_test

    do_metrics(y_test, y_predict)
Example #8
def process_form_data(filename) :
    data = h5py.File(filename, 'r')
    output = h5py.File('forms_out.h5', 'w')

    test_image = output.create_dataset('test_image', (330, 3, 256, 256), dtype=np.uint8)
    train_image = output.create_dataset('train_image', (770, 3, 256, 256), dtype=np.uint8)
    test_label  = output.create_dataset('test_label', (330,11), dtype=np.int8)
    train_label  = output.create_dataset('train_label', (770,11), dtype=np.int8)

    image, labels = shuffle(data['image'], data['form'])

    onehot_labels = to_categorical(labels, 11)


    count = {}
    train_count = 0
    test_count = 0
    for i, l in enumerate(labels) :

        if l not in count :
            count[l] = 0

        if count[l] > 29 :
            train_image[train_count] = image[i]
            train_label[train_count] = onehot_labels[i]
            train_count += 1

        else :
            test_image[test_count] = image[i]
            test_label[test_count] = onehot_labels[i]
            test_count += 1

        count[l] += 1

    output.close()
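For reference, the datasets written to forms_out.h5 above can be read back with h5py as follows (a hypothetical usage sketch):

import h5py

with h5py.File('forms_out.h5', 'r') as f:
    train_image = f['train_image'][:]   # (770, 3, 256, 256) uint8
    train_label = f['train_label'][:]   # (770, 11) one-hot int8
    test_image = f['test_image'][:]     # (330, 3, 256, 256)
    test_label = f['test_label'][:]     # (330, 11)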
Example #9
File: 16-5.py Project: DemonZeros/1book
def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding

    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old=testY
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print "GET max_sequences_len embedding %d" % max_sequences_len
    print "GET max_sys_call embedding %d" % max_sys_call

    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call+1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training



    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        #print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    #y_predict=to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print metrics.confusion_matrix(testY_old, y_predict)
Example #10
def get_error(model,f,t,label):
    pred_probs=[model.predict(f[i*500:min((i+1)*500,len(f))]) for i in range(int(len(f)/500)+1)]
    y=[val for sublist in pred_probs for val in list(sublist)]
    # y=model.predict(f)
    yy=np.argmax(y,axis=1)
    acc=accuracy_score(t,to_categorical(yy,6))
    get_statistics(y, yy, t, label)
    return 1-acc
Example #11
def do_cnn_word2vec_2d(trainX, testX, trainY, testY):
    global max_features
    global max_document_length
    print "CNN and word2vec2d"
    y_test = testY
    #trainX = pad_sequences(trainX, maxlen=max_features, value=0.)
    #testX = pad_sequences(testX, maxlen=max_features, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length,max_features,1], name='input')

    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = fully_connected(network, 128, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 256, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.01,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True,run_id="sms")

    y_predict_list = model.predict(testX)
    print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)
Example #12
def get_data(filename, num_frames, num_classes, input_length):
    """Get the data from our saved predictions or pooled features."""

    # Local vars.
    X = []
    y = []
    temp_list = deque()

    classes = get_classes()

    # Open and get the features.
    with open(filename, 'rb') as fin:
        frames = pickle.load(fin)

        print(f"Frames {len(frames)}")
        print(f"Frame tipo: {frames[0]}")

        for frame in frames:
            features = frame[0]
            actual = frame[1]

            # Add to the queue.
            if len(temp_list) == num_frames - 1:
                temp_list.append(features)
                flat = list(temp_list)
                X.append(np.array(flat))
                y.append(
                    classes.index(actual))  # Convert our labels into integer.
                temp_list.popleft()
            else:
                temp_list.append(features)
                continue

    print("Total dataset size: %d" % len(X))

    # Numpy.
    X = np.array(X)
    y = np.array(y)

    print(f"X {X.shape}")
    print(f"y {y.shape}")

    # Reshape.
    X = X.reshape(-1, num_frames, input_length)

    # One-hot encoded categoricals.
    y = to_categorical(y, num_classes)

    print(f"X {X.shape}")
    print(f"y {y.shape}")

    # Split into train and test.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    return X_train, X_test, y_train, y_test
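The deque above implements an overlapping sliding window of num_frames consecutive feature vectors. A toy illustration of just that windowing step, with made-up shapes:

from collections import deque
import numpy as np

num_frames = 3
frames = [np.full(4, i) for i in range(5)]   # five fake per-frame feature vectors
window, X = deque(), []
for f in frames:
    window.append(f)
    if len(window) == num_frames:            # same condition as above, after the append
        X.append(np.array(window))
        window.popleft()                     # slide forward by one frame
print(len(X), X[0].shape)                    # 3 windows, each of shape (3, 4)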
Example #13
def prep_train_test(n, dev_pct): 
    np.random.seed(87)
    shuffle_indices = np.random.permutation(np.arange(n))
    split = int(n*dev_pct)
    return shuffle_indices[split:], shuffle_indices[:split]
Example #14
def prepare_data(data):
    # acquiring the tokenizer and tokenizing strings related to the action made
    # TODO this could probably be reworked into a dialog choice using Tkinter, but I didn't find such a need in my case
    if os.path.isfile('tokenizer.pickle'):
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
    # if a tokenizer isn't found, a new one is created
    else:
        tokenizer = tfds.features.text.Tokenizer()
        # saving tokenizer for backup
        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    vocabulary_set = set()
    # creating an additional copy of the data to avoid IndexError exceptions
    data_copy = []
    # tokenizing actions from the data sets and creating a vocabulary set
    for i in data:
        some_tokens = tokenizer.tokenize(i[1][1])
        vocabulary_set.update(some_tokens)
    with open('vocabulary.pickle', 'wb') as handle:
        pickle.dump(vocabulary_set, handle, protocol=pickle.HIGHEST_PROTOCOL)
    encoder = prepare_encoder(vocabulary_set)
    # saving
    for i in data:
        # i is in format [image array,[(x_position,y_position), action]] before this processing
        # Preparing new set of processed data. Getting previous screen array, encoding string and
        # getting screen position data from (x,y) tuple to [x,y] array
        screen = i[0]
        # token is a 1 element list with int as tokenized string value.
        token = encoder.encode(i[1][1])[0]
        screen_position_tuple = i[1][0]
        x_value = screen_position_tuple[0]
        y_value = screen_position_tuple[1]
        # creating a row of preprocessed data and appending new array
        new_data = (screen, [x_value, y_value, token])
        data_copy.append(new_data)
    a = []
    for i in data_copy:
        a.append(i[1][2])
    a = data_utils.to_categorical(a, encoder.vocab_size)  # capture the result; to_categorical does not modify in place
    index = 0
    for i in data_copy:
        i[1][2] = a[index]
        index += 1
    return data_copy
Example #15
def prep_train_test(pos_x, neg_x, dev_ratio):
    """
    Build the training and test sets.
    :param pos_x:
    :param neg_x:
    :param dev_ratio: fraction of samples held out for the test set
    :return:
    """
    np.random.seed(10) 
    shuffle_indices = np.random.permutation(np.arange(len(pos_x)))
    pos_x_shuffled = pos_x[shuffle_indices]
    dev_idx = -1 * int(dev_ratio * float(len(pos_x)))
    pos_train = pos_x_shuffled[:dev_idx]
    pos_test = pos_x_shuffled[dev_idx:]

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(neg_x)))
    neg_x_shuffled = neg_x[shuffle_indices]
    dev_idx = -1 * int(dev_ratio * float(len(neg_x)))
    neg_train = neg_x_shuffled[:dev_idx]
    neg_test = neg_x_shuffled[dev_idx:] 

    x_train = np.array(list(pos_train) + list(neg_train))
    y_train = len(pos_train)*[1] + len(neg_train)*[0]
    x_test = np.array(list(pos_test) + list(neg_test))
    y_test = len(pos_test)*[1] + len(neg_test)*[0]
    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2) 

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    np.random.seed(10) 
    shuffle_indices = np.random.permutation(np.arange(len(x_test)))
    x_test = x_test[shuffle_indices]
    y_test = y_test[shuffle_indices] 
    
    print("Train Mal/Ben split: {}/{}".format(len(pos_train), len(neg_train)))
    print("Test Mal/Ben split: {}/{}".format(len(pos_test), len(neg_test)))
    print("Train/Test split: {}/{}".format(len(y_train), len(y_test)))
    print("Train/Test split: {}/{}".format(len(x_train), len(x_test)))

    return x_train, y_train, x_test, y_test 
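A hypothetical call, just to show the expected input and output shapes (pos_x and neg_x are arrays of already-encoded samples; the values are random placeholders):

import numpy as np

pos_x = np.random.randint(0, 100, size=(50, 64))
neg_x = np.random.randint(0, 100, size=(80, 64))
x_train, y_train, x_test, y_test = prep_train_test(pos_x, neg_x, dev_ratio=0.2)
# x_train: (104, 64), y_train: (104, 2), x_test: (26, 64), y_test: (26, 2)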
Example #16
def getData_imdb():
    from tflearn.datasets import imdb

    train, test, _ = imdb.load_data(path='imdb.pkl',
                                    n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    print(trainX.shape)
    print(trainY)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    return trainX, testX, trainY, testY
Example #17
def get_data(input_data_dump, num_frames_per_video, labels, ifTrain):
    """Get the data from our saved predictions or pooled features."""

    # Local vars.
    X = []
    y = []
    temp_list = deque()

    # Open and get the features.
    with open(input_data_dump, 'rb') as fin:
        frames = pickle.load(fin)
        for i, frame in enumerate(frames):

            features = frame[0]
            actual = frame[1].lower()

            # frameCount = frame[2]

            # Convert our labels into binary.
            actual = labels[actual]

            # Add to the queue.
            if len(temp_list) == num_frames_per_video - 1:
                temp_list.append(features)
                flat = list(temp_list)
                X.append(np.array(flat))
                y.append(actual)
                temp_list.clear()
            else:
                temp_list.append(features)
                continue

    print("Class Name\tNumeric Label")
    for key in labels:
        print("%s\t\t%d" % (key, labels[key]))

    print('DEBUG  X ', len(X))
    #print('tem', temp_list[0].shape)
    print(' Y ', len(y))
    # Numpy.
    X = np.array(X)
    y = np.array(y)

    print("Dataset shape: ", X.shape)

    # One-hot encoded categoricals.
    y = to_categorical(y, len(labels))

    # Split into train and test.
    if ifTrain:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42)
        return X_train, X_test, y_train, y_test
    else:
        return X, y
Example #18
def bi_lstm(trainX, trainY,testX, testY):
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 200])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64,run_id="rnn-bilstm")
Example #19
def load_data_and_labels(args, input_file, word2idx: dict):
    """
    Load research data from files, padding sentences and generate one-hot labels.

    Args:
        args: The arguments.
        input_file: The research record.
        word2idx: The word2idx dict.
    Returns:
        The dict <Data> (includes the record tokenindex and record labels)
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not input_file.endswith('.json'):
        raise IOError(
            "[Error] The research record is not a json file. "
            "Please preprocess the research record into the json file.")

    def _token_to_index(x: list):
        result = []
        for item in x:
            if item not in word2idx.keys():
                result.append(word2idx['_UNK'])
            else:
                word_idx = word2idx[item]
                result.append(word_idx)
        return result

    Data = dict()
    with open(input_file) as fin:
        Data['f_id'] = []
        Data['b_id'] = []
        Data['f_content_index'] = []
        Data['b_content_index'] = []
        Data['labels'] = []

        for eachline in fin:
            record = json.loads(eachline)
            f_id = record['front_testid']
            b_id = record['behind_testid']
            f_content = record['front_features']
            b_content = record['behind_features']
            labels = record['label']

            Data['f_id'].append(f_id)
            Data['b_id'].append(b_id)
            Data['f_content_index'].append(_token_to_index(f_content))
            Data['b_content_index'].append(_token_to_index(b_content))
            Data['labels'].append(labels)
        Data['f_pad_seqs'] = pad_sequences(Data['f_content_index'],
                                           maxlen=args.pad_seq_len,
                                           value=0.)
        Data['b_pad_seqs'] = pad_sequences(Data['b_content_index'],
                                           maxlen=args.pad_seq_len,
                                           value=0.)
        Data['onehot_labels'] = to_categorical(Data['labels'], nb_classes=2)
    return Data
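One input line in the JSON format this loader expects, reconstructed from the field names parsed above (the values are made up):

import json

record = {
    "front_testid": "t_0001",
    "behind_testid": "t_0002",
    "front_features": ["some", "tokenised", "words"],
    "behind_features": ["more", "words"],
    "label": 1,
}
print(json.dumps(record))   # one line of the input .json file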
Example #20
File: sent.py Project: pank2210/chat-telco
    def buildTrainingData(self):
        print("Building train data for SentClassificationModel[{}]...".format(
            self.model_name))
        #initialize all keys required to browse data
        raw_data_key = 'raw'
        data_key = 'data'
        sent_class = 'class'
        conv_key = 'conv_ind'

        #read training data
        avg_words, avg_sents, conv = cu.processTaggedChat(self.train_data_file)
        trainX = []
        trainY = []

        for i, cdata in enumerate(conv):
            #if i >= 5:
            #  break
            for sdata in cdata:
                trainX.append(sdata[data_key])
                trainY.append(sdata[sent_class])
                #print("trainX[{}]****labels[{}]".format(sdata[data_key],sdata[sent_class]))

        print(
            "Training data of [{}] sentences and [{}] labels loaded for classification..."
            .format(len(trainX), len(trainY)))
        self.vocab = nu.Vocab(trainX,
                              self.config)  #build X vocab dict & required data
        self.labels = nu.Vocab(trainY, self.config,
                               label=True)  #build Y vocab dict & required data

        self.labels.setUNK(
            'UNK1')  #Explicitly set label for unknown classification

        #Create encoding for training data
        self.encodedXdata = self.vocab.getCodedData()
        self.encodedYdata = self.labels.getCodedData()

        print("Coded X {} data: {}".format(len(self.encodedXdata),
                                           self.vocab.getData()[:10]))
        print("Coded X code: {}".format(self.encodedXdata[:10]))
        print("Coded Y size {} unique {} data: {}".format(
            len(self.encodedYdata), len(set(self.encodedYdata)),
            self.labels.getData()[:10]))
        print("Coded Y code: {}".format(self.encodedYdata[:10]))

        #pad sequence with zero's
        self.encodedXdata = pad_sequences(self.encodedXdata,
                                          maxlen=self.config.sent_size,
                                          value=0)
        self.no_classes = len(set(self.encodedYdata))  #no of target classes
        self.Y = to_categorical(
            self.encodedYdata,
            nb_classes=self.no_classes)  #Y as required by tflearn

        #release unwanted variables.
        trainX = None
        trainY = None
Example #21
def run_on_imdb():
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=imdb_dataset_path,
                                    n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)

    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)

    if check_file_exist(imdb_model_path):
        model.load(imdb_model_path)

    model.fit(trainX,
              trainY,
              validation_set=(testX, testY),
              show_metric=True,
              batch_size=32)

    if save_model:
        print("Saving model as 'imdb_model.tfl'")
        model.save(imdb_model_path)

    return 0
Example #22
def do_rnn(x, y):
    global max_document_length
    print("RNN")
    trainX, testX, trainY, testY = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX,
              trainY,
              validation_set=0.1,
              show_metric=True,
              batch_size=10,
              run_id="webshell",
              n_epoch=5)

    y_predict_list = model.predict(testX)
    y_predict = []
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)
Example #23
def load_sst(glove_data):
    # Get the phrases and their indices
    # print("Getting Phrase Dictionary...")
    # _, word_dict = get_phrases_dict('stanfordSentimentTreebank/dictionary.txt')

    if glove_data is None:
        word_dict = increment_word_dict()
    else:
        print("Getting glove word indices...")
        word_dict = glove_word_indices(glove_data)

    # Convert the phrases to ints with word indices so they can be processed by Neural Network
    print("Converting to Ints...")
    int_phrases = phrases2ints(
        word_dict, 'stanfordSentimentTreebank/datasetSentences.txt')

    # Convert indices to sentiment values
    print("Converting Indices to Sentiment Values...")
    phrase_sentiments = indices_to_sentiment(
        int_phrases, 'stanfordSentimentTreebank/sentiment_labels.txt')

    # Split into train, test, and dev groups
    print("Splitting into train, test, and dev groups...")
    train, test, val = split_database(
        phrase_sentiments, 'stanfordSentimentTreebank/datasetSplit.txt')

    # Unzip input sequences and sentiment labels
    trainX, trainY = unzip_examples(train)
    valX, valY = unzip_examples(val)
    testX, testY = unzip_examples(test)

    # Sequence padding
    print("Padding Sequences...")
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    valX = pad_sequences(valX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)

    # Converting labels to binary vectors
    print("Converting labels to binary vectors...")
    trainY = to_categorical(trainY, nb_classes=2)
    valY = to_categorical(valY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    return Dataset(trainX, trainY, valX, valY, testX, testY)
Example #24
def evaluate(sess,
             dataset,
             model,
             step,
             max_dev_itr=100,
             verbose=True,
             mode='val'):
    results_dir = model.val_results_dir if mode == 'val' \
        else model.test_results_dir
    samples_path = os.path.join(results_dir,
                                '{}_samples_{}.txt'.format(mode, step))
    history_path = os.path.join(results_dir, '{}_history.txt'.format(mode))

    avg_val_loss, avg_acc = 0.0, 0.0
    print("Running Evaluation {}:".format(mode))
    tflearn.is_training(False, session=sess)

    # This is needed to reset the local variables initialized by
    # TF for calculating streaming Pearson Correlation and MSE
    all_dev_text, all_dev_pred, all_dev_gt = [], [], []
    dev_itr = 0
    while (dev_itr < max_dev_itr and max_dev_itr != 0) \
            or mode in ['test', 'train']:
        val_batch = dataset.next_batch(FLAGS.batch_size,
                                       pad=model.args["sequence_length"],
                                       one_hot=False,
                                       raw=False)
        cat_targets = [
            to_categorical(n, len(dataset.vocab_w2i[2])) for n in val_batch.ner
        ]
        loss, pred, acc = model.evaluate_step(sess, val_batch.sentences,
                                              val_batch.ner, cat_targets)
        avg_val_loss += loss
        avg_acc += acc
        all_dev_text += id2seq(val_batch.sentences, dataset.vocab_i2w[0])
        all_dev_pred += onehot2seq(pred, dataset.vocab_i2w[2])
        all_dev_gt += onehot2seq(cat_targets, dataset.vocab_i2w[2])
        dev_itr += 1

        if mode == 'test' and dataset.epochs_completed == 1: break
        if mode == 'train' and dataset.epochs_completed == 1: break

    result_set = (all_dev_text, all_dev_pred, all_dev_gt)
    avg_loss = avg_val_loss / dev_itr
    avg_acc = avg_acc / dev_itr
    if verbose:
        print("{}:\t Loss: {}".format(mode, avg_loss, avg_acc))

    with open(samples_path, 'w') as sf, open(history_path, 'a') as hf:
        for x1, pred, gt in zip(all_dev_text, all_dev_pred, all_dev_gt):
            sf.write('{}\t{}\t{}\n'.format(x1, pred, gt))
        hf.write('STEP:{}\tTIME:{}\tacc:{}\tLoss\t{}\n'.format(
            step,
            datetime.datetime.now().isoformat(), avg_acc, avg_loss))
    tflearn.is_training(True, session=sess)
    return avg_loss, avg_acc, result_set
Example #25
    def convert(self, X, y=None):
        """Pad and index X, make y categorical."""
        X = np.array(list(self.processor.transform(X)))
        X = pad_sequences(X, maxlen=self.max_len, value=0.)
        if y is not None:
            y = np.asarray(y)
            y = to_categorical(y, nb_classes=self.num_classes)
            return X, y
        else:
            return X
Example #26
def trainEmbedding():
    X_train, y_train, X_test, y_test = loadInput(RUMOR_TF_INPUTPICKLE)

    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)
    print('X_train: ', X_train)
    print('X_test: ', X_test)
    model = build_model()

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    adagrad = optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=0.0)
    adadelta = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-8, decay=0.)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model_checkpoint = ModelCheckpoint(MODEL_PATH_ADAGRAD, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=64, callbacks=[tensor_board, model_checkpoint])
    model.evaluate(X_test, y_test, show_accuracy = True)
Example #27
def do_rnn(trainX, testX, trainY, testY):
    max_document_length = 64
    y_test = testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir="dga_log")
    model.fit(trainX,
              trainY,
              validation_set=(testX, testY),
              show_metric=True,
              batch_size=10,
              run_id="dga",
              n_epoch=1)

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))
    return y_predict, y_test
Example #28
def GetData():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print('Loading data...\n')
    (x_train, y_train), (x_test,
                         y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences\n')
    print(len(x_test), 'test sequences\n')

    print('Pad sequences (samples x time)\n')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    print('x_train shape:\t', x_train.shape)
    print('x_test shape:\t', x_test.shape)
    return max_features, x_train, x_test, y_train, y_test, batch_size, maxlen
Example #29
    def _train_model(self, save_path):
        '''
        :param save_path: Path to save the model to
        :return: None
        '''
        tf.reset_default_graph()
        train, test = imdb.load_data(num_words=10000, index_from=3)

        train_x, train_y = train
        test_x, test_y = test

        train_x = pad_sequences(train_x, maxlen=100, value=0.)
        test_x = pad_sequences(test_x, maxlen=100, value=0.)

        train_y = to_categorical(train_y, nb_classes=2)
        test_y = to_categorical(test_y, nb_classes=2)

        # Training
        self.model.fit(train_x, train_y, n_epoch=5, validation_set=(test_x, test_y), show_metric=True, batch_size=32)
        self.model.save(save_path)
Example #30
	def predict_type(self,bookdir):
		# Import dataset
		X, Y = image_preloader(bookdir, image_shape=(128, 128), mode='file',
			                   categorical_labels=False, normalize=False)
		Y = to_categorical(Y, 3)
		print("-- Runbook Import Complete.")

		# Predict label
		prediction = self.model.predict(X)
		
		return prediction
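image_preloader with mode='file' reads an index file rather than an image directory; presumably each line holds an image path and an integer class id. A hypothetical index file for the call above:

# Hypothetical contents of the file passed as bookdir, one "<path> <label>" per line:
with open('runbooks.txt', 'w') as f:
    f.write('covers/book_a.jpg 0\n')
    f.write('covers/book_b.jpg 2\n')
# prediction = classifier.predict_type('runbooks.txt')   # classifier is an instance of the class above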
Example #31
def trainArtToPrimaryTypeModel(artPath, jsonPath, testProp, numEpochs=50):
  '''
  Trains a convolutional network to categorize card art by primary type
  Inputs:
    artPath: path to card art
    jsonPath: path to card data json file
    testProp: proportion of samples to be used for test/validation
    numEpochs: number of epochs to train for (50)
  '''
  (X, Y), (X_Test, Y_Test), numCategories = turnPicsToSimpleInputs(artPath,
                                                                    jsonPath,
                                                                    testProp=testProp)
  X, Y = shuffle(X, Y)
  Y = to_categorical(Y, numCategories)
  Y_Test = to_categorical(Y_Test, numCategories)

  # Train model as classifier
  model = artToMainTypeModel(numCategories)
  model.fit(X, Y, n_epoch=numEpochs, shuffle=True, validation_set=(X_Test, Y_Test),
              show_metric=True, batch_size=100, run_id='mtg_classifier')
Example #32
def MNISTRNN():
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,valid_portion=0.1)
    X_train, Y_train = train
    X_test, Y_test = test

    X_train = pad_sequences(X_train, maxlen=100,value=0.)
    X_test = pad_sequences(X_test, maxlen=100,value=0.)
    Y_train = to_categorical(Y_train, nb_classes=2)
    Y_test = to_categorical(Y_test, nb_classes=2)

    # LSTM
    RNN = tflearn.input_data([None, 100])
    RNN = tflearn.embedding(RNN, input_dim=10000, output_dim=128)
    RNN = tflearn.lstm(RNN, 128, dropout=0.8)
    RNN = tflearn.fully_connected(RNN, 2, activation='softmax')
    RNN = tflearn.regression(RNN, optimizer='adam', learning_rate=0.001, loss='categorical_crossentropy')

    # train
    model = tflearn.DNN(RNN, tensorboard_verbose=0, tensorboard_dir='MINST_tflearn_board_RNN/')
    model.fit(X_train, Y_train, validation_set=(X_test,Y_test),show_metric=True,batch_size=32)
Example #33
def do_cnn_word2vec_2d_345(trainX, testX, trainY, testY):
    global max_features
    global max_document_length
    print "CNN and word2vec_2d_345"
    y_test = testY

    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length,max_features,1], name='input')
    network = tflearn.embedding(network, input_dim=1, output_dim=128,validate_indices=False)
    branch1 = conv_2d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_2d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_2d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool_2d(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100,run_id="sms")

    y_predict_list = model.predict(testX)
    print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)
Example #34
File: uba.py Project: zss8848/3book
def do_rnn_wordbag(trainX, testX, trainY, testY):
    y_test=testY
    #trainX = pad_sequences(trainX, maxlen=100, value=0.)
    #testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=1000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.005,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=1,run_id="uba",n_epoch=10)

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        #print  i[0]
        if i[0] >= 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print metrics.confusion_matrix(y_test, y_predict)

    print y_train

    print "ture"
    print y_test
    print "pre"
    print y_predict
Example #35
def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))

    X_test = np.array(pad_sequences(X_test, maxlen=100))

    vocab_size = X_train.max() + 1
    print 'vocab size: {}'.format(vocab_size)
    y_train = to_categorical(np.array(y_train), 2)
    y_test = np.array(y_test)
    cnn = Discriminator(vocab_size, 100, 100, [2, 3], 50, 2)
    cnn.train(X_train, y_train, 5)
Example #36
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):

    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=file_path,
                                    n_words=vocab_size,
                                    valid_portion=val_fraction,
                                    sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)

    return train_dataset
Example #37
    def load_data(self, ds):
        _ds = None
        if ds['name'] == 'mnist':
            from tflearn.datasets import mnist as _ds
            self._X, self._Y, self._test_X, self._test_Y = _ds.load_data(
                one_hot=ds.get('one_hot', False))

        if ds['name'] == 'cifar10':
            from tflearn.datasets import cifar10 as _ds
            (self._X, self._Y), (self._test_X, self._test_Y) = _ds.load_data(
                one_hot=ds.get('one_hot', False))
        from tflearn.data_utils import shuffle, to_categorical
        del _ds  # discard
        if 'reshape' in ds: self.reshape(ds['reshape'])
        if ds.get('shuffle', False):
            self._X, self._Y = shuffle(self._X, self._Y)

        if ds.get('to_categorical', False):
            self._Y = to_categorical(self._Y, None)
            self._test_Y = to_categorical(self._test_Y, None)
        return self
Example #38
def pad_data(data, max_seq_len):
    """
    Padding each sentence of research data according to the max sentence length.
    Returns the padded data and data labels.
    :param data: The research data
    :param max_seq_len: The max sentence length of research data
    :returns: The padded data and data labels
    """
    data_front = pad_sequences(data.front_tokenindex, maxlen=max_seq_len, value=0.)
    data_behind = pad_sequences(data.behind_tokenindex, maxlen=max_seq_len, value=0.)
    labels = to_categorical(data.labels, nb_classes=2)
    return data_front, data_behind, labels
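A hypothetical call, assuming data is any object exposing the attributes used above (front_tokenindex, behind_tokenindex, labels):

from types import SimpleNamespace

data = SimpleNamespace(front_tokenindex=[[1, 2, 3]],
                       behind_tokenindex=[[4, 5]],
                       labels=[1])
data_front, data_behind, labels = pad_data(data, max_seq_len=10)
# data_front: (1, 10), data_behind: (1, 10), labels: (1, 2)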
Example #39
def lstm(trainX, trainY,testX, testY):
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32,run_id="rnn-lstm")
Example #40
def extract_data(filename):
    """Extract the images into a 4D tensor [image index, y, x, channels].


        """
    print('Extracting', filename)
    # get data from h5py
    file = h5py.File(filename, 'r')
    train_data = file['train_data'].value
    train_label = file['train_label']
    test_data = file['test_data'].value
    test_label = file['test_label']
    train_label = np.int64(train_label)
    test_label = np.int64(test_label)
    train_num = train_data.shape[0]
    test_num = test_data.shape[0]

    max, min = train_data.max(), train_data.min()
    train_data_new = (train_data - min) / (max - min)
    train_data_out = np.zeros([
        train_data.shape[0], train_data.shape[3], train_data.shape[1],
        train_data.shape[2], 1
    ])
    for i in range(train_data.shape[3]):
        train_data_out[:, i, :, :, :] = train_data_new[:, :, :, i]

    max, min = test_data.max(), test_data.min()
    test_data_new = (test_data - min) / (max - min)
    test_data_out = np.zeros([
        test_data.shape[0], test_data.shape[3], test_data.shape[1],
        test_data.shape[2], 1
    ])
    for i in range(test_data.shape[3]):
        test_data_out[:, i, :, :, :] = test_data_new[:, :, :, i]

    train_data_out, train_label = shuffle(train_data_out, train_label)
    train_label = to_categorical(train_label, 20)
    test_label = to_categorical(test_label, 20)

    return train_data_out, train_label, test_data_out, test_label
Example #41
def train_and_save_model():
    # Run this if pkl files already exist in directory pickle_files
    trainX, testX = dp.convert_reviews()
    trainY, testY = dp.get_sentiment_arrays()

    # AVG REVIEW LENGTH: 165.3178

    # REMOVE THIS JUNK
    print('trainX ' + str(trainX[0]))
    print('trainX ' + str(len(trainX[0])))
    print('trainY ' + str(trainY[0]))
    print('trainY ' + str(type(trainY[0])))

    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    deep_net = tfl.input_data([None, 200])
    deep_net = tfl.embedding(deep_net, input_dim=10000, output_dim=128)
    deep_net = tfl.lstm(deep_net, 128, dropout=0.8)
    deep_net = tfl.fully_connected(deep_net, 2, activation='softmax')
    deep_net = tfl.regression(deep_net,
                              optimizer='adam',
                              learning_rate=0.001,
                              loss='categorical_crossentropy')

    # Training 1ST RUN
    model = tfl.DNN(deep_net, tensorboard_verbose=0)
    model.fit(trainX,
              trainY,
              validation_set=(testX, testY),
              show_metric=True,
              batch_size=32,
              n_epoch=20)

    model.save('./saved_models/model1.tfl')
Example #42
def train(maxlen=100, embedding_dim=128):   # main training/testing routine
    start = time.time()
    l_trainX, r_trainX, ret_labels, l_topredictX, r_topredictX = do.load_data_bi_word2vec(maxlen=maxlen,
                                                                                          words_keep=50000,
                                                                                          validation_portion=0.,
                                                                                          embedding_dim=embedding_dim,
                                                                                          ma="A")
    trainY = to_categorical(ret_labels, nb_classes=3)
    del ret_labels
    lnet = tflearn.input_data([None, maxlen, embedding_dim])
    rnet = tflearn.input_data([None, maxlen, embedding_dim])
    lnet = tflearn.gru(lnet, embedding_dim, dropout=0.8, return_seq=False, dynamic=True)
    rnet = tflearn.gru(rnet, embedding_dim, dropout=0.8, return_seq=False, dynamic=True)
    net = tflearn.layers.merge_outputs([lnet, rnet])
    net = tflearn.fully_connected(net, 3, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')
    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit([l_trainX, r_trainX], trainY, validation_set=0.1, show_metric=True,
              batch_size=32)
    model.save('MODELS/E_W2V_GRU_TC{}_{}.dy'.format(embedding_dim, maxlen))
    # model.load('MODELS/E_W2V_GRU_TC{}_{}.dy'.format(embedding_dim, maxlen))
    del l_trainX
    del r_trainX
    del trainY
    idx2cla = {0: 'neu', 1: 'pos', 2: 'neg'}
    filename = "Result/result_{}.csv".format(datetime.datetime.now().strftime("%Y%m%d%H%M"))
    prefix = list(open('Result/A_AFTER_NRP_200', 'r').readlines())
    f = open(filename, 'w')
    f.write('SentenceId,View,Opinion\n')
    a = [0,     5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000]
    b = [5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 65000]
    ANS = []
    for i in range(12):
        ans = model.predict([l_topredictX[a[i]:b[i]], r_topredictX[a[i]:b[i]]])
        ANS.extend([s for s in ans])
        print("ANS.LENGTH: {}".format(len(ans)))
    for i, r in enumerate(ANS):
        f.write(prefix[i].strip())
        idx = int(np.argmax(r))
        f.write(idx2cla[idx])
        k = ""
        for l in r:
            k += ',{:.4f}'.format(l)
        f.write(k)
        f.write('\n')
    f.close()
    end = time.time()
    print("TIME COST: {}".format(end-start))
    outf = vote_by_score(filename)
    add(outf)
Example #43
def __prepareData(document, labels, vocabulary):
    cv = CountVectorizer(vocabulary=vocabulary)
    le = LabelEncoder()

    x = cv.fit_transform(document).toarray()
    y_vector = le.fit_transform(labels)

    classes = le.classes_

    num_classes = len(classes)
    y = to_categorical(y_vector, nb_classes=num_classes)

    return x, y, classes
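A hypothetical usage sketch with a tiny corpus and a fixed vocabulary (all values are placeholders):

docs = ["free money now", "meeting at noon"]
labels = ["spam", "ham"]
vocab = ["free", "money", "meeting", "noon"]
x, y, classes = __prepareData(docs, labels, vocab)
# x: (2, 4) count matrix, y: (2, 2) one-hot labels, classes: ['ham' 'spam']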
Example #44
def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print "RNN and wordbag"

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="review",n_epoch=5)
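do_rnn_wordbag reads max_document_length as a module-level global and expects trainX/testX to already be word-index sequences; a hedged sketch of that preprocessing (the tiny corpus, vocabulary size, and split parameters below are illustrative only):

# Illustrative only: builds index sequences with VocabularyProcessor and calls do_rnn_wordbag.
from tflearn.data_utils import VocabularyProcessor
from sklearn.model_selection import train_test_split

corpus = ["good movie great plot", "bad movie terrible acting",
          "great fun and a great cast", "awful and boring"]
y = [1, 0, 1, 0]

max_document_length = 100  # the global read inside do_rnn_wordbag
vp = VocabularyProcessor(max_document_length)
x = list(vp.fit_transform(corpus))

trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.5, random_state=0)
do_rnn_wordbag(trainX, testX, trainY, testY)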
예제 #45
0
파일: reader.py 프로젝트: Biocodings/Paddle
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):

    # IMDB Dataset loading
    train, test, _ = imdb.load_data(
        path=file_path,
        n_words=vocab_size,
        valid_portion=val_fraction,
        sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)

    return train_dataset
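create_datasets relies on a FLAGS.max_len flag and a project-local DataSet class defined elsewhere in reader.py, and note that only the training DataSet is returned even though the test split is padded and one-hot encoded. A hedged sketch of the assumed wiring (flag name and value are assumptions):

# Assumed context for create_datasets; not taken from the Paddle reader.py source.
import tensorflow as tf

tf.app.flags.DEFINE_integer('max_len', 100, 'maximum padded sequence length')
FLAGS = tf.app.flags.FLAGS

train_dataset = create_datasets('imdb.pkl', vocab_size=30000)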
예제 #46
0
def do_cnn(trainX, testX, trainY, testY):
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=4)
    testY = to_categorical(testY, nb_classes=4)
    # Building convolutional network
    network = input_data(shape=[None, 32, 32,1], name='input')
    network = conv_2d(network, 16, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = conv_2d(network, 16, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = fully_connected(network, 16, activation='tanh')
    network = dropout(network, 0.1)
    network = fully_connected(network, 16, activation='tanh')
    network = dropout(network, 0.1)
    network = fully_connected(network, 4, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.01,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),show_metric=True, run_id="malware")
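do_cnn expects grayscale 32x32 inputs shaped [None, 32, 32, 1] and integer labels in 0-3; a hedged sketch of the expected input format (the arrays below are random placeholder data, not from the source):

# Placeholder data only: demonstrates the shapes do_cnn expects.
import numpy as np

trainX = np.random.rand(256, 32, 32, 1).astype('float32')
testX = np.random.rand(64, 32, 32, 1).astype('float32')
trainY = np.random.randint(0, 4, size=256)   # four classes, matching nb_classes=4 above
testY = np.random.randint(0, 4, size=64)

do_cnn(trainX, testX, trainY, testY)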
def load_train_data():
    train_dict = sio.loadmat(train_location)
    X = np.asarray(train_dict['X'])

    X_train = []
    for i in xrange(X.shape[3]):
        X_train.append(X[:,:,:,i])
    X_train = np.asarray(X_train)

    Y_train = train_dict['y']
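    # SVHN stores the digit '0' with label 10; the loop below maps it back to 0 so labels span 0-9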
    for i in xrange(len(Y_train)):
        if Y_train[i]%10 == 0:
            Y_train[i] = 0
    Y_train = to_categorical(Y_train,10)
    return (X_train,Y_train)
def load_test_data():
    test_dict = sio.loadmat(test_location)
    X = np.asarray(test_dict['X'])

    X_test = []
    for i in xrange(X.shape[3]):
        X_test.append(X[:,:,:,i])
    X_test = np.asarray(X_test)

    Y_test = test_dict['y']
    for i in xrange(len(Y_test)):
        if Y_test[i]%10 == 0:
            Y_test[i] = 0
    Y_test = to_categorical(Y_test,10)
    return (X_test,Y_test)
예제 #49
0
def generate_image_sets_for_single_digit(nb_sample=SAMPLE_SIZE, single_digit_index=0):
    captcha = ImageCaptcha()

    labels = []
    images = []
    for i in range(0, nb_sample):
        digits = 0
        last_digit = INVALID_DIGIT
        for j in range(0, DIGIT_COUNT):
            digit = last_digit
            while digit == last_digit:
                digit = random.randint(0, 9)
            last_digit = digit
            digits = digits * 10 + digit
        digits_as_str = DIGIT_FORMAT_STR % digits
        labels.append(digits_as_str)
        images.append(captcha.generate_image(digits_as_str))

    digit_labels = list()

    for digit_index in range(0, DIGIT_COUNT):
        digit_labels.append(np.empty(nb_sample, dtype="int8"))

    shape = (nb_sample, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT)
    digit_image_data = np.empty(shape, dtype="float32")

    for index in range(0, nb_sample):
        img = images[index].resize((IMAGE_STD_WIDTH, IMAGE_STD_HEIGHT), PIL.Image.LANCZOS)
        img_arr = np.asarray(img, dtype="float32") / 255.0

        digit_image_data[index, :, :, :] = img_arr

        for digit_index in range(0, DIGIT_COUNT):
            digit_labels[digit_index][index] = labels[index][digit_index]

    x = digit_image_data
    y = to_categorical(digit_labels[single_digit_index], CLASS_COUNT)

    return x, y
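generate_image_sets_for_single_digit (and the multi-digit variant below) depends on several module-level constants that sit outside this excerpt; a hedged sketch of plausible values and a call (in the real file these constants would precede the function definitions, and every value below is an assumption):

# Assumed constants for the captcha generators; adjust to the real project settings.
import random
import numpy as np
import PIL.Image
from captcha.image import ImageCaptcha
from tflearn.data_utils import to_categorical

SAMPLE_SIZE = 128
DIGIT_COUNT = 4                      # digits per captcha
INVALID_DIGIT = -1                   # sentinel that never matches a real digit
DIGIT_FORMAT_STR = '%0{}d'.format(DIGIT_COUNT)
CLASS_COUNT = 10                     # digits 0-9
IMAGE_STD_WIDTH, IMAGE_STD_HEIGHT = 200, 100
RGB_COLOR_COUNT = 3

x, y = generate_image_sets_for_single_digit(nb_sample=16, single_digit_index=0)
# x: (16, 100, 200, 3) float32 images, y: (16, 10) one-hot labels for the first digit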
def load_dataset(x_count, y_count):
  print '[+] Loading data'
  X = []
  Y = []
  places = Set()
  data = np.load('grid/data-{0}-{1}.npy'.format(x_count, y_count))
  for row in data:
    x = map(float, row[1:5])
    time = row[4]
    x.extend([
      (time // 60) % 24 + 1, # Hour
      (time // 1440) % 7 + 1, # Day
      (time // 43200) % 12 + 1, # Month
      (time // 525600) + 1 # Year
    ])
    X.append(x)
    Y.append(row[5])
    places.add(row[5])
  places = list(places)
  Y = [places.index(y) for y in Y]
  Y = to_categorical(Y, len(places))
  print '[+] All data loaded'
  return X, Y
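A hedged sketch of feeding load_dataset's output into a small tflearn classifier (layer sizes and hyper-parameters are illustrative, not from the source):

# Illustrative consumer of load_dataset; assumes the grid/data-10-10.npy file exists.
import tflearn

X, Y = load_dataset(10, 10)
net = tflearn.input_data(shape=[None, len(X[0])])            # 8 features per row (4 raw + 4 time-derived)
net = tflearn.fully_connected(net, 128, activation='relu')
net = tflearn.fully_connected(net, Y.shape[1], activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
tflearn.DNN(net).fit(X, Y, n_epoch=5, show_metric=True, batch_size=64)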
예제 #51
0
def generate_image_sets_for_multi_digits(nb_sample=SAMPLE_SIZE):
    captcha = ImageCaptcha()

    labels = []
    images = []
    for i in range(0, nb_sample):
        digits = 0
        last_digit = INVALID_DIGIT
        for j in range(0, DIGIT_COUNT):
            digit = last_digit
            while digit == last_digit:
                digit = random.randint(0, 9)
            last_digit = digit
            digits = digits * 10 + digit
        digits_as_str = DIGIT_FORMAT_STR % digits
        labels.append(digits_as_str)
        images.append(captcha.generate_image(digits_as_str))

    digit_labels = np.empty((nb_sample, DIGIT_COUNT), dtype="int8")

    shape = (nb_sample, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT)
    digit_image_data = np.empty(shape, dtype="float32")

    for index in range(0, nb_sample):
        img = images[index].resize((IMAGE_STD_WIDTH, IMAGE_STD_HEIGHT), PIL.Image.LANCZOS)
        img_arr = np.asarray(img, dtype="float32") / 255.0

        digit_image_data[index, :, :, :] = img_arr

        for digit_index in range(0, DIGIT_COUNT):
            digit_labels[index][digit_index] = labels[index][digit_index]
    x, y_as_num = digit_image_data, np.rollaxis(digit_labels, 1)
    y = { (OUT_PUT_NAME_FORMAT % i ): to_categorical(y_as_num[i], CLASS_COUNT) for i in range(0, DIGIT_COUNT) }
    # y = [to_categorical(y_as_num[i], CLASS_COUNT) for i in range(0, DIGIT_COUNT)]

    return x, y
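The y returned here is a dict keyed by output-layer names built from OUT_PUT_NAME_FORMAT, which suits a multi-output model; a hedged sanity check reusing the constants assumed earlier (the name pattern is an assumption):

# Hypothetical check of the structures returned above; OUT_PUT_NAME_FORMAT is an assumed pattern.
OUT_PUT_NAME_FORMAT = 'digit_%d'

x, y = generate_image_sets_for_multi_digits(nb_sample=8)
print(x.shape)                        # (8, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT)
for name in sorted(y):
    print(name, y[name].shape)        # e.g. digit_0 (8, CLASS_COUNT) ... digit_3 (8, CLASS_COUNT)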
예제 #52
0
"""
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation

# Data loading and preprocessing
from tflearn.datasets import cifar10
(X, Y), (X_test, Y_test) = cifar10.load_data()
X, Y = shuffle(X, Y)
Y = to_categorical(Y)
Y_test = to_categorical(Y_test)

# Real-time data preprocessing
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()

# Real-time data augmentation
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_rotation(max_angle=25.)

# Convolutional network building
network = input_data(shape=[None, 32, 32, 3],
                     data_preprocessing=img_prep,
예제 #53
0
        v = values.split('/')
        data[v[3]] = {}
        data[v[3]]['vector'] = process(values)
        data[v[3]]['class'] = line.split()[1]
    return data

testset = load_data(val)
trainset = load_data(train)

import pandas as pd
import numpy as np
test = pd.DataFrame(testset)
trainset = pd.DataFrame(trainset)

from tflearn.data_utils import shuffle, to_categorical
trainY = to_categorical(np.array(trainset.loc['class']),nb_classes=5)
testY = to_categorical(np.array(test.loc['class']),nb_classes=5)

import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression


network = input_data(shape=[None, 32, 32, 3], name='input')
network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 3)
network = local_response_normalization(network)
network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 3)
예제 #54
0
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64)
예제 #55
0
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading (downloaded automatically if not already present)
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train  # a list of 22,500 elements; each looks like [17,25,10,406,26,14,556,61,62,323,4], i.e. word indices into the vocabulary
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)  # zero-pad every sequence to length 100
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)  # one-hot encoding: label 0 becomes [1, 0], label 1 becomes [0, 1]
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)
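A hedged follow-up to the trained model above: scoring one held-out, padded index sequence (the word indices below are placeholders in the format described in the comment on trainX):

# Illustrative prediction; not part of the original snippet.
sample = pad_sequences([[17, 25, 10, 406, 26, 14, 556, 61, 62, 323, 4]], maxlen=100, value=0.)
print(model.predict(sample))   # e.g. [[0.93, 0.07]] -> one probability per class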
nb_classes = 10
neurons = 4000
epochs = 200
# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)

X_train, X_val = X_train[:-10000], X_train[-10000:]
Y_train, Y_val = Y_train[:-10000], Y_train[-10000:]

print(X_train.shape[0], 'train samples')
print(X_val.shape[0], 'test samples')
print(X_test.shape[0], 'test samples')


print(Y_train.shape, Y_test.shape)

# TFLearn Network
network = input_data(shape=[None, 784], name='input')
예제 #57
0
"""

from __future__ import division, print_function, absolute_import

import tflearn
import tflearn.data_utils as du

# Data loading
from tflearn.datasets import cifar10
(X, Y), (testX, testY) = cifar10.load_data()
# Data pre-processing
X, mean = du.featurewise_zero_center(X)
X, std = du.featurewise_std_normalization(X)
testX = du.featurewise_zero_center(testX, mean)
testX = du.featurewise_std_normalization(testX, std)
Y = du.to_categorical(Y, 10)
testY = du.to_categorical(testY, 10)

# Building Residual Network
net = tflearn.input_data(shape=[None, 32, 32, 3])
net = tflearn.conv_2d(net, 32, 3)
net = tflearn.batch_normalization(net)
net = tflearn.activation(net, 'relu')
net = tflearn.shallow_residual_block(net, 4, 32, regularizer='L2')
net = tflearn.shallow_residual_block(net, 1, 32, downsample=True,
                                     regularizer='L2')
net = tflearn.shallow_residual_block(net, 4, 64, regularizer='L2')
net = tflearn.shallow_residual_block(net, 1, 64, downsample=True,
                                     regularizer='L2')
net = tflearn.shallow_residual_block(net, 5, 128, regularizer='L2')
net = tflearn.global_avg_pool(net)
예제 #58
0
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation

# Data loading and pre processing
from tflearn.datasets import cifar10

(X,Y), (X_test, Y_test) = cifar10.load_data()
X, Y = shuffle(X,Y)
Y = to_categorical(Y, 10)
Y_test = to_categorical(Y_test, 10)

# Data preprocessing
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()

# Data augmentation
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_rotation()

# Building the CNN
network = input_data(shape=[None, 32, 32, 3], data_preprocessing=img_prep, data_augmentation=img_aug, name='first_layer')
network = max_pool_2d(network, 2) # Max pooling layer
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY)
testY = to_categorical(testY)

# Building convolutional network
network = input_data(shape=[None, 100], name='input')
network = tflearn.embedding(network, input_dim=10000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')
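The snippet above stops at the regression layer; a hedged sketch of how training would typically continue from here (epoch count and batch size are assumptions):

# Not part of the original snippet: a plausible training call for the multi-branch conv_1d network above.
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(trainX, trainY, n_epoch=5, shuffle=True,
          validation_set=(testX, testY), show_metric=True, batch_size=32)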
예제 #60
0
print('Read content')

def load_content (file_name):
    with open(file_name) as f:
        return f.read()

X = []
for i in range (MAX_FILE_ID):
    file_name = data_dir + '/' + str(i + 1)
    if os.path.isfile (file_name):
        X.append (load_content(file_name)) 

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y,
    test_size=0.2, random_state=2017)

Y_train = to_categorical (Y_train, nb_classes = len (qualities))
Y_test = to_categorical (Y_test, nb_classes = len (qualities))

### Process vocabulary

print('Process vocabulary')

vocab_processor = tflearn.data_utils.VocabularyProcessor(max_document_length = model_size, min_frequency = 0)
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
X_test = np.array(list(vocab_processor.fit_transform(X_test)))
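# note: fit_transform is called again on X_test here, which re-fits the vocabulary on the test split;
# vocab_processor.transform(X_test) would keep only the vocabulary learned from X_train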

X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)
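A hedged sketch of how n_words and the padded index matrices computed above would typically feed a tflearn network from here (layer sizes and the use of qualities as the label list are assumptions):

# Hypothetical continuation of the script above; not from the source.
net = tflearn.input_data([None, model_size])
net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, len(qualities), activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, Y_train, validation_set=(X_test, Y_test), show_metric=True, batch_size=32)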