Example #1
def keras_imdb_gru(num_words=None, maxlen=None, embedding_dim=128):
    """
    imdb影评二分类,使用gru门控循环单元进行分类
    :return:
    """
    (x_train, y_train), (x_test, y_test) = imdb.load_data(
        os.path.join(root_path, "data", "imdb", "imdb.npz"), num_words=num_words)

    if not num_words: num_words = max(max([max(x) for x in x_train]), max([max(x) for x in x_test])) + 1
    if not maxlen: maxlen = max(max([len(x) for x in x_train]), max([len(x) for x in x_test])) + 1

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(num_words + 1, embedding_dim, input_length=maxlen))
    model.add(keras.layers.GRU(128))
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    model.summary()
    model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['accuracy'])

    history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    keras_history_plotcurve(history)

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=64, verbose=0)
    logger.info("\ntest_loss:{0},test_accuracy:{1}".format(test_loss, test_accuracy))
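These snippets share a set of module-level imports and helpers that are defined elsewhere in the repository. The sketch below shows what they are assumed to look like; root_path, logger and keras_history_plotcurve are reconstructions for illustration, not the original definitions.

import os
import logging

import numpy as np
import keras
from keras.datasets import imdb, reuters
from keras.preprocessing import sequence, text
import matplotlib.pyplot as plt

root_path = os.path.dirname(os.path.abspath(__file__))  # assumed project root
logger = logging.getLogger(__name__)


def keras_history_plotcurve(history):
    """Plot training/validation loss and accuracy curves from a Keras History object."""
    for metric in ("loss", "acc", "accuracy"):
        if metric in history.history:
            plt.plot(history.history[metric], label=metric)
        if "val_" + metric in history.history:
            plt.plot(history.history["val_" + metric], label="val_" + metric)
    plt.xlabel("epoch")
    plt.legend()
    plt.show()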
Example #2
def keras_reuters_gru(num_words=None, maxlen=None, num_categorical=None):
    """
    使用gru门控循环单元进行新闻分类,gru包含输入门、输出门、重置门和更新门。
    :return:
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"),
        num_words=num_words)

    if not num_words:
        num_words = max(max([max(x) for x in X_train]), max([max(x) for x in X_test])) + 1
    if not maxlen:
        maxlen = max(max([len(x) for x in X_train]), max([len(x) for x in X_test])) + 1
    if not num_categorical:
        num_categorical = max(max(y_train), max(y_test)) + 1

    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    y_train = keras.utils.to_categorical(y_train, num_categorical)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    y_test = keras.utils.to_categorical(y_test, num_categorical)

    input = keras.layers.Input(shape=(maxlen, ))
    x = keras.layers.Embedding(input_dim=num_words, output_dim=128)(input)
    x = keras.layers.GRU(32)(x)

    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.Dense(num_categorical, activation="softmax")(x)
    model = keras.models.Model(inputs=input, outputs=x)
    model.summary()

    model.compile(optimizer="rmsprop",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_train,
                        y_train,
                        batch_size=64,
                        epochs=10,
                        verbose=1,
                        validation_data=(X_test, y_test))
    keras_history_plotcurve(history)
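For reference, a single GRU step can be written out in plain NumPy to show how the two gates interact. This is a simplified sketch of the standard GRU equations (biases omitted, weight names invented for illustration), not the actual Keras implementation.

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def gru_step(x_t, h_prev, W_z, U_z, W_r, U_r, W_h, U_h):
    """One GRU time step: x_t is the input vector, h_prev the previous hidden state."""
    z = sigmoid(W_z @ x_t + U_z @ h_prev)              # update gate: how much new state to take
    r = sigmoid(W_r @ x_t + U_r @ h_prev)              # reset gate: how much history feeds the candidate
    h_tilde = np.tanh(W_h @ x_t + U_h @ (r * h_prev))  # candidate state
    return (1 - z) * h_prev + z * h_tilde              # blend old state and candidate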
Example #3
def keras_reuters_cnn():
    """
    Reuters newswire classification with a stacked 1D convolutional network.
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"))

    # maxlen: longest sequence length, num_classify: number of topics, num_vocab: vocabulary size
    maxlen = max(max([len(x) for x in X_train]), max([len(x) for x in X_test])) + 1
    num_classify = max(max(y_train), max(y_test)) + 1
    num_vocab = max(max([max(x) for x in X_train]), max([max(x) for x in X_test])) + 1

    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    y_train = keras.utils.to_categorical(y_train, num_classify)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    y_test = keras.utils.to_categorical(y_test, num_classify)

    input = keras.layers.Input(shape=(maxlen, ))
    x = keras.layers.Embedding(input_dim=num_vocab + 1, output_dim=128)(input)
    x = keras.layers.Convolution1D(32, 5, activation="relu")(x)
    x = keras.layers.Convolution1D(32, 5, activation="relu")(x)
    x = keras.layers.MaxPooling1D(5)(x)
    x = keras.layers.BatchNormalization()(x)

    x = keras.layers.Convolution1D(32, 5, activation="relu")(x)
    x = keras.layers.Convolution1D(32, 5, activation="relu")(x)
    x = keras.layers.MaxPooling1D(5)(x)
    x = keras.layers.BatchNormalization()(x)

    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.Dense(num_classify, activation="softmax")(x)

    model = keras.models.Model(inputs=input, outputs=x)
    model.summary()

    model.compile(optimizer="rmsprop",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_train,
                        y_train,
                        batch_size=64,
                        epochs=10,
                        verbose=1,
                        validation_data=(X_test, y_test))
    keras_history_plotcurve(history)
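Each Convolution1D layer above uses kernel size 5 with the default "valid" padding, so it shortens the sequence by 4 time steps, and each MaxPooling1D(5) divides the remaining length by 5 (floor). A small sketch of that arithmetic, with an illustrative padded length (the real value depends on the dataset):

def conv_block_length(length, kernel_size=5, pool_size=5):
    """Sequence length after two valid-padding Conv1D layers and one MaxPooling1D."""
    length -= kernel_size - 1   # first Conv1D
    length -= kernel_size - 1   # second Conv1D
    return length // pool_size  # MaxPooling1D

length = 1000                       # illustrative padded length
length = conv_block_length(length)  # after the first conv/pool/BN block -> 198
length = conv_block_length(length)  # after the second block -> 38
print(length)                       # feature-map length fed into Flatten()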
Example #4
def keras_imdb_mlp(num_words=None, maxlen=None):
    """
    imdb分类,构建多层感知机进行分类
    :param num_words:
    :param maxlen:
    :return:
    """
    (x_train, y_train), (x_test, y_test) = imdb.load_data(
        os.path.join(root_path, "data", "imdb", "imdb.npz"), num_words=num_words)

    if not num_words: num_words = max(max([max(x) for x in x_train]), max([max(x) for x in x_test])) + 1
    if not maxlen: maxlen = max(max([len(x) for x in x_train]), max([len(x) for x in x_test])) + 1

    tokenizer = text.Tokenizer(num_words=num_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

    input = keras.layers.Input(shape=(num_words,))
    x = keras.layers.Dense(512, activation="relu", kernel_initializer="glorot_normal")(input)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(0.25)(x)

    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(0.25)(x)

    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Dropout(0.25)(x)

    x = keras.layers.Dense(1, activation="sigmoid")(x)

    model = keras.models.Model(inputs=input, outputs=x)
    model.summary()

    model.compile(optimizer="adadelta", loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    keras_history_plotcurve(history)

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=64, verbose=0)
    logger.info("\ntest_loss:{0},test_accuracy:{1}".format(test_loss, test_accuracy))
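Note that this example does not pad sequences at all: Tokenizer.sequences_to_matrix with mode='binary' turns each review into a fixed-length bag-of-words indicator vector, which is why the Input shape is (num_words,). A tiny illustration of that encoding:

from keras.preprocessing import text

tokenizer = text.Tokenizer(num_words=8)
matrix = tokenizer.sequences_to_matrix([[1, 3, 3, 5]], mode='binary')
print(matrix)        # [[0. 1. 0. 1. 0. 1. 0. 0.]] -- one row per review, one column per word index
print(matrix.shape)  # (1, 8)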
Example #5
def keras_imdb_fasttext(num_words=None, maxlen=None, batch_size=32, embedding_dims=50, epochs=5, ngram_range=1):
    """
    ngram 训练
    :param num_words:  词典单词数量
    :param maxlen:  句子长度
    :param batch_size:  批量大下
    :param embedding_dims:  嵌入层输出层数
    :param epochs:  训练轮数
    :param ngram_range:  ngram数量
    :return:
    """

    def create_ngram_set(input_list, ngram_value=2):
        """
        Extract a set of n-grams from a list of integers.
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
        {(4, 9), (4, 1), (1, 4), (9, 4)}
        >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
        {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
        """
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(sequences, token_indice, ngram_range=2):
        """
        Augment the input list of list (sequences) by appending n-grams values.
        Example: adding bi-gram
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
        >>> add_ngram(sequences, token_indice, ngram_range=2)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
        Example: adding tri-gram
        >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
        >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
        >>> add_ngram(sequences, token_indice, ngram_range=3)
        [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
        """
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)

        return new_sequences

    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(
        os.path.join(root_path, "data", "imdb", "imdb.npz"), num_words=num_words)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    if not num_words: num_words = max(max([max(x) for x in x_train]), max([max(x) for x in x_test])) + 1
    if not maxlen: maxlen = max(max([len(x) for x in x_train]), max([len(x) for x in x_test])) + 1

    if ngram_range > 1:
        print('Adding {}-gram features'.format(ngram_range))
        ngram_set = set()
        for input_list in x_train:
            for i in range(2, ngram_range + 1):
                set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                ngram_set.update(set_of_ngram)

        # Dictionary mapping n-gram token to a unique integer.
        # Integer values are greater than max_features in order
        # to avoid collision with existing features.
        start_index = num_words + 1
        token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
        indice_token = {token_indice[k]: k for k in token_indice}

        # max_features is the highest integer that could be found in the dataset.
        num_words = np.max(list(indice_token.keys())) + 1

        # Augmenting x_train and x_test with n-grams features
        x_train = add_ngram(x_train, token_indice, ngram_range)
        x_test = add_ngram(x_test, token_indice, ngram_range)
        print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
        print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = keras.models.Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(keras.layers.Embedding(num_words, embedding_dims, input_length=maxlen))

    # we add a GlobalAveragePooling1D, which will average the embeddings
    # of all words in the document
    model.add(keras.layers.GlobalAveragePooling1D())

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.summary()

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
    keras_history_plotcurve(history)
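This closely mirrors the FastText IMDB example from the Keras repository; n-gram features are only added when ngram_range is greater than 1. An illustrative call for bi-gram features (the parameter values are suggestions, not tuned results):

keras_imdb_fasttext(num_words=20000, maxlen=400, batch_size=32,
                    embedding_dims=50, epochs=5, ngram_range=2)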
Example #6
def keras_reuters_mlp(num_words=None,
                      maxlen=None,
                      num_categorical=None,
                      batch_size=32,
                      epochs=10,
                      mode=None):
    """
    使用mlp多层感知机模式进行情感评估,同时对比自归一化mlp和常规mlp的性能对比
    :return:
    """
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"),
        num_words=num_words)

    if not num_words:
        num_words = max(max([max(x) for x in X_train]), max([max(x) for x in X_test])) + 1
    if not maxlen:
        maxlen = max(max([len(x) for x in X_train]), max([len(x) for x in X_test])) + 1
    if not num_categorical:
        num_categorical = max(max(y_train), max(y_test)) + 1

    tokenizer = text.Tokenizer(num_words=num_words)
    X_train = tokenizer.sequences_to_matrix(X_train)
    y_train = keras.utils.to_categorical(y_train, num_categorical)
    X_test = tokenizer.sequences_to_matrix(X_test)
    y_test = keras.utils.to_categorical(y_test, num_categorical)

    input = keras.layers.Input(shape=(num_words, ))
    # self-normalizing network (SNN)
    if mode == "self-normalizing":
        x = keras.layers.Dense(512,
                               activation=keras.activations.selu,
                               kernel_initializer="lecun_normal")(input)
        x = keras.layers.AlphaDropout(0.1)(x)

        x = keras.layers.Dense(256,
                               activation="selu",
                               kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)

        x = keras.layers.Dense(128,
                               activation="selu",
                               kernel_initializer="lecun_normal")(x)
        x = keras.layers.AlphaDropout(0.1)(x)
    else:
        x = keras.layers.Dense(512,
                               activation="relu",
                               kernel_initializer="glorot_normal")(input)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

        x = keras.layers.Dense(256,
                               activation="relu",
                               kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

        x = keras.layers.Dense(128,
                               activation="relu",
                               kernel_initializer="glorot_normal")(x)
        x = keras.layers.BatchNormalization()(x)
        # x = keras.layers.Dropout(0.4)(x)

    x = keras.layers.Dense(num_categorical, activation="softmax")(x)

    model = keras.models.Model(inputs=input, outputs=x)
    model.summary()

    model.compile(optimizer="adadelta",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.2)
    keras_history_plotcurve(history)

    score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
    logger.info('Test loss:{0}'.format(score[0]))
    logger.info('Test accuracy:{0}'.format(score[1]))
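The self-normalizing branch repeats the same Dense/AlphaDropout pattern three times (SELU activation, lecun_normal initializer, AlphaDropout). A small refactoring sketch that makes this coupling explicit; dense_selu_block is an invented helper name, not part of the original code:

def dense_selu_block(units, dropout_rate=0.1):
    """Return a callable applying a self-normalizing Dense layer followed by AlphaDropout."""
    def apply(tensor):
        tensor = keras.layers.Dense(units,
                                    activation="selu",
                                    kernel_initializer="lecun_normal")(tensor)
        return keras.layers.AlphaDropout(dropout_rate)(tensor)
    return apply

# Usage inside the self-normalizing branch:
# x = dense_selu_block(512)(input)
# x = dense_selu_block(256)(x)
# x = dense_selu_block(128)(x)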