Example #1
from preprocess import Preprocess
from word2vec import Word2Vec
from news_embedding import NewsEmbeding
from han import HAN

if __name__ == '__main__':
    PREPROCESSED = True
    EMBEDDING_TRAINED = False
    WORD_EMBEDDING_READY = False

    preprocess = Preprocess()
    if not PREPROCESSED:
        preprocess.preprocess()
    preprocess.load_data()

    word2vec = Word2Vec(preprocess.data_dict)
    if not EMBEDDING_TRAINED:
        print('training word2vec...')
        word2vec.train_model()
    word2vec.load_model()

    news_emb = NewsEmbeding(word2vec.model, preprocess.data_dict)
    if not WORD_EMBEDDING_READY:
        news_emb.embed()
    news_emb.load_embeddings()
    news_emb.get_max_corpus_date_count()
    news_emb.pad_embeddings()
    #print(news_emb.embedict_padded['AAPL'])

    # build the hierarchical attention network over the embedded news corpus
    han = HAN(news_emb.emb_node_num, news_emb.date_num)
Example #2
if FLAGS.run_type == "train":
    print("Training...\n")
    # create a new graph and set it as the default
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        session_conf.gpu_options.allocator_type = "BFC"
        # create a new session and set it as the default
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # create a new HAN model
            han = HAN(max_seq_len=imdb.max_seq_len,
                      max_sent_len=imdb.max_sent_len,
                      num_classes=len(y_test[0]),
                      vocab_size=imdb.vocab_size,
                      embedding_size=FLAGS.embedding_dim,
                      max_grad_norm=FLAGS.max_grad_norm,
                      dropout_keep_proba=FLAGS.dropout_keep_proba,
                      learning_rate=FLAGS.learning_rate)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            tvars = tf.trainable_variables()
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name="train_op",
                                                 global_step=global_step)

            # checkpoint model
            saver = tf.train.Saver(tf.global_variables())
Example #3
EPOCH = 40

if __name__ == "__main__":
    bf = BatchFeeder('train', 64, 15, 50)

    tf.reset_default_graph()

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)

    with tf.device('/device:GPU:0'):
        model = HAN(vocab_size=len(bf.vocab_encode_dict.keys()) + 1,
                    embedding_size=200,
                    classes=10,
                    word_cell=tf.nn.rnn_cell.GRUCell(50, name='word-gru'),
                    sentence_cell=tf.nn.rnn_cell.GRUCell(50,
                                                         name='sentence-gru'),
                    word_context_size=100,
                    sentence_context_size=100)

    saver = tf.train.Saver()

    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter('./han_graph',
                                       graph=tf.get_default_graph())
        sess.run(tf.global_variables_initializer())
        for i in range(EPOCH):
            for encoded_data, document_length_mask, sentence_length_mask, labels in bf:
                feed_dict = {
                    model.inputs: encoded_data,
                    model.word_length: sentence_length_mask,
                    # the remaining feed entries and the training step below
                    # are assumed; sentence_length, labels and train_op are
                    # guessed attribute names, not from the original listing
                    model.sentence_length: document_length_mask,
                    model.labels: labels,
                }
                sess.run(model.train_op, feed_dict=feed_dict)
Example #4
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x #sentence x #word)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen_sentence * maxlen_word)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen_sentence * maxlen_word)
x_train = x_train.reshape((len(x_train), maxlen_sentence, maxlen_word))
x_test = x_test.reshape((len(x_test), maxlen_sentence, maxlen_word))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = HAN(maxlen_sentence, maxlen_word, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
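# The example stops at predict(); a minimal follow-up, assuming the model ends
# in a single sigmoid unit (as the binary_crossentropy loss above implies),
# would threshold the scores and report accuracy against y_test:
import numpy as np

y_pred = (result.ravel() > 0.5).astype('int32')
print('Test accuracy:', float(np.mean(y_pred == y_test)))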
Example #5
try:
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
except:
    (x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)

logger.info('padding...')
x_train = sequence.pad_sequences(x_train, maxlen=max_seqs*max_words)
x_test = sequence.pad_sequences(x_test, maxlen=max_seqs*max_words)

x_train = x_train.reshape((len(x_train), max_seqs, max_words))
x_test = x_test.reshape((len(x_test), max_seqs, max_words))

logger.info('train data shape is: {}'.format(x_train.shape))
logger.info('test data shape is: {}'.format(x_test.shape))

logger.info('build model...')
model = HAN(max_features=max_features, max_words=max_words, max_seqs=max_seqs, emb_dim=emb_dim).build_model()
model.compile('adam', 'binary_crossentropy', ['acc'])

logger.info('training...')
earlystop = EarlyStopping(patience=3, monitor='val_acc', mode='max')
model.fit(x_train, y_train,
          callbacks=[earlystop],
          batch_size=batch_size,
          epochs=epochs,
          validation_data=[x_test, y_test])

logger.info('test...')
pred = model.predict(x_test)
logger.info(pred[:10])
logger.info(y_test[:10])