def model(data_file):
    """ 使用lstm模型,判断一对问题是否是重复的
    """

    # Data preprocessing
    print("data pre-processing")
    ques_pairs = data_process.parse_quoar_dul_data(data_file)[0:3000]
    word2idx = data_process.build_vocab(ques_pairs)
    vocab_size = len(word2idx) + 1
    seq_maxlen = data_process.get_seq_maxlen(ques_pairs)
    pids, x_ques1, x_ques2, y = data_process.vectorize_ques_pair(
        ques_pairs, word2idx, seq_maxlen)
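    # pids appear to be the question-pair ids; x_ques1/x_ques2 are the padded
    # word-index sequences (length seq_maxlen), and y holds the one-hot
    # duplicate / not-duplicate labels expected by the 2-way softmax below.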

    # Compute the initial embedding weights:
    # train a word2vec model that represents each word as an n-dimensional vector.
    w2v_embedding_model = word2vec_pretrain.train_word2vec(
        ques_pairs, num_features=EMBED_DIM, min_word_count=1, context=5)
    embedding_weights = np.zeros((vocab_size, EMBED_DIM))
    # Replace each word in our vocabulary with its word vector.
    for word, index in word2idx.items():
        if word in w2v_embedding_model:
            embedding_weights[index, :] = w2v_embedding_model[word]
        else:
            embedding_weights[index, :] = np.random.uniform(
                -0.25, 0.25, w2v_embedding_model.vector_size)
    # Build the LSTM model
    print("Building model...")
    ques1_enc = Sequential()
    ques1_enc.add(
        Embedding(
            output_dim=EMBED_DIM,
            input_dim=vocab_size,
            weights=[embedding_weights],
            mask_zero=True))
    ques1_enc.add(
        LSTM(
            HIDDEN_DIM,
            input_shape=(seq_maxlen, EMBED_DIM),
            return_sequences=False))
    ques1_enc.add(Dropout(0.3))

    ques2_enc = Sequential()
    ques2_enc.add(
        Embedding(
            output_dim=EMBED_DIM,
            input_dim=vocab_size,
            weights=[embedding_weights],
            mask_zero=True))
    ques2_enc.add(
        LSTM(
            HIDDEN_DIM,
            input_shape=(seq_maxlen, EMBED_DIM),
            return_sequences=False))
    ques2_enc.add(Dropout(0.3))

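    # Combine the two question encodings by element-wise sum and classify with
    # a 2-way softmax (duplicate / not duplicate).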
    model = Sequential()
    model.add(Merge([ques1_enc, ques2_enc], mode="sum"))
    model.add(Dense(2, activation="softmax"))

    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"])

    # 10-fold cross-validation
    kf = KFold(n_splits=10)
    i = 0
    sum_acc = 0.0
    for train_index, test_index in kf.split(x_ques1):
        i += 1
        print "TRAIN-TEST: %d" % i
        x_ques1train, x_ques1test = x_ques1[train_index], x_ques1[test_index]
        x_ques2train, x_ques2test = x_ques2[train_index], x_ques2[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        pidstrain, pidstest = pids[train_index], pids[test_index]

        print(x_ques1train.shape, x_ques1test.shape, x_ques2train.shape,
              x_ques2test.shape, ytrain.shape, ytest.shape, pidstrain.shape,
              pidstest.shape)

        print("Training...")
        checkpoint = ModelCheckpoint(
            filepath=os.path.join(MODEL_DIR, "quora_dul_best_lstm.hdf5"),
            verbose=1,
            save_best_only=True)
        model.fit(
            [x_ques1train, x_ques2train],
            ytrain,
            batch_size=BATCH_SIZE,
            epochs=NBR_EPOCHS,
            validation_split=0.1,
            verbose=2,
            callbacks=[checkpoint])

        # predict
        print("predict...")
        y_test_pred = model.predict_classes(
            [x_ques1test, x_ques2test], batch_size=BATCH_SIZE)
        print(y_test_pred)
        data_process.pred_save("../data/y_test_{:d}.pred".format(i),
                               y_test_pred, ytest, pidstest)

        print("Evaluation...")
        loss, acc = model.evaluate(
            [x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE)
        print("Test loss/accuracy final model = %.4f, %.4f" % (loss, acc))

        model.save_weights(
            os.path.join(MODEL_DIR, "quora_dul_lstm-final.hdf5"))
        with open(os.path.join(MODEL_DIR, "quora_dul_lstm.json"),
                  "w") as fjson:
            fjson.write(model.to_json())

        model.load_weights(filepath=os.path.join(MODEL_DIR,
                                                 "quora_dul_best_lstm.hdf5"))
        loss, acc = model.evaluate(
            [x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE)
        print("Test loss/accuracy best model = %.4f, %.4f" % (loss, acc))
        sum_acc += acc
    print "After all the result acc:", sum_acc / 10
def model(in_file):
    """ 判断文档是否重复的lstm+attention模型;
    """
    # Data preprocessing stage
    print("data process...")
    ques_pairs = data_process.parse_quora_dul_data(in_file)
    word2idx = data_process.build_vocab(ques_pairs)
    vocab_size = len(word2idx) + 1
    seq_maxlen = data_process.get_seq_maxlen(ques_pairs)
    x_ques1, x_ques2, y, pids = data_process.vectorize_ques_pair(
        ques_pairs, word2idx, seq_maxlen)

    x_ques1train, x_ques1test, x_ques2train, x_ques2test, ytrain, ytest, pidstrain, pidstest = \
        train_test_split(x_ques1, x_ques2, y, pids, test_size=0.2, random_state=42)
    print(x_ques1train.shape, x_ques1test.shape, x_ques2train.shape,
          x_ques2test.shape, ytrain.shape, ytest.shape, pidstrain.shape,
          pidstest.shape)

    # Compute the initial embedding weights
    w2v_embedding_model = word2vec_pretrain.train_word2vec(
        ques_pairs, num_features=EMBED_DIM, min_word_count=1, context=5)
    embedding_weights = np.zeros((vocab_size, EMBED_DIM))
    for word, index in word2idx.items():
        if word in w2v_embedding_model:
            embedding_weights[index, :] = w2v_embedding_model[word]
        else:
            embedding_weights[index, :] = np.random.uniform(
                -0.25, 0.25, w2v_embedding_model.vector_size)

    # Build the model
    print("Building model...")
    ques1_enc = Sequential()
    ques1_enc.add(
        Embedding(output_dim=EMBED_DIM,
                  input_dim=vocab_size,
                  input_length=seq_maxlen,
                  weights=[embedding_weights]))
    ques1_enc.add(LSTM(HIDDEN_DIM, return_sequences=True))
    ques1_enc.add(Dropout(0.3))

    ques2_enc = Sequential()
    ques2_enc.add(
        Embedding(output_dim=EMBED_DIM,
                  input_dim=vocab_size,
                  input_length=seq_maxlen,
                  weights=[embedding_weights]))
    ques2_enc.add(LSTM(HIDDEN_DIM, return_sequences=True))
    ques2_enc.add(Dropout(0.3))

    # attention model
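    # The "attention" here is a dot product of the two encoded sequences along
    # the time axis: two (batch, seq_maxlen, HIDDEN_DIM) tensors give a
    # (batch, HIDDEN_DIM, HIDDEN_DIM) similarity matrix, which is flattened,
    # projected by a dense layer and reshaped back to (seq_maxlen, HIDDEN_DIM)
    # so it can be summed with the question-1 encoding below.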
    attn = Sequential()
    attn.add(Merge([ques1_enc, ques2_enc], mode="dot", dot_axes=[1, 1]))
    attn.add(Flatten())
    attn.add(Dense((seq_maxlen * HIDDEN_DIM)))
    attn.add(Reshape((seq_maxlen, HIDDEN_DIM)))

    model = Sequential()
    model.add(Merge([ques1_enc, attn], mode="sum"))
    model.add(Flatten())
    model.add(Dense(2, activation="softmax"))

    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])

    print("Training...")
    checkpoint = ModelCheckpoint(filepath=os.path.join(
        MODEL_DIR, "quora_dul_best_lstm_atten.hdf5"),
                                 verbose=1,
                                 save_best_only=True)
    model.fit([x_ques1train, x_ques2train],
              ytrain,
              batch_size=BATCH_SIZE,
              epochs=NBR_EPOCHS,
              validation_split=0.1,
              verbose=2,
              callbacks=[checkpoint])

    # predict
    print("predict...")
    y_test_pred = model.predict_classes([x_ques1test, x_ques2test],
                                        batch_size=BATCH_SIZE)
    data_process.pred_save("../data/y_test.pred", y_test_pred, ytest, pidstest)

    print("Evaluation...")
    loss, acc = model.evaluate([x_ques1test, x_ques2test],
                               ytest,
                               batch_size=BATCH_SIZE)
    print("Test loss/accuracy final model = %.4f, %.4f" % (loss, acc))

    model.save_weights(
        os.path.join(MODEL_DIR, "quora_dul_lstm_atten-final.hdf5"))
    with open(os.path.join(MODEL_DIR, "quora_dul_lstm.json"), "w") as fjson:
        fjson.write(model.to_json())

    model.load_weights(
        filepath=os.path.join(MODEL_DIR, "quora_dul_best_lstm_atten.hdf5"))
    loss, acc = model.evaluate([x_ques1test, x_ques2test],
                               ytest,
                               batch_size=BATCH_SIZE)
    print("Test loss/accuracy best model = %.4f, %.4f" % (loss, acc))
Example #3
parse.add_argument("--dev_file", default="data/triplet_data_only/14res/dev.txt", type=str)
parse.add_argument("--test_file", default="data/triplet_data_only/14res/test.txt", type=str)
parse.add_argument("--vocab", default="./14res_vocab", type=str)
parse.add_argument("--embedding_file", default=None, type=str)

args = parse.parse_args()


if __name__ == '__main__':

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

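    # Build the vocabulary (and its word-embedding matrix) once from the
    # train/dev/test files and cache it at args.vocab; later runs reload it
    # from disk instead of rebuilding it.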
    if not os.path.exists(args.vocab):
        vocab = build_vocab([args.train_file, args.dev_file, args.test_file])
        embedding_mat = vocab.load_word_embedding(embedding_file=args.embedding_file)
        vocab.save(args.vocab)
    else:
        vocab = Vocab.load(args.vocab)
        embedding_mat = vocab.get_word_embedding()

    data_helper, features_train = build_data(args.train_file, vocab, mode="train")
    data_helper_dev, features_dev = build_data(args.dev_file, vocab, mode="test")
    data_helper_test, features_test = build_data(args.test_file, vocab, mode="test")


    num_embeddings = vocab.vocab_size
    num_embeddings_c = len(vocab.char2ids.keys())
    embedding_dim = 300
    rnn_1_dim = 100
Example #4
import numpy as np
import tensorflow as tf
from model import reRNN
from data_process import read_txt, preprocess, build_vocab, batch_iter, sentenceToIndex
import os, json

if __name__ == "__main__":
    DIR = "models"

    # read and build dataset
    data = read_txt('./data/novel.txt')
    data = preprocess(data)
    vocab, reverse_vocab, vocab_size = build_vocab(data)

    # save vocab
    with open('vocab.json', 'w') as fp:
        json.dump(vocab, fp)

    # open session
    config = tf.ConfigProto()
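    # allow_growth makes TensorFlow allocate GPU memory on demand instead of
    # reserving the whole GPU up front.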
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # make model instance
    model = reRNN(sess=sess, vocab_size=vocab_size, lr=1e-1)

    # make train batches
    batches = batch_iter(data, batch_size=64, num_epochs=1001)

    # model saver
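    # Keep only the three most recent checkpoints, plus one permanent
    # checkpoint from every half hour of training.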
    saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=0.5)