def model(data_file): """ 使用lstm模型,判断一对问题是否是重复的 """ # 数据预处理 print "data pre-processing" ques_pairs = data_process.parse_quoar_dul_data(data_file)[0:3000] word2idx = data_process.build_vocab(ques_pairs) vocab_size = len(word2idx) + 1 seq_maxlen = data_process.get_seq_maxlen(ques_pairs) pids, x_ques1, x_ques2, y = data_process.vectorize_ques_pair( ques_pairs, word2idx, seq_maxlen) # 计算embeding 初始weight; # 训练处word2vec模型,用n维向量表示一个词 w2v_embedding_model = word2vec_pretrain.train_word2vec( ques_pairs, num_features=EMBED_DIM, min_word_count=1, context=5) embedding_weights = np.zeros((vocab_size, EMBED_DIM)) # 将我们创建出的词表中每个文本词替换为词向量 for word, index in word2idx.iteritems(): if word in w2v_embedding_model: embedding_weights[index, :] = w2v_embedding_model[word] else: embedding_weights[index, :] = np.random.uniform( -0.25, 0.25, w2v_embedding_model.vector_size) # 建立lstm模型; print("Building model...") ques1_enc = Sequential() ques1_enc.add( Embedding( output_dim=EMBED_DIM, input_dim=vocab_size, weights=[embedding_weights], mask_zero=True)) ques1_enc.add( LSTM( HIDDEN_DIM, input_shape=(EMBED_DIM, seq_maxlen), return_sequences=False)) ques1_enc.add(Dropout(0.3)) ques2_enc = Sequential() ques2_enc.add( Embedding( output_dim=EMBED_DIM, input_dim=vocab_size, weights=[embedding_weights], mask_zero=True)) ques2_enc.add( LSTM( HIDDEN_DIM, input_shape=(EMBED_DIM, seq_maxlen), return_sequences=False)) ques2_enc.add(Dropout(0.3)) model = Sequential() model.add(Merge([ques1_enc, ques2_enc], mode="sum")) model.add(Dense(2, activation="softmax")) model.compile( optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) # 10折交叉 kf = KFold(n_splits=10) i = 0 sum_acc = 0.0 for train_index, test_index in kf.split(x_ques1): i += 1 print "TRAIN-TEST: %d" % i x_ques1train, x_ques1test = x_ques1[train_index], x_ques1[test_index] x_ques2train, x_ques2test = x_ques2[train_index], x_ques2[test_index] ytrain, ytest = y[train_index], y[test_index] pidstrain, pidstest = pids[train_index], pids[test_index] print(x_ques1train.shape, x_ques1test.shape, x_ques2train.shape, x_ques2test.shape, ytrain.shape, ytest.shape, pidstrain.shape, pidstest.shape) print("Training...") checkpoint = ModelCheckpoint( filepath=os.path.join(MODEL_DIR, "quora_dul_best_lstm.hdf5"), verbose=1, save_best_only=True) model.fit( [x_ques1train, x_ques2train], ytrain, batch_size=BATCH_SIZE, epochs=NBR_EPOCHS, validation_split=0.1, verbose=2, callbacks=[checkpoint]) # predict print("predict...") y_test_pred = model.predict_classes( [x_ques1test, x_ques2test], batch_size=BATCH_SIZE) print y_test_pred data_process.pred_save("../data/y_test_{:d}.pred".format(i), y_test_pred, ytest, pidstest) print("Evaluation...") loss, acc = model.evaluate( [x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE) print("Test loss/accuracy final model = %.4f, %.4f" % (loss, acc)) model.save_weights( os.path.join(MODEL_DIR, "quora_dul_lstm-final.hdf5")) with open(os.path.join(MODEL_DIR, "quora_dul_lstm.json"), "wb") as fjson: fjson.write(model.to_json()) model.load_weights(filepath=os.path.join(MODEL_DIR, "quora_dul_best_lstm.hdf5")) loss, acc = model.evaluate( [x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE) print("Test loss/accuracy best model = %.4f, %.4f" % (loss, acc)) sum_acc += acc print "After all the result acc:", sum_acc / 10
def model(in_file): """ 判断文档是否重复的lstm+attention模型; """ # 数据预处理阶段; print("data process...") ques_pairs = data_process.parse_quora_dul_data(in_file) word2idx = data_process.build_vocab(ques_pairs) vocab_size = len(word2idx) + 1 seq_maxlen = data_process.get_seq_maxlen(ques_pairs) x_ques1, x_ques2, y, pids = data_process.vectorize_ques_pair( ques_pairs, word2idx, seq_maxlen) x_ques1train, x_ques1test, x_ques2train, x_ques2test, ytrain, ytest, pidstrain, pidstest = \ train_test_split(x_ques1, x_ques2, y, pids, test_size=0.2, random_state=42) print(x_ques1train.shape, x_ques1test.shape, x_ques2train.shape, x_ques2test.shape, ytrain.shape, ytest.shape, pidstrain.shape, pidstest.shape) # 计算embeding 初始weight; w2v_embedding_model = word2vec_pretrain.train_word2vec( ques_pairs, num_features=EMBED_DIM, min_word_count=1, context=5) embedding_weights = np.zeros((vocab_size, EMBED_DIM)) for word, index in word2idx.iteritems(): if word in w2v_embedding_model: embedding_weights[index, :] = w2v_embedding_model[word] else: embedding_weights[index, :] = np.random.uniform( -0.25, 0.25, w2v_embedding_model.vector_size) # 建立模型; print("Building model...") ques1_enc = Sequential() ques1_enc.add( Embedding(output_dim=EMBED_DIM, input_dim=vocab_size, input_length=seq_maxlen, weights=[embedding_weights])) ques1_enc.add(LSTM(HIDDEN_DIM, return_sequences=True)) ques1_enc.add(Dropout(0.3)) ques2_enc = Sequential() ques2_enc.add( Embedding(output_dim=EMBED_DIM, input_dim=vocab_size, input_length=seq_maxlen, weights=[embedding_weights])) ques2_enc.add(LSTM(HIDDEN_DIM, return_sequences=True)) ques2_enc.add(Dropout(0.3)) # attention model attn = Sequential() attn.add(Merge([ques1_enc, ques2_enc], mode="dot", dot_axes=[1, 1])) attn.add(Flatten()) attn.add(Dense((seq_maxlen * HIDDEN_DIM))) attn.add(Reshape((seq_maxlen, HIDDEN_DIM))) model = Sequential() model.add(Merge([ques1_enc, attn], mode="sum")) model.add(Flatten()) model.add(Dense(2, activation="softmax")) model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) print("Training...") checkpoint = ModelCheckpoint(filepath=os.path.join( MODEL_DIR, "quora_dul_best_lstm_atten.hdf5"), verbose=1, save_best_only=True) model.fit([x_ques1train, x_ques2train], ytrain, batch_size=BATCH_SIZE, epochs=NBR_EPOCHS, validation_split=0.1, verbose=2, callbacks=[checkpoint]) # predict print("predict...") y_test_pred = model.predict_classes([x_ques1test, x_ques2test], batch_size=BATCH_SIZE) data_process.pred_save("../data/y_test.pred", y_test_pred, ytest, pidstest) print("Evaluation...") loss, acc = model.evaluate([x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE) print("Test loss/accuracy final model = %.4f, %.4f" % (loss, acc)) model.save_weights( os.path.join(MODEL_DIR, "quora_dul_lstm_atten-final.hdf5")) with open(os.path.join(MODEL_DIR, "quora_dul_lstm.json"), "wb") as fjson: fjson.write(model.to_json()) model.load_weights( filepath=os.path.join(MODEL_DIR, "quora_dul_best_lstm_atten.hdf5")) loss, acc = model.evaluate([x_ques1test, x_ques2test], ytest, batch_size=BATCH_SIZE) print("Test loss/accuracy best model = %.4f, %.4f" % (loss, acc))
parse.add_argument("--dev_file", default="data/triplet_data_only/14res/dev.txt", type=str) parse.add_argument("--test_file", default="data/triplet_data_only/14res/test.txt", type=str) parse.add_argument("--vocab", default="./14res_vocab", type=str) parse.add_argument("--embedding_file", default=None, type=str) args = parse.parse_args() if __name__ == '__main__': device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if not os.path.exists(args.vocab): vocab = build_vocab([args.train_file,args.dev_file, args.test_file]) embedding_mat = vocab.load_word_embedding(embedding_file=args.embedding_file) vocab.save(args.vocab) else: vocab = Vocab.load(args.vocab) embedding_mat = vocab.get_word_embedding() data_helper, features_train = build_data(args.train_file, vocab, mode="train") data_helper_dev, features_dev = build_data(args.dev_file, vocab, mode="test") data_helper_test, features_test = build_data(args.test_file, vocab, mode="test") num_embeddings = vocab.vocab_size num_embeddings_c = len(vocab.char2ids.keys()) embedding_dim = 300 rnn_1_dim = 100
import json
import os

import numpy as np
import tensorflow as tf

from model import reRNN
from data_process import read_txt, preprocess, build_vocab, batch_iter, sentenceToIndex

if __name__ == "__main__":
    DIR = "models"

    # Read and build the dataset.
    data = read_txt('./data/novel.txt')
    data = preprocess(data)
    vocab, reverse_vocab, vocab_size = build_vocab(data)

    # Save the vocabulary.
    with open('vocab.json', 'w') as fp:
        json.dump(vocab, fp)

    # Open a session that grows GPU memory on demand.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Make the model instance.
    model = reRNN(sess=sess, vocab_size=vocab_size, lr=1e-1)

    # Make the training batches.
    batches = batch_iter(data, batch_size=64, num_epochs=1001)

    # Model saver: keep the 3 most recent checkpoints, plus one every half
    # hour of training.
    saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=0.5)
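
    # A hedged sketch of the training loop that would typically follow; the
    # source is truncated here and reRNN's training interface is not shown,
    # so model.train() and the shape of each batch are assumptions.
    sess.run(tf.global_variables_initializer())
    for step, batch in enumerate(batches):
        x, y = batch  # assumed (input, target) pair yielded by batch_iter
        loss = model.train(x, y)  # hypothetical method on reRNN
        if step % 100 == 0:
            print("step {}: loss {:.4f}".format(step, loss))
            saver.save(sess, os.path.join(DIR, "reRNN"), global_step=step)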