prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])
training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

prem_reps = []  # premise sentence representations
hypo_reps = []  # hypothesis sentence representations

# read in embedding and translate
if args.agg_we is not None or args.align_op_we is not None:
    print(" fetching word embedding")
    embedding_matrix = get_embedding_matrix(args.embedding, VOCAB,
                                            EMBED_HIDDEN_SIZE, tokenizer)
    embed = Embedding(VOCAB,
                      EMBED_HIDDEN_SIZE,
                      weights=[embedding_matrix],
                      input_length=MAX_LEN,
                      trainable=False)
    prem = embed(premise)
    hypo = embed(hypothesis)

    if args.timedist:
        translate = TimeDistributed(
            Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))
        prem = translate(prem)
        hypo = translate(hypo)
EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 200
MAX_JACCARD_LENGTH = 30
INC_BATCH_SIZE = 80000
BASE_DIR = ''
# W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6.bin'
W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6_nltk.bin'
TRAIN_SET_DIR = '/Users/knight/Desktop/GodClassDetection/trainset'  # change this to your own path
FULL_MN_DIR = TRAIN_SET_DIR

tokenizer = preprocess.get_tokenizer(FULL_MN_DIR)
all_word_index = tokenizer.word_index
embedding_matrix = preprocess.get_embedding_matrix(all_word_index,
                                                   W2V_MODEL_DIR,
                                                   dim=EMBEDDING_DIM)
acc_list = []
loss_list = []

x_train, y_train = preprocess.get_xy_train(TRAIN_SET_DIR + '/finetune',
                                           tokenizer=tokenizer,
                                           mn_maxlen=MAX_SEQUENCE_LENGTH,
                                           embedding_matrix=embedding_matrix)

print('Fine tune model.')
# fine-tune the pre-trained model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
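# --- Hedged sketch (not from the original source): one plausible way to run the
# fine-tuning step that the compile() call above sets up. The epoch count, batch
# size, validation split and early-stopping patience are illustrative assumptions,
# not values taken from the project.
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=[early_stop])
# collect the last-epoch metrics in the lists declared above
acc_list.append(history.history['acc'][-1])
loss_list.append(history.history['loss'][-1])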
from keras.preprocessing.text import Tokenizer

for i in range(0, m):
    sentences1.append(text_to_word_list(questions1[i]))
    sentences2.append(text_to_word_list(questions2[i]))
print('Corpus length = %d' % m)

documents = sentences1 + sentences2

# Create the tokenizer and fit it on the documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)

# Get the word-to-index dictionary
word_to_idx = tokenizer.word_index
print('Vocabulary size = %d' % len(word_to_idx))

# Generate the embedding matrix
embedding_matrix, word2vec = get_embedding_matrix(
    word_to_idx, documents, network_config['pre_trained_vector_flag'])

# Create training, validation and test sets
train_validation_dict = create_train_dev_test_set(tokenizer, sentences1,
                                                  sentences2, sim_score)

# Train the model
lstm_network = BiLSTMNetwork()
lstm_network.train_model(train_validation_dict, embedding_matrix)
OPTIMIZER = 'rmsprop'

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X), maxlen=MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])
training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

# read in embedding and translate
print("> fetching word embedding")
embedding_matrix = get_embedding_matrix(args.embedding, VOCAB,
                                        EMBED_HIDDEN_SIZE, tokenizer)
embed = Embedding(VOCAB,
                  EMBED_HIDDEN_SIZE,
                  weights=[embedding_matrix],
                  input_length=MAX_LEN,
                  trainable=False)
prem = embed(premise)
hypo = embed(hypothesis)

translate = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))
prem = translate(prem)
hypo = translate(hypo)

alignment = _align(prem, hypo, normalize=True)
PATIENCE = int(args.patience)
BATCH_SIZE = 512
DENSE_NEURON_COUNT = int(args.neurons)
DP = 0.2
L2 = 4e-6

to_seq = lambda X: pad_sequences(tokenizer.texts_to_sequences(X),
                                 maxlen=SENTENCE_MAX_LEN)
prepare_data = lambda data: (to_seq(data[0]), to_seq(data[1]), data[2])
training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

logging.info("> fetching embedding")
embedding_matrix = get_embedding_matrix(emb_file, VOCAB_SIZE, WORD_DIM, tokenizer)
logging.info("> fetching antonym embedding")
ant_embedding_matrix = get_embedding_matrix(ant_emb_file, VOCAB_SIZE,
                                            ANT_WORD_DIM, tokenizer)

embed = Embedding(VOCAB_SIZE,
                  WORD_DIM,
                  weights=[embedding_matrix],
                  input_length=SENTENCE_MAX_LEN,
                  trainable=False)
ant_embed = Embedding(VOCAB_SIZE,
                      ANT_WORD_DIM,
                      weights=[ant_embedding_matrix],
                      input_length=SENTENCE_MAX_LEN,
                      trainable=False)
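# --- Hedged sketch (not from the original source): the snippet above defines the
# word and antonym embedding layers but does not show how they are applied. One
# plausible continuation, assuming both views are looked up per token and
# concatenated along the feature axis; the `premise`/`hypothesis` Input tensors
# of shape (SENTENCE_MAX_LEN,) are assumed to exist as in the other snippets.
from keras.layers import concatenate

prem_word = embed(premise)       # (batch, SENTENCE_MAX_LEN, WORD_DIM)
prem_ant = ant_embed(premise)    # (batch, SENTENCE_MAX_LEN, ANT_WORD_DIM)
prem = concatenate([prem_word, prem_ant], axis=-1)

hypo_word = embed(hypothesis)
hypo_ant = ant_embed(hypothesis)
hypo = concatenate([hypo_word, hypo_ant], axis=-1)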
import numpy as np
import preprocess as pp
from keras.models import model_from_json

question1, question2 = pp.extract_data("quora-question-pairs/test.csv", 'test')
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(word_index,
                                                          embeddings_index)
q1_data, q2_data, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'test')

# Note: despite the *_train names, these arrays hold the test-set questions.
X_train = np.stack((q1_data, q2_data), axis=1)
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

# Load the serialized architecture and the best weights from disk
json_file = open('best_weights/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights("best_weights/weights.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
score = model.predict([Q1_train, Q2_train])
print(score)
        a = K.l2_normalize(a, axis=2)
        b = K.l2_normalize(b, axis=2)
        return K.batch_dot(a, b, axes=[2, 2])

    def compute_output_shape(self, input_shape):
        a_shape, b_shape = input_shape
        return (a_shape[0], a_shape[1], b_shape[1])


premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

# read in embedding and translate
print("> fetching word embedding")
embedding_matrix = get_embedding_matrix(args.embedding, VOCAB, 300, tokenizer)
embed = Embedding(VOCAB,
                  300,
                  weights=[embedding_matrix],
                  input_length=42,
                  trainable=False)
prem = embed(premise)
hypo = embed(hypothesis)

# project each embedded sequence through the same dense layer
translate = TimeDistributed(Dense(300, activation="relu"))
prem = translate(prem)
hypo = translate(hypo)

perspectives = 5
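# --- Hedged sketch (not from the original source): a self-contained version of
# the custom layer whose call()/compute_output_shape() fragment opens the last
# snippet. The class name `CosineAlignment` and the call() signature are
# assumptions; the body reproduces the l2-normalize + batch_dot computation
# shown above, which yields a (batch, len_a, len_b) matrix of cosine similarities.
from keras import backend as K
from keras.layers import Layer


class CosineAlignment(Layer):
    """Pairwise cosine similarity between two sequences of vectors."""

    def call(self, inputs):
        a, b = inputs                          # (batch, len_a, dim), (batch, len_b, dim)
        a = K.l2_normalize(a, axis=2)          # unit-normalize every time step
        b = K.l2_normalize(b, axis=2)
        return K.batch_dot(a, b, axes=[2, 2])  # (batch, len_a, len_b)

    def compute_output_shape(self, input_shape):
        a_shape, b_shape = input_shape
        return (a_shape[0], a_shape[1], b_shape[1])


# Example usage on the translated premise/hypothesis tensors:
# alignment = CosineAlignment()([prem, hypo])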