def train():
    # input_text = ['1 2 3 4 5'
    #     , '6 7 8 9 10'
    #     , '11 12 13 14 15'
    #     , '16 17 18 19 20'
    #     , '21 22 23 24 25']
    # tar_text = ['one two three four five'
    #     , 'six seven eight nine ten'
    #     , 'eleven twelve thirteen fourteen fifteen'
    #     , 'sixteen seventeen eighteen nineteen twenty'
    #     , 'twenty_one twenty_two twenty_three twenty_four twenty_five']
    # vocab = sorted(reduce(lambda x, y: x | y, (set(tmp_list) for tmp_list in input_list + tar_list)))
    vocab = loaddic('./corpus/smalldic.txt')
    print('-----------')
    # print(vocab)
    print('-----------')
    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1  # Keras embeddings need len(vocab) + 1 (index 0 is reserved for padding)
    # input_maxlen = max(map(len, (x for x in input_list)))
    # tar_maxlen = max(map(len, (x for x in tar_list)))
    input_maxlen = 70
    tar_maxlen = 17
    output_dim = vocab_size
    hidden_dim = 100

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Input max length:', input_maxlen, 'words')
    print('Target max length:', tar_maxlen, 'words')
    print('Dimension of hidden vectors:', hidden_dim)
    # print('Number of training stories:', len(input_list))
    # print('Number of test stories:', len(input_list))
    print('-')
    print('Vectorizing the word sequences...')
    word_to_idx = dict((c, i + 1) for i, c in enumerate(vocab))  # map tokens to integer indices for encoding
    idx_to_word = dict((i + 1, c) for i, c in enumerate(vocab))  # map indices back to tokens for decoding

    decoder_mode = 3  # 0: plain decoder, 1: feed-back decoder, 2: peek decoder, 3: attention decoder
    if decoder_mode == 3:
        encoder_top_layer = LSTM(hidden_dim, return_sequences=True)
    else:
        encoder_top_layer = LSTM(hidden_dim)

    if decoder_mode == 0:
        decoder_top_layer = LSTM(hidden_dim, return_sequences=True)
        decoder_top_layer.get_weights()
    elif decoder_mode == 1:
        decoder_top_layer = LSTMDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                        output_length=tar_maxlen, state_input=False,
                                        return_sequences=True)
    elif decoder_mode == 2:
        decoder_top_layer = LSTMDecoder2(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                         output_length=tar_maxlen, state_input=False,
                                         return_sequences=True)
    elif decoder_mode == 3:
        decoder_top_layer = AttentionDecoder(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                             output_length=tar_maxlen, state_input=False,
                                             return_sequences=True)

    en_de_model = Sequential()
    en_de_model.add(Embedding(input_dim=vocab_size,
                              output_dim=hidden_dim,
                              input_length=input_maxlen))
    en_de_model.add(encoder_top_layer)
    if decoder_mode == 0:
        en_de_model.add(RepeatVector(tar_maxlen))
    en_de_model.add(decoder_top_layer)

    en_de_model.add(TimeDistributedDense(output_dim))
    en_de_model.add(Activation('softmax'))
    en_de_model.load_weights('en_de_weights1-40.h5')

    print('Compiling...')
    time_start = time.time()
    en_de_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    time_end = time.time()
    print('Compiled, cost time: %f seconds!' % (time_end - time_start))

    # # input_text = loadfile('./corpus/content-12147.txt')
    # input_text = loadfile('./corpus/content1-500.txt')
    #
    # input_list = []
    # for tmp_input in input_text:
    #     input_list.append(chtokenize(tmp_input))
    #
    # inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen, tar_maxlen, vocab_size)
    #
    # out_predicts = en_de_model.predict(inputs_train)
    # for i_idx, out_predict in enumerate(out_predicts):
    #     predict_sequence = []
    #     tempstr = ''
    #     for predict_vector in out_predict:
    #         next_index = np.argmax(predict_vector)
    #         next_token = idx_to_word[next_index]
    #         # print(next_token)
    #         tempstr += next_token
    #         predict_sequence.append(next_token)
    #     print(tempstr)
    #     # print('Predict output:', predict_sequence)
    #
    # print('Train Ended')

    # def predict(input_text):
    import socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    host = socket.gethostbyname(socket.gethostname())
    port = 50008
    sock.bind((host, port))
    sock.listen(5)
    while True:
        conn, addr = sock.accept()
        data = conn.recv(1024).decode()  # decode the received bytes to str
        input_texts = []
        # sample Chinese news sentence used for manual testing:
        # input_text = '实际上,上周主管部门就和大唐打过招呼了,内部消息人士透露,国资委已经就李小琳任职问题和大唐进行沟通,但李小琳本人至今未报到。情况比较复杂,上述人士表示,目前还不敢完全确定,不排除后续还有变化。'
        tmp = 'BEG ' + data + ' END'
        tmp = jiebacut(tmp)
        input_texts.append(tmp)
        result = ''
        input_list = []
        for tmp_input in input_texts:
            print(tmp_input)
            print('---!--!---')
            input_list.append(chtokenize(tmp_input))
        inputs_train = vectorize_x(input_list, word_to_idx, input_maxlen)
        out_predicts = en_de_model.predict(inputs_train)
        for i_idx, out_predict in enumerate(out_predicts):
            predict_sequence = []
            tempstr = ''
            for predict_vector in out_predict:
                next_index = np.argmax(predict_vector)
                next_token = idx_to_word[next_index]
                tempstr += next_token
                predict_sequence.append(next_token)
            print(tempstr)
            result = tempstr
            print('Predict output:', predict_sequence)
        reply = result
        conn.send(reply.encode())
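# --- Usage sketch (assumption, not part of the original script): a minimal client
# --- for exercising the prediction server above. The host, port, and sample
# --- sentence are illustrative; the server must already be running.
import socket

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.connect((socket.gethostbyname(socket.gethostname()), 50008))
client.send(u'一段待摘要的中文新闻文本'.encode())   # send a Chinese sentence to summarize
print(client.recv(1024).decode())                   # print the predicted token sequence
client.close()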
def train(train_path, tokenizer_path):
    print('import data...')
    maxlen = 1024
    X, label, Y = text2sequence(train_path, tokenizer_path, maxlen)
    num_class = len(set(label))
    print('data import finished!')
    tokenizer = pickle.load(open(tokenizer_path, 'rb'))
    num_words = len(tokenizer.word_index) + 1

    print('prepare training data and validation data using k_fold')
    seed = 0
    k = 10
    k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)  # 10-fold cross-validation split
    cw_1 = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}  # ignore class imbalance
    cw_2 = {0: 0.348709, 1: 3.457910, 2: 1.451396, 3: 2.116922,
            4: 17.358700, 5: 0.404727, 6: 3.370635, 7: 1.167362}  # per-class weight = 1 / (8 * class frequency)
    class_weight = [cw_1, cw_2]  # treat both weighting schemes as equally important
    # On a 100-document test set, training without class_weight performed better than with it,
    # and using both weightings performed better than using cw_2 alone.

    print('create lstm model...')
    model = Sequential()
    model.add(Embedding(num_words, 300, input_length=maxlen))
    model.add(Dropout(0.5))
    model.add(Convolution1D(128, 3, padding='same', strides=1))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(64, recurrent_dropout=0.5))
    model.add(Dropout(0.5))
    model.add(Dense(num_class, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    k_fold_cv_loss = []
    k_fold_cv_acc = []
    dt = datetime.now()
    d = dt.date()
    h = dt.time().hour
    m = dt.time().minute
    time_str = '{}_{}{}'.format(d, h, m)
    mckpt = ModelCheckpoint('model/best-cnn_lstm_weights_{}.h5'.format(time_str),
                            monitor='val_loss', mode='auto', verbose=1,
                            save_best_only=True, save_weights_only=True, period=1)
    rlstp = EarlyStopping(monitor='val_loss', patience=3)
    tb = TensorBoard(log_dir='./logs', embeddings_freq=1, write_images=1,
                     histogram_freq=1, batch_size=32)

    turn = 1
    for train, valid in k_fold.split(X, label):
        print('the {} turn training...'.format(turn))
        turn += 1
        model.fit(X[train], Y[train],
                  validation_data=(X[valid], Y[valid]),
                  class_weight=None,
                  callbacks=[mckpt],
                  verbose=2, epochs=10, batch_size=32)
        # Evaluate model
        loss, acc = model.evaluate(X[valid], Y[valid], verbose=0, batch_size=32)
        k_fold_cv_loss.append(loss)
        k_fold_cv_acc.append(acc)

    print("Model loss: {:0.6f}".format(np.mean(k_fold_cv_loss)))
    print("Model Accuracy: {:0.6f}%".format(np.mean(k_fold_cv_acc) * 100))

    # Save model
    model.save_weights('model/cnn_lstm_weights_{}.h5'.format(time_str))
    model.save('model/cnn_lstm_model_{}.h5'.format(time_str))
    with open('model/cnn_lstm_model_{}.json'.format(time_str), 'w') as outfile:
        outfile.write(model.to_json())
tokenizer.word_index['no']
sum(answers_test)

# model creation
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM

# the shape is a tuple (max_story_len,); the batch size is left unspecified, hence the trailing comma
input_sequence = Input((max_story_len, ))
question = Input((max_question_len, ))

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocabulary_len, output_dim=64))
input_encoder_m.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocabulary_len, output_dim=max_question_len))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, question_maxlen)

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocabulary_len,
                               output_dim=64,
                               input_length=max_question_len))
question_encoder.add(Dropout(0.3))
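# --- Sketch (assumption): the encoders above are only the first half of the
# --- end-to-end memory network. Following the standard Keras babI memory-network
# --- example, they are typically wired together roughly as below; the LSTM size,
# --- dropout rate, and optimizer are illustrative, not taken from this code.
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# attention over story positions, conditioned on the question
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# combine the match weights with the second story encoding and reorder axes
response = add([match, input_encoded_c])
response = Permute((2, 1))(response)  # (samples, question_maxlen, story_maxlen)

# concatenate with the question encoding and reduce to an answer distribution
answer = concatenate([response, question_encoded])
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocabulary_len, activation='softmax')(answer)

model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])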
def computeRnnSkipgram(sourceTexts, targetTexts, irmodel, answers, filename):
    """Read source and target artefacts, build the RNN skip-gram model and
    compute a similarity score for each pair of artefacts.

    Args:
        sourceTexts: a list of source artefacts, tokenized with stopwords removed;
        targetTexts: a list of target artefacts, tokenized with stopwords removed;
        answers: list of true links;
        irmodel: a statistical model result (LSI, LDA or VSM)
            # TODO: align RNN vectors without a statistical model result
        filename: file where the IR model results are saved.

    Returns:
        None.
    """
    # load the skip-gram embedding model
    tokenizer, embedding_matrix, sequences, maxlen, num_words, embedding_dim, artifact_pairs = \
        ReadingWordEmb.compute_Skipgram(sourceTexts, targetTexts)
    artifacts1 = [x[0] for x in artifact_pairs]
    artifacts2 = [x[1] for x in artifact_pairs]
    sequences_1 = tokenizer.texts_to_sequences(artifacts1)
    sequences_2 = tokenizer.texts_to_sequences(artifacts2)

    # compute the number of common words for each pair of artifacts
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(sequences_1, sequences_2)]

    # pad all sequences to the same length
    padded_data_1 = pad_sequences(sequences_1, maxlen=maxlen)
    padded_data_2 = pad_sequences(sequences_2, maxlen=maxlen)
    leaks = np.array(leaks)

    # build the answers vector
    labels = CreateTraining_set.alignTraining_set(answers, irmodel)

    # build the training set
    if os.path.exists("outputs/trainingSet.p"):
        print("training set already created")
        print("evaluation set already created")
    else:
        CreateTraining_set.create_training_set(labels)

    # retrieve the training data
    train_indices = pickle.load(open("outputs/trainingSet.p", "rb"))
    train_indices.sort()
    train_data_1_all = np.array(padded_data_1)[train_indices, :]
    train_data_2_all = np.array(padded_data_2)[train_indices, :]
    train_labels_all = np.array(labels)[train_indices, :]
    train_leaks_all = np.array(leaks)[train_indices, :]

    VALIDATION_SPLIT = 0.1
    dev_idx = max(1, int(len(train_labels_all) * VALIDATION_SPLIT))

    # split into training data and validation data
    train_data_1, val_data_1 = train_data_1_all[:-dev_idx], train_data_1_all[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_all[:-dev_idx], train_data_2_all[-dev_idx:]
    train_labels, val_labels = train_labels_all[:-dev_idx], train_labels_all[-dev_idx:]
    train_leaks, val_leaks = train_leaks_all[:-dev_idx], train_leaks_all[-dev_idx:]

    '''
    # retrieve the held-out data for testing the model
    test_indices = pickle.load(open("outputs/evaluationSet.p", "rb"))
    test_data_1 = np.array(padded_data_1)[test_indices, :]
    test_data_2 = np.array(padded_data_2)[test_indices, :]
    test_labels = np.array(labels)[test_indices, :]
    test_leaks = np.array(leaks)[test_indices, :]
    test_data = np.array(artifact_pairs)[test_indices, :]
    del padded_data_1
    del padded_data_2
    gc.collect()
    '''

    # build the RNN model with LSTM
    RATE_DROP_LSTM = 0.17
    RATE_DROP_DENSE = 0.25
    NUMBER_LSTM = 50
    NUMBER_DENSE_UNITS = 50
    NUMBER_DENSE_UNITS_1 = 25
    ACTIVATION_FUNCTION = 'relu'

    # Create the word embedding layer
    embedding_layer = Embedding(num_words, embedding_dim,
                                weights=[embedding_matrix],
                                input_length=maxlen,
                                trainable=False)

    # Create the LSTM encoder
    lstm_layer = Bidirectional(LSTM(NUMBER_LSTM,
                                    dropout=RATE_DROP_LSTM,
                                    recurrent_dropout=RATE_DROP_LSTM))

    # LSTM encoder branch for the source artifact
    sequence_1_input = Input(shape=(maxlen, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # LSTM encoder branch for the target artifact
    sequence_2_input = Input(shape=(maxlen, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    # Input for the hand-crafted common-word ("leaks") features
    leaks_input = Input(shape=(leaks.shape[1], ))
    # leaks_dense = Dense(NUMBER_DENSE_UNITS / 2, activation=ACTIVATION_FUNCTION)(leaks_input)

    # Merge the two LSTM encoded vectors and pass them to dense layers
    # with dropout and batch normalisation
    merged = concatenate([x1, x2, leaks_input])
    merged = BatchNormalization()(merged)
    merged = Dropout(RATE_DROP_DENSE)(merged)
    merged = Dense(NUMBER_DENSE_UNITS, activation=ACTIVATION_FUNCTION)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(RATE_DROP_DENSE)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (NUMBER_LSTM, NUMBER_DENSE_UNITS,
                                      RATE_DROP_LSTM, RATE_DROP_DENSE)
    model_save_directory = 'outputs/'
    checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=False)

    numExec = 1
    RnnResult = []
    for i in range(0, numExec):
        # tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))
        model.fit([train_data_1, train_data_2, train_leaks], train_labels,
                  validation_data=([val_data_1, val_data_2, val_leaks], val_labels),
                  epochs=200, batch_size=64, shuffle=True,
                  callbacks=[early_stopping, model_checkpoint])
        preds = list(model.predict([padded_data_1, padded_data_2, leaks], verbose=1).ravel())
        RnnResult.append(preds)
        loss, accuracy = model.evaluate([padded_data_1, padded_data_2, leaks], labels, verbose=1)
        print('Accuracy: %f' % (accuracy * 100))

    # average the predictions over the executions
    resvect = np.zeros((len(preds), 1))
    for res in RnnResult:
        for i in range(0, len(preds)):
            resvect[i] = resvect[i] + res[i]
    resvect = resvect / numExec

    results = [(x[0], y[0], z) for (x, y), z in zip(artifact_pairs, preds)]
    print("RNN Skipgram model computed")

    # write the results to a CSV file
    with open(filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=("Artifact1", "Artifact2", "probability"))
        writer.writeheader()
        # one row per pair: the two artifact names and the predicted probability
        for res in results:
            writer.writerow({
                'Artifact1': str("{0}".format(res[0])),
                'Artifact2': str("{0}".format(res[1])),
                'probability': str("{0}".format(res[2]))
            })
    print("similarity matrix built")
def Mem_Model2(story_maxlen, query_maxlen, vocab_size):
    # embed the input sequence into a sequence of vectors
    input_encoder_m = Sequential()
    input_encoder_m.add(Embedding(input_dim=vocab_size,
                                  output_dim=128,
                                  input_length=story_maxlen))
    input_encoder_m.add(Dropout(0.5))
    # output: (samples, story_maxlen, embedding_dim)

    # embed the question into a sequence of vectors
    question_encoder = Sequential()
    question_encoder.add(Embedding(input_dim=vocab_size,
                                   output_dim=128,
                                   input_length=query_maxlen))
    question_encoder.add(Dropout(0.5))
    # output: (samples, query_maxlen, embedding_dim)

    # compute a 'match' between input sequence elements (which are vectors)
    # and the question vector sequence
    match = Sequential()
    match.add(Merge([input_encoder_m, question_encoder], mode='dot', dot_axes=[2, 2]))
    match.add(Activation('softmax'))
    plot(match, to_file='model_1.png')
    # output: (samples, story_maxlen, query_maxlen)

    # embed the input into a single vector with size = story_maxlen:
    input_encoder_c = Sequential()
    input_encoder_c.add(Embedding(input_dim=vocab_size,
                                  output_dim=query_maxlen,
                                  input_length=story_maxlen))
    input_encoder_c.add(Dropout(0.5))
    # output: (samples, story_maxlen, query_maxlen)

    # sum the match vector with the input vector:
    response = Sequential()
    response.add(Merge([match, input_encoder_c], mode='sum'))
    # output: (samples, story_maxlen, query_maxlen)
    response.add(Permute((2, 1)))  # output: (samples, query_maxlen, story_maxlen)
    plot(response, to_file='model_2.png')

    # concatenate the match vector with the question vector,
    # and do logistic regression on top
    answer = Sequential()
    answer.add(Merge([response, question_encoder], mode='concat', concat_axis=-1))

    # the original paper uses a matrix multiplication for this reduction step;
    # we choose to use an RNN instead
    answer.add(LSTM(64))
    # one regularization layer -- more would probably be needed
    answer.add(Dropout(0.5))
    answer.add(Dense(50))
    # output 50 sigmoid-activated scores
    answer.add(Activation('sigmoid'))

    return answer
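# --- Usage sketch (assumption): Mem_Model2 only builds the network. With the
# --- legacy Keras 1.x Merge API used above, a driver could look like the
# --- following; the array names, loss, and hyperparameters are illustrative,
# --- and the loss choice depends on how the 50-dimensional sigmoid output is encoded.
model = Mem_Model2(story_maxlen, query_maxlen, vocab_size)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
# One array per leaf branch, in order: input_encoder_m, question_encoder, input_encoder_c.
model.fit([inputs_train, queries_train, inputs_train], answers_train,
          batch_size=32, nb_epoch=10, validation_split=0.1)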
vocab_size = len(tokenizer.word_index) + 1
sentences = tokenizer.texts_to_sequences(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_review_len)
# sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y,
                                                    test_size=0.25,
                                                    random_state=1000)

# Number of features
# print(input_dim)
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_review_len))
model.add(Flatten())
model.add(layers.Dense(300, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
history = model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=256)

# For accuracy values
plt.plot(history.history['acc'])
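# --- Assumed continuation of the plotting code (not in the original snippet):
# --- since validation_data was passed to fit(), history.history also contains
# --- 'val_acc', which can be overlaid on the training curve.
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()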
print(train_data["encoded_text"][:3]) # pad documents to a max length of 4 words X_train = pad_sequences(train_data["encoded_text"], maxlen=SEQUENCE_LENGTH, padding='post') X_eval = pad_sequences(eval_data["encoded_text"], maxlen=SEQUENCE_LENGTH, padding='post') print(X_train[:3]) # define the model logger.debug("Model definition") model = Sequential() model.add(Embedding(VOCAB_SIZE, OUTPUT_DIM, input_length=SEQUENCE_LENGTH)) model.add(Flatten()) model.add(Dense(3, activation='softmax')) # compile the model optimizer = Adam(lr=KERAS_LEARNING_RATE, decay=decay) model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc']) # summarize the model print(model.summary()) # fit the model # Use Early-Stopping callback_early_stopping = keras.callbacks.EarlyStopping( monitor='val_loss', patience=KERAS_EARLY_STOPPING, verbose=VERBOSE,
def run_model_varyembed(dataset, numhidden, hiddendim, idx2word, idx2label, w2v,
                        basedir, embedding_dim=400, validate=True, num_epochs=30):
    train_toks, valid_toks, test_toks, \
        train_lex, valid_lex, test_lex, \
        train_y, valid_y, test_y = dataset

    maxlen = max([len(l) for l in train_lex])
    if len(valid_lex) > 0:
        maxlen = max(maxlen, max([len(l) for l in valid_lex]))
    maxlen = max(maxlen, max([len(l) for l in test_lex]))

    vocsize = max(idx2word.keys()) + 1
    nclasses = max(idx2label.keys()) + 1

    # Pad inputs to max sequence length and turn into one-hot vectors
    train_lex = sequence.pad_sequences(train_lex, maxlen=maxlen)
    valid_lex = sequence.pad_sequences(valid_lex, maxlen=maxlen)
    test_lex = sequence.pad_sequences(test_lex, maxlen=maxlen)
    train_y = sequence.pad_sequences(train_y, maxlen=maxlen)
    valid_y = sequence.pad_sequences(valid_y, maxlen=maxlen)
    test_y = sequence.pad_sequences(test_y, maxlen=maxlen)
    train_y = vectorize_set(train_y, maxlen, nclasses)
    valid_y = vectorize_set(valid_y, maxlen, nclasses)
    test_y = vectorize_set(test_y, maxlen, nclasses)

    # Build the bi-directional model (legacy Keras Graph API)
    print('Building the model...')
    H = numhidden
    model = Graph()
    model.add_input(name='input', input_shape=[maxlen], dtype='int')

    # Add embedding layer
    if w2v is None:
        model.add_node(Embedding(vocsize, embedding_dim, init='lecun_uniform',
                                 input_length=maxlen),
                       name='embed', input='input')
    else:
        embeds = init_embedding_weights(idx2word, w2v)
        embed_dim = w2v.syn0norm.shape[1]
        model.add_node(Embedding(vocsize, embed_dim, input_length=maxlen,
                                 weights=[embeds], mask_zero=True),
                       name='embed', input='input')

    # Build first hidden layer
    model.add_node(LSTM(hiddendim, return_sequences=True, activation='tanh'),
                   name='forward0', input='embed')
    model.add_node(Dropout(0.1), name='dropout0f', input='forward0')
    model.add_node(LSTM(hiddendim, return_sequences=True, go_backwards=True,
                        activation='tanh'),
                   name='backwards0', input='embed')
    model.add_node(Dropout(0.1), name='dropout0b', input='backwards0')

    # Build subsequent hidden layers
    if H > 1:
        for i in range(1, H):
            model.add_node(LSTM(hiddendim, return_sequences=True, activation='tanh'),
                           name='forward%d' % i, input='dropout%df' % (i - 1))
            model.add_node(Dropout(0.1), name='dropout%df' % i, input='forward%d' % i)
            model.add_node(LSTM(hiddendim, return_sequences=True, go_backwards=True,
                                activation='tanh'),
                           name='backwards%d' % i, input='dropout%db' % (i - 1))
            model.add_node(Dropout(0.1), name='dropout%db' % i, input='backwards%d' % i)

    # Finish up the network
    model.add_node(TimeDistributedDense(nclasses), name='tdd',
                   inputs=['dropout%df' % (H - 1), 'dropout%db' % (H - 1)],
                   merge_mode='ave')
    model.add_node(Activation('softmax'), name='softmax', input='tdd')
    model.add_output(name='output', input='softmax')
    model.compile(optimizer='rmsprop', loss={'output': 'categorical_crossentropy'})

    # Set up callbacks
    fileprefix = 'embed_varied_'
    am = approximateMatch.ApproximateMatch_SEQ(valid_toks, valid_y, valid_lex, idx2label,
                                               pred_dir=os.path.join(basedir, 'predictions'),
                                               fileprefix=fileprefix)
    mc = callbacks.ModelCheckpoint(
        os.path.join(basedir, 'models', 'embedding.model.weights.{epoch:02d}.hdf5'))
    cbs = [am, mc]
    if validate:
        early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3)
        cbs.append(early_stopping)

    # Train the model
    print('Training...')
    hist = model.fit({'input': train_lex, 'output': train_y},
                     nb_epoch=num_epochs, batch_size=1,
                     validation_data={'input': valid_lex, 'output': valid_y},
                     callbacks=cbs)

    if validate:
        val_f1, best_model = learning_curve(
            hist,
            preddir=os.path.join(basedir, 'predictions'),
            pltname=os.path.join(basedir, 'charts',
                                 'hist_varyembed%d_nhidden%d.pdf' % (hiddendim, numhidden)),
            fileprefix=fileprefix)
    else:
        best_model = num_epochs - 1
        val_f1 = 0.0

    # Save model
    json_string = model.to_json()
    open(os.path.join(basedir, 'models', 'embedding_model_architecture.json'),
         'w').write(json_string)

    # Test
    bestmodelfile = os.path.join(basedir, 'models',
                                 'embedding.model.weights.%02d.hdf5' % best_model)
    shutil.copyfile(bestmodelfile, bestmodelfile.replace('.hdf5', '.best.hdf5'))
    if validate:
        model = model_from_json(
            open(os.path.join(basedir, 'models',
                              'embedding_model_architecture.json')).read())
        model.load_weights(bestmodelfile)

    scores = predict_score(model, test_lex, test_toks, test_y,
                           os.path.join(basedir, 'predictions'),
                           idx2label, maxlen, fileprefix=fileprefix)
    scores['val_f1'] = val_f1

    return scores, hist.history, best_model
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in the training docs
embedding_matrix = np.zeros((vocab_size, 200))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('Build model...')
# create the model
embedding_vector_length = 200
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,
                    weights=[embedding_matrix], input_length=traning_len))
model.add(LSTM(100))
# model.add(Flatten())
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# early stopping only takes effect when passed to fit() via callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2,
                               verbose=0, mode='auto')
model.fit(X_train_pad, y_train, epochs=10, validation_split=0.1,
          batch_size=128, callbacks=[early_stopping])

# Final evaluation of the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Accuracy: ", accuracy * 100)
def make_network(
        allele_encoding_dims,
        kmer_size,
        peptide_amino_acid_encoding,
        embedding_input_dim,
        embedding_output_dim,
        allele_dense_layer_sizes,
        peptide_dense_layer_sizes,
        peptide_allele_merge_method,
        peptide_allele_merge_activation,
        layer_sizes,
        dense_layer_l1_regularization,
        dense_layer_l2_regularization,
        activation,
        init,
        output_activation,
        dropout_probability,
        batch_normalization,
        embedding_init_method,
        locally_connected_layers):
    """
    Helper function to make a keras network for class1 affinity prediction.
    """

    # We import keras here to avoid tensorflow debug output, etc. unless we
    # are actually about to use Keras.
    from keras.layers import Input
    import keras.layers
    from keras.layers.core import Dense, Flatten, Dropout
    from keras.layers.embeddings import Embedding
    from keras.layers.normalization import BatchNormalization

    if peptide_amino_acid_encoding == "embedding":
        peptide_input = Input(
            shape=(kmer_size,), dtype='int32', name='peptide')
        current_layer = Embedding(
            input_dim=embedding_input_dim,
            output_dim=embedding_output_dim,
            input_length=kmer_size,
            embeddings_initializer=embedding_init_method,
            name="peptide_embedding")(peptide_input)
    else:
        peptide_input = Input(
            shape=(
                kmer_size,
                vector_encoding_length(peptide_amino_acid_encoding)),
            dtype='float32',
            name='peptide')
        current_layer = peptide_input

    inputs = [peptide_input]

    kernel_regularizer = None
    l1 = dense_layer_l1_regularization
    l2 = dense_layer_l2_regularization
    if l1 > 0 or l2 > 0:
        kernel_regularizer = keras.regularizers.l1_l2(l1, l2)

    for (i, locally_connected_params) in enumerate(locally_connected_layers):
        current_layer = keras.layers.LocallyConnected1D(
            name="lc_%d" % i,
            **locally_connected_params)(current_layer)

    current_layer = Flatten(name="flattened_0")(current_layer)

    for (i, layer_size) in enumerate(peptide_dense_layer_sizes):
        current_layer = Dense(
            layer_size,
            name="peptide_dense_%d" % i,
            kernel_regularizer=kernel_regularizer,
            activation=activation)(current_layer)

    if batch_normalization:
        current_layer = BatchNormalization(name="batch_norm_early")(
            current_layer)

    if dropout_probability:
        current_layer = Dropout(dropout_probability, name="dropout_early")(
            current_layer)

    if allele_encoding_dims:
        allele_input = Input(
            shape=allele_encoding_dims,
            dtype='float32',
            name='allele')
        inputs.append(allele_input)
        allele_embedding_layer = Flatten(name="allele_flat")(allele_input)

        for (i, layer_size) in enumerate(allele_dense_layer_sizes):
            allele_embedding_layer = Dense(
                layer_size,
                name="allele_dense_%d" % i,
                kernel_regularizer=kernel_regularizer,
                activation=activation)(allele_embedding_layer)

        if peptide_allele_merge_method == 'concatenate':
            current_layer = keras.layers.concatenate([
                current_layer, allele_embedding_layer
            ], name="allele_peptide_merged")
        elif peptide_allele_merge_method == 'multiply':
            current_layer = keras.layers.multiply([
                current_layer, allele_embedding_layer
            ], name="allele_peptide_merged")
        else:
            raise ValueError(
                "Unsupported peptide_allele_encoding_merge_method: %s"
                % peptide_allele_merge_method)

        if peptide_allele_merge_activation:
            current_layer = keras.layers.Activation(
                peptide_allele_merge_activation,
                name="alelle_peptide_merged_%s" %
                peptide_allele_merge_activation)(current_layer)

    for (i, layer_size) in enumerate(layer_sizes):
        current_layer = Dense(
            layer_size,
            activation=activation,
            kernel_regularizer=kernel_regularizer,
            name="dense_%d" % i)(current_layer)

        if batch_normalization:
            current_layer = BatchNormalization(name="batch_norm_%d" % i)(
                current_layer)

        if dropout_probability > 0:
            current_layer = Dropout(
                dropout_probability,
                name="dropout_%d" % i)(current_layer)

    output = Dense(
        1,
        kernel_initializer=init,
        activation=output_activation,
        name="output")(current_layer)
    model = keras.models.Model(
        inputs=inputs,
        outputs=[output],
        name="predictor")

    return model
def create_model(args, initial_mean_value, overal_maxlen, vocab):
    import keras.backend as K
    from keras.layers.embeddings import Embedding
    from keras.models import Sequential, Model
    from keras.layers.core import Dense, Dropout, Activation
    from nea.my_layers import Attention, MeanOverTime, Conv1DWithMasking

    ###########################################################################
    ## Recurrence unit type
    #
    if args.recurrent_unit == 'lstm':
        from keras.layers.recurrent import LSTM as RNN
    elif args.recurrent_unit == 'gru':
        from keras.layers.recurrent import GRU as RNN
    elif args.recurrent_unit == 'simple':
        from keras.layers.recurrent import SimpleRNN as RNN

    ###########################################################################
    ## Create Model
    #
    dropout_W = 0.5   # default=0.5
    dropout_U = 0.1   # default=0.1
    cnn_border_mode = 'same'
    if initial_mean_value.ndim == 0:
        initial_mean_value = np.expand_dims(initial_mean_value, axis=1)
    num_outputs = len(initial_mean_value)

    if args.model_type == 'cls':
        raise NotImplementedError

    elif args.model_type == 'reg':
        logger.info('Building a REGRESSION model')
        model = Sequential()
        model.add(Embedding(args.vocab_size, args.emb_dim, mask_zero=True))
        if args.cnn_dim > 0:
            model.add(Conv1DWithMasking(nb_filter=args.cnn_dim,
                                        filter_length=args.cnn_window_size,
                                        border_mode=cnn_border_mode,
                                        subsample_length=1))
        if args.rnn_dim > 0:
            model.add(RNN(args.rnn_dim, return_sequences=False,
                          dropout_W=dropout_W, dropout_U=dropout_U))
        if args.dropout_prob > 0:
            model.add(Dropout(args.dropout_prob))
        model.add(Dense(num_outputs))
        if not args.skip_init_bias:
            bias_value = (np.log(initial_mean_value) -
                          np.log(1 - initial_mean_value)).astype(K.floatx())
            model.layers[-1].b.set_value(bias_value)
        model.add(Activation('sigmoid'))
        model.emb_index = 0

    elif args.model_type == 'regp':
        logger.info('Building a REGRESSION model with POOLING')
        model = Sequential()
        model.add(Embedding(args.vocab_size, args.emb_dim, mask_zero=True))
        if args.cnn_dim > 0:
            model.add(Conv1DWithMasking(nb_filter=args.cnn_dim,
                                        filter_length=args.cnn_window_size,
                                        border_mode=cnn_border_mode,
                                        subsample_length=1))
        if args.rnn_dim > 0:
            model.add(RNN(args.rnn_dim, return_sequences=True,
                          dropout_W=dropout_W, dropout_U=dropout_U))
        if args.dropout_prob > 0:
            model.add(Dropout(args.dropout_prob))
        if args.aggregation == 'mot':
            model.add(MeanOverTime(mask_zero=True))
        elif args.aggregation.startswith('att'):
            model.add(Attention(op=args.aggregation, activation='tanh',
                                init_stdev=0.01))
        model.add(Dense(num_outputs))
        if not args.skip_init_bias:
            bias_value = (np.log(initial_mean_value) -
                          np.log(1 - initial_mean_value)).astype(K.floatx())
            model.layers[-1].b.set_value(bias_value)
        model.add(Activation('sigmoid'))
        model.emb_index = 0

    elif args.model_type == 'breg':
        logger.info('Building a BIDIRECTIONAL REGRESSION model')
        from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
        model = Sequential()
        sequence = Input(shape=(overal_maxlen, ), dtype='int32')
        output = Embedding(args.vocab_size, args.emb_dim, mask_zero=True)(sequence)
        if args.cnn_dim > 0:
            output = Conv1DWithMasking(nb_filter=args.cnn_dim,
                                       filter_length=args.cnn_window_size,
                                       border_mode=cnn_border_mode,
                                       subsample_length=1)(output)
        if args.rnn_dim > 0:
            forwards = RNN(args.rnn_dim, return_sequences=False,
                           dropout_W=dropout_W, dropout_U=dropout_U)(output)
            backwards = RNN(args.rnn_dim, return_sequences=False,
                            dropout_W=dropout_W, dropout_U=dropout_U,
                            go_backwards=True)(output)
        if args.dropout_prob > 0:
            forwards = Dropout(args.dropout_prob)(forwards)
            backwards = Dropout(args.dropout_prob)(backwards)
        merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
        densed = Dense(num_outputs)(merged)
        if not args.skip_init_bias:
            raise NotImplementedError
        score = Activation('sigmoid')(densed)
        model = Model(input=sequence, output=score)
        model.emb_index = 1

    elif args.model_type == 'bregp':
        logger.info('Building a BIDIRECTIONAL REGRESSION model with POOLING')
        from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
        model = Sequential()
        sequence = Input(shape=(overal_maxlen, ), dtype='int32')
        output = Embedding(args.vocab_size, args.emb_dim, mask_zero=True)(sequence)
        if args.cnn_dim > 0:
            output = Conv1DWithMasking(nb_filter=args.cnn_dim,
                                       filter_length=args.cnn_window_size,
                                       border_mode=cnn_border_mode,
                                       subsample_length=1)(output)
        if args.rnn_dim > 0:
            forwards = RNN(args.rnn_dim, return_sequences=True,
                           dropout_W=dropout_W, dropout_U=dropout_U)(output)
            backwards = RNN(args.rnn_dim, return_sequences=True,
                            dropout_W=dropout_W, dropout_U=dropout_U,
                            go_backwards=True)(output)
        if args.dropout_prob > 0:
            forwards = Dropout(args.dropout_prob)(forwards)
            backwards = Dropout(args.dropout_prob)(backwards)
        forwards_mean = MeanOverTime(mask_zero=True)(forwards)
        backwards_mean = MeanOverTime(mask_zero=True)(backwards)
        merged = merge([forwards_mean, backwards_mean], mode='concat', concat_axis=-1)
        densed = Dense(num_outputs)(merged)
        if not args.skip_init_bias:
            raise NotImplementedError
        score = Activation('sigmoid')(densed)
        model = Model(input=sequence, output=score)
        model.emb_index = 1

    logger.info('  Done')

    ###########################################################################
    ## Initialize embeddings if requested
    #
    if args.emb_path:
        from w2vEmbReader import W2VEmbReader as EmbReader
        logger.info('Initializing lookup table')
        emb_reader = EmbReader(args.emb_path, emb_dim=args.emb_dim)
        model.layers[model.emb_index].W.set_value(
            emb_reader.get_emb_matrix_given_vocab(
                vocab, model.layers[model.emb_index].W.get_value()))
        logger.info('  Done')

    return model
def create_model(self):
    global keras_trigger_model

    model_input_dict = dict()
    outputs_to_merge_1 = []

    embedding_layer = Embedding(self.word_embeddings.shape[0],
                                self.word_embeddings.shape[1],
                                weights=[self.word_embeddings],
                                trainable=self.hyper_params.train_embeddings,
                                name=u'embedding_layer')
    # TODO: why was there a dropout=0.3 in the above Embedding layer?

    window_size = 2 * self.hyper_params.neighbor_distance + 1

    if self.features.sentence_word_embedding:
        sentence_word_embedding_input = Input(
            shape=(self.hyper_params.max_sentence_length, ),
            dtype=u'int32',
            name=u'sentence_word_embedding')
        outputs_to_merge_1.append(embedding_layer(sentence_word_embedding_input))
        model_input_dict[
            self.features.c_sentence_word_embedding] = sentence_word_embedding_input

    # For each word, the position input gives the distance to the target word.
    # Embed each distance into a 'position_embedding_vector_length'-dimensional vector space.
    if self.features.trigger_word_position:
        trigger_word_position_input = Input(
            shape=(self.hyper_params.max_sentence_length, ),
            dtype=u'int32',
            name=u'sentence_word_position')
        outputs_to_merge_1.append(
            Embedding(2 * self.hyper_params.max_sentence_length,
                      self.hyper_params.position_embedding_vector_length)(
                          trigger_word_position_input))
        model_input_dict[
            self.features.c_trigger_word_position] = trigger_word_position_input

    # The sentence feature input is the result of merging word vectors and embeddings.
    if self.features.sentence_ner_type:
        sentence_ner_type_input = Input(
            shape=(self.hyper_params.max_sentence_length, ),
            dtype=u'int32',
            name=u'sentence_entity_type')
        ner_embedding = Embedding(
            self.number_of_entity_bio_types,
            self.hyper_params.entity_embedding_vector_length)(sentence_ner_type_input)
        outputs_to_merge_1.append(ner_embedding)
        model_input_dict[self.features.c_sentence_ner_type] = sentence_ner_type_input

    merged = concatenate(outputs_to_merge_1, axis=-1)

    # Note: border_mode='same' keeps the output the same width as the input.
    maxpools = []
    for filter_length in self.hyper_params.filter_lengths:
        conv = Convolution1D(self.hyper_params.number_of_feature_maps,
                             filter_length,
                             border_mode=u'same',
                             activation='relu')(merged)
        maxpools.append(GlobalMaxPooling1D()(conv))

    outputs_to_merge_2 = []
    outputs_to_merge_2.extend(maxpools)

    if self.features.trigger_window:
        trigger_window_input = Input(shape=(window_size, ),
                                     dtype=u'int32',
                                     name=u'trigger_window')
        lex_words = embedding_layer(trigger_window_input)
        lex_flattened = Flatten()(lex_words)
        outputs_to_merge_2.append(lex_flattened)
        model_input_dict[self.features.c_trigger_window] = trigger_window_input

    # previously: merge(maxpools + [lex_flattened], mode=u'concat')
    merged_all = concatenate(outputs_to_merge_2)

    # Dense MLP layer with dropout
    dropout = Dropout(self.hyper_params.dropout)(merged_all)
    out = Dense(self.num_output, activation=u'softmax')(dropout)

    model_inputs = [model_input_dict[k] for k in self.features.feature_strings]

    keras_trigger_model = Model(inputs=model_inputs, output=[out])
    keras_trigger_model.compile(optimizer=self.optimizer,
                                loss=u'categorical_crossentropy',
                                metrics=[])

    self.model = keras_trigger_model