    return auc


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Running on {device}.")

    goal = "classification"
    # goal = "regression"

    dataset_path = os.path.join('dataset_vad')
    target_df_path = os.path.join('targets.csv')
    filenames = os.listdir(dataset_path)

    splits_path = "splits"
    splits_name = "5_2021_02_25_23_05"
    _, test_files = load_data.load_train_test(filenames, splits_path, splits_name)

    dataset = AggregatorDataset(test_files, dataset_path, target_df_path, goal,
                                sample_duration=1)

    models_path = os.path.join('models', 'resnet50')
    model = models.ResNet50(pretrained=True, goal=goal)
    model_name = "2_2021_02_25_23_39"
    model.load_state_dict(torch.load(os.path.join(models_path, model_name)))
    print("Loaded model", model_name)

    ## Aggregate: full evaluation script
    evaluate(dataset, model, goal=goal, display=True)
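# Hedged sketch (assumption, not the original implementation): the shape an
# evaluate() like the one called above might take for the classification goal --
# run the model over a DataLoader built from `dataset` and report ROC AUC,
# matching the `return auc` tail seen above. Batch size, device handling and the
# sigmoid output head are assumptions; the name evaluate_sketch is hypothetical.
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score


def evaluate_sketch(dataset, model, device, batch_size=32):
    model.to(device).eval()
    scores, labels = [], []
    with torch.no_grad():
        for x, y in DataLoader(dataset, batch_size=batch_size):
            logits = model(x.to(device))
            scores.extend(torch.sigmoid(logits).squeeze(-1).cpu().tolist())
            labels.extend(y.tolist())
    auc = roc_auc_score(labels, scores)
    print(f"AUC: {auc:.4f}")
    return auc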
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from load_data import load_train_test


def fit_naivebayes(X_train, y_train, X_test, y_test, batch=10000):
    clf = MultinomialNB()
    # Train the classifier in mini-batches to keep memory usage bounded
    for i in range(0, X_train.shape[0], batch):
        clf.partial_fit(X_train[i:i + batch, :], y_train[i:i + batch], classes=[0, 1])
    yy_test = clf.predict(X_test)
    print('Accuracy: {}'.format(accuracy_score(y_test, yy_test)))
    return clf


if __name__ == "__main__":
    print("Loading data")
    train_X, test_X, train_y, test_y = load_train_test()

    print("Extracting features (TF-IDF)")
    # TF-IDF feature extraction
    # tfdf = TfidfVectorizer(analyzer="word", ngram_range=(1, 3))  # unigram-to-trigram features, needs more memory
    tfdf = TfidfVectorizer(analyzer="word")
    tfdf.fit(train_X)                       # fit on the training data
    X_train_dtm = tfdf.transform(train_X)   # transform the training data
    X_test_dtm = tfdf.transform(test_X)     # transform the test data
    train_X, test_X = None, None            # free memory

    print("Training naive Bayes")
    current_clf = fit_naivebayes(X_train_dtm, train_y, X_test_dtm, test_y)

    print("Saving artifacts")
    joblib.dump(dict(clf=current_clf, tfdf=tfdf),
                open("./cache/predict_param.pickle", "wb"))
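# Hedged usage sketch (not part of the original script): how the saved artifacts
# might be loaded for inference. The path and the two keys ('clf', 'tfdf') follow
# the joblib.dump call above; the function name load_and_predict is hypothetical.
import joblib


def load_and_predict(texts, path="./cache/predict_param.pickle"):
    artifacts = joblib.load(path)                 # dict with the fitted vectorizer and classifier
    features = artifacts["tfdf"].transform(texts)
    return artifacts["clf"].predict(features)

# Example:
# print(load_and_predict(["some raw input text"]))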
def train():
    # load data
    # vectors = [28782+1, 200]
    # vectors = numpy.load('vectors.npy')
    train_sent, train_label, train_length, test_sent, test_label, test_length = \
        load_data.load_train_test()

    # word embedding
    words_embedding = tf.random_uniform([vocab_size + 1, embedding_size], -1.0, 1.0,
                                        name="embedding")

    # input is a sentence
    train_data_node = tf.placeholder(tf.int32, shape=(None, max_sentence_length))
    train_length_node = tf.placeholder(tf.int32, shape=(None, ))
    train_labels_node = tf.placeholder(tf.int32, shape=(None, max_sentence_length))

    weights = tf.Variable(tf.random_uniform([num_hidden * 2, num_tag], -1.0, 1.0), name="w")
    biases = tf.Variable(tf.random_normal(shape=[num_tag], dtype=tf.float32), name="b")
    # CRF transition parameters
    transitions = tf.Variable(tf.random_uniform([num_tag, num_tag], -1.0, 1.0), name="trans")

    # inputs = [batch_size, max_sentence_length]
    # lengths = [batch_size, ]
    def blstm_crf(inputs):
        # sents = [batch_size, max_path_length, embedding_size]
        sents = tf.nn.embedding_lookup(words_embedding, inputs)
        x = tf.transpose(sents, [1, 0, 2])
        x = tf.reshape(x, [-1, embedding_size])
        # get a list of 'n_steps' tensors of shape (batch_size, embedding_size)
        x = tf.split(0, num_steps, x)

        # bi-LSTM
        fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, forget_bias=1.0, state_is_tuple=True)
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=0.5)
        bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden, forget_bias=1.0, state_is_tuple=True)
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=0.5)
        if rnn_layer > 1:
            fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)
            bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        # outputs = list of [batch_size, num_hidden*2]
        outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
            fw_cell, bw_cell, x, dtype=tf.float32)

        # linear projection
        # rnn_output = [batch_size, num_steps, num_hidden*2]
        rnn_output = tf.transpose(tf.pack(outputs), perm=[1, 0, 2])
        # output = [batch_size*num_steps, num_tag]
        output = tf.matmul(tf.reshape(rnn_output, [-1, num_hidden * 2]), weights) + biases
        # output = [batch_size, num_steps, num_tag]
        output = tf.reshape(output, [-1, num_steps, num_tag])
        return output

    # unary_scores = [batch_size, num_steps, num_tag]
    unary_scores = blstm_crf(train_data_node)

    # CRF log-likelihood
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        unary_scores, train_labels_node, train_length_node,
        transition_params=transitions)
    loss = tf.reduce_mean(-log_likelihood, name='cross_entropy_mean_loss')

    # train op
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # run the training
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(
            list(zip(train_sent, train_label, train_length)), BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        epoch = 1
        print("Epoch " + str(epoch) + ":")
        for batch in batches:
            batch_count += 1
            # training step
            x_batch, y_batch, length_batch = zip(*batch)
            feed_dict = {
                train_data_node: x_batch,
                train_labels_node: y_batch,
                train_length_node: length_batch
            }
            _, step, losses, tf_transition_params = sess.run(
                [train_op, global_step, loss, transition_params],
                feed_dict=feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}".format(time_str, step, losses))

            # test process
            if float((batch_count * BATCH_SIZE) / Train_size) > epoch:
                epoch += 1
                print("Epoch " + str(epoch) + ":")
            if batch_count % EVAL_FREQUENCY == 0:
                # get test scores
                feed_dict = {
                    train_data_node: test_sent,
                    train_labels_node: test_label,
                    train_length_node: test_length
                }
                step, losses, scores = sess.run(
                    [global_step, loss, unary_scores], feed_dict=feed_dict)
                correct_labels = 0
                total_labels = 0
                for i in range(Test_size):
                    # Remove padding from the scores and tag sequence.
                    current_score = scores[i][:test_length[i]]
                    current_label = test_label[i][:test_length[i]]
                    # Compute the highest scoring sequence.
                    viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
                        current_score, tf_transition_params)
                    # Evaluate word-level accuracy.
                    correct_labels += numpy.sum(
                        numpy.equal(viterbi_sequence, current_label))
                    total_labels += test_length[i]
                time_str = datetime.datetime.now().isoformat()
                acc = 100.0 * correct_labels / float(total_labels)
                print("\n")
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, losses, acc))
                print("\n")
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=10, char_nkerns=4,
                    batch_size=1, window_width=3, emb_size=50, char_emb_size=4,
                    hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1,
                    norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20,
                    max_relation_len=5, max_Q_len=30, train_neg_size=5, test_neg_size=5,
                    valid_neg_size=5, neg_all=5):
    # maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options

    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = [
        'annotated_fb_data_train_PNQ_' + str(train_neg_size) + 'nega_str&des.txt',
        'annotated_fb_data_valid_PNQ_' + str(valid_neg_size) + 'nega_str&des.txt',
        'annotated_fb_data_test_PNQ_' + str(test_neg_size) + 'nega_str&des.txt'
    ]
    question_files = [
        'annotated_fb_data_train_mention_remainQ.txt',
        'annotated_fb_data_valid_mention_remainQ.txt',
        'annotated_fb_data_test_mention_remainQ.txt'
    ]

    rng = numpy.random.RandomState(23455)
    datasets, vocab_size, char_size = load_train_test(
        triple_files, question_files, max_char_len, max_des_len,
        max_relation_len, max_Q_len, neg_all)
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data = datasets[0]
    valid_data = datasets[1]
    test_data = datasets[2]

    train_pos_entity_char = train_data[0]
    train_pos_entity_des = train_data[1]
    train_relations = train_data[2]
    train_entity_char_lengths = train_data[3]
    train_entity_des_lengths = train_data[4]
    train_relation_lengths = train_data[5]
    train_mention_char_ids = train_data[6]
    train_remainQ_word_ids = train_data[7]
    train_mention_char_lens = train_data[8]
    train_remainQ_word_len = train_data[9]

    valid_pos_entity_char = valid_data[0]
    valid_pos_entity_des = valid_data[1]
    valid_relations = valid_data[2]
    valid_entity_char_lengths = valid_data[3]
    valid_entity_des_lengths = valid_data[4]
    valid_relation_lengths = valid_data[5]
    valid_mention_char_ids = valid_data[6]
    valid_remainQ_word_ids = valid_data[7]
    valid_mention_char_lens = valid_data[8]
    valid_remainQ_word_len = valid_data[9]

    test_pos_entity_char = test_data[0]      # matrix, one row per example, head and tail entity interleaved: 40*2*51
    test_pos_entity_des = test_data[1]       # matrix, one row per example: 20*2*51
    test_relations = test_data[2]            # matrix, one row per example: 5*51
    test_entity_char_lengths = test_data[3]  # matrix, one row per example: 3*2*51 (three values per entity)
    test_entity_des_lengths = test_data[4]   # matrix, one row per example: 3*2*51 (three values per entity)
    test_relation_lengths = test_data[5]     # matrix, one row per example: 3*51
    test_mention_char_ids = test_data[6]     # matrix, one row per mention: 40
    test_remainQ_word_ids = test_data[7]     # matrix, one row per question: 30
    test_mention_char_lens = test_data[8]    # matrix, three values per mention: 3
    test_remainQ_word_len = test_data[9]     # matrix, three values per remaining question: 3

    expected_train_size = len(train_pos_entity_char)
    train_sizes = [len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations),
                   len(train_entity_char_lengths), len(train_entity_des_lengths),
                   len(train_relation_lengths), len(train_mention_char_ids),
                   len(train_remainQ_word_ids), len(train_mention_char_lens),
                   len(train_remainQ_word_len)]
    if sum(train_sizes) / len(train_sizes) != expected_train_size:
        print 'weird size:', train_sizes
        exit(0)

    expected_test_size = len(test_pos_entity_char)
    test_sizes = [len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations),
                  len(test_entity_char_lengths), len(test_entity_des_lengths),
                  len(test_relation_lengths), len(test_mention_char_ids),
                  len(test_remainQ_word_ids), len(test_mention_char_lens),
                  len(test_remainQ_word_len)]
    if sum(test_sizes) / len(test_sizes) != expected_test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches = expected_train_size / batch_size
    n_test_batches = expected_test_size / batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_pos_entity_char = pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des = pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations = pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths = pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids = pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens = pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix(train_remainQ_word_len)

    indices_test_pos_entity_char = pythonList_into_theanoIntMatrix(test_pos_entity_char)
    indices_test_pos_entity_des = pythonList_into_theanoIntMatrix(test_pos_entity_des)
    indices_test_relations = pythonList_into_theanoIntMatrix(test_relations)
    indices_test_entity_char_lengths = pythonList_into_theanoIntMatrix(test_entity_char_lengths)
    indices_test_entity_des_lengths = pythonList_into_theanoIntMatrix(test_entity_des_lengths)
    indices_test_relation_lengths = pythonList_into_theanoIntMatrix(test_relation_lengths)
    indices_test_mention_char_ids = pythonList_into_theanoIntMatrix(test_mention_char_ids)
    indices_test_remainQ_word_ids = pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
    indices_test_mention_char_lens = pythonList_into_theanoIntMatrix(test_mention_char_lens)
    indices_test_remainQ_word_len = pythonList_into_theanoIntMatrix(test_remainQ_word_len)

    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    # rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values, rootPath + 'word_emb.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size), dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.lscalar()
    ent_char_ids_M = T.lmatrix()
    ent_lens_M = T.lmatrix()
    men_char_ids = T.lvector()
    men_lens = T.lvector()
    rel_word_ids_M = T.lmatrix()
    rel_word_lens_M = T.lmatrix()
    desH_word_ids_M = T.lmatrix()
    desH_word_lens_M = T.lmatrix()
    desT_word_ids_M = T.lmatrix()
    desT_word_lens_M = T.lmatrix()
    q_word_ids = T.lvector()
    q_word_lens = T.lvector()

    # max_char_len, max_des_len, max_relation_len, max_Q_len
    # ent_men_ishape = (char_emb_size, max_char_len)
    # rel_ishape = (emb_size, max_relation_len)
    # des_ishape = (emb_size, max_des_len)
    # q_ishape = (emb_size, max_Q_len)
    filter_size = (emb_size, window_width)
    char_filter_size = (char_emb_size, window_width)
    # poolsize1 = (1, ishape[1] - filter_size[1] + 1)
    # length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    q_desT_conv_W, q_desT_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f,
                               desH_word_ids_f, desH_word_lens_f, desT_word_ids_f,
                               desT_word_lens_f):
        # rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids.flatten()].reshape(
            (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids.flatten()].reshape(
            (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        # entity vs. mention (character level)
        ent_char_conv = Conv_with_input_para(
            rng, input=ent_char_input,
            image_shape=(batch_size, 1, char_emb_size, max_char_len),
            filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(
            rng, input=men_char_input,
            image_shape=(batch_size, 1, char_emb_size, max_char_len),
            filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        # question vs. relation
        q_rel_conv = Conv_with_input_para(
            rng, input=q_word_input,
            image_shape=(batch_size, 1, emb_size, max_Q_len),
            filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(
            rng, input=rel_word_input,
            image_shape=(batch_size, 1, emb_size, max_relation_len),
            filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        # question vs. head-entity description
        q_desH_conv = Conv_with_input_para(
            rng, input=q_word_input,
            image_shape=(batch_size, 1, emb_size, max_Q_len),
            filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(
            rng, input=desH_word_input,
            image_shape=(batch_size, 1, emb_size, max_des_len),
            filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        # question vs. tail-entity description
        q_desT_conv = Conv_with_input_para(
            rng, input=q_word_input,
            image_shape=(batch_size, 1, emb_size, max_Q_len),
            filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        desT_conv = Conv_with_input_para(
            rng, input=desT_word_input,
            image_shape=(batch_size, 1, emb_size, max_des_len),
            filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        # ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
        # men_char_output=debug_print(men_char_conv.output, 'men_char.output')

        ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output,
                                    left_l=men_lens[0], right_l=men_lens[2])
        q_rel_pool = Max_Pooling(rng, input_l=q_rel_conv.output,
                                 left_l=q_word_lens[0], right_l=q_word_lens[2])
        rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_desH_pool = Max_Pooling(rng, input_l=q_desH_conv.output,
                                  left_l=q_word_lens[0], right_l=q_word_lens[2])
        desH_conv_pool = Max_Pooling(rng, input_l=desH_conv.output,
                                     left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        q_desT_pool = Max_Pooling(rng, input_l=q_desT_conv.output,
                                  left_l=q_word_lens[0], right_l=q_word_lens[2])
        desT_conv_pool = Max_Pooling(rng, input_l=desT_conv.output,
                                     left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])

        overall_simi = cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling) + \
            cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling) + \
            cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling) + \
            cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
        sequences=[ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M,
                   desH_word_ids_M, desH_word_lens_M, desT_word_ids_M, desT_word_lens_M])

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    # L2_reg = (layer3.W**2).sum()+(layer2.W**2).sum()+(layer1.W**2).sum()+(conv_W**2).sum()
    L2_reg = debug_print(
        (char_embeddings ** 2).sum() + (embeddings ** 2).sum() + (char_conv_W ** 2).sum() +
        (q_rel_conv_W ** 2).sum() + (q_desH_conv_W ** 2).sum() + (q_desT_conv_W ** 2).sum(),
        'L2_reg')
    cost = loss_simi + L2_weight * L2_reg
    # cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [loss_simi, simi_list],
        givens={
            ent_char_ids_M: indices_test_pos_entity_char[index].reshape(
                ((test_neg_size + 1) * 2, max_char_len))[::2],
            ent_lens_M: indices_test_entity_char_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[::2],
            men_char_ids: indices_test_mention_char_ids[index],
            men_lens: indices_test_mention_char_lens[index],
            rel_word_ids_M: indices_test_relations[index].reshape(
                (test_neg_size + 1, max_relation_len)),
            rel_word_lens_M: indices_test_relation_lengths[index].reshape(
                (test_neg_size + 1, 3)),
            desH_word_ids_M: indices_test_pos_entity_des[index].reshape(
                ((test_neg_size + 1) * 2, max_des_len))[::2],
            desH_word_lens_M: indices_test_entity_des_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[::2],
            desT_word_ids_M: indices_test_pos_entity_des[index].reshape(
                ((test_neg_size + 1) * 2, max_des_len))[1::2],
            desT_word_lens_M: indices_test_entity_des_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[1::2],
            q_word_ids: indices_test_remainQ_word_ids[index],
            q_word_lens: indices_test_remainQ_word_len[index]
        },
        on_unused_input='ignore')

    # params = layer3.params + layer2.params + layer1.params + [conv_W, conv_b]
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b,
              q_desH_conv_W, q_desH_conv_b, q_desT_conv_W, q_desT_conv_b]
    # params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))  # AdaGrad
        # updates.append((acc_i, acc))
        if param_i == embeddings:
            # AdaGrad step, then keep row 0 (the padding embedding) at zero
            updates.append((param_i, T.set_subtensor(
                (param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                theano.shared(numpy.zeros(emb_size)))))
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor(
                (param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                theano.shared(numpy.zeros(char_emb_size)))))
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10)))  # AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [loss_simi, cost],
        updates=updates,
        givens={
            ent_char_ids_M: indices_train_pos_entity_char[index].reshape(
                ((train_neg_size + 1) * 2, max_char_len))[::2],
            ent_lens_M: indices_train_entity_char_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[::2],
            men_char_ids: indices_train_mention_char_ids[index],
            men_lens: indices_train_mention_char_lens[index],
            rel_word_ids_M: indices_train_relations[index].reshape(
                (train_neg_size + 1, max_relation_len)),
            rel_word_lens_M: indices_train_relation_lengths[index].reshape(
                (train_neg_size + 1, 3)),
            desH_word_ids_M: indices_train_pos_entity_des[index].reshape(
                ((train_neg_size + 1) * 2, max_des_len))[::2],
            desH_word_lens_M: indices_train_entity_des_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[::2],
            desT_word_ids_M: indices_train_pos_entity_des[index].reshape(
                ((train_neg_size + 1) * 2, max_des_len))[1::2],
            desT_word_lens_M: indices_train_entity_des_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[1::2],
            q_word_ids: indices_train_remainQ_word_ids[index],
            q_word_lens: indices_train_remainQ_word_len[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000      # look at this many examples regardless
    patience_increase = 2           # wait this much longer when a new best is found
    improvement_threshold = 0.995   # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    best_test_accu = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        # for minibatch_index in xrange(n_train_batches):  # each batch
        minibatch_index = 0
        for batch_start in train_batch_start:
            # iter is the number of batches run so far across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1
            # print batch_start
            loss_simi_i, cost_i = train_model(batch_start)
            if batch_start % 10 == 0:
                print batch_start, 'loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            # if iter == 1:
            #     exit(0)

            if iter % validation_frequency == 0:
                # write_file=open('log.txt', 'w')
                test_loss = []
                succ = 0
                for i in test_batch_start:
                    loss_simi_i, simi_list_i = test_model(i)
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1
                # expected_test_size is already a count, so divide by it directly
                succ = succ * 1.0 / expected_test_size
                # now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                       'model %f') % (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    store_model_to_file(rootPath, params)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        # print 'Batch_size: ', update_freq

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          ' with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def main():
    # Read input args
    args = parse_to_argdict()
    ROOT_DIR = args['root_dir']
    MODEL_NAME = args['model_name']
    EPOCHS = args['epochs']
    SAVE_DEMO = args['save_demo_results']
    SAVE_PREDS = args['save_preds']

    TRAIN_PATH = ROOT_DIR + '/data/cell_imgs/'
    MASK_PATH = ROOT_DIR + '/data/mask_imgs/'
    TEST_PATH = ROOT_DIR + '/data/test_imgs/'
    OUTPUT_DIR = ROOT_DIR + '/outs'
    WEIGHTS = OUTPUT_DIR + '/' + MODEL_NAME + '_weights.h5'
    LOG_DIR = OUTPUT_DIR + "/logs"
    RESULTS_DIR = OUTPUT_DIR + '/results'

    # Load train/test data
    function_time_outs = ""
    start = time.time()
    X_train, Y_train, X_test, train_ids, test_ids, sizes_test = load_train_test(
        TRAIN_PATH, MASK_PATH, TEST_PATH, IMG_WIDTH, IMG_HEIGHT, IMG_CHANNELS)
    function_time_outs += "Load train test: %.3f sec\n" % (time.time() - start)
    train_size = int(X_train.shape[0] * 0.9)

    # Data augmentation
    train_generator, val_generator = create_image_mask_generator(X_train, Y_train, BATCH_SIZE, seed)

    # Build U-Net model
    start = time.time()
    model = build_unet(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
    function_time_outs += "Set up UNet: %.3f sec\n" % (time.time() - start)

    # Fit model
    start = time.time()
    earlystopper = EarlyStopping(patience=100, verbose=1)
    function_time_outs += "Set up EarlyStopper: %.3f sec\n" % (time.time() - start)
    start = time.time()
    checkpointer = ModelCheckpoint(WEIGHTS, verbose=1, save_best_only=True)
    function_time_outs += "Set up Checkpointer: %.3f sec\n" % (time.time() - start)
    start = time.time()
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1,
                                                 profile_batch='500,520')
    function_time_outs += "Set up TensorBoard: %.3f sec\n" % (time.time() - start)
    start = time.time()
    model_results = model.fit_generator(train_generator,
                                        validation_data=val_generator,
                                        validation_steps=10,
                                        steps_per_epoch=200,
                                        epochs=EPOCHS,
                                        callbacks=[tensorboard, earlystopper, checkpointer])
    function_time_outs += "Model training: %.3f sec\n" % (time.time() - start)

    # Predict on train, val and test
    start = time.time()
    model = load_model(WEIGHTS)  # custom_objects={'mean_iou': mean_iou}
    function_time_outs += "Load pretrained model: %.3f sec\n" % (time.time() - start)
    start = time.time()
    preds_train = model.predict(X_train[:train_size], verbose=1)
    preds_val = model.predict(X_train[train_size:], verbose=1)
    preds_test = model.predict(X_test, verbose=1)

    # Threshold predictions
    preds_train_t = (preds_train > 0.5).astype(np.uint8)
    preds_val_t = (preds_val > 0.5).astype(np.uint8)
    preds_test_t = (preds_test > 0.5).astype(np.uint8)
    function_time_outs += "Predict on train & test: %.3f sec\n" % (time.time() - start)

    # Validation loss and accuracy
    start = time.time()
    val_results = model.evaluate(X_train[train_size:], Y_train[train_size:], batch_size=BATCH_SIZE)
    function_time_outs += "Validation eval: %.3f sec" % (time.time() - start)
    print("Validation Loss:", val_results[0])
    print("Validation Accuracy:", val_results[1] * 100, "%")

    # Save all predictions
    if SAVE_PREDS:
        start = time.time()
        for i, id_ in tqdm(enumerate(test_ids), total=len(test_ids)):
            test_mask = resize(np.squeeze(preds_test_t[i]),
                               (sizes_test[i][0], sizes_test[i][1]),
                               mode='constant', preserve_range=True)
            imsave(RESULTS_DIR + '/test%d_pred.png' % i, test_mask)
        function_time_outs += "Save predicted masks: %.3f sec\n" % (time.time() - start)

    # Save example prediction results
    if SAVE_DEMO:
        # Plot learning curve
        plot_learning(model_results, savepath=OUTPUT_DIR + "/learning_curve.png")
        i = 58
        show_images(i, i, X_train, Y_train, preds_train, preds_train_t,
                    savename=RESULTS_DIR + '/train%d_pred.png' % i)
        i = 20
        show_images(i, i, X_train[train_size:], Y_train[train_size:], preds_val, preds_val_t,
                    savename=RESULTS_DIR + '/val%d_pred.png' % i)
        i = 18
        show_images(i, i, X_test, None, preds_test, preds_test_t,
                    savename=RESULTS_DIR + '/test%d_pred.png' % i)

    return function_time_outs
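# Hedged sketch (assumption, not the authors' code): one common way to implement the
# create_image_mask_generator helper called in main() above -- two ImageDataGenerator
# instances with identical augmentation arguments and a shared seed, so each image and
# its mask receive the same random transform. The 90/10 split mirrors train_size above;
# the specific augmentation parameters are assumptions.
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def create_image_mask_generator(X, Y, batch_size, seed):
    split = int(X.shape[0] * 0.9)
    aug = dict(rotation_range=15, horizontal_flip=True, zoom_range=0.1)
    img_gen, mask_gen = ImageDataGenerator(**aug), ImageDataGenerator(**aug)
    # zip the image and mask iterators so each yielded pair is (x_batch, y_batch)
    train_gen = zip(img_gen.flow(X[:split], batch_size=batch_size, seed=seed),
                    mask_gen.flow(Y[:split], batch_size=batch_size, seed=seed))
    val_gen = zip(img_gen.flow(X[split:], batch_size=batch_size, seed=seed),
                  mask_gen.flow(Y[split:], batch_size=batch_size, seed=seed))
    return train_gen, val_gen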
def train():
    # load data
    # train_data = [train_size, 3]
    # test_data = [test_size, 3]
    print("loading data...")
    test_data, train_data = load_data.load_train_test()
    # train_data = test_data[:Train_size, :]
    test_data = test_data[Test_size:, :]

    train_data_node = tf.placeholder(tf.int32, shape=(None, 3))
    train_neg_node = tf.placeholder(tf.int32, shape=(None, 2 * num_sample, 3))
    test_scores_node = tf.placeholder(tf.float32, shape=(Test_size, entity_size))
    test_labels_node = tf.placeholder(tf.int32, shape=(Test_size, ))

    entity_embedding = tf.Variable(
        tf.random_uniform([entity_size, embedding_size], -1.0, 1.0),
        name="entity_embedding")
    relation_embedding = tf.Variable(
        tf.random_uniform([relation_size, embedding_size], -1.0, 1.0),
        name="relation_embedding")

    # inputs = [batch_size, 3]
    # neg_inputs = [batch_size, 2*num_sample, 3]
    def model(inputs, neg_inputs):
        # [batch_size]
        inputs_h = inputs[:, 0]
        inputs_t = inputs[:, 1]
        inputs_r = inputs[:, 2]
        # [batch_size, 2*num_sample]
        neg_inputs_h = neg_inputs[:, :, 0]
        neg_inputs_t = neg_inputs[:, :, 1]
        neg_inputs_r = neg_inputs[:, :, 2]
        # [batch_size, embedding_size]
        h_embed = tf.nn.embedding_lookup(entity_embedding, inputs_h)
        t_embed = tf.nn.embedding_lookup(entity_embedding, inputs_t)
        r_embed = tf.nn.embedding_lookup(relation_embedding, inputs_r)
        # [batch_size, 2*num_sample, embedding_size]
        h_neg = tf.nn.embedding_lookup(entity_embedding, neg_inputs_h)
        t_neg = tf.nn.embedding_lookup(entity_embedding, neg_inputs_t)
        r_neg = tf.nn.embedding_lookup(relation_embedding, neg_inputs_r)
        # positive score, [batch_size, 1]
        delta = tf.reduce_sum((h_embed + r_embed - t_embed) ** 2, 1, keep_dims=True)
        # negative score, [batch_size, 2*num_sample]; mirrors the positive score ||h + r - t||^2
        neg_delta = tf.reduce_sum((h_neg + r_neg - t_neg) ** 2, 2)
        # [batch_size, 1], averaged over the 2*num_sample negatives
        neg_delta = tf.reduce_mean(neg_delta, 1, keep_dims=True)
        return delta, neg_delta

    pos_one, neg_one = model(train_data_node, train_neg_node)
    margin = 0.0
    # loss = tf.reduce_mean(tf.maximum(pos_one + margin - neg_one, 0))
    loss = tf.reduce_mean(pos_one + margin - neg_one)

    # predict
    # test_inputs = [batch_size, 3]
    def get_embeddings(test_inputs):
        inputs_h = test_inputs[:, 0]
        inputs_t = test_inputs[:, 1]
        # labels = [batch_size]
        inputs_r = test_inputs[:, 2]
        # [batch_size, embedding_size]
        h_embed = tf.nn.embedding_lookup(entity_embedding, inputs_h)
        t_embed = tf.nn.embedding_lookup(entity_embedding, inputs_t)
        r_embed = tf.nn.embedding_lookup(relation_embedding, inputs_r)
        return h_embed, t_embed, r_embed

    def evaluation(scores, labels):
        # get top k
        # scores = [Test_size, entity_size]
        # labels = [Test_size]
        # h_result = [Test_size]
        h_result = tf.nn.in_top_k(scores, labels, k=10)
        # accuracy
        h_acc = tf.reduce_mean(tf.cast(h_result, tf.float32))
        return h_acc

    h_embed, t_embed, r_embed = get_embeddings(train_data_node)
    acc = evaluation(test_scores_node, test_labels_node)

    # train
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # run the training
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(train_data)), BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        epoch = 1
        print("Epoch " + str(epoch) + ":")
        for batch in batches:
            batch_count += 1
            # training step
            x_batch = numpy.squeeze(batch)
            # generate negative samples
            neg_x_batch = load_data.generate_neg_data(x_batch, num_sample=num_sample)
            feed_dict = {train_data_node: x_batch, train_neg_node: neg_x_batch}
            _, step, losses = sess.run([train_op, global_step, loss], feed_dict=feed_dict)
            time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print("{}: step {}, loss {:g}".format(time_str, step, losses))

            # test process
            if float((batch_count * BATCH_SIZE) / Train_size) > epoch:
                epoch += 1
                print("Epoch " + str(epoch) + ":")
            if batch_count % EVAL_FREQUENCY == 0:
                # get test scores
                feed_dict = {train_data_node: test_data}
                # get embeddings
                print("get embedding...")
                h_embedding, t_embedding, r_embedding, entity_embed = sess.run(
                    [h_embed, t_embed, r_embed, entity_embedding], feed_dict=feed_dict)
                # compute score
                t_start = time.time()
                h_acc, t_acc, h_mean_rank, t_mean_rank = evaluate.compute_acc(
                    h_embedding, t_embedding, r_embedding, entity_embed)
                t_end = time.time()
                t = t_end - t_start
                print("computing acc..., cost: %s" % t)
                hit_acc = (h_acc + t_acc) / 2.0
                mean_rank = int((h_mean_rank + t_mean_rank) / 2)
                time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print("{}: step {}, h-acc {:g}, t-acc {:g}, Hit@10 {:g}, h_rank {}, t_rank {}, mean_rank {}"
                      .format(time_str, step, h_acc, t_acc, hit_acc, h_mean_rank, t_mean_rank, mean_rank))
                print("\n")
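# Hedged sketch (assumption): a minimal version of the load_data.generate_neg_data
# helper called above -- for every (h, t, r) triple it draws num_sample corrupted-head
# and num_sample corrupted-tail triples, giving shape [batch_size, 2*num_sample, 3].
# The original presumably reads entity_size from module scope; here it is a parameter
# so the sketch stays self-contained.
import numpy


def generate_neg_data(batch, num_sample, entity_size):
    neg = numpy.repeat(batch[:, numpy.newaxis, :], 2 * num_sample, axis=1)
    random_entities = numpy.random.randint(entity_size, size=(batch.shape[0], 2 * num_sample))
    neg[:, :num_sample, 0] = random_entities[:, :num_sample]   # corrupt heads
    neg[:, num_sample:, 1] = random_entities[:, num_sample:]   # corrupt tails
    return neg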
def main(args):
    N_ITER = args.n_iter
    K_CV = args.k_cv

    print('loading data')
    label_name = 'label'
    train, test = load_train_test(f=args.infile, label_name=label_name)
    # train = train.head(1000).copy()
    print('loaded', train.shape[0], test.shape[0])
    print('label rate', train[label_name].mean(), test[label_name].mean())

    # Read in feature set to use
    with open('models/in_vars.p', 'rb') as f:
        in_vars = pickle.load(f)
    print('Using', len(in_vars), 'vars')

    if args.model_type == 'rf':
        print('Fitting a RandomForestClassifier')
        rf = RandomForestClassifier(oob_score=True, bootstrap=True, random_state=42)
        # Look at parameters used by our current forest
        print('Starting parameters currently in use:\n')
        pprint(rf.get_params())

        # Number of trees in the random forest
        n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in a tree
        max_depth = [int(x) for x in np.linspace(5, 100, num=20)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]

        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }
        pprint(random_grid)

        # Use the random grid to search for the best hyperparameters: random search
        # over n_iter combinations with k-fold cross validation, using all available cores
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=random_grid,
                                       scoring='roc_auc',
                                       n_iter=N_ITER,
                                       cv=K_CV,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        # Fit the random search model
        rf_random.fit(train[in_vars], train['label'])

        # Save model and args
        with open('models/rf_random_search.p', 'wb') as f:
            pickle.dump(rf_random, f, pickle.HIGHEST_PROTOCOL)
        with open('models/rf_args.p', 'wb') as f:
            pickle.dump(args, f, pickle.HIGHEST_PROTOCOL)

    if args.model_type == 'gb':
        print('Fitting a GradientBoostingClassifier')
        gb = GradientBoostingClassifier(verbose=1, subsample=0.9, random_state=42,
                                        n_iter_no_change=5)
        print('Parameters currently in use:\n')
        pprint(gb.get_params())

        max_features = ['auto', 'sqrt']
        learning_rate = np.linspace(0.01, 0.2, num=10)
        max_depth = [int(x) for x in np.linspace(5, 100, num=20)]
        max_depth.append(None)
        min_samples_leaf = [1, 2, 4]
        min_samples_split = [2, 5, 10]
        n_estimators = [int(x) for x in np.linspace(start=10, stop=1000, num=10)]
        subsample = [0.5, 0.8, 1.0]
        loss = ['deviance', 'exponential']

        random_grid = {
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'min_samples_split': min_samples_split,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'learning_rate': learning_rate,
            'loss': loss
        }
        pprint(random_grid)

        gb_random = RandomizedSearchCV(estimator=gb,
                                       param_distributions=random_grid,
                                       scoring='roc_auc',
                                       n_iter=N_ITER,
                                       cv=K_CV,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        # Fit the random search model
        gb_random.fit(train[in_vars], train['label'])

        with open('models/gb_random_search.p', 'wb') as f:
            pickle.dump(gb_random, f, pickle.HIGHEST_PROTOCOL)
        with open('models/gb_args.p', 'wb') as f:
            pickle.dump(args, f, pickle.HIGHEST_PROTOCOL)
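# Hedged follow-up sketch (not part of the original script): how one of the saved
# search objects might be evaluated on the held-out test set. The pickle path follows
# the script above and the metric mirrors the scoring='roc_auc' used during the search;
# `test`, `in_vars` and `label_name` are the objects built in main(), so this helper is
# meant to run in that context (the name report_test_auc is hypothetical).
def report_test_auc(test, in_vars, label_name, path='models/rf_random_search.p'):
    from sklearn.metrics import roc_auc_score
    with open(path, 'rb') as f:
        search = pickle.load(f)
    print('Best CV AUC:', search.best_score_)
    print('Best params:', search.best_params_)
    probs = search.best_estimator_.predict_proba(test[in_vars])[:, 1]
    print('Test AUC:', roc_auc_score(test[label_name], probs))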