def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=10, test_batch_size=200, emb_size=300, hidden_size=100, L2_weight=0.0001, para_len_limit=300, q_len_limit=30, max_EM=40.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) # glove_vocab=set(word2vec.keys()) train_para_list, train_Q_list, train_start_list,train_end_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train_AI2(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_start_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test_AI2(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in overall_word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') # labels = T.imatrix('labels') #(batch, para_len) start_indices= T.ivector() #batch end_indices = T.ivector() #batch para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') 
extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size=paragraph.shape[0] norm_extraF=normalize_matrix(extraF) fwd_para=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_para=create_LSTM_para(rng, emb_size, hidden_size) paragraph_para=fwd_para.values()+ bwd_para.values() fwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) paragraph_para_e1=fwd_e1.values()+ bwd_e1.values() fwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e11=fwd_e11.values()+ bwd_e11.values() fwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e2=fwd_e2.values()+ bwd_e2.values() # U_e2, W_e2, b_e2=create_GRU_para(rng, hidden_size, hidden_size) # U_e2_b, W_e2_b, b_e2_b=create_GRU_para(rng, hidden_size, hidden_size) # paragraph_para_e2=[U_e2, W_e2, b_e2, U_e2_b, W_e2_b, b_e2_b] # fwd_Q=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) # bwd_Q=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_Q.values()+ bwd_Q.values() # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a1 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a2 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a3 = create_ensemble_para(rng, 1, 6*hidden_size) # 3 extra features # LR_b = theano.shared(value=numpy.zeros((2,), # dtype=theano.config.floatX), # @UndefinedVariable # name='LR_b', borrow=True) HL_paras=[U_a1, 
U_a2, U_a3] params = [embeddings]+paragraph_para+paragraph_para_e1+paragraph_para_e11+HL_paras+paragraph_para_e2 # load_model_from_file(rootPath+'Best_Paras_AI2_31.210974456', params) paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) #self, X, Mask, hidden_dim, fwd_tparams, bwd_tparams paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_para, bwd_tparams= bwd_para) para_reps=paragraph_model.output_tensor #(batch, 2*hidden, para_len) Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams= bwd_para) questions_reps_tensor=questions_model.output_tensor #(batch, 2*hidden ,q_len) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden) # questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #(batch, para_len, hidden) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), 
dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) norm_U_a3=normalize_matrix(U_a3) def example_in_batch(para_matrix, q_matrix): #assume both are (2*hidden, len) repeat_para_matrix_T=T.repeat(para_matrix.T, q_matrix.shape[1], axis=0) #(para_len*q_len, 2*hidden) repeat_q_matrix_3D = T.repeat(q_matrix.T.dimshuffle('x',0,1), para_matrix.shape[1], axis=0) #(para_len, q_len, 2*hidden) repeat_q_matrix_T= repeat_q_matrix_3D.reshape((repeat_q_matrix_3D.shape[0]*repeat_q_matrix_3D.shape[1], repeat_q_matrix_3D.shape[2])) #(para_len*q_len, 2*hidden) ele_mult =repeat_para_matrix_T*repeat_q_matrix_T #(#(para_len*q_len, 2*hidden)) overall_concv = T.concatenate([repeat_para_matrix_T, repeat_q_matrix_T, ele_mult], axis=1) ##(para_len*q_len, 6*hidden) scores=T.dot(overall_concv, norm_U_a3) #(para_len*q_len,1) interaction_matrix=scores.reshape((para_matrix.shape[1], q_matrix.shape[1])) #(para_len, q_len) # transpose_para_matrix=para_matrix.T # interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) # norm_interaction_matrix=T.maximum(0.0, interaction_matrix) q_by_para = T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(2*hidden, para_len) para_by_q = T.repeat(T.dot(para_matrix, T.nnet.softmax(T.max(interaction_matrix, axis=1).dimshuffle('x',0)).T), para_matrix.shape[1], axis=1) return (q_by_para, para_by_q) inter_return, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) batch_q_reps=inter_return[0] #(batch, 2*hidden, para_len) batch_para_reps=inter_return[1] #(batch, 2*hidden , para_len) #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), 
all are in (batch, hidden , para_len) ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps,para_reps*batch_q_reps, para_reps*batch_para_reps], axis=1) #(batch, 4*2*hidden, para_len) questions_reps.dimshuffle(0,2,1) para_ensemble_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e1, bwd_tparams= bwd_e1) para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, 2*hidden ,para_len) para_ensemble_model1=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e11, bwd_tparams= bwd_e11) para_reps_tensor4score1=para_ensemble_model1.output_tensor #(batch, 2*hidden ,para_len) Con_G_M=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score1], axis=1) #(batch, 10*hidden, para_len) #score for each para word norm_U_a=normalize_matrix(U_a1) start_scores=T.dot(Con_G_M.dimshuffle(0,2,1), norm_U_a) #(batch, para_len, 1) start_scores=T.nnet.softmax(start_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) # para_reps_tensor4score = T.concatenate([para_reps_tensor4score, start_scores.dimshuffle(0,'x',1)], axis=1) para_ensemble_model2=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score1, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e2, bwd_tparams= bwd_e2) para_reps_tensor4score2=para_ensemble_model2.output_tensor #(batch, 2*hidden ,para_len) Con_G_M2=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score2], axis=1) #(batch, 10*hidden, para_len) norm_U_a2=normalize_matrix(U_a2) end_scores=T.dot(Con_G_M2.dimshuffle(0,2,1), norm_U_a2) #(batch, para_len, 1) end_scores=T.nnet.softmax(end_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) #loss train loss=-T.mean(T.log(start_scores[T.arange(true_batch_size), start_indices])+T.log(end_scores[T.arange(true_batch_size), end_indices])) #test 
co_simi_batch_matrix=T.batched_dot((para_mask*start_scores).dimshuffle(0,1,'x'), (para_mask*end_scores).dimshuffle(0,'x',1)) #(batch, para_len, para_len) #reset lower dialgonal cols = numpy.concatenate([numpy.array(range(i), dtype=numpy.uint) for i in xrange(para_len_limit)]) rows = numpy.concatenate([numpy.array([i]*i, dtype=numpy.uint) for i in xrange(para_len_limit)]) c = T.set_subtensor(co_simi_batch_matrix[:,rows, cols], theano.shared(numpy.zeros(para_len_limit*(para_len_limit-1)/2))) #reset longer than 7 size cols2 = numpy.concatenate([numpy.array(range(i+7,para_len_limit), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) rows2 = numpy.concatenate([numpy.array([i]*(para_len_limit-7-i), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) c2 = T.set_subtensor(c[:,rows2, cols2], theano.shared(numpy.zeros((para_len_limit-7)*(para_len_limit-6)/2))) test_return=T.argmax(c2.reshape((true_batch_size, para_len_limit*para_len_limit)), axis=1) #batch #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,start_indices, end_indices,para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], test_return, 
on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_start_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_end_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX)) #print iter 
if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() # writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') # writefile.write('{') pred_dict={} # exact_match=0.0 # F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size] # para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size] q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] # print 'q_ids_batch:', q_ids_batch # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] # para_len=len(test_para_wordlist_list[0]) # if para_len!=len(distribution_matrix[0]): # print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) # exit(0) # q_size=len(distribution_matrix) q_amount+=test_batch_size # print q_size # print test_para_word_list # Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size] for q in range(test_batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in 
range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=decode_predict_id_AI2(batch_predict_ids[q], para_len_limit, test_para_wordlist_list[q]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans # writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ') # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) # q_gold_ans_set=para_gold_ansset_list[q] # # print test_para_wordlist_list[q] # # print Q_list_inword[q] # # print pred_ans.encode('utf8'), q_gold_ans_set # if pred_ans in q_gold_ans_set: # exact_match+=1 # F1=MacroF1(pred_ans, q_gold_ans_set) # F1_match+=F1 with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') # F1_acc=F1_match/q_amount # exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_AI2_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc # os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt') if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for 
file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=4, margin=0.5, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) 
test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.ivector() #batch, one gold word for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() # true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] true_p_len = paragraph.shape[1] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) char_LSTM_para_dict=create_LSTM_para(rng, char_emb_size, char_emb_size) char_LSTM_para_dict_bw=create_LSTM_para(rng, char_emb_size, char_emb_size) char_lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_p.dimshuffle(0,2,1), char_p_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_p = char_lstm_layer_p.output_sent_rep_conc.reshape((true_batch_size, true_p_len, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) char_lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_q.dimshuffle(0,2,1), char_q_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_q = char_lstm_layer_q.output_sent_rep_conc.reshape((true_batch_size, q_len_limit, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) LSTM_para_dict=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) #40+300 LSTM_para_dict_bw=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) p_input2lstm = T.concatenate([common_input_p.dimshuffle(0,2,1), char_word_embeddings_p], axis=1) #(batch, emb_size+char_emb_size, p_len) q_input2lstm = T.concatenate([common_input_q.dimshuffle(0,2,1), char_word_embeddings_q], axis=1) 
#(batch, emb_size+char_emb_size, p_len) lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(p_input2lstm, para_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) p_tensor3 = lstm_layer_p.output_tensor_conc #(batch, 2*hidden, p_len) lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(q_input2lstm, q_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) q_reps = lstm_layer_q.output_sent_rep_conc #(batch, 2*hidden) NN_para=char_LSTM_para_dict.values()+char_LSTM_para_dict_bw.values()+LSTM_para_dict.values()+LSTM_para_dict_bw.values() input4score = T.concatenate([p_tensor3, T.repeat(q_reps.dimshuffle(0,1,'x'), true_p_len, axis=2)], axis=1) #(batch, 4*hidden, p_len) HL_1_para = create_ensemble_para(rng, hidden_size, 4*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, true_batch_size,true_p_len) span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(true_batch_size), gold_indices])) #ranking loss tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((true_batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(true_batch_size), gold_indices], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega 
= T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) loss = loss_neg_likelihood + loss_rank #test mask_test_return=T.argmax(span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask], mask_test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch]) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() # pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size]) # test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question predict_id = batch_predict_ids[q] ground_ids=test_label_batch[q] if predict_id in set(ground_ids): p1+=1 # print batch_predict_ids[q], mask_batch_predict_ids[q], test_p_len_limit - numpy.sum(test_para_mask[test_para_id+q]), scores_i[q], test_para_mask[test_para_id+q] exact_acc = p1*100.0/q_amount if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print '\t\tcurrent exact:', exact_acc, '\t\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=500, test_batch_size=1000, emb_size=300, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=1000, para_len=60, question_len=20, c_len=7, e_len=2): model_options = locals().copy() print "model options", model_options rootPath='/mounts/work/hs/yin/20161219/'; storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id={} word2id['UNK']=0 # use it to pad word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn.txt') word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev.txt') print 'word2id size for bigger dataset:', len(word2id) word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len,e_len, c_len, word2id, rootPath+'squadnewtrn,subset.000.txt') word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev,subset.000.txt') print 'word2id size for smaller dataset:', 
len(word2id) # if len(train_questions)!=train_size or len(test_questions)!=test_size: # print 'len(questions)!=train_size or len(test_questions)!=test_size:', len(train_questions),train_size,len(test_questions),test_size # exit(0) train_size=len(train_questions) test_size = len(test_questions) train_questions = np.asarray(train_questions, dtype='int32') # print train_questions[:10,:] # exit(0) train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX) train_paras = np.asarray(train_paras, dtype='int32') train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX) train_e_ids = np.asarray(train_e_ids, dtype='int32') train_e_masks = np.asarray(train_e_masks, dtype=theano.config.floatX) train_c_ids = np.asarray(train_c_ids, dtype='int32') train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX) train_c_heads = np.asarray(train_c_heads, dtype='int32') train_c_tails = np.asarray(train_c_tails, dtype='int32') train_l_heads = np.asarray(train_l_heads, dtype='int32') train_l_tails = np.asarray(train_l_tails, dtype='int32') train_e_heads = np.asarray(train_e_heads, dtype='int32') train_e_tails = np.asarray(train_e_tails, dtype='int32') train_labels = np.asarray(train_labels, dtype='int32') train_labels_3c = np.asarray(train_labels_3c, dtype='int32') test_questions = np.asarray(test_questions, dtype='int32') test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX) test_paras = np.asarray(test_paras, dtype='int32') test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX) test_e_ids = np.asarray(test_e_ids, dtype='int32') test_e_masks = np.asarray(test_e_masks, dtype=theano.config.floatX) test_c_ids = np.asarray(test_c_ids, dtype='int32') test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX) test_c_heads = np.asarray(test_c_heads, dtype='int32') test_c_tails = np.asarray(test_c_tails, dtype='int32') test_l_heads = np.asarray(test_l_heads, dtype='int32') test_l_tails = 
np.asarray(test_l_tails, dtype='int32') test_e_heads = np.asarray(test_e_heads, dtype='int32') test_e_tails = np.asarray(test_e_tails, dtype='int32') test_labels = np.asarray(test_labels, dtype='int32') overall_vocab_size=len(word2id) print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() para=T.imatrix() #(2*batch, len) para_mask=T.fmatrix() #(2*batch, len) c_ids=T.imatrix() #(2*batch, len) c_mask=T.fmatrix() #(2*batch, len) e_ids=T.imatrix() #(2*batch, len) e_mask=T.fmatrix() #(2*batch, len) c_heads=T.ivector() #batch c_tails=T.ivector() #batch l_heads=T.ivector() #batch l_tails=T.ivector() #batch e_heads=T.ivector() #batch e_tails=T.ivector() #batch q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) labels=T.ivector() #batch ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = para.shape[0] # U_p, W_p, b_p=create_GRU_para(rng, emb_size, hidden_size) # U_p_b, W_p_b, b_p_b=create_GRU_para(rng, emb_size, hidden_size) # GRU_p_para=[U_p, W_p, b_p, U_p_b, W_p_b, b_p_b] # # U_q, W_q, b_q=create_GRU_para(rng, emb_size, hidden_size) # U_q_b, W_q_b, b_q_b=create_GRU_para(rng, emb_size, hidden_size) # GRU_q_para=[U_q, W_q, b_q, U_q_b, W_q_b, b_q_b] paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen) # paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U_p,W=W_p,b=b_p,Ub=U_p_b,Wb=W_p_b,bb=b_p_b) # paragraph_reps_tensor3=paragraph_model.output_tensor_conc #(batch, 2*hidden, para_len) fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden) # q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U_q,W=W_q,b=b_q,Ub=U_q_b,Wb=W_q_b,bb=b_q_b) # q_reps=q_model.output_sent_rep_conc #(batch, 
2*hidden) #interaction batch_ids=T.arange(true_batch_size) c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden) c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden) candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden) l_heads_reps=paragraph_reps_tensor3[batch_ids,:,l_heads] #(batch, 2*hidden) l_tails_reps=paragraph_reps_tensor3[batch_ids,:,l_tails] #(batch, 2*hidden) longs_reps=T.concatenate([l_heads_reps, l_tails_reps], axis=1) #(batch, 4*hidden) e_heads_reps=paragraph_reps_tensor3[batch_ids,:,e_heads] #(batch, 2*hidden) e_tails_reps=paragraph_reps_tensor3[batch_ids,:,e_tails] #(batch, 2*hidden) extensions_reps=T.concatenate([e_heads_reps, e_tails_reps], axis=1) #(batch, 4*hidden) #glove level average c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) c_sum = T.sum(c_input*c_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_C_batch = c_sum/T.sqrt(T.sum(c_sum**2, axis=1)+1e-20).dimshuffle(0,'x') e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) e_sum = T.sum(e_input*e_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_E_batch = e_sum/T.sqrt(T.sum(e_sum**2, axis=1)+1e-20).dimshuffle(0,'x') # e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x') # def submatrix_average(matrix, head, tail): # return T.mean(matrix[:, head:tail+1], axis=1) #emb_size # def submatrix_average_q(matrix, head): # return T.mean(matrix[:, head:], axis=1) #emb_size # # average_E_batch, _ = theano.scan(fn=submatrix_average, # sequences=[paragraph_input,e_heads, e_tails]) #(batch, emb_size) # average_C_batch, _ = 
theano.scan(fn=submatrix_average, # sequences=[paragraph_input,c_heads, c_tails]) #(batch, emb_size) # # Q_valid_len=T.cast(T.sum(q_mask, axis=1), 'int32') # # average_Q_batch, _ = theano.scan(fn=submatrix_average_q, # sequences=[q_input,-Q_valid_len]) #(batch, emb_size) #classify HL_layer_subtask_input=T.concatenate([q_reps, extensions_reps, average_E_batch, average_Q_batch], axis=1) #(batch, 6*hidden+2*emb) HL_layer_subtask_size= 6*hidden_size+2*emb_size#HL_layer_1_input_size+2*HL_hidden_size HL_layer_subtask_1=HiddenLayer(rng, input=HL_layer_subtask_input, n_in=HL_layer_subtask_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_subtask_2=HiddenLayer(rng, input=HL_layer_subtask_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) U_subtask_a = create_ensemble_para(rng, 2, HL_hidden_size) # the weight matrix hidden_size*2 norm_U_subtask_a=normalize_matrix(U_subtask_a) LR_subtask_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_subtask_para=[U_subtask_a, LR_subtask_b] layer_LR_subtask=LogisticRegression(rng, input=HL_layer_subtask_2.output, n_in=HL_hidden_size, n_out=2, W=norm_U_subtask_a, b=LR_subtask_b) #basically it is a multiplication between weight matrix and input feature vector HL_layer_1_input_size=14*hidden_size+3*emb_size+1 #, average_E_batch, average_C_batch, average_Q_batch HL_layer_1_input = T.concatenate([q_reps, longs_reps, extensions_reps, candididates_reps, average_E_batch, average_C_batch, average_Q_batch, layer_LR_subtask.prop_for_posi.reshape((true_batch_size,1))], axis=1) #(batch, 14*hidden_size+3*emb_size+1) HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) LR_input=HL_layer_2.output #T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1) 
#(batch, 10*hidden) LR_input_size= HL_hidden_size#HL_layer_1_input_size+2*HL_hidden_size U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels)+layer_LR_subtask.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = LR_para+[embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params+LR_subtask_para+HL_layer_subtask_1.params+HL_layer_subtask_2.params # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+0.0005*T.mean(U_a**2) accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([para, para_mask,c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], cost, updates=updates,on_unused_input='ignore') train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore') test_model = theano.function([para, para_mask, 
c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], [layer_LR.errors(labels),layer_LR.y_pred], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_train_batches_pred=train_size/batch_size_pred #batch_size means how many pairs train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] n_test_batches=test_size/test_batch_size #batch_size means how many pairs test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_acc=0.0 cost_i=0.0 train_ids = range(train_size) train_ids_pred = range(train_size) best_test_statistic=defaultdict(int) # best_train_statistic=defaultdict(int) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) # print train_ids[:100] iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list = train_ids[para_id:para_id+batch_size] # print 'train_labels[train_id_list]:', train_labels[train_id_list] cost_i+= train_model( train_paras[train_id_list], train_paras_mask[train_id_list], train_c_ids[train_id_list], train_c_masks[train_id_list], train_e_ids[train_id_list], train_e_masks[train_id_list], train_c_heads[train_id_list], train_c_tails[train_id_list], train_l_heads[train_id_list], train_l_tails[train_id_list], 
train_e_heads[train_id_list], train_e_tails[train_id_list], train_questions[train_id_list], train_questions_mask[train_id_list], train_labels[train_id_list]) #print iter if iter%10==0: #iter>=200 and print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() # print 'Training Pred...' # train_statistic=defaultdict(int) # for para_id in train_batch_start_pred: # train_id_list = train_ids_pred[para_id:para_id+batch_size_pred] # gold_train_labels_list = train_labels_3c[train_id_list] # # print 'train_id_list:', train_id_list # # print 'train_c_heads[train_id_list]:', train_c_heads[train_id_list] # train_preds_i= train_model_pred( # train_paras[train_id_list], # train_paras_mask[train_id_list], # train_c_ids[train_id_list], # train_c_masks[train_id_list], # train_e_ids[train_id_list], # train_e_masks[train_id_list], # train_c_heads[train_id_list], # train_c_tails[train_id_list], # train_l_heads[train_id_list], # train_l_tails[train_id_list], # train_e_heads[train_id_list], # train_e_tails[train_id_list], # train_questions[train_id_list], # train_questions_mask[train_id_list], # train_labels[train_id_list]) # # for ind, gold_label in enumerate(gold_train_labels_list): # train_statistic[(gold_label, train_preds_i[ind])]+=1 # train_acc= (train_statistic.get((1,1),0)+train_statistic.get((0,0),0))*1.0/(train_statistic.get((1,1),0)+train_statistic.get((0,0),0)+train_statistic.get((1,0),0)+train_statistic.get((0,1),0)) # # print '\t\tcurrnt train acc:', train_acc, ' train_statistic:', train_statistic print 'Testing...' 
error=0 test_statistic=defaultdict(int) for test_para_id in test_batch_start: test_id_list = range(test_para_id, test_para_id+test_batch_size) # print 'test_id_list:',test_id_list # print 'test_c_heads[test_id_list]', test_c_heads[test_id_list] gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size] error_i, preds_i= test_model( test_paras[test_id_list], test_paras_mask[test_id_list], test_c_ids[test_id_list], test_c_masks[test_id_list], test_e_ids[test_id_list], test_e_masks[test_id_list], test_c_heads[test_id_list], test_c_tails[test_id_list], test_l_heads[test_id_list], test_l_tails[test_id_list], test_e_heads[test_id_list], test_e_tails[test_id_list], test_questions[test_id_list], test_questions_mask[test_id_list], test_labels[test_id_list]) error+=error_i for ind, gold_label in enumerate(gold_labels_list): test_statistic[(gold_label, preds_i[ind])]+=1 # acc=1.0-error*1.0/len(test_batch_start) acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0)) if acc> max_acc: max_acc=acc best_test_statistic=test_statistic store_model_to_file(storePath+'Best_Paras_HS_v2_000_subtask_'+str(max_acc), params) print 'Finished storing best params at:', max_acc print 'current average acc:', acc, '\t\tmax acc:', max_acc, '\ttest_statistic:', test_statistic print '\t\t\t\tbest statistic:', best_test_statistic if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.001, n_epochs=100, emb_size=300, batch_size=10, filter_size=[3, 5], maxSentLen=40, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM # U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size[0]) # NN_para=[U1, W1, b1] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_layer=GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) # sent_embeddings=gru_layer.output_sent_rep # (batch_size, hidden_size) LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size[0]) NN_para = LSTM_para_dict.values() # .values returns a list of parameters lstm_layer = LSTM_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], LSTM_para_dict) sent_embeddings = lstm_layer.output_sent_rep # (batch_size, hidden_size) LR_input = sent_embeddings #T.concatenate([sent_embeddings,sent_embeddings2], axis=1) LR_input_size = hidden_size[0] #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) # loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = [embeddings ] + NN_para + LR_para # put all model parameters together cost = loss #+Div_reg*diversify_reg#+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' binarize_prob = T.where(score_matrix > 0.5, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 cost_i = 0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], 
train_labels[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size]) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.005, n_epochs=2000, batch_size=300, test_batch_size=400, emb_size=50, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=400, trichar_len=15,char_emb_size=50, para_len=101, question_len=20, c_len=1, model_type='train'): model_options = locals().copy() print "model options", model_options rootPath='/mounts/Users/cisintern/hs/l/workhs/yin/20170320/'; storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id={} trichar2id={} word2id['UNK']=0 # use it to pad #word2id, trichar2id, questions,questions_mask,paras,paras_mask,labels, isInQ_para, paras_shape, questions_shape, types, types_shape,question_trichar_ids,question_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks word2id, trichar2id,train_questions,train_questions_mask,train_paras,train_paras_mask,train_labels, train_islabels, train_paras_shape, train_questions_shape, train_types, train_types_shape,train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks=load_SQUAD_hinrich_v4(train_size, para_len, question_len, trichar_len, word2id,trichar2id, rootPath+'trn20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev.big.20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, 
test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev20170320.txt') print 'word2id size for bigger dataset:', len(word2id), 'trichar size:', len(trichar2id) train_size=len(train_questions) test_size = len(test_questions) #50010# train_questions = np.asarray(train_questions, dtype='int32') train_questions_shape = np.asarray(train_questions_shape, dtype='int32') train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX) train_paras = np.asarray(train_paras, dtype='int32') train_paras_shape = np.asarray(train_paras_shape, dtype='int32') train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX) train_types = np.asarray(train_types, dtype='int32') train_types_shape = np.asarray(train_types_shape, dtype='int32') # train_c_ids = np.asarray(train_c_ids, dtype='int32') # train_c_ids_shape = np.asarray(train_c_ids_shape, dtype='int32') # train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX) train_islabels = np.asarray(train_islabels, dtype=theano.config.floatX) # train_c_heads = np.asarray(train_c_heads, dtype='int32') # train_c_tails = np.asarray(train_c_tails, dtype='int32') train_labels = np.asarray(train_labels, dtype='int32') #train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks train_question_trichar_ids = np.asarray(train_question_trichar_ids, dtype='int32') train_question_trichar_masks = np.asarray(train_question_trichar_masks, dtype=theano.config.floatX) train_para_trichar_ids = np.asarray(train_para_trichar_ids, dtype='int32') train_para_trichar_masks = np.asarray(train_para_trichar_masks, dtype=theano.config.floatX) train_type_trichar_ids = np.asarray(train_type_trichar_ids, dtype='int32') 
train_type_trichar_masks = np.asarray(train_type_trichar_masks, dtype=theano.config.floatX) test_questions = np.asarray(test_questions, dtype='int32') test_questions_shape = np.asarray(test_questions_shape, dtype='int32') test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX) test_paras = np.asarray(test_paras, dtype='int32') test_paras_shape = np.asarray(test_paras_shape, dtype='int32') test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX) test_types = np.asarray(test_types, dtype='int32') test_types_shape = np.asarray(test_types_shape, dtype='int32') # test_c_ids = np.asarray(test_c_ids, dtype='int32') # test_c_ids_shape = np.asarray(test_c_ids_shape, dtype='int32') # test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX) test_islabels = np.asarray(test_islabels, dtype=theano.config.floatX) # test_c_heads = np.asarray(test_c_heads, dtype='int32') # test_c_tails = np.asarray(test_c_tails, dtype='int32') test_labels = np.asarray(test_labels, dtype='int32') test_question_trichar_ids = np.asarray(test_question_trichar_ids, dtype='int32') test_question_trichar_masks = np.asarray(test_question_trichar_masks, dtype=theano.config.floatX) test_para_trichar_ids = np.asarray(test_para_trichar_ids, dtype='int32') test_para_trichar_masks = np.asarray(test_para_trichar_masks, dtype=theano.config.floatX) test_type_trichar_ids = np.asarray(test_type_trichar_ids, dtype='int32') test_type_trichar_masks = np.asarray(test_type_trichar_masks, dtype=theano.config.floatX) overall_vocab_size=len(word2id) print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size, emb_size), theano.config.floatX, rng) rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 
embeddings=theano.shared(value=rand_values, borrow=True) overall_trichar_size = len(trichar2id) char_rand_values=random_value_normal((overall_trichar_size, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) para=T.imatrix() #(2*batch, len) para_shape = T.imatrix() para_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_shape = T.imatrix() q_mask=T.fmatrix() #(2*batch, len_q) islabels = T.fmatrix() labels=T.ivector() #batch types=T.imatrix() types_shape=T.imatrix() q_trichar_ids = T.imatrix() q_trichar_masks =T.fmatrix() para_trichar_ids = T.imatrix() para_trichar_masks =T.fmatrix() type_trichar_ids = T.imatrix() type_trichar_masks =T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size = para.shape[0] paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types = embeddings[types.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_shape = embeddings[para_shape.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input_shape = embeddings[q_shape.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types_shape = embeddings[types_shape.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_trichar = char_embeddings[para_trichar_ids.flatten()].reshape((true_batch_size, para_len*trichar_len, char_emb_size)) #(batch, char_emb_size, para_len*trichar_len) q_input_trichar = char_embeddings[q_trichar_ids.flatten()].reshape((true_batch_size, question_len*trichar_len, char_emb_size)) # (batch, emb_size, question_len) 
q_types_trichar = char_embeddings[type_trichar_ids.flatten()].reshape((true_batch_size, 2*trichar_len, char_emb_size)) #sum up trichar emb as word level embs paragraph_input_trichar=T.sum((paragraph_input_trichar*para_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, para_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,para_len) q_input_trichar=T.sum((q_input_trichar*q_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, question_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,q_len) q_types_trichar=T.sum((q_types_trichar*type_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, 2, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,2) #concatenate word emb with shape emb q_input = T.concatenate([q_input,q_input_shape, q_input_trichar],axis=1) #(batch, 2*emb_size+char_emb_size, q_len) paragraph_input = T.concatenate([paragraph_input,paragraph_input_shape, paragraph_input_trichar,islabels.dimshuffle(0,'x',1)],axis=1)#(batch, 2*emb_size+char_emb_size+1, para_len) q_types_input = T.sum(T.concatenate([q_types,q_types_shape,q_types_trichar],axis=1), axis=2) #(batch, 2*emb+char_emb_size) fwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) bwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen) fwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) bwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of 
parameters questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden) #interaction batch_ids=T.arange(true_batch_size) # c_heads=theano.shared(value=np.asarray([(para_len-1)/2]*batch_size, dtype='int32'), borrow=True) c_heads = T.repeat(theano.shared(value=np.asarray([(para_len-1)/2], dtype='int32'), borrow=True), true_batch_size) c_tails=c_heads+1 c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden) c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden) candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden) context_l=paragraph_model.forward_output[batch_ids,:,c_heads-1] #(batch, hidden) context_r=paragraph_model.backward_output[batch_ids,:,c_tails+1]#(batch, hidden) #glove level average # c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input_shape = embeddings[c_ids_shape.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input = T.concatenate([c_input,c_input_shape],axis=1) c_sum = paragraph_input[:,:-1,(para_len-1)/2]#(batch, 2*emb_size+char_emb) c_sum_with_isInQLabel = paragraph_input[:,:,(para_len-1)/2] # e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, 2*emb_size+char_emb_size) # average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x') HL_layer_1_input_size=2*hidden_size+4*hidden_size+(2*emb_size+char_emb_size+1)+(2*emb_size+char_emb_size)+1+hidden_size+hidden_size+(2*emb_size+char_emb_size)+1 cosine_Qtype_cand = cosine_row_wise_twoMatrix(q_types_input, c_sum).dimshuffle(0,'x') #(batch, 1) #, average_E_batch, average_C_batch, average_Q_batch 
HL_layer_1_input = T.concatenate([q_reps, candididates_reps, c_sum_with_isInQLabel, q_sum, islabels[:,(para_len-1)/2:(para_len-1)/2+1], context_l, context_r, q_types_input, cosine_Qtype_cand], axis=1) HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) LR_input= T.concatenate([HL_layer_1.output, HL_layer_2.output, islabels[:,(para_len-1)/2:(para_len-1)/2+1], cosine_Qtype_cand], axis=1) #(batch, char_HL_hidden_size+HL_hidden_size) LR_input_size= HL_hidden_size+HL_hidden_size+1+1#HL_layer_1_input_size+2*HL_hidden_size U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='char_LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = LR_para+[embeddings,char_embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params # load_model_from_file(storePath+'Best_Paras_HS_20170316_0.760357142857', params) # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) # L2_reg = L2norm_paraList(params) cost=loss#+1e-6*L2_reg accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], cost, updates=updates,on_unused_input='ignore') # train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore') test_model = theano.function([para, para_shape, para_mask, q,q_shape, q_mask,islabels, labels, types, types_shape,q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], [layer_LR.errors(labels),layer_LR.prop_for_posi], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_train_batches_pred=train_size/batch_size_pred #batch_size means how many pairs # train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] n_test_batches=test_size/test_batch_size #batch_size means how many pairs n_test_remain=test_size%test_batch_size #batch_size means how many pairs test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_acc=0.0 cost_i=0.0 train_ids = range(train_size) # train_ids_pred = range(train_size) best_test_statistic=defaultdict(int) # best_train_statistic=defaultdict(int) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) # print train_ids[:100] iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list = train_ids[para_id:para_id+batch_size] # print 'train_labels[train_id_list]:', train_labels[train_id_list] if model_type=='train': #para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks cost_i+= train_model( train_paras[train_id_list], train_paras_shape[train_id_list], train_paras_mask[train_id_list], train_questions[train_id_list], train_questions_shape[train_id_list], train_questions_mask[train_id_list], train_islabels[train_id_list], train_labels[train_id_list], train_types[train_id_list], train_types_shape[train_id_list], train_question_trichar_ids[train_id_list], train_question_trichar_masks[train_id_list], 
train_para_trichar_ids[train_id_list], train_para_trichar_masks[train_id_list], train_type_trichar_ids[train_id_list], train_type_trichar_masks[train_id_list]) #print iter if iter%10 ==0: print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() print 'Testing...' error=0 test_statistic=defaultdict(int) if model_type=='test': writefile=open(storePath+'predictions_20170317.txt', 'w') for id, test_para_id in enumerate(test_batch_start): test_id_list = range(test_para_id, test_para_id+test_batch_size) # print 'test_id_list:',test_id_list # print 'test_c_heads[test_id_list]', test_c_heads[test_id_list] # gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size] error_i, preds_i= test_model( test_paras[test_id_list], test_paras_shape[test_id_list], test_paras_mask[test_id_list], test_questions[test_id_list], test_questions_shape[test_id_list], test_questions_mask[test_id_list], test_islabels[test_id_list], test_labels[test_id_list], test_types[test_id_list], test_types_shape[test_id_list], test_question_trichar_ids[test_id_list], test_question_trichar_masks[test_id_list], test_para_trichar_ids[test_id_list], test_para_trichar_masks[test_id_list], test_type_trichar_ids[test_id_list], test_type_trichar_masks[test_id_list]) if model_type=='test': if id < len(test_batch_start)-1: writefile.write('\n'.join(map(str,list(preds_i)))+'\n') else: writefile.write('\n'.join(map(str,list(preds_i)[-n_test_remain:]))+'\n') error+=error_i # for ind, gold_label in enumerate(gold_labels_list): # test_statistic[(gold_label, preds_i[ind])]+=1 if model_type=='test': writefile.close() acc=1.0-error*1.0/len(test_batch_start) # acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0)) if acc> max_acc: max_acc=acc # 
best_test_statistic=test_statistic if model_type=='train': store_model_to_file(storePath+'Best_Paras_HS_20170324_'+str(max_acc), params) print 'Finished storing best params at:', max_acc print 'current average acc:', acc, '\t\tmax acc:', max_acc#, '\ttest_statistic:', test_statistic # print '\t\t\t\tbest statistic:', best_test_statistic if model_type=='test': exit(0) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def _log_parts(*parts):
    """Print *parts* separated by single spaces, like a Python 2 print statement."""
    print(' '.join(str(part) for part in parts))


def _decoder_vocab_ids(question_ids, always_allowed_ids):
    """Return the sorted encoder-vocabulary ids that the decoder may emit.

    The result is the union of *always_allowed_ids* and every id that occurs
    in *question_ids*; an id's position in the list is its decoder-side id.
    """
    return sorted(always_allowed_ids | set(np.unique(question_ids)))


def _predicted_question_text(word_ids, id2word):
    """Render predicted word ids as text, each kept word prefixed by a space.

    Only alphanumeric words are kept, and a word identical to the previously
    kept word is dropped so immediate repetitions collapse into one.
    """
    text = ''
    previous_word = ''
    for word_id in word_ids:
        word = id2word.get(word_id)
        if word.isalnum() and word != previous_word:
            text += ' ' + word
            previous_word = word
    return text


def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=300,
                    test_batch_size=10000,
                    emb_size=50,
                    hidden_size=50,
                    L2_weight=0.0001,
                    para_len_limit=70,
                    q_len_limit=20,
                    pred_q_len_limit=50,
                    top_n_Qwords=1):
    """Train an attention-based LSTM question decoder and periodically decode the test set.

    A bidirectional LSTM encodes each paragraph; an attention decoder, given
    the answer span (start/end indices), is trained with AdaGrad to reproduce
    the ground-truth question. Every 100 training batches the test set is
    decoded into ``output.txt`` (predictions) and ``reference.txt`` (ground
    truth) in the current directory and scored by running
    ``perl multi-bleu.perl``.

    Args:
        learning_rate: AdaGrad step size.
        n_epochs: Maximum number of passes over the training data.
        batch_size: Number of training examples per update.
        test_batch_size: Number of test examples decoded per call.
        emb_size: Width of the word embeddings.
        hidden_size: Hidden size of the paragraph and attention LSTMs.
        L2_weight: Accepted for interface compatibility; not used here.
        para_len_limit: Paragraph length passed to the loader and used to
            reshape paragraph embeddings.
        q_len_limit: Question length passed to the loader and used to reshape
            question embeddings.
        pred_q_len_limit: Number of words generated per predicted question.
        top_n_Qwords: Passed through to ``load_QGQA``.

    Returns:
        None. Progress goes to stdout, the total run time to stderr.
    """
    model_options = locals().copy()
    print('model options %s' % (model_options,))
    rng = np.random.RandomState(23455)
    floatX = theano.config.floatX

    # ------------------------------------------------------------------ data
    word2id = {}
    (train_para_list, train_para_mask, train_Q_list, train_Q_mask,
     train_start_list, train_end_list, _, word2id) = load_QGQA(
         word2id, para_len_limit, q_len_limit, top_n_Qwords, True)
    train_size = len(train_para_list)
    if not (train_size == len(train_Q_list) == len(train_start_list)
            == len(train_para_mask)):
        print('train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)')
        # NOTE(review): exits with status 0 although this is an error path.
        exit(0)

    (test_para_list, test_para_mask, test_Q_list, test_Q_mask,
     test_start_list, test_end_list, _, word2id) = load_QGQA(
         word2id, para_len_limit, q_len_limit, top_n_Qwords, False)
    test_size = len(test_para_list)

    train_para_list = np.asarray(train_para_list, dtype='int32')
    train_para_mask = np.asarray(train_para_mask, dtype=floatX)
    train_Q_list = np.asarray(train_Q_list, dtype='int32')
    train_Q_mask = np.asarray(train_Q_mask, dtype=floatX)
    train_start_list = np.asarray(train_start_list, dtype='int32')
    train_end_list = np.asarray(train_end_list, dtype='int32')

    test_para_list = np.asarray(test_para_list, dtype='int32')
    test_para_mask = np.asarray(test_para_mask, dtype=floatX)
    test_Q_list = np.asarray(test_Q_list, dtype='int32')
    test_Q_mask = np.asarray(test_Q_mask, dtype=floatX)
    test_start_list = np.asarray(test_start_list, dtype='int32')
    test_end_list = np.asarray(test_end_list, dtype='int32')

    # One extra row: row 0 of the embedding matrix is zeroed below.
    vocab_size = len(word2id) + 1
    rand_values = random_value_normal((vocab_size, emb_size), floatX,
                                      np.random.RandomState(1234))
    rand_values[0] = np.zeros(emb_size, dtype=floatX)
    id2word = dict((word_id, word) for word, word_id in word2id.items())
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # Ids of question words (original and lower-cased spelling) that the
    # decoder is always allowed to produce, whatever the batch contains.
    wh_words = [
        'What', 'Which', 'Where', 'When', 'Who', 'Whom', 'Whose', 'Why', 'How',
        'far', 'many', 'much', 'long'
    ]
    train_top_Q_wordids = set()
    for word in wh_words:
        for spelling in (word, word.lower()):
            word_id = word2id.get(spelling)
            if word_id is not None:
                train_top_Q_wordids.add(word_id)

    # ----------------------------------------------------- symbolic inputs
    paragraph = T.imatrix('paragraph')
    questions_encoderIDs = T.imatrix()  # ground truth, encoder vocabulary ids
    questions_decoderIDS = T.imatrix()  # same questions, decoder vocabulary ids
    decoder_vocab = T.ivector()
    decoder_mask = T.fmatrix()  # (batch, decoder_vocab_size)
    start_indices = T.ivector()  # batch
    end_indices = T.ivector()  # batch
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    true_batch_size = paragraph.shape[0]

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, para_len_limit, emb_size)).dimshuffle(0, 2, 1)  # (batch, emb_size, para_len)
    q_input = embeddings[questions_encoderIDs.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size)).dimshuffle(0, 2, 1)
    decoder_vocab_embs = embeddings[decoder_vocab]

    fwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para = (list(fwd_LSTM_para_dict.values()) +
                      list(bwd_LSTM_para_dict.values()))
    paragraph_model = Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(
        paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict,
        bwd_LSTM_para_dict)
    paragraph_reps_tensor3 = paragraph_model.output_tensor  # (batch, 2*hidden, paralen)

    # The decoder input is emb_size + 12*hidden_size wide, i.e. a word
    # embedding plus six 2*hidden-wide vectors.
    # NOTE(review): presumably the attention decoder assembles those six
    # context vectors itself - confirm against its implementation.
    decoder_para_dict = create_LSTM_para(rng, emb_size + 12 * hidden_size,
                                         emb_size)
    attention_para_dict1 = create_LSTM_para(rng, 2 * hidden_size, hidden_size)
    attention_para_dict2 = create_LSTM_para(rng, 2 * hidden_size, hidden_size)

    # --- training graph ---
    # Teacher forcing: the decoder input at step t is the ground-truth word
    # of step t-1, with an all-zero vector at the first step.
    groundtruth_as_input = T.concatenate([
        T.alloc(np.asarray(0., dtype=floatX), true_batch_size, emb_size, 1),
        q_input[:, :, :-1]
    ], axis=2)
    decoder = LSTM_Decoder_Train_with_Attention(
        groundtruth_as_input, paragraph_reps_tensor3, para_mask, start_indices,
        end_indices, decoder_vocab_embs, q_mask, emb_size, hidden_size,
        decoder_para_dict, attention_para_dict1, attention_para_dict2)
    prob_matrix = decoder.prob_matrix  # (batch*senlen, decoder_vocab_size)
    step_ids = T.arange(true_batch_size * q_len_limit)
    valid_steps = (q_mask.flatten()).nonzero()
    mask_probs = prob_matrix[step_ids, questions_decoderIDS.flatten()][valid_steps]

    # Shift the question ids by one step so that, at each step, the
    # probability of the previously emitted word is pushed down.
    shifted_question_ids = T.concatenate([
        T.alloc(np.asarray(0, dtype='int32'), true_batch_size, 1),
        questions_decoderIDS[:, :-1]
    ], axis=1)
    mask_probs_to_minimize = prob_matrix[
        step_ids, shifted_question_ids.flatten()][valid_steps]

    loss = -T.mean(T.log(mask_probs)) + T.mean(T.exp(mask_probs_to_minimize))
    cost = loss

    params = ([embeddings] + paragraph_para +
              list(decoder_para_dict.values()) +
              list(attention_para_dict1.values()) +
              list(attention_para_dict2.values()))

    # AdaGrad: one running sum of squared gradients per parameter.
    accumulator = [
        theano.shared(np.zeros_like(param.get_value(borrow=True), dtype=floatX),
                      borrow=True) for param in params
    ]
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  # 1e-8 avoids division by zero
        updates.append((acc_i, acc))

    # --- testing graph ---
    test_decoder = LSTM_Decoder_Test_with_Attention(
        pred_q_len_limit, paragraph_reps_tensor3, para_mask, start_indices,
        end_indices, decoder_vocab_embs, decoder_mask, emb_size, hidden_size,
        decoder_para_dict, attention_para_dict1, attention_para_dict2)
    predictions = test_decoder.output_id_matrix

    train_model = theano.function([
        paragraph, questions_encoderIDs, questions_decoderIDS, decoder_vocab,
        start_indices, end_indices, para_mask, q_mask
    ], cost, updates=updates, on_unused_input='ignore')

    test_model = theano.function([
        paragraph, decoder_vocab, decoder_mask, start_indices, end_indices,
        para_mask
    ], predictions, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    patience = 500000000000000  # effectively disables early stopping

    # NOTE(review): these three are never updated, so the closing summary
    # always reports the initial values.
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size // batch_size
    # The extra final batch covers the examples left over by the division.
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size // test_batch_size
    remain_test = test_size % test_batch_size
    test_batch_start = list(np.arange(n_test_batches) * test_batch_size)
    if remain_test:
        # Only add the tail batch when it is non-empty; an empty batch would
        # otherwise be sent through test_model.
        test_batch_start.append(test_size - remain_test)

    def decode_test_set():
        """Decode every test batch into output.txt / reference.txt and score with BLEU."""
        with codecs.open('output.txt', 'w', 'utf-8') as outputfile, \
                codecs.open('reference.txt', 'w', 'utf-8') as referencefile:
            for test_para_id in test_batch_start:
                rows = slice(test_para_id, test_para_id + test_batch_size)
                ground_truths = test_Q_list[rows]
                ground_mask = test_Q_mask[rows]
                true_test_batch_size = len(ground_truths)

                decoder_vocab_list = _decoder_vocab_ids(ground_truths,
                                                        train_top_Q_wordids)
                map_decoderid2encoderid = dict(enumerate(decoder_vocab_list))

                # Per example, mark which entries of the batch vocabulary
                # that example is allowed to generate.
                mask_rows = []
                for question in ground_truths:
                    allowed = train_top_Q_wordids | set(np.unique(question))
                    mask_rows.append([
                        1.0 if encoder_id in allowed else 0.0
                        for encoder_id in decoder_vocab_list
                    ])
                decoder_mask_batch = np.asarray(mask_rows, dtype=floatX)

                pred_id_in_batch = test_model(
                    test_para_list[rows],
                    np.asarray(decoder_vocab_list, dtype='int32'),
                    decoder_mask_batch, test_start_list[rows],
                    test_end_list[rows], test_para_mask[rows])
                back_pred_id_in_batch = [
                    map_decoderid2encoderid.get(decoder_id)
                    for decoder_id in pred_id_in_batch.flatten()
                ]
                for i in range(true_test_batch_size):
                    refined_preds, refined_g = refine_decoder_predictions(
                        back_pred_id_in_batch[i * pred_q_len_limit:(i + 1) *
                                              pred_q_len_limit],
                        ground_truths[i], ground_mask[i])
                    outputfile.write(
                        _predicted_question_text(refined_preds, id2word) +
                        ' ?\n')
                    referencefile.write(' '.join(
                        [id2word.get(word_id) for word_id in refined_g]) + '\n')
        system('perl multi-bleu.perl reference.txt < output.txt')

    cost_i = 0.0
    train_ids = list(range(train_size))
    while epoch < n_epochs and not done_looping:
        epoch += 1

        random.shuffle(train_ids)
        for batch_no, para_id in enumerate(train_batch_start):
            # Running batch counter; drives logging and evaluation cadence.
            iter_count = (epoch - 1) * n_train_batches + batch_no + 1

            # Select examples through the shuffled ids so the per-epoch
            # shuffle actually changes batch composition (slicing the arrays
            # directly left the shuffle without effect).
            train_id_batch = train_ids[para_id:para_id + batch_size]
            sub_Qs = train_Q_list[train_id_batch]

            decoder_vocab_list = _decoder_vocab_ids(sub_Qs, train_top_Q_wordids)
            map_encoderid2decoderid = dict(
                (encoder_id, decoder_id)
                for decoder_id, encoder_id in enumerate(decoder_vocab_list))
            Decoder_train_Q_list = np.asarray(
                [map_encoderid2decoderid[word_id] for word_id in sub_Qs.flatten()],
                dtype='int32').reshape(sub_Qs.shape)

            cost_i += train_model(
                train_para_list[train_id_batch], sub_Qs, Decoder_train_Q_list,
                np.asarray(decoder_vocab_list, dtype='int32'),
                train_start_list[train_id_batch],
                train_end_list[train_id_batch],
                train_para_mask[train_id_batch],
                train_Q_mask[train_id_batch])

            if iter_count % 100 == 0:
                _log_parts(
                    'Epoch ', epoch, 'iter ' + str(iter_count) +
                    ' average cost: ' + str(cost_i / iter_count), 'uses ',
                    (time.time() - past_time) / 60.0, 'min')
                past_time = time.time()
                decode_test_set()

            if patience <= iter_count:
                done_looping = True
                break

        _log_parts('Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0,
                   'min')
        mid_time = time.time()

    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'): hidden_size=emb_size model_options = locals().copy() print "model options", model_options rng = np.random.RandomState(1234) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id =load_SNLI_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_sents_l=np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') test_sents_l=np.asarray(all_sentences_l[2], dtype='int32') train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r=np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32') train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store=np.asarray(all_labels[0], dtype='int32') dev_labels_store=np.asarray(all_labels[1], dtype='int32') test_labels_store=np.asarray(all_labels[2], dtype='int32') train_size=len(train_labels_store) dev_size=len(dev_labels_store) test_size=len(test_labels_store) vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 
embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)) #conv if nn=='CNN': conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size)) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) NN_para=[conv_W, conv_b] conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) conv_model_l = Conv_with_input_para(rng, input=conv_input_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10) masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) 
conv_model_r = Conv_with_input_para(rng, input=conv_input_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10) masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size #GRU if nn=='GRU': U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) NN_para=[U1, W1, b1] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias gru_input_l = common_input_l.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l, hidden_size, U1, W1, b1) sent_embeddings_l=gru_layer_l.output_sent_rep # (batch_size, hidden_size) gru_input_r = common_input_r.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r, hidden_size, U1, W1, b1) sent_embeddings_r=gru_layer_r.output_sent_rep # (batch_size, hidden_size) #LSTM if nn=='LSTM': LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) NN_para=LSTM_para_dict.values() # .values returns a list of parameters lstm_input_l = common_input_l.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l, hidden_size, LSTM_para_dict) 
sent_embeddings_l=lstm_layer_l.output_sent_rep # (batch_size, hidden_size) lstm_input_r = common_input_r.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r, hidden_size, LSTM_para_dict) sent_embeddings_r=lstm_layer_r.output_sent_rep # (batch_size, hidden_size) HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1) HL_layer_1_input_size = hidden_size*3+1 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh) #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative LR_input_size=HL_layer_1_input_size+2*hidden_size U_a = create_ensemble_para(rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1) layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params # put all model parameters together # L2_reg =L2norm_paraList([embeddings,conv_W, U_a]) # diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix) cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg grads = T.grad(cost, params) # create a list of gradients for all model parameters accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #1e-8 is add to get rid of zero division updates.append((acc_i, acc)) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_train_batches=train_size/batch_size train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_dev_batches=dev_size/batch_size dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches=test_size/batch_size test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_acc_dev=0.0 max_acc_test=0.0 while epoch < n_epochs: epoch = epoch + 1 train_indices = range(train_size) random.Random(200).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu=0 cost_i=0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_indices[batch_id:batch_id+batch_size] cost_i+= train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter%500==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0: # print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' # past_time = time.time() error_sum=0.0 for dev_batch_id in dev_batch_start: # for each test batch error_i=dev_model( dev_sents_l[dev_batch_id:dev_batch_id+batch_size], dev_masks_l[dev_batch_id:dev_batch_id+batch_size], dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 
dev_masks_r[dev_batch_id:dev_batch_id+batch_size], dev_labels_store[dev_batch_id:dev_batch_id+batch_size] ) error_sum+=error_i dev_accuracy=1.0-error_sum/(len(dev_batch_start)) if dev_accuracy > max_acc_dev: max_acc_dev=dev_accuracy print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev #best dev model, do test error_sum=0.0 for test_batch_id in test_batch_start: # for each test batch error_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_labels_store[test_batch_id:test_batch_id+batch_size] ) error_sum+=error_i test_accuracy=1.0-error_sum/(len(test_batch_start)) if test_accuracy > max_acc_test: max_acc_test=test_accuracy print '\t\tcurrent testbacc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test else: print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test