def __init__(self, batch_size, vocab_size, left_context, right_context, emb_size, k, unigram, l1_weight=0, l2_weight=0, nce_seed=2345): self.name = 'vLBL' self.batch_size = batch_size self.vocab_size = vocab_size self.left_context = left_context self.right_context = right_context self.context_size = self.left_context + self.right_context self.emb_size = emb_size self.k = k self.unigram = unigram self.p_n = debug_print(theano.shared(value=unigram, name='noise_probab'), 'noise') self.l1_weight = l1_weight self.l2_weight = l2_weight self.nce_seed = nce_seed # Create context and target embeddings rand_values = random_value_normal((self.vocab_size, self.emb_size), floatX, np.random.RandomState(1234)) self.R = theano.shared(value=rand_values, name='R') rand_values = random_value_normal((self.vocab_size, self.emb_size), floatX, np.random.RandomState(4321)) self.Q = theano.shared(value=rand_values, name='Q') b_values = zero_value((self.vocab_size,), dtype=floatX) self.bias = theano.shared(value=b_values, name='bias') # The learning rates are created the first time set_learning_rate is # called. self.lr = None
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=10, useAllSamples=0, ktop=4, filter_size=[7,5], L2_weight=0.00005, dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=60, sentEm_length=48, window=3, k=5, nce_seeds=2345, only_left_context=False, vali_cost_list_length=20, embedding_size=48, train_scheme=1): self.ini_learning_rate=learning_rate self.n_epochs=n_epochs self.nkerns=nkerns self.batch_size=batch_size self.useAllSamples=useAllSamples self.ktop=ktop self.filter_size=filter_size self.L2_weight=L2_weight self.dropout_p=dropout_p self.useEmb=useEmb self.task=task self.corpus=corpus self.dataMode=dataMode self.maxSentLength=maxSentLength self.kmax=self.maxSentLength/2+5 self.sentEm_length=sentEm_length self.window=window self.k=k self.only_left_context=only_left_context if self.only_left_context: self.context_size=self.window else: self.context_size=2*self.window self.nce_seed=nce_seeds self.embedding_size=0 self.train_scheme=train_scheme root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/" wiki_path="/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized" embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt' embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt' datasets, unigram, train_lengths, dev_lengths, word_count=load_model_for_training(wiki_path, root+str(self.task)+'classes/'+str(self.corpus)+'train.txt', root+str(self.task)+'classes/'+str(self.corpus)+'dev.txt',self.maxSentLength, self.dataMode, self.train_scheme) self.datasets=datasets self.embedding_size=embedding_size self.vocab_size=word_count rand_values=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(self.embedding_size)) self.embeddings_R=theano.shared(value=rand_values) rand_values=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(4321)) rand_values[0]=numpy.array(numpy.zeros(self.embedding_size)) self.embeddings_Q=theano.shared(value=rand_values) self.unigram=unigram self.p_n=theano.shared(value=self.unigram) self.train_lengths=train_lengths self.dev_lengths=dev_lengths b_values = zero_value((len(unigram),), dtype=theano.config.floatX) self.bias = theano.shared(value=b_values, name='bias') self.vali_cost_list_length=vali_cost_list_length
def new_load_glove(self, target_id2word): word2embeddings={} read_file=open('/mounts/data/proj/wenpeng/Dataset/embeddings-scaled.EMBEDDING_SIZE=200.txt') for line in read_file: tokens=line.strip().split() self.target_embedding_size=len(tokens)-1 embedding=[] for i in range(1, self.target_embedding_size+1): embedding.append(float(tokens[i])) word2embeddings[tokens[0]]=embedding words_number=len(target_id2word) print 'totally '+str(words_number)+' distinct target words' embedding_Q=random_value_normal((len(target_id2word), self.target_embedding_size), theano.config.floatX, numpy.random.RandomState(4321)) #for i in range(100): # embedding_Q[0][i]=0.6 unknown_words=0 for index in range(words_number): embed=word2embeddings.get(target_id2word[index], -1) embed_lowercase=word2embeddings.get(target_id2word[index].lower(), -1) if embed==-1 and embed_lowercase==-1: # a unknown word which has no embedding in glove embedding_Q[index]=numpy.array(numpy.random.rand(self.target_embedding_size)) unknown_words+=1 #print target_id2word[index] elif embed!=-1: embedding_Q[index]=numpy.array(embed) else: embedding_Q[index]=numpy.array(embed_lowercase) print 'Collobert embeddings loaded over, '+str(unknown_words)+' words find no embeddings.' #numpy.savetxt('matrix.txt', embedding_Q, delimiter=',') #exit(0) embedding_R=random_value_normal((self.trigram_size+1, self.context_embedding_size), theano.config.floatX, numpy.random.RandomState(1234)) ''' count=0 for word, embedding in word2embeddings.iteritems(): embedding_R[count]=numpy.array(embedding[:self.context_embedding_size]) count+=1 if count==(self.trigram_size+1): break ''' return embedding_R, embedding_Q
def __init__(self, batch_size, vocab_size, left_context, right_context, emb_size, k, unigram, l1_weight=0, l2_weight=0, nce_seed=2345): self.name = 'vLBL' self.batch_size = batch_size self.vocab_size = vocab_size self.left_context = left_context self.right_context = right_context self.context_size = self.left_context + self.right_context self.emb_size = emb_size self.k = k self.unigram = unigram self.p_n = debug_print( theano.shared(value=unigram, name='noise_probab'), 'noise') self.l1_weight = l1_weight self.l2_weight = l2_weight self.nce_seed = nce_seed # Create context and target embeddings rand_values = random_value_normal((self.vocab_size, self.emb_size), floatX, np.random.RandomState(1234)) self.R = theano.shared(value=rand_values, name='R') rand_values = random_value_normal((self.vocab_size, self.emb_size), floatX, np.random.RandomState(4321)) self.Q = theano.shared(value=rand_values, name='Q') b_values = zero_value((self.vocab_size, ), dtype=floatX) self.bias = theano.shared(value=b_values, name='bias') # The learning rates are created the first time set_learning_rate is # called. self.lr = None
def load_params(self, base_filename, params_str): super(VLblNceDistributional, self).load_params(base_filename, params_str) #if 'D' in self.__dict__: # self.D2 = theano.sparse.basic.dense_from_sparse(self.D) # do not re-initialize R if it is loaded - so here we check whether #dimensions of R are right if 'R' not in self.__dict__ \ or self.R.shape.eval()[0] != self.D.shape.eval()[1] : rand_values = random_value_normal((self.D.shape.eval()[1], \ self.emb_size), floatX, np.random.RandomState(999)) self.R = theano.shared(value=rand_values, name='R')
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[1,1], batch_size=1, window_width=3, maxSentLength=60, emb_size=300, L2_weight=0.0005, update_freq=1, unifiedWidth_conv0=8, k_dy=3, ktop=3): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' #mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() #mts=T.dmatrix() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST 
images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv0=ishape[1]+filter_size[1]-1 poolsize1=(1, length_after_wideConv0) length_after_wideConv1=unifiedWidth_conv0+filter_size[1]-1 poolsize2=(1, length_after_wideConv1) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_l, right=right_l, W=conv_W, b=conv_b, firstLayer=True) layer0_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_r, right=right_r, W=conv_W, b=conv_b, firstLayer=True) layer0_l_output=debug_print(layer0_ll.fold_output, 'layer0_l.output') layer0_r_output=debug_print(layer0_rr.fold_output, 'layer0_r.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=ishape[0]/2, left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) conv_W2, conv_b2=create_conv_para(rng, filter_shape=(1, 1, filter_size[0]/2, filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer1_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_ll.output, image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0), filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_ll.leftPad, right=layer0_ll.rightPad, W=conv_W2, b=conv_b2, firstLayer=False) layer1_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_rr.output, image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0), filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_rr.leftPad, right=layer0_rr.rightPad, W=conv_W2, b=conv_b2, firstLayer=False) layer1_l_output=debug_print(layer1_ll.fold_output, 'layer1_l.output') layer1_r_output=debug_print(layer1_rr.fold_output, 'layer1_r.output') layer2=Average_Pooling_for_Top(rng, input_l=layer1_l_output, input_r=layer1_r_output, kern=ishape[0]/4, left_l=layer0_ll.leftPad, right_l=layer0_ll.rightPad, left_r=layer0_rr.leftPad, right_r=layer0_rr.rightPad, length_l=k_dy+filter_size[1]-1, length_r=k_dy+filter_size[1]-1, dim=unifiedWidth_conv0+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) 
norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([#mts, eucli_1, uni_cosine, #norm_uni_l, norm_uni_r,#uni_cosine,#norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # layer1.output_eucli_to_simi,layer1.output_cosine, layer1.output_attentions, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # #layer1.output_vector_l,layer1.output_vector_r, layer2.output_eucli_to_simi,layer2.output_cosine, layer2.output_attentions, #layer2.output_vector_l,layer2.output_vector_r, len_l, len_r #layer1.output_attentions, #wmf, ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=(2)+(2+4*4)+(2+4*4)+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum()+(conv_W2**2).sum(), 'L2_reg')#+(layer1.W** 2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') test_model = theano.function([index], [layer3.errors(y), layer3.y_pred, layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index] #mts: mt_test[index: index + batch_size], #wmf: wm_test[index: index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W]+[conv_W2]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): #grad_i=debug_print(grad_i,'grad_i') #norm=T.sqrt((grad_i**2).sum()) #if T.lt(norm_threshold, norm): # print 'big norm' # grad_i=grad_i*(norm_threshold/norm) acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], [cost,layer3.errors(y), layer3_input], updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], 
x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index] #mts: mt_train[index: index + batch_size], #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index] #mts: mt_train[index: index + batch_size], #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'cost_ij: ', cost_ij cost_tmp+=cost_ij error_sum+=error_ij else: cost_average, error_ij, layer3_input= train_model(batch_start,cost_tmp) #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq) error_sum=0 cost_tmp=0.0#reset for the next batch #print layer3_input #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] for i in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc=1-test_score print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, 
minibatch_index, n_train_batches, (1-test_score) * 100.)) #now, see the results of svm #write_feature=open('feature_check.txt', 'w') train_y=[] train_features=[] for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=linear_model.LogisticRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_epoch=epoch if acc_lr> max_acc: max_acc=acc_lr best_epoch=epoch if test_acc> max_acc: max_acc=test_acc best_epoch=epoch print '\t\t\t\t\t\t\t\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at epoch: ', best_epoch #exit(0) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=500, test_batch_size=1000, emb_size=300, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=1000, para_len=60, question_len=20, c_len=7, e_len=2): model_options = locals().copy() print "model options", model_options rootPath='/mounts/work/hs/yin/20161219/'; storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id={} word2id['UNK']=0 # use it to pad word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn.txt') word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev.txt') print 'word2id size for bigger dataset:', len(word2id) word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len,e_len, c_len, word2id, rootPath+'squadnewtrn,subset.000.txt') word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev,subset.000.txt') print 'word2id size for smaller dataset:', len(word2id) # if len(train_questions)!=train_size or len(test_questions)!=test_size: # print 'len(questions)!=train_size or len(test_questions)!=test_size:', len(train_questions),train_size,len(test_questions),test_size # exit(0) train_size=len(train_questions) test_size = len(test_questions) train_questions = np.asarray(train_questions, dtype='int32') # print train_questions[:10,:] # exit(0) train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX) train_paras = np.asarray(train_paras, dtype='int32') train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX) train_e_ids = np.asarray(train_e_ids, dtype='int32') train_e_masks = np.asarray(train_e_masks, dtype=theano.config.floatX) train_c_ids = np.asarray(train_c_ids, dtype='int32') train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX) train_c_heads = np.asarray(train_c_heads, dtype='int32') train_c_tails = np.asarray(train_c_tails, dtype='int32') train_l_heads = np.asarray(train_l_heads, dtype='int32') train_l_tails = np.asarray(train_l_tails, dtype='int32') train_e_heads = np.asarray(train_e_heads, dtype='int32') train_e_tails = np.asarray(train_e_tails, dtype='int32') train_labels = np.asarray(train_labels, dtype='int32') train_labels_3c = np.asarray(train_labels_3c, dtype='int32') test_questions = np.asarray(test_questions, dtype='int32') test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX) test_paras = np.asarray(test_paras, dtype='int32') test_paras_mask = np.asarray(test_paras_mask, 
dtype=theano.config.floatX) test_e_ids = np.asarray(test_e_ids, dtype='int32') test_e_masks = np.asarray(test_e_masks, dtype=theano.config.floatX) test_c_ids = np.asarray(test_c_ids, dtype='int32') test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX) test_c_heads = np.asarray(test_c_heads, dtype='int32') test_c_tails = np.asarray(test_c_tails, dtype='int32') test_l_heads = np.asarray(test_l_heads, dtype='int32') test_l_tails = np.asarray(test_l_tails, dtype='int32') test_e_heads = np.asarray(test_e_heads, dtype='int32') test_e_tails = np.asarray(test_e_tails, dtype='int32') test_labels = np.asarray(test_labels, dtype='int32') overall_vocab_size=len(word2id) print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() para=T.imatrix() #(2*batch, len) para_mask=T.fmatrix() #(2*batch, len) c_ids=T.imatrix() #(2*batch, len) c_mask=T.fmatrix() #(2*batch, len) e_ids=T.imatrix() #(2*batch, len) e_mask=T.fmatrix() #(2*batch, len) c_heads=T.ivector() #batch c_tails=T.ivector() #batch l_heads=T.ivector() #batch l_tails=T.ivector() #batch e_heads=T.ivector() #batch e_tails=T.ivector() #batch q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) labels=T.ivector() #batch ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size = para.shape[0] # U_p, W_p, b_p=create_GRU_para(rng, emb_size, hidden_size) # U_p_b, W_p_b, b_p_b=create_GRU_para(rng, emb_size, hidden_size) # GRU_p_para=[U_p, W_p, b_p, U_p_b, W_p_b, b_p_b] # # U_q, W_q, b_q=create_GRU_para(rng, emb_size, hidden_size) # U_q_b, W_q_b, b_q_b=create_GRU_para(rng, emb_size, hidden_size) # GRU_q_para=[U_q, W_q, b_q, U_q_b, W_q_b, b_q_b] paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen) # paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U_p,W=W_p,b=b_p,Ub=U_p_b,Wb=W_p_b,bb=b_p_b) # paragraph_reps_tensor3=paragraph_model.output_tensor_conc #(batch, 2*hidden, para_len) fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) q_reps=questions_model.output_sent_rep_maxpooling #(batch, 
2*hidden) # q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U_q,W=W_q,b=b_q,Ub=U_q_b,Wb=W_q_b,bb=b_q_b) # q_reps=q_model.output_sent_rep_conc #(batch, 2*hidden) #interaction batch_ids=T.arange(true_batch_size) c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden) c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden) candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden) l_heads_reps=paragraph_reps_tensor3[batch_ids,:,l_heads] #(batch, 2*hidden) l_tails_reps=paragraph_reps_tensor3[batch_ids,:,l_tails] #(batch, 2*hidden) longs_reps=T.concatenate([l_heads_reps, l_tails_reps], axis=1) #(batch, 4*hidden) e_heads_reps=paragraph_reps_tensor3[batch_ids,:,e_heads] #(batch, 2*hidden) e_tails_reps=paragraph_reps_tensor3[batch_ids,:,e_tails] #(batch, 2*hidden) extensions_reps=T.concatenate([e_heads_reps, e_tails_reps], axis=1) #(batch, 4*hidden) #glove level average c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) c_sum = T.sum(c_input*c_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_C_batch = c_sum/T.sqrt(T.sum(c_sum**2, axis=1)+1e-20).dimshuffle(0,'x') e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) e_sum = T.sum(e_input*e_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_E_batch = e_sum/T.sqrt(T.sum(e_sum**2, axis=1)+1e-20).dimshuffle(0,'x') # e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size) average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x') # def submatrix_average(matrix, head, tail): # return T.mean(matrix[:, head:tail+1], axis=1) #emb_size # def submatrix_average_q(matrix, head): # return T.mean(matrix[:, head:], axis=1) #emb_size # # average_E_batch, _ = theano.scan(fn=submatrix_average, # sequences=[paragraph_input,e_heads, e_tails]) #(batch, emb_size) # average_C_batch, _ = theano.scan(fn=submatrix_average, # sequences=[paragraph_input,c_heads, c_tails]) #(batch, emb_size) # # Q_valid_len=T.cast(T.sum(q_mask, axis=1), 'int32') # # average_Q_batch, _ = theano.scan(fn=submatrix_average_q, # sequences=[q_input,-Q_valid_len]) #(batch, emb_size) #classify HL_layer_subtask_input=T.concatenate([q_reps, extensions_reps, average_E_batch, average_Q_batch], axis=1) #(batch, 6*hidden+2*emb) HL_layer_subtask_size= 6*hidden_size+2*emb_size#HL_layer_1_input_size+2*HL_hidden_size HL_layer_subtask_1=HiddenLayer(rng, input=HL_layer_subtask_input, n_in=HL_layer_subtask_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_subtask_2=HiddenLayer(rng, input=HL_layer_subtask_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) U_subtask_a = create_ensemble_para(rng, 2, HL_hidden_size) # the weight matrix hidden_size*2 norm_U_subtask_a=normalize_matrix(U_subtask_a) LR_subtask_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_subtask_para=[U_subtask_a, LR_subtask_b] layer_LR_subtask=LogisticRegression(rng, input=HL_layer_subtask_2.output, n_in=HL_hidden_size, n_out=2, W=norm_U_subtask_a, b=LR_subtask_b) #basically it is a multiplication between weight matrix and input feature vector HL_layer_1_input_size=14*hidden_size+3*emb_size+1 #, 
average_E_batch, average_C_batch, average_Q_batch HL_layer_1_input = T.concatenate([q_reps, longs_reps, extensions_reps, candididates_reps, average_E_batch, average_C_batch, average_Q_batch, layer_LR_subtask.prop_for_posi.reshape((true_batch_size,1))], axis=1) #(batch, 14*hidden_size+3*emb_size+1) HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) LR_input=HL_layer_2.output #T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1) #(batch, 10*hidden) LR_input_size= HL_hidden_size#HL_layer_1_input_size+2*HL_hidden_size U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels)+layer_LR_subtask.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = LR_para+[embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params+LR_subtask_para+HL_layer_subtask_1.params+HL_layer_subtask_2.params # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+0.0005*T.mean(U_a**2) accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([para, para_mask,c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], cost, updates=updates,on_unused_input='ignore') train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore') test_model = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], [layer_LR.errors(labels),layer_LR.y_pred], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_train_batches_pred=train_size/batch_size_pred #batch_size means how many pairs train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] n_test_batches=test_size/test_batch_size #batch_size means how many pairs test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_acc=0.0 cost_i=0.0 train_ids = range(train_size) train_ids_pred = range(train_size) best_test_statistic=defaultdict(int) # best_train_statistic=defaultdict(int) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) # print train_ids[:100] iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list = train_ids[para_id:para_id+batch_size] # print 'train_labels[train_id_list]:', train_labels[train_id_list] cost_i+= train_model( train_paras[train_id_list], train_paras_mask[train_id_list], train_c_ids[train_id_list], train_c_masks[train_id_list], train_e_ids[train_id_list], train_e_masks[train_id_list], train_c_heads[train_id_list], train_c_tails[train_id_list], train_l_heads[train_id_list], train_l_tails[train_id_list], train_e_heads[train_id_list], train_e_tails[train_id_list], train_questions[train_id_list], train_questions_mask[train_id_list], train_labels[train_id_list]) #print iter if iter%10==0: #iter>=200 and print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() # print 'Training Pred...' # train_statistic=defaultdict(int) # for para_id in train_batch_start_pred: # train_id_list = train_ids_pred[para_id:para_id+batch_size_pred] # gold_train_labels_list = train_labels_3c[train_id_list] # # print 'train_id_list:', train_id_list # # print 'train_c_heads[train_id_list]:', train_c_heads[train_id_list] # train_preds_i= train_model_pred( # train_paras[train_id_list], # train_paras_mask[train_id_list], # train_c_ids[train_id_list], # train_c_masks[train_id_list], # train_e_ids[train_id_list], # train_e_masks[train_id_list], # train_c_heads[train_id_list], # train_c_tails[train_id_list], # train_l_heads[train_id_list], # train_l_tails[train_id_list], # train_e_heads[train_id_list], # train_e_tails[train_id_list], # train_questions[train_id_list], # train_questions_mask[train_id_list], # train_labels[train_id_list]) # # for ind, gold_label in enumerate(gold_train_labels_list): # train_statistic[(gold_label, train_preds_i[ind])]+=1 # train_acc= (train_statistic.get((1,1),0)+train_statistic.get((0,0),0))*1.0/(train_statistic.get((1,1),0)+train_statistic.get((0,0),0)+train_statistic.get((1,0),0)+train_statistic.get((0,1),0)) # # print '\t\tcurrnt train acc:', train_acc, ' train_statistic:', train_statistic print 'Testing...' 
error=0 test_statistic=defaultdict(int) for test_para_id in test_batch_start: test_id_list = range(test_para_id, test_para_id+test_batch_size) # print 'test_id_list:',test_id_list # print 'test_c_heads[test_id_list]', test_c_heads[test_id_list] gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size] error_i, preds_i= test_model( test_paras[test_id_list], test_paras_mask[test_id_list], test_c_ids[test_id_list], test_c_masks[test_id_list], test_e_ids[test_id_list], test_e_masks[test_id_list], test_c_heads[test_id_list], test_c_tails[test_id_list], test_l_heads[test_id_list], test_l_tails[test_id_list], test_e_heads[test_id_list], test_e_tails[test_id_list], test_questions[test_id_list], test_questions_mask[test_id_list], test_labels[test_id_list]) error+=error_i for ind, gold_label in enumerate(gold_labels_list): test_statistic[(gold_label, preds_i[ind])]+=1 # acc=1.0-error*1.0/len(test_batch_start) acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0)) if acc> max_acc: max_acc=acc best_test_statistic=test_statistic store_model_to_file(storePath+'Best_Paras_HS_v2_000_subtask_'+str(max_acc), params) print 'Finished storing best params at:', max_acc print 'current average acc:', acc, '\t\tmax acc:', max_acc, '\ttest_statistic:', test_statistic print '\t\t\t\tbest statistic:', best_test_statistic if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=3, margin=2.0, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.imatrix() #batch, (start, end) for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) NN_para=[conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char] input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,batch_size, p_len_limit,q_len_limit, emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,test_batch_size, test_p_len_limit,q_len_limit, emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) #(batch, hidden, #(batch, 2*hidden, p_len_limit)) # gram_size = 5*true_p_len-(0+1+2+3+4) HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para(rng, 1, hidden_size) end_norm_U_a=normalize_matrix(end_U_a) end_norm_HL_1_para=normalize_matrix(end_HL_1_para) end_norm_HL_2_para=normalize_matrix(end_HL_2_para) end_norm_HL_3_para=normalize_matrix(end_HL_3_para) end_norm_HL_4_para=normalize_matrix(end_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, batch_size,true_p_len) 
span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para, end_norm_U_a, batch_size,true_p_len) end_span_scores=T.nnet.softmax(end_span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]])) end_loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,1]])) #ranking loss start tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) #ranking loss END end_tanh_span_scores_matrix = end_span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0) end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x',0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega)) loss = loss_neg_likelihood +end_loss_neg_likelihood+loss_rank+end_loss_rank #test test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para,norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1) #batch end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para,end_norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]+[end_U_a,end_HL_1_para,end_HL_2_para,end_HL_3_para,end_HL_4_para] L2_reg =L2norm_paraList([embeddings,char_embeddings,conv_W_1,conv_W_2,conv_W_1_q, conv_W_2_q, conv_W_char,U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]) #L2_reg = L2norm_paraList(params) cost=loss#+L2_weight*L2_reg accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, 
para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask, true_p_len], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len], [mask_test_return,end_mask_test_return], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids, batch_predict_end_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size], test_p_len_limit) test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] # test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] # q_amount+=test_batch_size q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question # pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q]) start = batch_predict_ids[q] end = batch_predict_end_ids[q] if end < start: start, end = end, start pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5, 5], char_filter_size=3, margin=2.0, max_EM=50.302743615): test_batch_size = batch_size * 10 model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) word2id = {} char2id = {} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id = load_squad_cnn_rank_word_train( word2id, char2id, p_len_limit, q_len_limit, char_len) train_size = len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_word_dev( word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size = len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng) char_embeddings = theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices = T.imatrix() #batch, (start, end) for each sample para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask = T.fmatrix() char_q_mask = T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] common_input_p = embeddings[paragraph.flatten()].reshape( (true_batch_size, true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q = embeddings[questions.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)) char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape( (true_batch_size * true_p_len, char_len, char_emb_size )) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q = char_embeddings[char_questions.flatten()].reshape( (true_batch_size * q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape( (true_batch_size * true_p_len, char_len)) char_q_masks = char_q_mask.reshape( (true_batch_size * q_len_limit, char_len)) conv_W_char, conv_b_char = create_conv_para( rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1 = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_1_q, conv_b_1_q = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) NN_para = [ conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char ] input4score = squad_cnn_rank_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask, char_p_masks, char_q_masks) #(batch, 4*hidden, p_len_limit) test_input4score = squad_cnn_rank_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask, char_p_masks, char_q_masks) #(batch, 4*hidden, p_len_limit) # gram_size = 5*true_p_len-(0+1+2+3+4) HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a = normalize_matrix(U_a) norm_HL_1_para = normalize_matrix(HL_1_para) norm_HL_2_para = normalize_matrix(HL_2_para) norm_HL_3_para = normalize_matrix(HL_3_para) norm_HL_4_para = normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para(rng, 1, hidden_size) end_norm_U_a = normalize_matrix(end_U_a) end_norm_HL_1_para = normalize_matrix(end_HL_1_para) end_norm_HL_2_para = normalize_matrix(end_HL_2_para) end_norm_HL_3_para = normalize_matrix(end_HL_3_para) end_norm_HL_4_para = normalize_matrix(end_HL_4_para) span_scores_matrix = 
add_HLs_2_tensor3(input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, batch_size, true_p_len) span_scores = T.nnet.softmax(span_scores_matrix) #(batch, para_len) end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, batch_size, true_p_len) end_span_scores = T.nnet.softmax( end_span_scores_matrix) #(batch, para_len) loss_neg_likelihood = -T.mean( T.log(span_scores[T.arange(batch_size), gold_indices[:, 0]])) end_loss_neg_likelihood = -T.mean( T.log(end_span_scores[T.arange(batch_size), gold_indices[:, 1]])) #ranking loss start tanh_span_scores_matrix = span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor( index_matrix[T.arange(batch_size), gold_indices[:, 0]], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega)) #ranking loss END end_tanh_span_scores_matrix = end_span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor( end_index_matrix[T.arange(batch_size), gold_indices[:, 1]], 1.0) end_prob_batch_posi = end_tanh_span_scores_matrix[ end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_span_scores_matrix[( 1.0 - end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle( 'x', 0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean( T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega)) loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank #test test_span_scores_matrix = add_HLs_2_tensor3( test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) mask_test_return = T.argmax(test_span_scores_matrix * para_mask, axis=1) #batch end_test_span_scores_matrix = add_HLs_2_tensor3( test_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) end_mask_test_return = T.argmax(end_test_span_scores_matrix * para_mask, axis=1) #batch params = ( [embeddings, char_embeddings] + NN_para + [U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] + [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para]) L2_reg = L2norm_paraList([ embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para ]) #L2_reg = L2norm_paraList(params) cost = loss + L2_weight * L2_reg accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + 
T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function( [ paragraph, questions, gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask, true_p_len ], cost, updates=updates, on_unused_input='ignore') test_model = theano.function([ paragraph, questions, para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], [mask_test_return, end_mask_test_return], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / test_batch_size # remain_test=test_size%batch_size test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.Random(4).shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_ids[para_id:para_id + batch_size] cost_i += train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict = {} q_amount = 0 p1 = 0 for test_para_id in test_batch_start: batch_predict_ids, batch_predict_end_ids = test_model( test_para_list[test_para_id:test_para_id + test_batch_size], test_Q_list[test_para_id:test_para_id + test_batch_size], test_para_mask[test_para_id:test_para_id + test_batch_size], test_Q_mask[test_para_id:test_para_id + test_batch_size], test_para_char_list[test_para_id:test_para_id + test_batch_size], test_Q_char_list[test_para_id:test_para_id + test_batch_size], test_para_char_mask[test_para_id:test_para_id + test_batch_size], test_Q_char_mask[test_para_id:test_para_id + test_batch_size], test_p_len_limit) test_para_wordlist_batch = test_para_wordlist_list[ test_para_id:test_para_id + test_batch_size] # test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] # q_amount+=test_batch_size q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size] q_amount += test_batch_size for q in range(test_batch_size): #for each question # pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q]) start = batch_predict_ids[q] end = batch_predict_end_ids[q] if end < start: start, end = end, start pred_ans = ' '.join( test_para_wordlist_batch[q][start:end + 1]) q_id = q_ids_batch[q] pred_dict[q_id] = pred_ans with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt') if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
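# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). The updates built in
# evaluate_lenet5 above are plain AdaGrad: each parameter keeps an accumulator
# of squared gradients that scales down its learning rate over time. A NumPy
# rendering of the same rule; the quadratic toy objective is hypothetical.
import numpy

def adagrad_step(param, grad, acc, lr=0.01, eps=1e-8):
    # acc += g^2 ; param -= lr * g / (sqrt(acc) + eps), as in the Theano loop
    acc += grad ** 2
    param -= lr * grad / (numpy.sqrt(acc) + eps)
    return param, acc

if __name__ == '__main__':
    w = numpy.array([5.0, -3.0])
    acc = numpy.zeros_like(w)
    for _ in range(200):
        g = 2.0 * w                  # gradient of the toy objective ||w||^2
        w, acc = adagrad_step(w, g, acc, lr=0.5)
    print(w)                         # w has shrunk most of the way toward [0, 0]
# ---------------------------------------------------------------------------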
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=500, char_nkerns=100, batch_size=1, window_width=3, emb_size=500, char_emb_size=100, hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=6, neg_all=100, train_size=75893, test_size=19168, mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0' ): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/' triple_files = [ 'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt' ] rng = numpy.random.RandomState(23455) word2id, char2id = load_word2id_char2id(mark) # datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_test_or_valid(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size)#max_char_len, max_des_len, max_relation_len, max_Q_len datasets_test, length_per_example_test, word2id, char2id = load_test_or_valid( triple_files[1], char2id, word2id, max_char_len, max_des_len, max_relation_len, max_Q_len, test_size) vocab_size = len(word2id) char_size = len(char2id) print 'vocab_size:', vocab_size, 'char_size:', char_size # train_data=datasets # valid_data=datasets[1] test_data = datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # # train_pos_entity_char=train_data[0] # train_pos_entity_des=train_data[1] # train_relations=train_data[2] # train_entity_char_lengths=train_data[3] # train_entity_des_lengths=train_data[4] # train_relation_lengths=train_data[5] # train_mention_char_ids=train_data[6] # train_remainQ_word_ids=train_data[7] # train_mention_char_lens=train_data[8] # train_remainQ_word_len=train_data[9] # train_entity_scores=train_data[10] test_pos_entity_char = test_data[0] # test_pos_entity_des=test_data[1] test_relations = test_data[2] test_entity_char_lengths = test_data[3] # test_entity_des_lengths=test_data[4] test_relation_lengths = test_data[5] test_mention_char_ids = test_data[6] test_remainQ_word_ids = test_data[7] test_mention_char_lens = test_data[8] test_remainQ_word_len = test_data[9] test_entity_scores = test_data[10] # # test_pos_entity_char=test_data[0] #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51 # test_pos_entity_des=test_data[1] #matrix, each row for a examle: 20*2*51 # test_relations=test_data[2] #matrix, each row for a example: 5*51 # test_entity_char_lengths=test_data[3] #matrix, each row for a example: 3*2*51 (three valies for one entity) # test_entity_des_lengths=test_data[4] #matrix, each row for a example: 3*2*51 (three values for one entity) # test_relation_lengths=test_data[5] #matrix, each row for a example: 3*51 # test_mention_char_ids=test_data[6] #matrix, each row for a mention: 40 # test_remainQ_word_ids=test_data[7] #matrix, each row for a question: 30 # test_mention_char_lens=test_data[8] #matrix, each three values for a mention: 3 # test_remainQ_word_len=test_data[9] #matrix, each three values for a remain question: 3 # train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), 
len(train_entity_char_lengths), len(train_entity_des_lengths),\ # len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)] # if sum(train_sizes)/len(train_sizes)!=train_size: # print 'weird size:', train_sizes # exit(0) test_sizes=[len(test_pos_entity_char), len(test_relations), len(test_entity_char_lengths),\ len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)] if sum(test_sizes) / len(test_sizes) != test_size: print 'weird size:', test_sizes exit(0) # n_train_batches=train_size/batch_size # n_test_batches=test_size/batch_size # train_batch_start=list(numpy.arange(n_train_batches)*batch_size) # test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char) # indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des) # indices_train_relations=pythonList_into_theanoIntMatrix(train_relations) # indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths) # indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths) # indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths) # indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids) # indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids) # indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens) # indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len) # indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) # indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char) # indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des) # indices_test_relations=pythonList_into_theanoIntMatrix(test_relations) # indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths) # indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths) # indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths) # indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids) # indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids) # indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens) # indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len) # indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores) rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt') embeddings = theano.shared(value=rand_values, borrow=True) char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX) char_embeddings = theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data index = T.iscalar() chosed_indices = T.ivector() 
ent_char_ids_M = T.imatrix() ent_lens_M = T.imatrix() men_char_ids_M = T.imatrix() men_lens_M = T.imatrix() rel_word_ids_M = T.imatrix() rel_word_lens_M = T.imatrix() #desH_word_ids_M=T.imatrix() #desH_word_lens_M=T.imatrix() q_word_ids_M = T.imatrix() q_word_lens_M = T.imatrix() ent_scores = T.fvector() filter_size = (emb_size, window_width) char_filter_size = (char_emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1]) word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1]) char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape) q_rel_conv_W, q_rel_conv_b = create_conv_para( rng, filter_shape=word_filter_shape) #q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) params = [ char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b ] #, q_desH_conv_W, q_desH_conv_b] load_model_from_file(rootPath, params, mark) def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f, men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape( (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape( (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH #q_desH_conv = Conv_with_input_para(rng, input=q_word_input, # image_shape=(batch_size, 1, emb_size, max_Q_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) #desH_conv = Conv_with_input_para(rng, input=desH_word_input, # image_shape=(batch_size, 1, emb_size, max_des_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output, 
left_l=men_lens_f[0], right_l=men_lens_f[2]) #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool = Average_Pooling_for_SimpleQA( rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1] + filter_size[1] - 1, dim=max_Q_len + filter_size[1] - 1, topk=2) #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)*0.33333+\ cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)*0.55 # 0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling) # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan(SimpleQ_matches_Triple, sequences=[ ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M, men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M ]) simi_list += 0.2 * ent_scores posi_simi = simi_list[0] nega_simies = simi_list[1:] loss_simi_list = T.maximum( 0.0, margin - posi_simi.reshape((1, 1)) + nega_simies) loss_simi = T.sum(loss_simi_list) test_model = theano.function([ ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores ], [loss_simi, simi_list], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... testing' start_time = time.clock() mid_time = start_time epoch = 0 test_loss = [] succ = 0 for i in range(test_size): #prepare data test_ent_char_ids_M = numpy.asarray(test_pos_entity_char[i], dtype='int32').reshape( (length_per_example_test[i], max_char_len)) test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int32').reshape( (length_per_example_test[i], max_char_len)) test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape( (length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape( (length_per_example_test[i], 3)) #test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int32').reshape((length_per_example_test[i], max_des_len)) #test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32').reshape( (length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX) loss_simi_i, simi_list_i = test_model( test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if len(simi_list_i) == 1 or simi_list_i[0] >= max(simi_list_i[1:]): 
succ += 1 if i % 1000 == 0: print 'testing', i, '...acc:', (succ * 1.0 / (i + 1)) * (19168 * 1.0 / 21687) succ = succ * 100.0 / 21687 #now, check MAP and MRR print 'accu:', succ # store_model_to_file(rootPath, params, succ, mark) print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
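# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). For each test question
# above, the model yields one similarity score per candidate triple (the gold
# candidate first), the entity-linker score is added with weight 0.2, and the
# example counts as correct when the gold candidate is ranked on top. The
# candidate scores below are made-up numbers.
import numpy

def is_hit(simi_list):
    # a single candidate, or the gold (first) candidate scoring at least as
    # high as every negative candidate
    return len(simi_list) == 1 or simi_list[0] >= max(simi_list[1:])

if __name__ == '__main__':
    model_simi = numpy.array([0.71, 0.65, 0.40])    # hypothetical model scores
    ent_scores = numpy.array([0.90, 0.80, 0.20])    # hypothetical linker scores
    combined = list(model_simi + 0.2 * ent_scores)
    print(is_hit(combined))                         # True
# ---------------------------------------------------------------------------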
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[50,50], batch_size=10, window_width=3, maxSentLength=64, emb_size=50, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33):# max_truncate can be 45 maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'; rng = numpy.random.RandomState(23455) # datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True) mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad = datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') ''' indices_train_l=T.cast(indices_train_l, 'int32') indices_train_r=T.cast(indices_train_r, 'int32') indices_test_l=T.cast(indices_test_l, 'int32') indices_test_r=T.cast(indices_test_r, 'int32') ''' rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) 
#rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt') # rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lvector() right_l=T.lvector() left_r=T.lvector() right_r=T.lvector() length_l=T.lvector() length_r=T.lvector() norm_length_l=T.dvector() norm_length_r=T.dvector() mts=T.dmatrix() extra=T.dmatrix() discri=T.dmatrix() cost_tmp=T.dscalar() # #GPU # index = T.iscalar() # x_index_l = T.imatrix('x_index_l') # now, x is the index matrix, must be integer # x_index_r = T.imatrix('x_index_r') # y = T.ivector('y') # left_l=T.iscalar() # right_l=T.iscalar() # left_r=T.iscalar() # right_r=T.iscalar() # length_l=T.iscalar() # length_r=T.iscalar() # norm_length_l=T.fscalar() # norm_length_r=T.fscalar() # #mts=T.dmatrix() # #wmf=T.dmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = debug_print(embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_l_input') layer0_r_input = debug_print(embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_r_input') #paras: U, W, b=create_GRU_para(rng, emb_size, nkerns[0]) layer0_para=[U, W, b] U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1]) layer1_para=[U1, W1, b1] def loop (l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i): l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor') r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor') addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1) addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1) cosine_addition=cosine(addition_l, addition_r) eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2% layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep) eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2% attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2) l_max_attention=T.max(attention_matrix, axis=1) neighborsArgSorted = T.argsort(l_max_attention) kNeighborsArg = neighborsArgSorted[:3]#only average the min 3 vectors ll = T.sort(kNeighborsArg).flatten() # make y indices in acending lie 
r_max_attention=T.max(attention_matrix, axis=0) neighborsArgSorted_r = T.argsort(r_max_attention) kNeighborsArg_r = neighborsArgSorted_r[:3]#only average the min 3 vectors rr = T.sort(kNeighborsArg_r).flatten() # make y indices in acending lie l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention') r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention') layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1) layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1) vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l') vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r') # sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) # aver_uni_l=sum_uni_l/layer0_l_input.shape[3] # norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) # sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) # aver_uni_r=sum_uni_r/layer0_r_input.shape[3] # norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) # uni_cosine=cosine(vec_l, vec_r) # aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) # uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') # ''' # linear=Linear(sum_uni_l, sum_uni_r) # poly=Poly(sum_uni_l, sum_uni_r) # sigmoid=Sigmoid(sum_uni_l, sum_uni_r) # rbf=RBF(sum_uni_l, sum_uni_r) # gesd=GESD(sum_uni_l, sum_uni_r) # ''' eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2% # #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) # len_l=norm_length_l_i.reshape((1,1)) len_r=norm_length_r_i.reshape((1,1)) # # ''' # len_l=length_l.reshape((1,1)) # len_r=length_r.reshape((1,1)) # ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts # layer3_input_nn=T.concatenate([vec_l, vec_r, # cosine_addition, eucli_addition, # # cosine_sent, eucli_sent, # uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1) output_i=T.concatenate([vec_l, vec_r, cosine_addition, eucli_addition, # cosine_sent, eucli_sent, uni_cosine,eucli_1, mts_i.reshape((1,14)), len_l, len_r, extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1) return output_i layer3_input, _ = theano.scan(fn=loop, sequences=[left_l, right_l, layer0_l_input, left_r, right_r, layer0_r_input, mts, extra, norm_length_l, norm_length_r], outputs_info=None,#[self.h0, None], n_steps=batch_size) #l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i # x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer # x_index_r = T.lmatrix('x_index_r') # y = T.lvector('y') # left_l=T.lvector() # right_l=T.lvector() # left_r=T.lvector() # right_r=T.lvector() # length_l=T.lvector() # length_r=T.lvector() # norm_length_l=T.dvector() # norm_length_r=T.dvector() # mts=T.dmatrix() # extra=T.dmatrix() # discri=T.dmatrix() # cost_tmp=T.dscalar() #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) feature_size=2*nkerns[1]+2+2+14+2+9 layer3_input=layer3_input.reshape((batch_size, feature_size)) layer3=LogisticRegression(rng, input=layer3_input, n_in=feature_size, n_out=3) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg 
=debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.errors(y),layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index: index + batch_size], right_l: testRightPad_l[index: index + batch_size], left_r: testLeftPad_r[index: index + batch_size], right_r: testRightPad_r[index: index + batch_size], length_l: testLengths_l[index: index + batch_size], length_r: testLengths_r[index: index + batch_size], norm_length_l: normalized_test_length_l[index: index + batch_size], norm_length_r: normalized_test_length_r[index: index + batch_size], mts: mt_test[index: index + batch_size], extra: extra_test[index: index + batch_size], discri:discri_test[index: index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # # # create a list of gradients for all model parameters # grads = T.grad(cost, params) # # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): updates = [] grads = T.grad(cost, params) i = theano.shared(numpy.float64(0.)) i_t = i + 1. fix1 = 1. - (1. - b1)**i_t fix2 = 1. - (1. - b2)**i_t lr_t = lr * (T.sqrt(fix2) / fix1) for p, g in zip(params, grads): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) g_t = m_t / (T.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) return updates updates=Adam(cost=cost, params=params, lr=learning_rate) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index: index + batch_size], right_l: trainRightPad_l[index: index + batch_size], left_r: trainLeftPad_r[index: index + batch_size], right_r: trainRightPad_r[index: index + batch_size], length_l: trainLengths_l[index: index + batch_size], length_r: trainLengths_r[index: index + batch_size], norm_length_l: normalized_train_length_l[index: index + batch_size], norm_length_r: normalized_train_length_r[index: index + batch_size], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) train_model_predict = theano.function([index, cost_tmp], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index: index + batch_size], right_l: trainRightPad_l[index: index + batch_size], left_r: trainLeftPad_r[index: index + batch_size], right_r: trainRightPad_r[index: index + batch_size], length_l: trainLengths_l[index: index + batch_size], length_r: trainLengths_r[index: index + batch_size], norm_length_l: normalized_train_length_l[index: index + batch_size], norm_length_r: normalized_train_length_r[index: index + batch_size], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time epoch = 0 done_looping = False acc_max=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 # shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop # if (batch_start+1)%1000==0: # print batch_start+1, 'uses ', (time.time()-mid_time)/60.0, 'min' iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] for i in test_batch_start: test_loss, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y) test_features.append(layer3_input) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_features=numpy.concatenate(test_features, axis=0) test_y=numpy.concatenate(test_y, axis=0) print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, (1-test_score) * 100.)) acc_nn=1-test_score #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') #this step is risky: if the training data is too big, then this step will make the training time twice longer train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0) train_y.append(y) train_features.append(layer3_input) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 train_features=numpy.concatenate(train_features, axis=0) train_y=numpy.concatenate(train_y, axis=0) clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results=clf.predict(test_features) lr=linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_count_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if results_lr[i]==test_y[i]: corr_count_lr+=1 acc_svm=corr_count*1.0/test_size acc_lr=corr_count_lr*1.0/test_size if acc_svm > acc_max: acc_max=acc_svm best_epoch=epoch if acc_lr > acc_max: acc_max=acc_lr best_epoch=epoch if acc_nn > acc_max: acc_max=acc_nn best_epoch=epoch print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ', acc_max , ' at epoch: ', best_epoch if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' 
mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
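# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script). At validation time the
# SICK script above also refits a linear SVM and a logistic regression on the
# concatenated sentence-pair features (layer3_input) and keeps the best of the
# three accuracies. A minimal scikit-learn rendering of that step; the random
# stand-in features below replace the real feature matrices, so the printed
# accuracy is only chance level.
import numpy
from sklearn import svm, linear_model

if __name__ == '__main__':
    rng = numpy.random.RandomState(0)
    train_features = rng.randn(200, 30)          # stand-in for layer3_input
    train_y = rng.randint(0, 3, size=200)        # 3-way entailment labels
    test_features = rng.randn(50, 30)
    test_y = rng.randint(0, 3, size=50)

    clf = svm.SVC(C=1.0, kernel='linear').fit(train_features, train_y)
    lr = linear_model.LogisticRegression(C=1e5).fit(train_features, train_y)
    acc_svm = numpy.mean(clf.predict(test_features) == test_y)
    acc_lr = numpy.mean(lr.predict(test_features) == test_y)
    print(max(acc_svm, acc_lr))
# ---------------------------------------------------------------------------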
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=10, test_batch_size=200, emb_size=300, hidden_size=100, L2_weight=0.0001, para_len_limit=300, q_len_limit=30, max_EM=40.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) # glove_vocab=set(word2vec.keys()) train_para_list, train_Q_list, train_start_list,train_end_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train_AI2(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_start_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test_AI2(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in overall_word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') # labels = T.imatrix('labels') #(batch, para_len) start_indices= T.ivector() #batch end_indices = T.ivector() #batch para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size=paragraph.shape[0] norm_extraF=normalize_matrix(extraF) fwd_para=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_para=create_LSTM_para(rng, emb_size, hidden_size) paragraph_para=fwd_para.values()+ bwd_para.values() fwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) paragraph_para_e1=fwd_e1.values()+ bwd_e1.values() fwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e11=fwd_e11.values()+ bwd_e11.values() fwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e2=fwd_e2.values()+ bwd_e2.values() # U_e2, W_e2, b_e2=create_GRU_para(rng, hidden_size, hidden_size) # U_e2_b, W_e2_b, b_e2_b=create_GRU_para(rng, hidden_size, hidden_size) # paragraph_para_e2=[U_e2, W_e2, b_e2, U_e2_b, W_e2_b, b_e2_b] # fwd_Q=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) # bwd_Q=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_Q.values()+ bwd_Q.values() # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a1 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a2 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a3 = create_ensemble_para(rng, 1, 6*hidden_size) # 3 extra features # LR_b = theano.shared(value=numpy.zeros((2,), # dtype=theano.config.floatX), # @UndefinedVariable # name='LR_b', borrow=True) HL_paras=[U_a1, U_a2, U_a3] params = [embeddings]+paragraph_para+paragraph_para_e1+paragraph_para_e11+HL_paras+paragraph_para_e2 # load_model_from_file(rootPath+'Best_Paras_AI2_31.210974456', params) paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) #self, X, Mask, hidden_dim, fwd_tparams, bwd_tparams paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_para, bwd_tparams= bwd_para) para_reps=paragraph_model.output_tensor #(batch, 2*hidden, para_len) Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams= bwd_para) questions_reps_tensor=questions_model.output_tensor #(batch, 2*hidden ,q_len) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden) # questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #(batch, para_len, hidden) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) 
# ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) norm_U_a3=normalize_matrix(U_a3) def example_in_batch(para_matrix, q_matrix): #assume both are (2*hidden, len) repeat_para_matrix_T=T.repeat(para_matrix.T, q_matrix.shape[1], axis=0) #(para_len*q_len, 2*hidden) repeat_q_matrix_3D = T.repeat(q_matrix.T.dimshuffle('x',0,1), para_matrix.shape[1], axis=0) #(para_len, q_len, 2*hidden) repeat_q_matrix_T= repeat_q_matrix_3D.reshape((repeat_q_matrix_3D.shape[0]*repeat_q_matrix_3D.shape[1], repeat_q_matrix_3D.shape[2])) #(para_len*q_len, 2*hidden) ele_mult =repeat_para_matrix_T*repeat_q_matrix_T #(#(para_len*q_len, 2*hidden)) overall_concv = T.concatenate([repeat_para_matrix_T, repeat_q_matrix_T, ele_mult], axis=1) ##(para_len*q_len, 6*hidden) scores=T.dot(overall_concv, norm_U_a3) #(para_len*q_len,1) interaction_matrix=scores.reshape((para_matrix.shape[1], q_matrix.shape[1])) #(para_len, q_len) # transpose_para_matrix=para_matrix.T # interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) # norm_interaction_matrix=T.maximum(0.0, interaction_matrix) q_by_para = T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(2*hidden, para_len) para_by_q = T.repeat(T.dot(para_matrix, T.nnet.softmax(T.max(interaction_matrix, axis=1).dimshuffle('x',0)).T), para_matrix.shape[1], axis=1) return (q_by_para, para_by_q) inter_return, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) batch_q_reps=inter_return[0] #(batch, 2*hidden, para_len) batch_para_reps=inter_return[1] #(batch, 2*hidden , para_len) #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len) ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps,para_reps*batch_q_reps, para_reps*batch_para_reps], axis=1) #(batch, 4*2*hidden, para_len) questions_reps.dimshuffle(0,2,1) para_ensemble_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e1, bwd_tparams= bwd_e1) para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, 2*hidden ,para_len) para_ensemble_model1=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e11, bwd_tparams= bwd_e11) para_reps_tensor4score1=para_ensemble_model1.output_tensor #(batch, 2*hidden ,para_len) Con_G_M=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score1], axis=1) #(batch, 10*hidden, para_len) #score for each para word norm_U_a=normalize_matrix(U_a1) start_scores=T.dot(Con_G_M.dimshuffle(0,2,1), norm_U_a) #(batch, para_len, 1) start_scores=T.nnet.softmax(start_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) # para_reps_tensor4score = T.concatenate([para_reps_tensor4score, start_scores.dimshuffle(0,'x',1)], axis=1) 
para_ensemble_model2=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score1, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e2, bwd_tparams= bwd_e2) para_reps_tensor4score2=para_ensemble_model2.output_tensor #(batch, 2*hidden ,para_len) Con_G_M2=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score2], axis=1) #(batch, 10*hidden, para_len) norm_U_a2=normalize_matrix(U_a2) end_scores=T.dot(Con_G_M2.dimshuffle(0,2,1), norm_U_a2) #(batch, para_len, 1) end_scores=T.nnet.softmax(end_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) #loss train loss=-T.mean(T.log(start_scores[T.arange(true_batch_size), start_indices])+T.log(end_scores[T.arange(true_batch_size), end_indices])) #test co_simi_batch_matrix=T.batched_dot((para_mask*start_scores).dimshuffle(0,1,'x'), (para_mask*end_scores).dimshuffle(0,'x',1)) #(batch, para_len, para_len) #reset lower dialgonal cols = numpy.concatenate([numpy.array(range(i), dtype=numpy.uint) for i in xrange(para_len_limit)]) rows = numpy.concatenate([numpy.array([i]*i, dtype=numpy.uint) for i in xrange(para_len_limit)]) c = T.set_subtensor(co_simi_batch_matrix[:,rows, cols], theano.shared(numpy.zeros(para_len_limit*(para_len_limit-1)/2))) #reset longer than 7 size cols2 = numpy.concatenate([numpy.array(range(i+7,para_len_limit), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) rows2 = numpy.concatenate([numpy.array([i]*(para_len_limit-7-i), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) c2 = T.set_subtensor(c[:,rows2, cols2], theano.shared(numpy.zeros((para_len_limit-7)*(para_len_limit-6)/2))) test_return=T.argmax(c2.reshape((true_batch_size, para_len_limit*para_len_limit)), axis=1) #batch #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,start_indices, end_indices,para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_start_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_end_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX)) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() # writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') # writefile.write('{') pred_dict={} # exact_match=0.0 # F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size] # para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size] q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] # print 'q_ids_batch:', q_ids_batch # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] # para_len=len(test_para_wordlist_list[0]) # if para_len!=len(distribution_matrix[0]): # print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) # exit(0) # q_size=len(distribution_matrix) q_amount+=test_batch_size # print q_size # print test_para_word_list # Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size] for q in range(test_batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=decode_predict_id_AI2(batch_predict_ids[q], para_len_limit, test_para_wordlist_list[q]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans # writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ') # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) # q_gold_ans_set=para_gold_ansset_list[q] # # print test_para_wordlist_list[q] # # print Q_list_inword[q] # # print pred_ans.encode('utf8'), q_gold_ans_set # if pred_ans in q_gold_ans_set: # exact_match+=1 # F1=MacroF1(pred_ans, q_gold_ans_set) # F1_match+=F1 with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') # F1_acc=F1_match/q_amount # exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_AI2_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc # os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt') if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() 
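# --- Illustrative aside (hypothetical paths and ids, mirrors the evaluation step above) ---
# The tester accumulates {question_id: answer_string} and dumps it as JSON, which is
# the format the official SQuAD evaluate-v1.1.py script (and standard_eval here)
# expects alongside dev-v1.1.json.
import codecs, json

def _demo_write_predictions(pred_dict, path='predictions.txt'):
    # pred_dict: e.g. {'question_id_1': u'answer text'}
    with codecs.open(path, 'w', 'utf-8') as outfile:
        json.dump(pred_dict, outfile)

# usage (illustrative):
# _demo_write_predictions({'question_id_1': u'answer text'})
# os.system('python evaluate-v1.1.py dev-v1.1.json predictions.txt')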
    #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=10, useAllSamples=True, ktop=4, filter_size=[7,5], L2_weight=0.00005, useEmb=0, maxSentLength=60, sentEm_length=48, window=3, k=5, nce_seeds=2345, only_left_context=False, wait_iter=20, embedding_size=48, newd=[100, 100], train_file_style=1, from_scratch=False, stop=1e-2): self.write_file_name_suffix='_lr'+str(learning_rate)+'_nk'+str(nkerns[0])+'&'+str(nkerns[1])+'_bs'+str(batch_size)+'_fs'+str(filter_size[0])+'&'+str(filter_size[1])\ +'_maxSL'+str(maxSentLength)+'_win'+str(window)+'_noi'+str(k)+'_wait'+str(wait_iter)+'_wdEm'+str(embedding_size)\ +'_stEm'+str(sentEm_length)+'_ts'+str(from_scratch)+'_newd'+str(newd[0])+'&'+str(newd[1])+'_trFi'+str(train_file_style)+'stop'+str(stop) model_options = locals().copy() print "model options", model_options self.ini_learning_rate=learning_rate self.n_epochs=n_epochs self.nkerns=nkerns self.batch_size=batch_size self.useAllSamples=useAllSamples self.ktop=ktop self.filter_size=filter_size self.L2_weight=L2_weight self.useEmb=useEmb self.maxSentLength=maxSentLength self.kmax=self.maxSentLength/2+5 self.sentEm_length=sentEm_length self.window=window self.k=k self.only_left_context=only_left_context if self.only_left_context: self.context_size=self.window else: self.context_size=2*self.window self.nce_seed=nce_seeds self.embedding_size=0 self.train_file_style=train_file_style #we define "train_file_style" as: 0 (wiki), 11(sent_train), 12 (senti_dev), 13 (senti_test) senti_trainfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2train.txt" senti_devfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2dev.txt" senti_testfile="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/2classes/2test.txt" wiki_path="/mounts/data/proj/wenpeng/PhraseEmbedding/enwiki-20130503-pages-articles-cleaned-tokenized" embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt' embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt' root='/mounts/data/proj/wenpeng/Thang/' if self.train_file_style !=0: datasets, unigram, train_lengths, word_count, self.id2word=load_training_file(senti_trainfile,self.maxSentLength,self.train_file_style) elif self.train_file_style == 0: #datasets, unigram, train_lengths, word_count, self.id2word=load_training_file(root+'train.txt',self.maxSentLength,self.train_file_style) datasets, unigram, train_lengths, dev_lengths, word_count, self.id2word=load_data_for_training(root+'train.txt', root+'dev_dev93.txt',self.maxSentLength) self.datasets=datasets self.embedding_size=embedding_size self.vocab_size=word_count self.rand_values_R=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(1234)) self.rand_values_R[0]=numpy.array(numpy.zeros(self.embedding_size)) self.rand_values_Q=random_value_normal((self.vocab_size+1, self.embedding_size), theano.config.floatX, numpy.random.RandomState(4321)) self.rand_values_Q[0]=numpy.array(numpy.zeros(self.embedding_size)) self.from_scratch=from_scratch if not self.from_scratch: self.load_pretrained_embeddings() self.embeddings_R=theano.shared(value=self.rand_values_R) self.embeddings_Q=theano.shared(value=self.rand_values_Q) self.unigram=unigram # we use the average of unigram as probability of new word in dev set self.extend_unigram=numpy.append(unigram, [sum(unigram)/len(unigram)]) #print 'unigram, p_n length:', 
        # len(unigram), len(self.extend_unigram)  (tail of the commented-out print above)
        self.p_n=theano.shared(value=self.extend_unigram)
        self.train_lengths=train_lengths
        self.vali_lengths=dev_lengths
        b_values = zero_value((len(unigram)+1,), dtype=theano.config.floatX)  # the last bias is for new words in dev data
        #print 'bias length:', len(b_values)
        self.bias = theano.shared(value=b_values, name='bias')
        self.wait_iter=wait_iter
        self.newd=newd
        self.stop=stop
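# --- Illustrative aside (hypothetical helper, mirrors the noise distribution above) ---
# For NCE the noise distribution p_n is the training-set unigram distribution; the
# extra slot appended above stands for any word that only appears in the dev data and
# is given the average unigram probability (the bias vector gets a matching extra
# entry). A minimal numpy version of that extension:
import numpy as np

def _demo_extend_unigram(unigram):
    # unigram: 1-D array of word probabilities estimated on the training data
    unigram = np.asarray(unigram, dtype='float64')
    return np.append(unigram, unigram.mean())   # last slot = probability of an unseen dev word

# usage (illustrative): _demo_extend_unigram([0.5, 0.3, 0.2]) -> [0.5, 0.3, 0.2, 0.3333]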
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[50], batch_size=1000, window_width=4, maxSentLength=64, emb_size=5, hidden_size=50, margin=0.5, L2_weight=0.0004, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/' rng = numpy.random.RandomState(1234) triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count) # print triples # print entity_count # print relation_count # exit(0) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test # mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt') # wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True) entity_count=T.cast(entity_count, 'int64') relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True) relation_count=T.cast(relation_count, 'int64') rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) entity_E=theano.shared(value=rand_values, borrow=True) rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) relation_E=theano.shared(value=rand_values, borrow=True) GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=2) #cost_tmp=0 n_batchs=line_no/batch_size remain_triples=line_no%batch_size if remain_triples>0: batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size] else: batch_start=list(numpy.arange(n_batchs)*batch_size) batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True) batch_start=T.cast(batch_start, 'int64') # allocate symbolic variables for the data # index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer # x_index_r = T.imatrix('x_index_r') # y = T.ivector('y') # left_l=T.iscalar() # right_l=T.iscalar() # left_r=T.iscalar() # right_r=T.iscalar() # length_l=T.iscalar() # length_r=T.iscalar() # norm_length_l=T.fscalar() # norm_length_r=T.fscalar() # mts=T.fmatrix() # wmf=T.fmatrix() # cost_tmp=T.fscalar() # #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() # ishape = (emb_size, maxSentLength) # this is the size of MNIST images # filter_size=(emb_size,window_width) # #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? 
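# --- Illustrative aside (hypothetical helper, mirrors the batch bookkeeping above) ---
# Batches are addressed by their starting index; when the data size is not a multiple
# of batch_size, one extra start at (size - batch_size) is appended, so the final
# batch overlaps the previous one instead of being dropped or padded.
import numpy as np

def _demo_batch_starts(size, batch_size):
    starts = list(np.arange(size // batch_size) * batch_size)
    if size % batch_size > 0:
        starts.append(size - batch_size)      # overlapping final batch covers the tail
    return starts

# usage (illustrative): _demo_batch_starts(10, 4) -> [0, 4, 6]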
# length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' zero_entity_E=T.zeros((entity_size, emb_size)) zero_relation_E=T.zeros((relation_size, emb_size)) entity_E_hat_1, relation_E_hat_1=all_batches(batch_start, batch_size, x_index_l, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, zero_entity_E,zero_relation_E, entity_count, entity_size, relation_count, relation_size) # for start in batch_start: # batch_triple_indices=x_index_l[start:start+batch_size] # # entity_E_hat_1, relation_E_hat_1=one_iteration_parallel(batch_triple_indices, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count) # new_entity_E,new_relation_E=one_batch_parallel(batch_triple_indices, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, new_entity_E,new_relation_E) # # entity_count=debug_print(entity_count.reshape((entity_size,1)), 'entity_count') # relation_count=debug_print(relation_count.reshape((relation_size, 1)), 'relation_count') # entity_E_hat_1=debug_print(new_entity_E/entity_count+1e-6, 'entity_E_hat_1') #to get rid of zero incoming info # relation_E_hat_1=debug_print(new_relation_E/relation_count, 'relation_E_hat_1') # # entity_E_hat_1, relation_E_hat_1=one_iteration_parallel(x_index_l, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count) # entity_E_updated_1=GRU_Combine_2Matrix(entity_E, entity_E_hat_1, emb_size, GRU_U_combine[0], GRU_W_combine[0], GRU_b_combine[0]) relation_E_updated_1=GRU_Combine_2Matrix(relation_E, relation_E_hat_1, emb_size, GRU_U_combine[1], GRU_W_combine[1], GRU_b_combine[1]) # cost=((entity_E_hat_1-entity_E)**2).sum()+((relation_E_hat_1-relation_E)**2).sum() cost_1=((entity_E_updated_1-entity_E)**2).sum()+((relation_E_updated_1-relation_E)**2).sum() entity_E_hat_2, relation_E_hat_2=all_batches(batch_start, batch_size, x_index_l, entity_E_updated_1, relation_E_updated_1, GRU_U, GRU_W, GRU_b, emb_size, zero_entity_E,zero_relation_E, entity_count, entity_size, relation_count, relation_size) # entity_E_hat_2, relation_E_hat_2=one_iteration_parallel(x_index_l, entity_E_updated_1, relation_E_updated_1, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count) entity_E_last_2=GRU_Combine_2Matrix(entity_E_updated_1, entity_E_hat_2, emb_size, GRU_U_combine[0], GRU_W_combine[0], GRU_b_combine[0]) relation_E_last_2=GRU_Combine_2Matrix(relation_E_updated_1, relation_E_hat_2, emb_size, GRU_U_combine[1], GRU_W_combine[1], GRU_b_combine[1]) L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\ +(GRU_U** 2).sum()+(GRU_W** 2).sum()\ +(GRU_U_combine** 2).sum()+(GRU_W_combine** 2).sum(), 'L2_reg') cost_sys=((entity_E_last_2-entity_E_updated_1)**2).sum()+((relation_E_last_2-relation_E_updated_1)**2).sum() cost=cost_sys+L2_weight*L2_loss #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b, GRU_U_combine, GRU_W_combine, GRU_b_combine] # params_conv = [conv_W, conv_b] params_to_store=[GRU_U, GRU_W, GRU_b, GRU_U_combine, GRU_W_combine, GRU_b_combine]#, entity_E_last_2, relation_E_last_2] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in 
zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l], [cost_1,cost_sys, entity_E_last_2, relation_E_last_2], updates=updates,on_unused_input='ignore') # # train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], # givens={ # x_index_l: indices_train_l[index: index + batch_size], # x_index_r: indices_train_r[index: index + batch_size], # y: trainY[index: index + batch_size], # left_l: trainLeftPad_l[index], # right_l: trainRightPad_l[index], # left_r: trainLeftPad_r[index], # right_r: trainRightPad_r[index], # length_l: trainLengths_l[index], # length_r: trainLengths_r[index], # norm_length_l: normalized_train_length_l[index], # norm_length_r: normalized_train_length_r[index], # mts: mt_train[index: index + batch_size], # wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant # validation_frequency = min(n_train_batches/5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_1, cost_l, entity_E_store, relation_E_store= train_model(triples) #print 'layer3_input', layer3_input print 'epoch:', epoch, 'cost:', cost_1, cost_l # if patience <= iter: # done_looping = True # break #after each epoch, increase the batch_size # exit(0) #store the paras after epoch 15 # if epoch ==22: entity_E_store=theano.shared(numpy.asarray(entity_E_store, dtype=theano.config.floatX), borrow=True) relation_E_store=theano.shared(numpy.asarray(relation_E_store, dtype=theano.config.floatX), borrow=True) params_to_store=params_to_store+[entity_E_store, relation_E_store] store_model_to_file(triple_path+'Best_Paras', params_to_store) print 'Finished storing best params' # exit(0) print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
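# --- Illustrative aside (hypothetical helper, mirrors the AdaGrad updates used in this file) ---
# Each parameter keeps a running sum of squared gradients, and the step divides the
# raw gradient by the square root of that sum. The SQuAD trainer earlier in this file
# adds 1e-8 under the square root for numerical safety; this sketch does the same.
import numpy as np

def _demo_adagrad_step(param, grad, acc, learning_rate=0.05, eps=1e-8):
    # param, grad, acc: arrays of the same shape; acc starts at zeros
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

# usage (illustrative):
# w, acc = np.ones(3), np.zeros(3)
# w, acc = _demo_adagrad_step(w, np.array([0.1, -0.2, 0.3]), acc)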
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5], emb_size=50, char_emb_size=4, hidden_size=200, margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, neg_all=100, train_size=200, test_size=200, mark='_forfun'): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/' triple_files = [ 'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt' ] rng = numpy.random.RandomState(23455) datasets, datasets_test, length_per_example_test, vocab_size, char_size = load_train( triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark) #max_char_len, max_des_len, max_relation_len, max_Q_len print 'vocab_size:', vocab_size, 'char_size:', char_size train_data = datasets # valid_data=datasets[1] test_data = datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # train_pos_entity_char = train_data[0] train_pos_entity_des = train_data[1] train_relations = train_data[2] train_entity_char_lengths = train_data[3] train_entity_des_lengths = train_data[4] train_relation_lengths = train_data[5] train_mention_char_ids = train_data[6] train_remainQ_word_ids = train_data[7] train_mention_char_lens = train_data[8] train_remainQ_word_len = train_data[9] train_entity_scores = train_data[10] test_pos_entity_char = test_data[0] test_pos_entity_des = test_data[1] test_relations = test_data[2] test_entity_char_lengths = test_data[3] test_entity_des_lengths = test_data[4] test_relation_lengths = test_data[5] test_mention_char_ids = test_data[6] test_remainQ_word_ids = test_data[7] test_mention_char_lens = test_data[8] test_remainQ_word_len = test_data[9] test_entity_scores = test_data[10] # # test_pos_entity_char=test_data[0] #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51 # test_pos_entity_des=test_data[1] #matrix, each row for a examle: 20*2*51 # test_relations=test_data[2] #matrix, each row for a example: 5*51 # test_entity_char_lengths=test_data[3] #matrix, each row for a example: 3*2*51 (three valies for one entity) # test_entity_des_lengths=test_data[4] #matrix, each row for a example: 3*2*51 (three values for one entity) # test_relation_lengths=test_data[5] #matrix, each row for a example: 3*51 # test_mention_char_ids=test_data[6] #matrix, each row for a mention: 40 # test_remainQ_word_ids=test_data[7] #matrix, each row for a question: 30 # test_mention_char_lens=test_data[8] #matrix, each three values for a mention: 3 # test_remainQ_word_len=test_data[9] #matrix, each three values for a remain question: 3 train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\ len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)] if sum(train_sizes) / len(train_sizes) != train_size: print 'weird size:', train_sizes exit(0) 
test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\ len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)] if sum(test_sizes) / len(test_sizes) != test_size: print 'weird size:', test_sizes exit(0) n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_pos_entity_char = pythonList_into_theanoIntMatrix( train_pos_entity_char) indices_train_pos_entity_des = pythonList_into_theanoIntMatrix( train_pos_entity_des) indices_train_relations = pythonList_into_theanoIntMatrix(train_relations) indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix( train_entity_char_lengths) indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix( train_entity_des_lengths) indices_train_relation_lengths = pythonList_into_theanoIntMatrix( train_relation_lengths) indices_train_mention_char_ids = pythonList_into_theanoIntMatrix( train_mention_char_ids) indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix( train_remainQ_word_ids) indices_train_mention_char_lens = pythonList_into_theanoIntMatrix( train_mention_char_lens) indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix( train_remainQ_word_len) indices_train_entity_scores = pythonList_into_theanoFloatMatrix( train_entity_scores) # indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char) # indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des) # indices_test_relations=pythonList_into_theanoIntMatrix(test_relations) # indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths) # indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths) # indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths) # indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids) # indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids) # indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens) # indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len) # indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores) rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'word_emb' + mark + '.txt') embeddings = theano.shared(value=rand_values, borrow=True) char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234)) char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size), dtype=theano.config.floatX) char_embeddings = theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data index = T.lscalar() chosed_indices = T.lvector() ent_char_ids_M = T.lmatrix() ent_lens_M = T.lmatrix() men_char_ids_M = T.lmatrix() men_lens_M = T.lmatrix() rel_word_ids_M = T.lmatrix() rel_word_lens_M = T.lmatrix() desH_word_ids_M = T.lmatrix() desH_word_lens_M = T.lmatrix() # 
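# --- Illustrative aside (hypothetical helper; load_word2vec_to_init itself is defined elsewhere) ---
# The embedding matrix above has vocab_size+1 rows: row 0 is reserved for padding and
# kept at zero, the remaining rows start from a random normal init and are overwritten
# with pretrained vectors wherever one is available. A small numpy sketch of that pattern:
import numpy as np

def _demo_init_embeddings(vocab_size, emb_size, pretrained, rng_seed=1234):
    # pretrained: dict mapping word id (1..vocab_size) -> vector of length emb_size
    rng = np.random.RandomState(rng_seed)
    emb = rng.normal(0.0, 0.01, (vocab_size + 1, emb_size))
    emb[0] = 0.0                                 # padding row stays zero
    for word_id, vec in pretrained.items():
        emb[word_id] = vec                       # overwrite with the pretrained vector
    return emb

# usage (illustrative): emb = _demo_init_embeddings(3, 4, {2: np.ones(4)})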
desT_word_ids_M=T.lmatrix() # desT_word_lens_M=T.lmatrix() q_word_ids_M = T.lmatrix() q_word_lens_M = T.lmatrix() ent_scores = T.dvector() #max_char_len, max_des_len, max_relation_len, max_Q_len # ent_men_ishape = (char_emb_size, max_char_len) # this is the size of MNIST images # rel_ishape=(emb_size, max_relation_len) # des_ishape=(emb_size, max_des_len) # q_ishape=(emb_size, max_Q_len) filter_size = (emb_size, window_width[0]) char_filter_size = (char_emb_size, window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1]) word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1]) char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape) q_rel_conv_W, q_rel_conv_b = create_conv_para( rng, filter_shape=word_filter_shape) q_desH_conv_W, q_desH_conv_b = create_conv_para( rng, filter_shape=word_filter_shape) params = [ char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b ] char_conv_W_into_matrix = char_conv_W.reshape( (char_conv_W.shape[0], char_conv_W.shape[2] * char_conv_W.shape[3])) q_rel_conv_W_into_matrix = q_rel_conv_W.reshape( (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3])) q_desH_conv_W_into_matrix = q_desH_conv_W.reshape( (q_desH_conv_W.shape[0], q_desH_conv_W.shape[2] * q_desH_conv_W.shape[3])) # load_model_from_file(rootPath, params, '') def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f, desH_word_ids_f, desH_word_lens_f, men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape( (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape( (batch_size, max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape( (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH q_desH_conv = Conv_with_input_para(rng, 
input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) desH_conv = Conv_with_input_para(rng, input=desH_word_input, image_shape=(batch_size, 1, emb_size, max_des_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) # #q_desT # q_desT_conv = Conv_with_input_para(rng, input=q_word_input, # image_shape=(batch_size, 1, emb_size, max_Q_len), # filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) # desT_conv = Conv_with_input_para(rng, input=desT_word_input, # image_shape=(batch_size, 1, emb_size, max_des_len), # filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) # ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output') # men_char_output=debug_print(men_char_conv.output, 'men_char.output') ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2]) # q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool = Average_Pooling_for_SimpleQA( rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1] + filter_size[1] - 1, dim=max_Q_len + filter_size[1] - 1, topk=2) q_desH_pool = Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) desH_conv_pool = Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) # q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2]) # desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2]) overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\ cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\ 0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0 # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan(SimpleQ_matches_Triple, sequences=[ ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M, men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M ]) simi_list += 0.5 * ent_scores posi_simi = simi_list[0] nega_simies = simi_list[1:] loss_simi_list = T.maximum( 0.0, margin - posi_simi.reshape((1, 1)) + nega_simies) loss_simi = T.mean(loss_simi_list) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (char_embeddings**2).sum() + (embeddings**2).sum() + (char_conv_W**2).sum() + (q_rel_conv_W**2).sum() + (q_desH_conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg = Diversify_Reg(char_conv_W_into_matrix) + Diversify_Reg( q_rel_conv_W_into_matrix) + Diversify_Reg(q_desH_conv_W_into_matrix) cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([ ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores ], [loss_simi, simi_list], on_unused_input='ignore') # 
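# --- Illustrative aside (hypothetical helper, mirrors loss_simi above) ---
# The training objective is a hinge ranking loss: the gold candidate (index 0 of
# simi_list) should score at least `margin` higher than every sampled negative, and
# the violations are averaged.
import numpy as np

def _demo_ranking_loss(simi_list, margin=0.5):
    # simi_list: scores with the positive candidate first, negatives after it
    posi = simi_list[0]
    negas = np.asarray(simi_list[1:])
    return np.mean(np.maximum(0.0, margin - posi + negas))

# usage (illustrative): _demo_ranking_loss([0.9, 0.2, 0.8]) -> mean([0.0, 0.4]) = 0.2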
givens={ # ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)), # ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)), # men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)), # men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)), # rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)), # rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)), # desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), # desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)), # # desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], # # desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2], # q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), # q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)), # ent_scores : test_entity_scores[index]}, #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] #+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad # updates.append((acc_i, acc)) if param_i == embeddings: updates.append( (param_i, T.set_subtensor( (param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))[0], theano.shared(numpy.zeros(emb_size))))) #Ada elif param_i == char_embeddings: updates.append( (param_i, T.set_subtensor( (param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))[0], theano.shared(numpy.zeros(char_emb_size))))) #AdaGrad else: updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index, chosed_indices], [loss_simi, cost], updates=updates, givens={ ent_char_ids_M: indices_train_pos_entity_char[index].reshape( (neg_all, max_char_len))[chosed_indices].reshape( (train_neg_size, max_char_len)), ent_lens_M: indices_train_entity_char_lengths[index].reshape( (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), men_char_ids_M: indices_train_mention_char_ids[index].reshape( (neg_all, max_char_len))[chosed_indices].reshape( (train_neg_size, max_char_len)), men_lens_M: indices_train_mention_char_lens[index].reshape( (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), rel_word_ids_M: indices_train_relations[index].reshape( (neg_all, max_relation_len))[chosed_indices].reshape( (train_neg_size, max_relation_len)), rel_word_lens_M: indices_train_relation_lengths[index].reshape( (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), desH_word_ids_M: indices_train_pos_entity_des[index].reshape( (neg_all, max_des_len))[chosed_indices].reshape( (train_neg_size, max_des_len)), desH_word_lens_M: indices_train_entity_des_lengths[index].reshape( (neg_all, 
3))[chosed_indices].reshape((train_neg_size, 3)), # desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], # desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2], q_word_ids_M: indices_train_remainQ_word_ids[index].reshape( (neg_all, max_Q_len))[chosed_indices].reshape( (train_neg_size, max_Q_len)), q_word_lens_M: indices_train_remainQ_word_len[index].reshape( (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), ent_scores: indices_train_entity_scores[index][chosed_indices] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False best_test_accu = 0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #print batch_start sample_indices = [0] + random.sample(range(1, neg_all), train_neg_size - 1) loss_simi_i, cost_i = train_model(batch_start, sample_indices) # if batch_start%1==0: # print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i # store_model_to_file(rootPath, params) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + '\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i #if iter ==1: # exit(0) # if iter % n_train_batches == 0: test_loss = [] succ = 0 for i in range(test_size): # print 'testing', i, '...' 
#prepare data test_ent_char_ids_M = numpy.asarray( test_pos_entity_char[i], dtype='int64').reshape( (length_per_example_test[i], max_char_len)) test_ent_lens_M = numpy.asarray( test_entity_char_lengths[i], dtype='int64').reshape( (length_per_example_test[i], 3)) test_men_char_ids_M = numpy.asarray( test_mention_char_ids[i], dtype='int64').reshape( (length_per_example_test[i], max_char_len)) test_men_lens_M = numpy.asarray( test_mention_char_lens[i], dtype='int64').reshape( (length_per_example_test[i], 3)) test_rel_word_ids_M = numpy.asarray( test_relations[i], dtype='int64').reshape( (length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray( test_relation_lengths[i], dtype='int64').reshape( (length_per_example_test[i], 3)) test_desH_word_ids_M = numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape( (length_per_example_test[i], max_des_len)) test_desH_word_lens_M = numpy.asarray( test_entity_des_lengths[i], dtype='int64').reshape( (length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray( test_remainQ_word_ids[i], dtype='int64').reshape( (length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray( test_remainQ_word_len[i], dtype='int64').reshape( (length_per_example_test[i], 3)) test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX) loss_simi_i, simi_list_i = test_model( test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M, test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if simi_list_i[0] >= max(simi_list_i[1:]): succ += 1 # print 'testing', i, '...acc:', succ*1.0/(i+1) succ = succ * 1.0 / test_size #now, check MAP and MRR print(( '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best ' 'model %f') % (epoch, minibatch_index, n_train_batches, succ)) if best_test_accu < succ: best_test_accu = succ store_model_to_file(rootPath, params, mark) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
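# --- Illustrative aside (hypothetical helpers, mirror the sampling and scoring above) ---
# Each question comes with neg_all candidate (entity, relation) pairs where index 0 is
# the gold one. Training scores the gold pair plus a random subset of negatives;
# testing counts a question as correct when the gold pair scores at least as high as
# every other candidate.
import random

def _demo_sample_candidates(neg_all=100, train_neg_size=21, seed=None):
    if seed is not None:
        random.seed(seed)
    return [0] + random.sample(range(1, neg_all), train_neg_size - 1)

def _demo_is_correct(simi_list):
    return simi_list[0] >= max(simi_list[1:])

# usage (illustrative):
# _demo_sample_candidates(neg_all=10, train_neg_size=4)  -> e.g. [0, 7, 2, 9]
# _demo_is_correct([0.8, 0.3, 0.5]) -> True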
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA( rootPath + 'vocab_DQAAAA.txt', rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test [ train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D, train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4 ] = train_data [ test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init( rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A1 = T.lvector() index_A2 = T.lvector() index_A3 = T.lvector() index_A4 = T.lvector() # y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() len_Q = T.lscalar() len_A1 = T.lscalar() len_A2 = T.lscalar() len_A3 = 
T.lscalar() len_A4 = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() left_Q = T.lscalar() left_A1 = T.lscalar() left_A2 = T.lscalar() left_A3 = T.lscalar() left_A4 = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() right_Q = T.lscalar() right_A1 = T.lscalar() right_A2 = T.lscalar() right_A3 = T.lscalar() right_A4 = T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input') #.dimshuffle(0, 'x', 1, 2) layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_Q_input') #.dimshuffle(0, 'x', 1, 2) layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_A1_input') #.dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) U, W, b, Ub, Wb, bb = create_Bi_GRU_para(rng, emb_size, nkerns[0]) layer0_para = [U, W, b, Ub, Wb, bb] # conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) # layer2_para=[conv2_W, conv2_b] # high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # highW_para=[high_W, high_b] #load_model(params) layer0_D = Bi_GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :], lefts=left_D_s[left_D:-right_D], rights=right_D_s[left_D:-right_D], hidden_dim=nkerns[0], U=U, W=W, b=b, Ub=Ub, Wb=Wb, bb=bb) layer0_Q = Bi_GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A1 = Bi_GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A2 = Bi_GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A3 = Bi_GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A4 = Bi_GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # hidden*2 layer0_Q_output = debug_print(layer0_Q.output_vector_last, 'layer0_Q.output') # hidden*4 layer0_A1_output = debug_print(layer0_A1.output_vector_last, 'layer0_A1.output') 
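# --- Illustrative aside (hypothetical helper; sketches the question-conditioned attention used in the reasoning layer just below) ---
# Each document position is scored by a small MLP applied to [doc_state; question
# vector], the scores are softmax-normalized across positions, and the document is
# summarized as the attention-weighted sum of its states.
import numpy as np

def _demo_attentive_doc_vector(doc_states, q_vec, w_hidden, w_score):
    # doc_states: (hidden, doc_len), q_vec: (hidden,),
    # w_hidden: (2*hidden, n_hid), w_score: (n_hid,)
    doc_len = doc_states.shape[1]
    repeat_q = np.tile(q_vec[:, None], (1, doc_len))              # (hidden, doc_len)
    feats = np.concatenate([doc_states, repeat_q], axis=0).T      # (doc_len, 2*hidden)
    hidden = np.tanh(feats.dot(w_hidden))                         # (doc_len, n_hid)
    scores = hidden.dot(w_score)                                  # (doc_len,)
    e = np.exp(scores - scores.max())
    weights = e / e.sum()                                         # attention over positions
    return doc_states.dot(weights)                                # (hidden,)

# usage (illustrative):
# v = _demo_attentive_doc_vector(np.random.randn(6, 9), np.random.randn(6),
#                                np.random.randn(12, 5), np.random.randn(5))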
layer0_A2_output = debug_print(layer0_A2.output_vector_last, 'layer0_A2.output') layer0_A3_output = debug_print(layer0_A3.output_vector_last, 'layer0_A3.output') layer0_A4_output = debug_print(layer0_A4.output_vector_last, 'layer0_A4.output') #before reasoning, do a GRU for doc: d U_d, W_d, b_d, U_db, W_db, b_db = create_Bi_GRU_para( rng, nkerns[0] * 2, nkerns[0] * 2) layer_d_para = [U_d, W_d, b_d, U_db, W_db, b_db] layer_D_GRU = Bi_GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0] * 2, hidden_dim=nkerns[0] * 2, U=U_d, W=W_d, b=b_d, U_b=U_db, W_b=W_db, b_b=b_db, bptt_truncate=-1) #Reasoning Layer 1 repeat_Q = debug_print( T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)), maxDocLength, axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q') input_DNN = debug_print( T.concatenate([layer_D_GRU.output_matrix, repeat_Q], axis=0).transpose(), 'input_DNN') #each row is an example output_DNN1 = HiddenLayer(rng, input=input_DNN, n_in=nkerns[0] * 8, n_out=nkerns[0]) attention_W = create_ensemble_para(rng, nkerns[0], 1) attention_weights = T.nnet.softmax( T.dot(attention_W, output_DNN1.output.transpose())) repeat_attentions = T.repeat(attention_weights, layer_D_GRU.output_matrix.shape[0], axis=0) doc_r = T.sum(layer_D_GRU.output_matrix * repeat_attentions, axis=1) combine_DQ = T.concatenate([doc_r, layer0_Q_output], axis=0) # dim: hidden*6 output_DNN2 = HiddenLayer(rng, input=combine_DQ, n_in=nkerns[0] * 8, n_out=nkerns[0] * 4) # DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out') # U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para=[U_p, W_p, b_p] # pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1) # translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1') # # # #before reasoning, do a GRU for doc: d2 # U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_d2_para=[U_d2, W_d2, b_d2] # layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1) # #Reasoning Layer 2 # repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1') # input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example # output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0]) # output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0]) # # DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2') # U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para2=[U_p2, W_p2, b_p2] # pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1) translated_Q2 = debug_print(output_DNN2.output, 'translated_Q2') QA1 = T.concatenate([translated_Q2, layer0_A1_output], axis=0) #dim: hidden*5 QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0) QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0) QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0) W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 8, n_out=1) match_params = [W_HL, b_HL] QA1_match = HiddenLayer(rng, input=QA1, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) QA2_match = HiddenLayer(rng, input=QA2, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) QA3_match = HiddenLayer(rng, input=QA3, n_in=nkerns[0] * 8, n_out=1, 
W=W_HL, b=b_HL) QA4_match = HiddenLayer(rng, input=QA4, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) # simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1') # simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4') simi_overall_level1 = debug_print(QA1_match.output[0], 'simi_overall_level1') simi_overall_level2 = debug_print(QA2_match.output[0], 'simi_overall_level2') simi_overall_level3 = debug_print(QA3_match.output[0], 'simi_overall_level3') simi_overall_level4 = debug_print(QA4_match.output[0], 'simi_overall_level4') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) #only use overall_simi cost = T.maximum( 0.0, margin + simi_overall_level2 - simi_overall_level1) + T.maximum( 0.0, margin + simi_overall_level3 - simi_overall_level1) + T.maximum( 0.0, margin + simi_overall_level4 - simi_overall_level1) # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) posi_simi = simi_overall_level1 nega_simi = T.max( [simi_overall_level2, simi_overall_level3, simi_overall_level4]) # #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_1 # nega_simi=T.max([simi_2, simi_3, simi_4]) L2_reg = debug_print( (U**2).sum() + (W**2).sum() + (Ub**2).sum() + (Wb**2).sum() + (output_DNN1.W**2).sum() + (output_DNN2.W**2).sum() + (U_d**2).sum() + (W_d**2).sum() + (U_db**2).sum() + (W_db**2).sum() + (W_HL**2).sum() + (attention_W**2).sum(), 'L2_reg' ) #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost = debug_print(cost + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') params = layer0_para + output_DNN1.params + output_DNN2.params + match_params + layer_d_para + [ attention_W ] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') 
# acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6))) # updates.append((acc_i, acc)) def AdaDelta_updates(parameters, gradients, rho, eps): # create variables to store intermediate updates gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] # calculates the new "average" delta for the next iteration gradients_sq_new = [ rho * g_sq + (1 - rho) * (g**2) for g_sq, g in zip(gradients_sq, gradients) ] # calculates the step in direction. The square root is an approximation to getting the RMS for the average value deltas = [ (T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients) ] # calculates the new "average" deltas for the next step. deltas_sq_new = [ rho * d_sq + (1 - rho) * (d**2) for d_sq, d in zip(deltas_sq, deltas) ] # Prepare it as a list f gradient_sq_updates = zip(gradients_sq, gradients_sq_new) deltas_sq_updates = zip(deltas_sq, deltas_sq_new) parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)] return gradient_sq_updates + deltas_sq_updates + parameters_updates updates = AdaDelta_updates(params, grads, decay, 1e-6) train_model = theano.function( [index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 # shuffle(train_batch_start)#shuffle training data corr_train = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % ((iter % train_size) * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 cost_average, posi_simi, nega_simi = train_model(batch_start) if posi_simi > nega_simi: corr_train += 1 if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + 'corr rate:' + str( corr_train * 100.0 / train_size) if iter % validation_frequency == 0: corr_test = 0 for i in test_batch_start: cost, posi_simi, nega_simi = test_model(i) if posi_simi > nega_simi: corr_test += 1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc = corr_test * 1.0 / test_size #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better = False if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' if find_better == True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
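# A minimal, standalone NumPy sketch of the margin ranking cost used above: the
# positive answer similarity (simi_overall_level1) should exceed each negative
# similarity (levels 2-4) by at least `margin`, and an example counts as correct
# when posi_simi > nega_simi. The helper names below are hypothetical and only
# illustrate the same arithmetic on plain floats.
import numpy as np

def margin_ranking_loss(simi_pos, simi_negs, margin=0.5):
    # one hinge term per negative candidate: max(0, margin + neg - pos)
    simi_negs = np.asarray(simi_negs, dtype='float32')
    return float(np.sum(np.maximum(0.0, margin + simi_negs - simi_pos)))

def is_correct(simi_pos, simi_negs):
    # the accuracy criterion used in the training/testing loops above
    return simi_pos > max(simi_negs)

print(margin_ranking_loss(0.8, [0.3, 0.9, 0.1], margin=0.5))  # 0.6
print(is_correct(0.8, [0.3, 0.9, 0.1]))                       # False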
def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500, emb_size=300, hidden_size=300, L2_weight=0.0001, para_len_limit=700, q_len_limit=40): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train( para_len_limit, q_len_limit) train_size = len(train_para_list) if train_size != len(train_Q_list) or train_size != len( train_label_list) or train_size != len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test( word2id, para_len_limit, q_len_limit) test_size = len(test_para_list) if test_size != len(test_Q_list) or test_size != len( test_mask) or test_size != len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) id2word = {y: x for x, y in overall_word2id.iteritems()} word2vec = load_word2vec() rand_values = random_value_normal((overall_vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extraF = T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) paragraph_input = embeddings[paragraph.flatten()].reshape( (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose( (0, 2, 1)) # (batch_size, emb_size, maxparalen) # # # BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5) # U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size) paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b] paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size, U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b) para_reps = paragraph_model.output_tensor #(batch, emb, para_len) Qs_emb = embeddings[questions.flatten()].reshape( (questions.shape[0], questions.shape[1], emb_size)).transpose( (0, 2, 1)) #(#questions, emb_size, maxsenlength) UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size) Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) questions_reps = questions_model.output_sent_rep_maxpooling.reshape( (batch_size, 1, hidden_size)) #(batch, 2*out_size) #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #attention distributions W_a1 = create_ensemble_para( rng, hidden_size, hidden_size) # init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size + 3) # 3 extra features norm_W_a1 = normalize_matrix(W_a1) norm_W_a2 = normalize_matrix(W_a2) norm_U_a = normalize_matrix(U_a) LR_b = theano.shared( value=numpy.zeros((2, ), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras = [W_a1, W_a2, U_a, LR_b] transformed_para_reps = T.tanh( T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2)) transformed_q_reps = T.tanh(T.dot(questions_reps, norm_W_a1)) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both = 0.5 * (transformed_para_reps + transformed_q_reps) prior_att = T.concatenate([add_both, normalize_matrix(extraF)], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices = para_mask.flatten().nonzero()[0] layer3 = LogisticRegression(rng, input=prior_att.reshape( (batch_size * prior_att.shape[1], hidden_size + 3)), n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.mean( T.log(layer3.p_y_given_x) [valid_indices, labels.flatten()[valid_indices]]) #[T.arange(y.shape[0]), y]) distributions = layer3.p_y_given_x[:, -1].reshape( (batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) masked_dis = distributions * para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask 
remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* ''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = [embeddings] + paragraph_para + Q_para + attention_paras L2_reg = L2norm_paraList( [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost = error #+L2_weight*L2_reg accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [paragraph, questions, labels, para_mask, q_mask, extraF], error, updates=updates, on_unused_input='ignore') test_model = theano.function( [paragraph, questions, para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size # remain_test=test_size%batch_size test_batch_start = list( numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_exact_acc = 0.0 cost_i = 0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #shuffle(train_batch_start) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i += train_model( np.asarray(train_para_list[para_id:para_id + batch_size], dtype='int32'), np.asarray(train_Q_list[para_id:para_id + batch_size], dtype='int32'), np.asarray(train_label_list[para_id:para_id + batch_size], dtype='int32'), np.asarray(train_para_mask[para_id:para_id + batch_size], dtype=theano.config.floatX), np.asarray(train_mask[para_id:para_id + batch_size], dtype=theano.config.floatX), np.asarray(train_feature_matrixlist[para_id:para_id + batch_size], dtype=theano.config.floatX)) #print iter if iter % 10 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' past_time = time.time() exact_match = 0.0 q_amount = 0 for test_para_id in test_batch_start: distribution_matrix = test_model( np.asarray(test_para_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray( test_feature_matrixlist[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list = test_text_list[ test_para_id:test_para_id + batch_size] para_gold_ansset_list = q_ansSet_list[ test_para_id:test_para_id + batch_size] paralist_extra_features = test_feature_matrixlist[ test_para_id:test_para_id + batch_size] sub_para_mask = test_para_mask[test_para_id:test_para_id + batch_size] para_len = len(test_para_wordlist_list[0]) if para_len != len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len( distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount += batch_size # print q_size # print test_para_word_list for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans = extract_ansList_attentionList( test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q]) q_gold_ans_set = para_gold_ansset_list[q] F1 = 
MacroF1(pred_ans, q_gold_ans_set)
                        exact_match += F1
                        # match_amount=len(pred_ans_set & q_gold_ans_set)
                        # # print 'q_gold_ans_set:', q_gold_ans_set
                        # # print 'pred_ans_set:', pred_ans_set
                        # if match_amount>0:
                        #     exact_match+=match_amount*1.0/len(pred_ans_set)
                exact_acc = exact_match / q_amount
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
        'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
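# The evaluation above averages MacroF1(pred_ans, q_gold_ans_set) over all test
# questions. MacroF1 itself is defined elsewhere; the sketch below assumes it is
# the usual token-overlap F1 taken as the maximum over the acceptable gold
# answers (the standard SQuAD measure). token_f1/macro_f1 are hypothetical
# helper names, shown only to make the metric concrete.
from collections import Counter

def token_f1(pred_ans, gold_ans):
    # token-overlap F1 between one predicted and one gold answer string
    pred_toks = pred_ans.split()
    gold_toks = gold_ans.split()
    num_same = sum((Counter(pred_toks) & Counter(gold_toks)).values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

def macro_f1(pred_ans, gold_ans_set):
    # best match over all acceptable gold answers
    return max(token_f1(pred_ans, g) for g in gold_ans_set)

print(macro_f1('in 1990', set(['1990', 'the year 1990'])))  # ~0.67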
def evaluate_lenet5(learning_rate=0.005, n_epochs=2000, batch_size=300, test_batch_size=400, emb_size=50, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=400, trichar_len=15,char_emb_size=50, para_len=101, question_len=20, c_len=1, model_type='train'): model_options = locals().copy() print "model options", model_options rootPath='/mounts/Users/cisintern/hs/l/workhs/yin/20170320/'; storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id={} trichar2id={} word2id['UNK']=0 # use it to pad #word2id, trichar2id, questions,questions_mask,paras,paras_mask,labels, isInQ_para, paras_shape, questions_shape, types, types_shape,question_trichar_ids,question_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks word2id, trichar2id,train_questions,train_questions_mask,train_paras,train_paras_mask,train_labels, train_islabels, train_paras_shape, train_questions_shape, train_types, train_types_shape,train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks=load_SQUAD_hinrich_v4(train_size, para_len, question_len, trichar_len, word2id,trichar2id, rootPath+'trn20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev.big.20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev20170320.txt') print 'word2id size for bigger dataset:', len(word2id), 'trichar size:', len(trichar2id) train_size=len(train_questions) test_size = len(test_questions) #50010# train_questions = np.asarray(train_questions, dtype='int32') train_questions_shape = np.asarray(train_questions_shape, dtype='int32') train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX) train_paras = np.asarray(train_paras, dtype='int32') train_paras_shape = np.asarray(train_paras_shape, dtype='int32') train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX) train_types = np.asarray(train_types, dtype='int32') train_types_shape = np.asarray(train_types_shape, dtype='int32') # train_c_ids = np.asarray(train_c_ids, dtype='int32') # train_c_ids_shape = np.asarray(train_c_ids_shape, dtype='int32') # train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX) train_islabels = np.asarray(train_islabels, dtype=theano.config.floatX) # train_c_heads = np.asarray(train_c_heads, dtype='int32') # train_c_tails = np.asarray(train_c_tails, dtype='int32') train_labels = np.asarray(train_labels, dtype='int32') #train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks train_question_trichar_ids = np.asarray(train_question_trichar_ids, dtype='int32') 
train_question_trichar_masks = np.asarray(train_question_trichar_masks, dtype=theano.config.floatX) train_para_trichar_ids = np.asarray(train_para_trichar_ids, dtype='int32') train_para_trichar_masks = np.asarray(train_para_trichar_masks, dtype=theano.config.floatX) train_type_trichar_ids = np.asarray(train_type_trichar_ids, dtype='int32') train_type_trichar_masks = np.asarray(train_type_trichar_masks, dtype=theano.config.floatX) test_questions = np.asarray(test_questions, dtype='int32') test_questions_shape = np.asarray(test_questions_shape, dtype='int32') test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX) test_paras = np.asarray(test_paras, dtype='int32') test_paras_shape = np.asarray(test_paras_shape, dtype='int32') test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX) test_types = np.asarray(test_types, dtype='int32') test_types_shape = np.asarray(test_types_shape, dtype='int32') # test_c_ids = np.asarray(test_c_ids, dtype='int32') # test_c_ids_shape = np.asarray(test_c_ids_shape, dtype='int32') # test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX) test_islabels = np.asarray(test_islabels, dtype=theano.config.floatX) # test_c_heads = np.asarray(test_c_heads, dtype='int32') # test_c_tails = np.asarray(test_c_tails, dtype='int32') test_labels = np.asarray(test_labels, dtype='int32') test_question_trichar_ids = np.asarray(test_question_trichar_ids, dtype='int32') test_question_trichar_masks = np.asarray(test_question_trichar_masks, dtype=theano.config.floatX) test_para_trichar_ids = np.asarray(test_para_trichar_ids, dtype='int32') test_para_trichar_masks = np.asarray(test_para_trichar_masks, dtype=theano.config.floatX) test_type_trichar_ids = np.asarray(test_type_trichar_ids, dtype='int32') test_type_trichar_masks = np.asarray(test_type_trichar_masks, dtype=theano.config.floatX) overall_vocab_size=len(word2id) print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size, emb_size), theano.config.floatX, rng) rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) overall_trichar_size = len(trichar2id) char_rand_values=random_value_normal((overall_trichar_size, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) para=T.imatrix() #(2*batch, len) para_shape = T.imatrix() para_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_shape = T.imatrix() q_mask=T.fmatrix() #(2*batch, len_q) islabels = T.fmatrix() labels=T.ivector() #batch types=T.imatrix() types_shape=T.imatrix() q_trichar_ids = T.imatrix() q_trichar_masks =T.fmatrix() para_trichar_ids = T.imatrix() para_trichar_masks =T.fmatrix() type_trichar_ids = T.imatrix() type_trichar_masks =T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = para.shape[0] paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types = embeddings[types.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_shape = embeddings[para_shape.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input_shape = embeddings[q_shape.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types_shape = embeddings[types_shape.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_trichar = char_embeddings[para_trichar_ids.flatten()].reshape((true_batch_size, para_len*trichar_len, char_emb_size)) #(batch, char_emb_size, para_len*trichar_len) q_input_trichar = char_embeddings[q_trichar_ids.flatten()].reshape((true_batch_size, question_len*trichar_len, char_emb_size)) # (batch, emb_size, question_len) q_types_trichar = char_embeddings[type_trichar_ids.flatten()].reshape((true_batch_size, 2*trichar_len, char_emb_size)) #sum up trichar emb as word level embs paragraph_input_trichar=T.sum((paragraph_input_trichar*para_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, para_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,para_len) q_input_trichar=T.sum((q_input_trichar*q_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, question_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,q_len) q_types_trichar=T.sum((q_types_trichar*type_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, 2, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,2) #concatenate word emb with shape emb q_input = T.concatenate([q_input,q_input_shape, q_input_trichar],axis=1) #(batch, 2*emb_size+char_emb_size, q_len) paragraph_input = T.concatenate([paragraph_input,paragraph_input_shape, paragraph_input_trichar,islabels.dimshuffle(0,'x',1)],axis=1)#(batch, 2*emb_size+char_emb_size+1, para_len) q_types_input = T.sum(T.concatenate([q_types,q_types_shape,q_types_trichar],axis=1), axis=2) #(batch, 2*emb+char_emb_size) fwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) bwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen) fwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) bwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden) #interaction batch_ids=T.arange(true_batch_size) # c_heads=theano.shared(value=np.asarray([(para_len-1)/2]*batch_size, dtype='int32'), borrow=True) c_heads = 
T.repeat(theano.shared(value=np.asarray([(para_len-1)/2], dtype='int32'), borrow=True), true_batch_size) c_tails=c_heads+1 c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden) c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden) candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden) context_l=paragraph_model.forward_output[batch_ids,:,c_heads-1] #(batch, hidden) context_r=paragraph_model.backward_output[batch_ids,:,c_tails+1]#(batch, hidden) #glove level average # c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input_shape = embeddings[c_ids_shape.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input = T.concatenate([c_input,c_input_shape],axis=1) c_sum = paragraph_input[:,:-1,(para_len-1)/2]#(batch, 2*emb_size+char_emb) c_sum_with_isInQLabel = paragraph_input[:,:,(para_len-1)/2] # e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, 2*emb_size+char_emb_size) # average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x') HL_layer_1_input_size=2*hidden_size+4*hidden_size+(2*emb_size+char_emb_size+1)+(2*emb_size+char_emb_size)+1+hidden_size+hidden_size+(2*emb_size+char_emb_size)+1 cosine_Qtype_cand = cosine_row_wise_twoMatrix(q_types_input, c_sum).dimshuffle(0,'x') #(batch, 1) #, average_E_batch, average_C_batch, average_Q_batch HL_layer_1_input = T.concatenate([q_reps, candididates_reps, c_sum_with_isInQLabel, q_sum, islabels[:,(para_len-1)/2:(para_len-1)/2+1], context_l, context_r, q_types_input, cosine_Qtype_cand], axis=1) HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) LR_input= T.concatenate([HL_layer_1.output, HL_layer_2.output, islabels[:,(para_len-1)/2:(para_len-1)/2+1], cosine_Qtype_cand], axis=1) #(batch, char_HL_hidden_size+HL_hidden_size) LR_input_size= HL_hidden_size+HL_hidden_size+1+1#HL_layer_1_input_size+2*HL_hidden_size U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='char_LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = LR_para+[embeddings,char_embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params # load_model_from_file(storePath+'Best_Paras_HS_20170316_0.760357142857', params) # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) # L2_reg = L2norm_paraList(params) cost=loss#+1e-6*L2_reg accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], cost, updates=updates,on_unused_input='ignore') # train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore') test_model = theano.function([para, para_shape, para_mask, q,q_shape, q_mask,islabels, labels, types, types_shape,q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], [layer_LR.errors(labels),layer_LR.prop_for_posi], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_train_batches_pred=train_size/batch_size_pred #batch_size means how many pairs # train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] n_test_batches=test_size/test_batch_size #batch_size means how many pairs n_test_remain=test_size%test_batch_size #batch_size means how many pairs test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_acc=0.0 cost_i=0.0 train_ids = range(train_size) # train_ids_pred = range(train_size) best_test_statistic=defaultdict(int) # best_train_statistic=defaultdict(int) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) # print train_ids[:100] iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list = train_ids[para_id:para_id+batch_size] # print 'train_labels[train_id_list]:', train_labels[train_id_list] if model_type=='train': #para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks cost_i+= train_model( train_paras[train_id_list], train_paras_shape[train_id_list], train_paras_mask[train_id_list], train_questions[train_id_list], train_questions_shape[train_id_list], train_questions_mask[train_id_list], train_islabels[train_id_list], train_labels[train_id_list], train_types[train_id_list], train_types_shape[train_id_list], train_question_trichar_ids[train_id_list], train_question_trichar_masks[train_id_list], train_para_trichar_ids[train_id_list], train_para_trichar_masks[train_id_list], train_type_trichar_ids[train_id_list], train_type_trichar_masks[train_id_list]) #print iter if iter%10 ==0: print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() print 'Testing...' 
error=0 test_statistic=defaultdict(int) if model_type=='test': writefile=open(storePath+'predictions_20170317.txt', 'w') for id, test_para_id in enumerate(test_batch_start): test_id_list = range(test_para_id, test_para_id+test_batch_size) # print 'test_id_list:',test_id_list # print 'test_c_heads[test_id_list]', test_c_heads[test_id_list] # gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size] error_i, preds_i= test_model( test_paras[test_id_list], test_paras_shape[test_id_list], test_paras_mask[test_id_list], test_questions[test_id_list], test_questions_shape[test_id_list], test_questions_mask[test_id_list], test_islabels[test_id_list], test_labels[test_id_list], test_types[test_id_list], test_types_shape[test_id_list], test_question_trichar_ids[test_id_list], test_question_trichar_masks[test_id_list], test_para_trichar_ids[test_id_list], test_para_trichar_masks[test_id_list], test_type_trichar_ids[test_id_list], test_type_trichar_masks[test_id_list]) if model_type=='test': if id < len(test_batch_start)-1: writefile.write('\n'.join(map(str,list(preds_i)))+'\n') else: writefile.write('\n'.join(map(str,list(preds_i)[-n_test_remain:]))+'\n') error+=error_i # for ind, gold_label in enumerate(gold_labels_list): # test_statistic[(gold_label, preds_i[ind])]+=1 if model_type=='test': writefile.close() acc=1.0-error*1.0/len(test_batch_start) # acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0)) if acc> max_acc: max_acc=acc # best_test_statistic=test_statistic if model_type=='train': store_model_to_file(storePath+'Best_Paras_HS_20170324_'+str(max_acc), params) print 'Finished storing best params at:', max_acc print 'current average acc:', acc, '\t\tmax acc:', max_acc#, '\ttest_statistic:', test_statistic # print '\t\t\t\tbest statistic:', best_test_statistic if model_type=='test': exit(0) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
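# Both this function and the surrounding ones build their `updates` list with
# the same AdaGrad rule: accumulate squared gradients per parameter and divide
# each step by the square root of the accumulator. A small NumPy sketch of that
# rule follows; adagrad_step is a hypothetical helper, not part of this codebase.
import numpy as np

def adagrad_step(param, grad, acc, learning_rate=0.005, eps=1e-20):
    # acc <- acc + grad^2 ; param <- param - lr * grad / (sqrt(acc) + eps)
    acc_new = acc + grad ** 2
    param_new = param - learning_rate * grad / (np.sqrt(acc_new) + eps)
    return param_new, acc_new

# toy usage on a single weight vector: repeated identical gradients take
# smaller and smaller steps as the accumulator grows
w = np.zeros(3)
acc = np.zeros(3)
for g in [np.array([0.5, -1.0, 0.0]), np.array([0.5, -1.0, 0.0])]:
    w, acc = adagrad_step(w, g, acc)
print(w)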
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' norm_extraF=normalize_matrix(extraF) U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size) Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras=[W_a1, W_a2, U_a, LR_b] params = [embeddings]+paragraph_para+Q_para+attention_paras load_model_from_file(rootPath+'Best_Paras_conv_0.217545454545', params) paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1) paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) para_reps=paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) questions_reps_tensor=questions_model.output_tensor #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor #use CNN for question modeling # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5)) # Q_conv_para=[conv_W, conv_b] # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4, # image_shape=(batch_size, 1, emb_size, q_len_limit), # filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b) # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1) # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1)) # masked_conv_output=conv_output*gru_mask # questions_conv_reps=T.max(masked_conv_output, 
axis=2).reshape((batch_size, 1, hidden_size)) # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix=para_matrix.T interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len) batch_q_reps, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) #attention distributions norm_W_a1=normalize_matrix(W_a1) norm_W_a2=normalize_matrix(W_a2) norm_U_a=normalize_matrix(U_a) transformed_para_reps=T.maximum(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2),0.0) #relu transformed_q_reps=T.maximum(T.dot(batch_q_reps.transpose((0, 2,1)), norm_W_a1),0.0) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both=transformed_para_reps+transformed_q_reps # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size) # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size) # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b) # accu_both=accumu_model.output_tensor.transpose((0,2,1)) prior_att=T.concatenate([add_both, norm_extraF], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices=para_mask.flatten().nonzero()[0] layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.sum(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y]) distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask masked_dis=distributions*para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* ''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # 
prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=error#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( np.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), np.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), np.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX)) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() exact_match=0.0 F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: distribution_matrix=test_model( np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size] para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size] paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] para_len=len(test_para_wordlist_list[0]) if para_len!=len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount+=batch_size # print q_size # print test_para_word_list Q_list_inword=test_Q_list_word[test_para_id:test_para_id+batch_size] for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] 
pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) q_gold_ans_set=para_gold_ansset_list[q] # print test_para_wordlist_list[q] # print Q_list_inword[q] # print pred_ans.encode('utf8'), q_gold_ans_set if pred_ans in q_gold_ans_set: exact_match+=1 F1=MacroF1(pred_ans, q_gold_ans_set) F1_match+=F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc=F1_match/q_amount exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
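# The scan body example_in_batch above computes, for every paragraph position,
# a softmax over its dot products with all question states and returns the
# question states mixed by those weights. A NumPy sketch of that single step;
# attend_question_to_para and the toy shapes are illustrative only.
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def attend_question_to_para(para_matrix, q_matrix):
    # para_matrix: (hidden, para_len), q_matrix: (hidden, q_len)
    interaction = np.dot(para_matrix.T, q_matrix)   # (para_len, q_len)
    weights = softmax(interaction, axis=1)          # one distribution per paragraph token
    return np.dot(q_matrix, weights.T)              # (hidden, para_len)

rng = np.random.RandomState(0)
print(attend_question_to_para(rng.randn(4, 6), rng.randn(4, 3)).shape)  # (4, 6)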
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=300, batch_size=1, window_width=[3,3], emb_size=300, margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, max_relation_len=6, max_Q_len=30, neg_all=100, train_size=69967, test_size=19953, mark='_RC_newdata'): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/home/wyin/Datasets/SimpleQuestions_v2/relation_classification/' triple_files=['train.replace_ne.withpoolwenpengFormat.txt', 'test.replace_ne.withpoolwenpengFormat.txt'] rng = numpy.random.RandomState(23455) datasets, datasets_test, length_per_example_train, length_per_example_test, vocab_size=load_train(triple_files[0], triple_files[1], max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len print 'vocab_size:', vocab_size train_data=datasets # valid_data=datasets[1] test_data=datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # train_relations=train_data[0] train_relation_lengths=train_data[1] train_remainQ_word_ids=train_data[2] train_remainQ_word_len=train_data[3] test_relations=test_data[0] test_relation_lengths=test_data[1] test_remainQ_word_ids=test_data[2] test_remainQ_word_len=test_data[3] train_sizes=[len(train_relations),len(train_relation_lengths),len(train_remainQ_word_ids), len(train_remainQ_word_len)] if sum(train_sizes)/len(train_sizes)!=train_size: print 'weird size:', train_sizes exit(0) test_sizes=[len(test_relations),len(test_relation_lengths), len(test_remainQ_word_ids),len(test_remainQ_word_len)] if sum(test_sizes)/len(test_sizes)!=test_size: print 'weird size:', test_sizes exit(0) n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size # indices_train_pos_entity_char=theano.shared(numpy.asarray(train_pos_entity_char, dtype='int32'), borrow=True) # indices_train_pos_entity_des=theano.shared(numpy.asarray(train_pos_entity_des, dtype='int32'), borrow=True) # indices_train_relations=theano.shared(numpy.asarray(train_relations, dtype='int32'), borrow=True) # indices_train_entity_char_lengths=theano.shared(numpy.asarray(train_entity_char_lengths, dtype='int32'), borrow=True) # indices_train_entity_des_lengths=theano.shared(numpy.asarray(train_entity_des_lengths, dtype='int32'), borrow=True) # indices_train_relation_lengths=theano.shared(numpy.asarray(train_relation_lengths, dtype='int32'), borrow=True) # indices_train_mention_char_ids=theano.shared(numpy.asarray(train_mention_char_ids, dtype='int32'), borrow=True) # indices_train_remainQ_word_ids=theano.shared(numpy.asarray(train_remainQ_word_ids, dtype='int32'), borrow=True) # indices_train_mention_char_lens=theano.shared(numpy.asarray(train_mention_char_lens, dtype='int32'), borrow=True) # indices_train_remainQ_word_len=theano.shared(numpy.asarray(train_remainQ_word_len, dtype='int32'), borrow=True) # indices_train_entity_scores=theano.shared(numpy.asarray(train_entity_scores, dtype=theano.config.floatX), borrow=True) rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # rand_values=load_word2vec_to_init(rand_values, 
rootPath+'word_emb'+mark+'.txt') embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data index = T.iscalar() rel_word_ids_M=T.imatrix() rel_word_lens_M=T.imatrix() q_word_ids_f=T.ivector() q_word_lens_f=T.ivector() filter_size=(emb_size,window_width[0]) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1]) q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) params = [embeddings,q_rel_conv_W, q_rel_conv_b] q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3])) # load_model_from_file(rootPath, params, '') def SimpleQ_matches_Triple(rel_word_ids_f,rel_word_lens_f): rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) # q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, dim=max_Q_len+filter_size[1]-1, topk=2) overall_simi=cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan( SimpleQ_matches_Triple, sequences=[rel_word_ids_M,rel_word_lens_M]) posi_simi=simi_list[0] nega_simies=simi_list[1:] loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) loss_simi=T.sum(loss_simi_list) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((embeddings** 2).sum()+(q_rel_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg= Diversify_Reg(q_rel_conv_W_into_matrix) cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, simi_list],on_unused_input='ignore') accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad # updates.append((acc_i, acc)) if param_i == embeddings: updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size))))) #Ada else: updates.append((param_i, param_i - 
learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, cost],updates=updates, on_unused_input='ignore') # train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates, # givens={ # rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)), # rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), # q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), # q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)) # # }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False best_test_accu=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for jj in range(train_size): # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #print batch_start train_rel_word_ids_M = numpy.asarray(train_relations[jj], dtype='int32').reshape((length_per_example_train[jj], max_relation_len)) train_rel_word_lens_M = numpy.asarray(train_relation_lengths[jj], dtype='int32').reshape((length_per_example_train[jj], 3)) train_q_word_ids_M = numpy.asarray(train_remainQ_word_ids[jj], dtype='int32')#.reshape((length_per_example_train[jj], max_Q_len)) train_q_word_lens_M = numpy.asarray(train_remainQ_word_len[jj], dtype='int32')#.reshape((length_per_example_train[jj], 3)) loss_simi_i, cost_i=train_model(train_rel_word_ids_M, train_rel_word_lens_M,train_q_word_ids_M, train_q_word_lens_M) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i #if iter ==1: # exit(0) # if iter > 59999 and iter % 10000 == 0: test_loss=[] succ=0 for i in range(test_size): # print 'testing', i, '...' 
#prepare data test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape((length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32')#.reshape((length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32')#.reshape((length_per_example_test[i], 3)) loss_simi_i,simi_list_i=test_model(test_rel_word_ids_M, test_rel_word_lens_M,test_q_word_ids_M, test_q_word_lens_M) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if simi_list_i[0]>=max(simi_list_i[1:]): succ+=1 # print 'testing', i, '...acc:', succ*1.0/(i+1) succ=(succ+20610-test_size)*1.0/20610 #now, check MAP and MRR print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best ' 'model %f') % (epoch, minibatch_index, n_train_batches,succ)) if best_test_accu< succ: best_test_accu=succ store_model_to_file(rootPath, params, mark) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
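# --------------------------------------------------------------------------
# Sketch: AdaGrad update with a pinned zero padding row.
# The relation-classification trainer above updates `embeddings` with AdaGrad
# but forces row 0 (the padding index) back to zero after every step via
# T.set_subtensor. The snippet below is a minimal, self-contained illustration
# of that pattern with hypothetical shapes and a stand-in cost; note that a
# plain zero tensor cast to floatX is the safer choice over allocating a new
# theano.shared object inside the update expression.
import numpy
import theano
import theano.tensor as T

emb_size, vocab_size, learning_rate = 4, 10, 0.1
emb = theano.shared(numpy.zeros((vocab_size + 1, emb_size),
                                dtype=theano.config.floatX), name='emb')
acc = theano.shared(numpy.zeros((vocab_size + 1, emb_size),
                                dtype=theano.config.floatX), name='acc')

word_ids = T.ivector('word_ids')
toy_cost = T.sum(emb[word_ids] ** 2)            # stand-in for the ranking loss
grad = T.grad(toy_cost, emb)

new_acc = acc + T.sqr(grad)
stepped = emb - learning_rate * grad / T.sqrt(new_acc + 1e-10)
zero_row = T.zeros((emb_size,), dtype=theano.config.floatX)
pinned = T.set_subtensor(stepped[0], zero_row)  # keep the padding row at zero
toy_train = theano.function([word_ids], toy_cost,
                            updates=[(emb, pinned), (acc, new_acc)])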
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=300, hidden_size=300, L2_weight=0.0001, margin=0.5, train_size=4000000, test_size=1000, max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.052): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = np.random.RandomState(23455) word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len) test_ground_truth,test_candidates,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len) overall_vocab_size=len(word2id) print 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234)) # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() left=T.imatrix() #(2*batch, len) left_mask=T.fmatrix() #(2*batch, len) span=T.imatrix() #(2*batch, span_len) span_mask=T.fmatrix() #(2*batch, span_len) right=T.imatrix() #(2*batch, len) right_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b] U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size) U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size) GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) attend_para=[W_a1, W_a2] params = [embeddings]+GRU1_para+attend_para+GRU2_para # load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params) left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span) right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q) left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) left_reps=left_model.output_tensor #(batch, emb, para_len) span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) span_reps=span_model.output_tensor #(batch, emb, para_len) right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) 
right_reps=right_model.output_tensor #(batch, emb, para_len) q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) q_reps=q_model.output_tensor #(batch, emb, para_len) #interaction left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps) span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps) right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps) # q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps) # q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps) # q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps) #combine origin_W=normalize_matrix(W_a1) attend_W=normalize_matrix(W_a2) left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W) span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W) right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W) q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W) left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W) span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W) right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W) q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) add_left=left_origin_reps+q_attend_left_reps #(2*batch, len ,hidden) add_span=span_origin_reps+q_attend_span_reps add_right=right_origin_reps+q_attend_right_reps add_q_by_left=q_origin_reps+left_attend_q_reps add_q_by_span=q_origin_reps+span_attend_q_reps add_q_by_right=q_origin_reps+right_attend_q_reps #second GRU add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden) question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, 
add_q_by_right_reps], axis=1) #(batch, 3*hidden) simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat) #(2*batch) pos_simi_vec=simi_list[::2] neg_simi_vec=simi_list[1::2] raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=T.sum(raw_loss)#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs remain_train=train_size%batch_size # train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always ou shu if remain_train>0: train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] else: train_batch_start=list(np.arange(n_train_batches)*batch_size) max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_odd_ids = list(np.arange(train_size)*2) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_odd_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]] train_id_list=sum(train_id_list,[]) # print train_id_list cost_i+= train_model( np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'), np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_spans[id] for id in train_id_list], dtype='int32'), np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_rights[id] for id in train_id_list], dtype='int32'), np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_questions[id] for id in train_id_list], dtype='int32'), np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX)) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match=0.0 F1_match=0.0 for test_pair_id in range(test_size): test_example_lefts=test_lefts[test_pair_id] test_example_lefts_mask=test_lefts_mask[test_pair_id] test_example_spans=test_spans[test_pair_id] test_example_spans_mask=test_spans_mask[test_pair_id] test_example_rights=test_rights[test_pair_id] test_example_rights_mask=test_rights_mask[test_pair_id] test_example_questions=test_questions[test_pair_id] test_example_questions_mask=test_questions_mask[test_pair_id] test_example_candidates=test_candidates[test_pair_id] test_example_size=len(test_example_lefts) # print 'test_pair_id, test_example_size:', test_pair_id, test_example_size if test_example_size < test_batch_size: #pad pad_size=test_batch_size-test_example_size test_example_lefts+=test_example_lefts[-1:]*pad_size test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size test_example_spans+=test_example_spans[-1:]*pad_size test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size test_example_rights+=test_example_rights[-1:]*pad_size test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size test_example_questions+=test_example_questions[-1:]*pad_size test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size test_example_candidates+=test_example_candidates[-1:]*pad_size test_example_size=test_batch_size n_test_batches=test_example_size/test_batch_size n_test_remain=test_example_size%test_batch_size if n_test_remain > 0: test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size] else: test_batch_start=list(np.arange(n_test_batches)*test_batch_size) all_simi_list=[] all_cand_list=[] for test_para_id in test_batch_start: simi_return_vector=test_model( np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) candidate_list=test_example_candidates[test_para_id:test_para_id+test_batch_size] all_simi_list+=list(simi_return_vector) all_cand_list+=candidate_list top1_cand=all_cand_list[np.argsort(all_simi_list)[-1]] # print top1_cand, test_ground_truth[test_pair_id] if top1_cand == test_ground_truth[test_pair_id]: exact_match+=1 F1=macrof1(top1_cand, test_ground_truth[test_pair_id]) # print '\t\t\t', F1 F1_match+=F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc=F1_match/test_size exact_acc=exact_match/test_size if F1_acc> max_F1_acc: max_F1_acc=F1_acc # store_model_to_file(params, emb_size) if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Para_'+str(max_EM), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', 
max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
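# --------------------------------------------------------------------------
# Sketch: interleaved positive/negative ranking loss.
# The SQuAD trainer above feeds 2*batch examples whose rows alternate
# (positive span, negative span) for the same question, scores each row with
# a cosine similarity, and applies a margin hinge. A minimal stand-alone
# version of that loss, with hypothetical similarity inputs:
import numpy
import theano
import theano.tensor as T

margin = 0.5
simi_list = T.fvector('simi_list')       # length 2*batch, interleaved pos/neg
pos_simi_vec = simi_list[::2]
neg_simi_vec = simi_list[1::2]
raw_loss = T.maximum(0.0, margin + neg_simi_vec - pos_simi_vec)
hinge = theano.function([simi_list], T.sum(raw_loss))

# Example: the first pair already satisfies the margin, the second does not.
# hinge(numpy.asarray([0.9, 0.2, 0.4, 0.6], dtype='float32')) -> 0.7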
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DQAAAA(rootPath+'vocab_DQAAAA.txt', rootPath+'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath+'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4]=train_data [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DQAAAA_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A1= T.lvector() index_A2= T.lvector() index_A3= T.lvector() index_A4= T.lvector() # y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() len_Q=T.lscalar() len_A1=T.lscalar() len_A2=T.lscalar() len_A3=T.lscalar() len_A4=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() left_Q=T.lscalar() 
left_A1=T.lscalar() left_A2=T.lscalar() left_A3=T.lscalar() left_A4=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() right_Q=T.lscalar() right_A1=T.lscalar() right_A2=T.lscalar() right_A3=T.lscalar() right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')#.dimshuffle(0, 'x', 1, 2) layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')#.dimshuffle(0, 'x', 1, 2) layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')#.dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) U, W, b=create_GRU_para(rng, emb_size, nkerns[0]) layer0_para=[U, W, b] # conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) # layer2_para=[conv2_W, conv2_b] # high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # highW_para=[high_W, high_b] #load_model(params) layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:], lefts=left_D_s[left_D:-right_D], rights=right_D_s[left_D:-right_D], hidden_dim=nkerns[0], U=U,W=W,b=b) layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output') layer0_A1_output=debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output') layer0_A2_output=debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output') layer0_A3_output=debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output') layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output') #before reasoning, do a GRU for doc: d U_d, W_d, b_d=create_GRU_para(rng, nkerns[0], nkerns[0]) layer_d_para=[U_d, W_d, b_d] layer_D_GRU = 
GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d,W=W_d,b=b_d,bptt_truncate=-1) #Reasoning Layer 1 repeat_Q=debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0],1)), maxDocLength, axis=1)[:,:layer_D_GRU.output_matrix.shape[1]], 'repeat_Q') input_DNN=debug_print(T.concatenate([layer_D_GRU.output_matrix,repeat_Q], axis=0).transpose(), 'input_DNN')#each row is an example output_DNN1=HiddenLayer(rng, input=input_DNN, n_in=nkerns[0]*2, n_out=nkerns[0]) output_DNN2=HiddenLayer(rng, input=output_DNN1.output, n_in=nkerns[0], n_out=nkerns[0]) DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out') U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0]) layer_pooling_para=[U_p, W_p, b_p] pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1) translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1') #before reasoning, do a GRU for doc: d2 U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0]) layer_d2_para=[U_d2, W_d2, b_d2] layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1) #Reasoning Layer 2 repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1') input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0]) output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0]) DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2') U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0]) layer_pooling_para2=[U_p2, W_p2, b_p2] pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1) translated_Q2=debug_print(pooling2.output_vector_max, 'translated_Q2') QA1=T.concatenate([translated_Q2, layer0_A1_output], axis=0) QA2=T.concatenate([translated_Q2, layer0_A2_output], axis=0) QA3=T.concatenate([translated_Q2, layer0_A3_output], axis=0) QA4=T.concatenate([translated_Q2, layer0_A4_output], axis=0) W_HL,b_HL=create_HiddenLayer_para(rng, n_in=nkerns[0]*2, n_out=1) match_params=[W_HL,b_HL] QA1_match=HiddenLayer(rng, input=QA1, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL) QA2_match=HiddenLayer(rng, input=QA2, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL) QA3_match=HiddenLayer(rng, input=QA3, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL) QA4_match=HiddenLayer(rng, input=QA4, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL) # simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1') # simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4') simi_overall_level1=debug_print(QA1_match.output[0], 'simi_overall_level1') simi_overall_level2=debug_print(QA2_match.output[0], 'simi_overall_level2') simi_overall_level3=debug_print(QA3_match.output[0], 'simi_overall_level3') simi_overall_level4=debug_print(QA4_match.output[0], 'simi_overall_level4') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) #only use overall_simi cost=T.maximum(0.0, 
margin+simi_overall_level2-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level3-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level4-simi_overall_level1) # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) posi_simi=simi_overall_level1 nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) # #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_1 # nega_simi=T.max([simi_2, simi_3, simi_4]) L2_reg =debug_print((U**2).sum()+(W**2).sum() +(U_p**2).sum()+(W_p**2).sum() +(U_p2**2).sum()+(W_p2**2).sum() +(U_d**2).sum()+(W_d**2).sum() +(U_d2**2).sum()+(W_d2**2).sum() +(output_DNN1.W**2).sum()+(output_DNN2.W**2).sum() +(output_DNN3.W**2).sum()+(output_DNN4.W**2).sum() +(W_HL**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') params = layer0_para+output_DNN1.params+output_DNN2.params+output_DNN3.params+output_DNN4.params+layer_pooling_para+layer_pooling_para2+match_params+layer_d_para+layer_d2_para # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6))) # updates.append((acc_i, acc)) def AdaDelta_updates(parameters,gradients,rho,eps): # create variables to store intermediate updates gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] # calculates the new "average" delta for the next iteration gradients_sq_new = [ rho*g_sq + (1-rho)*(g**2) for g_sq,g in zip(gradients_sq,gradients) ] # calculates the step in direction. The square root is an approximation to getting the RMS for the average value deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] # calculates the new "average" deltas for the next step. 
deltas_sq_new = [ rho*d_sq + (1-rho)*(d**2) for d_sq,d in zip(deltas_sq,deltas) ] # Prepare it as a list f gradient_sq_updates = zip(gradients_sq,gradients_sq_new) deltas_sq_updates = zip(deltas_sq,deltas_sq_new) parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] return gradient_sq_updates + deltas_sq_updates + parameters_updates updates=AdaDelta_updates(params, grads, decay, 1e-6) train_model = theano.function([index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 # shuffle(train_batch_start)#shuffle training data corr_train=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 cost_average, posi_simi, nega_simi= train_model(batch_start) if posi_simi>nega_simi: corr_train+=1 if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+'corr rate:'+str(corr_train*100.0/train_size) if iter % validation_frequency == 0: corr_test=0 for i in test_batch_start: cost, posi_simi, nega_simi=test_model(i) if posi_simi>nega_simi: corr_test+=1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc=corr_test*1.0/test_size #test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better=False if test_acc > max_acc: max_acc=test_acc best_epoch=epoch find_better=True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc,'(at',best_epoch,')' if find_better==True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
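# --------------------------------------------------------------------------
# Sketch: multi-distractor hinge and the accuracy check used above.
# The MCTest reasoner scores the question representation against the correct
# answer and three distractors; training sums one hinge term per distractor,
# and an example counts as correct when posi_simi beats the best distractor.
# Hypothetical scalar scores stand in for QA*_match.output[0]:
import theano
import theano.tensor as T

margin = 1.0
s_pos = T.fscalar('s_pos')
s_neg1, s_neg2, s_neg3 = T.fscalars('s_neg1', 's_neg2', 's_neg3')
cost = (T.maximum(0.0, margin + s_neg1 - s_pos)
        + T.maximum(0.0, margin + s_neg2 - s_pos)
        + T.maximum(0.0, margin + s_neg3 - s_pos))
nega_simi = T.max(T.stack([s_neg1, s_neg2, s_neg3]))
correct = T.gt(s_pos, nega_simi)
score_example = theano.function([s_pos, s_neg1, s_neg2, s_neg3], [cost, correct])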
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, nkerns=[90,90], batch_size=1,
                    window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50,
                    hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0,
                    max_s_length=57, max_d_length=59, margin=0.2):
    maxSentLength = max_s_length+2*(window_width-1)
    maxDocLength = max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options

    rootPath='../data/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(
        rootPath+'vocab_DQAAAA.txt', # DPNQ.txt',
        rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt',
        rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt',
        max_s_length, maxSentLength, maxDocLength)  # vocab_size contains train, dev and test

    [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label,
     train_Length_D, train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3,
     train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
     train_rightPad_D, train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3] = train_data
    [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label,
     test_Length_D, test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3,
     test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
     test_rightPad_D, test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3] = test_data

    n_train_batches = train_size/batch_size
    n_test_batches = test_size/batch_size
    train_batch_start = list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start = list(numpy.arange(n_test_batches)*batch_size)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX,
                                    numpy.random.RandomState(1234))
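# --------------------------------------------------------------------------
# Sketch: the embedding-initialization pattern these scripts share.
# Most variants above build the lookup table the same way: Gaussian random
# values, a zeroed padding row at index 0, optional overwriting with
# pretrained vectors, then a theano.shared matrix. `pretrained` below is a
# hypothetical {row_index: vector} mapping standing in for the
# load_word2vec_to_init / glove loaders used elsewhere in this file, and the
# 0.01 standard deviation is illustrative only.
import numpy
import theano

def init_embeddings(vocab_size, emb_size, pretrained=None, seed=1234):
    rng = numpy.random.RandomState(seed)
    values = rng.normal(0.0, 0.01, (vocab_size + 1, emb_size)).astype(theano.config.floatX)
    values[0] = 0.0                      # row 0 is reserved for padding
    if pretrained is not None:
        for row, vec in pretrained.items():
            values[row] = vec            # keep the random init for OOV rows
    return theano.shared(value=values, borrow=True, name='embeddings')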
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
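# --------------------------------------------------------------------------
# Sketch: the SVM / linear-regression re-ranking step used above.
# When use_svm is on, the paraphrase trainer collects the concatenated
# hidden-layer features (LR_layer_input_with_extra) for every train and test
# pair and fits scikit-learn models on top of them, keeping whichever
# accuracy is best. A toy, self-contained version with random placeholder
# features and labels:
import numpy
from sklearn import svm
from sklearn.linear_model import LinearRegression

train_features = numpy.random.rand(40, 8)     # stand-in for layer3_input rows
train_y = numpy.random.randint(0, 2, 40)
test_features = numpy.random.rand(10, 8)
test_y = numpy.random.randint(0, 2, 10)

clf = svm.SVC(kernel='linear').fit(train_features, train_y)
svm_acc = numpy.mean(clf.predict(test_features) == test_y)

lr = LinearRegression().fit(train_features, train_y)
lr_acc = numpy.mean(numpy.absolute(lr.predict(test_features) - test_y) < 0.5)

print 'svm acc:', svm_acc, 'LR acc:', lr_acc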
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=7, maxSentLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00005, update_freq=10, norm_threshold=5.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int32') indices_train_r=T.cast(indices_train_r, 'int32') indices_test_l=T.cast(indices_test_l, 'int32') indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.imatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.imatrix('x_index_r') y = T.ivector('y') left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? 
length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_para=[conv_W, conv_b] layer1=Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1, window_size=window_width, maxSentLength=maxSentLength) conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1])) layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_para=[conv2_W, conv2_b] layer3=Average_Pooling_for_batch1(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) layer3_out=debug_print(layer3.output_simi, 'layer1_out') #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) ''' uni_cosine=cosine(sum_uni_l, sum_uni_r) linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer4_input=T.concatenate([mts, eucli_1,layer1.output_eucli, layer3_out,len_l, len_r], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, 
norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4=LogisticRegression(rng, input=layer4_input, n_in=15+3+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer4.W** 2).sum()+(conv2_W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') test_model = theano.function([index], [layer4.errors(y), layer4.y_pred], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size]}, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params+ layer2_para+ layer0_para# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') #norm=T.sqrt((grad_i**2).sum()) #if T.lt(norm_threshold, norm): # print 'big norm' # grad_i=grad_i*(norm_threshold/norm) acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index], [cost,layer4.errors(y), layer4_input], updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size]}, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer4.errors(y)], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look at this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data for batch_start in train_batch_start: # iter means how many batches have been run, counted across epochs iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) if iter%update_freq != 0: cost_ij, error_ij=train_model_predict(batch_start) #print 'cost_ij: ', cost_ij cost_tmp+=cost_ij error_sum+=error_ij else: cost_average, error_ij, layer3_input= train_model(batch_start) #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq) error_sum=0 cost_tmp=0#reset for the next batch #print layer3_input #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] for i in test_batch_start: test_loss, pred_y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' #print 'validating & testing...' 
# compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
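# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original code. evaluate_lenet5 above trains
# with AdaGrad: each parameter keeps an accumulator of squared gradients and the
# step size is divided by the square root of that accumulator. The hypothetical
# helper below restates that update rule on plain numpy arrays (the Theano loop
# above omits the epsilon; the later functions in this file add 1e-10 or 1e-8).
import numpy

def adagrad_step(param, grad, acc, learning_rate, eps=1e-8):
    # one AdaGrad update; returns the new parameter and the new accumulator
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (numpy.sqrt(acc) + eps)
    return param, acc
# ---------------------------------------------------------------------------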
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=300, batch_size=1, window_width=[3, 3], emb_size=300, margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, max_relation_len=6, max_Q_len=30, neg_all=100, train_size=69967, test_size=19953, mark='_RC_newdata'): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath = '/home/wyin/Datasets/SimpleQuestions_v2/relation_classification/' triple_files = [ 'train.replace_ne.withpoolwenpengFormat.txt', 'test.replace_ne.withpoolwenpengFormat.txt' ] rng = numpy.random.RandomState(23455) datasets, datasets_test, length_per_example_train, length_per_example_test, vocab_size = load_train( triple_files[0], triple_files[1], max_relation_len, max_Q_len, train_size, test_size, mark) #max_char_len, max_des_len, max_relation_len, max_Q_len print 'vocab_size:', vocab_size train_data = datasets # valid_data=datasets[1] test_data = datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # train_relations = train_data[0] train_relation_lengths = train_data[1] train_remainQ_word_ids = train_data[2] train_remainQ_word_len = train_data[3] test_relations = test_data[0] test_relation_lengths = test_data[1] test_remainQ_word_ids = test_data[2] test_remainQ_word_len = test_data[3] train_sizes = [ len(train_relations), len(train_relation_lengths), len(train_remainQ_word_ids), len(train_remainQ_word_len) ] if sum(train_sizes) / len(train_sizes) != train_size: print 'weird size:', train_sizes exit(0) test_sizes = [ len(test_relations), len(test_relation_lengths), len(test_remainQ_word_ids), len(test_remainQ_word_len) ] if sum(test_sizes) / len(test_sizes) != test_size: print 'weird size:', test_sizes exit(0) n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size # indices_train_pos_entity_char=theano.shared(numpy.asarray(train_pos_entity_char, dtype='int32'), borrow=True) # indices_train_pos_entity_des=theano.shared(numpy.asarray(train_pos_entity_des, dtype='int32'), borrow=True) # indices_train_relations=theano.shared(numpy.asarray(train_relations, dtype='int32'), borrow=True) # indices_train_entity_char_lengths=theano.shared(numpy.asarray(train_entity_char_lengths, dtype='int32'), borrow=True) # indices_train_entity_des_lengths=theano.shared(numpy.asarray(train_entity_des_lengths, dtype='int32'), borrow=True) # indices_train_relation_lengths=theano.shared(numpy.asarray(train_relation_lengths, dtype='int32'), borrow=True) # indices_train_mention_char_ids=theano.shared(numpy.asarray(train_mention_char_ids, dtype='int32'), borrow=True) # indices_train_remainQ_word_ids=theano.shared(numpy.asarray(train_remainQ_word_ids, dtype='int32'), borrow=True) # indices_train_mention_char_lens=theano.shared(numpy.asarray(train_mention_char_lens, dtype='int32'), borrow=True) # indices_train_remainQ_word_len=theano.shared(numpy.asarray(train_remainQ_word_len, dtype='int32'), borrow=True) # indices_train_entity_scores=theano.shared(numpy.asarray(train_entity_scores, dtype=theano.config.floatX), borrow=True) rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # 
rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt') embeddings = theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data index = T.iscalar() rel_word_ids_M = T.imatrix() rel_word_lens_M = T.imatrix() q_word_ids_f = T.ivector() q_word_lens_f = T.ivector() filter_size = (emb_size, window_width[0]) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1]) q_rel_conv_W, q_rel_conv_b = create_conv_para( rng, filter_shape=word_filter_shape) params = [embeddings, q_rel_conv_W, q_rel_conv_b] q_rel_conv_W_into_matrix = q_rel_conv_W.reshape( (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3])) # load_model_from_file(rootPath, params, '') def SimpleQ_matches_Triple(rel_word_ids_f, rel_word_lens_f): rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape( (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape( (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) # q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool = Average_Pooling_for_SimpleQA( rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1] + filter_size[1] - 1, dim=max_Q_len + filter_size[1] - 1, topk=2) overall_simi = cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan( SimpleQ_matches_Triple, sequences=[rel_word_ids_M, rel_word_lens_M]) posi_simi = simi_list[0] nega_simies = simi_list[1:] loss_simi_list = T.maximum( 0.0, margin - posi_simi.reshape((1, 1)) + nega_simies) loss_simi = T.sum(loss_simi_list) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (embeddings**2).sum() + (q_rel_conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg = Diversify_Reg(q_rel_conv_W_into_matrix) cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, simi_list], on_unused_input='ignore') accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad # updates.append((acc_i, acc)) if param_i == embeddings: updates.append( (param_i, T.set_subtensor( (param_i - learning_rate 
* grad_i / T.sqrt(acc + 1e-10))[0], theano.shared(numpy.zeros(emb_size))))) #Ada else: updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, cost], updates=updates, on_unused_input='ignore') # train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates, # givens={ # rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)), # rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), # q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), # q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)) # # }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False best_test_accu = 0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 for jj in range(train_size): # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #print batch_start train_rel_word_ids_M = numpy.asarray( train_relations[jj], dtype='int32').reshape( (length_per_example_train[jj], max_relation_len)) train_rel_word_lens_M = numpy.asarray( train_relation_lengths[jj], dtype='int32').reshape( (length_per_example_train[jj], 3)) train_q_word_ids_M = numpy.asarray( train_remainQ_word_ids[jj], dtype='int32' ) #.reshape((length_per_example_train[jj], max_Q_len)) train_q_word_lens_M = numpy.asarray( train_remainQ_word_len[jj], dtype='int32') #.reshape((length_per_example_train[jj], 3)) loss_simi_i, cost_i = train_model(train_rel_word_ids_M, train_rel_word_lens_M, train_q_word_ids_M, train_q_word_lens_M) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + '\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i #if iter ==1: # exit(0) # if iter > 59999 and iter % 10000 == 0: test_loss = [] succ = 0 for i in range(test_size): # print 'testing', i, '...' 
#prepare data test_rel_word_ids_M = numpy.asarray( test_relations[i], dtype='int32').reshape( (length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray( test_relation_lengths[i], dtype='int32').reshape( (length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray( test_remainQ_word_ids[i], dtype='int32' ) #.reshape((length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray( test_remainQ_word_len[i], dtype='int32' ) #.reshape((length_per_example_test[i], 3)) loss_simi_i, simi_list_i = test_model( test_rel_word_ids_M, test_rel_word_lens_M, test_q_word_ids_M, test_q_word_lens_M) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if simi_list_i[0] >= max(simi_list_i[1:]): succ += 1 # print 'testing', i, '...acc:', succ*1.0/(i+1) succ = (succ + 20610 - test_size) * 1.0 / 20610 #now, check MAP and MRR print(( '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best ' 'model %f') % (epoch, minibatch_index, n_train_batches, succ)) if best_test_accu < succ: best_test_accu = succ store_model_to_file(rootPath, params, mark) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
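# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original code. The relation-classification
# model above scores a question against one positive relation and many negatives
# with cosine similarity, then minimises the max-margin ranking loss
# sum(max(0, margin - sim_pos + sim_neg)). The hypothetical helpers below restate
# both pieces on plain numpy vectors.
import numpy

def cosine_sim(a, b):
    # cosine similarity of two 1-d vectors
    a, b = numpy.asarray(a), numpy.asarray(b)
    return numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b) + 1e-8)

def margin_ranking_loss(sim_pos, sim_negs, margin=0.5):
    # hinge ranking loss of the positive score against a vector of negative scores
    return numpy.sum(numpy.maximum(0.0, margin - sim_pos + numpy.asarray(sim_negs)))
# ---------------------------------------------------------------------------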
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train( para_len_limit, q_len_limit) train_size = len(train_para_list) if train_size != len(train_Q_list) or train_size != len( train_label_list) or train_size != len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test( word2id, para_len_limit, q_len_limit) test_size = len(test_para_list) if test_size != len(test_Q_list) or test_size != len( test_mask) or test_size != len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values = random_value_normal((overall_vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extraF = T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' norm_extraF = normalize_matrix(extraF) U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size) paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b] UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size) Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] W_a1 = create_ensemble_para( rng, hidden_size, hidden_size) # init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size + 3) # 3 extra features LR_b = theano.shared( value=numpy.zeros((2, ), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras = [W_a1, W_a2, U_a, LR_b] params = [embeddings] + paragraph_para + Q_para + attention_paras load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params) paragraph_input = embeddings[paragraph.flatten()].reshape( (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose( (0, 2, 1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input = T.concatenate( [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1) paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size, U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b) para_reps = paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape( (questions.shape[0], questions.shape[1], emb_size)).transpose( (0, 2, 1)) #(#questions, emb_size, maxsenlength) questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) questions_reps_tensor = questions_model.output_tensor #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor #use CNN for question modeling # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5)) # Q_conv_para=[conv_W, conv_b] # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4, # image_shape=(batch_size, 1, emb_size, q_len_limit), # filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b) # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1) # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1)) # masked_conv_output=conv_output*gru_mask # 
questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size)) # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix = para_matrix.T interaction_matrix = T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix = T.nnet.softmax(interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len) batch_q_reps, updates = theano.scan( fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor ]) #batch_q_reps (batch, hidden, para_len) #attention distributions norm_W_a1 = normalize_matrix(W_a1) norm_W_a2 = normalize_matrix(W_a2) norm_U_a = normalize_matrix(U_a) transformed_para_reps = T.maximum( T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0) #relu transformed_q_reps = T.maximum( T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both = transformed_para_reps + transformed_q_reps # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size) # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size) # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b) # accu_both=accumu_model.output_tensor.transpose((0,2,1)) prior_att = T.concatenate([add_both, norm_extraF], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices = para_mask.flatten().nonzero()[0] layer3 = LogisticRegression(rng, input=prior_att.reshape( (batch_size * prior_att.shape[1], hidden_size + 3)), n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.sum( T.log(layer3.p_y_given_x) [valid_indices, labels.flatten()[valid_indices]]) #[T.arange(y.shape[0]), y]) distributions = layer3.p_y_given_x[:, -1].reshape( (batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask masked_dis = distributions * para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* 
''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] L2_reg = L2norm_paraList( [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost = error #+ConvGRU_1.error# accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [paragraph, questions, labels, para_mask, q_mask, extraF], cost, updates=updates, on_unused_input='ignore') test_model = theano.function( [paragraph, questions, para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look at this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size # remain_test=test_size%batch_size test_batch_start = list( numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i += train_model( np.asarray([ train_para_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_Q_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_label_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_para_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_feature_matrixlist[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX)) #print iter if iter % 10 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match = 0.0 F1_match = 0.0 q_amount = 0 for test_para_id in test_batch_start: distribution_matrix = test_model( np.asarray(test_para_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray( test_feature_matrixlist[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list = test_text_list[ test_para_id:test_para_id + batch_size] para_gold_ansset_list = q_ansSet_list[ test_para_id:test_para_id + batch_size] paralist_extra_features = test_feature_matrixlist[ test_para_id:test_para_id + batch_size] sub_para_mask = test_para_mask[test_para_id:test_para_id + batch_size] para_len = len(test_para_wordlist_list[0]) if para_len != len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len( distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount += batch_size # print q_size # print test_para_word_list Q_list_inword = test_Q_list_word[ test_para_id:test_para_id + batch_size] for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans = extract_ansList_attentionList( test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) q_gold_ans_set = para_gold_ansset_list[q] # print test_para_wordlist_list[q] # print Q_list_inword[q] # print pred_ans.encode('utf8'), q_gold_ans_set if pred_ans in q_gold_ans_set: exact_match += 1 F1 = MacroF1(pred_ans, q_gold_ans_set) F1_match += F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc = F1_match / q_amount exact_acc = exact_match / q_amount if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc if max_exact_acc > max_EM: store_model_to_file( rootPath + 'Best_Paras_conv_' + str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
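# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original code. The SQuAD model above aligns
# paragraph and question through an interaction matrix (see example_in_batch):
# softmax(P^T Q) over the question positions, then the question states are
# re-weighted to give one question summary per paragraph word. The hypothetical
# helper below restates that step in numpy for a single example, assuming P has
# shape (hidden, p_len) and Q has shape (hidden, q_len).
import numpy

def question_aware_para_states(P, Q):
    # (p_len, q_len) interaction scores, softmax over question positions
    interaction = numpy.dot(P.T, Q)
    e = numpy.exp(interaction - interaction.max(axis=1, keepdims=True))
    att = e / e.sum(axis=1, keepdims=True)
    # weighted question summary for every paragraph position: (hidden, p_len)
    return numpy.dot(Q, att.T)
# ---------------------------------------------------------------------------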
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=10, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5, 5, 5, 5, 5], char_filter_size=5, margin=0.85, extra_size=5 + 11, extra_emb=10, distance=10, distance_emb=10, comment='add distance embs'): #extra_size=3+46+7 test_batch_size = batch_size model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) word2id = {} char2id = {} train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras, word2id, char2id = load_squad_cnn_rank_span_word_train( word2id, char2id, p_len_limit, q_len_limit, char_len) test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_span_word_dev( word2id, char2id, test_p_len_limit, q_len_limit, char_len) ''' #store variables into file ''' # train_variables = [train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras] # test_variables =[test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list] # with open(rootPath+'extra.3.pickle', 'wb') as f: # Python 3: open(..., 'wb') # cPickle.dump(train_variables+test_variables, f, protocol=cPickle.HIGHEST_PROTOCOL) # f.close() # print 'variable stored successfully' # exit(0) ''' load variables from file ''' # before_load_time = time.time() # with open(rootPath+'extra.3.pickle', 'rb') as f: # Python 3: open(..., 'rb') # train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras,test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = cPickle.load(f) # f.close() # print 'load data variables successfully, spend: ', (time.time()-before_load_time)/60.0, ' mins' train_size = len(train_para_list) test_size = len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_para_extras = numpy.asarray(train_para_extras, dtype=theano.config.floatX) train_span_label_list = numpy.asarray(train_span_label_list, dtype='int32') train_word_label_list = numpy.asarray(train_word_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') 
test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) test_para_extras = numpy.asarray(test_para_extras, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng) char_embeddings = theano.shared(value=char_rand_values, borrow=True) extra_rand_values = random_value_normal((extra_size, extra_emb), theano.config.floatX, rng) extra_embeddings = theano.shared(value=extra_rand_values, borrow=True) distance_rand_values = random_value_normal( (2 * distance + 1, distance_emb), theano.config.floatX, rng) distance_embeddings = theano.shared(value=distance_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') span_indices = T.ivector() #batch word_indices = T.imatrix() #(batch, 2) ans_indices = T.ivector() # for one batch, the length is dynamic para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extra = T.ftensor3() #(batch, p_len, 3) char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask = T.fmatrix() char_q_mask = T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] extra_rep_batch = T.concatenate( [extra.dot(extra_embeddings), extra], axis=2) #(batch, p_len, extra_emb+extra_size) zero_pad = T.zeros((true_batch_size, 1, extra_emb + extra_size)) left_context = T.concatenate([zero_pad, extra_rep_batch[:, :-1, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context = T.concatenate( [extra_rep_batch[:, 1:, :], zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) left_context_2 = T.concatenate( [zero_pad, zero_pad, extra_rep_batch[:, :-2, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context_2 = T.concatenate( [extra_rep_batch[:, 2:, :], zero_pad, zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) simi2left = T.sum(extra_rep_batch * left_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) simi2right = T.sum(extra_rep_batch * right_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) cos2left = cosine_tensor3(extra_rep_batch, left_context, 2).dimshuffle(0, 1, 'x') cos2right = cosine_tensor3(extra_rep_batch, right_context, 2).dimshuffle(0, 1, 'x') diff2left = extra_rep_batch - left_context diff2right = extra_rep_batch - right_context #(batch, p_len, extra_emb+extra_size) extra_rep_batch = T.concatenate( [ extra_rep_batch, left_context, right_context, left_context_2, right_context_2, diff2left, diff2right, simi2left, simi2right, cos2left, cos2right ], axis=2) #batch, p_len, 7*(extra_emb+extra_size)+4) true_extra_size = 7 * (extra_emb + extra_size) + 4 common_input_p = embeddings[paragraph.flatten()].reshape( (true_batch_size, true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q = embeddings[questions.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)) char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape( (true_batch_size * true_p_len, char_len, char_emb_size )) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q = char_embeddings[char_questions.flatten()].reshape( (true_batch_size * q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape( (true_batch_size * true_p_len, char_len)) char_q_masks = char_q_mask.reshape( (true_batch_size * q_len_limit, char_len)) conv_W_char, conv_b_char = create_conv_para( rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1 = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size + true_extra_size, filter_size[0])) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3, conv_b_3 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4, conv_b_4=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5, conv_b_5=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) conv_W_1_q, conv_b_1_q = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3_q, conv_b_3_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4_q, conv_b_4_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5_q, conv_b_5_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) CNN_para = [ conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, 
conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_5, conv_b_5,conv_W_4_q, conv_b_4_q, conv_W_5_q, conv_b_5_q, conv_W_char, conv_b_char ] span_input4score, word_input4score, overall_span_hidden_size, overall_word_hidden_size = squad_cnn_rank_spans_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q, # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q, para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size) test_span_input4score, test_word_input4score, _, _ = squad_cnn_rank_spans_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q, # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q, para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size) #(batch, hidden, gram_size) gram_size = 5 * true_p_len - (0 + 1 + 2 + 3 + 4) # U_a = create_ensemble_para(rng, 1, 4*hidden_size) # norm_U_a=normalize_matrix(U_a) # span_scores_matrix=T.dot(span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((batch_size, gram_size)) #(batch, 13*para_len-78, 1) span_HL_1_para = create_ensemble_para(rng, hidden_size, overall_span_hidden_size) span_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) span_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) span_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) span_U_a = create_ensemble_para(rng, 1, hidden_size + overall_span_hidden_size) norm_span_U_a = normalize_matrix(span_U_a) norm_span_HL_1_para = normalize_matrix(span_HL_1_para) norm_span_HL_2_para = normalize_matrix(span_HL_2_para) norm_span_HL_3_para = normalize_matrix(span_HL_3_para) norm_span_HL_4_para = normalize_matrix(span_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(span_input4score, norm_span_HL_1_para, norm_span_HL_2_para, norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a, batch_size, gram_size) span_scores = T.nnet.softmax(span_scores_matrix) #(batch, 7*para_len-21) loss_neg_likelihood = -T.mean( T.log(span_scores[T.arange(batch_size), span_indices])) #ranking loss tanh_span_scores_matrix = span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, gram_size), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor( index_matrix[T.arange(batch_size), span_indices], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega)) span_loss = loss_neg_likelihood + loss_rank # test_span_scores_matrix=T.dot(test_span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((true_batch_size, gram_size)) #(batch, 
13*para_len-78) test_span_scores_matrix = add_HLs_2_tensor3( test_span_input4score, norm_span_HL_1_para, norm_span_HL_2_para, norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a, true_batch_size, gram_size) #word HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) start_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size) norm_start_U_a = normalize_matrix(start_U_a) norm_HL_1_para = normalize_matrix(HL_1_para) norm_HL_2_para = normalize_matrix(HL_2_para) norm_HL_3_para = normalize_matrix(HL_3_para) norm_HL_4_para = normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para( rng, hidden_size, overall_word_hidden_size + distance_emb) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para( rng, 1, hidden_size + overall_word_hidden_size + distance_emb) end_norm_U_a = normalize_matrix(end_U_a) end_norm_HL_1_para = normalize_matrix(end_HL_1_para) end_norm_HL_2_para = normalize_matrix(end_HL_2_para) end_norm_HL_3_para = normalize_matrix(end_HL_3_para) end_norm_HL_4_para = normalize_matrix(end_HL_4_para) start_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_start_U_a, batch_size, true_p_len) start_scores = T.nnet.softmax(start_scores_matrix) #(batch, para_len) ''' forward start info to end prediction ''' distance_matrix = word_indices[:, 0].dimshuffle( 0, 'x') - T.arange(true_p_len).dimshuffle('x', 0) #(batch, p_len) distance_trunc_matrix = T.maximum( -distance, T.minimum(distance, distance_matrix)) + distance #(batch, p_len) zero_distance_matrix = T.zeros( (true_batch_size * true_p_len, 2 * distance + 1)) filled_distance_matrix = T.set_subtensor( zero_distance_matrix[T.arange(true_batch_size * true_p_len), distance_trunc_matrix.flatten()], 1.0) filled_distance_tensor3 = filled_distance_matrix.reshape( (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle( 0, 2, 1) #(batch_size, distance_emb, p_len) end_word_input4score = T.concatenate( [word_input4score, filled_distance_tensor3], axis=1) #(batch, +distance_emb, p_len) end_scores_matrix = add_HLs_2_tensor3(end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, batch_size, true_p_len) end_scores = T.nnet.softmax(end_scores_matrix) #(batch, para_len) start_loss_neg_likelihood = -T.mean( T.log(start_scores[T.arange(batch_size), word_indices[:, 0]])) end_loss_neg_likelihood = -T.mean( T.log(end_scores[T.arange(batch_size), word_indices[:, 1]])) #ranking loss start tanh_start_scores_matrix = start_scores #T.tanh(span_scores_matrix) #(batch, gram_size) start_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) start_new_index_matrix = T.set_subtensor( start_index_matrix[T.arange(batch_size), word_indices[:, 0]], 1.0) start_prob_batch_posi = tanh_start_scores_matrix[ start_new_index_matrix.nonzero()] start_prob_batch_nega = tanh_start_scores_matrix[( 1.0 - start_new_index_matrix).nonzero()] start_repeat_posi = T.extra_ops.repeat(start_prob_batch_posi, start_prob_batch_nega.shape[0], axis=0) start_repeat_nega = 
T.extra_ops.repeat(start_prob_batch_nega.dimshuffle( 'x', 0), start_prob_batch_posi.shape[0], axis=0).flatten() start_loss_rank = T.mean( T.maximum(0.0, margin - start_repeat_posi + start_repeat_nega)) #ranking loss END end_tanh_scores_matrix = end_scores #T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor( end_index_matrix[T.arange(batch_size), word_indices[:, 1]], 1.0) end_prob_batch_posi = end_tanh_scores_matrix[ end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_scores_matrix[( 1.0 - end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle( 'x', 0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean( T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega)) word_loss = start_loss_neg_likelihood + end_loss_neg_likelihood + start_loss_rank + end_loss_rank #test test_start_scores_matrix = add_HLs_2_tensor3( test_word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_start_U_a, true_batch_size, true_p_len) #(batch, test_p_len) mask_test_start_return = test_start_scores_matrix * para_mask #(batch, p_len) ''' forward start info to end prediction in testing ''' test_distance_matrix = T.argmax(mask_test_start_return, axis=1).dimshuffle( 0, 'x') - T.arange(true_p_len).dimshuffle('x', 0) #(batch, p_len) test_distance_trunc_matrix = T.maximum( -distance, T.minimum(distance, test_distance_matrix)) + distance #(batch, p_len) test_zero_distance_matrix = T.zeros( (true_batch_size * true_p_len, 2 * distance + 1)) test_filled_distance_matrix = T.set_subtensor( test_zero_distance_matrix[T.arange(true_batch_size * true_p_len), test_distance_trunc_matrix.flatten()], 1.0) test_filled_distance_tensor3 = test_filled_distance_matrix.reshape( (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle( 0, 2, 1) #(batch_size, distance_emb, p_len) test_end_word_input4score = T.concatenate( [test_word_input4score, test_filled_distance_tensor3], axis=1) #(batch, +distance-emb, p_len) end_test_scores_matrix = add_HLs_2_tensor3( test_end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) end_mask_test_return = end_test_scores_matrix * para_mask #(batch, p_len) word_gram_1 = mask_test_start_return + end_mask_test_return word_gram_2 = mask_test_start_return[:, : -1] + end_mask_test_return[:, 1:] #(batch* hidden_size, maxsenlen-1) word_gram_3 = mask_test_start_return[:, : -2] + end_mask_test_return[:, 2:] #(batch* hidden_size, maxsenlen-2) word_gram_4 = mask_test_start_return[:, : -3] + end_mask_test_return[:, 3:] #(batch* hidden_size, maxsenlen-3) word_gram_5 = mask_test_start_return[:, : -4] + end_mask_test_return[:, 4:] #(batch* hidden_size, maxsenlen-4) word_pair_scores = T.concatenate( [word_gram_1, word_gram_2, word_gram_3, word_gram_4, word_gram_5], axis=1) #(batch_size, gram_size) #ans words train ans_HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size) ans_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) ans_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) ans_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) ans_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size) 
norm_ans_U_a = normalize_matrix(ans_U_a) norm_ans_HL_1_para = normalize_matrix(ans_HL_1_para) norm_ans_HL_2_para = normalize_matrix(ans_HL_2_para) norm_ans_HL_3_para = normalize_matrix(ans_HL_3_para) norm_ans_HL_4_para = normalize_matrix(ans_HL_4_para) ans_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para, norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, batch_size, true_p_len) ans_scores_vec = T.nnet.softmax( ans_scores_matrix).flatten() #(batch, para_len) ans_loss_neg_likelihood = -T.mean(T.log(ans_scores_vec[ans_indices])) ans_index_vec = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX).flatten() ans_new_index = T.set_subtensor(ans_index_vec[ans_indices], 1.0) ans_prob_batch_posi = ans_scores_vec[ans_new_index.nonzero()] ans_prob_batch_nega = ans_scores_vec[(1.0 - ans_new_index).nonzero()] ans_repeat_posi = T.extra_ops.repeat(ans_prob_batch_posi, ans_prob_batch_nega.shape[0], axis=0) ans_repeat_nega = T.extra_ops.repeat(ans_prob_batch_nega.dimshuffle( 'x', 0), ans_prob_batch_posi.shape[0], axis=0).flatten() ans_loss_rank = T.mean( T.maximum(0.0, margin - ans_repeat_posi + ans_repeat_nega)) ans_loss = ans_loss_neg_likelihood + ans_loss_rank #ans words test test_ans_scores_matrix = add_HLs_2_tensor3( test_word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para, norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, true_batch_size, true_p_len) test_ans_scores_matrix = test_ans_scores_matrix * para_mask #T.nnet.softmax(test_ans_scores_matrix) #(batch, para_len) ans_gram_1 = test_ans_scores_matrix ans_gram_2 = (test_ans_scores_matrix[:, :-1] + test_ans_scores_matrix[:, 1:] ) / 2.0 #(batch* hidden_size, maxsenlen-1) ans_gram_3 = (test_ans_scores_matrix[:, :-2] + test_ans_scores_matrix[:, 1:-1] + test_ans_scores_matrix[:, 2:] ) / 3.0 #(batch* hidden_size, maxsenlen-2) ans_gram_4 = ( test_ans_scores_matrix[:, :-3] + test_ans_scores_matrix[:, 1:-2] + test_ans_scores_matrix[:, 2:-1] + test_ans_scores_matrix[:, 3:] ) / 4.0 #(batch* hidden_size, maxsenlen-3) ans_gram_5 = ( test_ans_scores_matrix[:, :-4] + test_ans_scores_matrix[:, 1:-3] + test_ans_scores_matrix[:, 2:-2] + test_ans_scores_matrix[:, 3:-1] + test_ans_scores_matrix[:, 4:]) / 5.0 #(batch* hidden_size, maxsenlen-4) ans_word_scores = T.concatenate( [ans_gram_1, ans_gram_2, ans_gram_3, ans_gram_4, ans_gram_5], axis=1) #(batch, hidden_size, maxsenlen-(0+1+2+3+4)) ''' form test spans and masks ''' test_span_word_scores_matrix = word_pair_scores + ans_word_scores #test_span_scores_matrix+ test_spans_mask_1 = para_mask test_spans_mask_2 = para_mask[:, : -1] * para_mask[:, 1:] #(batch* hidden_size, maxsenlen-1) test_spans_mask_3 = para_mask[:, : -2] * para_mask[:, 1: -1] * para_mask[:, 2:] #(batch* hidden_size, maxsenlen-2) test_spans_mask_4 = para_mask[:, : -3] * para_mask[:, 1: -2] * para_mask[:, 2: -1] * para_mask[:, 3:] #(batch* hidden_size, maxsenlen-3) test_spans_mask_5 = para_mask[:, : -4] * para_mask[:, 1: -3] * para_mask[:, 2: -2] * para_mask[:, 3: -1] * para_mask[:, 4:] test_spans_mask = T.concatenate([ test_spans_mask_1, test_spans_mask_2, test_spans_mask_3, test_spans_mask_4, test_spans_mask_5 ], axis=1) #(batch, 5*p_len -) # test_return=T.argmax(test_span_word_scores_matrix, axis=1) #batch T.argmax(test_span_word_scores_matrix*test_spans_mask, axis=1) #batch test_return = T.argmax(test_span_word_scores_matrix * test_spans_mask, axis=1) #batch # params = [embeddings,char_embeddings]+NN_para+[U_a] params = ( [embeddings, char_embeddings, extra_embeddings, 
distance_embeddings] + CNN_para # +[span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para] + [start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] + [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para] + [ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para]) L2_reg = L2norm_paraList([ embeddings, char_embeddings, extra_embeddings, distance_embeddings, conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, conv_W_3, conv_W_3_q, # conv_W_4, conv_W_5,conv_W_4_q, conv_W_5_q, # span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para, start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para, end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para, ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para ]) #L2_reg = L2norm_paraList(params) cost = word_loss + ans_loss + L2_weight * L2_reg #span_loss+ accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([ paragraph, questions, span_indices, word_indices, ans_indices, para_mask, q_mask, extra, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], cost, updates=updates, on_unused_input='ignore') test_model = theano.function([ paragraph, questions, para_mask, q_mask, extra, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / test_batch_size # remain_test=test_size%batch_size test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.Random(200).shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_ids[para_id:para_id + batch_size] boundary_labels_batch = train_word_label_list[train_id_batch] ans_label_list = [] for i in range(batch_size): start = boundary_labels_batch[i][0] + i * p_len_limit end = boundary_labels_batch[i][1] + i * p_len_limit ans_label_list += range(start, end + 1) ans_label_list = numpy.asarray(ans_label_list, dtype='int32') cost_i += train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_span_label_list[train_id_batch], boundary_labels_batch, ans_label_list, train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_extras[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict = {} q_amount = 0 for test_para_id in test_batch_start: batch_predict_ids = test_model( test_para_list[test_para_id:test_para_id + test_batch_size], test_Q_list[test_para_id:test_para_id + test_batch_size], test_para_mask[test_para_id:test_para_id + test_batch_size], test_Q_mask[test_para_id:test_para_id + test_batch_size], test_para_extras[test_para_id:test_para_id + test_batch_size], test_para_char_list[test_para_id:test_para_id + test_batch_size], test_Q_char_list[test_para_id:test_para_id + test_batch_size], test_para_char_mask[test_para_id:test_para_id + test_batch_size], test_Q_char_mask[test_para_id:test_para_id + test_batch_size], test_p_len_limit) test_para_wordlist_batch = test_para_wordlist_list[ test_para_id:test_para_id + test_batch_size] q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size] q_amount += test_batch_size for q in range(test_batch_size): #for each question pred_ans = decode_predict_id( batch_predict_ids[q], test_para_wordlist_batch[q]) q_id = q_ids_batch[q] pred_dict[q_id] = pred_ans # print q_id, test_para_wordlist_batch[q],'\t',pred_ans with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt') if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_exact_acc
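# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original training graph): the ranking
# terms above (start_loss_rank, end_loss_rank, ans_loss_rank) are pairwise
# hinge losses: every score at a gold position is paired with every score at
# a non-gold position and penalised whenever it fails to beat that negative
# by at least `margin`. A minimal NumPy stand-in for a single example with
# one score per paragraph position could look like this.
import numpy as np

def pairwise_margin_rank_loss(scores, gold_index, margin=0.5):
    """scores: 1-D array of per-position scores; gold_index: int gold position."""
    pos = scores[gold_index]                    # score of the gold position
    neg = np.delete(scores, gold_index)         # scores of every other position
    # hinge on each (positive, negative) pair, mirroring the
    # T.maximum(0.0, margin - repeat_posi + repeat_nega) term above
    return np.maximum(0.0, margin - pos + neg).mean()

# toy usage: the loss drops to zero once the gold score beats every
# other position by at least `margin`
_example_scores = np.array([0.1, 0.05, 0.7, 0.15])
assert pairwise_margin_rank_loss(_example_scores, gold_index=2, margin=0.5) == 0.0
assert pairwise_margin_rank_loss(_example_scores, gold_index=0, margin=0.5) > 0.0
# --------------------------------------------------------------------------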
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=300, test_batch_size=10000, emb_size=50, hidden_size=50, L2_weight=0.0001, para_len_limit=70, q_len_limit=20, pred_q_len_limit=50, top_n_Qwords=1): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id = {} train_para_list, train_para_mask, train_Q_list, train_Q_mask, train_start_list, train_end_list, _, word2id = load_QGQA( word2id, para_len_limit, q_len_limit, top_n_Qwords, True) train_size = len(train_para_list) if train_size != len(train_Q_list) or train_size != len( train_start_list) or train_size != len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_para_mask, test_Q_list, test_Q_mask, test_start_list, test_end_list, _, word2id = load_QGQA( word2id, para_len_limit, q_len_limit, top_n_Qwords, False) test_size = len(test_para_list) train_para_list = np.asarray(train_para_list, dtype='int32') train_para_mask = np.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_list = np.asarray(train_Q_list, dtype='int32') train_Q_mask = np.asarray(train_Q_mask, dtype=theano.config.floatX) train_start_list = np.asarray(train_start_list, dtype='int32') train_end_list = np.asarray(train_end_list, dtype='int32') test_para_list = np.asarray(test_para_list, dtype='int32') test_para_mask = np.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_list = np.asarray(test_Q_list, dtype='int32') test_Q_mask = np.asarray(test_Q_mask, dtype=theano.config.floatX) test_start_list = np.asarray(test_start_list, dtype='int32') test_end_list = np.asarray(test_end_list, dtype='int32') vocab_size = len(word2id) + 1 # shared_decoder_mask = [0]*vocab_size # shared_decoder_mask[0]=1#we need this pad token in generated text # for id in train_top_Q_wordids: # shared_decoder_mask[id]=1 # shared_decoder_mask=theano.shared(value=np.asarray(shared_decoder_mask, dtype='int32'), borrow=True) # rand_values = random_value_normal((vocab_size, emb_size), theano.config.floatX, np.random.RandomState(1234)) rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) train_top_Q_wordids = set() wh_words = [ 'What', 'Which', 'Where', 'When', 'Who', 'Whom', 'Whose', 'Why', 'How', 'far', 'many', 'much', 'long' ] for word in wh_words: idd = word2id.get(word) if idd is not None: train_top_Q_wordids.add(idd) iddd = word2id.get(word.lower()) if iddd is not None: train_top_Q_wordids.add(iddd) paragraph = T.imatrix('paragraph') questions_encoderIDs = T.imatrix() # is ground truth, questions_decoderIDS = T.imatrix( ) #note we convert then from encoder vocab id to decoder vocab id decoder_vocab = T.ivector() decoder_mask = T.fmatrix() #(batch, decoder_vocab_size) start_indices = T.ivector() #batch end_indices = T.ivector() #batch para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] paragraph_input = embeddings[paragraph.flatten()].reshape( (true_batch_size, para_len_limit, emb_size)).dimshuffle(0, 2, 1) #(batch, emb_size, para_len) q_input = embeddings[questions_encoderIDs.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)).dimshuffle(0, 2, 1) decoder_vocab_embs = embeddings[decoder_vocab] fwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size) bwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size) paragraph_para = fwd_LSTM_para_dict.values() + bwd_LSTM_para_dict.values( ) # .values returns a list of parameters paragraph_model = Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate( paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3 = paragraph_model.output_tensor #(batch, 2*hidden, paralen) batch_ids = T.arange(true_batch_size) ans_heads = paragraph_reps_tensor3[batch_ids, :, start_indices] ans_tails = paragraph_reps_tensor3[batch_ids, :, end_indices] l_context_heads = paragraph_reps_tensor3[:, :, 0] l_context_tails = paragraph_reps_tensor3[batch_ids, :, start_indices - 1] r_context_heads = paragraph_reps_tensor3[batch_ids, :, end_indices + 1] r_context_tails = paragraph_reps_tensor3[:, :, -1] encoder_reps = T.concatenate([ l_context_heads, l_context_tails, ans_heads, ans_tails, r_context_heads, r_context_tails ], axis=1) #(batch, 6*2hidden_size) decoder_para_dict = create_LSTM_para(rng, emb_size + 12 * hidden_size, emb_size) attention_para_dict1 = create_LSTM_para(rng, 2 * hidden_size, hidden_size) attention_para_dict2 = create_LSTM_para(rng, 2 * hidden_size, hidden_size) ''' train ''' groundtruth_as_input = T.concatenate([ T.alloc(np.asarray(0., dtype=theano.config.floatX), true_batch_size, emb_size, 1), q_input[:, :, :-1] ], axis=2) # decoder = LSTM_Decoder_Train_with_Mask(groundtruth_as_input, encoder_reps, decoder_vocab_embs, q_mask, emb_size, decoder_para_dict) #X, Encoder_Tensor_Rep, Encoder_Mask, start_indices, end_indices, vocab_embs, Mask, emb_size, hidden_size, tparams, attention_para_dict1, attention_para_dict2 decoder = LSTM_Decoder_Train_with_Attention( groundtruth_as_input, paragraph_reps_tensor3, para_mask, start_indices, end_indices, decoder_vocab_embs, q_mask, emb_size, hidden_size, decoder_para_dict, attention_para_dict1, attention_para_dict2) prob_matrix = decoder.prob_matrix #(batch*senlen, decoder_vocab_size) probs = prob_matrix[T.arange(true_batch_size * q_len_limit), questions_decoderIDS.flatten()] mask_probs = probs[(q_mask.flatten()).nonzero()] #we shift question word ids so that in current step, the prob of previsouly predicted id gets lower and lower shifted_question_ids = T.concatenate([ T.alloc(np.asarray(0, dtype='int32'), true_batch_size, 1), questions_decoderIDS[:, :-1] ], axis=1) probs_to_minimize = prob_matrix[T.arange(true_batch_size * q_len_limit), shifted_question_ids.flatten()] mask_probs_to_minimize = probs_to_minimize[(q_mask.flatten()).nonzero()] #loss train loss = -T.mean(T.log(mask_probs)) + T.mean(T.exp(mask_probs_to_minimize)) cost = loss #+ConvGRU_1.error# params = [embeddings] + paragraph_para + decoder_para_dict.values( ) + attention_para_dict1.values() + attention_para_dict2.values() accumulator = [] for para_i in params: eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in 
zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) # #test decoder mask # raw_masks = T.zeros((true_batch_size, vocab_size), dtype='int32') # x_axis = T.repeat(T.arange(true_batch_size).dimshuffle(0,'x'), paragraph.shape[1], axis=1) # input_specific_masks = T.set_subtensor(raw_masks[x_axis.flatten(),paragraph.flatten()],1) # overall_test_decoder_mask = T.or_(input_specific_masks, shared_decoder_mask.dimshuffle('x',0)) #(batch, vocab_size) # overall_test_decoder_mask=(1.0-overall_test_decoder_mask)*(overall_test_decoder_mask-10) ''' testing ''' # test_decoder = LSTM_Decoder_Test_with_Mask(q_len_limit, encoder_reps, decoder_vocab_embs, emb_size, decoder_para_dict) #nsteps, Encoder_Tensor_Rep, Encoder_Mask, start_indices, end_indices, vocab_embs, emb_size,hidden_size, tparams,attention_para_dict1, attention_para_dict2 test_decoder = LSTM_Decoder_Test_with_Attention( pred_q_len_limit, paragraph_reps_tensor3, para_mask, start_indices, end_indices, decoder_vocab_embs, decoder_mask, emb_size, hidden_size, decoder_para_dict, attention_para_dict1, attention_para_dict2) predictions = test_decoder.output_id_matrix #(batch, q_len_limit) train_model = theano.function([ paragraph, questions_encoderIDs, questions_decoderIDS, decoder_vocab, start_indices, end_indices, para_mask, q_mask ], cost, updates=updates, on_unused_input='ignore') test_model = theano.function([ paragraph, decoder_vocab, decoder_mask, start_indices, end_indices, para_mask ], predictions, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / test_batch_size remain_test = test_size % test_batch_size test_batch_start = list(np.arange(n_test_batches) * test_batch_size) + [test_size - remain_test] max_bleuscore = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 sub_Qs = train_Q_list[para_id:para_id + batch_size] decoder_vocab_set = train_top_Q_wordids | set( list(np.unique(sub_Qs))) decoder_vocab_batch = sorted( decoder_vocab_set) # a list of ids in order map_encoderid2decoderid = {} for encoderID in decoder_vocab_set: decoderID = decoder_vocab_batch.index(encoderID) map_encoderid2decoderid[encoderID] = decoderID Decoder_train_Q_list = [] for id in sub_Qs.flatten(): Decoder_train_Q_list.append(map_encoderid2decoderid.get(id)) Decoder_train_Q_list = np.asarray(Decoder_train_Q_list, dtype='int32').reshape( (batch_size, sub_Qs.shape[1])) decoder_vocab_batch = np.asarray(decoder_vocab_batch, dtype='int32') cost_i += train_model( train_para_list[para_id:para_id + batch_size], train_Q_list[para_id:para_id + batch_size], Decoder_train_Q_list, decoder_vocab_batch, train_start_list[para_id:para_id + batch_size], train_end_list[para_id:para_id + batch_size], train_para_mask[para_id:para_id + batch_size], train_Q_mask[para_id:para_id + batch_size]) #print iter if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' # print 'Testing...' 
past_time = time.time() outputfile = codecs.open('output.txt', 'w', 'utf-8') referencefile = codecs.open('reference.txt', 'w', 'utf-8') bleu_scores = [] for idd, test_para_id in enumerate(test_batch_start): sub_Qs = test_Q_list[test_para_id:test_para_id + test_batch_size] decoder_vocab_set = train_top_Q_wordids | set( list(np.unique(sub_Qs))) decoder_vocab_batch = sorted( decoder_vocab_set) # a list of ids in order map_decoderid2encoderid = {} for encoderID in decoder_vocab_set: decoderID = decoder_vocab_batch.index(encoderID) map_decoderid2encoderid[decoderID] = encoderID if idd == len(test_batch_start) - 1: true_test_batch_size = remain_test else: true_test_batch_size = test_batch_size #decoder mask batch decoder_mask_batch = [] for i in range(true_test_batch_size): Q_i = test_Q_list[test_para_id + i] decoder_vocab_Q = train_top_Q_wordids | set( list(np.unique(Q_i))) decoder_mask_ind = [] for ele in decoder_vocab_batch: if ele in decoder_vocab_Q: decoder_mask_ind.append(1.0) else: decoder_mask_ind.append(0.0) decoder_mask_batch.append(decoder_mask_ind) decoder_mask_batch = np.asarray(decoder_mask_batch, dtype=theano.config.floatX) decoder_vocab_batch = np.asarray(decoder_vocab_batch, dtype='int32') pred_id_in_batch = test_model( test_para_list[test_para_id:test_para_id + test_batch_size], decoder_vocab_batch, decoder_mask_batch, test_start_list[test_para_id:test_para_id + test_batch_size], test_end_list[test_para_id:test_para_id + test_batch_size], test_para_mask[test_para_id:test_para_id + test_batch_size]) #(batch, senlen) ground_truths = sub_Qs ground_mask = test_Q_mask[test_para_id:test_para_id + test_batch_size] back_pred_id_in_batch = [ map_decoderid2encoderid.get(id) for id in pred_id_in_batch.flatten() ] for i in range(true_test_batch_size): # print 'pred_id_in_batch[i]:', pred_id_in_batch[i] refined_preds, refined_g = refine_decoder_predictions( back_pred_id_in_batch[i * pred_q_len_limit:(i + 1) * pred_q_len_limit], ground_truths[i], ground_mask[i]) # bleu_i = nltk.translate.bleu_score.sentence_bleu([refined_g], refined_preds) # bleu_scores.append(bleu_i) pred_q = '' prev_w = '' for id in refined_preds: word = id2word.get(id) if word.isalnum(): if word != prev_w: pred_q += ' ' + word prev_w = word outputfile.write(pred_q + ' ?\n') referencefile.write( ' '.join([id2word.get(id) for id in refined_g]) + '\n') # bleuscore = np.average(np.array(bleu_scores)) outputfile.close() referencefile.close() system('perl multi-bleu.perl reference.txt < output.txt') # if max_bleuscore < bleuscore: # max_bleuscore = bleuscore # print '\t\t\t\t\t\t current bleu: ', bleuscore, ' ; max bleu:', max_bleuscore if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
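# --------------------------------------------------------------------------
# Illustrative sketch (assumption: a standalone helper, not taken from the
# original file): the batching loop above builds a per-batch decoder
# vocabulary as the union of the wh-word ids and all word ids in the batch's
# questions, then remaps encoder ids to compact decoder ids with
# decoder_vocab_batch.index(), which costs O(|vocab|) per lookup. The same
# mapping with a dict is linear overall and easier to invert for decoding.
import numpy as np

def build_decoder_vocab(batch_question_ids, wh_word_ids):
    """Return (sorted decoder-vocab id array, encoder_id -> decoder_id dict)."""
    decoder_vocab = sorted(set(wh_word_ids) | set(np.unique(batch_question_ids)))
    encoder2decoder = {enc_id: dec_id for dec_id, enc_id in enumerate(decoder_vocab)}
    return np.asarray(decoder_vocab, dtype='int32'), encoder2decoder

# toy usage with hypothetical word ids: questions are re-indexed into the
# small per-batch decoder id space; inverting the dict recovers encoder ids
_qs = np.array([[7, 3, 3, 0], [9, 7, 0, 0]])
_vocab, _enc2dec = build_decoder_vocab(_qs, wh_word_ids={1, 2})
_decoder_ids = np.vectorize(_enc2dec.get)(_qs)
_dec2enc = {v: k for k, v in _enc2dec.items()}
# --------------------------------------------------------------------------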
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus( rootPath + 'vocab.txt', rootPath + 'mc500.train.tsv_standardlized.txt', rootPath + 'mc500.test.tsv_standardlized.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') [ train_data_D, train_data_Q, train_data_A, train_Y, train_Label, train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A, train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, train_rightPad_D, train_rightPad_D_s, train_rightPad_Q, train_rightPad_A ] = train_data [ test_data_D, test_data_Q, test_data_A, test_Y, test_Label, test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 
'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A = T.lvector() y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() len_Q = T.lscalar() len_A = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() left_Q = T.lscalar() left_A = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() right_Q = T.lscalar() right_A = T.lscalar() #wmf=T.dmatrix() cost_tmp = T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_Q_input = embeddings[index_Q.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A_input = embeddings[index_A.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) # load_model_for_conv1([conv_W, conv_b]) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_Q = Conv_with_input_para( rng, input=layer0_Q_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A = Conv_with_input_para( rng, input=layer0_A_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') layer0_Q_output = debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A_output = debug_print(layer0_A.output, 'layer0_A.output') layer0_para = [conv_W, conv_b] layer1_DQ = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_Q + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A_output, kern=nkerns[0], 
left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A, right_r=right_A, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc layer2_DQ = Conv_with_input_para( rng, input=layer1_DQ.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA = Conv_with_input_para( rng, input=layer1_DA.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights layer2_Q = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DQ.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_Q_output_sent_rep_Dlevel = debug_print( layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A_output_sent_rep_Dlevel = debug_print( layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel') layer2_para = [conv2_W, conv2_b] layer3_DQ = Average_Pooling_for_Top( rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA = Average_Pooling_for_Top( rng, input_l=layer2_DA.output, input_r=layer2_A_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) #high-way high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1]) transform_gate_DQ = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b), 'transform_gate_DA') transform_gate_Q = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b), 'transform_gate_A') highW_para = [high_W, high_b] overall_D_Q = debug_print( (1.0 - transform_gate_DQ) * layer1_DQ.output_D_sent_level_rep + transform_gate_DQ * layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A = ( 1.0 - transform_gate_DA ) * layer1_DA.output_D_sent_level_rep + transform_gate_DA * layer3_DA.output_D_doc_level_rep overall_Q = ( 1.0 - transform_gate_Q ) * layer1_DQ.output_QA_sent_level_rep + transform_gate_Q * layer2_Q.output_sent_rep_Dlevel overall_A = ( 1.0 - transform_gate_A ) * layer1_DA.output_QA_sent_level_rep + transform_gate_A * layer2_A.output_sent_rep_Dlevel simi_sent_level = 
debug_print( cosine( layer1_DQ.output_D_sent_level_rep + layer1_DA.output_D_sent_level_rep, layer1_DQ.output_QA_sent_level_rep + layer1_DA.output_QA_sent_level_rep), 'simi_sent_level') simi_doc_level = debug_print( cosine( layer3_DQ.output_D_doc_level_rep + layer3_DA.output_D_doc_level_rep, layer2_Q.output_sent_rep_Dlevel + layer2_A.output_sent_rep_Dlevel), 'simi_doc_level') simi_overall_level = debug_print( cosine(overall_D_Q + overall_D_A, overall_Q + overall_A), 'simi_overall_level') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) layer4_input = debug_print( T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level], axis=1), 'layer4_input') #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4 = LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer4.W**2).sum() + (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this = debug_print(layer4.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') # # [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, # train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, # train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, # train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data # [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, # test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, # test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, # test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data # index = T.lscalar() # index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() # index_A= T.lvector() # # y = T.lvector() # len_D=T.lscalar() # len_D_s=T.lvector() # len_Q=T.lscalar() # len_A=T.lscalar() # # left_D=T.lscalar() # left_D_s=T.lvector() # left_Q=T.lscalar() # left_A=T.lscalar() # # right_D=T.lscalar() # right_D_s=T.lvector() # right_Q=T.lscalar() # right_A=T.lscalar() # # # #wmf=T.dmatrix() # cost_tmp=T.dscalar() test_model = theano.function( [index], [layer4.errors(y), layer4_input, y, layer4.prop_for_posi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A: test_data_A[index], y: test_Y[index:index + batch_size], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A: test_Length_A[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A: test_leftPad_A[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A: test_rightPad_A[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params + layer2_para + layer0_para + highW_para accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all 
model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index + batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer4.errors(y), layer4_input, y], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index + batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 # readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r') # train_pairs=[] # train_y=[] # for line in readfile: # tokens=line.strip().split('\t') # listt=tokens[0]+'\t'+tokens[1] # train_pairs.append(listt) # train_y.append(tokens[2]) # readfile.close() # writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w') for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % (batch_start * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str(cost_average) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses = [] test_y = [] test_features = [] test_prop = [] for i in test_batch_start: test_loss, layer3_input, y, posi_prop = test_model(i) test_prop.append(posi_prop[0][0]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() #test_score = numpy.mean(test_losses) test_acc = compute_test_acc(test_y, test_prop) #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC( kernel='linear' ) #OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results = clf.decision_function(test_features) lr = linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr = lr.decision_function(test_features) acc_svm = compute_test_acc(test_y, results) acc_lr = compute_test_acc(test_y, results_lr) find_better = False if acc_svm > max_acc: max_acc = acc_svm best_epoch = epoch find_better = True if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True if acc_lr > max_acc: max_acc = acc_lr best_epoch = epoch find_better = True print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' # if find_better==True: # 
store_model_to_file(layer2_para, best_epoch) # print 'Finished storing best conv params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
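# --------------------------------------------------------------------------
# Illustrative sketch (assumption: a plain NumPy stand-in, not the Theano
# graph above): the "high-way" block above gates between the sentence-level
# and the document-level representation of the same object,
#     gate = sigmoid(W.dot(sent_rep) + b)
#     out  = (1 - gate) * sent_rep + gate * doc_rep,
# so the model learns, per dimension, how much document-level context to mix
# into the sentence-level representation.
import numpy as np

def highway_combine(sent_rep, doc_rep, W, b):
    """sent_rep, doc_rep: (dim,) vectors; W: (dim, dim); b: (dim,)."""
    gate = 1.0 / (1.0 + np.exp(-(W.dot(sent_rep) + b)))   # sigmoid transform gate
    return (1.0 - gate) * sent_rep + gate * doc_rep        # per-dimension mixture

# toy usage with hypothetical shapes (dim = 4)
_rng = np.random.RandomState(0)
_out = highway_combine(_rng.randn(4), _rng.randn(4), _rng.randn(4, 4), np.zeros(4))
# --------------------------------------------------------------------------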
def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500, emb_size=300, hidden_size=300, L2_weight=0.0001, para_len_limit=700, q_len_limit=40): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) id2word = {y:x for x,y in overall_word2id.iteritems()} word2vec=load_word2vec() rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) # # # BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5) # U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) para_reps=paragraph_model.output_tensor #(batch, emb, para_len) Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size) Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #attention distributions W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features norm_W_a1=normalize_matrix(W_a1) norm_W_a2=normalize_matrix(W_a2) norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras=[W_a1, W_a2, U_a, LR_b] transformed_para_reps=T.tanh(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2)) transformed_q_reps=T.tanh(T.dot(questions_reps, norm_W_a1)) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both=0.5*(transformed_para_reps+transformed_q_reps) prior_att=T.concatenate([add_both, normalize_matrix(extraF)], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices=para_mask.flatten().nonzero()[0] layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.mean(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y]) distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) masked_dis=distributions*para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) 
pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* ''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = [embeddings]+paragraph_para+Q_para+attention_paras L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=error#+L2_weight*L2_reg accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], error, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_exact_acc=0.0 cost_i=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #shuffle(train_batch_start) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( np.asarray(train_para_list[para_id:para_id+batch_size], dtype='int32'), np.asarray(train_Q_list[para_id:para_id+batch_size], dtype='int32'), np.asarray(train_label_list[para_id:para_id+batch_size], dtype='int32'), np.asarray(train_para_mask[para_id:para_id+batch_size], dtype=theano.config.floatX), np.asarray(train_mask[para_id:para_id+batch_size], dtype=theano.config.floatX), np.asarray(train_feature_matrixlist[para_id:para_id+batch_size], dtype=theano.config.floatX)) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() exact_match=0.0 q_amount=0 for test_para_id in test_batch_start: distribution_matrix=test_model( np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size] para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size] paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] para_len=len(test_para_wordlist_list[0]) if para_len!=len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount+=batch_size # print q_size # print test_para_word_list for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q]) q_gold_ans_set=para_gold_ansset_list[q] F1=MacroF1(pred_ans, q_gold_ans_set) exact_match+=F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 
'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) exact_acc=exact_match/q_amount if exact_acc> max_exact_acc: max_exact_acc=exact_acc print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
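# --------------------------------------------------------------------------
# Illustrative sketch (assumption: a NumPy stand-in for the shared-variable
# updates built in each function of this file): every training routine here
# uses the same AdaGrad rule with one accumulator per parameter,
#     acc   <- acc + g^2
#     param <- param - lr * g / (sqrt(acc) + eps),
# so coordinates that keep receiving large gradients take progressively
# smaller steps.
import numpy as np

def adagrad_step(param, grad, acc, learning_rate=0.5, eps=1e-8):
    """Return the updated (param, acc) pair after one AdaGrad step."""
    acc = acc + np.square(grad)                               # running sum of squared grads
    param = param - learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

# toy usage: the coordinate with repeated large gradients shrinks its step
_p, _a = np.zeros(2), np.zeros(2)
for _ in range(3):
    _p, _a = adagrad_step(_p, np.array([1.0, 0.01]), _a)
# --------------------------------------------------------------------------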
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=7, maxSentLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00005, update_freq=10, norm_threshold=5.0): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_msr_corpus(rootPath + 'vocab.txt', rootPath + 'tokenized_train.txt', rootPath + 'tokenized_test.txt', maxSentLength) mtPath = '/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test = load_mts(mtPath + 'concate_15mt_train.txt', mtPath + 'concate_15mt_test.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int32') indices_train_r = T.cast(indices_train_r, 'int32') indices_test_l = T.cast(indices_test_l, 'int32') indices_test_r = T.cast(indices_test_r, 'int32') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) cost_tmp = 0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.imatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.imatrix('x_index_r') y = T.ivector('y') left_l = T.iscalar() right_l = T.iscalar() left_r = T.iscalar() right_r = T.iscalar() length_l = T.iscalar() length_r = T.iscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size = (emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output') layer0_para = [conv_W, conv_b] layer1 = Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1, window_size=window_width, maxSentLength=maxSentLength) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1])) layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_para = [conv2_W, conv2_b] layer3 = Average_Pooling_for_batch1(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1) layer3_out = debug_print(layer3.output_simi, 'layer1_out') #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) ''' uni_cosine=cosine(sum_uni_l, sum_uni_r) linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r)) #25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer4_input = T.concatenate( [mts, eucli_1, 
layer1.output_eucli, layer3_out, len_l, len_r], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4 = LogisticRegression(rng, input=layer4_input, n_in=15 + 3 + 2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer4.W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum() cost_this = debug_print(layer4.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') test_model = theano.function( [index], [layer4.errors(y), layer4.y_pred], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params + layer2_para + layer0_para # + layer1.params accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') #norm=T.sqrt((grad_i**2).sum()) #if T.lt(norm_threshold, norm): # print 'big norm' # grad_i=grad_i*(norm_threshold/norm) acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, layer4.errors(y), layer4_input], updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer4.errors(y)], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) if iter % update_freq != 0: cost_ij, error_ij = train_model_predict(batch_start) #print 'cost_ij: ', cost_ij cost_tmp += cost_ij error_sum += error_ij else: cost_average, error_ij, layer3_input = train_model(batch_start) #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq) error_sum = 0 cost_tmp = 0 #reset for the next batch #print layer3_input #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses = [] for i in test_batch_start: test_loss, pred_y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) print(( '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' #print 'validating & testing...' 
# compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
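# --- Illustration (not part of the original script): the training functions above and
# below all build the same hand-rolled AdaGrad update, acc <- acc + grad**2 and
# param <- param - lr * grad / sqrt(acc) (some variants add 1e-10 inside the sqrt).
# A minimal numpy sketch of that rule on a toy quadratic:
import numpy

def adagrad_step(param, grad, acc, learning_rate=0.085, eps=1e-10):
    acc = acc + grad ** 2                                  # acc_i + T.sqr(grad_i)
    param = param - learning_rate * grad / numpy.sqrt(acc + eps)
    return param, acc

w = numpy.array([5.0, -3.0])
acc = numpy.zeros_like(w)
for step in xrange(2000):
    grad = 2.0 * w                                         # gradient of sum(w**2)
    w, acc = adagrad_step(w, grad, acc)
print w                                                    # both entries shrink toward 0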
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5], emb_size=50, char_emb_size=4, hidden_size=200, margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, neg_all=100, train_size=200, test_size=200, mark='_forfun'): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/' triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt'] rng = numpy.random.RandomState(23455) datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_train(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len print 'vocab_size:', vocab_size, 'char_size:', char_size train_data=datasets # valid_data=datasets[1] test_data=datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # train_pos_entity_char=train_data[0] train_pos_entity_des=train_data[1] train_relations=train_data[2] train_entity_char_lengths=train_data[3] train_entity_des_lengths=train_data[4] train_relation_lengths=train_data[5] train_mention_char_ids=train_data[6] train_remainQ_word_ids=train_data[7] train_mention_char_lens=train_data[8] train_remainQ_word_len=train_data[9] train_entity_scores=train_data[10] test_pos_entity_char=test_data[0] test_pos_entity_des=test_data[1] test_relations=test_data[2] test_entity_char_lengths=test_data[3] test_entity_des_lengths=test_data[4] test_relation_lengths=test_data[5] test_mention_char_ids=test_data[6] test_remainQ_word_ids=test_data[7] test_mention_char_lens=test_data[8] test_remainQ_word_len=test_data[9] test_entity_scores=test_data[10] # # test_pos_entity_char=test_data[0] #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51 # test_pos_entity_des=test_data[1] #matrix, each row for a examle: 20*2*51 # test_relations=test_data[2] #matrix, each row for a example: 5*51 # test_entity_char_lengths=test_data[3] #matrix, each row for a example: 3*2*51 (three valies for one entity) # test_entity_des_lengths=test_data[4] #matrix, each row for a example: 3*2*51 (three values for one entity) # test_relation_lengths=test_data[5] #matrix, each row for a example: 3*51 # test_mention_char_ids=test_data[6] #matrix, each row for a mention: 40 # test_remainQ_word_ids=test_data[7] #matrix, each row for a question: 30 # test_mention_char_lens=test_data[8] #matrix, each three values for a mention: 3 # test_remainQ_word_len=test_data[9] #matrix, each three values for a remain question: 3 train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\ len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)] if sum(train_sizes)/len(train_sizes)!=train_size: print 'weird size:', train_sizes exit(0) test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), 
len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\ len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)] if sum(test_sizes)/len(test_sizes)!=test_size: print 'weird size:', test_sizes exit(0) n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char) indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des) indices_train_relations=pythonList_into_theanoIntMatrix(train_relations) indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths) indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths) indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths) indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids) indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids) indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens) indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len) indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) # indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char) # indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des) # indices_test_relations=pythonList_into_theanoIntMatrix(test_relations) # indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths) # indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths) # indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths) # indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids) # indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids) # indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens) # indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len) # indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores) rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt') embeddings=theano.shared(value=rand_values, borrow=True) char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234)) char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data index = T.lscalar() chosed_indices=T.lvector() ent_char_ids_M = T.lmatrix() ent_lens_M = T.lmatrix() men_char_ids_M = T.lmatrix() men_lens_M=T.lmatrix() rel_word_ids_M=T.lmatrix() rel_word_lens_M=T.lmatrix() desH_word_ids_M=T.lmatrix() desH_word_lens_M=T.lmatrix() # desT_word_ids_M=T.lmatrix() # desT_word_lens_M=T.lmatrix() q_word_ids_M=T.lmatrix() q_word_lens_M=T.lmatrix() ent_scores=T.dvector() #max_char_len, max_des_len, 
max_relation_len, max_Q_len # ent_men_ishape = (char_emb_size, max_char_len) # this is the size of MNIST images # rel_ishape=(emb_size, max_relation_len) # des_ishape=(emb_size, max_des_len) # q_ishape=(emb_size, max_Q_len) filter_size=(emb_size,window_width[0]) char_filter_size=(char_emb_size, window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1]) word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1]) char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape) q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b] char_conv_W_into_matrix=char_conv_W.reshape((char_conv_W.shape[0], char_conv_W.shape[2]*char_conv_W.shape[3])) q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3])) q_desH_conv_W_into_matrix=q_desH_conv_W.reshape((q_desH_conv_W.shape[0], q_desH_conv_W.shape[2]*q_desH_conv_W.shape[3])) # load_model_from_file(rootPath, params, '') def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f, desH_word_lens_f, men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH q_desH_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) desH_conv = Conv_with_input_para(rng, input=desH_word_input, image_shape=(batch_size, 1, 
emb_size, max_des_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) # #q_desT # q_desT_conv = Conv_with_input_para(rng, input=q_word_input, # image_shape=(batch_size, 1, emb_size, max_Q_len), # filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) # desT_conv = Conv_with_input_para(rng, input=desT_word_input, # image_shape=(batch_size, 1, emb_size, max_des_len), # filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) # ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output') # men_char_output=debug_print(men_char_conv.output, 'men_char.output') ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2]) # q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, dim=max_Q_len+filter_size[1]-1, topk=2) q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) # q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2]) # desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2]) overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\ cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\ 0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0 # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan( SimpleQ_matches_Triple, sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M, desH_word_lens_M, men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M]) simi_list+=0.5*ent_scores posi_simi=simi_list[0] nega_simies=simi_list[1:] loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) loss_simi=T.mean(loss_simi_list) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg= Diversify_Reg(char_conv_W_into_matrix)+Diversify_Reg(q_rel_conv_W_into_matrix)+Diversify_Reg(q_desH_conv_W_into_matrix) cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore') # givens={ # ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)), # ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)), # men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], 
max_char_len)), # men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)), # rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)), # rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)), # desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), # desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)), # # desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], # # desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2], # q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), # q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)), # ent_scores : test_entity_scores[index]}, #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] #+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad # updates.append((acc_i, acc)) if param_i == embeddings: updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size))))) #Ada elif param_i == char_embeddings: updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size))))) #AdaGrad else: updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates, givens={ ent_char_ids_M : indices_train_pos_entity_char[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)), ent_lens_M : indices_train_entity_char_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), men_char_ids_M : indices_train_mention_char_ids[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)), men_lens_M : indices_train_mention_char_lens[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)), rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), desH_word_ids_M : indices_train_pos_entity_des[index].reshape((neg_all, max_des_len))[chosed_indices].reshape((train_neg_size, max_des_len)), desH_word_lens_M : indices_train_entity_des_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), # desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], # desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2], q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, 
max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)), ent_scores : indices_train_entity_scores[index][chosed_indices] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False best_test_accu=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #print batch_start sample_indices=[0]+random.sample(range(1, neg_all), train_neg_size-1) loss_simi_i, cost_i= train_model(batch_start, sample_indices) # if batch_start%1==0: # print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i # store_model_to_file(rootPath, params) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i #if iter ==1: # exit(0) # if iter % n_train_batches == 0: test_loss=[] succ=0 for i in range(test_size): # print 'testing', i, '...' 
#prepare data test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int64').reshape((length_per_example_test[i], max_char_len)) test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3)) test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int64').reshape((length_per_example_test[i], max_char_len)) test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int64').reshape((length_per_example_test[i], 3)) test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int64').reshape((length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3)) test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape((length_per_example_test[i], max_des_len)) test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int64').reshape((length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int64').reshape((length_per_example_test[i], 3)) test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX) loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M, test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if simi_list_i[0]>=max(simi_list_i[1:]): succ+=1 # print 'testing', i, '...acc:', succ*1.0/(i+1) succ=succ*1.0/test_size #now, check MAP and MRR print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best ' 'model %f') % (epoch, minibatch_index, n_train_batches,succ)) if best_test_accu< succ: best_test_accu=succ store_model_to_file(rootPath, params, mark) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
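# --- Illustration (not part of the original script): the trainer above keeps the gold
# candidate at index 0, draws train_neg_size-1 negatives from the remaining neg_all-1
# candidates, and minimises the hinge ranking loss mean(max(0, margin - simi[0] + simi[1:]));
# at test time an example counts as correct when simi_list[0] >= max(simi_list[1:]).
# A small numpy sketch with illustrative similarity scores:
import random
import numpy

def ranking_loss(simi_list, margin=0.5):
    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    return numpy.mean(numpy.maximum(0.0, margin - posi_simi + nega_simies))

neg_all, train_neg_size = 100, 21
sample_indices = [0] + random.sample(range(1, neg_all), train_neg_size - 1)

simi_list = numpy.random.uniform(0.0, 1.0, train_neg_size)
print ranking_loss(simi_list)               # drops to 0 once the gold wins by the margin
print simi_list[0] >= max(simi_list[1:])    # the test-time success check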
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, nkerns=[90,90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(rootPath+'vocab_DPNQ.txt', rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label, train_Length_D,train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3, train_leftPad_D,train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_rightPad_D,train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3]=train_data [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label, test_Length_D,test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3, test_leftPad_D,test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, 
test_rightPad_D,test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DPNQ_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1= T.lvector() index_A2= T.lvector() index_A3= T.lvector() # index_A4= T.lvector() # y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() # len_Q=T.lscalar() len_A1=T.lscalar() len_A2=T.lscalar() len_A3=T.lscalar() # len_A4=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() # left_Q=T.lscalar() left_A1=T.lscalar() left_A2=T.lscalar() left_A3=T.lscalar() # left_A4=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() # right_Q=T.lscalar() right_A1=T.lscalar() right_A2=T.lscalar() right_A3=T.lscalar() # right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para=[conv_W, conv_b] conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para=[conv2_W, conv2_b] high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para=[high_W, high_b] params = layer2_para+layer0_para+highW_para#+[embeddings] #load_model(params) layer0_D = Conv_with_input_para(rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output') layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output') layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output') # layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1=Average_Pooling_Scan(rng, input_D=layer0_D_output, 
input_r=layer0_A1_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A1+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para(rng, input=layer1_DA1.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A3 = 
Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') # layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2') transform_gate_DA3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3') # transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4') # transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2') # transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, 
layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') # transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1=(1.0-transform_gate_DA1)*layer1_DA1.output_D_sent_level_rep+transform_gate_DA1*layer3_DA1.output_D_doc_level_rep overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep # overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A1=(1.0-transform_gate_A1)*layer1_DA1.output_QA_sent_level_rep+transform_gate_A1*layer2_A1.output_sent_rep_Dlevel overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel # overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel # overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel simi_sent_level1=debug_print(cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') # simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') # simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1=debug_print(cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') # simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') # simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1=debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') # simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1 # simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2 simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0 simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # 
posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1) simi_PQ=cosine(layer1_DA1.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep) simi_NQ=cosine(layer1_DA2.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep) #bad matching at overall level # simi_PQ=cosine(overall_A1, overall_D_A3) # simi_NQ=cosine(overall_A2, overall_D_A3) match_cost=T.maximum(0.0, margin+simi_NQ-simi_PQ) cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1) cost=cost#+match_cost # posi_simi=simi_1 # nega_simi=simi_2 L2_reg =debug_print((high_W**2).sum()+3*(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], # index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], # len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], # left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index] # right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], updates=updates, givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], # 
index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index] # right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], # index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index] # right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
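# Note: patience is set to 5e14 while iter is bounded by n_epochs * n_train_batches,
# so the 'if patience <= iter' early-stopping check below never fires and training
# always runs the full n_epochs; validation_frequency collapses to n_train_batches,
# i.e. the test set is evaluated once per epoch.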
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data posi_train_sent=[] nega_train_sent=[] posi_train_doc=[] nega_train_doc=[] posi_train_overall=[] nega_train_overall=[] for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2= train_model(batch_start) posi_train_sent.append(simi_sent_level1) nega_train_sent.append(simi_sent_level2) posi_train_doc.append(simi_doc_level1) nega_train_doc.append(simi_doc_level2) posi_train_overall.append(simi_overall_level1) nega_train_overall.append(simi_overall_level2) if iter % n_train_batches == 0: corr_train_sent=compute_corr(posi_train_sent, nega_train_sent) corr_train_doc=compute_corr(posi_train_doc, nega_train_doc) corr_train_overall=compute_corr(posi_train_overall, nega_train_overall) print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+'corr rate:'+str(corr_train_sent*300.0/train_size)+' '+str(corr_train_doc*300.0/train_size)+' '+str(corr_train_overall*300.0/train_size) if iter % validation_frequency == 0: posi_test_sent=[] nega_test_sent=[] posi_test_doc=[] nega_test_doc=[] posi_test_overall=[] nega_test_overall=[] for i in test_batch_start: cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2=test_model(i) posi_test_sent.append(simi_sent_level1) nega_test_sent.append(simi_sent_level2) posi_test_doc.append(simi_doc_level1) nega_test_doc.append(simi_doc_level2) posi_test_overall.append(simi_overall_level1) nega_test_overall.append(simi_overall_level2) corr_test_sent=compute_corr(posi_test_sent, nega_test_sent) corr_test_doc=compute_corr(posi_test_doc, nega_test_doc) corr_test_overall=compute_corr(posi_test_overall, nega_test_overall) #write_file.close() #test_score = numpy.mean(test_losses) test_acc_sent=corr_test_sent*1.0/(test_size/3.0) test_acc_doc=corr_test_doc*1.0/(test_size/3.0) test_acc_overall=corr_test_overall*1.0/(test_size/3.0) #test_acc=1-test_score # print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' # 'model %f %%') % # (epoch, minibatch_index, n_train_batches,test_acc * 100.)) print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent*100,test_acc_doc*100,test_acc_overall*100 #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better=False if test_acc_sent > max_acc: max_acc=test_acc_sent best_epoch=epoch find_better=True if test_acc_doc > max_acc: max_acc=test_acc_doc best_epoch=epoch find_better=True if test_acc_overall > max_acc: max_acc=test_acc_overall best_epoch=epoch find_better=True print '\t\t\tmax:', max_acc,'(at',best_epoch,')' if find_better==True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq 
end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
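# Illustrative NumPy sketch (an assumption, not the Theano graph above) of the two
# ingredients the preceding function combines: the per-level margin ranking loss over
# the (sentence, doc, overall) cosine similarities of the positive vs. negative answer,
# and the AdaGrad accumulator update applied to every parameter. Function names and the
# toy numbers are illustrative; the epsilon in the denominator is added here for
# numerical stability and is not present in the graph above.
import numpy as np

def ranking_loss(posi, nega, margin=0.5):
    # hinge at each level: max(0, margin + nega_simi - posi_simi), summed over levels
    return sum(max(0.0, margin + n - p) for p, n in zip(posi, nega))

def adagrad_step(param, grad, acc, learning_rate=0.05, eps=1e-8):
    # accumulate squared gradients, then scale the step per parameter
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

# toy usage: similarities of the positive answer vs. one negative answer
posi = [0.8, 0.6, 0.7]                      # simi_sent/doc/overall_level1
nega = [0.5, 0.4, 0.9]                      # simi_sent/doc/overall_level2
print(ranking_loss(posi, nega))             # summed hinge over the three levels
w, g, a = np.ones(3), np.array([0.1, -0.2, 0.0]), np.zeros(3)
w, a = adagrad_step(w, g, a)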
def evaluate_lenet5(file_name, vocab_file, train_file, dev_file, word2vec_file, learning_rate=0.001, n_epochs=2000, nkerns=[90, 90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=128, max_d_length=128, margin=0.3): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() f = open(file_name, 'w') f.write("model options " + str(model_options) + '\n') rng = numpy.random.RandomState(23455) train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN( vocab_file, train_file, dev_file, max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test f.write('train_size : ' + str(train_size)) [ train_data_D, train_data_A1, train_Label, train_Length_D, train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_rightPad_D, train_rightPad_D_s, train_rightPad_A1 ] = train_data [ test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s, test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) rand_values = load_word2vec_to_init(rand_values, word2vec_file) embeddings = theano.shared(value=rand_values, borrow=True) error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_A1 = T.lvector() y = T.lscalar() len_D = T.lscalar() len_D_s = T.lvector() len_A1 = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() left_A1 = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() right_A1 = T.lscalar() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) ###################### # BUILD ACTUAL MODEL # ###################### f.write('... 
building the model\n') layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para = [conv_W, conv_b] conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para = [conv2_W, conv2_b] high_W, high_b = create_highw_para( rng, nkerns[0], nkerns[1] ) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para = [high_W, high_b] params = layer2_para + layer0_para + highW_para #+[embeddings] layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para( rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output') layer1_DA1 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A1 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer2_DA1 = Conv_with_input_para( rng, input=layer1_DA1.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA1.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1_output_sent_rep_Dlevel = debug_print( layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') layer3_DA1 = Average_Pooling_for_Top( rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) #high-way transform_gate_DA1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_A1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') overall_D_A1 = ( 1.0 - transform_gate_DA1 ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep overall_A1 = ( 1.0 - transform_gate_A1 ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel simi_sent_level1 = debug_print( cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') simi_doc_level1 = debug_print( cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') simi_1 = (simi_overall_level1 + simi_sent_level1 + 
simi_doc_level1) / 3.0 logistic_w, logistic_b = create_logistic_para(rng, 1, 2) logistic_para = [logistic_w, logistic_b] params += logistic_para simi_1 = T.dot(logistic_w, simi_1) + logistic_b.dimshuffle(0, 'x') simi_1 = simi_1.dimshuffle(1, 0) simi_1 = T.nnet.softmax(simi_1) predict = T.argmax(simi_1, axis=1) tmp = T.log(simi_1) cost = T.maximum(0.0, margin + tmp[0][1 - y] - tmp[0][y]) L2_reg = (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum() + ( logistic_w**2).sum() cost = cost + L2_weight * L2_reg test_model = theano.function( [index], [cost, simi_1, predict], givens={ index_D: test_data_D[index], #a matrix index_A1: test_data_A1[index], y: test_Label[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_A1: test_Length_A1[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_A1: test_leftPad_A1[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_A1: test_rightPad_A1[index], }, on_unused_input='ignore') accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, simi_1, predict], updates=updates, givens={ index_D: train_data_D[index], index_A1: train_data_A1[index], y: train_Label[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_A1: train_Length_A1[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_A1: train_leftPad_A1[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_A1: train_rightPad_A1[index], }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### f.write('... training\n') # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data simi_train = [] predict_train = [] for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 cost_average, simi, predict = train_model(batch_start) simi_train.append(simi) predict_train.append(predict) if iter % 1000 == 0: f.write('@iter :' + str(iter) + '\n') if iter % n_train_batches == 0: corr_train = compute_corr_train(predict_train, _train_Label) res = 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + 'corr rate: ' + str( corr_train * 100.0 / train_size) + '\n' f.write(res) if iter % validation_frequency == 0 or iter % 20000 == 0: posi_test_sent = [] nega_test_sent = [] posi_test_doc = [] nega_test_doc = [] posi_test_overall = [] nega_test_overall = [] simi_test = [] predict_test = [] for i in test_batch_start: cost, simi, predict = test_model(i) #print simi #f.write('test_predict : ' + str(predict) + ' test_simi : ' + str(simi) + '\n' ) simi_test.append(simi) predict_test.append(predict) corr_test = compute_corr(simi_test, predict_test, f) test_acc = corr_test * 1.0 / (test_size / 4.0) res = '\t\t\tepoch ' + str(epoch) + ', minibatch ' + str( minibatch_index) + ' / ' + str( n_train_batches) + ' test acc of best model ' + str( test_acc * 100.0) + '\n' f.write(res) find_better = False if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True res = '\t\t\tmax: ' + str(max_acc) + ' (at ' + str( best_epoch) + ')\n' f.write(res) if find_better == True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
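# Illustrative NumPy sketch (an assumption, not code from this repo) of the scoring head
# used in the function above: the averaged similarity is passed through a tiny 1 -> 2
# logistic layer, softmaxed, and trained with a hinge on the log-probabilities so the
# gold label beats the other class by `margin`. Weights and inputs below are hypothetical.
import numpy as np

def logistic_margin_cost(simi, w, b, y, margin=0.3):
    # w, b: weights/bias of the 1 -> 2 logistic head
    logits = w * simi + b                       # shape (2,)
    p = np.exp(logits - logits.max())
    p = p / p.sum()                             # softmax over the two classes
    log_p = np.log(p)
    predict = int(np.argmax(p))
    cost = max(0.0, margin + log_p[1 - y] - log_p[y])
    return cost, predict

cost, pred = logistic_margin_cost(0.42, np.array([1.5, -1.5]), np.zeros(2), y=1)
print((cost, pred))                             # positive cost: the gold class lost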
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[50], batch_size=10, window_width=3, maxSentLength=1050, emb_size=50, hidden_size=200, margin=0.5): #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7], # L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1): ibmPath='/mounts/data/proj/wenpeng/Dataset/insuranceQA/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_ibm_corpus(ibmPath+'vocabulary', ibmPath+'train.txt', ibmPath+'dev.txt', maxSentLength) indices_train, trainLengths, trainLeftPad, trainRightPad= datasets[0] #print trainY.eval().shape[0] indices_dev, devY, devLengths, devLeftPad, devRightPad= datasets[1] n_train_batches=indices_train.shape[0]/(batch_size*4) #note that we consider 4 lines as an example in training n_valid_batches=indices_dev.shape[0]/(batch_size*4) train_batch_start=list(numpy.arange(n_train_batches)*(batch_size*4)) dev_batch_start=list(numpy.arange(n_valid_batches)*(batch_size*4)) indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True) indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True) indices_train_theano=T.cast(indices_train_theano, 'int32') indices_dev_theano=T.cast(indices_dev_theano, 'int32') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(1e-50+numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values) embeddings=theano.shared(value=rand_values) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix('x_index') # now, x is the index matrix, must be integer #left=T.ivector('left') #right=T.ivector('right') #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_input = embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0 = Conv(rng, input=layer0_input, image_shape=((batch_size*4), 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) layer0_out=debug_print(layer0.output, 'layer0_out') layer1=Average_Pooling(rng, input=layer0_out, length_last_dim=length_after_wideConv, kern=nkerns[0] ) layer1_out=debug_print(layer1.output.reshape((batch_size*2, nkerns[0]*2)), 'layer1_out') layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) layer3=HiddenLayer(rng, input=layer2.output, n_in=hidden_size, n_out=1, activation=T.tanh) posi_score=layer3.output[0:layer3.output.shape[0]:2,:] nega_score=layer3.output[1:layer3.output.shape[0]:2,:] cost=T.maximum(0, margin-T.sum(posi_score-nega_score)) #cost = layer3.negative_log_likelihood(y) # output a list of score dev_model = theano.function([index], layer3.output.flatten(), givens={ x_index: indices_dev_theano[index: index + (batch_size*4)]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params+layer0.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index], [cost], updates=updates, givens={ x_index: indices_train_theano[index: index + (batch_size*4)]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches/5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 cost_ij= train_model(batch_start) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' cost: '+str(cost_ij) if iter % validation_frequency == 0: dev_scores=[] for i in dev_batch_start: dev_scores+=list(dev_model(i)) acc_dev=compute_acc(devY, dev_scores) print(('\t\t\t\tepoch %i, minibatch %i/%i, dev acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, acc_dev * 100.)) ''' #print 'validating & testing...' # compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
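# Illustrative NumPy sketch (an assumption, not code from this repo) of how the function
# above pairs its scores: the batch interleaves positive and negative QA pairs row by row,
# and the loss is a single hinge on the summed positive-minus-negative score gap.
import numpy as np

def pairwise_hinge(scores, margin=0.5):
    posi = scores[0::2]    # even rows: positive pairs
    nega = scores[1::2]    # odd rows:  negative pairs
    return max(0.0, margin - np.sum(posi - nega))

scores = np.array([0.9, 0.2, 0.4, 0.6])    # two (positive, negative) pairs
print(pairwise_hinge(scores))              # summed gap is 0.5, so the hinge is 0.0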
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[90, 90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DSSSS( rootPath + 'vocab_DSSSS.txt', rootPath + 'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt', rootPath + 'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [ train_data_D, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D, train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D, train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4 ] = train_data [ test_data_D, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D, test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, 
test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1 = T.lvector() index_A2 = T.lvector() index_A3 = T.lvector() index_A4 = T.lvector() # y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() # len_Q=T.lscalar() len_A1 = T.lscalar() len_A2 = T.lscalar() len_A3 = T.lscalar() len_A4 = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() # left_Q=T.lscalar() left_A1 = T.lscalar() left_A2 = T.lscalar() left_A3 = T.lscalar() left_A4 = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() # right_Q=T.lscalar() right_A1 = T.lscalar() right_A2 = T.lscalar() right_A3 = T.lscalar() right_A4 = T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para = [conv_W, conv_b] conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para = [conv2_W, conv2_b] high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1]) highW_para = [high_W, high_b] params = layer2_para + layer0_para + highW_para #+[embeddings] #load_model(params) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para( rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A2 = Conv_with_input_para( rng, input=layer0_A2_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A3 = Conv_with_input_para( rng, input=layer0_A3_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A4 = Conv_with_input_para( rng, input=layer0_A4_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output') layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output') layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output') layer0_A4_output = debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], 
left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A1 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA2 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A2 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA3 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A3 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA4 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A4 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para( rng, input=layer1_DA1.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA2 = Conv_with_input_para( rng, input=layer1_DA2.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA3 = Conv_with_input_para( rng, input=layer1_DA3.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA4 = Conv_with_input_para( rng, input=layer1_DA4.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA1.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A2 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA2.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) 
layer2_A3 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA3.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A4 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA4.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A1_output_sent_rep_Dlevel = debug_print( layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') layer2_A2_output_sent_rep_Dlevel = debug_print( layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') layer2_A3_output_sent_rep_Dlevel = debug_print( layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') layer2_A4_output_sent_rep_Dlevel = debug_print( layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1 = Average_Pooling_for_Top( rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA2 = Average_Pooling_for_Top( rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA3 = Average_Pooling_for_Top( rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA4 = Average_Pooling_for_Top( rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_DA2 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2') transform_gate_DA3 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3') transform_gate_DA4 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4') # transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') transform_gate_A2 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 
'transform_gate_A2') transform_gate_A3 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') transform_gate_A4 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1 = ( 1.0 - transform_gate_DA1 ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep overall_D_A2 = ( 1.0 - transform_gate_DA2 ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep overall_D_A3 = ( 1.0 - transform_gate_DA3 ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep overall_D_A4 = ( 1.0 - transform_gate_DA4 ) * layer1_DA4.output_D_sent_level_rep + transform_gate_DA4 * layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A1 = ( 1.0 - transform_gate_A1 ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel overall_A2 = ( 1.0 - transform_gate_A2 ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel overall_A3 = ( 1.0 - transform_gate_A3 ) * layer1_DA3.output_QA_sent_level_rep + transform_gate_A3 * layer2_A3.output_sent_rep_Dlevel overall_A4 = ( 1.0 - transform_gate_A4 ) * layer1_DA4.output_QA_sent_level_rep + transform_gate_A4 * layer2_A4.output_sent_rep_Dlevel simi_sent_level1 = debug_print( cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') simi_sent_level2 = debug_print( cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') simi_sent_level3 = debug_print( cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') simi_sent_level4 = debug_print( cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1 = debug_print( cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') simi_doc_level2 = debug_print( cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') simi_doc_level3 = debug_print( cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') simi_doc_level4 = debug_print( cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') simi_overall_level3 = debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') simi_overall_level4 = debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') simi_1 = simi_overall_level1 #+simi_sent_level1+simi_doc_level1 simi_2 = simi_overall_level2 #+simi_sent_level2+simi_doc_level2 simi_3 = simi_overall_level3 #+simi_sent_level3+simi_doc_level3 simi_4 = simi_overall_level4 #+simi_sent_level4+simi_doc_level4 # simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0 # simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # 
simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1)+T.maximum(0.0, margin+simi_3-simi_1)+T.maximum(0.0, margin+simi_4-simi_1) cost12 = T.maximum( 0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level2 - simi_overall_level1) cost13 = T.maximum( 0.0, margin + simi_sent_level3 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level3 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level3 - simi_overall_level1) cost14 = T.maximum( 0.0, margin + simi_sent_level4 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level4 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level4 - simi_overall_level1) cost = cost12 + cost13 + cost14 posi_simi = T.max([simi_sent_level1, simi_doc_level1, simi_overall_level1]) nega_simi = T.max([ simi_sent_level2, simi_doc_level2, simi_overall_level2, simi_sent_level3, simi_doc_level3, simi_overall_level3, simi_sent_level4, simi_doc_level4, simi_overall_level4 ]) L2_reg = debug_print( (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg' ) #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost = debug_print(cost + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, 
accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data corr_train = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % ((iter % train_size) * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 cost_average, posi_simi, nega_simi = train_model(batch_start) if posi_simi > nega_simi: corr_train += 1 if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + 'corr rate:' + str( corr_train * 100.0 / train_size) if iter % validation_frequency == 0: corr_test = 0 for i in test_batch_start: cost, posi_simi, nega_simi = test_model(i) if posi_simi > nega_simi: corr_test += 1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc = corr_test * 1.0 / test_size #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better = False if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' if find_better == True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
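# Illustrative NumPy sketch (an assumption, not the Theano graph above) of the
# highway-style gating used before the "overall" cosine in the function above: a
# transform gate mixes the sentence-level and document-level representations of the
# same item, and the two gated vectors are then compared with cosine similarity.
# Dimensions and values are toy; as noted above, this requires nkerns[0] == nkerns[1].
import numpy as np

def cosine(a, b, eps=1e-8):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))

def highway_mix(sent_rep, doc_rep, W, b):
    gate = 1.0 / (1.0 + np.exp(-(W.dot(sent_rep) + b)))   # transform gate
    return (1.0 - gate) * sent_rep + gate * doc_rep

rng = np.random.RandomState(0)
d = 90                                      # toy stand-in for nkerns[0] = nkerns[1]
sent_D, doc_D = rng.randn(d), rng.randn(d)  # document reps at sentence / doc level
sent_A, doc_A = rng.randn(d), rng.randn(d)  # answer reps at sentence / doc level
W, b = 0.01 * rng.randn(d, d), np.zeros(d)
overall_D = highway_mix(sent_D, doc_D, W, b)
overall_A = highway_mix(sent_A, doc_A, W, b)
print(cosine(overall_D, overall_A))         # simi_overall for one (D, A) pair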
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[44], batch_size=1, window_width=3, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=24): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_nonoverlap.txt', rootPath+'train_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') extra_train, extra_test=load_extra_features(rootPath+'train_rule_features_cosine_eucli_negation_len1_len2.txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2.txt') discri_train, discri_test=load_extra_features(rootPath+'train_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) 
rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_nonoverlap_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() extra=T.dmatrix() discri=T.dmatrix() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l=sum_uni_l/layer0_l_input.shape[3] norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r=sum_uni_r/layer0_r_input.shape[3] norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') linear=Linear(norm_uni_l, norm_uni_r) poly=Poly(norm_uni_l, norm_uni_r) sigmoid=Sigmoid(norm_uni_l, norm_uni_r) rbf=RBF(norm_uni_l, norm_uni_r) gesd=GESD(norm_uni_l, norm_uni_r) eucli_1=1.0/(1.0+EUCLID(sum_uni_l, 
sum_uni_r))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([mts, eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l, layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l, len_l, len_r, extra #discri #wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2)+(2)+2+5, n_out=3) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.errors(y),layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size], extra: extra_test[index: index + batch_size], discri:discri_test[index: index + batch_size] #wmf: wm_test[index: index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ 
x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] for i in test_batch_start: test_loss, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 
#write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) #lr=LinearRegression().fit(train_features, train_y) #results_lr=lr.predict(test_features) corr_count=0 #corr_lr=0 corr_neu=0 neu_co=0 corr_ent=0 ent_co=0 corr_contr=0 contr_co=0 test_size=len(test_y) for i in range(test_size): if test_y[i]==0:#NEUTRAL neu_co+=1 if results[i]==test_y[i]: corr_neu+=1 elif test_y[i]==1:#ENTAILMENT ent_co+=1 if results[i]==test_y[i]: corr_ent+=1 elif test_y[i]==2:#CONTRADICTION contr_co+=1 if results[i]==test_y[i]: corr_contr+=1 ''' if results[i]==test_y[i]: corr_count+=1 if test_y[i]==0: #NEUTRAL corr_neu+=1 ''' #if numpy.absolute(results_lr[i]-test_y[i])<0.5: # corr_lr+=1 corr_count=corr_neu+corr_ent+corr_contr acc=corr_count*1.0/test_size acc_neu=corr_neu*1.0/neu_co acc_ent=corr_ent*1.0/ent_co acc_contr=corr_contr*1.0/contr_co #acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_epoch=epoch if test_acc > max_acc: max_acc=test_acc best_epoch=epoch #if acc_lr> max_acc: # max_acc=acc_lr # best_epoch=epoch print '\t\t\tsvm acc: ', acc, ' max acc: ', max_acc,'(at',best_epoch,')',' Neu: ',acc_neu, ' Ent: ',acc_ent, ' Contr: ',acc_contr if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
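# --- Illustrative sketch (not part of the original code): among the sentence-pair
# features concatenated into layer3_input above are a cosine similarity and a
# Euclidean distance mapped to a similarity via 1/(1+d).  Assuming EUCLID is the
# plain Euclidean distance, the two features reduce to the NumPy helpers below
# (vector arguments are hypothetical 1-d arrays):
import numpy

def cosine_sim(u, v):
    """Cosine similarity of two vectors."""
    return numpy.dot(u, v) / (numpy.sqrt(numpy.dot(u, u)) * numpy.sqrt(numpy.dot(v, v)))

def euclid_to_sim(u, v):
    """Map Euclidean distance to a (0, 1] similarity, as in eucli_1 above."""
    diff = numpy.asarray(u) - numpy.asarray(v)
    return 1.0 / (1.0 + numpy.sqrt(numpy.sum(diff ** 2)))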
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[50], batch_size=10, window_width=3, maxSentLength=1050, emb_size=50, hidden_size=200, margin=0.5): #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7], # L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1): ibmPath = '/mounts/data/proj/wenpeng/Dataset/insuranceQA/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_ibm_corpus(ibmPath + 'vocabulary', ibmPath + 'train.txt', ibmPath + 'dev.txt', maxSentLength) indices_train, trainLengths, trainLeftPad, trainRightPad = datasets[0] #print trainY.eval().shape[0] indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1] n_train_batches = indices_train.shape[0] / ( batch_size * 4 ) #note that we consider 4 lines as an example in training n_valid_batches = indices_dev.shape[0] / (batch_size * 4) train_batch_start = list(numpy.arange(n_train_batches) * (batch_size * 4)) dev_batch_start = list(numpy.arange(n_valid_batches) * (batch_size * 4)) indices_train_theano = theano.shared(numpy.asarray( indices_train, dtype=theano.config.floatX), borrow=True) indices_dev_theano = theano.shared(numpy.asarray( indices_dev, dtype=theano.config.floatX), borrow=True) indices_train_theano = T.cast(indices_train_theano, 'int32') indices_dev_theano = T.cast(indices_dev_theano, 'int32') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(1e-50 + numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values) embeddings = theano.shared(value=rand_values) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x_index = T.imatrix( 'x_index') # now, x is the index matrix, must be integer #left=T.ivector('left') #right=T.ivector('right') #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size = (emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_input = embeddings[x_index.flatten()].reshape( ((batch_size * 4), maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0 = Conv(rng, input=layer0_input, image_shape=((batch_size * 4), 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) layer0_out = debug_print(layer0.output, 'layer0_out') layer1 = Average_Pooling(rng, input=layer0_out, length_last_dim=length_after_wideConv, kern=nkerns[0]) layer1_out = debug_print( layer1.output.reshape((batch_size * 2, nkerns[0] * 2)), 'layer1_out') layer2 = HiddenLayer(rng, input=layer1_out, n_in=nkerns[0] * 2, n_out=hidden_size, activation=T.tanh) layer3 = HiddenLayer(rng, input=layer2.output, n_in=hidden_size, n_out=1, activation=T.tanh) posi_score = layer3.output[0:layer3.output.shape[0]:2, :] nega_score = layer3.output[1:layer3.output.shape[0]:2, :] cost = T.maximum(0, margin - T.sum(posi_score - nega_score)) #cost = layer3.negative_log_likelihood(y) # output a list of score dev_model = theano.function( [index], layer3.output.flatten(), givens={x_index: indices_dev_theano[index:index + (batch_size * 4)]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index], [cost], updates=updates, givens={x_index: indices_train_theano[index:index + (batch_size * 4)]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches / 5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 cost_ij = train_model(batch_start) if iter % n_train_batches == 0: print 'training @ iter = ' + str(iter) + ' cost: ' + str( cost_ij) if iter % validation_frequency == 0: dev_scores = [] for i in dev_batch_start: dev_scores += list(dev_model(i)) acc_dev = compute_acc(devY, dev_scores) print( ('\t\t\t\tepoch %i, minibatch %i/%i, dev acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, acc_dev * 100.)) ''' #print 'validating & testing...' # compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
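# --- Illustrative sketch (not part of the original code): the cost optimized
# above is a margin hinge over positive/negative answer scores,
# cost = max(0, margin - sum(posi_score - nega_score)).  A NumPy version for
# reference (argument names are hypothetical):
import numpy

def margin_hinge_cost(posi_scores, nega_scores, margin=0.5):
    """Pairwise hinge: zero once positives beat negatives by at least `margin` in total."""
    gap = numpy.sum(numpy.asarray(posi_scores) - numpy.asarray(nega_scores))
    return max(0.0, margin - gap)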
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[50], batch_size=1000, window_width=4, maxSentLength=64, emb_size=50, hidden_size=50, margin=0.5, L2_weight=0.0004, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142, comment='v5_margin0.6_neg300_'): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/' rng = numpy.random.RandomState(1234) triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,dev_triples, dev_triples_set, dev_entity_set, dev_relation_set, test_triples, test_triples_set, test_entity_set, test_relation_set=load_TrainDevTest_triples_RankingLoss(triple_path+'freebase_mtr100_mte100-train.txt',triple_path+'freebase_mtr100_mte100-valid.txt', triple_path+'freebase_mtr100_mte100-test.txt', line_no, triple_path) print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count) dev_size=len(dev_triples) print 'dev triple size:', dev_size, 'entity_size:', len(dev_entity_set) test_size=len(test_triples) print 'test triple size:', test_size, 'entity_size:', len(test_entity_set) # print triples # print entity_count # print relation_count # exit(0) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test # mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt') # wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True) # entity_count=T.cast(entity_count, 'int64') # relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True) # relation_count=T.cast(relation_count, 'int64') rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) entity_E=theano.shared(value=rand_values, borrow=True) rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) relation_E=theano.shared(value=rand_values, borrow=True) GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b] load_model_from_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), para_to_load) norm_entity_E=norm_matrix(entity_E) norm_relation_E=norm_matrix(relation_E) n_batchs=line_no/batch_size remain_triples=line_no%batch_size if remain_triples>0: batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size] else: batch_start=list(numpy.arange(n_batchs)*batch_size) batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True) batch_start=T.cast(batch_start, 'int64') test_triple = T.lvector('test_triple') neg_inds = T.lvector('neg_inds') ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' predicted_tail=GRU_Combine_2Vector(norm_entity_E[test_triple[0]], norm_relation_E[test_triple[1]], emb_size, GRU_U, GRU_W, GRU_b) golden_tail=norm_entity_E[test_triple[2]] pos_loss=(1-cosine(predicted_tail,golden_tail))**2 neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size)) predicted_tail=predicted_tail.reshape((1, emb_size)) multi=T.sum(predicted_tail*neg_Es, axis=1) len1=T.sqrt(T.sum(predicted_tail**2)) len2=T.sqrt(T.sum(neg_Es**2, axis=1)) cos=multi/(len1*len2) neg_loss_vector=(1-cos)**2 # normed_predicted_tail=predicted_tail/T.sqrt(T.sum(predicted_tail**2)) # # pos_loss=T.sum(abs(normed_predicted_tail-golden_tail)) # neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size)) # predicted_tail=normed_predicted_tail.reshape((1, emb_size)) # # neg_loss_vector=T.sum(abs(predicted_tail-neg_Es), axis=1) GRU_forward_step = theano.function([test_triple, neg_inds], [pos_loss,neg_loss_vector], on_unused_input='ignore') # # train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], # givens={ # x_index_l: indices_train_l[index: index + batch_size], # x_index_r: indices_train_r[index: index + batch_size], # y: trainY[index: index + batch_size], # left_l: trainLeftPad_l[index], # right_l: trainRightPad_l[index], # left_r: trainLeftPad_r[index], # right_r: trainRightPad_r[index], # length_l: trainLengths_l[index], # length_r: trainLengths_r[index], # norm_length_l: normalized_train_length_l[index], # norm_length_r: normalized_train_length_r[index], # mts: mt_train[index: index + batch_size], # wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant # validation_frequency = min(n_train_batches/5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data # cost_1, cost_l= train_model(triples) # #print 'layer3_input', layer3_input # print 'cost:', cost_1, cost_l #test test_size=len(test_triples) hits_10=test_size hits_1=test_size co=0 for test_triple in test_triples: co+=1 count=0 flag_continue=True nega_entity_set=get_negas(test_triple, corpus_triples_set, test_entity_set) # print len(nega_entity_set) p_loss, n_loss_vector=GRU_forward_step(test_triple, list(nega_entity_set)) n_loss_vector=numpy.sort(n_loss_vector) # print p_loss # print n_loss_vector[:15] # exit(0) if p_loss>n_loss_vector[0]: hits_1-=1 if p_loss>n_loss_vector[9]: hits_10-=1 if co%1000==0: print co, '...' 
print '\t\thits_10', hits_10*100.0/test_size, 'hits_1', hits_1*100.0/test_size hits_10=hits_10*100.0/test_size hits_1=hits_1*100.0/test_size # if patience <= iter: # done_looping = True # break #after each epoch, increase the batch_size store_model_to_file(triple_path+'Best_Paras_dim'+str(emb_size)+'_hits10_'+str(hits_10)[:6], para_to_load) print 'Finished storing best params' print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min, Hits_10:', hits_10, 'Hits_1:,', hits_1 mid_time = time.clock() exit(0) # exit(0) # #store the paras after epoch 15 # if epoch ==22: # store_model_to_file(params_conv) # print 'Finished storing best conv params' # exit(0) #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
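# --- Illustrative sketch (not part of the original code): the Hits@1 / Hits@10
# numbers printed above count a test triple as a hit when the loss of the gold
# tail is no worse than the k-th smallest negative loss.  A helper mirroring
# that rule (names are hypothetical; assumes len(neg_losses) >= k):
import numpy

def hits_at_k(pos_loss, neg_losses, k):
    """True if the gold answer ranks within the top k against the negatives."""
    neg_sorted = numpy.sort(numpy.asarray(neg_losses))
    return pos_loss <= neg_sorted[k - 1]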
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=10000, emb_size=50, margin=0.3, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=16450007, neg_size=60, test_neg_size=300, comment=''):#L1Distance_ model_options = locals().copy() print "model options", model_options triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/freebase-subsets/' rng = numpy.random.RandomState(1234) # triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,statistics=load_Train(triple_path+'freebase-FB5M2M-combined.txt', line_no, triple_path) train_h2t=statistics[0] train_t2h=statistics[1] train_r2t=statistics[2] train_r2h=statistics[3] train_r_replace_tail_prop=statistics[4] print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count) rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) entity_E=theano.shared(value=rand_values, borrow=True) rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) relation_E=theano.shared(value=rand_values, borrow=True) GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size) # GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) # para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b] # load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size), para_to_load) #+'_hits10_63.616' # GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2] # GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2] # GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2] # w2v_entity_rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # # w2v_relation_rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321)) # # w2v_entity_rand_values=load_word2vec_to_init(w2v_entity_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_entityEmb50.txt') # w2v_relation_rand_values=load_word2vec_to_init(w2v_relation_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_relationEmb50.txt') # w2v_entity_rand_values=theano.shared(value=w2v_entity_rand_values, borrow=True) # w2v_relation_rand_values=theano.shared(value=w2v_relation_rand_values, borrow=True) # entity_E_ensemble=entity_E+norm_matrix(w2v_entity_rand_values) # relation_E_ensemble=relation_E+norm_matrix(w2v_relation_rand_values) norm_entity_E=norm_matrix(entity_E) norm_relation_E=norm_matrix(relation_E) n_batchs=line_no/batch_size remain_triples=line_no%batch_size if remain_triples>0: batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size] else: batch_start=list(numpy.arange(n_batchs)*batch_size) # batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True) # batch_start=T.cast(batch_start, 'int64') # allocate symbolic variables for the data # index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer n_index_T = T.ltensor3('n_index_T') ###################### # 
BUILD ACTUAL MODEL # ###################### print '... building the model' dist_tail=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size) loss__tail_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size) loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) # loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is) # loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is) # loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) # binary_matrix_test=T.gt(loss_tail_i_test, 0) # sum_vector_test=T.sum(binary_matrix_test, axis=1) # binary_vector_hits10=T.gt(sum_vector_test, 10) # test_loss=T.sum(binary_vector_hits10)*1.0/batch_size # loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) # loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is) # def neg_slice(neg_matrix): # dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size) # loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) # loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) # loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) # return loss_tail_i, loss_relation_i, loss_head_i # # (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan( # neg_slice, # sequences=n_index_T, # outputs_info=None) loss_tails=T.mean(T.sum(loss_tail_i, axis=1) ) # loss_relations=T.mean(T.sum(loss_relation_i, axis=1) ) # loss_heads=T.mean(T.sum(loss_head_i, axis=1) ) loss=loss_tails#+loss_relations+loss_heads L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\ +(GRU_U** 2).sum()+(GRU_W** 2).sum(), 'L2_reg') # Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\ # Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2]) cost=loss+L2_weight*L2_loss#+div_reg*Div_loss #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b] # params_conv = [conv_W, conv_b] params_to_store=[entity_E, relation_E, GRU_U, GRU_W, GRU_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9))) #AdaGrad updates.append((acc_i, acc)) # grads = T.grad(cost, params) # updates = [] # for param_i, grad_i in zip(params, grads): # updates.append((param_i, param_i - learning_rate * grad_i)) #AdaGrad train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore') # test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore') # # train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], # givens={ # x_index_l: indices_train_l[index: index + batch_size], # x_index_r: indices_train_r[index: index + batch_size], # y: trainY[index: index + batch_size], # left_l: trainLeftPad_l[index], # right_l: trainRightPad_l[index], # left_r: 
trainLeftPad_r[index], # right_r: trainRightPad_r[index], # length_l: trainLengths_l[index], # length_r: trainLengths_r[index], # norm_length_l: normalized_train_length_l[index], # norm_length_r: normalized_train_length_r[index], # mts: mt_train[index: index + batch_size], # wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant # validation_frequency = min(n_train_batches/5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 # corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set best_train_loss=1000000 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 # learning_rate/=epoch # print 'lr:', learning_rate #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data loss_sum=0.0 for start in batch_start: if start%100000==0: print start, '...' pos_triples=triples[start:start+batch_size] all_negs=[] # count=0 for pos_triple in pos_triples: neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size) # # print 'neg_head_triples' # neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3) # # print 'neg_relation_triples' # neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3) # print 'neg_tail_triples' all_negs.append(neg_triples) # print 'neg..', count # count+=1 neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2) loss, cost= train_model(pos_triples, neg_tensor) loss_sum+=loss loss_sum/=len(batch_start) print 'Training loss:', loss_sum, 'cost:', cost # loss_test=0.0 # # for test_start in batch_start_test: # pos_triples=test_triples[test_start:test_start+batch_size] # all_negs=[] # for pos_triple in pos_triples: # neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True) # all_negs.append(neg_triples) # # neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2) # loss_test+= test_model(pos_triples, neg_tensor) # # # loss_test/=n_batchs_test # print '\t\t\tUpdating epoch', epoch, 'finished! 
Test hits10:', 1.0-loss_test if loss_sum< best_train_loss: store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store) # store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store) best_train_loss=loss_sum print 'Finished storing best params' # exit(0) print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
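# --- Illustrative sketch (not part of the original code): every training
# function in this file applies the same accumulator-based AdaGrad rule,
#   acc <- acc + grad**2 ;  param <- param - lr * grad / sqrt(acc + eps).
# A NumPy version so the update can be checked outside Theano (the epsilon is
# an assumption; the Theano code variously uses 1e-9, 1e-20, or none at all):
import numpy

def adagrad_step(param, grad, acc, learning_rate, eps=1e-9):
    """Return the updated (param, acc) pair after one AdaGrad step."""
    acc = acc + grad ** 2
    param = param - learning_rate * grad / numpy.sqrt(acc + eps)
    return param, acc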
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50], batch_size=1, window_width=3, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065, Div_reg=0.01, update_freq=1, norm_threshold=5.0, max_truncate=33, max_truncate_nonoverlap=24): maxSentLength=max_truncate+2*(window_width-1) maxSentLength_nonoverlap=max_truncate_nonoverlap+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, 
dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #nonoverlap indices_train_nonoverlap, trainY_nonoverlap, trainLengths_nonoverlap, normalized_train_length_nonoverlap, trainLeftPad_nonoverlap, trainRightPad_nonoverlap= datasets_nonoverlap[0] indices_train_l_nonoverlap=indices_train_nonoverlap[::2,:] indices_train_r_nonoverlap=indices_train_nonoverlap[1::2,:] trainLengths_l_nonoverlap=trainLengths_nonoverlap[::2] trainLengths_r_nonoverlap=trainLengths_nonoverlap[1::2] normalized_train_length_l_nonoverlap=normalized_train_length_nonoverlap[::2] normalized_train_length_r_nonoverlap=normalized_train_length_nonoverlap[1::2] trainLeftPad_l_nonoverlap=trainLeftPad_nonoverlap[::2] trainLeftPad_r_nonoverlap=trainLeftPad_nonoverlap[1::2] trainRightPad_l_nonoverlap=trainRightPad_nonoverlap[::2] trainRightPad_r_nonoverlap=trainRightPad_nonoverlap[1::2] indices_test_nonoverlap, testY_nonoverlap, testLengths_nonoverlap,normalized_test_length_nonoverlap, testLeftPad_nonoverlap, testRightPad_nonoverlap= datasets_nonoverlap[1] indices_test_l_nonoverlap=indices_test_nonoverlap[::2,:] indices_test_r_nonoverlap=indices_test_nonoverlap[1::2,:] testLengths_l_nonoverlap=testLengths_nonoverlap[::2] testLengths_r_nonoverlap=testLengths_nonoverlap[1::2] normalized_test_length_l_nonoverlap=normalized_test_length_nonoverlap[::2] normalized_test_length_r_nonoverlap=normalized_test_length_nonoverlap[1::2] testLeftPad_l_nonoverlap=testLeftPad_nonoverlap[::2] testLeftPad_r_nonoverlap=testLeftPad_nonoverlap[1::2] testRightPad_l_nonoverlap=testRightPad_nonoverlap[::2] testRightPad_r_nonoverlap=testRightPad_nonoverlap[1::2] ''' n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) ''' indices_train_l_nonoverlap=theano.shared(numpy.asarray(indices_train_l_nonoverlap, dtype=theano.config.floatX), borrow=True) indices_train_r_nonoverlap=theano.shared(numpy.asarray(indices_train_r_nonoverlap, dtype=theano.config.floatX), borrow=True) indices_test_l_nonoverlap=theano.shared(numpy.asarray(indices_test_l_nonoverlap, dtype=theano.config.floatX), borrow=True) indices_test_r_nonoverlap=theano.shared(numpy.asarray(indices_test_r_nonoverlap, dtype=theano.config.floatX), borrow=True) indices_train_l_nonoverlap=T.cast(indices_train_l_nonoverlap, 'int64') indices_train_r_nonoverlap=T.cast(indices_train_r_nonoverlap, 'int64') indices_test_l_nonoverlap=T.cast(indices_test_l_nonoverlap, 'int64') indices_test_r_nonoverlap=T.cast(indices_test_r_nonoverlap, 'int64') rand_values_nonoverlap=random_value_normal((vocab_size_nonoverlap+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) 
rand_values_nonoverlap[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values_nonoverlap=load_word2vec_to_init(rand_values_nonoverlap, rootPath+'vocab_nonoverlap_train_plus_dev_in_word2vec_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings_nonoverlap=theano.shared(value=rand_values_nonoverlap, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_l_nonoverlap = T.lmatrix('x_index_l_nonoverlap') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') x_index_r_nonoverlap = T.lmatrix('x_index_r_nonoverlap') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() left_l_nonoverlap=T.lscalar() right_l_nonoverlap=T.lscalar() left_r_nonoverlap=T.lscalar() right_r_nonoverlap=T.lscalar() length_l_nonoverlap=T.lscalar() length_r_nonoverlap=T.lscalar() norm_length_l_nonoverlap=T.dscalar() norm_length_r_nonoverlap=T.dscalar() mts=T.dmatrix() extra=T.dmatrix() discri=T.dmatrix() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images ishape_nonoverlap = (emb_size, maxSentLength_nonoverlap) filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? #length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_l_input_nonoverlap = embeddings_nonoverlap[x_index_l_nonoverlap.flatten()].reshape((batch_size,maxSentLength_nonoverlap, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input_nonoverlap = embeddings_nonoverlap[x_index_r_nonoverlap.flatten()].reshape((batch_size,maxSentLength_nonoverlap, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_nonoverlap = Conv_with_input_para(rng, input=layer0_l_input_nonoverlap, image_shape=(batch_size, 1, ishape_nonoverlap[0], ishape_nonoverlap[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r_nonoverlap = Conv_with_input_para(rng, input=layer0_r_input_nonoverlap, image_shape=(batch_size, 1, ishape_nonoverlap[0], ishape_nonoverlap[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output_nonoverlap=debug_print(layer0_l_nonoverlap.output, 'layer0_l_nonoverlap.output') layer0_r_output_nonoverlap=debug_print(layer0_r_nonoverlap.output, 'layer0_r_nonoverlap.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) layer1_nonoverlap=Average_Pooling_for_Top(rng, input_l=layer0_l_output_nonoverlap, input_r=layer0_r_output_nonoverlap, kern=nkerns[0], left_l=left_l_nonoverlap, right_l=right_l_nonoverlap, left_r=left_r_nonoverlap, right_r=right_r_nonoverlap, length_l=length_l_nonoverlap+filter_size[1]-1, length_r=length_r_nonoverlap+filter_size[1]-1, dim=maxSentLength_nonoverlap+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l=sum_uni_l/layer0_l_input.shape[3] norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r=sum_uni_r/layer0_r_input.shape[3] norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') 
linear=Linear(norm_uni_l, norm_uni_r) poly=Poly(norm_uni_l, norm_uni_r) sigmoid=Sigmoid(norm_uni_l, norm_uni_r) rbf=RBF(norm_uni_l, norm_uni_r) gesd=GESD(norm_uni_l, norm_uni_r) eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts sum_uni_l_nonoverlap=T.sum(layer0_l_input_nonoverlap, axis=3).reshape((1, emb_size)) aver_uni_l_nonoverlap=sum_uni_l_nonoverlap/layer0_l_input_nonoverlap.shape[3] norm_uni_l_nonoverlap=sum_uni_l_nonoverlap/T.sqrt((sum_uni_l_nonoverlap**2).sum()) sum_uni_r_nonoverlap=T.sum(layer0_r_input_nonoverlap, axis=3).reshape((1, emb_size)) aver_uni_r_nonoverlap=sum_uni_r_nonoverlap/layer0_r_input_nonoverlap.shape[3] norm_uni_r_nonoverlap=sum_uni_r_nonoverlap/T.sqrt((sum_uni_r_nonoverlap**2).sum()) uni_cosine_nonoverlap=cosine(sum_uni_l_nonoverlap, sum_uni_r_nonoverlap) aver_uni_cosine_nonoverlap=cosine(aver_uni_l_nonoverlap, aver_uni_r_nonoverlap) uni_sigmoid_simi_nonoverlap=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l_nonoverlap, norm_uni_r_nonoverlap.T)).reshape((1,1)),'uni_sigmoid_simi') eucli_1_nonoverlap=1.0/(1.0+EUCLID(sum_uni_l_nonoverlap, sum_uni_r_nonoverlap))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l_nonoverlap=norm_length_l_nonoverlap.reshape((1,1)) len_r_nonoverlap=norm_length_r_nonoverlap.reshape((1,1)) ''' len_l_nonoverlap=length_l_nonoverlap.reshape((1,1)) len_r_nonoverlap=length_r_nonoverlap.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([mts, eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l, eucli_1_nonoverlap,uni_cosine_nonoverlap, layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l, layer1_nonoverlap.output_eucli_to_simi,layer1_nonoverlap.output_cosine, len_l, len_r, len_l_nonoverlap, len_r_nonoverlap, extra #discri #wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2*2)+(2*2)+(2*2)+9, n_out=3) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg= Diversify_Reg(layer3.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.errors(y),layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: 
normalized_test_length_r[index], x_index_l_nonoverlap: indices_test_l_nonoverlap[index: index + batch_size], x_index_r_nonoverlap: indices_test_r_nonoverlap[index: index + batch_size], left_l_nonoverlap: testLeftPad_l_nonoverlap[index], right_l_nonoverlap: testRightPad_l_nonoverlap[index], left_r_nonoverlap: testLeftPad_r_nonoverlap[index], right_r_nonoverlap: testRightPad_r_nonoverlap[index], length_l_nonoverlap: testLengths_l_nonoverlap[index], length_r_nonoverlap: testLengths_r_nonoverlap[index], norm_length_l_nonoverlap: normalized_test_length_l_nonoverlap[index], norm_length_r_nonoverlap: normalized_test_length_r_nonoverlap[index], mts: mt_test[index: index + batch_size], extra: extra_test[index: index + batch_size], discri:discri_test[index: index + batch_size] #wmf: wm_test[index: index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): # updates = [] # grads = T.grad(cost, params) # i = theano.shared(numpy.float64(0.)) # i_t = i + 1. # fix1 = 1. - (1. - b1)**i_t # fix2 = 1. - (1. - b2)**i_t # lr_t = lr * (T.sqrt(fix2) / fix1) # for p, g in zip(params, grads): # m = theano.shared(p.get_value() * 0.) # v = theano.shared(p.get_value() * 0.) # m_t = (b1 * g) + ((1. - b1) * m) # v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) # g_t = m_t / (T.sqrt(v_t) + e) # p_t = p - (lr_t * g_t) # updates.append((m, m_t)) # updates.append((v, v_t)) # updates.append((p, p_t)) # updates.append((i, i_t)) # return updates # # updates=Adam(cost=cost, params=params, lr=0.0005) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], x_index_l_nonoverlap: indices_train_l_nonoverlap[index: index + batch_size], x_index_r_nonoverlap: indices_train_r_nonoverlap[index: index + batch_size], left_l_nonoverlap: trainLeftPad_l_nonoverlap[index], right_l_nonoverlap: trainRightPad_l_nonoverlap[index], left_r_nonoverlap: trainLeftPad_r_nonoverlap[index], right_r_nonoverlap: trainRightPad_r_nonoverlap[index], length_l_nonoverlap: trainLengths_l_nonoverlap[index], length_r_nonoverlap: trainLengths_r_nonoverlap[index], norm_length_l_nonoverlap: normalized_train_length_l_nonoverlap[index], norm_length_r_nonoverlap: normalized_train_length_r_nonoverlap[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], x_index_l_nonoverlap: indices_train_l_nonoverlap[index: index + batch_size], x_index_r_nonoverlap: indices_train_r_nonoverlap[index: index + batch_size], left_l_nonoverlap: trainLeftPad_l_nonoverlap[index], right_l_nonoverlap: trainRightPad_l_nonoverlap[index], left_r_nonoverlap: trainLeftPad_r_nonoverlap[index], right_r_nonoverlap: trainRightPad_r_nonoverlap[index], length_l_nonoverlap: trainLengths_l_nonoverlap[index], length_r_nonoverlap: trainLengths_r_nonoverlap[index], norm_length_l_nonoverlap: normalized_train_length_l_nonoverlap[index], norm_length_r_nonoverlap: normalized_train_length_r_nonoverlap[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 pre_max=-1 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] for i in test_batch_start: test_loss, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 corr_neu=0 neu_co=0 corr_ent=0 ent_co=0 corr_contr=0 contr_co=0 test_size=len(test_y) for i in range(test_size): if results_lr[i]==test_y[i]: corr_lr+=1 if test_y[i]==0:#NEUTRAL neu_co+=1 if results[i]==test_y[i]: corr_neu+=1 elif test_y[i]==1:#ENTAILMENT ent_co+=1 if results[i]==test_y[i]: corr_ent+=1 elif test_y[i]==2:#CONTRADICTION contr_co+=1 if 
results[i]==test_y[i]: corr_contr+=1 #if numpy.absolute(results_lr[i]-test_y[i])<0.5: # corr_lr+=1 corr_count=corr_neu+corr_ent+corr_contr acc=corr_count*1.0/test_size acc_neu=corr_neu*1.0/neu_co acc_ent=corr_ent*1.0/ent_co acc_contr=corr_contr*1.0/contr_co acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_epoch=epoch if test_acc > max_acc: max_acc=test_acc best_epoch=epoch if acc_lr> max_acc: max_acc=acc_lr best_epoch=epoch print '\t\t\tsvm:', acc, 'lr:', acc_lr, 'max:', max_acc,'(at',best_epoch,')','Neu:',acc_neu, 'Ent:',acc_ent, 'Contr:',acc_contr if max_acc > pre_max: write_feature_train=open(rootPath+'train_feature_'+str(max_acc)+'.txt', 'w') write_feature_test=open(rootPath+'test_feature_'+str(max_acc)+'.txt', 'w') for i in range(len(train_features)): write_feature_train.write(' '.join(map(str, train_features[i]))+'\n') for i in range(len(test_features)): write_feature_test.write(' '.join(map(str, test_features[i]))+'\n') write_feature_train.close() write_feature_test.close() print 'features stored over' pre_max=max_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
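# --------------------------------------------------------------------------
# Clarifying sketch (added, not part of the original script): the update loop
# above builds AdaGrad symbolically in Theano; the plain NumPy helper below
# shows the same rule,  acc <- acc + grad**2,  param <- param - lr*grad/sqrt(acc).
# All names are illustrative; the real code compiles these updates into
# train_model and, every update_freq minibatches, applies them to the summed
# cost accumulated in cost_tmp.
import numpy as np

def adagrad_step(params, grads, accumulators, learning_rate=0.1, eps=1e-8):
    """One in-place AdaGrad step; eps only guards against dividing by zero."""
    for p, g, acc in zip(params, grads, accumulators):
        acc += g ** 2
        p -= learning_rate * g / (np.sqrt(acc) + eps)

# Example (toy quadratic 0.5*||w - target||**2, gradient w - target):
#   w, acc = np.zeros(3), np.zeros(3)
#   target = np.array([1.0, -2.0, 0.5])
#   for _ in range(500):
#       adagrad_step([w], [w - target], [acc])
#   # w moves monotonically toward target as the per-parameter step shrinks.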
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=10, hidden_size=10, L2_weight=0.0001, margin=0.5, train_size=4000000, test_size=1000, max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = np.random.RandomState(23455) word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len) test_ground_truth,all_candidates_f1,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len) overall_vocab_size=len(word2id) print 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234)) # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() left=T.imatrix() #(2*batch, len) left_mask=T.fmatrix() #(2*batch, len) span=T.imatrix() #(2*batch, span_len) span_mask=T.fmatrix() #(2*batch, span_len) right=T.imatrix() #(2*batch, len) right_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b] U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size) U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size) GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) attend_para=[W_a1, W_a2] params = [embeddings]+GRU1_para+attend_para+GRU2_para # load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params) left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span) right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q) left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) left_reps=left_model.output_tensor #(batch, emb, para_len) span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) span_reps=span_model.output_tensor #(batch, emb, para_len) right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) 
right_reps=right_model.output_tensor #(batch, emb, para_len) q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) q_reps=q_model.output_tensor #(batch, emb, para_len) #interaction left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps) span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps) right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps) # q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps) # q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps) # q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps) #combine origin_W=normalize_matrix(W_a1) attend_W=normalize_matrix(W_a2) left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W) span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W) right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W) q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W) left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W) span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W) right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W) q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) add_left=left_origin_reps+q_attend_left_reps #(2*batch, len ,hidden) add_span=span_origin_reps+q_attend_span_reps add_right=right_origin_reps+q_attend_right_reps add_q_by_left=q_origin_reps+left_attend_q_reps add_q_by_span=q_origin_reps+span_attend_q_reps add_q_by_right=q_origin_reps+right_attend_q_reps #second GRU add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden) question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, 
add_q_by_right_reps], axis=1) #(batch, 3*hidden) simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat) #(2*batch) pos_simi_vec=simi_list[::2] neg_simi_vec=simi_list[1::2] raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=T.sum(raw_loss)#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs remain_train=train_size%batch_size # train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always ou shu if remain_train>0: train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] else: train_batch_start=list(np.arange(n_train_batches)*batch_size) max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_odd_ids = list(np.arange(train_size)*2) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_odd_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 if iter%100==0: print 'iter:', iter iter_accu+=1 train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]] train_id_list=sum(train_id_list,[]) # print train_id_list cost_i+= train_model( np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'), np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_spans[id] for id in train_id_list], dtype='int32'), np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_rights[id] for id in train_id_list], dtype='int32'), np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_questions[id] for id in train_id_list], dtype='int32'), np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX)) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match=0.0 F1_match=0.0 for test_pair_id in range(test_size): test_example_lefts=test_lefts[test_pair_id] test_example_lefts_mask=test_lefts_mask[test_pair_id] test_example_spans=test_spans[test_pair_id] test_example_spans_mask=test_spans_mask[test_pair_id] test_example_rights=test_rights[test_pair_id] test_example_rights_mask=test_rights_mask[test_pair_id] test_example_questions=test_questions[test_pair_id] test_example_questions_mask=test_questions_mask[test_pair_id] test_example_candidates_f1=all_candidates_f1[test_pair_id] test_example_size=len(test_example_lefts) # print 'test_pair_id, test_example_size:', test_pair_id, test_example_size if test_example_size < test_batch_size: #pad pad_size=test_batch_size-test_example_size test_example_lefts+=test_example_lefts[-1:]*pad_size test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size test_example_spans+=test_example_spans[-1:]*pad_size test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size test_example_rights+=test_example_rights[-1:]*pad_size test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size test_example_questions+=test_example_questions[-1:]*pad_size test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size test_example_candidates_f1+=test_example_candidates_f1[-1:]*pad_size test_example_size=test_batch_size n_test_batches=test_example_size/test_batch_size n_test_remain=test_example_size%test_batch_size if n_test_remain > 0: test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size] else: test_batch_start=list(np.arange(n_test_batches)*test_batch_size) all_simi_list=[] all_cand_list=[] for test_para_id in test_batch_start: simi_return_vector=test_model( np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) candidate_f1_list=test_example_candidates_f1[test_para_id:test_para_id+test_batch_size] all_simi_list+=list(simi_return_vector) all_cand_list+=candidate_f1_list top1_f1=all_cand_list[np.argsort(all_simi_list)[-1]] # print top1_cand, test_ground_truth[test_pair_id] if top1_f1 == 1.0: exact_match+=1 # F1=macrof1(top1_cand, test_ground_truth[test_pair_id]) # print '\t\t\t', F1 F1_match+=top1_f1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc=F1_match/test_size exact_acc=exact_match/test_size if F1_acc> max_F1_acc: max_F1_acc=F1_acc # store_model_to_file(params, emb_size) if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Para_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', 
max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
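# --------------------------------------------------------------------------
# Clarifying sketch (added): the ranking objective above relies on the batch
# layout [pos_0, neg_0, pos_1, neg_1, ...] -- as implied by
# pos_simi_vec = simi_list[::2] and neg_simi_vec = simi_list[1::2], each gold
# span is stored next to a negative candidate for the same question.  The
# NumPy helper below mirrors raw_loss = max(0, margin + neg - pos); the names
# are illustrative only.
import numpy as np

def interleaved_hinge_loss(simi_list, margin=0.5):
    """simi_list: 1-D cosine similarities with positives at even positions."""
    pos = simi_list[0::2]
    neg = simi_list[1::2]
    return np.maximum(0.0, margin + neg - pos).sum()

# Example: interleaved_hinge_loss(np.array([0.9, 0.2, 0.4, 0.5])) == 0.6
# (the first pair already satisfies the margin, the second violates it by 0.6).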
def evaluate_lenet5(learning_rate=0.06, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=[4, 4], maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength = max_truncate + 2 * (window_width[0] - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_wikiQA_corpus( rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt', rootPath + 'test_filtered.txt', max_truncate, maxSentLength) #vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test = load_mts_wikiQA( mtPath + 'result_train/concate_2mt_train.txt', mtPath + 'result_test/concate_2mt_test.txt') wm_train, wm_test = load_wmf_wikiQA( rootPath + 'train_word_matching_scores.txt', rootPath + 'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int64') indices_train_r = T.cast(indices_train_r, 'int64') indices_test_l = T.cast(indices_test_l, 'int64') indices_test_r = T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') 
#rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l = T.lscalar() right_l = T.lscalar() left_r = T.lscalar() right_r = T.lscalar() length_l = T.lscalar() length_r = T.lscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() wmf = T.dmatrix() cost_tmp = T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size = (emb_size, window_width[0]) filter_size_2 = (nkerns[0], window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) load_model_from_file([conv_W, conv_b]) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output') layer0_para = [conv_W, conv_b] layer1 = Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1, window_size=window_width[0], maxSentLength=maxSentLength) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1])) #load_model_from_file([conv2_W, conv2_b]) layer2_l = Conv_with_input_para( rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para( rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b) layer2_para = [conv2_W, conv2_b] layer3 = Average_Pooling_for_Top(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size_2[1] - 1, length_r=length_r + filter_size_2[1] - 1, dim=maxSentLength + 
filter_size_2[1] - 1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l = sum_uni_l / layer0_l_input.shape[3] norm_uni_l = sum_uni_l / T.sqrt((sum_uni_l**2).sum()) sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r = sum_uni_r / layer0_r_input.shape[3] norm_uni_r = sum_uni_r / T.sqrt((sum_uni_r**2).sum()) uni_cosine = cosine(sum_uni_l, sum_uni_r) aver_uni_cosine = cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi = debug_print( T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1, 1)), 'uni_sigmoid_simi') ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r)) #25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input = T.concatenate( [ #mts, uni_cosine, #eucli_1_exp,#uni_sigmoid_simi, #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # layer1. output_cosine, #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer3.output_cosine, len_l, len_r, wmf ], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3 = LogisticRegression(rng, input=layer3_input, n_in=(1) + (1) + (1) + 2 + 2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer3.W**2).sum() + (conv2_W**2).sum(), 'L2_reg' ) #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [layer3.prop_for_posi, layer3_input, y], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size], wmf: wm_test[index:index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params + layer2_para #+layer0_para accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, 
param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches / 5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False svm_max = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs = [] test_y = [] test_features = [] for i in test_batch_start: prob_i, layer3_input, y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt', test_probs) #now, check MAP and MRR print( ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches, MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm = clf.decision_function(test_features) MAP_svm, MRR_svm = compute_map_mrr( rootPath + 'test_filtered.txt', results_svm) lr = LinearRegression().fit(train_features, train_y) results_lr = lr.predict(test_features) MAP_lr, MRR_lr = compute_map_mrr( rootPath + 'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break #after each epoch, increase the batch_size if epoch % 2 == 1: update_freq = update_freq * 1 else: update_freq = update_freq / 1 #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
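# --------------------------------------------------------------------------
# Clarifying sketch (added): the evaluation above ranks all candidate answers
# per question and reports MAP / MRR via the external compute_map_mrr helper,
# which reads the gold labels from test_filtered.txt.  The helper below is an
# illustrative re-implementation of the two metrics from in-memory
# (labels, scores) lists; it is not the project's actual compute_map_mrr and
# assumes questions without a correct answer are skipped.
import numpy as np

def map_mrr(per_question):
    """per_question: list of (labels, scores); labels are 0/1 relevance flags."""
    aps, rrs = [], []
    for labels, scores in per_question:
        order = np.argsort(scores)[::-1]              # highest score first
        ranked = np.asarray(labels)[order]
        hit_ranks = np.nonzero(ranked)[0]             # 0-based ranks of correct answers
        if len(hit_ranks) == 0:
            continue                                  # no correct answer: question skipped
        precisions = [(i + 1.0) / (r + 1.0) for i, r in enumerate(hit_ranks)]
        aps.append(np.mean(precisions))
        rrs.append(1.0 / (hit_ranks[0] + 1.0))
    return np.mean(aps), np.mean(rrs)

# Example with two questions (correct answer ranked 1st, then 3rd):
#   map_mrr([([0, 1, 0], [0.1, 0.9, 0.3]), ([1, 0, 0], [0.2, 0.8, 0.5])])
#   -> (0.666..., 0.666...)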
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=4, margin=0.5, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.ivector() #batch, one gold word for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() # true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] true_p_len = paragraph.shape[1] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) char_LSTM_para_dict=create_LSTM_para(rng, char_emb_size, char_emb_size) char_LSTM_para_dict_bw=create_LSTM_para(rng, char_emb_size, char_emb_size) char_lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_p.dimshuffle(0,2,1), char_p_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_p = char_lstm_layer_p.output_sent_rep_conc.reshape((true_batch_size, true_p_len, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) char_lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_q.dimshuffle(0,2,1), char_q_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_q = char_lstm_layer_q.output_sent_rep_conc.reshape((true_batch_size, q_len_limit, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) LSTM_para_dict=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) #40+300 LSTM_para_dict_bw=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) p_input2lstm = T.concatenate([common_input_p.dimshuffle(0,2,1), char_word_embeddings_p], axis=1) #(batch, emb_size+char_emb_size, p_len) q_input2lstm = T.concatenate([common_input_q.dimshuffle(0,2,1), char_word_embeddings_q], axis=1) #(batch, emb_size+char_emb_size, p_len) lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(p_input2lstm, para_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) p_tensor3 = lstm_layer_p.output_tensor_conc #(batch, 2*hidden, p_len) lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(q_input2lstm, q_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) q_reps = lstm_layer_q.output_sent_rep_conc #(batch, 2*hidden) NN_para=char_LSTM_para_dict.values()+char_LSTM_para_dict_bw.values()+LSTM_para_dict.values()+LSTM_para_dict_bw.values() input4score = T.concatenate([p_tensor3, T.repeat(q_reps.dimshuffle(0,1,'x'), true_p_len, axis=2)], axis=1) #(batch, 4*hidden, p_len) HL_1_para = create_ensemble_para(rng, hidden_size, 4*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, true_batch_size,true_p_len) span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(true_batch_size), gold_indices])) #ranking loss tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) 
#(batch, gram_size) index_matrix = T.zeros((true_batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(true_batch_size), gold_indices], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) loss = loss_neg_likelihood + loss_rank #test mask_test_return=T.argmax(span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask], mask_test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch]) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() # pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size]) # test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question predict_id = batch_predict_ids[q] ground_ids=test_label_batch[q] if predict_id in set(ground_ids): p1+=1 # print batch_predict_ids[q], mask_batch_predict_ids[q], test_p_len_limit - numpy.sum(test_para_mask[test_para_id+q]), scores_i[q], test_para_mask[test_para_id+q] exact_acc = p1*100.0/q_amount if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print '\t\tcurrent exact:', exact_acc, '\t\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
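# --------------------------------------------------------------------------
# Clarifying sketch (added): the objective above combines a softmax negative
# log-likelihood over paragraph positions with a hinge ranking term that asks
# the gold position's probability to beat the other positions by a margin.
# The NumPy version below illustrates the idea for a single paragraph; the
# Theano graph above pools positives and negatives over the whole batch.
import numpy as np

def span_loss(scores, gold_index, margin=0.5):
    """scores: 1-D per-position scores for one paragraph; gold_index: int."""
    exp_scores = np.exp(scores - scores.max())
    probs = exp_scores / exp_scores.sum()              # softmax over positions
    nll = -np.log(probs[gold_index])                   # negative log-likelihood term
    pos = probs[gold_index]
    neg = np.delete(probs, gold_index)                 # probabilities of all other positions
    rank = np.maximum(0.0, margin - pos + neg).mean()  # hinge ranking term
    return nll + rank

# Example: span_loss(np.array([0.1, 2.0, -0.3, 0.4]), gold_index=1) is small
# (the gold position dominates), while gold_index=2 gives a much larger loss.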
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[256,256], batch_size=1, window_width=[4,4], maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, Div_reg=0.06, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength=max_truncate+2*(window_width[0]-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() 
x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width[0]) filter_size_2=(nkerns[0], window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((maxSentLength, emb_size)).transpose() layer0_r_input = embeddings[x_index_r.flatten()].reshape((maxSentLength, emb_size)).transpose() l_input_tensor=debug_print(Matrix_Bit_Shift(layer0_l_input[:,left_l:-right_l]), 'l_input_tensor') r_input_tensor=debug_print(Matrix_Bit_Shift(layer0_r_input[:,left_r:-right_r]), 'r_input_tensor') addition_l=T.sum(layer0_l_input[:,left_l:-right_l], axis=1) addition_r=T.sum(layer0_r_input[:,left_r:-right_r], axis=1) cosine_addition=cosine(addition_l, addition_r) eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2% U, W, b=create_GRU_para(rng, emb_size, nkerns[0]) layer0_para=[U, W, b] layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1) cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep) eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2% #ibm attentive pooling at extended sentence level attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2) # attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose() # ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose() # attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose() # ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose() # cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended) # eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2% #ibm attentive pooling at original sentence level simi_matrix_sent=compute_simi_feature_matrix_with_matrix(layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength) attention_vec_l=T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose() ibm_l=layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose() attention_vec_r=T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose() ibm_r=layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose() cosine_ibm=cosine(ibm_l, ibm_r) eucli_ibm=1.0/(1.0+EUCLID(ibm_l, ibm_r))#25.2% l_max_attention=T.max(attention_matrix, axis=1) neighborsArgSorted = T.argsort(l_max_attention) kNeighborsArg = neighborsArgSorted[-3:]#only average the max 3 
vectors ll = T.sort(kNeighborsArg).flatten() # make y indices in acending lie r_max_attention=T.max(attention_matrix, axis=0) neighborsArgSorted_r = T.argsort(r_max_attention) kNeighborsArg_r = neighborsArgSorted_r[-3:]#only average the max 3 vectors rr = T.sort(kNeighborsArg_r).flatten() # make y indices in acending lie l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention') r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention') U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1]) layer1_para=[U1, W1, b1] layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1) layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1) vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l') vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r') # sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) # aver_uni_l=sum_uni_l/layer0_l_input.shape[3] # norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) # sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) # aver_uni_r=sum_uni_r/layer0_r_input.shape[3] # norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) # uni_cosine=cosine(vec_l, vec_r) # aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) # uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') # ''' # linear=Linear(sum_uni_l, sum_uni_r) # poly=Poly(sum_uni_l, sum_uni_r) # sigmoid=Sigmoid(sum_uni_l, sum_uni_r) # rbf=RBF(sum_uni_l, sum_uni_r) # gesd=GESD(sum_uni_l, sum_uni_r) # ''' eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2% # #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) # len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) # # ''' # len_l=length_l.reshape((1,1)) # len_r=length_r.reshape((1,1)) # ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([vec_l, vec_r, uni_cosine,eucli_1, cosine_addition, eucli_addition, # cosine_sent, eucli_sent, ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+ cosine_ibm, eucli_ibm, len_l, len_r,wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=(2*nkerns[1]+2)+2 +(2*nkerns[0]+2)+2+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg= Diversify_Reg(layer3.W.T)+Diversify_Reg(U[0])+Diversify_Reg(W[0])+Diversify_Reg(U1[0])+Diversify_Reg(W1[0])+Diversify_Reg(U[1])+Diversify_Reg(W[1])+Diversify_Reg(U1[1])+Diversify_Reg(W1[1])+Diversify_Reg(U[2])+Diversify_Reg(W[2])+Diversify_Reg(U1[2])+Diversify_Reg(W1[2]) cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y], givens={ x_index_l: 
indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size], wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # # # create a list of gradients for all model parameters # grads = T.grad(cost, params) # # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): updates = [] grads = T.grad(cost, params) i = theano.shared(numpy.float64(0.)) i_t = i + 1. fix1 = 1. - (1. - b1)**i_t fix2 = 1. - (1. - b2)**i_t lr_t = lr * (T.sqrt(fix2) / fix1) for p, g in zip(params, grads): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * T.sqr(g)) + ((1. - b2) * v) g_t = m_t / (T.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) return updates updates=Adam(cost=cost, params=params, lr=learning_rate) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) # print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs=[] test_y=[] test_features=[] for i in test_batch_start: prob_i, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs) #now, check MAP and MRR print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches,MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm=clf.decision_function(test_features) MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % 
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
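# Illustrative sketch (numpy, not part of the Theano graph above): the
# attentive pooling used in this model builds a similarity matrix between the
# two GRU hidden-state matrices, softmax-normalizes the row-wise / column-wise
# maxima into attention weights, and pools each side with the other side's
# attention. compute_simi_feature_matrix_with_matrix is an external helper;
# the plain dot-product similarity below is only a stand-in for it.
import numpy

def _softmax(x):
    e = numpy.exp(x - x.max())
    return e / e.sum()

def attentive_pool(H_l, H_r):
    # H_l: (hidden, len_l), H_r: (hidden, len_r) column-wise hidden states
    sim = numpy.dot(H_l.T, H_r)               # (len_l, len_r) similarity matrix
    att_l = _softmax(sim.max(axis=1))         # weight per left position
    att_r = _softmax(sim.max(axis=0))         # weight per right position
    pooled_l = numpy.dot(H_l, att_l)          # (hidden,) attention-weighted left rep
    pooled_r = numpy.dot(H_r, att_r)          # (hidden,) attention-weighted right rep
    return pooled_l, pooled_r
# e.g. attentive_pool(rng.randn(50, 7), rng.randn(50, 9)) -> two (50,) vectors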
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50], batch_size=1, window_width=4, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 
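# The block below declares the symbolic inputs for one question/answer pair
# (index matrices, pad/length scalars, the MT matrix mts and the word-matching
# matrix wmf), applies one shared convolution (conv_W, conv_b) to both
# sentences, and concatenates a small feature vector -- unigram cosine, the
# pooled convolution cosine (layer1.output_cosine), the two normalized lengths
# and the two wmf features -- for a 2-class LogisticRegression (n_in=6).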
error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l=sum_uni_l/layer0_l_input.shape[3] norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r=sum_uni_r/layer0_r_input.shape[3] norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([#mts, uni_cosine,#eucli_1_exp,#uni_sigmoid_simi, 
#norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # layer1.output_cosine, #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # len_l, len_r,wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=(1)+(1)+2+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size], wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], wmf: wm_train[index: 
index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs=[] test_y=[] test_features=[] for i in test_batch_start: prob_i, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs) #now, check MAP and MRR print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches,MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm=clf.decision_function(test_features) MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break #after each epoch, increase the batch_size if epoch%2==1: update_freq=update_freq*1 else: update_freq=update_freq/1 #store the paras after epoch 15 if epoch 
==15:
            store_model_to_file(params_conv)
            print 'Finished storing best conv params'
            exit(0)
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
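# Illustrative sketch: compute_map_mrr(...) used above is an external helper;
# the function below only restates the standard MAP / MRR formulas for a list
# of questions whose candidate answers are scored by the model. The grouping
# of candidates per question and the 0/1 label format are assumptions, not
# taken from that helper.
def map_mrr(grouped):
    # grouped: list of (scores, labels) pairs, one pair per question
    ap_sum, rr_sum, n = 0.0, 0.0, 0
    for scores, labels in grouped:
        order = sorted(range(len(scores)), key=lambda i: -scores[i])
        hits, precisions, first_rank = 0, [], None
        for rank, i in enumerate(order, 1):
            if labels[i] == 1:
                hits += 1
                precisions.append(hits / float(rank))
                if first_rank is None:
                    first_rank = rank
        if hits == 0:
            continue  # questions without a correct candidate are skipped
        ap_sum += sum(precisions) / hits
        rr_sum += 1.0 / first_rank
        n += 1
    return ap_sum / n, rr_sum / n
# e.g. map_mrr([([0.9, 0.8, 0.1], [0, 1, 0])]) -> (0.5, 0.5)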
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, batch_size=20, test_batch_size=200, emb_size=300, hidden_size=300, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=50.302743615): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) # glove_vocab=set(word2vec.keys()) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_glove() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') # labels = T.imatrix('labels') #(batch, para_len) gold_indices= T.ivector() #batch para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) is_train = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size=paragraph.shape[0] norm_extraF=normalize_matrix(extraF) U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] U_e1, W_e1, b_e1=create_GRU_para(rng, 3*hidden_size+3, hidden_size) U_e1_b, W_e1_b, b_e1_b=create_GRU_para(rng, 3*hidden_size+3, hidden_size) paragraph_para_e1=[U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b] UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size) Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, 2*hidden_size) # 3 extra features # LR_b = theano.shared(value=numpy.zeros((2,), # dtype=theano.config.floatX), # @UndefinedVariable # name='LR_b', borrow=True) HL_paras=[U_a] params = [embeddings]+paragraph_para+Q_para+paragraph_para_e1+HL_paras load_model_from_file(rootPath+'Best_Paras_conv_50.302743614', params) paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1) paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) para_reps=paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) questions_reps_tensor=questions_model.output_tensor questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden) questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #(batch, para_len, hidden) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix=para_matrix.T interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) # norm_interaction_matrix=T.maximum(0.0, interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(len, para_len) batch_q_reps, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, 
questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len) ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), norm_extraF.dimshuffle(0,2,1)], axis=1) #(batch, 3*hidden+3, para_len) para_ensemble_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,U=U_e1,W=W_e1,b=b_e1,Ub=U_e1_b,Wb=W_e1_b,bb=b_e1_b) para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, hidden ,para_len) para_reps_tensor4score = dropout_standard(is_train, para_reps_tensor4score, 0.2, rng) #for span reps span_1=T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1) #(batch, 2*hidden ,para_len) span_2=T.concatenate([para_reps_tensor4score[:,:,:-1], para_reps_tensor4score[:,:,1:]], axis=1) #(batch, 2*hidden ,para_len-1) span_3=T.concatenate([para_reps_tensor4score[:,:,:-2], para_reps_tensor4score[:,:,2:]], axis=1) #(batch, 2*hidden ,para_len-2) span_4=T.concatenate([para_reps_tensor4score[:,:,:-3], para_reps_tensor4score[:,:,3:]], axis=1) #(batch, 2*hidden ,para_len-3) span_5=T.concatenate([para_reps_tensor4score[:,:,:-4], para_reps_tensor4score[:,:,4:]], axis=1) #(batch, 2*hidden ,para_len-4) span_6=T.concatenate([para_reps_tensor4score[:,:,:-5], para_reps_tensor4score[:,:,5:]], axis=1) #(batch, 2*hidden ,para_len-5) span_7=T.concatenate([para_reps_tensor4score[:,:,:-6], para_reps_tensor4score[:,:,6:]], axis=1) #(batch, 2*hidden ,para_len-6) span_8=T.concatenate([para_reps_tensor4score[:,:,:-7], para_reps_tensor4score[:,:,7:]], axis=1) #(batch, 2*hidden ,para_len-7) span_9=T.concatenate([para_reps_tensor4score[:,:,:-8], para_reps_tensor4score[:,:,8:]], axis=1) #(batch, 2*hidden ,para_len-8) span_10=T.concatenate([para_reps_tensor4score[:,:,:-9], para_reps_tensor4score[:,:,9:]], axis=1) #(batch, 2*hidden ,para_len-9) span_11=T.concatenate([para_reps_tensor4score[:,:,:-10], para_reps_tensor4score[:,:,10:]], axis=1) #(batch, 2*hidden ,para_len-10) span_12=T.concatenate([para_reps_tensor4score[:,:,:-11], para_reps_tensor4score[:,:,11:]], axis=1) #(batch, 2*hidden ,para_len-11) span_13=T.concatenate([para_reps_tensor4score[:,:,:-12], para_reps_tensor4score[:,:,12:]], axis=1) #(batch, 2*hidden ,para_len-12) span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7, span_8, span_9, span_10, span_11, span_12, span_13], axis=2) #(batch, 2*hidden, 13*para_len-78) test_span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2) #(batch, 2*hidden, 5*para_len-10) #, span_6, span_7 #score each span reps norm_U_a=normalize_matrix(U_a) span_scores_tensor=T.dot(span_reps.dimshuffle(0,2,1), norm_U_a) #(batch, 13*para_len-78, 1) span_scores=T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13*paragraph.shape[1]-78))) #(batch, 7*para_len-21) loss=-T.sum(T.log(span_scores[T.arange(true_batch_size), gold_indices])) test_span_scores_tensor=T.dot(test_span_reps.dimshuffle(0,2,1), norm_U_a) #(batch, 7*para_len-21, 1) test_span_scores=T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7*paragraph.shape[1]-21))) #(batch, 7*para_len-21) test_return=T.argmax(test_span_scores, axis=1) #batch #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) # L2_reg = L2norm_paraList([embeddings]) cost=loss#+ConvGRU_1.error# accumulator=[] for 
para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, extraF, is_train], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF, is_train], test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), 1) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
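# The evaluation pass below runs test_model over every test batch, decodes
# each predicted flat span index into an answer string with decode_predict_id,
# dumps {question_id: answer} to predictions.txt as JSON, scores it with the
# official evaluator (standard_eval on dev-v1.1.json), and stores the
# parameters whenever the exact-match score beats the max_EM threshold.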
past_time = time.time() # writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') # writefile.write('{') pred_dict={} # exact_match=0.0 # F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), 0) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size] # para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size] q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] # print 'q_ids_batch:', q_ids_batch # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] # para_len=len(test_para_wordlist_list[0]) # if para_len!=len(distribution_matrix[0]): # print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) # exit(0) # q_size=len(distribution_matrix) q_amount+=test_batch_size # print q_size # print test_para_word_list # Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size] for q in range(test_batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_list[q]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans # writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ') # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) # q_gold_ans_set=para_gold_ansset_list[q] # # print test_para_wordlist_list[q] # # print Q_list_inword[q] # # print pred_ans.encode('utf8'), q_gold_ans_set # if pred_ans in q_gold_ans_set: # exact_match+=1 # F1=MacroF1(pred_ans, q_gold_ans_set) # F1_match+=F1 with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') # F1_acc=F1_match/q_amount # exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc # os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt') if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: 
', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
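# Illustrative sketch: the span scores above are a softmax over the
# concatenation [span_1, ..., span_13], where span_L stacks the
# (para_len - L + 1) windows of length L, i.e. 13*para_len - 78 flat indices
# at training time and 7*para_len - 21 at test time. decode_predict_id is an
# external helper; the function below only shows, under that layout
# assumption, how a flat argmax index maps back to a (start, end) token span.
def flat_index_to_span(flat_idx, para_len, max_span_len=7):
    offset = 0
    for span_len in range(1, max_span_len + 1):
        block = para_len - span_len + 1          # windows of this length
        if flat_idx < offset + block:
            start = flat_idx - offset
            return start, start + span_len - 1   # inclusive token positions
        offset += block
    raise ValueError('index outside the assumed span layout')
# e.g. flat_index_to_span(0, 100)   -> (0, 0)   first length-1 span
#      flat_index_to_span(100, 100) -> (0, 1)   first length-2 span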
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=500, char_nkerns=100, batch_size=1, window_width=3, emb_size=500, char_emb_size=100, hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40, max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=6, neg_all=100, train_size=75893, test_size=19168, mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0'): #train_size=75909, test_size=17386 # maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/' triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt'] rng = numpy.random.RandomState(23455) word2id, char2id=load_word2id_char2id(mark) # datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_test_or_valid(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size)#max_char_len, max_des_len, max_relation_len, max_Q_len datasets_test, length_per_example_test, word2id, char2id = load_test_or_valid(triple_files[1], char2id, word2id, max_char_len, max_des_len, max_relation_len, max_Q_len, test_size) vocab_size=len(word2id) char_size=len(char2id) print 'vocab_size:', vocab_size, 'char_size:', char_size # train_data=datasets # valid_data=datasets[1] test_data=datasets_test # result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores) # # train_pos_entity_char=train_data[0] # train_pos_entity_des=train_data[1] # train_relations=train_data[2] # train_entity_char_lengths=train_data[3] # train_entity_des_lengths=train_data[4] # train_relation_lengths=train_data[5] # train_mention_char_ids=train_data[6] # train_remainQ_word_ids=train_data[7] # train_mention_char_lens=train_data[8] # train_remainQ_word_len=train_data[9] # train_entity_scores=train_data[10] test_pos_entity_char=test_data[0] # test_pos_entity_des=test_data[1] test_relations=test_data[2] test_entity_char_lengths=test_data[3] # test_entity_des_lengths=test_data[4] test_relation_lengths=test_data[5] test_mention_char_ids=test_data[6] test_remainQ_word_ids=test_data[7] test_mention_char_lens=test_data[8] test_remainQ_word_len=test_data[9] test_entity_scores=test_data[10] # # test_pos_entity_char=test_data[0] #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51 # test_pos_entity_des=test_data[1] #matrix, each row for a examle: 20*2*51 # test_relations=test_data[2] #matrix, each row for a example: 5*51 # test_entity_char_lengths=test_data[3] #matrix, each row for a example: 3*2*51 (three valies for one entity) # test_entity_des_lengths=test_data[4] #matrix, each row for a example: 3*2*51 (three values for one entity) # test_relation_lengths=test_data[5] #matrix, each row for a example: 3*51 # test_mention_char_ids=test_data[6] #matrix, each row for a mention: 40 # test_remainQ_word_ids=test_data[7] #matrix, each row for a question: 30 # test_mention_char_lens=test_data[8] #matrix, each three values for a mention: 3 # test_remainQ_word_len=test_data[9] #matrix, each three values for a remain question: 3 # train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), 
len(train_entity_des_lengths),\ # len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)] # if sum(train_sizes)/len(train_sizes)!=train_size: # print 'weird size:', train_sizes # exit(0) test_sizes=[len(test_pos_entity_char), len(test_relations), len(test_entity_char_lengths),\ len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)] if sum(test_sizes)/len(test_sizes)!=test_size: print 'weird size:', test_sizes exit(0) # n_train_batches=train_size/batch_size # n_test_batches=test_size/batch_size # train_batch_start=list(numpy.arange(n_train_batches)*batch_size) # test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char) # indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des) # indices_train_relations=pythonList_into_theanoIntMatrix(train_relations) # indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths) # indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths) # indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths) # indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids) # indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids) # indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens) # indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len) # indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) # indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char) # indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des) # indices_test_relations=pythonList_into_theanoIntMatrix(test_relations) # indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths) # indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths) # indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths) # indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids) # indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids) # indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens) # indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len) # indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores) rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt') embeddings=theano.shared(value=rand_values, borrow=True) char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data index = T.iscalar() chosed_indices=T.ivector() ent_char_ids_M = T.imatrix() ent_lens_M = 
T.imatrix() men_char_ids_M = T.imatrix() men_lens_M=T.imatrix() rel_word_ids_M=T.imatrix() rel_word_lens_M=T.imatrix() #desH_word_ids_M=T.imatrix() #desH_word_lens_M=T.imatrix() q_word_ids_M=T.imatrix() q_word_lens_M=T.imatrix() ent_scores=T.fvector() filter_size=(emb_size,window_width) char_filter_size=(char_emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1]) word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1]) char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape) q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) #q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape) params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b]#, q_desH_conv_W, q_desH_conv_b] load_model_from_file(rootPath, params, mark) def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f, men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH #q_desH_conv = Conv_with_input_para(rng, input=q_word_input, # image_shape=(batch_size, 1, emb_size, max_Q_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) #desH_conv = Conv_with_input_para(rng, input=desH_word_input, # image_shape=(batch_size, 1, emb_size, max_des_len), # filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2]) #q_rel_pool=Max_Pooling(rng, 
input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, dim=max_Q_len+filter_size[1]-1, topk=2) #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2]) #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)*0.33333+\ cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)*0.55 # 0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling) # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling) return overall_simi simi_list, updates = theano.scan( SimpleQ_matches_Triple, sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M, men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M]) simi_list+=0.2*ent_scores posi_simi=simi_list[0] nega_simies=simi_list[1:] loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) loss_simi=T.sum(loss_simi_list) test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... testing' start_time = time.clock() mid_time = start_time epoch = 0 test_loss=[] succ=0 for i in range(test_size): #prepare data test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int32').reshape((length_per_example_test[i], max_char_len)) test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int32').reshape((length_per_example_test[i], max_char_len)) test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape((length_per_example_test[i], max_relation_len)) test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) #test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int32').reshape((length_per_example_test[i], max_des_len)) #test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32').reshape((length_per_example_test[i], max_Q_len)) test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32').reshape((length_per_example_test[i], 3)) test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX) loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores) # print 'simi_list_i:', simi_list_i[:10] test_loss.append(loss_simi_i) if len(simi_list_i)==1 or simi_list_i[0]>=max(simi_list_i[1:]): succ+=1 if i%1000==0: print 'testing', i, '...acc:', (succ*1.0/(i+1))*(19168*1.0/21687) succ=succ*100.0/21687 #now, check MAP and 
MRR
    print 'accu:', succ
    # store_model_to_file(rootPath, params, succ, mark)
    print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
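# Illustrative sketch (numpy): the scoring above combines three signals --
# entity-vs-mention character CNN cosine (weight 0.33333), question-vs-relation
# word CNN cosine (weight 0.55) and the entity-linking score (weight 0.2) --
# with index 0 of simi_list being the gold triple, evaluated under a
# max-margin ranking loss. A minimal restatement under those assumptions:
import numpy

def triple_score(ent_men_cos, q_rel_cos, ent_link_score):
    return 0.33333 * ent_men_cos + 0.55 * q_rel_cos + 0.2 * ent_link_score

def ranking_loss(scores, margin=0.5):
    # scores[0] is the positive candidate, the rest are negatives
    pos, negs = scores[0], numpy.asarray(scores[1:])
    return numpy.maximum(0.0, margin - pos + negs).sum()
# e.g. ranking_loss([0.9, 0.2, 0.7]) -> 0.3 (only the 0.7 negative violates the margin)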
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus(rootPath+'vocab.txt', rootPath+'mc500.train.tsv_standardlized.txt', rootPath+'mc500.test.tsv_standardlized.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), 
theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A= T.lvector() y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() len_Q=T.lscalar() len_A=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() left_Q=T.lscalar() left_A=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() right_Q=T.lscalar() right_A=T.lscalar() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A_input = embeddings[index_A.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) # load_model_for_conv1([conv_W, conv_b]) layer0_D = Conv_with_input_para(rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A = Conv_with_input_para(rng, input=layer0_A_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A_output=debug_print(layer0_A.output, 'layer0_A.output') layer0_para=[conv_W, conv_b] layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A, right_r=right_A, length_D_s=len_D_s+filter_words[1]-1, 
length_r=len_A+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA = Conv_with_input_para(rng, input=layer1_DA.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A_output_sent_rep_Dlevel=debug_print(layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel') layer2_para=[conv2_W, conv2_b] layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA=Average_Pooling_for_Top(rng, input_l=layer2_DA.output, input_r=layer2_A_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b), 'transform_gate_DA') transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b), 'transform_gate_A') highW_para=[high_W, high_b] overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A=(1.0-transform_gate_DA)*layer1_DA.output_D_sent_level_rep+transform_gate_DA*layer3_DA.output_D_doc_level_rep overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A=(1.0-transform_gate_A)*layer1_DA.output_QA_sent_level_rep+transform_gate_A*layer2_A.output_sent_rep_Dlevel simi_sent_level=debug_print(cosine(layer1_DQ.output_D_sent_level_rep+layer1_DA.output_D_sent_level_rep, layer1_DQ.output_QA_sent_level_rep+layer1_DA.output_QA_sent_level_rep), 'simi_sent_level') 
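    # Highway-style combination used above: for each (document, question) and (document, answer)
    # pair a transform gate g = sigmoid(high_W . sent_rep + high_b) interpolates between the
    # sentence-level and the document-level representation, overall = (1 - g) * sent_rep + g * doc_rep.
    # The gated representations are then compared with cosine similarity at three granularities
    # (sentence level, document level, overall), which become the features of the logistic layer.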
simi_doc_level=debug_print(cosine(layer3_DQ.output_D_doc_level_rep+layer3_DA.output_D_doc_level_rep, layer2_Q.output_sent_rep_Dlevel+layer2_A.output_sent_rep_Dlevel), 'simi_doc_level') simi_overall_level=debug_print(cosine(overall_D_Q+overall_D_A, overall_Q+overall_A), 'simi_overall_level') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) layer4_input=debug_print(T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level ], axis=1), 'layer4_input')#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4=LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer4.W** 2).sum()+(high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') # # [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, # train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, # train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, # train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data # [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, # test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, # test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, # test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data # index = T.lscalar() # index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() # index_A= T.lvector() # # y = T.lvector() # len_D=T.lscalar() # len_D_s=T.lvector() # len_Q=T.lscalar() # len_A=T.lscalar() # # left_D=T.lscalar() # left_D_s=T.lvector() # left_Q=T.lscalar() # left_A=T.lscalar() # # right_D=T.lscalar() # right_D_s=T.lvector() # right_Q=T.lscalar() # right_A=T.lscalar() # # # #wmf=T.dmatrix() # cost_tmp=T.dscalar() test_model = theano.function([index], [layer4.errors(y),layer4_input, y, layer4.prop_for_posi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A: test_data_A[index], y: test_Y[index:index+batch_size], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A: test_Length_A[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A: test_leftPad_A[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A: test_rightPad_A[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params+layer2_para+layer0_para+highW_para accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / 
T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index+batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer4.errors(y), layer4_input, y], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index+batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
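    # Training scheme for the loop below: for (update_freq - 1) minibatches train_model_predict
    # only accumulates the per-example cost into cost_tmp (no parameter update); every
    # update_freq-th batch train_model averages the accumulated cost, adds the L2 penalty and
    # performs one AdaGrad step with the updates list defined above.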
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 # readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r') # train_pairs=[] # train_y=[] # for line in readfile: # tokens=line.strip().split('\t') # listt=tokens[0]+'\t'+tokens[1] # train_pairs.append(listt) # train_y.append(tokens[2]) # readfile.close() # writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w') for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % (batch_start*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] test_prop=[] for i in test_batch_start: test_loss, layer3_input, y, posi_prop=test_model(i) test_prop.append(posi_prop[0][0]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() #test_score = numpy.mean(test_losses) test_acc=compute_test_acc(test_y, test_prop) #test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.decision_function(test_features) lr=linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr=lr.decision_function(test_features) acc_svm=compute_test_acc(test_y, results) acc_lr=compute_test_acc(test_y, results_lr) find_better=False if acc_svm > max_acc: max_acc=acc_svm best_epoch=epoch find_better=True if test_acc > max_acc: max_acc=test_acc best_epoch=epoch find_better=True if acc_lr> max_acc: max_acc=acc_lr best_epoch=epoch find_better=True print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc,'(at',best_epoch,')' # if find_better==True: # store_model_to_file(layer2_para, best_epoch) # print 'Finished storing best conv params' if patience <= iter: 
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

    #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
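# A minimal plain-numpy sketch of the AdaGrad rule that the update lists above implement
# (acc <- acc + g^2; p <- p - lr * g / sqrt(acc)). The helper name, the eps argument and the
# in-place numpy arrays are illustrative assumptions, not part of the original Theano graph
# (the original divides by sqrt(acc) without an epsilon).
import numpy as np


def adagrad_step(params, grads, accumulators, learning_rate=0.09, eps=1e-8):
    """Apply one AdaGrad update in place and return the updated lists."""
    for p, g, acc in zip(params, grads, accumulators):
        acc += np.square(g)                              # accumulate squared gradients
        p -= learning_rate * g / np.sqrt(acc + eps)      # per-dimension scaled step
    return params, accumulators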
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[256, 256], batch_size=1, window_width=3, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33): # max_truncate can be 45 maxSentLength = max_truncate + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SICK/' rng = numpy.random.RandomState(23455) # datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test datasets, vocab_size = load_SICK_corpus(rootPath + 'vocab.txt', rootPath + 'train_plus_dev.txt', rootPath + 'test.txt', max_truncate, maxSentLength, entailment=True) mt_train, mt_test = load_mts_wikiQA( rootPath + 'Train_plus_dev_MT/concate_14mt_train.txt', rootPath + 'Test_MT/concate_14mt_test.txt') extra_train, extra_test = load_extra_features( rootPath + 'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath + 'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt' ) discri_train, discri_test = load_extra_features( rootPath + 'train_plus_dev_discri_features_0.3.txt', rootPath + 'test_discri_features_0.3.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int64') indices_train_r = T.cast(indices_train_r, 'int64') indices_test_l = T.cast(indices_test_l, 'int64') indices_test_r = T.cast(indices_test_r, 'int64') ''' indices_train_l=T.cast(indices_train_l, 'int32') indices_train_r=T.cast(indices_train_r, 'int32') indices_test_l=T.cast(indices_test_l, 'int32') indices_test_r=T.cast(indices_test_r, 'int32') ''' rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, 
numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) # rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt') rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_glove_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l = T.lscalar() right_l = T.lscalar() left_r = T.lscalar() right_r = T.lscalar() length_l = T.lscalar() length_r = T.lscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() extra = T.dmatrix() discri = T.dmatrix() cost_tmp = T.dscalar() # #GPU # index = T.iscalar() # x_index_l = T.imatrix('x_index_l') # now, x is the index matrix, must be integer # x_index_r = T.imatrix('x_index_r') # y = T.ivector('y') # left_l=T.iscalar() # right_l=T.iscalar() # left_r=T.iscalar() # right_r=T.iscalar() # length_l=T.iscalar() # length_r=T.iscalar() # norm_length_l=T.fscalar() # norm_length_r=T.fscalar() # #mts=T.dmatrix() # #wmf=T.dmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size = (emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = debug_print( embeddings[x_index_l.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_l_input') layer0_r_input = debug_print( embeddings[x_index_r.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_r_input') l_input_tensor = debug_print( Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor') r_input_tensor = debug_print( Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor') addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1) addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1) cosine_addition = cosine(addition_l, addition_r) eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r)) #25.2% U, W, b = create_GRU_para(rng, emb_size, nkerns[0]) layer0_para = [U, W, b] layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep) eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)) #25.2% #ibm attentive pooling at extended sentence level attention_matrix = compute_simi_feature_matrix_with_matrix( layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength * (maxSentLength + 1) / 2) # attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose() # ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose() # 
attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose() # ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose() # cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended) # eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2% #ibm attentive pooling at original sentence level simi_matrix_sent = compute_simi_feature_matrix_with_matrix( layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength) attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose() ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose() attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose() ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose() cosine_ibm = cosine(ibm_l, ibm_r) eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r)) #25.2% l_max_attention = T.max(attention_matrix, axis=1) neighborsArgSorted = T.argsort(l_max_attention) kNeighborsArg = neighborsArgSorted[:3] #only average the min 3 vectors ll = T.sort(kNeighborsArg).flatten() # make y indices in acending lie r_max_attention = T.max(attention_matrix, axis=0) neighborsArgSorted_r = T.argsort(r_max_attention) kNeighborsArg_r = neighborsArgSorted_r[:3] #only average the min 3 vectors rr = T.sort(kNeighborsArg_r).flatten() # make y indices in acending lie l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll], 'l_max_min_attention') r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr], 'r_max_min_attention') U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1]) layer1_para = [U1, W1, b1] layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l') vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r') # sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) # aver_uni_l=sum_uni_l/layer0_l_input.shape[3] # norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) # sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) # aver_uni_r=sum_uni_r/layer0_r_input.shape[3] # norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) # uni_cosine = cosine(vec_l, vec_r) # aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) # uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') # ''' # linear=Linear(sum_uni_l, sum_uni_r) # poly=Poly(sum_uni_l, sum_uni_r) # sigmoid=Sigmoid(sum_uni_l, sum_uni_r) # rbf=RBF(sum_uni_l, sum_uni_r) # gesd=GESD(sum_uni_l, sum_uni_r) # ''' eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r)) #25.2% # #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) # len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) # # ''' # len_l=length_l.reshape((1,1)) # len_r=length_r.reshape((1,1)) # ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input = T.concatenate( [ vec_l, vec_r, uni_cosine, eucli_1, cosine_addition, eucli_addition, # cosine_sent, eucli_sent, ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+ cosine_ibm, eucli_ibm, # ibm_l_extended.reshape((1, nkerns[0])), ibm_r_extended.reshape((1, nkerns[0])), #2*nkerns[0]+ # cosine_ibm_extended, eucli_ibm_extended, 
mts, len_l, len_r, extra ], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3 = LogisticRegression(rng, input=layer3_input, n_in=(2 * nkerns[1] + 2) + 2 + (2 * nkerns[0] + 2) + 14 + 2 + 9, n_out=3) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() + (W1**2).sum(), 'L2_reg') #+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size], extra: extra_test[index:index + batch_size], discri: discri_test[index:index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params + layer1_para + layer0_para #+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): # updates = [] # grads = T.grad(cost, params) # i = theano.shared(numpy.float64(0.)) # i_t = i + 1. # fix1 = 1. - (1. - b1)**i_t # fix2 = 1. - (1. - b2)**i_t # lr_t = lr * (T.sqrt(fix2) / fix1) # for p, g in zip(params, grads): # m = theano.shared(p.get_value() * 0.) # v = theano.shared(p.get_value() * 0.) # m_t = (b1 * g) + ((1. - b1) * m) # v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) # g_t = m_t / (T.sqrt(v_t) + e) # p_t = p - (lr_t * g_t) # updates.append((m, m_t)) # updates.append((v, v_t)) # updates.append((p, p_t)) # updates.append((i, i_t)) # return updates # # updates=Adam(cost=cost, params=params, lr=learning_rate) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], extra: extra_train[index:index + batch_size], discri: discri_train[index:index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) train_model_predict = theano.function( [index, cost_tmp], [cost_this, layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], extra: extra_train[index:index + batch_size], discri: discri_train[index:index + batch_size] }, on_unused_input='ignore', allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
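    # Validation below is an ensemble check: besides the network's own test accuracy (acc_nn),
    # layer3_input features are re-extracted for every training pair and a linear SVM and a
    # logistic regression are fit on them; acc_svm and acc_lr are compared against acc_max.
    # Re-running train_model_predict over the full training set is what the comment further down
    # flags as roughly doubling the training time per validation.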
start_time = time.time() mid_time = start_time epoch = 0 done_looping = False acc_max = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 # shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop # if (batch_start+1)%1000==0: # print batch_start+1, 'uses ', (time.time()-mid_time)/60.0, 'min' iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start, 0.0) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses = [] test_y = [] test_features = [] for i in test_batch_start: test_loss, layer3_input, y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) print( ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, (1 - test_score) * 100.)) acc_nn = 1 - test_score #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') #this step is risky: if the training data is too big, then this step will make the training time twice longer train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start, 0.0) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results = clf.predict(test_features) lr = linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr = lr.predict(test_features) corr_count = 0 corr_count_lr = 0 test_size = len(test_y) for i in range(test_size): if results[i] == test_y[i]: corr_count += 1 if results_lr[i] == test_y[i]: corr_count_lr += 1 acc_svm = corr_count * 1.0 / test_size acc_lr = corr_count_lr * 1.0 / test_size if acc_svm > acc_max: acc_max = acc_svm best_epoch = epoch if acc_lr > acc_max: acc_max = acc_lr best_epoch = epoch if acc_nn > acc_max: acc_max = acc_nn best_epoch = epoch print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ', acc_max, ' at epoch: ', best_epoch if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 
'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
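# A minimal plain-numpy sketch of the attentive pooling used in the SICK variant above: from a
# pairwise similarity matrix between the hidden states of the two sentences, take row-wise and
# column-wise maxima, turn them into attention weights with a softmax, pool each sentence's
# hidden states with those weights, and compare the pooled vectors with cosine similarity.
# The helper below assumes H_l and H_r are (hidden_dim x length) arrays and that the similarity
# matrix is plain column-wise cosine; the original code builds that matrix with
# compute_simi_feature_matrix_with_matrix inside the Theano graph, so this is only an illustration.
import numpy as np


def attentive_pool_cosine(H_l, H_r):
    """Return the cosine similarity of attention-pooled representations of two sequences."""
    # pairwise cosine similarities between the columns of H_l and H_r
    Hl_n = H_l / (np.linalg.norm(H_l, axis=0, keepdims=True) + 1e-8)
    Hr_n = H_r / (np.linalg.norm(H_r, axis=0, keepdims=True) + 1e-8)
    sim = Hl_n.T.dot(Hr_n)                       # shape (len_l, len_r)

    def softmax(x):
        e = np.exp(x - x.max())
        return e / e.sum()

    att_l = softmax(sim.max(axis=1))             # one weight per left position
    att_r = softmax(sim.max(axis=0))             # one weight per right position
    pooled_l = H_l.dot(att_l)                    # (hidden_dim,)
    pooled_r = H_r.dot(att_r)
    return float(pooled_l.dot(pooled_r) /
                 (np.linalg.norm(pooled_l) * np.linalg.norm(pooled_r) + 1e-8))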
def evaluate_lenet5(file_name, input_filename, model_filename, learning_rate=0.001, n_epochs=2000, nkerns=[90, 90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=128, max_d_length=128, margin=0.3): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() f = open(file_name, 'w') f.write("model options " + str(model_options) + '\n') #rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN( 'vocab_table_wenyan.txt', input_filename, input_filename, max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test f.write('train_size : ' + str(train_size)) #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [ train_data_D, train_data_A1, train_Label, train_Length_D, train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_rightPad_D, train_rightPad_D_s, train_rightPad_A1 ] = train_data [ test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s, test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) 
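    # Embedding setup below: random-normal initialization with row 0 (the padding index) zeroed,
    # then overwritten by pre-trained vectors from vectors_wenyan2.txt and stored as a Theano
    # shared variable, so the document and answer index matrices can be embedded by indexing
    # inside the graph.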
test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, 'vectors_wenyan2.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1 = T.lvector() # index_A2= T.lvector() # index_A3= T.lvector() # index_A4= T.lvector() y = T.lscalar() len_D = T.lscalar() len_D_s = T.lvector() # len_Q=T.lscalar() len_A1 = T.lscalar() # len_A2=T.lscalar() # len_A3=T.lscalar() # len_A4=T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() # left_Q=T.lscalar() left_A1 = T.lscalar() # left_A2=T.lscalar() # left_A3=T.lscalar() # left_A4=T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() # right_Q=T.lscalar() right_A1 = T.lscalar() # right_A2=T.lscalar() # right_A3=T.lscalar() # right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### f.write('... 
building the model\n') # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para = [conv_W, conv_b] conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para = [conv2_W, conv2_b] high_W, high_b = create_highw_para( rng, nkerns[0], nkerns[1] ) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para = [high_W, high_b] params = layer2_para + layer0_para + highW_para #+[embeddings] #load_model(params) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para( rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) #layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output') #layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output') # layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output') # layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, 
right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A1 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=1) #layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para( rng, input=layer1_DA1.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA1.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A3 = 
Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A1_output_sent_rep_Dlevel = debug_print( layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') # layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') # layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') # layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1 = Average_Pooling_for_Top( rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=1) #layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_A1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') # transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2') # transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') # transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1 = ( 1.0 - transform_gate_DA1 ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * 
layer3_DA1.output_D_doc_level_rep # overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep # overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep # overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A1 = ( 1.0 - transform_gate_A1 ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel #overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel # overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel # overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel simi_sent_level1 = debug_print( cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') #simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') # simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') # simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1 = debug_print( cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') #simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') # simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') # simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') #simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') # simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1 # simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2 simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0 #simi_1 = simi_doc_level1 #simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 logistic_w, logistic_b = create_logistic_para(rng, 1, 2) logistic_para = [logistic_w, logistic_b] sent_w, sent_b = create_logistic_para(rng, 1, 2) doc_w, doc_b = create_logistic_para(rng, 1, 2) sent_para = [sent_w, sent_b] doc_para = [doc_w, doc_b] params += logistic_para params += sent_para params += doc_para load_model(params, model_filename) simi_sent = T.dot(sent_w, simi_sent_level1) + sent_b.dimshuffle(0, 'x') simi_sent = simi_sent.dimshuffle(1, 0) simi_sent = T.nnet.softmax(simi_sent) tmp_sent = T.log(simi_sent) simi_doc = T.dot(doc_w, simi_doc_level1) + doc_b.dimshuffle(0, 'x') simi_doc = simi_doc.dimshuffle(1, 0) simi_doc = T.nnet.softmax(simi_doc) tmp_doc = T.log(simi_doc) 
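    # Cost construction below: each similarity score (sentence level, document level, overall)
    # goes through its own 1->2 logistic layer plus softmax; the cost averages the three negative
    # log-likelihoods of the gold label y and adds an L2 penalty on conv_W, conv2_W, logistic_w
    # and high_W. The prediction is the argmax of the softmax over the overall score.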
#cost = margin - simi_1 simi_overall = T.dot(logistic_w, simi_overall_level1) + logistic_b.dimshuffle(0, 'x') simi_overall = simi_overall.dimshuffle(1, 0) simi_overall = T.nnet.softmax(simi_overall) predict = T.argmax(simi_overall, axis=1) tmp_overall = T.log(simi_overall) cost = -(tmp_overall[0][y] + tmp_doc[0][y] + tmp_sent[0][y]) / 3.0 L2_reg = (conv2_W**2).sum() + (conv_W**2).sum() + (logistic_w**2).sum() + ( high_W**2).sum() cost = cost + L2_weight * L2_reg #simi_1 = [simi_overall,simi_doc,simi_sent] # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1) #cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1) # posi_simi=simi_1 # nega_simi=simi_2 #L2_reg =debug_print((high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() #cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, simi_overall, simi_doc, simi_sent, predict], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], y: test_Label[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], # len_A2: test_Length_A2[index], # len_A3: test_Length_A3[index], # len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], # left_A2: test_leftPad_A2[index], # left_A3: test_leftPad_A3[index], # left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], }, on_unused_input='ignore') accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, simi_overall, simi_doc, simi_sent, predict], updates=updates, givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: 
train_data_A1[index], # index_A2: train_data_A2[index], # index_A3: train_data_A3[index], # index_A4: train_data_A4[index], y: train_Label[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], # len_A2: train_Length_A2[index], # len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], # left_A2: train_leftPad_A2[index], # left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], # right_A2: train_rightPad_A2[index] # right_A3: train_rightPad_A3[index], # right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### f.write('... training\n') # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch cost, simi_overall, simi_doc, simi_sent, predict = test_model(0) cost, simi_overall1, simi_doc, simi_sent, predict = test_model(1) cost, simi_overall2, simi_doc, simi_sent, predict = test_model(2) cost, simi_overall3, simi_doc, simi_sent, predict = test_model(3) return simi_overall, simi_overall1, simi_overall2, simi_overall3 '''
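# A minimal plain-numpy sketch of the highway-style gating used in the MCTest-style functions
# above (g = sigmoid(W_h . sent_rep + b_h); overall = (1 - g) * sent_rep + g * doc_rep).
# The function name and arguments are illustrative: W_h and b_h stand in for the shared variables
# created by create_highw_para, and sent_rep / doc_rep must have the same dimensionality
# (as the original code notes, nkerns[0] and nkerns[1] must match for this to work).
import numpy as np


def highway_combine(sent_rep, doc_rep, W_h, b_h):
    """Gate between a sentence-level and a document-level representation."""
    gate = 1.0 / (1.0 + np.exp(-(W_h.dot(sent_rep) + b_h)))   # sigmoid transform gate
    return (1.0 - gate) * sent_rep + gate * doc_rep           # carry vs. transform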