def forward_prop_step(x_t, s_t1_prev):            
     # GRU Layer 1
     z_t1 = debug_print(T.nnet.sigmoid(U[0].dot(x_t) + W[0].dot(s_t1_prev) + b[0]), 'z_t1')
     r_t1 = debug_print(T.nnet.sigmoid(U[1].dot(x_t) + W[1].dot(s_t1_prev) + b[1]), 'r_t1')
     c_t1 = debug_print(T.tanh(U[2].dot(x_t) + W[2].dot(s_t1_prev * r_t1) + b[2]), 'c_t1')
     s_t1 = debug_print((T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev, 's_t1')
     return s_t1
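
# Illustrative sketch (not part of the original example): the same GRU update
# written with plain numpy, to make the gate arithmetic explicit. Shapes are
# assumed to match create_GRU_para further down: U is (3, hidden_dim, word_dim),
# W is (3, hidden_dim, hidden_dim), b is (3, hidden_dim).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step_numpy(x_t, s_prev, U, W, b):
    z = sigmoid(U[0].dot(x_t) + W[0].dot(s_prev) + b[0])      # update gate
    r = sigmoid(U[1].dot(x_t) + W[1].dot(s_prev) + b[1])      # reset gate
    c = np.tanh(U[2].dot(x_t) + W[2].dot(s_prev * r) + b[2])  # candidate state
    return (1.0 - z) * c + z * s_prev                         # interpolate old and new state
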
    def __init__(self, rng, input_l, input_r): # length_l, length_r: valid lengths after conv
        
        input_l_matrix=debug_print(input_l.reshape((input_l.shape[2], input_l.shape[3])), 'origin_input_l_matrix')
        #input_l_matrix=debug_print(input_l_matrix[:, left_l:(input_l_matrix.shape[1]-right_l)],'input_l_matrix')
        input_r_matrix=debug_print(input_r.reshape((input_r.shape[2], input_r.shape[3])),'origin_input_r_matrix')
        #input_r_matrix=debug_print(input_r_matrix[:, left_r:(input_r_matrix.shape[1]-right_r)],'input_r_matrix')
    
        
        #with attention
        dot_l=debug_print(T.sum(input_l_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.sum(input_r_matrix, axis=1),'dot_r')      
        '''
        #without attention
        dot_l=debug_print(T.sum(input_l_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.sum(input_r_matrix, axis=1),'dot_r')      
        '''
        '''
        #with attention, then max pooling
        dot_l=debug_print(T.max(input_l_matrix*weights_question_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.max(input_r_matrix*weights_answer_matrix, axis=1),'dot_r')          
        '''
        norm_l=debug_print(T.sqrt((dot_l**2).sum()),'norm_l')
        norm_r=debug_print(T.sqrt((dot_r**2).sum()), 'norm_r')
        
        self.output_vector_l=debug_print((dot_l/norm_l).reshape((1, input_l.shape[2])),'output_vector_l')
        self.output_vector_r=debug_print((dot_r/norm_r).reshape((1, input_r.shape[2])), 'output_vector_r')      
        self.output_concate=T.concatenate([dot_l, dot_r], axis=0).reshape((1, input_l.shape[2]*2))
        self.output_cosine=debug_print((T.sum(dot_l*dot_r)/norm_l/norm_r).reshape((1,1)),'output_cosine')

        self.output_eucli=debug_print(T.sqrt(T.sqr(dot_l-dot_r).sum()+1e-20).reshape((1,1)),'output_eucli')
        self.output_eucli_to_simi=1.0/(1.0+self.output_eucli)
Example #3
    def _calc_q_hat(self, r_h):
        """Calculates q_hat according to the model.

        Parameters
        ----------
        r_h : ndarray
            embedded context words

        Returns
        -------
        ndarray
            q_hat, computed by a single hidden layer
        """
        bias = debug_print(T.ones((r_h.shape[0], 1), dtype=floatX), 'bias1')
        r_h = debug_print(T.concatenate([r_h, bias], axis=1),
                          'r_h_concatenate')
        hidden_output = debug_print(
            get_activation_func(self.activation_func, T.dot(r_h, self.W1)),
            'hidden_output')
        bias = debug_print(T.ones((hidden_output.shape[0], 1), dtype=floatX),
                           'bias2')
        hidden_output = debug_print(
            T.concatenate([hidden_output, bias], axis=1),
            'hidden_output_concatenate')
        return get_activation_func(self.activation_func,
                                   T.dot(hidden_output, self.W2))
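
# Shape sketch (illustrative only; W1, W2 and the activation are assumptions
# mirroring the method above): a bias column of ones is appended before each
# affine layer, so W1 is (input_size + 1, hidden_size) and W2 is
# (hidden_size + 1, output_size).
import numpy as np

def calc_q_hat_numpy(r_h, W1, W2, act=np.tanh):
    r_h = np.concatenate([r_h, np.ones((r_h.shape[0], 1))], axis=1)
    hidden = act(r_h.dot(W1))
    hidden = np.concatenate([hidden, np.ones((hidden.shape[0], 1))], axis=1)
    return act(hidden.dot(W2))
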
Example #4
    def _calc_regularization_cost(self):
        """Calculate the regularization cost given the weight decay parameters.

        Only the parameters will be considered that are stored in the set
        self.regularize.

        Returns
        -------
        theano variable
            regularization cost depending on the parameters to be regularized
            and the weight decay parameters for L1 and L2 regularization.
        """
        cost = theano.shared(value=np.cast[floatX](.0))
        l1_cost = 0
        l2_cost = 0

        for p in self.regularize:
            l1_cost += T.sum(T.abs_(self.__dict__[p]))
            l2_cost += T.sum(T.sqr(self.__dict__[p]))

        l1_cost = debug_print(l1_cost, 'l1_cost')
        l2_cost = debug_print(l2_cost, 'l2_cost')

        if self.l1_weight != 0:
            cost += self.l1_weight * l1_cost

        if self.l2_weight != 0:
            cost += self.l2_weight * l2_cost

        return cost
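
# Equivalent numpy sketch (illustrative; parameter names are assumptions): the
# L1 term sums absolute values, the L2 term sums squares, and each is scaled
# by its own weight-decay coefficient before being added to the cost.
import numpy as np

def regularization_cost_numpy(params, l1_weight, l2_weight):
    l1 = sum(np.abs(p).sum() for p in params)
    l2 = sum(np.square(p).sum() for p in params)
    return l1_weight * l1 + l2_weight * l2
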
    def __init__(self, rng, input, filter_shape, image_shape, W, b):
        assert image_shape[1] == filter_shape[1]
        self.input = input
        self.W = W
        self.b = b

        # convolve input feature maps with filters
        conv_out = debug_print(conv.conv2d(input=input, filters=self.W,
                filter_shape=filter_shape, image_shape=image_shape, border_mode='valid'), 'conv_out')  # note: the input is expected to carry enough zero padding already
        
        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        conv_with_bias = debug_print(T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')),'conv_with_bias')
        narrow_conv_out=debug_print(conv_with_bias.reshape((image_shape[0], 1, filter_shape[0], image_shape[3]-filter_shape[3]+1)),'narrow_conv_out') #(batch, 1, kernel, ishape[1]-filter_size1[1]+1)
        
        #pad filter_size-1 zero embeddings at both sides
        left_padding = T.zeros((image_shape[0], 1, filter_shape[0], filter_shape[3]-1), dtype=theano.config.floatX)
        right_padding = T.zeros((image_shape[0], 1, filter_shape[0], filter_shape[3]-1), dtype=theano.config.floatX)
        self.output = debug_print(T.concatenate([left_padding, narrow_conv_out, right_padding], axis=3) ,'self.output')
        

        # store parameters of this layer
        self.params = [self.W, self.b]
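
# Width bookkeeping sketch (illustrative): a 'valid' convolution shrinks the
# sentence axis to L - w + 1, and padding w - 1 zero columns on each side, as
# done above, restores a wide-convolution output of length L + w - 1.
def conv_output_width(sent_len, filter_width):
    narrow = sent_len - filter_width + 1        # after border_mode='valid'
    wide = narrow + 2 * (filter_width - 1)      # after zero padding both sides
    return narrow, wide

# e.g. conv_output_width(64, 3) -> (62, 66)
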
Example #6
def compute_simi_feature_batch1(input_l_matrix, input_r_matrix, length_l,
                                length_r, para_matrix, dim):
    #matrix_r_after_translate=debug_print(T.dot(para_matrix, input_r_matrix), 'matrix_r_after_translate')
    matrix_r_after_translate = input_r_matrix

    repeated_1 = debug_print(
        T.repeat(input_l_matrix, dim,
                 axis=1)[:, :(length_l * length_r)], 'repeated_1'
    )  # add 10 because max_sent_length is only input for conv, conv will make size bigger
    repeated_2 = debug_print(
        repeat_whole_tensor(matrix_r_after_translate, dim,
                            False)[:, :(length_l * length_r)], 'repeated_2')
    '''
    #cosine attention   
    length_1=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_1), axis=0)),'length_1')
    length_2=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_2), axis=0)), 'length_2')

    multi=debug_print(repeated_1*repeated_2, 'multi')
    sum_multi=debug_print(T.sum(multi, axis=0),'sum_multi')
    
    list_of_simi= debug_print(sum_multi/(length_1*length_2),'list_of_simi')   #to get rid of zero length
    simi_matrix=debug_print(list_of_simi.reshape((length_l, length_r)), 'simi_matrix')
    
    '''
    #euclid, effective for wikiQA
    gap = debug_print(repeated_1 - repeated_2, 'gap')
    eucli = debug_print(T.sqrt(1e-10 + T.sum(T.sqr(gap), axis=0)), 'eucli')
    simi_matrix = debug_print((1.0 / (1.0 + eucli)).reshape(
        (length_l, length_r)), 'simi_matrix')

    return simi_matrix  #[:length_l, :length_r]
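
# Equivalent numpy sketch (illustrative): the repeat/tile trick above pairs
# every column of the left matrix with every column of the right matrix, and
# the similarity is 1 / (1 + euclidean distance) between the two columns.
import numpy as np

def simi_matrix_numpy(left, right):   # both are (dim, length) column matrices
    diff = left[:, :, None] - right[:, None, :]           # (dim, len_l, len_r)
    eucli = np.sqrt(1e-10 + np.square(diff).sum(axis=0))
    return 1.0 / (1.0 + eucli)                            # (len_l, len_r)
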
Example #7
    def __init__(self, rng, input_l, input_r): # length_l, length_r: valid lengths after conv
        
        input_l_matrix=debug_print(input_l.reshape((input_l.shape[2], input_l.shape[3])), 'origin_input_l_matrix')
        #input_l_matrix=debug_print(input_l_matrix[:, left_l:(input_l_matrix.shape[1]-right_l)],'input_l_matrix')
        input_r_matrix=debug_print(input_r.reshape((input_r.shape[2], input_r.shape[3])),'origin_input_r_matrix')
        #input_r_matrix=debug_print(input_r_matrix[:, left_r:(input_r_matrix.shape[1]-right_r)],'input_r_matrix')
    
        
        #with attention
        dot_l=debug_print(T.sum(input_l_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.sum(input_r_matrix, axis=1),'dot_r')      
        '''
        #without attention
        dot_l=debug_print(T.sum(input_l_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.sum(input_r_matrix, axis=1),'dot_r')      
        '''
        '''
        #with attention, then max pooling
        dot_l=debug_print(T.max(input_l_matrix*weights_question_matrix, axis=1), 'dot_l') # first add 1e-20 for each element to make non-zero input for weight gradient
        dot_r=debug_print(T.max(input_r_matrix*weights_answer_matrix, axis=1),'dot_r')          
        '''
        norm_l=debug_print(T.sqrt((dot_l**2).sum()),'norm_l')
        norm_r=debug_print(T.sqrt((dot_r**2).sum()), 'norm_r')
        
        self.output_vector_l=debug_print((dot_l/norm_l).reshape((1, input_l.shape[2])),'output_vector_l')
        self.output_vector_r=debug_print((dot_r/norm_r).reshape((1, input_r.shape[2])), 'output_vector_r')      
        self.output_concate=T.concatenate([dot_l, dot_r], axis=0).reshape((1, input_l.shape[2]*2))
        self.output_cosine=debug_print((T.sum(dot_l*dot_r)/norm_l/norm_r).reshape((1,1)),'output_cosine')

        self.output_eucli=debug_print(T.sqrt(T.sqr(dot_l-dot_r).sum()+1e-20).reshape((1,1)),'output_eucli')
        self.output_eucli_to_simi=1.0/(1.0+self.output_eucli)
Example #8
    def _calc_regularization_cost(self):
        """Calculate the regularization cost given the weight decay parameters.

        Only the parameters will be considered that are stored in the set
        self.regularize.

        Returns
        -------
        theano variable
            regularization cost depending on the parameters to be regularized
            and the weight decay parameters for L1 and L2 regularization.
        """
        cost = theano.shared(value=np.cast[floatX](.0))
        l1_cost = 0
        l2_cost = 0

        for p in self.regularize:
            l1_cost += T.sum(T.abs_(self.__dict__[p]))
            l2_cost += T.sum(T.sqr(self.__dict__[p]))

        l1_cost = debug_print(l1_cost, 'l1_cost')
        l2_cost = debug_print(l2_cost, 'l2_cost')

        if self.l1_weight != 0:
            cost += self.l1_weight * l1_cost

        if self.l2_weight != 0:
            cost += self.l2_weight * l2_cost

        return cost
def compute_simi_feature_batch1(input_l_matrix, input_r_matrix, length_l, length_r, para_matrix, dim):
    #matrix_r_after_translate=debug_print(T.dot(para_matrix, input_r_matrix), 'matrix_r_after_translate')
    matrix_r_after_translate=input_r_matrix

    repeated_1=debug_print(T.repeat(input_l_matrix, dim, axis=1)[:, : (length_l*length_r)],'repeated_1') # add 10 because max_sent_length is only input for conv, conv will make size bigger
    repeated_2=debug_print(repeat_whole_tensor(matrix_r_after_translate, dim, False)[:, : (length_l*length_r)],'repeated_2')
    '''
    #cosine attention   
    length_1=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_1), axis=0)),'length_1')
    length_2=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_2), axis=0)), 'length_2')

    multi=debug_print(repeated_1*repeated_2, 'multi')
    sum_multi=debug_print(T.sum(multi, axis=0),'sum_multi')
    
    list_of_simi= debug_print(sum_multi/(length_1*length_2),'list_of_simi')   #to get rid of zero length
    simi_matrix=debug_print(list_of_simi.reshape((length_l, length_r)), 'simi_matrix')
    
    '''
    #euclid, effective for wikiQA
    gap=debug_print(repeated_1-repeated_2, 'gap')
    eucli=debug_print(T.sqrt(1e-10+T.sum(T.sqr(gap), axis=0)),'eucli')
    simi_matrix=debug_print((1.0/(1.0+eucli)).reshape((length_l, length_r)), 'simi_matrix')
    
    
    return simi_matrix#[:length_l, :length_r]
Example #10
 def forward_prop_step(x_t, s_t1_prev):            
     # GRU Layer 1
     z_t1 = debug_print(T.nnet.sigmoid(U[0].dot(x_t) + W[0].dot(s_t1_prev) + b[0]), 'z_t1')
     r_t1 = debug_print(T.nnet.sigmoid(U[1].dot(x_t) + W[1].dot(s_t1_prev) + b[1]), 'r_t1')
     c_t1 = debug_print(T.tanh(U[2].dot(x_t) + W[2].dot(s_t1_prev * r_t1) + b[2]), 'c_t1')
     s_t1 = debug_print((T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev, 's_t1')
     return s_t1
Example #11
def create_GRU_para(rng, word_dim, hidden_dim):
        # Initialize the network parameters
        U = numpy.random.uniform(-numpy.sqrt(1./hidden_dim), numpy.sqrt(1./hidden_dim), (3, hidden_dim, word_dim))
        W = numpy.random.uniform(-numpy.sqrt(1./hidden_dim), numpy.sqrt(1./hidden_dim), (3, hidden_dim, hidden_dim))
        b = numpy.zeros((3, hidden_dim))
        # Theano: Created shared variables
        U = debug_print(theano.shared(name='U', value=U.astype(theano.config.floatX), borrow=True), 'U')
        W = debug_print(theano.shared(name='W', value=W.astype(theano.config.floatX), borrow=True), 'W')
        b = debug_print(theano.shared(name='b', value=b.astype(theano.config.floatX), borrow=True), 'b')
        return U, W, b
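
# Usage sketch (illustrative; assumes theano, numpy and debug_print are
# available as in the surrounding examples, and mirrors the GRU wrapper class
# shown later on this page): the shared parameters are indexed inside a step
# function and scanned over the columns of a (word_dim, sent_len) matrix.
import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(23455)
U, W, b = create_GRU_para(rng, word_dim=50, hidden_dim=100)
X = T.matrix('X')  # (word_dim, sent_len)

def step(x_t, s_prev):
    z = T.nnet.sigmoid(U[0].dot(x_t) + W[0].dot(s_prev) + b[0])
    r = T.nnet.sigmoid(U[1].dot(x_t) + W[1].dot(s_prev) + b[1])
    c = T.tanh(U[2].dot(x_t) + W[2].dot(s_prev * r) + b[2])
    return (T.ones_like(z) - z) * c + z * s_prev

s, _ = theano.scan(step, sequences=X.transpose(1, 0),
                   outputs_info=dict(initial=T.zeros(100)))
last_hidden = theano.function([X], s[-1])
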
Example #12
def cosine(vec1, vec2):
    vec1 = debug_print(vec1, 'vec1')
    vec2 = debug_print(vec2, 'vec2')
    norm_uni_l = T.sqrt((vec1**2).sum())
    norm_uni_r = T.sqrt((vec2**2).sum())

    dot = T.dot(vec1, vec2.T)

    simi = debug_print(dot / (norm_uni_l * norm_uni_r), 'uni-cosine')
    return simi.reshape((1, 1))
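
# Numpy sketch of the same cosine similarity (illustrative): the dot product
# of the two vectors divided by the product of their L2 norms, reshaped to
# (1, 1) as above.
import numpy as np

def cosine_numpy(vec1, vec2):
    simi = vec1.dot(vec2) / (np.sqrt((vec1 ** 2).sum()) * np.sqrt((vec2 ** 2).sum()))
    return np.array(simi).reshape((1, 1))
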
def create_GRU_para(rng, word_dim, hidden_dim):
        # Initialize the network parameters
        U = numpy.random.uniform(-numpy.sqrt(1./hidden_dim), numpy.sqrt(1./hidden_dim), (3, hidden_dim, word_dim))
        W = numpy.random.uniform(-numpy.sqrt(1./hidden_dim), numpy.sqrt(1./hidden_dim), (3, hidden_dim, hidden_dim))
        b = numpy.zeros((3, hidden_dim))
        # Theano: Created shared variables
        U = debug_print(theano.shared(name='U', value=U.astype(theano.config.floatX), borrow=True), 'U')
        W = debug_print(theano.shared(name='W', value=W.astype(theano.config.floatX), borrow=True), 'W')
        b = debug_print(theano.shared(name='b', value=b.astype(theano.config.floatX), borrow=True), 'b')
        return U, W, b
def cosine(vec1, vec2):
    vec1=debug_print(vec1, 'vec1')
    vec2=debug_print(vec2, 'vec2')
    norm_uni_l=T.sqrt((vec1**2).sum())
    norm_uni_r=T.sqrt((vec2**2).sum())
    
    dot=T.dot(vec1,vec2.T)
    
    simi=debug_print(dot/(norm_uni_l*norm_uni_r), 'uni-cosine')
    return simi.reshape((1,1))    
Example #15
def unify_eachone(matrix, sentlength_1, sentlength_2, Np):

    #tensor: (1, feature maps, 66, 66)
    #sentlength_1=dim-left1-right1
    #sentlength_2=dim-left2-right2
    #core=tensor[:,:, left1:(dim-right1),left2:(dim-right2) ]

    repeat_row = Np / sentlength_1
    extra_row = Np % sentlength_1
    repeat_col = Np / sentlength_2
    extra_col = Np % sentlength_2

    #repeat core
    matrix_1 = repeat_whole_tensor(matrix, 5, True)
    matrix_2 = repeat_whole_tensor(matrix_1, 5, False)

    new_rows = T.maximum(sentlength_1, sentlength_1 * repeat_row + extra_row)
    new_cols = T.maximum(sentlength_2, sentlength_2 * repeat_col + extra_col)

    #core=debug_print(core_2[:,:, :new_rows, : new_cols],'core')
    new_matrix = debug_print(matrix_2[:new_rows, :new_cols], 'new_matrix')
    #determine x, y start positions
    size_row = new_rows / Np
    remain_row = new_rows % Np
    size_col = new_cols / Np
    remain_col = new_cols % Np

    xx = debug_print(
        T.concatenate([
            T.arange(Np - remain_row + 1) * size_row,
            (Np - remain_row) * size_row + (T.arange(remain_row) + 1) *
            (size_row + 1)
        ]), 'xx')
    yy = debug_print(
        T.concatenate([
            T.arange(Np - remain_col + 1) * size_col,
            (Np - remain_col) * size_col + (T.arange(remain_col) + 1) *
            (size_col + 1)
        ]), 'yy')

    list_of_maxs = []
    for i in xrange(Np):
        for j in xrange(Np):
            region = debug_print(new_matrix[xx[i]:xx[i + 1], yy[j]:yy[j + 1]],
                                 'region')
            #maxvalue1=debug_print(T.max(region, axis=2), 'maxvalue1')
            maxvalue = debug_print(T.max(region).reshape((1, 1)), 'maxvalue')
            list_of_maxs.append(maxvalue)

    all_max_value = T.concatenate(list_of_maxs, axis=1).reshape((1, Np**2))

    return all_max_value
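
# Illustrative numpy sketch of the pooling idea above (not the original
# helper): tile the similarity matrix until it is at least Np x Np, split the
# rows and columns into Np roughly equal bands (the role of xx/yy), and keep
# the maximum of each band, yielding a fixed Np x Np output.
import numpy as np

def dynamic_max_pool(matrix, Np):
    reps = (int(np.ceil(float(Np) / matrix.shape[0])),
            int(np.ceil(float(Np) / matrix.shape[1])))
    tiled = np.tile(matrix, reps)
    rows = np.array_split(np.arange(tiled.shape[0]), Np)
    cols = np.array_split(np.arange(tiled.shape[1]), Np)
    return np.array([[tiled[np.ix_(r, c)].max() for c in cols] for r in rows])
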
Example #16
    def _get_noise(self, batch_size):
        # Create unigram noise distribution.
        srng = MRG_RandomStreams2(seed=self.nce_seed)

        # Get the indices of the noise samples.
        random_noise = debug_print(srng.multinomial(
                size=(batch_size, self.k), pvals=self.unigram), 'random_noise')

        noise_indices_flat = debug_print(random_noise.reshape(
                (batch_size * self.k,)), 'noise_indices_flat')
        p_n_noise = debug_print(self.p_n[noise_indices_flat].reshape(
                (batch_size, self.k)), 'p_n_noise')
        return random_noise, p_n_noise
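
# Illustrative numpy sketch of the noise draw (not the original sampler, which
# uses a project-specific MRG_RandomStreams2): k noise word indices are drawn
# per example from the unigram distribution (assumed to sum to 1), and
# p_n_noise holds the noise probability of each sampled index.
import numpy as np

def get_noise_numpy(batch_size, k, unigram, rng=np.random):
    noise = rng.choice(len(unigram), size=(batch_size, k), p=unigram)
    p_n_noise = np.asarray(unigram)[noise]
    return noise, p_n_noise
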
Example #17
    def __init__(self, X, word_dim, hidden_dim, U, W, b, bptt_truncate):
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate

        def forward_prop_step(x_t, s_t1_prev):
            # GRU Layer 1
            z_t1 = debug_print(
                T.nnet.sigmoid(U[0].dot(x_t) + W[0].dot(s_t1_prev) + b[0]),
                'z_t1')
            r_t1 = debug_print(
                T.nnet.sigmoid(U[1].dot(x_t) + W[1].dot(s_t1_prev) + b[1]),
                'r_t1')
            c_t1 = debug_print(
                T.tanh(U[2].dot(x_t) + W[2].dot(s_t1_prev * r_t1) + b[2]),
                'c_t1')
            s_t1 = debug_print(
                (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev, 's_t1')
            return s_t1

        s, updates = theano.scan(
            forward_prop_step,
            sequences=X.transpose(1, 0),
            truncate_gradient=self.bptt_truncate,
            outputs_info=dict(initial=T.zeros(self.hidden_dim)))

        self.output_matrix = debug_print(s.transpose(),
                                         'GRU_Matrix_Input.output_matrix')
        self.output_vector_mean = T.mean(self.output_matrix, axis=1)
        self.output_vector_max = T.max(self.output_matrix, axis=1)
        self.output_vector_last = self.output_matrix[:, -1]
Example #18
    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0,1,2,... n-1]. T.log(self.p_y_given_x) is a matrix of
        # Log-Probabilities (call it LP) with one row per example and
        # one column per class. LP[T.arange(y.shape[0]),y] is a vector
        # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
        # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.  
        y=debug_print(y,'y_true')
        log_likelihood=-T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        return log_likelihood
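
# Numpy sketch of the indexing trick described in the comment above
# (illustrative): pick the log-probability of the true class for every row of
# the minibatch, then average and negate.
import numpy as np

def negative_log_likelihood_numpy(p_y_given_x, y):
    log_p = np.log(p_y_given_x)                    # (n_examples, n_classes)
    return -np.mean(log_p[np.arange(y.shape[0]), y])
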
Example #19
 def get_pure_noise(self, targets):
     # Create unigram noise distribution.
     srng = MRG_RandomStreams2(seed=self.nce_seed)
 
     # Get the indices of the noise samples.
     random_noise = srng.multinomial(size=(self.batch_size, self.k*4), pvals=self.unigram)
     noise_matrix=[]
     for row in range(self.batch_size):
         noise_list=[]
         target=targets[row][0]
         #print 'target:'+str(target)
         count=0
         for col in range(self.k*4):
             noise=debug_print(random_noise[row][col], 'noise')
             if noise.eval()!=target:
                 noise_list.append(noise)
                 count+=1
                 if count==self.k:
                     break
         noise_matrix.append(noise_list)
         
     random_noise=T.concatenate(noise_matrix, axis=0).reshape((self.batch_size, self.k))        
     noise_indices_flat = random_noise.reshape((self.batch_size * self.k,))
     p_n_noise = self.p_n[noise_indices_flat].reshape((self.batch_size, self.k))
     return random_noise, p_n_noise         
Example #20
    def __init__(self, batch_size, vocab_size, left_context, right_context,
            emb_size, k, unigram, l1_weight=0, l2_weight=0, nce_seed=2345):
        self.name = 'vLBL'
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.left_context = left_context
        self.right_context = right_context
        self.context_size = self.left_context + self.right_context
        self.emb_size = emb_size
        self.k = k
        self.unigram = unigram
        self.p_n = debug_print(theano.shared(value=unigram, name='noise_probab'),
                'noise')

        self.l1_weight = l1_weight
        self.l2_weight = l2_weight
        self.nce_seed = nce_seed

        # Create context and target embeddings
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                floatX, np.random.RandomState(1234))
        self.R = theano.shared(value=rand_values, name='R')
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                floatX, np.random.RandomState(4321))
        self.Q = theano.shared(value=rand_values, name='Q')
        b_values = zero_value((self.vocab_size,), dtype=floatX)
        self.bias = theano.shared(value=b_values, name='bias')

        # The learning rates are created the first time set_learning_rate is
        # called.
        self.lr = None
Example #21
    def _get_noise(self, batch_size):
        # Create unigram noise distribution.
        srng = MRG_RandomStreams2(seed=self.nce_seed)

        # Get the indices of the noise samples.
        random_noise = debug_print(
            srng.multinomial(size=(batch_size, self.k), pvals=self.unigram),
            'random_noise')

        noise_indices_flat = debug_print(
            random_noise.reshape((batch_size * self.k, )),
            'noise_indices_flat')
        p_n_noise = debug_print(
            self.p_n[noise_indices_flat].reshape((batch_size, self.k)),
            'p_n_noise')
        return random_noise, p_n_noise
Example #22
    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # number of examples (call it n) in the minibatch
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0,1,2,... n-1]. T.log(self.p_y_given_x) is a matrix of
        # Log-Probabilities (call it LP) with one row per example and
        # one column per class. LP[T.arange(y.shape[0]),y] is a vector
        # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
        # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
        y = debug_print(y, 'y_true')
        log_likelihood = -T.mean(
            T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        return log_likelihood
Example #23
    def __init__(self, rng, input, length_last_dim, kern):

        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = kern  #kern numbers
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = kern
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound,
                                                         high=W_bound,
                                                         size=(kern, kern)),
                                             dtype=theano.config.floatX),
                               borrow=True)  #a weight matrix kern*kern

        #para_tensor=self.W.dimshuffle('x', 'x', 0, 1)

        simi_tensor = compute_simi_feature(
            self.input, length_last_dim, self.W
        )  #(input.shape[0]/2, input.shape[1], input.shape[3], input.shape[3])
        simi_question = debug_print(
            T.sum(simi_tensor, axis=3).reshape(
                (self.input.shape[0] / 2, length_last_dim)), 'simi_question')
        simi_answer = debug_print(
            T.sum(simi_tensor, axis=2).reshape(
                (self.input.shape[0] / 2, length_last_dim)), 'simi_answer')

        weights_question = T.nnet.softmax(simi_question)
        weights_answer = T.nnet.softmax(simi_answer)

        concate = T.concatenate([weights_question, weights_answer], axis=1)
        reshaped_concate = concate.reshape(
            (input.shape[0], 1, 1, length_last_dim))

        weight_tensor = T.repeat(reshaped_concate, kern, axis=2)

        ele_add = self.input + weight_tensor
        self.output = T.sum(ele_add, axis=3)

        self.params = [self.W]
def unify_eachone(matrix, sentlength_1, sentlength_2, Np):

    #tensor: (1, feature maps, 66, 66)
    #sentlength_1=dim-left1-right1
    #sentlength_2=dim-left2-right2
    #core=tensor[:,:, left1:(dim-right1),left2:(dim-right2) ]

    repeat_row=Np/sentlength_1
    extra_row=Np%sentlength_1
    repeat_col=Np/sentlength_2
    extra_col=Np%sentlength_2    

    #repeat core
    matrix_1=repeat_whole_tensor(matrix, 5, True) 
    matrix_2=repeat_whole_tensor(matrix_1, 5, False)
    
    new_rows=T.maximum(sentlength_1, sentlength_1*repeat_row+extra_row)
    new_cols=T.maximum(sentlength_2, sentlength_2*repeat_col+extra_col)
    
    #core=debug_print(core_2[:,:, :new_rows, : new_cols],'core')
    new_matrix=debug_print(matrix_2[:new_rows,:new_cols], 'new_matrix')
    #determine x, y start positions
    size_row=new_rows/Np
    remain_row=new_rows%Np
    size_col=new_cols/Np
    remain_col=new_cols%Np
    
    xx=debug_print(T.concatenate([T.arange(Np-remain_row+1)*size_row, (Np-remain_row)*size_row+(T.arange(remain_row)+1)*(size_row+1)]),'xx')
    yy=debug_print(T.concatenate([T.arange(Np-remain_col+1)*size_col, (Np-remain_col)*size_col+(T.arange(remain_col)+1)*(size_col+1)]),'yy')
    
    list_of_maxs=[]
    for i in xrange(Np):
        for j in xrange(Np):
            region=debug_print(new_matrix[xx[i]:xx[i+1], yy[j]:yy[j+1]],'region')
            #maxvalue1=debug_print(T.max(region, axis=2), 'maxvalue1')
            maxvalue=debug_print(T.max(region).reshape((1,1)), 'maxvalue')
            list_of_maxs.append(maxvalue)
    

    all_max_value=T.concatenate(list_of_maxs, axis=1).reshape((1, Np**2))
    
    return all_max_value            
Example #25
    def _calc_q_hat(self, r_h):
        """Calculates q_hat according to the model.

        Parameters
        ----------
        r_h : ndarray
            embedded context words

        Returns
        -------
        ndarray
            q_hat, computed by a single hidden layer
        """
        bias = debug_print(T.ones((r_h.shape[0], 1), dtype=floatX), 'bias1')
        r_h = debug_print(T.concatenate([r_h, bias], axis=1), 'r_h_concatenate')
        hidden_output = debug_print(get_activation_func(self.activation_func,
                T.dot(r_h, self.W1)), 'hidden_output')
        bias = debug_print(T.ones((hidden_output.shape[0], 1), dtype=floatX), 'bias2')
        hidden_output = debug_print(T.concatenate([hidden_output, bias], axis=1), 'hidden_output_concatenate')
        return get_activation_func(self.activation_func,
                T.dot(hidden_output, self.W2))
Example #26
    def __init__(self, rng, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        '''
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),  # @UndefinedVariable
                                name='W', borrow=True)
        '''
        self.W = theano.shared(numpy.asarray(rng.uniform(
            low=-numpy.sqrt(6. / (n_in + n_out)),
            high=numpy.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)),
                                             dtype=theano.config.floatX),
                               borrow=True)

        # initialize the baises b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out, ), dtype=theano.config.floatX),  # @UndefinedVariable
            name='b',
            borrow=True)
        before_softmax = T.dot(input, self.W) + self.b
        # compute vector of class-membership probabilities in symbolic form,
        self.p_y_given_x = T.nnet.softmax(before_softmax)  # is a vector

        self.prop_for_posi = self.p_y_given_x[:, 1:2]
        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = debug_print(
            T.argmax(self.p_y_given_x, axis=1),
            'y_pred')  # choose the maximal element in above vector

        # parameters of the model
        self.params = [self.W, self.b]
Example #27
    def _calc_q_hat(self, r_h):
        """Calculates q_hat according to the model.

        Parameters
        ----------
        r_h : ndarray
            embedded context words

        Returns
        -------
        ndarray
            q_hat as calculated in Eq. 2 in [MniKav13]
        """
        inner = debug_print(r_h * self.C, 'inner-q_hat')
        return T.sum(inner, axis=1)
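
# Illustrative numpy sketch (shapes are assumptions): with r_h holding the
# embedded context words as (batch, context_size, emb_size) and C holding one
# weight vector per context position, q_hat is the position-weighted
# elementwise sum over the context, as computed above.
import numpy as np

def calc_q_hat_numpy(r_h, C):
    return (r_h * C).sum(axis=1)   # (batch, emb_size)
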
Example #28
    def __init__(self, rng, input, length_last_dim, kern):

        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = kern #kern numbers
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = kern
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=(kern, kern)),
            dtype=theano.config.floatX),
                               borrow=True) #a weight matrix kern*kern
        
        #para_tensor=self.W.dimshuffle('x', 'x', 0, 1)
        
        simi_tensor=compute_simi_feature(self.input, length_last_dim, self.W) #(input.shape[0]/2, input.shape[1], input.shape[3], input.shape[3])
        simi_question=debug_print(T.sum(simi_tensor, axis=3).reshape((self.input.shape[0]/2, length_last_dim)),'simi_question')
        simi_answer=debug_print(T.sum(simi_tensor, axis=2).reshape((self.input.shape[0]/2, length_last_dim)), 'simi_answer')
        
        weights_question =T.nnet.softmax(simi_question) 
        weights_answer=T.nnet.softmax(simi_answer) 
        
        concate=T.concatenate([weights_question, weights_answer], axis=1)
        reshaped_concate=concate.reshape((input.shape[0], 1, 1, length_last_dim))
        
        weight_tensor=T.repeat(reshaped_concate, kern, axis=2)
        
        ele_add=self.input+weight_tensor
        self.output=T.sum(ele_add, axis=3)

        self.params = [self.W]
Example #29
    def _calc_q_hat(self, r_h):
        """Calculates q_hat according to the model.

        Parameters
        ----------
        r_h : ndarray
            embedded context words

        Returns
        -------
        ndarray
            q_hat as calculated in Eq. 2 in [MniKav13]
        """
        inner = debug_print(r_h * self.C, 'inner-q_hat')
        return T.sum(inner, axis=1)
Example #30
    def configure(self, args):
        super(vLblNCEPredictor, self).configure(args)
        self.vocab = read_vocabulary_id_file(args.vocabulary)
        self.vocab_size = len(self.vocab.keys())
        self.effective_vocab_size = len(self.vocab.keys())
        self.perplexity = args.perplexity
        self.save_word = args.save_word
        self.result_file = args.result_file
        self.store_rank = args.store_rank
        self.store_argmax = args.store_argmax
        self.store_softmax = args.store_softmax
        self.normalize_with_root = args.normalize_with_root
        self.information = args.information
        self.predictions = args.predictions

        # This code is taken from SimpleVLblNceTrainer
        if args.pred_vocab:
            # Element i contains the index of the i'th prediction vocabulary
            # token in the original vocabulary.
            self.vocab_mapping_list = list()

            # Mapping from the model vocabulary to the prediction vocabulary
            # indices
            self.vocab_mapping = dict()

            for i, token in enumerate(file_line_generator(args.pred_vocab)):

                if token not in self.vocab:
                    raise ValueError('Token "%s" in prediction vocabulary '
                                     'does not exist in model vocabulary.'
                                     % token)

                self.vocab_mapping_list.append(self.vocab[token])
                self.vocab_mapping[self.vocab[token]] = i
        else:
            self.vocab_mapping_list = range(len(self.vocab))
            self.vocab_mapping = dict(
                zip(self.vocab_mapping_list, self.vocab_mapping_list))

        if self.perplexity:
            self.example_iterator_type = PaddedWindowExamplesGenerator
            self.example_processor = self._process_example_full_text
            self.learn_eos = True  # We need to set that because otherwise PaddedWindowExampleGenerator will ignore end-of-sentence tags (</S>)
            self.disable_padding = False
            self.w_indices = debug_print(T.imatrix('w'), 'w')
            self.inputs.append(self.w_indices)
        else:
            self.example_processor = self._process_example_context_per_line
Example #31
    def configure(self, args):
        super(vLblNCEPredictor, self).configure(args)
        self.vocab = read_vocabulary_id_file(args.vocabulary)
        self.vocab_size = len(self.vocab.keys())
        self.effective_vocab_size = len(self.vocab.keys())
        self.perplexity = args.perplexity
        self.save_word = args.save_word
        self.result_file = args.result_file
        self.store_rank = args.store_rank
        self.store_argmax = args.store_argmax
        self.store_softmax = args.store_softmax
        self.normalize_with_root = args.normalize_with_root
        self.information = args.information
        self.predictions = args.predictions

        # This code is taken from SimpleVLblNceTrainer
        if args.pred_vocab:
            # Element i contains the index of the i'th prediction vocabulary
            # token in the original vocabulary.
            self.vocab_mapping_list = list()

            # Mapping from the model vocabulary to the prediction vocabulary
            # indices
            self.vocab_mapping = dict()

            for i, token in enumerate(file_line_generator(args.pred_vocab)):

                if token not in self.vocab:
                    raise ValueError('Token "%s" in prediction vocabulary '
                            'does not exist in model vocabulary.' % token)

                self.vocab_mapping_list.append(self.vocab[token])
                self.vocab_mapping[self.vocab[token]] = i
        else:
            self.vocab_mapping_list = range(len(self.vocab))
            self.vocab_mapping = dict(
                    zip(self.vocab_mapping_list, self.vocab_mapping_list))

        if self.perplexity:
            self.example_iterator_type = PaddedWindowExamplesGenerator
            self.example_processor = self._process_example_full_text
            self.learn_eos = True  # We need to set that because otherwise PaddedWindowExampleGenerator will ignore end-of-sentence tags (</S>)
            self.disable_padding = False
            self.w_indices = debug_print(T.imatrix('w'), 'w')
            self.inputs.append(self.w_indices)
        else:
            self.example_processor = self._process_example_context_per_line
Example #32
    def __init__(self, rng, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        '''
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),  # @UndefinedVariable
                                name='W', borrow=True)
        '''
        self.W= theano.shared(numpy.asarray(rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)), dtype=theano.config.floatX), borrow=True)
        
        # initialize the baises b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),  # @UndefinedVariable
                               name='b', borrow=True)
        before_softmax=T.dot(input, self.W) + self.b
        # compute vector of class-membership probabilities in symbolic form, 
        self.p_y_given_x =T.nnet.softmax(before_softmax) # is a vector
        
        self.prop_for_posi=self.p_y_given_x[:,1:2]
        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = debug_print(T.argmax(self.p_y_given_x, axis=1),'y_pred') # choose the maximal element in above vector

        # parameters of the model
        self.params = [self.W, self.b]
Example #33
    def _embed_word_indices(self, indices, embeddings):
        """Embed all indexes using the given embeddings.

        Parameters
        ----------
        indices : ndarray
            indices of the items to embed using the given embeddings matrix
            Note: indices are flattened
        embeddings : ndarray (vocab_size x emb_size)
            embeddings matrix

        Returns
        -------
        ndarray (len(indices) x emb_size)
            embedded indices
        """
        try:
            embedded = super(VLblNceDistributional, self) \
                ._embed_word_indices(indices, embeddings)
        except ValueError:
            concatenated_input = debug_print(indices.flatten(),
                                             "concatenated_input")

            # Commented till new version of Theano is available
            #embedded = theano.sparse.basic.get_item_list(embeddings, concatenated_input)

            # Fix for current Theano's version: get rows of embeddings matrix
            #via multiplying it with other sparse matrix
            data = debug_print(
                T.ones_like(T.arange(concatenated_input.shape[0])), 'data')
            ind = debug_print(concatenated_input, 'ind')
            indptr = debug_print(T.arange(concatenated_input.shape[0] + 1),
                                 'indptr')
            shape = debug_print(
                T.stack(concatenated_input.shape[0], embeddings.shape[0]),
                'shape')
            mult1 = debug_print(
                theano.sparse.basic.CSR(data, ind, indptr, shape), 'mult1')

            embedded = debug_print(
                theano.sparse.basic.structured_dot(mult1, embeddings),
                'embedded')

        return embedded
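
# Illustrative scipy sketch of the workaround above (not the original helper):
# a CSR matrix with a single 1 per row, at column indices[i], selects the
# requested embedding rows via a sparse-dense product.
import numpy as np
import scipy.sparse as sp

def embed_rows_via_sparse_dot(indices, embeddings):
    indices = np.asarray(indices, dtype=np.int32)
    n = len(indices)
    data = np.ones(n, dtype=embeddings.dtype)
    indptr = np.arange(n + 1)
    selector = sp.csr_matrix((data, indices, indptr),
                             shape=(n, embeddings.shape[0]))
    return selector.dot(embeddings)   # (len(indices), emb_size)
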
Example #34
    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch ; zero one
        loss over the size of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        y=debug_print(y,'y_true')
        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))   # wenpeng modified from "('y', target.type, 'y_pred', self.y_pred.type))"
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
Example #35
    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples of the minibatch ; zero one
        loss over the size of the minibatch

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        y=debug_print(y,'y_true')
        # check if y has same dimension of y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError('y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type))   # wenpeng modified from "('y', target.type, 'y_pred', self.y_pred.type))"
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
Example #36
    def __init__(self,
                 batch_size,
                 vocab_size,
                 left_context,
                 right_context,
                 emb_size,
                 k,
                 unigram,
                 l1_weight=0,
                 l2_weight=0,
                 nce_seed=2345):
        self.name = 'vLBL'
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.left_context = left_context
        self.right_context = right_context
        self.context_size = self.left_context + self.right_context
        self.emb_size = emb_size
        self.k = k
        self.unigram = unigram
        self.p_n = debug_print(
            theano.shared(value=unigram, name='noise_probab'), 'noise')

        self.l1_weight = l1_weight
        self.l2_weight = l2_weight
        self.nce_seed = nce_seed

        # Create context and target embeddings
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                          floatX, np.random.RandomState(1234))
        self.R = theano.shared(value=rand_values, name='R')
        rand_values = random_value_normal((self.vocab_size, self.emb_size),
                                          floatX, np.random.RandomState(4321))
        self.Q = theano.shared(value=rand_values, name='Q')
        b_values = zero_value((self.vocab_size, ), dtype=floatX)
        self.bias = theano.shared(value=b_values, name='bias')

        # The learning rates are created the first time set_learning_rate is
        # called.
        self.lr = None
 def __init__(self, X, word_dim, hidden_dim, U, W, b, bptt_truncate):
     self.hidden_dim = hidden_dim
     self.bptt_truncate = bptt_truncate
     
     def forward_prop_step(x_t, s_t1_prev):            
         # GRU Layer 1
          z_t1 = debug_print(T.nnet.sigmoid(U[0].dot(x_t) + W[0].dot(s_t1_prev) + b[0]), 'z_t1')
         r_t1 = debug_print(T.nnet.sigmoid(U[1].dot(x_t) + W[1].dot(s_t1_prev) + b[1]), 'r_t1')
         c_t1 = debug_print(T.tanh(U[2].dot(x_t) + W[2].dot(s_t1_prev * r_t1) + b[2]), 'c_t1')
         s_t1 = debug_print((T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev, 's_t1')
         return s_t1
     
     s, updates = theano.scan(
         forward_prop_step,
         sequences=X.transpose(1,0),
         truncate_gradient=self.bptt_truncate,
         outputs_info=dict(initial=T.zeros(self.hidden_dim)))
     
     self.output_matrix=debug_print(s.transpose(), 'GRU_Matrix_Input.output_matrix')
     self.output_vector_mean=T.mean(self.output_matrix, axis=1)
     self.output_vector_max=T.max(self.output_matrix, axis=1)
     self.output_vector_last=self.output_matrix[:,-1]
Example #38
 def __init__(self, rng, tensor_l, tensor_r, dim,kern, l_left_pad, l_right_pad, r_left_pad, r_right_pad): # length_l, length_r: valid lengths after conv
     #first reshape into matrix
     matrix_l=tensor_l.reshape((tensor_l.shape[2], tensor_l.shape[3]))
     matrix_r=tensor_r.reshape((tensor_r.shape[2], tensor_r.shape[3]))
     #start
     repeated_1=debug_print(T.repeat(matrix_l, dim, axis=1),'repeated_1') # add 10 because max_sent_length is only input for conv, conv will make size bigger
     repeated_2=debug_print(repeat_whole_tensor(matrix_r, dim, False),'repeated_2')
     '''
     #cosine attention   
     length_1=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_1), axis=0)),'length_1')
     length_2=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_2), axis=0)), 'length_2')
 
     multi=debug_print(repeated_1*repeated_2, 'multi')
     sum_multi=debug_print(T.sum(multi, axis=0),'sum_multi')
     
     list_of_simi= debug_print(sum_multi/(length_1*length_2),'list_of_simi')   #to get rid of zero length
     simi_matrix=debug_print(list_of_simi.reshape((length_l, length_r)), 'simi_matrix')
     
     '''
     #euclid, effective for wikiQA
     gap=debug_print(repeated_1-repeated_2, 'gap')
     eucli=debug_print(T.sqrt(1e-10+T.sum(T.sqr(gap), axis=0)),'eucli')
     simi_matrix=debug_print((1.0/(1.0+eucli)).reshape((dim, dim)), 'simi_matrix')
     W_bound = numpy.sqrt(6. / (dim + kern))
     self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=(kern, dim)),dtype=theano.config.floatX),borrow=True) #a weight matrix kern*kern
     matrix_l_attention=debug_print(T.dot(self.W, simi_matrix.T), 'matrix_l_attention')
     matrix_r_attention=debug_print(T.dot(self.W, simi_matrix), 'matrix_r_attention')
     #reset zero at both side
     left_zeros_l=T.set_subtensor(matrix_l_attention[:,:l_left_pad], T.zeros((matrix_l_attention.shape[0], l_left_pad), dtype=theano.config.floatX))
     right_zeros_l=T.set_subtensor(left_zeros_l[:,-l_right_pad:], T.zeros((matrix_l_attention.shape[0], l_right_pad), dtype=theano.config.floatX))
     left_zeros_r=T.set_subtensor(matrix_r_attention[:,:r_left_pad], T.zeros((matrix_r_attention.shape[0], r_left_pad), dtype=theano.config.floatX))
     right_zeros_r=T.set_subtensor(left_zeros_r[:,-r_right_pad:], T.zeros((matrix_r_attention.shape[0], r_right_pad), dtype=theano.config.floatX))       
     #combine with original input matrix
     self.new_tensor_l=T.concatenate([matrix_l,right_zeros_l], axis=0).reshape((tensor_l.shape[0], 2*tensor_l.shape[1], tensor_l.shape[2], tensor_l.shape[3])) 
     self.new_tensor_r=T.concatenate([matrix_r,right_zeros_r], axis=0).reshape((tensor_r.shape[0], 2*tensor_r.shape[1], tensor_r.shape[2], tensor_r.shape[3])) 
     
     self.params=[self.W]
 def __init__(self, rng, tensor_l, tensor_r, dim,kern, l_left_pad, l_right_pad, r_left_pad, r_right_pad): # length_l, length_r: valid lengths after conv
     #first reshape into matrix
     matrix_l=tensor_l.reshape((tensor_l.shape[2], tensor_l.shape[3]))
     matrix_r=tensor_r.reshape((tensor_r.shape[2], tensor_r.shape[3]))
     #start
     repeated_1=debug_print(T.repeat(matrix_l, dim, axis=1),'repeated_1') # add 10 because max_sent_length is only input for conv, conv will make size bigger
     repeated_2=debug_print(repeat_whole_tensor(matrix_r, dim, False),'repeated_2')
     '''
     #cosine attention   
     length_1=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_1), axis=0)),'length_1')
     length_2=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_2), axis=0)), 'length_2')
 
     multi=debug_print(repeated_1*repeated_2, 'multi')
     sum_multi=debug_print(T.sum(multi, axis=0),'sum_multi')
     
     list_of_simi= debug_print(sum_multi/(length_1*length_2),'list_of_simi')   #to get rid of zero length
     simi_matrix=debug_print(list_of_simi.reshape((length_l, length_r)), 'simi_matrix')
     
     '''
     #euclid, effective for wikiQA
     gap=debug_print(repeated_1-repeated_2, 'gap')
     eucli=debug_print(T.sqrt(1e-10+T.sum(T.sqr(gap), axis=0)),'eucli')
     simi_matrix=debug_print((1.0/(1.0+eucli)).reshape((dim, dim)), 'simi_matrix')
     W_bound = numpy.sqrt(6. / (dim + kern))
     self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=(kern, dim)),dtype=theano.config.floatX),borrow=True) #a weight matrix kern*kern
     matrix_l_attention=debug_print(T.dot(self.W, simi_matrix.T), 'matrix_l_attention')
     matrix_r_attention=debug_print(T.dot(self.W, simi_matrix), 'matrix_r_attention')
     #reset zero at both side
     left_zeros_l=T.set_subtensor(matrix_l_attention[:,:l_left_pad], T.zeros((matrix_l_attention.shape[0], l_left_pad), dtype=theano.config.floatX))
     right_zeros_l=T.set_subtensor(left_zeros_l[:,-l_right_pad:], T.zeros((matrix_l_attention.shape[0], l_right_pad), dtype=theano.config.floatX))
     left_zeros_r=T.set_subtensor(matrix_r_attention[:,:r_left_pad], T.zeros((matrix_r_attention.shape[0], r_left_pad), dtype=theano.config.floatX))
     right_zeros_r=T.set_subtensor(left_zeros_r[:,-r_right_pad:], T.zeros((matrix_r_attention.shape[0], r_right_pad), dtype=theano.config.floatX))       
     #combine with original input matrix
     self.new_tensor_l=T.concatenate([matrix_l,right_zeros_l], axis=0).reshape((tensor_l.shape[0], 2*tensor_l.shape[1], tensor_l.shape[2], tensor_l.shape[3])) 
     self.new_tensor_r=T.concatenate([matrix_r,right_zeros_r], axis=0).reshape((tensor_r.shape[0], 2*tensor_r.shape[1], tensor_r.shape[2], tensor_r.shape[3])) 
     
     self.params=[self.W]
Example #40
    def _embed_word_indices(self, indices, embeddings):
        """Embed all indexes using the given embeddings.

        Parameters
        ----------
        indices : ndarray
            indices of the items to embed using the given embeddings matrix
            Note: indices are flattened
        embeddings : ndarray (vocab_size x emb_size)
            embeddings matrix

        Returns
        -------
        ndarray (len(indices) x emb_size)
            embedded indices
        """
        try:
            embedded = super(VLblNceDistributional, self) \
                ._embed_word_indices(indices, embeddings)
        except ValueError:
            concatenated_input = debug_print(indices.flatten(), "concatenated_input")

            # Commented till new version of Theano is available
            #embedded = theano.sparse.basic.get_item_list(embeddings, concatenated_input)

            # Fix for current Theano's version: get rows of embeddings matrix
            #via multiplying it with other sparse matrix
            data = debug_print(T.ones_like(T.arange(concatenated_input.shape[0])), 'data')
            ind = debug_print(concatenated_input, 'ind')
            indptr = debug_print(T.arange(concatenated_input.shape[0] + 1), 'indptr')
            shape = debug_print(T.stack(concatenated_input.shape[0], embeddings.shape[0]), 'shape')
            mult1 = debug_print(theano.sparse.basic.CSR(data, ind, indptr, shape), 'mult1')

            embedded =  debug_print(theano.sparse.basic.structured_dot(mult1, embeddings), 'embedded')

        return embedded
Example #41
def compute_simi_feature(tensor, dim, para_matrix):
    odd_tensor = debug_print(tensor[0:tensor.shape[0]:2, :, :, :],
                             'odd_tensor')
    even_tensor = debug_print(tensor[1:tensor.shape[0]:2, :, :, :],
                              'even_tensor')
    even_tensor_after_translate = debug_print(
        T.dot(
            para_matrix,
            even_tensor.reshape((tensor.shape[2], dim * tensor.shape[0] / 2))),
        'even_tensor_after_translate')
    fake_even_tensor = debug_print(
        even_tensor_after_translate.reshape(
            (tensor.shape[0] / 2, tensor.shape[1], tensor.shape[2],
             tensor.shape[3])), 'fake_even_tensor')

    repeated_1 = debug_print(T.repeat(odd_tensor, dim, axis=3), 'repeated_1')
    repeated_2 = debug_print(repeat_whole_matrix(fake_even_tensor, dim, False),
                             'repeated_2')
    #repeated_2=T.repeat(even_tensor, even_tensor.shape[3], axis=2).reshape((tensor.shape[0]/2, tensor.shape[1], tensor.shape[2], tensor.shape[3]**2))
    length_1 = debug_print(1e-10 + T.sqrt(T.sum(T.sqr(repeated_1), axis=2)),
                           'length_1')
    length_2 = debug_print(1e-10 + T.sqrt(T.sum(T.sqr(repeated_2), axis=2)),
                           'length_2')

    multi = debug_print(repeated_1 * repeated_2, 'multi')
    sum_multi = debug_print(T.sum(multi, axis=2), 'sum_multi')

    list_of_simi = debug_print(sum_multi / (length_1 * length_2),
                               'list_of_simi')  #to get rid of zero length

    return list_of_simi.reshape((tensor.shape[0] / 2, tensor.shape[1],
                                 tensor.shape[3], tensor.shape[3]))
Example #42
def compute_simi_feature(tensor, dim, para_matrix):
    odd_tensor=debug_print(tensor[0:tensor.shape[0]:2,:,:,:],'odd_tensor')
    even_tensor=debug_print(tensor[1:tensor.shape[0]:2,:,:,:], 'even_tensor')
    even_tensor_after_translate=debug_print(T.dot(para_matrix, even_tensor.reshape((tensor.shape[2], dim*tensor.shape[0]/2))), 'even_tensor_after_translate')
    fake_even_tensor=debug_print(even_tensor_after_translate.reshape((tensor.shape[0]/2, tensor.shape[1], tensor.shape[2], tensor.shape[3])),'fake_even_tensor')

    repeated_1=debug_print(T.repeat(odd_tensor, dim, axis=3),'repeated_1')
    repeated_2=debug_print(repeat_whole_matrix(fake_even_tensor, dim, False),'repeated_2')
    #repeated_2=T.repeat(even_tensor, even_tensor.shape[3], axis=2).reshape((tensor.shape[0]/2, tensor.shape[1], tensor.shape[2], tensor.shape[3]**2))    
    length_1=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_1), axis=2)),'length_1')
    length_2=debug_print(1e-10+T.sqrt(T.sum(T.sqr(repeated_2), axis=2)), 'length_2')

    multi=debug_print(repeated_1*repeated_2, 'multi')
    sum_multi=debug_print(T.sum(multi, axis=2),'sum_multi')
    
    list_of_simi= debug_print(sum_multi/(length_1*length_2),'list_of_simi')   #to get rid of zero length
    
    return list_of_simi.reshape((tensor.shape[0]/2, tensor.shape[1], tensor.shape[3], tensor.shape[3]))
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[50,50], batch_size=10, window_width=3,
                    maxSentLength=64, emb_size=50, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33):# max_truncate can be 45
    maxSentLength=max_truncate+2*(window_width-1)
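    # with the defaults (max_truncate=33, window_width=3): maxSentLength = 33 + 2*(3-1) = 37,
    # i.e. the truncated sentence plus wide-convolution padding on both sides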
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/';
    rng = numpy.random.RandomState(23455)
#     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    
    
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad = datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
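    # row 0 is kept as an all-zero embedding (presumably the padding/unknown index)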
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lvector()
    right_l=T.lvector()
    left_r=T.lvector()
    right_r=T.lvector()
    length_l=T.lvector()
    length_r=T.lvector()
    norm_length_l=T.dvector()
    norm_length_r=T.dvector()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    cost_tmp=T.dscalar()




#     #GPU
#     index = T.iscalar()
#     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     #mts=T.dmatrix()
#     #wmf=T.dmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of one padded sentence
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
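    # wide ('full') convolution output length; with the defaults: 37 + 3 - 1 = 39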
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings of each sentence and reshape them to
    # (batch_size, emb_size, maxSentLength)
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_l_input')
    layer0_r_input = debug_print(embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_r_input')
    #paras:
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b]     
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 
    def loop (l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i):   
        l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor')
        r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor')
        
        addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1)
        addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1)
        cosine_addition=cosine(addition_l, addition_r)
        eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
        
        layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        
        cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
        eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
        
        attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
        
        l_max_attention=T.max(attention_matrix, axis=1)
        neighborsArgSorted = T.argsort(l_max_attention)
        kNeighborsArg = neighborsArgSorted[:3]#keep only the 3 columns with the smallest max attention
        ll = T.sort(kNeighborsArg).flatten() # selected column indices, in ascending order
    
    
        r_max_attention=T.max(attention_matrix, axis=0)
        neighborsArgSorted_r = T.argsort(r_max_attention)
        kNeighborsArg_r = neighborsArgSorted_r[:3]#keep only the 3 columns with the smallest max attention
        rr = T.sort(kNeighborsArg_r).flatten() # selected column indices, in ascending order
    
        
        l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
        r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
        

    
        layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
        layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    
        vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
        vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')
    
        
        
    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #     
        uni_cosine=cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
        eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #     
        len_l=norm_length_l_i.reshape((1,1))
        len_r=norm_length_r_i.reshape((1,1))  
    #     
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))  
    #     '''
        #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
        #length_gap=T.sqrt((len_l-len_r)**2)
        #layer3_input=mts
#         layer3_input_nn=T.concatenate([vec_l, vec_r,
#                                     cosine_addition, eucli_addition,
#     #                                 cosine_sent, eucli_sent,
#                                     uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
        
        output_i=T.concatenate([vec_l, vec_r,
                                    cosine_addition, eucli_addition,
    #                                 cosine_sent, eucli_sent,
                                    uni_cosine,eucli_1,
                                    mts_i.reshape((1,14)),
                                    len_l, len_r,
                                    extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1)    
        return output_i
    
    layer3_input, _ = theano.scan(fn=loop,
                            sequences=[left_l, right_l, layer0_l_input, left_r, right_r, layer0_r_input, mts, extra, norm_length_l, norm_length_r],
                            outputs_info=None,#[self.h0, None],
                            n_steps=batch_size)       
#l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i
#     x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.lmatrix('x_index_r')
#     y = T.lvector('y')  
#     left_l=T.lvector()
#     right_l=T.lvector()
#     left_r=T.lvector()
#     right_r=T.lvector()
#     length_l=T.lvector()
#     length_r=T.lvector()
#     norm_length_l=T.dvector()
#     norm_length_r=T.dvector()
#     mts=T.dmatrix()
#     extra=T.dmatrix()
#     discri=T.dmatrix()
#     cost_tmp=T.dscalar()

    
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    feature_size=2*nkerns[1]+2+2+14+2+9
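    # per-pair features produced by loop(): vec_l and vec_r (2*nkerns[1]), cosine/euclidean of the
    # additive sentence vectors (2), uni_cosine and eucli_1 of the GRU vectors (2), 14 MT features,
    # the two normalized lengths (2), and 9 extra rule features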
    layer3_input=layer3_input.reshape((batch_size, feature_size))
    layer3=LogisticRegression(rng, input=layer3_input, n_in=feature_size, n_out=3)

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index: index + batch_size],
            right_l: testRightPad_l[index: index + batch_size],
            left_r: testLeftPad_r[index: index + batch_size],
            right_r: testRightPad_r[index: index + batch_size],
            length_l: testLengths_l[index: index + batch_size],
            length_r: testLengths_r[index: index + batch_size],
            norm_length_l: normalized_test_length_l[index: index + batch_size],
            norm_length_r: normalized_test_length_r[index: index + batch_size],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#       
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
# 
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
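    # Note: this is the standard Adam update with the decay rates written as (1-b1)=0.9 and
    # (1-b2)=0.999; fix1 and fix2 are the bias-correction terms, giving the corrected step size
    # lr_t = lr * sqrt(1 - (1-b2)**t) / (1 - (1-b1)**t)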
    
    updates=Adam(cost=cost, params=params, lr=learning_rate)
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)

    train_model_predict = theano.function([index, cost_tmp], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    acc_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
#             if (batch_start+1)%1000==0:
#                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y)
                    test_features.append(layer3_input)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+
 
                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_features=numpy.concatenate(test_features, axis=0)
                test_y=numpy.concatenate(test_y, axis=0)
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                acc_nn=1-test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 
                #this step is risky: if the training set is large, it roughly doubles the training time
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                    train_y.append(y)
                    train_features.append(layer3_input)
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1
 
                train_features=numpy.concatenate(train_features, axis=0)
                train_y=numpy.concatenate(train_y, axis=0)
 
                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_count_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if results_lr[i]==test_y[i]:
                        corr_count_lr+=1
                acc_svm=corr_count*1.0/test_size
                acc_lr=corr_count_lr*1.0/test_size
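                # equivalently: acc_svm = numpy.mean(results == test_y), acc_lr = numpy.mean(results_lr == test_y)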
                if acc_svm > acc_max:
                    acc_max=acc_svm
                    best_epoch=epoch
                if acc_lr > acc_max:
                    acc_max=acc_lr
                    best_epoch=epoch
                if acc_nn > acc_max:
                    acc_max=acc_nn
                    best_epoch=epoch
                print  'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ',    acc_max , ' at epoch: ', best_epoch  

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, nkerns=[90,90], batch_size=1, window_width=2,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
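    # with the defaults (max_s_length=57, max_d_length=59, window_width=2):
    # maxSentLength = 57 + 2 = 59 and maxDocLength = 59 + 2 = 61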
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(rootPath+'vocab_DPNQ.txt', rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
#     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
#     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

# results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), 
#          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
#         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
#         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
# return results, line_control
    [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3]=train_data
    [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DPNQ_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # now, x is the index matrix, must be integer
#     index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
#     index_A4= T.lvector()
#     y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
#     len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
#     len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
#     left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
#     left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
#     right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
#     right_A4=T.lscalar()
        


    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings of the document sentences and the answer candidates
    # and reshape them into 4D tensors for the convolution layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
#     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
#     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
        
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
    layer0_para=[conv_W, conv_b] 
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    layer2_para=[conv2_W, conv2_b]
    high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # the highway layer requires nkerns[0] and nkerns[1] to have the same dimensionality
    highW_para=[high_W, high_b]
    params = layer2_para+layer0_para+highW_para#+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
            image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
#     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
#     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')
       

#     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, 
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A1+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
#     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, 
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    
    
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
#     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(rng, input=layer1_DA1.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
#     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
#     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')
    
    
#     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
#     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    
    #high-way
    
#     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1')
    transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2')
    transform_gate_DA3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3')
#     transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4')
#     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1')
    transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2')
#     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
#     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')
    
        
#     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1=(1.0-transform_gate_DA1)*layer1_DA1.output_D_sent_level_rep+transform_gate_DA1*layer3_DA1.output_D_doc_level_rep
    overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep
    overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep
#     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep
    
#     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1=(1.0-transform_gate_A1)*layer1_DA1.output_QA_sent_level_rep+transform_gate_A1*layer2_A1.output_sent_rep_Dlevel
    overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel
#     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
#     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel
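    # highway-style combination: each final representation is a gated interpolation
    #     overall = (1 - t) * sent_level_rep + t * doc_level_rep,
    # with t = sigmoid(high_W . sent_level_rep + high_b)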
    
    simi_sent_level1=debug_print(cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
#     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
#     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')
  
  
    simi_doc_level1=debug_print(cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
#     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
#     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    
    simi_overall_level1=debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1')
    simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

#     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
#     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2
 
    simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
#     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
#     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 
    


#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
#     #only use overall_simi    
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_overall_level1
#     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     cost=T.maximum(0.0, margin+simi_2-simi_1)
    simi_PQ=cosine(layer1_DA1.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    simi_NQ=cosine(layer1_DA2.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    #bad matching at overall level
#     simi_PQ=cosine(overall_A1, overall_D_A3)
#     simi_NQ=cosine(overall_A2, overall_D_A3)
    match_cost=T.maximum(0.0, margin+simi_NQ-simi_PQ) 
    cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)
    cost=cost#+match_cost
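    # ranking loss per level: max(0, margin + simi_negative - simi_positive); e.g. with margin=0.2,
    # simi_positive=0.7 and simi_negative=0.6 the hinge is max(0, 0.2 + 0.6 - 0.7) = 0.1, so the
    # negative answer is pushed at least `margin` below the positive one at every level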
#     posi_simi=simi_1
#     nega_simi=simi_2


    
    L2_reg =debug_print((high_W**2).sum()+3*(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    


    
    test_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: test_data_D[index], #a matrix
#             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
#             index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
#             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
#             len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
#             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
#             left_A4: test_leftPad_A4[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
#             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index]
#             right_A4: test_rightPad_A4[index]
            
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
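    # AdaGrad: each parameter is scaled by learning_rate / sqrt(accumulated squared gradients),
    # so parameters with a history of large gradients take smaller steps (no epsilon term is added here)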
 
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    
  
    train_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], updates=updates,
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data


        posi_train_sent=[]
        nega_train_sent=[]
        posi_train_doc=[]
        nega_train_doc=[]
        posi_train_overall=[]
        nega_train_overall=[]
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()
            minibatch_index=minibatch_index+1
            
            cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2= train_model(batch_start)
            posi_train_sent.append(simi_sent_level1)
            nega_train_sent.append(simi_sent_level2)
            posi_train_doc.append(simi_doc_level1)
            nega_train_doc.append(simi_doc_level2)
            posi_train_overall.append(simi_overall_level1)
            nega_train_overall.append(simi_overall_level2)
            if iter % n_train_batches == 0:
                corr_train_sent=compute_corr(posi_train_sent, nega_train_sent)
                corr_train_doc=compute_corr(posi_train_doc, nega_train_doc)
                corr_train_overall=compute_corr(posi_train_overall, nega_train_overall)
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' corr rate: '+str(corr_train_sent*300.0/train_size)+' '+str(corr_train_doc*300.0/train_size)+' '+str(corr_train_overall*300.0/train_size)

            
            if iter % validation_frequency == 0:
                posi_test_sent=[]
                nega_test_sent=[]
                posi_test_doc=[]
                nega_test_doc=[]
                posi_test_overall=[]
                nega_test_overall=[]
                for i in test_batch_start:
                    cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2=test_model(i)
                    posi_test_sent.append(simi_sent_level1)
                    nega_test_sent.append(simi_sent_level2)
                    posi_test_doc.append(simi_doc_level1)
                    nega_test_doc.append(simi_doc_level2)
                    posi_test_overall.append(simi_overall_level1)
                    nega_test_overall.append(simi_overall_level2)
                corr_test_sent=compute_corr(posi_test_sent, nega_test_sent)
                corr_test_doc=compute_corr(posi_test_doc, nega_test_doc)
                corr_test_overall=compute_corr(posi_test_overall, nega_test_overall)

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc_sent=corr_test_sent*1.0/(test_size/3.0)
                test_acc_doc=corr_test_doc*1.0/(test_size/3.0)
                test_acc_overall=corr_test_overall*1.0/(test_size/3.0)
                #test_acc=1-test_score
#                 print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
#                            'model %f %%') %
#                           (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent*100,test_acc_doc*100,test_acc_overall*100 
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 

  
                find_better=False
                if test_acc_sent > max_acc:
                    max_acc=test_acc_sent
                    best_epoch=epoch    
                    find_better=True     
                if test_acc_doc > max_acc:
                    max_acc=test_acc_doc
                    best_epoch=epoch    
                    find_better=True 
                if test_acc_overall > max_acc:
                    max_acc=test_acc_overall
                    best_epoch=epoch    
                    find_better=True         
                print '\t\t\tmax:', max_acc, '(at', best_epoch, ')'
                if find_better:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
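
# A minimal sketch of the pairwise counting assumed for compute_corr above; the
# real helper lives elsewhere in this repository, so its exact behaviour may
# differ. Each example contributes a (positive score, negative score) pair, and
# a pair counts as correct when the positive similarity is strictly larger.
def _count_correct_pairs_sketch(posi_scores, nega_scores):
    return sum(1 for p, n in zip(posi_scores, nega_scores) if p > n)
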
Example No. 45
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=33):  # max_truncate can be 45
    maxSentLength = max_truncate + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)
    #     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size = load_SICK_corpus(rootPath + 'vocab.txt',
                                            rootPath + 'train_plus_dev.txt',
                                            rootPath + 'test.txt',
                                            max_truncate,
                                            maxSentLength,
                                            entailment=True)
    mt_train, mt_test = load_mts_wikiQA(
        rootPath + 'Train_plus_dev_MT/concate_14mt_train.txt',
        rootPath + 'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test = load_extra_features(
        rootPath +
        'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt',
        rootPath +
        'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt'
    )
    discri_train, discri_test = load_extra_features(
        rootPath + 'train_plus_dev_discri_features_0.3.txt',
        rootPath + 'test_discri_features_0.3.txt')

    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]

    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)
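
    # Sketch of what load_word2vec_to_init is assumed to do (the helper comes
    # from this repo's utility module, so the real signature may differ):
    # overwrite rows of the random matrix with pretrained vectors read from a
    # file whose lines look like "word v1 v2 ... v300". `word2id` is a
    # hypothetical vocabulary mapping, not a variable of this function.
    def _init_from_pretrained_sketch(rand_matrix, path, word2id):
        for line in open(path):
            parts = line.strip().split()
            word = parts[0]
            vec = numpy.asarray(parts[1:], dtype=theano.config.floatX)
            if word in word2id and len(vec) == rand_matrix.shape[1]:
                rand_matrix[word2id[word]] = vec
        return rand_matrix
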

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    extra = T.dmatrix()
    discri = T.dmatrix()
    cost_tmp = T.dscalar()

    #     #GPU
    #     index = T.iscalar()
    #     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
    #     x_index_r = T.imatrix('x_index_r')
    #     y = T.ivector('y')
    #     left_l=T.iscalar()
    #     right_l=T.iscalar()
    #     left_r=T.iscalar()
    #     right_r=T.iscalar()
    #     length_l=T.iscalar()
    #     length_r=T.iscalar()
    #     norm_length_l=T.fscalar()
    #     norm_length_r=T.fscalar()
    #     #mts=T.dmatrix()
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # 2-D "image" shape: embedding dim x sentence length
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings of each sentence and reshape them into an
    # (emb_size, maxSentLength) matrix per side
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(
        embeddings[x_index_l.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_l_input')
    layer0_r_input = debug_print(
        embeddings[x_index_r.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_r_input')
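
    # Numpy sketch of the lookup-and-reshape above: rows of the embedding
    # matrix are gathered by word id and transposed into an
    # (emb_size, sentence_length) matrix, the "image" the GRU layers consume.
    # `emb_matrix` and `id_row` are illustrative arguments, not variables of
    # this function.
    def _embed_sentence_sketch(emb_matrix, id_row):
        sent = emb_matrix[numpy.asarray(id_row).flatten()]  # (length, emb_size)
        return sent.T                                       # (emb_size, length)
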

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%
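
    # Sketch of what the cosine(...) and EUCLID(...) helpers are assumed to
    # compute (they come from this repo's utility module, so details may
    # differ): cosine similarity of two vectors, and the euclidean distance
    # that is squashed into 1/(1+d) above.
    def _cosine_sketch(u, v):
        u = numpy.asarray(u, dtype='float64').ravel()
        v = numpy.asarray(v, dtype='float64').ravel()
        return u.dot(v) / (numpy.sqrt((u**2).sum()) * numpy.sqrt((v**2).sum()) + 1e-20)

    def _euclid_sketch(u, v):
        d = numpy.asarray(u, dtype='float64').ravel() - numpy.asarray(v, dtype='float64').ravel()
        return numpy.sqrt((d**2).sum() + 1e-20)
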

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%
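
    # Numpy sketch of the attentive pooling above, assuming
    # compute_simi_feature_matrix_with_matrix returns a similarity matrix S
    # with S[i, j] = sim(left hidden state i, right hidden state j). Each side
    # is pooled by a softmax over its best match on the other side.
    def _attentive_pool_sketch(H_l, H_r, S):
        # H_l: (dim, len_l), H_r: (dim, len_r), S: (len_l, len_r)
        def _softmax(x):
            e = numpy.exp(x - x.max())
            return e / e.sum()
        a_l = _softmax(S.max(axis=1))      # one weight per left position
        a_r = _softmax(S.max(axis=0))      # one weight per right position
        return H_l.dot(a_l), H_r.dot(a_r)  # pooled sentence representations
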

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[:3]  # keep the 3 positions with the smallest max attention
    ll = T.sort(kNeighborsArg).flatten()  # sort the kept indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[:3]  # keep the 3 positions with the smallest max attention
    rr = T.sort(kNeighborsArg_r).flatten()  # sort the kept indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,

            #                                 ibm_l_extended.reshape((1, nkerns[0])), ibm_r_extended.reshape((1, nkerns[0])), #2*nkerns[0]+
            #                                 cosine_ibm_extended, eucli_ibm_extended,
            mts,
            len_l,
            len_r,
            extra
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)

    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 14 + 2 + 9,
                                n_out=3)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            extra: extra_test[index:index + batch_size],
            discri: discri_test[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))
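
    # Numpy sketch of the AdaGrad rule implemented by the two updates above;
    # a small epsilon is added here for numerical safety, which the Theano
    # update above omits.
    def _adagrad_step_sketch(param, grad, acc, lr, eps=1e-20):
        acc_new = acc + grad**2
        param_new = param - lr * grad / (numpy.sqrt(acc_new) + eps)
        return param_new, acc_new
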


#     def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
#         updates = []
#         grads = T.grad(cost, params)
#         i = theano.shared(numpy.float64(0.))
#         i_t = i + 1.
#         fix1 = 1. - (1. - b1)**i_t
#         fix2 = 1. - (1. - b2)**i_t
#         lr_t = lr * (T.sqrt(fix2) / fix1)
#         for p, g in zip(params, grads):
#             m = theano.shared(p.get_value() * 0.)
#             v = theano.shared(p.get_value() * 0.)
#             m_t = (b1 * g) + ((1. - b1) * m)
#             v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
#             g_t = m_t / (T.sqrt(v_t) + e)
#             p_t = p - (lr_t * g_t)
#             updates.append((m, m_t))
#             updates.append((v, v_t))
#             updates.append((p, p_t))
#         updates.append((i, i_t))
#         return updates
#
#     updates=Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    train_model_predict = theano.function(
        [index, cost_tmp],
        [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    acc_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #         shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many mini-batches have been run so far, across epochs
            #             if (batch_start+1)%1000==0:
            #                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    test_loss, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') % (epoch, minibatch_index, n_train_batches,
                                       (1 - test_score) * 100.))
                acc_nn = 1 - test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                # this step is risky: if the training set is large, re-running prediction over all of it roughly doubles the training time
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start, 0.0)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results = clf.predict(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                corr_count = 0
                corr_count_lr = 0
                test_size = len(test_y)
                for i in range(test_size):
                    if results[i] == test_y[i]:
                        corr_count += 1
                    if results_lr[i] == test_y[i]:
                        corr_count_lr += 1
                acc_svm = corr_count * 1.0 / test_size
                acc_lr = corr_count_lr * 1.0 / test_size
                if acc_svm > acc_max:
                    acc_max = acc_svm
                    best_epoch = epoch
                if acc_lr > acc_max:
                    acc_max = acc_lr
                    best_epoch = epoch
                if acc_nn > acc_max:
                    acc_max = acc_nn
                    best_epoch = epoch
                print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ', acc_max, ' at epoch: ', best_epoch

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.085,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=7,
                    maxSentLength=60,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.00005,
                    update_freq=10,
                    norm_threshold=5.0):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_msr_corpus(rootPath + 'vocab.txt',
                                           rootPath + 'tokenized_train.txt',
                                           rootPath + 'tokenized_test.txt',
                                           maxSentLength)
    mtPath = '/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test = load_mts(mtPath + 'concate_15mt_train.txt',
                                 mtPath + 'concate_15mt_test.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int32')
    indices_train_r = T.cast(indices_train_r, 'int32')
    indices_test_l = T.cast(indices_test_l, 'int32')
    indices_test_r = T.cast(indices_test_r, 'int32')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    cost_tmp = 0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.imatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.imatrix('x_index_r')
    y = T.ivector('y')
    left_l = T.iscalar()
    right_l = T.iscalar()
    left_r = T.iscalar()
    right_r = T.iscalar()
    length_l = T.iscalar()
    length_r = T.iscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # 2-D "image" shape: embedding dim x sentence length
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1
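
    # Quick numpy check (illustrative only) of the "wide" convolution length
    # used above: full-mode 1-D convolution of a length-n signal with a
    # length-w filter yields n + w - 1 outputs.
    assert len(numpy.convolve(numpy.ones(ishape[1]),
                              numpy.ones(filter_size[1]),
                              mode='full')) == length_after_wideConv
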

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings and reshape them into a 4D tensor
    # (batch_size, 1, emb_size, maxSentLength), compatible with the convolution layer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_size[0],
                                                    filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng,
                                    input=layer0_l_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_r = Conv_with_input_para(rng,
                                    input=layer0_r_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para = [conv_W, conv_b]

    layer1 = Average_Pooling(rng,
                             input_l=layer0_l_output,
                             input_r=layer0_r_output,
                             kern=nkerns[0],
                             left_l=left_l,
                             right_l=right_l,
                             left_r=left_r,
                             right_r=right_r,
                             length_l=length_l + filter_size[1] - 1,
                             length_r=length_r + filter_size[1] - 1,
                             dim=maxSentLength + filter_size[1] - 1,
                             window_size=window_width,
                             maxSentLength=maxSentLength)

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_size[1]))
    layer2_l = Conv_with_input_para(rng,
                                    input=layer1.output_tensor_l,
                                    image_shape=(batch_size, 1, nkerns[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[1], 1, nkerns[0],
                                                  filter_size[1]),
                                    W=conv2_W,
                                    b=conv2_b)
    layer2_r = Conv_with_input_para(rng,
                                    input=layer1.output_tensor_r,
                                    image_shape=(batch_size, 1, nkerns[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[1], 1, nkerns[0],
                                                  filter_size[1]),
                                    W=conv2_W,
                                    b=conv2_b)
    layer2_para = [conv2_W, conv2_b]

    layer3 = Average_Pooling_for_batch1(rng,
                                        input_l=layer2_l.output,
                                        input_r=layer2_r.output,
                                        kern=nkerns[1],
                                        left_l=left_l,
                                        right_l=right_l,
                                        left_r=left_r,
                                        right_r=right_r,
                                        length_l=length_l + filter_size[1] - 1,
                                        length_r=length_r + filter_size[1] - 1,
                                        dim=maxSentLength + filter_size[1] - 1)

    layer3_out = debug_print(layer3.output_simi, 'layer3_out')

    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)

    sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    '''
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r))  #25.2%

    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer4_input = T.concatenate(
        [mts, eucli_1, layer1.output_eucli, layer3_out, len_l, len_r],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4 = LogisticRegression(rng,
                                input=layer4_input,
                                n_in=15 + 3 + 2,
                                n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer4.W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()
    cost_this = debug_print(layer4.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')

    test_model = theano.function(
        [index], [layer4.errors(y), layer4.y_pred],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params + layer2_para + layer0_para  # + layer1.params

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, layer4.errors(y), layer4_input],
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer4.errors(y)],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        for batch_start in train_batch_start:
            # iter counts how many mini-batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            if iter % update_freq != 0:
                cost_ij, error_ij = train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp += cost_ij
                error_sum += error_ij
            else:
                cost_average, error_ij, layer3_input = train_model(batch_start)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum = 0
                cost_tmp = 0  #reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                for i in test_batch_start:
                    test_loss, pred_y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                    'model %f %%') % (epoch, minibatch_index, n_train_batches,
                                      test_score * 100.))
                '''
                #print 'validating & testing...'
                # compute zero-one loss on validation set
                validation_losses = []
                for i in dev_batch_start:
                    time.sleep(0.5)
                    validation_losses.append(validate_model(i))
                #validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
            '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example No. 47
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=2000,
                    nkerns=[50],
                    batch_size=10,
                    window_width=3,
                    maxSentLength=1050,
                    emb_size=50,
                    hidden_size=200,
                    margin=0.5):
    #def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7],
    #                    L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1):

    ibmPath = '/mounts/data/proj/wenpeng/Dataset/insuranceQA/'

    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_ibm_corpus(ibmPath + 'vocabulary',
                                           ibmPath + 'train.txt',
                                           ibmPath + 'dev.txt', maxSentLength)

    indices_train, trainLengths, trainLeftPad, trainRightPad = datasets[0]
    #print trainY.eval().shape[0]
    indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1]

    n_train_batches = indices_train.shape[0] / (
        batch_size * 4
    )  #note that we consider 4 lines as an example in training
    n_valid_batches = indices_dev.shape[0] / (batch_size * 4)

    train_batch_start = list(numpy.arange(n_train_batches) * (batch_size * 4))
    dev_batch_start = list(numpy.arange(n_valid_batches) * (batch_size * 4))

    indices_train_theano = theano.shared(numpy.asarray(
        indices_train, dtype=theano.config.floatX),
                                         borrow=True)
    indices_dev_theano = theano.shared(numpy.asarray(
        indices_dev, dtype=theano.config.floatX),
                                       borrow=True)
    indices_train_theano = T.cast(indices_train_theano, 'int32')
    indices_dev_theano = T.cast(indices_dev_theano, 'int32')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(1e-50 + numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values)
    embeddings = theano.shared(value=rand_values)

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix(
        'x_index')  # now, x is the index matrix, must be integer
    #left=T.ivector('left')
    #right=T.ivector('right')

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # 2-D "image" shape: embedding dim x sentence length
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings and reshape them into a 4D tensor
    # (batch_size*4, 1, emb_size, maxSentLength), compatible with the convolution layer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_input = embeddings[x_index.flatten()].reshape(
        ((batch_size * 4), maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0 = Conv(rng,
                  input=layer0_input,
                  image_shape=((batch_size * 4), 1, ishape[0], ishape[1]),
                  filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    layer0_out = debug_print(layer0.output, 'layer0_out')

    layer1 = Average_Pooling(rng,
                             input=layer0_out,
                             length_last_dim=length_after_wideConv,
                             kern=nkerns[0])

    layer1_out = debug_print(
        layer1.output.reshape((batch_size * 2, nkerns[0] * 2)), 'layer1_out')

    layer2 = HiddenLayer(rng,
                         input=layer1_out,
                         n_in=nkerns[0] * 2,
                         n_out=hidden_size,
                         activation=T.tanh)
    layer3 = HiddenLayer(rng,
                         input=layer2.output,
                         n_in=hidden_size,
                         n_out=1,
                         activation=T.tanh)

    posi_score = layer3.output[0:layer3.output.shape[0]:2, :]
    nega_score = layer3.output[1:layer3.output.shape[0]:2, :]
    cost = T.maximum(0, margin - T.sum(posi_score - nega_score))
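
    # Sketch of the margin ranking objective above: the positive answer's score
    # should beat the negative answer's score by at least `margin`; otherwise a
    # linear penalty is paid.
    def _margin_ranking_loss_sketch(pos_scores, neg_scores, margin):
        gap = numpy.sum(numpy.asarray(pos_scores) - numpy.asarray(neg_scores))
        return max(0.0, margin - gap)
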

    #cost = layer3.negative_log_likelihood(y)
    # output a list of score
    dev_model = theano.function(
        [index],
        layer3.output.flatten(),
        givens={x_index: indices_dev_theano[index:index + (batch_size * 4)]})
    # create a list of all model parameters to be fit by gradient descent

    params = layer3.params + layer2.params + layer1.params + layer0.params

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-20)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost],
        updates=updates,
        givens={x_index: indices_train_theano[index:index + (batch_size * 4)]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches / 5, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        for batch_start in train_batch_start:
            # iter counts how many mini-batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            cost_ij = train_model(batch_start)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' cost: ' + str(
                    cost_ij)
            if iter % validation_frequency == 0:
                dev_scores = []
                for i in dev_batch_start:
                    dev_scores += list(dev_model(i))

                acc_dev = compute_acc(devY, dev_scores)
                print(
                    ('\t\t\t\tepoch %i, minibatch %i/%i, dev acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, acc_dev * 100.))
                '''
                #print 'validating & testing...'
                # compute zero-one loss on validation set
                validation_losses = []
                for i in dev_batch_start:
                    time.sleep(0.5)
                    validation_losses.append(validate_model(i))
                #validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
            '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemplo n.º 48
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5],
                    emb_size=50, char_emb_size=4, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, 
                    neg_all=100, train_size=200, test_size=200, mark='_forfun'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_train(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len

    
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     
    train_pos_entity_char=train_data[0]
    train_pos_entity_des=train_data[1]
    train_relations=train_data[2]
    train_entity_char_lengths=train_data[3]
    train_entity_des_lengths=train_data[4]
    train_relation_lengths=train_data[5]
    train_mention_char_ids=train_data[6]
    train_remainQ_word_ids=train_data[7]
    train_mention_char_lens=train_data[8]
    train_remainQ_word_len=train_data[9]
    train_entity_scores=train_data[10]

    test_pos_entity_char=test_data[0]
    test_pos_entity_des=test_data[1]
    test_relations=test_data[2]
    test_entity_char_lengths=test_data[3]
    test_entity_des_lengths=test_data[4]
    test_relation_lengths=test_data[5]
    test_mention_char_ids=test_data[6]
    test_remainQ_word_ids=test_data[7]
    test_mention_char_lens=test_data[8]
    test_remainQ_word_len=test_data[9]
    test_entity_scores=test_data[10]
# 
#     test_pos_entity_char=test_data[0]       #matrix, each row for one example, all head and tail entities, iteratively: 40*2*51
#     test_pos_entity_des=test_data[1]        #matrix, each row for an example: 20*2*51
#     test_relations=test_data[2]             #matrix, each row for an example: 5*51
#     test_entity_char_lengths=test_data[3]   #matrix, each row for an example: 3*2*51  (three values for one entity)
#     test_entity_des_lengths=test_data[4]    #matrix, each row for an example: 3*2*51  (three values for one entity)
#     test_relation_lengths=test_data[5]      #matrix, each row for an example: 3*51
#     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
#     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
#     test_mention_char_lens=test_data[8]     #matrix, three values for a mention: 3
#     test_remainQ_word_len=test_data[9]      #matrix, three values for the remaining question: 3
    

    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if sum(train_sizes)/len(train_sizes)!=train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes)/len(test_sizes)!=test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
    indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)   
    indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) 
    
#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      
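    # Both embedding tables reserve row 0 as an all-zero vector so that padding
    # index 0 contributes nothing; the word table is additionally initialised
    # from pre-trained vectors via load_word2vec_to_init, while the character
    # table keeps its random normal initialisation.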

    
    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices=T.lvector()
    
    ent_char_ids_M = T.lmatrix()   
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()  
    men_lens_M=T.lmatrix()
    rel_word_ids_M=T.lmatrix()
    rel_word_lens_M=T.lmatrix()
    desH_word_ids_M=T.lmatrix()
    desH_word_lens_M=T.lmatrix()
#     desT_word_ids_M=T.lmatrix()
#     desT_word_lens_M=T.lmatrix()
    q_word_ids_M=T.lmatrix()
    q_word_lens_M=T.lmatrix()
    ent_scores=T.dvector()

#max_char_len, max_des_len, max_relation_len, max_Q_len
#     ent_men_ishape = (char_emb_size, max_char_len)  # shape of the character-level input maps
#     rel_ishape=(emb_size, max_relation_len)
#     des_ishape=(emb_size, max_des_len)
#     q_ishape=(emb_size, max_Q_len)
    
    filter_size=(emb_size,window_width[0])
    char_filter_size=(char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b]
    char_conv_W_into_matrix=char_conv_W.reshape((char_conv_W.shape[0], char_conv_W.shape[2]*char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix=q_desH_conv_W.reshape((q_desH_conv_W.shape[0], q_desH_conv_W.shape[2]*q_desH_conv_W.shape[3]))
#     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f,
                       desH_word_lens_f,
                       men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        

#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
#         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng, input=desH_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
#         #q_desT
#         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_Q_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
#         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_des_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
    #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
    #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')
        
        
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2])
        
#         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, 
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, 
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
        
        
        q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        
#         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
#         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])    
        
        
        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0
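        # The triple score averages three cosine similarities: entity vs.
        # mention (character level), question vs. relation, and (down-weighted
        # by 0.1) question vs. head-entity description.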

#                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M,
                   desH_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])
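    # theano.scan maps SimpleQ_matches_Triple over the rows of the candidate
    # matrices, producing one similarity score per candidate triple; the
    # ranking loss below treats the first row as the positive (gold) triple
    # and the remaining rows as negatives.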
    
    simi_list+=0.5*ent_scores
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.mean(loss_simi_list)
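    # Margin ranking loss over the negatives:
    #     loss_simi = mean_k( max(0, margin - s_pos + s_neg_k) )
    # every negative triple is pushed at least `margin` below the positive one.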

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(char_conv_W_into_matrix)+Diversify_Reg(q_rel_conv_W_into_matrix)+Diversify_Reg(q_desH_conv_W_into_matrix)
    cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg
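    # Total objective: ranking loss plus L2 regularisation on the embeddings
    # and convolution filters, plus a Diversify_Reg term on the flattened
    # filter matrices, which presumably penalises overly similar filters.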
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    



    test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M,
                                  q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore')
#           givens={
#             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),  
#             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
#             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),  
#             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
#             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),  
#             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
#             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), 
#             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
# #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
# #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
#             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), 
#             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
#             ent_scores : test_entity_scores[index]},
                                  
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size, dtype=theano.config.floatX)))))   #AdaGrad
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size, dtype=theano.config.floatX)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 
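    # Same AdaGrad scheme as elsewhere in this file, with one twist: for the
    # two embedding tables, T.set_subtensor resets row 0 to zeros after the
    # update so the padding vector stays exactly zero throughout training.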
  
    train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
          givens={
            ent_char_ids_M : indices_train_pos_entity_char[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            ent_lens_M : indices_train_entity_char_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M : indices_train_mention_char_ids[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            men_lens_M : indices_train_mention_char_lens[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),  
            rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M : indices_train_pos_entity_des[index].reshape((neg_all, max_des_len))[chosed_indices].reshape((train_neg_size, max_des_len)), 
            desH_word_lens_M : indices_train_entity_des_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
#             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
#             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
            q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), 
            q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores : indices_train_entity_scores[index][chosed_indices]
            
            }, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0


        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
 
            minibatch_index=minibatch_index+1
            #print batch_start
            sample_indices=[0]+random.sample(range(1, neg_all), train_neg_size-1)
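            # sample_indices keeps the gold triple at position 0 and adds
            # train_neg_size-1 negatives drawn uniformly without replacement
            # from the remaining neg_all-1 candidates.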
            loss_simi_i, cost_i= train_model(batch_start, sample_indices)
#             if batch_start%1==0:
#                 print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
#                 store_model_to_file(rootPath, params)
 
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#             
            if iter % n_train_batches == 0:
                 
                test_loss=[]
                succ=0
                for i in range(test_size):
#                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))  
                    test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int64').reshape((length_per_example_test[i], max_relation_len))  
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape((length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int64').reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX)
             
             
             
             
                                
                    loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
                                                       test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
#                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1
#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ=succ*1.0/test_size
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=0.2):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DSSSS(
        rootPath + 'vocab_DSSSS.txt', rootPath +
        'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        max_s_length, maxSentLength,
        maxDocLength)  #vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_data_A2, train_data_A3,
        train_data_A4, train_Label, train_Length_D, train_Length_D_s,
        train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2,
        train_leftPad_A3, train_leftPad_A4, train_rightPad_D,
        train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_A1, test_data_A2, test_data_A3, test_data_A4,
        test_Label, test_Length_D, test_Length_D_s, test_Length_A1,
        test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D,
        test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
        test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # the document is an index matrix, must be integer
    #     index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into 4D tensors
    # compatible with the convolution layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A2 = Conv_with_input_para(
        rng,
        input=layer0_A2_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A3 = Conv_with_input_para(
        rng,
        input=layer0_A3_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A4 = Conv_with_input_para(
        rng,
        input=layer0_A4_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA2 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A2_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A2,
                                      right_r=right_A2,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A2 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA3 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A3_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A3,
                                      right_r=right_A3,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A3 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA4 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A4_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A4,
                                      right_r=right_A4,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A4 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA2 = Conv_with_input_para(
        rng,
        input=layer1_DA2.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA3 = Conv_with_input_para(
        rng,
        input=layer1_DA3.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA4 = Conv_with_input_para(
        rng,
        input=layer1_DA4.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA2.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA3.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A4 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA4.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel = debug_print(
        layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel = debug_print(
        layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    layer2_A4_output_sent_rep_Dlevel = debug_print(
        layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA2 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA2.output,
        input_r=layer2_A2_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA3 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA3.output,
        input_r=layer2_A3_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA4 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA4.output,
        input_r=layer2_A4_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_DA2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b),
        'transform_gate_DA2')
    transform_gate_DA3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b),
        'transform_gate_DA3')
    transform_gate_DA4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b),
        'transform_gate_DA4')
    #     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    transform_gate_A2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b),
        'transform_gate_A2')
    transform_gate_A3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b),
        'transform_gate_A3')
    transform_gate_A4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b),
        'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_D_A2 = (
        1.0 - transform_gate_DA2
    ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep
    overall_D_A3 = (
        1.0 - transform_gate_DA3
    ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep
    overall_D_A4 = (
        1.0 - transform_gate_DA4
    ) * layer1_DA4.output_D_sent_level_rep + transform_gate_DA4 * layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    overall_A2 = (
        1.0 - transform_gate_A2
    ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel
    overall_A3 = (
        1.0 - transform_gate_A3
    ) * layer1_DA3.output_QA_sent_level_rep + transform_gate_A3 * layer2_A3.output_sent_rep_Dlevel
    overall_A4 = (
        1.0 - transform_gate_A4
    ) * layer1_DA4.output_QA_sent_level_rep + transform_gate_A4 * layer2_A4.output_sent_rep_Dlevel
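    # Highway-style combination of the two granularities: with the transform
    # gate t = sigmoid(high_W . sentence_level_rep + high_b), each
    # representation is mixed as
    #     overall = (1 - t) * sentence_level_rep + t * higher_level_rep
    # applied to the document side (sentence-level vs. doc-level) and to each
    # answer A1..A4 (sentence-level vs. its D-level convolution output).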

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2 = debug_print(
        cosine(layer1_DA2.output_D_sent_level_rep,
               layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    simi_sent_level3 = debug_print(
        cosine(layer1_DA3.output_D_sent_level_rep,
               layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    simi_sent_level4 = debug_print(
        cosine(layer1_DA4.output_D_sent_level_rep,
               layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2 = debug_print(
        cosine(layer3_DA2.output_D_doc_level_rep,
               layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    simi_doc_level3 = debug_print(
        cosine(layer3_DA3.output_D_doc_level_rep,
               layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    simi_doc_level4 = debug_print(
        cosine(layer3_DA4.output_D_doc_level_rep,
               layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2),
                                      'simi_overall_level2')
    simi_overall_level3 = debug_print(cosine(overall_D_A3, overall_A3),
                                      'simi_overall_level3')
    simi_overall_level4 = debug_print(cosine(overall_D_A4, overall_A4),
                                      'simi_overall_level4')

    simi_1 = simi_overall_level1  #+simi_sent_level1+simi_doc_level1
    simi_2 = simi_overall_level2  #+simi_sent_level2+simi_doc_level2
    simi_3 = simi_overall_level3  #+simi_sent_level3+simi_doc_level3
    simi_4 = simi_overall_level4  #+simi_sent_level4+simi_doc_level4
    #     simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    #     simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)+T.maximum(0.0, margin+simi_3-simi_1)+T.maximum(0.0, margin+simi_4-simi_1)
    cost12 = T.maximum(
        0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level2 - simi_overall_level1)
    cost13 = T.maximum(
        0.0, margin + simi_sent_level3 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level3 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level3 - simi_overall_level1)
    cost14 = T.maximum(
        0.0, margin + simi_sent_level4 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level4 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)
    cost = cost12 + cost13 + cost14
    posi_simi = T.max([simi_sent_level1, simi_doc_level1, simi_overall_level1])
    nega_simi = T.max([
        simi_sent_level2, simi_doc_level2, simi_overall_level2,
        simi_sent_level3, simi_doc_level3, simi_overall_level3,
        simi_sent_level4, simi_doc_level4, simi_overall_level4
    ])
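    # Ranking objective over the three wrong answers: for k in {2, 3, 4} the
    # correct answer A1 must beat A_k by `margin` at the sentence, document
    # and overall (highway-combined) levels,
    #     cost1k = sum_level max(0, margin + simi_level_k - simi_level_1)
    # posi_simi / nega_simi take the maximum similarity of the correct and of
    # the wrong answers and are returned alongside the cost by the train/test
    # functions.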

    L2_reg = debug_print(
        (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))
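
    # Hedged note: this is plain AdaGrad -- the per-dimension step is
    # learning_rate * grad / sqrt(sum of squared grads so far); a later example in this
    # file uses T.sqrt(acc + 1e-9) instead, which avoids a division by zero for
    # dimensions that have only ever seen exactly-zero gradients.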


#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))

    train_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        corr_train = 0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1

            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' corr rate: ' + str(
                            corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #50
    def _get_gradients_adagrad(self, J):
        """Get the AdaGrad gradients and squared gradients updates.

        The returned gradients still need to be multiplied with the general
        learning rate.

        Parameters
        ----------
        J : theano variable
            cost

        Returns
        -------
        theano variable
            gradients that are adapted by the AdaGrad algorithm
        theano variable
            updated sum of squares for all previous steps
        """
        grads = T.grad(J, [self.__dict__[self.updatable_parameters[i]]
                for i in xrange(len(self.updatable_parameters))])

        for i, _ in enumerate(grads):
            grads[i] = debug_print(grads[i], 'grads_' + self.updatable_parameters[i])

        updated_squares = dict()

        # Add squared gradient to the squared gradient matrix for AdaGrad and
        # recalculate the gradient.
        for i, p in enumerate(self.updatable_parameters):

            # We need to handle sparse gradient variables differently
            if isinstance(grads[i], sparse.SparseVariable):
                # Add the sqares to the matrix
                power = debug_print(sparse.structured_pow(grads[i], 2.), 'pow_' + p)
                # Remove zeros (might happen when squaring near zero values)
                power = sparse.remove0(power)
                updated_squares[p] = self.__dict__['adagrad_matrix_' + p] + power

                # Get only those squares that will be altered, for all others we
                # don't have gradients, i.e., we don't need to consider them at
                # all.
                sqrt_matrix = sparse.sp_ones_like(power)
                sqrt_matrix = debug_print(updated_squares[p] * sqrt_matrix, 'adagrad_squares_subset_' + p)

                # Take the square root of the matrix subset.
                sqrt_matrix = debug_print(sparse.sqrt(sqrt_matrix), 'adagrad_sqrt_' + p)
                # Calc 1. / the square root.
                sqrt_matrix = debug_print(sparse.structured_pow(sqrt_matrix, -1.), 'adagrad_pow-1_' + p)
                grads[i] = sparse.mul(grads[i], sqrt_matrix)
            else:
                power = debug_print(T.pow(grads[i], 2.), 'pow_' + p)
                updated_squares[p] = self.__dict__['adagrad_matrix_' + p] + power

                # Call sqrt only for those items that are non-zero.
                denominator = T.switch(T.neq(updated_squares[p], 0.0),
                        T.sqrt(updated_squares[p]),
                        T.ones_like(updated_squares[p], dtype=floatX))
                grads[i] = T.mul(grads[i], 1. / denominator)

            updated_squares[p] = debug_print(updated_squares[p], 'upd_squares_' + p)

        for i, _ in enumerate(grads):
            grads[i] = debug_print(grads[i], 'grads_updated_' + self.updatable_parameters[i])

        return grads, updated_squares
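
# Hedged numpy sketch of the dense AdaGrad branch implemented above: squared gradients
# are accumulated per parameter and each new gradient is divided by the square root of
# that running sum (dimensions whose sum is still zero are left unscaled, mirroring the
# T.switch guard).  As in the method, the general learning rate is applied by the
# caller, not here.  Names are illustrative only.
def _adagrad_adapt_sketch(grad, squared_sum):
    import numpy
    squared_sum = squared_sum + grad ** 2
    denominator = numpy.where(squared_sum != 0.0, numpy.sqrt(squared_sum), 1.0)
    return grad / denominator, squared_sum
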
def evaluate_lenet5(learning_rate=0.09,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.00065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
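    # Hedged note: the two lines above widen the nominal lengths by (window_width - 1)
    # padding positions on each side, which is what a wide convolution with a filter of
    # width window_width needs so that border words receive as many filter applications
    # as interior words.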
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus(
        rootPath + 'vocab.txt', rootPath + 'mc500.train.tsv_standardlized.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt', max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    [
        train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
        train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
        train_rightPad_D, train_rightPad_D_s, train_rightPad_Q,
        train_rightPad_A
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
        test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A,
        test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)
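    # Hedged note: with the default batch_size = 1 each "batch" is a single
    # (document, question, answers) example, so train_batch_start is simply
    # [0, 1, ..., train_size - 1] and n_train_batches == train_size.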

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)
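    # Hedged note: row 0 of the embedding matrix is kept at zeros (presumably the padding
    # index); the remaining rows are randomly initialised and then overwritten with the
    # pretrained 300-d vectors listed in vocab_embs_300d.txt where available.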

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A = T.lvector()
    y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A = T.lscalar()

    #wmf=T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = embeddings[index_Q.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A_input = embeddings[index_A.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
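    # Shape walk-through for the lookups above (hedged, inferred from the reshapes):
    #   index_D          : (maxDocLength, maxSentLength) word ids
    #   after embedding  : (maxDocLength, maxSentLength, emb_size)
    #   after transpose  : (maxDocLength, emb_size, maxSentLength)
    #   after dimshuffle : (maxDocLength, 1, emb_size, maxSentLength), one "image" per sentence
    # index_Q and index_A go through the same pipeline with batch_size (= 1) rows.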

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    #     load_model_for_conv1([conv_W, conv_b])

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_Q = Conv_with_input_para(
        rng,
        input=layer0_Q_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A = Conv_with_input_para(
        rng,
        input=layer0_A_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output = debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A_output = debug_print(layer0_A.output, 'layer0_A.output')
    layer0_para = [conv_W, conv_b]

    layer1_DQ = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_Q_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_Q,
                                     right_r=right_Q,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_Q + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)
    layer1_DA = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_A_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_A,
                                     right_r=right_A,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_A + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    layer2_DQ = Conv_with_input_para(
        rng,
        input=layer1_DQ.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA = Conv_with_input_para(
        rng,
        input=layer1_DA.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    layer2_Q = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DQ.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_Q_output_sent_rep_Dlevel = debug_print(
        layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A_output_sent_rep_Dlevel = debug_print(
        layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel')
    layer2_para = [conv2_W, conv2_b]

    layer3_DQ = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DQ.output,
        input_r=layer2_Q_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA.output,
        input_r=layer2_A_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    transform_gate_DQ = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b),
        'transform_gate_DQ')
    transform_gate_DA = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b),
        'transform_gate_DA')
    transform_gate_Q = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b),
        'transform_gate_Q')
    transform_gate_A = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b),
        'transform_gate_A')
    highW_para = [high_W, high_b]

    overall_D_Q = debug_print(
        (1.0 - transform_gate_DQ) * layer1_DQ.output_D_sent_level_rep +
        transform_gate_DQ * layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A = (
        1.0 - transform_gate_DA
    ) * layer1_DA.output_D_sent_level_rep + transform_gate_DA * layer3_DA.output_D_doc_level_rep
    overall_Q = (
        1.0 - transform_gate_Q
    ) * layer1_DQ.output_QA_sent_level_rep + transform_gate_Q * layer2_Q.output_sent_rep_Dlevel
    overall_A = (
        1.0 - transform_gate_A
    ) * layer1_DA.output_QA_sent_level_rep + transform_gate_A * layer2_A.output_sent_rep_Dlevel
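
    # Hedged note: the four overall_* vectors above are highway combinations -- a learned
    # gate t = sigmoid(W x + b) in (0, 1) interpolates between the sentence-level
    # representation (carry path, weight 1 - t) and the document-level representation
    # (transform path, weight t), so the network can decide per dimension which view to trust.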

    simi_sent_level = debug_print(
        cosine(
            layer1_DQ.output_D_sent_level_rep +
            layer1_DA.output_D_sent_level_rep,
            layer1_DQ.output_QA_sent_level_rep +
            layer1_DA.output_QA_sent_level_rep), 'simi_sent_level')
    simi_doc_level = debug_print(
        cosine(
            layer3_DQ.output_D_doc_level_rep +
            layer3_DA.output_D_doc_level_rep,
            layer2_Q.output_sent_rep_Dlevel + layer2_A.output_sent_rep_Dlevel),
        'simi_doc_level')
    simi_overall_level = debug_print(
        cosine(overall_D_Q + overall_D_A, overall_Q + overall_A),
        'simi_overall_level')

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    layer4_input = debug_print(
        T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level],
                      axis=1),
        'layer4_input')  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4 = LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2)
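    # Hedged note: layer4 is a logistic regression over just the three similarity scores
    # concatenated in layer4_input (hence n_in=3); its two outputs are the match /
    # non-match classes consumed by negative_log_likelihood below.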

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer4.W**2).sum() + (high_W**2).sum() + (conv2_W**2).sum() +
        (conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer4.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    #
    #     [train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
    #                  train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A,
    #                 train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
    #                 train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data
    #     [test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
    #                  test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A,
    #                 test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
    #                 test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data
    #     index = T.lscalar()
    #     index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    #     index_Q = T.lvector()
    #     index_A= T.lvector()
    #
    #     y = T.lvector()
    #     len_D=T.lscalar()
    #     len_D_s=T.lvector()
    #     len_Q=T.lscalar()
    #     len_A=T.lscalar()
    #
    #     left_D=T.lscalar()
    #     left_D_s=T.lvector()
    #     left_Q=T.lscalar()
    #     left_A=T.lscalar()
    #
    #     right_D=T.lscalar()
    #     right_D_s=T.lvector()
    #     right_Q=T.lscalar()
    #     right_A=T.lscalar()
    #
    #
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.dscalar()

    test_model = theano.function(
        [index],
        [layer4.errors(y), layer4_input, y, layer4.prop_for_posi],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_Q: test_data_Q[index],
            index_A: test_data_A[index],
            y: test_Y[index:index + batch_size],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A: test_Length_A[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A: test_leftPad_A[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A: test_rightPad_A[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params + layer2_para + layer0_para + highW_para

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer4.errors(y), layer4_input, y],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        #         readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r')
        #         train_pairs=[]
        #         train_y=[]
        #         for line in readfile:
        #             tokens=line.strip().split('\t')
        #             listt=tokens[0]+'\t'+tokens[1]
        #             train_pairs.append(listt)
        #             train_y.append(tokens[2])
        #         readfile.close()
        #         writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w')
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             (batch_start * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij

            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(cost_average)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                test_prop = []
                for i in test_batch_start:
                    test_loss, layer3_input, y, posi_prop = test_model(i)
                    test_prop.append(posi_prop[0][0])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = compute_test_acc(test_y, test_prop)
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(
                    kernel='linear'
                )  #OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results = clf.decision_function(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.decision_function(test_features)

                acc_svm = compute_test_acc(test_y, results)
                acc_lr = compute_test_acc(test_y, results_lr)

                find_better = False
                if acc_svm > max_acc:
                    max_acc = acc_svm
                    best_epoch = epoch
                    find_better = True
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                if acc_lr > max_acc:
                    max_acc = acc_lr
                    best_epoch = epoch
                    find_better = True
                print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'


#                 if find_better==True:
#                     store_model_to_file(layer2_para, best_epoch)
#                     print 'Finished storing best conv params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #52
    def link(self, inputs):
        self.inputs = inputs
        h_indices = inputs[0]
        w_indices = inputs[1]

        if self.lr_adaptation_method == 'adagrad':
            self._create_adagrad_matrices()

        # Embed context and target words
        r_h = debug_print(self._calc_r_h(h_indices),
                'embed_context')
        q_w = debug_print(self._embed_target(w_indices), 'embed_target')

        # This is the actual, not the expected batch size. Since there might be
        # batches that are not complete, e.g., at the end of a dataset, this can
        # differ.
        batch_size = q_w.shape[0]

        # Calculate predicted target word embedding
        q_hat = debug_print(self._calc_q_hat(r_h), 'q_hat')

        noise_indices, p_n_noise = self._get_noise(batch_size)

        # Get the data part of Eq. 8.
        s_theta_data = debug_print(
                T.sum(q_hat * q_w, axis=1) + self.bias[w_indices],
                's_theta_data')
        p_n_data = debug_print(self.p_n[w_indices], 'p_n_data')
        delta_s_theta_data = debug_print(
                s_theta_data - T.log(self.k * p_n_data),
                'delta_s_theta_data')

        log_sigm_data = debug_print(T.log(T.nnet.sigmoid(delta_s_theta_data)),
                'log_sigm_data')

        # Get the data noise of Eq. 8.
        q_noise = debug_print(self._embed_noise(noise_indices), 'q_noise')
        q_hat_res = q_hat.reshape((batch_size, 1, self.emb_size))
        s_theta_noise = debug_print(
                T.sum(q_hat_res * q_noise, axis=2) + self.bias[noise_indices],
                's_theta_noise')
        delta_s_theta_noise = debug_print(s_theta_noise - T.log(self.k * p_n_noise),
                'delta_s_theta_noise')

        log_sigm_noise = debug_print(T.log(1 - T.nnet.sigmoid(delta_s_theta_noise)),
                'log_sigm_noise')
        sum_noise_per_example = debug_print(T.sum(log_sigm_noise, axis=1),
                'sum_noise_per_example')

        # Calc objective function (cf. Eq. 8 in [MniKav13])
        # [MniKav13] maximize, therefore we need to switch signs in order to
        # minimize
        J = debug_print(-T.mean(log_sigm_data) - T.mean(sum_noise_per_example),
                'J')
        regularization_cost = debug_print(self._calc_regularization_cost(),
                'regularization_cost')
        self.cost = debug_print(J + regularization_cost, 'overall_cost')
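
        # Hedged summary of the objective built above (cf. Eq. 8 in [MniKav13]): for a
        # data word w with score s_theta(w) the model maximizes
        #     log sigma(s_theta(w) - log(k * p_n(w)))
        #     + sum over the k noise samples w' of log(1 - sigma(s_theta(w') - log(k * p_n(w'))))
        # Theano minimizes, so the sign is flipped and the batch mean is taken to get J,
        # and the regularization term is added on top to give self.cost.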

        self.outputs = [h_indices, w_indices, q_hat, self.cost]
        self.outputs.extend(self.updatable_parameters)

        if self.lr_adaptation_method == 'adagrad':
            grads, updated_squares = self._get_gradients_adagrad(self.cost)

            # Update the running squared gradient for AdaGrad
            additional_updates = [(self.__dict__['adagrad_matrix_' + p],
                    updated_squares[p])
                    for p in self.updatable_parameters]
        else:
            grads = self._get_gradients(self.cost)
            additional_updates = []

        updates = []

        try:
            for i, p in enumerate(self.updatable_parameters):
                updates.append((self.__dict__[p],
                        self.__dict__[p] -
                        self.lr.get(p, self.lr['default']) * grads[i]))
        except KeyError:
            raise ValueError('Not all parameters or "default" specified in ' +
                    'the learning-rate parameter.')

        updates.extend(additional_updates)

        self.trainer = theano.function([h_indices, w_indices],
                [self.cost, h_indices, w_indices], updates=updates)

        # In prediction we have to normalize explicitly in order to receive a
        # real probability distribution.
        s_theta_all_w = T.dot(q_hat, self.Q.T) + self.bias

        self.predictor = theano.function([h_indices],
                [q_hat, s_theta_all_w])
        self.validator = theano.function([h_indices, w_indices],
                [self.cost, s_theta_all_w])
        return self.cost
Example #53
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=10000, emb_size=50,
                    margin=0.3, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=16450007, neg_size=60, test_neg_size=300,
                    comment=''):#L1Distance_
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/freebase-subsets/'
    rng = numpy.random.RandomState(1234)
#     triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,statistics=load_Train(triple_path+'freebase-FB5M2M-combined.txt', line_no, triple_path)
    train_h2t=statistics[0]
    train_t2h=statistics[1]
    train_r2t=statistics[2]
    train_r2h=statistics[3]
    train_r_replace_tail_prop=statistics[4]
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)

       


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
#     para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size), para_to_load)  #+'_hits10_63.616'
#     GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2]
#     GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2]
#     GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2]

#     w2v_entity_rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#    
#     w2v_relation_rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
# 
#     w2v_entity_rand_values=load_word2vec_to_init(w2v_entity_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_entityEmb50.txt')
#     w2v_relation_rand_values=load_word2vec_to_init(w2v_relation_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_relationEmb50.txt')
#     w2v_entity_rand_values=theano.shared(value=w2v_entity_rand_values, borrow=True)       
#     w2v_relation_rand_values=theano.shared(value=w2v_relation_rand_values, borrow=True)  
      
#     entity_E_ensemble=entity_E+norm_matrix(w2v_entity_rand_values)
#     relation_E_ensemble=relation_E+norm_matrix(w2v_relation_rand_values)
    
    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
    
        
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)
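    # Hedged example of the batching above: with the default batch_size = 10000 and
    # line_no = 16450007 there are 1645 full batches plus 7 leftover triples, so one extra
    # batch is started at line_no - batch_size = 16440007; it overlaps the previous batch
    # instead of padding, which keeps every batch exactly batch_size triples long.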

#     batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
#     batch_start=T.cast(batch_start, 'int64')   
    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    n_index_T = T.ltensor3('n_index_T')
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    
    dist_tail=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
    
    
    loss__tail_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
    loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) 
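    # Hedged reading of the two lines above: dist_tail scores each positive triple while
    # loss__tail_is scores its neg_size corrupted variants; reshaping dist_tail to a column
    # lets it broadcast over the negatives, so every entry of loss_tail_i is a hinge term
    # max(0, margin + d(positive) - d(negative)) that vanishes once the positive beats that
    # negative by at least the margin.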
#     loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is)     
#     loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is)    
    
    
#     loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is)   
#     binary_matrix_test=T.gt(loss_tail_i_test, 0)
#     sum_vector_test=T.sum(binary_matrix_test, axis=1)
#     binary_vector_hits10=T.gt(sum_vector_test, 10)
#     test_loss=T.sum(binary_vector_hits10)*1.0/batch_size  
#     loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) 
#     loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is)     
#     def neg_slice(neg_matrix):
#         dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
#         loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) 
#         loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) 
#         loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) 
#         return loss_tail_i, loss_relation_i, loss_head_i
#     
#     (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan(
#        neg_slice,
#        sequences=n_index_T,
#        outputs_info=None)  
    
    loss_tails=T.mean(T.sum(loss_tail_i, axis=1) )
#     loss_relations=T.mean(T.sum(loss_relation_i, axis=1) )
#     loss_heads=T.mean(T.sum(loss_head_i, axis=1) )
    loss=loss_tails#+loss_relations+loss_heads
    L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\
                      +(GRU_U** 2).sum()+(GRU_W** 2).sum(), 'L2_reg')
#     Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\
#         Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2])
    cost=loss+L2_weight*L2_loss#+div_reg*Div_loss
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     params_conv = [conv_W, conv_b]
    params_to_store=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9)))   #AdaGrad
        updates.append((acc_i, acc))    

   

#     grads = T.grad(cost, params)
#     updates = []
#     for param_i, grad_i in zip(params, grads):
#         updates.append((param_i, param_i - learning_rate * grad_i))   #AdaGrad 
        
    train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore')
#     test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore')
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
#     corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    best_train_loss=1000000
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
#         learning_rate/=epoch
#         print 'lr:', learning_rate
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        loss_sum=0.0
        for start in batch_start:
            if start%100000==0:
                print start, '...'
            pos_triples=triples[start:start+batch_size]
            all_negs=[]
#             count=0
            for pos_triple in pos_triples:
                neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size)
# #                 print 'neg_head_triples'
#                 neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3)
# #                 print 'neg_relation_triples'
#                 neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3)
#                 print 'neg_tail_triples'
                all_negs.append(neg_triples)
#                 print 'neg..', count
#                 count+=1
            neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2)
            loss, cost= train_model(pos_triples, neg_tensor)
            loss_sum+=loss
        loss_sum/=len(batch_start)
        print 'Training loss:', loss_sum, 'cost:', cost
        
        
#         loss_test=0.0
# 
#         for test_start in batch_start_test:
#             pos_triples=test_triples[test_start:test_start+batch_size]
#             all_negs=[]
#             for pos_triple in pos_triples:
#                 neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True)
#                 all_negs.append(neg_triples)
#                 
#             neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2)
#             loss_test+= test_model(pos_triples, neg_tensor)
#             
#             
#         loss_test/=n_batchs_test
#         print '\t\t\tUpdating epoch', epoch, 'finished! Test hits10:', 1.0-loss_test
        if loss_sum< best_train_loss:
            store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store)
#             store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store)
            best_train_loss=loss_sum
            print 'Finished storing best  params'
#             exit(0)
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DQAAAA(rootPath+'vocab_DQAAAA.txt', rootPath+'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath+'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test


    [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4]=train_data
    [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DQAAAA_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
    index_A4= T.lvector()
#     y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
    len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
    len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
    left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
    left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
    right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
    right_A4=T.lscalar()
        


    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    
        
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 
#     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
#     layer2_para=[conv2_W, conv2_b]
#     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
#     highW_para=[high_W, high_b]

    #load_model(params)
    
    
    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,W=W,b=b)
    layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)

    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output')
    layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')
    
    #before reasoning, do a GRU for doc: d
    U_d, W_d, b_d=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d_para=[U_d, W_d, b_d]
    layer_D_GRU = GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d,W=W_d,b=b_d,bptt_truncate=-1)
    
    #Reasoning Layer 1
    repeat_Q=debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0],1)), maxDocLength, axis=1)[:,:layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN=debug_print(T.concatenate([layer_D_GRU.output_matrix,repeat_Q], axis=0).transpose(), 'input_DNN')#each row is an example
    output_DNN1=HiddenLayer(rng, input=input_DNN, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN2=HiddenLayer(rng, input=output_DNN1.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out')
    U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para=[U_p, W_p, b_p] 
    pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1)
    translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1')


    #before reasoning, do a GRU for doc: d2
    U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d2_para=[U_d2, W_d2, b_d2]
    layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1)
    #Reasoning Layer 2
    repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example
    output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para2=[U_p2, W_p2, b_p2] 
    pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1)
    translated_Q2=debug_print(pooling2.output_vector_max, 'translated_Q2')
    

    QA1=T.concatenate([translated_Q2, layer0_A1_output], axis=0)
    QA2=T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3=T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4=T.concatenate([translated_Q2, layer0_A4_output], axis=0)
    
    W_HL,b_HL=create_HiddenLayer_para(rng, n_in=nkerns[0]*2, n_out=1)
    match_params=[W_HL,b_HL]
    QA1_match=HiddenLayer(rng, input=QA1, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA2_match=HiddenLayer(rng, input=QA2, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA3_match=HiddenLayer(rng, input=QA3, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA4_match=HiddenLayer(rng, input=QA4, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    
    
    
#     simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
#     simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')

    simi_overall_level1=debug_print(QA1_match.output[0], 'simi_overall_level1')
    simi_overall_level2=debug_print(QA2_match.output[0], 'simi_overall_level2')
    simi_overall_level3=debug_print(QA3_match.output[0], 'simi_overall_level3')
    simi_overall_level4=debug_print(QA4_match.output[0], 'simi_overall_level4')


#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
    #only use overall_simi    
    cost=T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level3-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level4-simi_overall_level1)
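    # The three hinge terms above form a pairwise ranking loss: the gold answer
    # A1 must score at least `margin` higher than each distractor A2/A3/A4; any
    # violated pair adds its margin gap to the cost.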
    
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    posi_simi=simi_overall_level1
    nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
#     #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_1
#     nega_simi=T.max([simi_2, simi_3, simi_4])


    
    L2_reg =debug_print((U**2).sum()+(W**2).sum()
                        +(U_p**2).sum()+(W_p**2).sum()
                        +(U_p2**2).sum()+(W_p2**2).sum()
                        +(U_d**2).sum()+(W_d**2).sum()
                        +(U_d2**2).sum()+(W_d2**2).sum()
                        +(output_DNN1.W**2).sum()+(output_DNN2.W**2).sum()
                        +(output_DNN3.W**2).sum()+(output_DNN4.W**2).sum()
                        +(W_HL**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    


    
    test_model = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: test_data_D[index], #a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
            
            }, on_unused_input='ignore')


    params = layer0_para+output_DNN1.params+output_DNN2.params+output_DNN3.params+output_DNN4.params+layer_pooling_para+layer_pooling_para2+match_params+layer_d_para+layer_d2_para
    
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)


#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))   
#         updates.append((acc_i, acc))      
 
    def AdaDelta_updates(parameters,gradients,rho,eps):
        # create variables to store intermediate updates
        gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
        deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
     
        # calculates the new "average" delta for the next iteration
        gradients_sq_new = [ rho*g_sq + (1-rho)*(g**2) for g_sq,g in zip(gradients_sq,gradients) ]
     
        # calculates the step in direction. The square root is an approximation to getting the RMS for the average value
        deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ]
     
        # calculates the new "average" deltas for the next step.
        deltas_sq_new = [ rho*d_sq + (1-rho)*(d**2) for d_sq,d in zip(deltas_sq,deltas) ]
     
        # collect everything into a single list of (variable, new value) update pairs
        gradient_sq_updates = zip(gradients_sq,gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq,deltas_sq_new)
        parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates   
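    # For reference, the same AdaDelta rule (Zeiler, 2012) written out for a
    # single NumPy parameter. This helper is illustrative only and is never
    # called in this script:
    #     E[g^2]_t  = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    #     delta_t   = sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #     E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
    #     x_t       = x_{t-1} - delta_t
    def AdaDelta_step_numpy(x, g, g_sq, dx_sq, rho=0.95, eps=1e-6):
        g_sq_new = rho * g_sq + (1 - rho) * g ** 2
        delta = numpy.sqrt(dx_sq + eps) / numpy.sqrt(g_sq_new + eps) * g
        dx_sq_new = rho * dx_sq + (1 - rho) * delta ** 2
        return x - delta, g_sq_new, dx_sq_new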
    
    updates=AdaDelta_updates(params, grads, decay, 1e-6)
  
    train_model = theano.function([index], [cost, posi_simi, nega_simi], updates=updates,
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data


        corr_train=0
        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()

            minibatch_index=minibatch_index+1
            
            cost_average, posi_simi, nega_simi= train_model(batch_start)
            if posi_simi>nega_simi:
                corr_train+=1
            
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' corr rate: '+str(corr_train*100.0/train_size)

            
            if iter % validation_frequency == 0:
                corr_test=0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi=test_model(i)
                    if posi_simi>nega_simi:
                        corr_test+=1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc=corr_test*1.0/test_size
                #test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 

  
                find_better=False
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch    
                    find_better=True             
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better==True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    def loop(l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i):
        l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor')
        r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor')
        
        addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1)
        addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1)
        cosine_addition=cosine(addition_l, addition_r)
        eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
        
        layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        
        cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
        eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
        
        attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
        
        l_max_attention=T.max(attention_matrix, axis=1)
        neighborsArgSorted = T.argsort(l_max_attention)
        kNeighborsArg = neighborsArgSorted[:3]  # keep the 3 positions with the smallest max attention
        ll = T.sort(kNeighborsArg).flatten()  # put the selected column indices in ascending order
    
    
        r_max_attention=T.max(attention_matrix, axis=0)
        neighborsArgSorted_r = T.argsort(r_max_attention)
        kNeighborsArg_r = neighborsArgSorted_r[:3]  # keep the 3 positions with the smallest max attention
        rr = T.sort(kNeighborsArg_r).flatten()  # put the selected column indices in ascending order
    
        
        l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
        r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
        

    
        layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
        layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    
        vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
        vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')
    
        
        
    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #     
        uni_cosine=cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
        eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #     
        len_l=norm_length_l_i.reshape((1,1))
        len_r=norm_length_r_i.reshape((1,1))  
    #     
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))  
    #     '''
        #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
        #length_gap=T.sqrt((len_l-len_r)**2)
        #layer3_input=mts
#         layer3_input_nn=T.concatenate([vec_l, vec_r,
#                                     cosine_addition, eucli_addition,
#     #                                 cosine_sent, eucli_sent,
#                                     uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
        
        output_i=T.concatenate([vec_l, vec_r,
                                    cosine_addition, eucli_addition,
    #                                 cosine_sent, eucli_sent,
                                    uni_cosine,eucli_1,
                                    mts_i.reshape((1,14)),
                                    len_l, len_r,
                                    extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1)    
        return output_i
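
# --- Illustration (not part of the snippet above) ---------------------------
# The snippets in this file repeatedly combine two similarity features between
# sentence vectors: cosine similarity and a Euclidean distance squashed into
# (0, 1] via 1/(1 + d). A NumPy sketch of the assumed behaviour of the cosine()
# and EUCLID() helpers used above:
import numpy

def cosine_sim(u, v):
    return numpy.dot(u, v) / (numpy.sqrt((u ** 2).sum()) * numpy.sqrt((v ** 2).sum()))

def euclid_sim(u, v):
    dist = numpy.sqrt(((u - v) ** 2).sum() + 1e-20)   # same 1e-20 guard as output_eucli
    return 1.0 / (1.0 + dist)

_u = numpy.array([0.2, 0.1, 0.7])
_v = numpy.array([0.3, 0.0, 0.5])
print 'sketch similarities:', cosine_sim(_u, _v), euclid_sim(_u, _v)
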
Example No. 56
def evaluate_lenet5(file_name,
                    vocab_file,
                    train_file,
                    dev_file,
                    word2vec_file,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        vocab_file, train_file, dev_file, max_s_length, maxSentLength,
        maxDocLength)  #vocab_size contain train, dev and test
    f.write('train_size : ' + str(train_size))

    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, word2vec_file)
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_A1 = T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_A1 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_A1 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_A1 = T.lscalar()

    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # this requires nkerns[0] and nkerns[1] to have the same dimensionality
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')

    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')

    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way

    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')

    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    params += logistic_para
    simi_1 = T.dot(logistic_w, simi_1) + logistic_b.dimshuffle(0, 'x')
    simi_1 = simi_1.dimshuffle(1, 0)

    simi_1 = T.nnet.softmax(simi_1)
    predict = T.argmax(simi_1, axis=1)
    tmp = T.log(simi_1)
    cost = T.maximum(0.0, margin + tmp[0][1 - y] - tmp[0][y])
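    # tmp is the log-softmax over the two classes; the hinge above pushes the
    # log-probability of the gold label y above that of the other label (1 - y)
    # by at least `margin`.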
    L2_reg = (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum() + (
        logistic_w**2).sum()
    cost = cost + L2_weight * L2_reg

    test_model = theano.function(
        [index],
        [cost, simi_1, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_A1: test_Length_A1[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_A1: test_leftPad_A1[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, simi_1, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_A1: train_data_A1[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_A1: train_Length_A1[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_A1: train_leftPad_A1[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_A1: train_rightPad_A1[index],
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        simi_train = []
        predict_train = []
        for batch_start in train_batch_start:
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_average, simi, predict = train_model(batch_start)
            simi_train.append(simi)
            predict_train.append(predict)
            if iter % 1000 == 0:
                f.write('@iter :' + str(iter) + '\n')
            if iter % n_train_batches == 0:
                corr_train = compute_corr_train(predict_train, _train_Label)
                res = 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + 'corr rate: ' + str(
                            corr_train * 100.0 / train_size) + '\n'
                f.write(res)

            if iter % validation_frequency == 0 or iter % 20000 == 0:
                posi_test_sent = []
                nega_test_sent = []
                posi_test_doc = []
                nega_test_doc = []
                posi_test_overall = []
                nega_test_overall = []

                simi_test = []
                predict_test = []
                for i in test_batch_start:
                    cost, simi, predict = test_model(i)
                    #print simi
                    #f.write('test_predict : ' + str(predict) + ' test_simi : ' + str(simi) + '\n' )
                    simi_test.append(simi)
                    predict_test.append(predict)
                corr_test = compute_corr(simi_test, predict_test, f)
                test_acc = corr_test * 1.0 / (test_size / 4.0)
                res = '\t\t\tepoch ' + str(epoch) + ', minibatch ' + str(
                    minibatch_index) + ' / ' + str(
                        n_train_batches) + ' test acc of best model ' + str(
                            test_acc * 100.0) + '\n'
                f.write(res)

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                res = '\t\t\tmax: ' + str(max_acc) + ' (at ' + str(
                    best_epoch) + ')\n'
                f.write(res)
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
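
# --- Illustration (not part of the snippet above) ---------------------------
# Example No. 56 merges the sentence-level and the document-level representation
# with a highway-style gate: g = sigmoid(W_high . x + b_high) and
# merged = (1 - g) * sent + g * doc. A standalone NumPy sketch of that gating
# step (all shapes and numbers here are assumptions for the sketch):
import numpy

def _sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

def highway_merge(sent_rep, doc_rep, W_high, b_high):
    gate = _sigmoid(numpy.dot(W_high, sent_rep) + b_high)   # one gate value per dimension
    return (1.0 - gate) * sent_rep + gate * doc_rep

_rng = numpy.random.RandomState(0)
_k = 90                                    # nkerns[0] in the snippet above
_sent, _doc = _rng.randn(2, _k)
_W_high = 0.01 * _rng.randn(_k, _k)
_b_high = numpy.zeros(_k)
print 'sketch merged rep shape:', highway_merge(_sent, _doc, _W_high, _b_high).shape
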
Example No. 57
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[50], batch_size=10, window_width=3,
                    maxSentLength=1050, emb_size=50, hidden_size=200,
                    margin=0.5):
#def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 12], batch_size=70, useAllSamples=0, kmax=30, ktop=5, filter_size=[10,7],
#                    L2_weight=0.000005, dropout_p=0.5, useEmb=0, task=5, corpus=1):

    ibmPath='/mounts/data/proj/wenpeng/Dataset/insuranceQA/'


    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_ibm_corpus(ibmPath+'vocabulary', ibmPath+'train.txt', ibmPath+'dev.txt', maxSentLength)

    indices_train, trainLengths, trainLeftPad, trainRightPad= datasets[0]
    #print trainY.eval().shape[0]
    indices_dev, devY, devLengths, devLeftPad, devRightPad= datasets[1]

    n_train_batches=indices_train.shape[0]/(batch_size*4) #note that we consider 4 lines as an example in training
    n_valid_batches=indices_dev.shape[0]/(batch_size*4)
    
    train_batch_start=list(numpy.arange(n_train_batches)*(batch_size*4))
    dev_batch_start=list(numpy.arange(n_valid_batches)*(batch_size*4))

    indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True)
    indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
    indices_train_theano=T.cast(indices_train_theano, 'int32')
    indices_dev_theano=T.cast(indices_dev_theano, 'int32')



    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(1e-50+numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values)
    embeddings=theano.shared(value=rand_values)      

    
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix('x_index')   # now, x is the index matrix, must be integer
    #left=T.ivector('left')
    #right=T.ivector('right')
    
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_input = embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0 = Conv(rng, input=layer0_input,
            image_shape=((batch_size*4), 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    
    layer0_out=debug_print(layer0.output, 'layer0_out')
    
    layer1=Average_Pooling(rng, input=layer0_out, length_last_dim=length_after_wideConv, kern=nkerns[0] ) 
    
    layer1_out=debug_print(layer1.output.reshape((batch_size*2, nkerns[0]*2)), 'layer1_out')
    
    layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    layer3=HiddenLayer(rng, input=layer2.output, n_in=hidden_size, n_out=1, activation=T.tanh)
    
    posi_score=layer3.output[0:layer3.output.shape[0]:2,:]
    nega_score=layer3.output[1:layer3.output.shape[0]:2,:]
    cost=T.maximum(0, margin-T.sum(posi_score-nega_score))
    

    
    #cost = layer3.negative_log_likelihood(y)
    # output a list of score
    dev_model = theano.function([index], layer3.output.flatten(),
             givens={
                x_index: indices_dev_theano[index: index + (batch_size*4)]})
    # create a list of all model parameters to be fit by gradient descent

    params = layer3.params + layer2.params + layer1.params+layer0.params
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index], [cost], updates=updates,
          givens={
            x_index: indices_train_theano[index: index + (batch_size*4)]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            cost_ij= train_model(batch_start)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' cost: '+str(cost_ij)
            if iter % validation_frequency == 0:
                dev_scores=[]
                for i in dev_batch_start:
                    dev_scores+=list(dev_model(i))

                acc_dev=compute_acc(devY, dev_scores)
                print(('\t\t\t\tepoch %i, minibatch %i/%i, dev acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           acc_dev * 100.))

                '''
                #print 'validating & testing...'
                # compute zero-one loss on validation set
                validation_losses = []
                for i in dev_batch_start:
                    time.sleep(0.5)
                    validation_losses.append(validate_model(i))
                #validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
            '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
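
# --- Illustration (not part of the snippet above) ---------------------------
# The example above trains with AdaGrad: every parameter accumulates the sum of
# its squared gradients and its effective step size shrinks accordingly. A
# standalone NumPy sketch of the same rule on a toy quadratic objective (the
# names and numbers here are assumptions for the sketch):
import numpy

def adagrad_step(param, grad, acc, learning_rate=0.1, eps=1e-20):
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (numpy.sqrt(acc) + eps)   # same eps guard as above
    return param, acc

_w = numpy.array([5.0, -3.0])
_acc = numpy.zeros_like(_w)
for _step in range(200):
    _grad = 2.0 * _w                        # gradient of f(w) = ||w||^2
    _w, _acc = adagrad_step(_w, _grad, _acc)
print 'sketch parameters after AdaGrad:', _w   # slowly approaches [0, 0]
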
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[1,1], batch_size=1, window_width=3,
                    maxSentLength=60, emb_size=300, L2_weight=0.0005, update_freq=1, unifiedWidth_conv0=8, k_dy=3, ktop=3):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    #mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    #mts=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv0=ishape[1]+filter_size[1]-1
    poolsize1=(1, length_after_wideConv0)
    length_after_wideConv1=unifiedWidth_conv0+filter_size[1]-1
    poolsize2=(1, length_after_wideConv1)
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_l, right=right_l, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)
    layer0_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_r, right=right_r, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)
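    # Based on the layer name, Conv_Fold_DynamicK_PoolLayer_NAACL applies a wide
    # convolution shared by both sentences (conv_W, conv_b), folds adjacent
    # feature rows together, and keeps the k=k_dy strongest activations per row
    # (padded to unifiedWidth_conv0) via dynamic k-max pooling, as in the DCNN
    # of Kalchbrenner et al. (2014); this reading is an assumption, not verified
    # against the layer implementation.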

    layer0_l_output=debug_print(layer0_ll.fold_output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_rr.fold_output, 'layer0_r.output')
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=ishape[0]/2,
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
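    # Average_Pooling_for_Top turns the two folded feature maps into fixed-size
    # similarity features (output_eucli_to_simi, output_cosine and output_attentions,
    # consumed below); the lengths are extended by filter_size[1]-1 because the wide
    # convolution widens each sentence by that amount.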

    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(1, 1, filter_size[0]/2, filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer1_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_ll.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_ll.leftPad, right=layer0_ll.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)
    layer1_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_rr.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_rr.leftPad, right=layer0_rr.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)

    layer1_l_output=debug_print(layer1_ll.fold_output, 'layer1_l.output')
    layer1_r_output=debug_print(layer1_rr.fold_output, 'layer1_r.output')
    
    layer2=Average_Pooling_for_Top(rng, input_l=layer1_l_output, input_r=layer1_r_output, kern=ishape[0]/4,
                                       left_l=layer0_ll.leftPad, right_l=layer0_ll.rightPad, left_r=layer0_rr.leftPad, right_r=layer0_rr.rightPad, 
                                       length_l=k_dy+filter_size[1]-1, length_r=k_dy+filter_size[1]-1,
                                       dim=unifiedWidth_conv0+filter_size[1]-1)    
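    # The second conv/fold/k-max-pooling block repeats the same pattern on the pooled
    # feature maps (now ishape[0]/2 rows) with its own shared filters (conv_W2, conv_b2),
    # pooling down to ktop columns, and a second Average_Pooling_for_Top extracts the
    # same kind of similarity features from its outputs.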

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
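    # Unigram baseline features: each sentence is summed into a single embedding
    # vector, and the cosine similarity plus 1/(1 + Euclidean distance) of the two
    # sums are added to the feature vector.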
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([#mts, 
                                eucli_1, uni_cosine,
                                #norm_uni_l, norm_uni_r,#uni_cosine,#norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                
                                layer1.output_eucli_to_simi,layer1.output_cosine,
                                layer1.output_attentions, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                #layer1.output_vector_l,layer1.output_vector_r,
                                
                                layer2.output_eucli_to_simi,layer2.output_cosine,
                                layer2.output_attentions,
                                #layer2.output_vector_l,layer2.output_vector_r,
                                
                                len_l, len_r
                                #layer1.output_attentions,
                                #wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
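    # Final feature vector for the classifier: the two unigram similarities, the
    # similarity/attention features from both Average_Pooling_for_Top layers, and the
    # two normalised sentence lengths. Its width matches n_in below:
    # 2 + (2 + 4*4) + (2 + 4*4) + 2 = 40.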
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2)+(2+4*4)+(2+4*4)+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum()+(conv_W2**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
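    # The cost used for the update is this minibatch's cost plus the costs accumulated
    # in cost_tmp over the previous update_freq-1 minibatches, averaged over update_freq,
    # with L2 regularisation on the classifier weights and the two filter banks.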
    

    
    test_model = theano.function([index], [layer3.errors(y), layer3.y_pred, layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index]
            #mts: mt_test[index: index + batch_size],
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')
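    # givens map the symbolic inputs to slices of the shared test data selected by
    # `index`; the pad/length scalars are indexed by `index` alone, which effectively
    # assumes one sentence pair per minibatch.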


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W]+[conv_W2]# + layer1.params 
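    # Only the logistic-regression parameters and the two convolution filter banks are
    # trained; the convolution biases and the word embeddings are kept fixed.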
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
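    # AdaGrad updates: accumulate the squared gradient per parameter and scale the
    # learning rate by 1/sqrt(accumulated squared gradient); note that this variant
    # omits the small epsilon usually added inside the square root.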
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #grad_i=debug_print(grad_i,'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], [cost,layer3.errors(y), layer3_input], updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
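            # Gradient updates are applied only on every update_freq-th minibatch:
            # intermediate minibatches just accumulate their cost and error count into
            # cost_tmp / error_sum via train_model_predict, and the update step then
            # uses the averaged cost.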
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp+=cost_ij
                error_sum+=error_ij
            else:
                cost_average, error_ij, layer3_input= train_model(batch_start,cost_tmp)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of current '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                #now, see the results of svm
                #write_feature=open('feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                #write_feature.close()

                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
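                # The concatenated similarity features (layer3_input) are also used to
                # train a linear SVM and a scikit-learn logistic regression as alternative
                # final classifiers; their test accuracies are compared with the network's
                # own softmax accuracy below.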
                corr_count=0
                corr_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                        corr_lr+=1
                acc=corr_count*1.0/test_size
                acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if acc_lr> max_acc:
                    max_acc=acc_lr
                    best_epoch=epoch
                if test_acc> max_acc:
                    max_acc=test_acc
                    best_epoch=epoch
                print '\t\t\t\t\t\t\t\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at epoch: ', best_epoch     
                #exit(0)
            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best test accuracy of %f %% obtained at epoch %i' %
          (max_acc * 100., best_epoch))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Exemplo n.º 59
 def __init__(self):
     super(vLblNCEPredictor, self).__init__()
     self.h_indices = debug_print(T.imatrix('h'), 'h')
     self.inputs = [self.h_indices]