def build_model(tparams,options):
    
    trng = RandomStreams(options['SEED'])
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentences, size of n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    # the corresponding masks; zeros mark padding positions
    mask = tensor.matrix('mask', dtype=config.floatX)
    # size of n_samples * n_z
    z = tensor.matrix('z', dtype=config.floatX)
    y = tensor.matrix('y', dtype=config.floatX)
    z = dropout(z, trng, use_noise)
    y = dropout(y, trng, use_noise)

    n_steps = x.shape[0] # the sentence length in this mini-batch
    n_samples = x.shape[1] # the number of sentences in this mini-batch
    
    n_x = tparams['Wemb'].shape[1] # the dimension of the word embedding
    
    # size of n_steps,n_samples,n_x
    emb = tparams['Wemb'][x.flatten()].reshape([n_steps,n_samples,n_x])
    emb = dropout(emb, trng, use_noise)
    
    # 1 * n_samples * n_x
    z0 = tensor.dot(z, tparams['C0']).dimshuffle('x', 0, 1)
    # n_steps * n_samples * n_x
    emb_input = tensor.concatenate((z0,emb[:n_steps-1]))
    # n_steps * n_samples
    mask0 = mask[0].dimshuffle('x', 0)
    mask_input = tensor.concatenate((mask0,mask[:n_steps-1]))

    # decoding the sentence vector z back into the original sentence
    h_decoder = encoder_layer(tparams, emb_input, mask_input,y, seq_output=True)
    h_decoder = dropout(h_decoder, trng, use_noise)
                                         
    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0]*shape[1], shape[2]))
    
    Vhid = tensor.dot(tparams['Vhid'],tparams['Wemb'].T)
    pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid']
    pred = tensor.nnet.softmax(pred_x)
    
    x_vec = x.reshape((shape[0]*shape[1],))
    
    index = tensor.arange(shape[0]*shape[1])
    
    pred_word = pred[index, x_vec]
    mask_word = mask.reshape((shape[0]*shape[1],))
    
    index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0]
    
    pred_word = pred_word[index_list]
    
    # the cross-entropy loss                 
    cost = -tensor.log(pred_word + 1e-6).sum() / n_samples  
    
    return use_noise, x, mask, y, z, cost
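
The dropout helper these Theano examples call is not shown here; a minimal sketch, assuming the switch-based formulation from the Theano LSTM tutorial (the default rate of 0.5 is an assumption):

import theano.tensor as tensor

def dropout(x, trng, use_noise, p=0.5):
    # Training (use_noise = 1): multiply by a Bernoulli keep-mask.
    # Evaluation (use_noise = 0): rescale by the keep probability instead.
    return tensor.switch(use_noise,
                         x * trng.binomial(x.shape, p=p, n=1, dtype=x.dtype),
                         x * p)
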
def build_model(tparams, options):

    trng = RandomStreams(options['SEED'])

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # size of n_samples * n_z
    z = tensor.matrix('z', dtype=config.floatX)
    # size of n_samples * n_y
    y = tensor.matrix('y', dtype=config.floatX)

    z = dropout(z, trng, use_noise)

    h = tensor.tanh(tensor.dot(z, tparams['Wy1']) + tparams['by1'])
    h = dropout(h, trng, use_noise)

    # size of n_samples * n_y
    pred = tensor.nnet.sigmoid(tensor.dot(h, tparams['Wy2']) + tparams['by2'])

    f_pred = theano.function([z], pred, name='f_pred')

    cost = (-y * tensor.log(pred + 1e-6) -
            (1. - y) * tensor.log(1. - pred + 1e-6)).sum() / z.shape[0]

    return use_noise, z, y, cost, f_pred
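
A hedged usage sketch of the returned handles (the training step and the valid_z array are illustrative, not from the source): dropout is switched on by setting use_noise to 1 for training updates and back to 0 before evaluating with f_pred.

use_noise, z_var, y_var, cost, f_pred = build_model(tparams, options)

use_noise.set_value(1.)   # enable dropout while computing training gradients
# ... build grads of `cost` w.r.t. tparams and apply an optimizer step ...

use_noise.set_value(0.)   # deterministic pass for evaluation
probs = f_pred(valid_z)   # n_samples x n_y sigmoid outputs
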
Example #3
    def forward_alexnet(self, inp, weights, reuse=False):
        # reuse is for the normalization parameters.

        conv1 = conv_block(inp, weights['conv1_weights'], weights['conv1_biases'], stride_y=4, stride_x=4, groups=1, reuse=reuse, scope='conv1')
        norm1 = lrn(conv1, 2, 1e-05, 0.75)
        pool1 = max_pool(norm1, 3, 3, 2, 2, padding='VALID')

        # 2nd Layer: Conv (w/ ReLU) -> LRN -> Pool, with 2 groups
        conv2 = conv_block(pool1, weights['conv2_weights'], weights['conv2_biases'], stride_y=1, stride_x=1, groups=2, reuse=reuse, scope='conv2')
        norm2 = lrn(conv2, 2, 1e-05, 0.75)
        pool2 = max_pool(norm2, 3, 3, 2, 2, padding='VALID')

        # 3rd Layer: Conv (w/ ReLU)
        conv3 = conv_block(pool2, weights['conv3_weights'], weights['conv3_biases'], stride_y=1, stride_x=1, groups=1, reuse=reuse, scope='conv3')

        # 4th Layer: Conv (w/ ReLU), split into two groups
        conv4 = conv_block(conv3, weights['conv4_weights'], weights['conv4_biases'], stride_y=1, stride_x=1, groups=2, reuse=reuse, scope='conv4')

        # 5th Layer: Conv (w/ ReLU) -> Pool, split into two groups
        conv5 = conv_block(conv4, weights['conv5_weights'], weights['conv5_biases'], stride_y=1, stride_x=1, groups=2, reuse=reuse, scope='conv5')
        pool5 = max_pool(conv5, 3, 3, 2, 2, padding='VALID')

        # 6th Layer: Flatten -> FC (w/ ReLU) -> Dropout
        flattened = tf.reshape(pool5, [-1, 6 * 6 * 256])
        fc6 = fc(flattened, weights['fc6_weights'], weights['fc6_biases'], activation='relu')
        dropout6 = dropout(fc6, self.KEEP_PROB)

        # 7th Layer: FC (w/ ReLU) -> Dropout
        fc7 = fc(dropout6, weights['fc7_weights'], weights['fc7_biases'], activation='relu')
        dropout7 = dropout(fc7, self.KEEP_PROB)

        # 8th Layer: FC and return unscaled activations
        fc8 = fc(dropout7, weights['fc8_weights'], weights['fc8_biases'])

        return fc7, fc8
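
The dropout(x, keep_prob) helper used by this AlexNet example (and by the forward_fc examples below) is not shown; a minimal TF1-style sketch, where the wrapper name and signature are assumptions:

import tensorflow as tf

def dropout(x, keep_prob):
    # Keep each activation with probability `keep_prob` and rescale the
    # survivors by 1 / keep_prob (standard tf.nn.dropout behaviour).
    return tf.nn.dropout(x, keep_prob=keep_prob)
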
Example #4
    def graph(self, input, is_training):
        with tf.name_scope('model'):
            net = ut.conv_layer(input, 64, 7, 2, name='conv1')
            net = ut.bottleneck(net,
                                128,
                                stride=1,
                                training=is_training,
                                name='res1')
            net = ut.max_pool(net, 2, 2, 'max_pool')
            net = ut.bottleneck(net,
                                int(self.nFeats / 2),
                                stride=1,
                                training=is_training,
                                name='res2')
            net = ut.bottleneck(net,
                                self.nFeats,
                                stride=1,
                                training=is_training,
                                name='res3')

            with tf.name_scope('stacks'):
                stack_out = []
                with tf.name_scope('stage_0'):
                    hg = ut.hourglass(net, self.nLow, self.nFeats, 'hourglass')
                    drop = ut.dropout(hg, self.dropout_rate, is_training,
                                      'dropout')
                    ll = ut.conv_layer_bn(drop, self.nFeats, 1, 1, is_training)
                    out = ut.conv_layer(ll, self.num_points, 1, 1, name='out')
                    out_ = ut.conv_layer(out, self.nFeats, 1, 1, name='out_')
                    sum_ = tf.add(net, out_, name='merge')
                    stack_out.append(out)
                for i in range(1, self.nStacks):
                    with tf.name_scope('stage_' + str(i)):
                        hg = ut.hourglass(sum_, self.nLow, self.nFeats,
                                          'hourglass')
                        drop = ut.dropout(hg, self.dropout_rate, is_training,
                                          'dropout')
                        ll = ut.conv_layer_bn(drop, self.nFeats, 1, 1,
                                              is_training)
                        out = ut.conv_layer(ll,
                                            self.num_points,
                                            1,
                                            1,
                                            name='out')
                        out_ = ut.conv_layer(ll,
                                             self.nFeats,
                                             1,
                                             1,
                                             name='out_')
                        sum_ = tf.add(sum_, out_, name='merge')
                        stack_out.append(out)
            with tf.name_scope('upsampling'):
                net = ut.batch_norm(sum_, is_training)
                net = ut.conv_layer_bn(net, self.nFeats, 3, 1, is_training)
                up1 = ut.deconv_layer(net, self.num_points, 1, 2, name='up_1')
                net = ut.conv_layer_bn(up1, self.nFeats, 3, 1, is_training)
                up2 = ut.deconv_layer(net, self.num_points, 1, 2, name='up_2')
            return tf.stack(stack_out, axis=1, name='stack_out'), up1, up2
Example #5
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentences, size of n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    # the corresponding masks; zeros mark padding positions
    mask = tensor.matrix('mask', dtype=config.floatX)
    # size of n_z * n_samples
    z = tensor.matrix('z', dtype=config.floatX)
    z = dropout(z, trng, use_noise)

    n_steps = x.shape[0]  # the sentence length in this mini-batch
    n_samples = x.shape[1]  # the number of sentences in this mini-batch

    n_x = tparams['Wemb'].shape[1]  # the dimension of the word embedding

    emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x])
    emb = dropout(emb, trng, use_noise)

    # decoding the sentence vector z back into the original sentence
    h_decoder = decoder_layer(tparams, emb, z, mask=mask)
    h_decoder = dropout(h_decoder, trng, use_noise)

    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2]))

    Vhid = tensor.dot(tparams['Vhid'], tparams['Wemb'].T)
    pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid']
    pred = tensor.nnet.softmax(pred_x)

    x_vec = x.reshape((shape[0] * shape[1], ))

    index = tensor.arange(shape[0] * shape[1])

    pred_word = pred[index, x_vec]
    mask_word = mask.reshape((shape[0] * shape[1], ))

    index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0]

    pred_word = pred_word[index_list]

    # the cross-entropy loss
    cost = -tensor.log(pred_word + 1e-6).sum() / n_samples

    f_pred_prob = theano.function([x, mask, z], pred_word, name='f_pred_prob')

    return use_noise, x, mask, z, f_pred_prob, cost
 def backpropagate(self,
                   X,
                   Y,
                   cost_function,
                   hidden_layer_dropout=0.1,
                   input_layer_dropout=0.1):
     assert X.shape == (self.batch_size,
                        self.input_dim), '[Error] X shape is wrong'
     assert Y.shape == (self.batch_size,
                        self.output_dim), '[Error] Y shape is wrong'
     outputs, derivatives = self.forward(X, trace=True)
     output = outputs[-1]
     cost_derivative = cost_function(output, Y.T, derivative=True)
     da = cost_derivative
     for k in range(len(self.layers) - 1, -1, -1):
         outputs[k] = dropout(
             outputs[k],
             hidden_layer_dropout if k > 0 else input_layer_dropout)
         assert da.shape == (self.layers[k][0],
                             self.batch_size), '[Error] da shape is wrong'
         dW = np.dot(da, outputs[k].T) / float(self.batch_size)
         db = (np.sum(da, axis=1) / float(self.batch_size)).reshape(
             self.layers[k][0], 1)
         dW = np.hstack((dW, db))
         if k > 0:
             dh = (np.sum(np.dot(self.weights[k][:, :-1].T, da), axis=1) /
                   float(self.batch_size)).reshape(self.layers[k - 1][0], 1)
             da = dh * derivatives[k - 1]
         dW = self.backpropagation_type(k, dW)
         self.weights[k] += dW
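
The NumPy dropout(a, rate) used in this backpropagation example is not defined in the snippet; a minimal inverted-dropout sketch consistent with how it is called (name and behaviour are assumptions):

import numpy as np

def dropout(a, rate):
    # Zero each activation with probability `rate` and scale the survivors
    # by 1 / (1 - rate) so the expected activation is unchanged.
    if rate <= 0.0:
        return a
    mask = (np.random.rand(*a.shape) >= rate) / (1.0 - rate)
    return a * mask
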
    def call(self, inputs):
        """
        Args:
            inputs: it is a list of the tokens, masks, indices of clf tokens, and labels
                    tokens shape = (number of choices * batch size, context length, 3)
                    masks1 is the mask of the second paragraphs of the tokens
                           shape = (number of choices * batch size, context length)
                    masks2 is the mask of the second paragraphs of the predictions
                           shape = (number of choices * batch size, context length)
                    clf_ids is the list of indices of clf tokens
                           shape = (number of choices * batch size)
                    labels shape = (number of choices * batch size)

        Returns:
            lm_logits shape = (batch size, seq length, vocab size)
            lm_losses shape = ()
            clf_losses shape = ()
        """

        tokens, masks1, masks2, clf_ids, labels = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4]
        embedding = self.embed(tokens)
        self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)
        hidden = self.transform(embedding)

        lm_logits, lm_loss = self.lm(hidden, tokens, masks1, masks2)
        clf_loss = self.clf(hidden, clf_ids, labels)

        return lm_logits, lm_loss, clf_loss
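
These GPT-style layers call dropout(x, pdrop, train) with a boolean training flag; a minimal sketch of such a helper (the project's actual implementation may differ):

import tensorflow as tf

def dropout(x, pdrop, train):
    # Drop activations with probability `pdrop` during training only; at
    # inference time the input passes through unchanged.
    # (TF1-style keep_prob shown; on TF2 use tf.nn.dropout(x, rate=pdrop).)
    if train and pdrop > 0:
        x = tf.nn.dropout(x, keep_prob=1.0 - pdrop)
    return x
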
    def call(self, inputs):
        """
        Args:
            inputs: a list of the IDs and positions of the tokens and their masks.
                    tokens shape = (batch size, context length, 3 (IDs, positions, and segments))
                    masks1, masks2 shape = (batch size, context length)

        Returns:
            logits: shape = (number of selected tokens, vocab size)
            losses: shape = ()
        """

        tokens, masks1, masks2 = inputs[0], inputs[1], inputs[2]
        embedding = self.embed(tokens)
        self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)
        hidden = self.transform(embedding)
        hidden = tf.reshape(tf.boolean_mask(hidden, masks2), [-1, self.n_embd])
        tokens = tf.reshape(tf.boolean_mask(tokens[:, :, 0], masks1), [-1])
        logits = tf.reshape(tf.matmul(hidden, self.embed.we[:self.n_vocab + self.n_special, :], transpose_b=True),
                            [-1, self.n_vocab + self.n_special])
        eps = 1e-100
        labels = tf.one_hot(tokens, self.n_vocab + self.n_special, 1 - (self.n_vocab - 1) * eps, eps)
        losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                         labels=labels)
        loss = tf.reduce_mean(losses)
        return logits, loss
Example #9
File: modeling.py  Project: Renmant/no
def residual_mlp_layer(x_flat,
                       intermediate_size,
                       initializer_range=0.02,
                       hidden_dropout_prob=0.1):
    """
    :param x_flat: The attention output. It should be [batch_size*seq_length, dim].
    :param intermediate_size: the hidden projection size. By default this is input_dim * 4.
    :param initializer_range: range (stddev) of the weight initializer.
    :param hidden_dropout_prob: dropout probability applied before the residual add.
    Note: in the original GPT we would return layer_norm(x_norm + h1) rather than layer_norm(x + h1).
    :return: the layer-normalized output, shape [batch_size*seq_length, dim].
    """
    batch_size_seq_length, hidden_size = get_shape_list(x_flat,
                                                        expected_rank=2)
    x_norm = layer_norm(x_flat, name='mlp_ln0')

    intermediate_output = tf.layers.dense(
        x_norm,
        intermediate_size,
        activation=gelu,
        kernel_initializer=create_initializer(initializer_range),
        name='intermediate',
    )

    output_for_residual = tf.layers.dense(
        intermediate_output,
        hidden_size,
        name='output',
        kernel_initializer=create_initializer(initializer_range))
    output_for_residual = dropout(output_for_residual, hidden_dropout_prob)

    layer_output = layer_norm(x_flat + output_for_residual, name='mlp_ln1')
    return layer_output
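
residual_mlp_layer above, and attention_func/feedforward_func further down (Examples #26 and #27), call a dropout(tensor, dropout_prob) utility that takes a drop probability rather than a keep probability; a minimal sketch of such a utility (assumed, not necessarily the project's code):

import tensorflow as tf

def dropout(input_tensor, dropout_prob):
    # No-op when the drop probability is zero or unset; otherwise standard
    # TF1 dropout with keep_prob = 1 - dropout_prob.
    if dropout_prob is None or dropout_prob == 0.0:
        return input_tensor
    return tf.nn.dropout(input_tensor, keep_prob=1.0 - dropout_prob)
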
Example #10
    def call(self, inputs):
        """
        Args:
            inputs: a list of the IDs and positions of the tokens and their mask.
                    tokens shape = (batch size, context length, 2 (IDs and positions))
                    masks shape = (batch size, context length)

        Returns:
            logits: shape = (number of selected tokens, vocab size)
            losses: shape = ()
        """
        tokens = tf.reshape(inputs[0], [-1, self.n_ctx, 2])
        masks = tf.reshape(inputs[1], (-1, self.n_ctx))
        masks1 = tf.slice(masks, [0, 1], [-1, self.n_ctx - 1])
        masks2 = tf.pad(masks1, [[0, 0], [0, 1]])
        masks1 = tf.pad(masks1, [[0, 0], [1, 0]])
        embedding = self.embed(tokens)
        self.embed.we = dropout(self.embed.we, self.embd_pdrop, self.train)
        hidden = self.transform(embedding)
        hidden = tf.reshape(hidden, [-1, self.n_ctx, self.n_embd])
        hidden = tf.reshape(tf.boolean_mask(hidden, masks2), [-1, self.n_embd])
        tokens = tf.reshape(tf.boolean_mask(tokens[:, :, 0], masks1), [-1])
        logits = tf.reshape(
            tf.matmul(hidden,
                      self.embed.we[:self.n_vocab, :],
                      transpose_b=True), [-1, self.n_vocab])
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                labels=tokens)
        losses = tf.reduce_mean(losses)
        return logits, losses
Example #11
    def prediction_save_cache(self, x):
        """
        Compute prediction for the fully-connected net and save intermediate 
        activations.

        N samples with D dims each (one sample per row); M is the dimension of
        the prediction y.

        Input: 
            x: A numpy array of input data, shape (N, D)
        Return:
            output: the output prediction (label scores), shape (N, M)
            caches: Saved intermediate activations for use in backprop
        """
        caches = {}
        h = x  # Input into the next layer or previous hidden activation
        for l in range(self.n_hidden):
            l = str(l)
            w, b = self.params["w" + l], self.params["b" + l]
            h, caches["affine" + l] = affine(h, w, b)  # Affine layer
            h, caches["relu" + l] = relu(h)  # Activation (ReLU)
            # Dropout layer (train-time dropout)
            h, caches["dropout" + l] = dropout(h, self.dropout)

        # Output layer, simply an affine
        output, cache = affine(h, self.params["w_out"], self.params["b_out"])
        caches["affine_out"] = cache
        return output, caches
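
Here dropout(h, p) is expected to return both the dropped activations and a cache for the backward pass; a minimal sketch under that assumption (inverted dropout, with the mask as the cache):

import numpy as np

def dropout(h, p):
    # Inverted dropout that also returns its mask so backprop can reuse it.
    mask = (np.random.rand(*h.shape) >= p) / (1.0 - p)
    out = h * mask
    return out, mask
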
Example #12
 def _process_layers(self, weights, data, learning=True):
     for W, mean, std in self._generate_layers(weights):
         data = normalize(data, mean, std)
         data = relu(data)
         if learning and self.dropout is not None:
             data = dropout(data, self.dropout)
         data = np.dot(data, W)
     return data
Example #14
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    # label: (n_samples,)
    y = tensor.vector('y', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),
                                               dtype='int32')].reshape(
                                                   (x.shape[0], 1, x.shape[1],
                                                    tparams['Wemb'].shape[1]))
    layer0_input = dropout(layer0_input, trng, use_noise)

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs, 1)
    layer1_input = dropout(layer1_input, trng, use_noise)

    # this is the label prediction you made
    pred = tensor.nnet.softmax(
        tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # build the expression for the cost function,
    # i.e. the cross-entropy loss
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()

    return use_noise, x, y, f_pred_prob, f_pred, cost
Example #15
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # n_samples * n_chars
    x = tensor.matrix('x', dtype='int32')
    y = tensor.matrix('y', dtype='int32')
    # (ncons*n_samples) * n_chars
    cy = tensor.matrix('cy', dtype='int32')

    # n_samples * n_h
    tmp_x = tensor.tanh(tensor.dot(x, tparams['W1']) + tparams['b1'])
    tmp_y = tensor.tanh(tensor.dot(y, tparams['W1']) + tparams['b1'])
    # (ncons*n_samples) * n_h
    tmp_cy = tensor.tanh(tensor.dot(cy, tparams['W1']) + tparams['b1'])

    # n_samples * n_h
    feats_x = tensor.tanh(tensor.dot(tmp_x, tparams['W2']) + tparams['b2'])
    feats_y = tensor.tanh(tensor.dot(tmp_y, tparams['W2']) + tparams['b2'])
    # (ncons*n_samples) * n_h
    feats_cy = tensor.tanh(tensor.dot(tmp_cy, tparams['W2']) + tparams['b2'])

    feats_x = dropout(feats_x, trng, use_noise)
    feats_y = dropout(feats_y, trng, use_noise)
    feats_cy = dropout(feats_cy, trng, use_noise)

    feats_x = l2norm(feats_x)
    feats_y = l2norm(feats_y)
    feats_cy = l2norm(feats_cy)

    # Tile by number of contrast terms
    # (ncon*n_samples) * n_h
    feats_x = tensor.tile(feats_x, (options['ncon'], 1))
    feats_y = tensor.tile(feats_y, (options['ncon'], 1))

    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] * ((feats_x * feats_y).sum(axis=1) -
                                        (feats_x * feats_cy).sum(axis=1)))))

    return use_noise, [x, y, cy], cost
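
For reference, the cost above is the contrastive ranking loss, written out from the code (angle brackets denote the dot product of the L2-normalised, dropout-regularised features, and the sum runs over the ncon * n_samples tiled pairs):

\mathcal{L} = \log\Big(1 + \sum_{i} \exp\big(-\gamma\,[\langle f_x^{(i)}, f_y^{(i)}\rangle - \langle f_x^{(i)}, f_{cy}^{(i)}\rangle]\big)\Big)
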
 def _attn(self, q, k, v):
     w = tf.matmul(q, k)
     if self.scale:
         n_state = shape_list(v)[-1]
         w = w * tf.rsqrt(tf.cast(n_state, tf.float32))
     w = self.mask_attn_weights(w)
     w = tf.nn.softmax(w)
     w = dropout(w, self.attn_pdrop, self.train)
     a = tf.matmul(w, v)
     return a
 def clf(self, hidden, clf_ids, labels):
     clf_hidden = tf.reshape(tf.gather_nd(hidden, clf_ids), [-1, self.n_embd])
     clf_logits = self.classifier(clf_hidden)
     clf_logits = dropout(clf_logits, self.clf_pdrop, self.train)
     clf_logits = tf.reshape(clf_logits, [-1, 2])
     eps = 1e-100
     labels = tf.one_hot(labels, 2, 1 - eps, eps)
     clf_losses = tf.nn.softmax_cross_entropy_with_logits(logits=clf_logits, labels=labels)
     clf_loss = tf.reduce_mean(clf_losses)
     return clf_loss
Example #18
def build_multi_dynamic_brnn(args,
                             maxTimeSteps,
                             inputX,
                             cell_fn,
                             seqLengths,
                             time_major=True):
    hid_input = inputX  # shape=(maxTimeSteps, args.batch_size, args.num_feature)
    for i in range(args.num_layer):
        scope = 'DBRNN_' + str(i + 1)
        forward_cell = cell_fn(args.num_hidden, activation=args.activation)
        backward_cell = cell_fn(args.num_hidden, activation=args.activation)
        # tensor of shape: [max_time, batch_size, input_size]
        outputs, output_states = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=hid_input,
            dtype=tf.float32,
            sequence_length=seqLengths,
            time_major=True,
            scope=scope)

        # forward output, backward output
        # tensor of shape: [max_time, batch_size, input_size]
        output_fw, output_bw = outputs
        # forward states, backward states
        output_state_fw, output_state_bw = output_states

        # output_fb = tf.concat(2, [output_fw, output_bw])
        output_fb = tf.concat(
            [output_fw, output_bw],
            2)  # concatenate the forward and backward outputs: [max_time, batch_size, input_size*2]
        shape = output_fb.get_shape().as_list()
        output_fb = tf.reshape(
            output_fb,
            [shape[0], shape[1], 2, int(shape[2] / 2)])  # give the two directions a separate axis
        hidden = tf.reduce_sum(output_fb, 2)  # sum the forward and backward outputs over that axis
        hidden = dropout(hidden, args.keep_prob, (args.mode == 'train'))

        if i != (args.num_layer - 1):
            hid_input = hidden
        else:
            outputXrs = tf.reshape(
                hidden,
                [-1, args.num_hidden])  # reshape(tensor,shape,name=None)
            # -1 flattens the time and batch dimensions into one: [max_time * batch_size, num_hidden]

            # output_list = tf.split(0, maxTimeSteps, outputXrs)
            output_list = tf.split(outputXrs, maxTimeSteps,
                                   0)  # split outputXrs into maxTimeSteps pieces along axis 0
            fbHrs = [
                tf.reshape(t, [args.batch_size, args.num_hidden])
                for t in output_list
            ]  # reshape each timestep's tensor to [batch_size, num_hidden],
            # giving a list ordered along the time axis
    return fbHrs
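
A hedged usage sketch for build_multi_dynamic_brnn (the argparse-style args namespace and the placeholder shapes are assumptions; any TF1 RNNCell constructor accepting num_units and activation can serve as cell_fn):

import tensorflow as tf

class Args:  # stand-in for the argparse namespace the function expects
    num_layer, num_hidden, num_feature = 2, 128, 39
    batch_size, keep_prob, mode = 16, 0.8, 'train'
    activation = tf.tanh

args, maxTimeSteps = Args(), 200
inputX = tf.placeholder(tf.float32,
                        [maxTimeSteps, args.batch_size, args.num_feature])
seqLengths = tf.placeholder(tf.int32, [args.batch_size])
fbHrs = build_multi_dynamic_brnn(args, maxTimeSteps, inputX,
                                 tf.nn.rnn_cell.LSTMCell, seqLengths)
# fbHrs: list of maxTimeSteps tensors, each of shape [batch_size, num_hidden]
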
Example #19
def build_model(tparams,options):
    
    trng = RandomStreams(SEED)
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))
    
    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    mask = tensor.matrix('mask', dtype=config.floatX)
    
    # label: (n_samples,)
    y = tensor.vector('y',dtype='int32')

    n_steps = x.shape[0] # the length of the longest sentence in this minibatch
    n_samples = x.shape[1] # how many samples we have in this minibatch
    n_x = tparams['Wemb'].shape[1] # the dimension of the word-embedding
    
    emb = tparams['Wemb'][x.flatten()].reshape([n_steps,n_samples,n_x])  
    emb = dropout(emb, trng, use_noise)
                        
    # encoding of the sentence, size of n_samples * n_h                                                               
    h_encoder = encoder(tparams, emb, mask=mask, prefix='lstm_encoder')
    h_encoder_rev = encoder(tparams, emb[::-1], mask=mask[::-1], prefix='lstm_encoder_rev')
    
    # size of n_samples * (2*n_h) 
    z = tensor.concatenate((h_encoder,h_encoder_rev),axis=1) 
    z = dropout(z, trng, use_noise)  
    
    # this is the label prediction you made 
    # size of n_samples * n_y
    pred = tensor.nnet.softmax(tensor.dot(z, tparams['Wy'])+tparams['by'])
    
    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # build the expression for the cost function,
    # i.e. the cross-entropy loss
    index = tensor.arange(n_samples)
    cost = -tensor.log(pred[index, y] + 1e-6).mean()                          

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
 def call(self, inputs):
     c = self.conv1d_c(inputs)
     q, k, v = tf.split(c, 3, 2)
     q = self.split_heads(q, self.n_head)
     k = self.split_heads(k, self.n_head, k=True)
     v = self.split_heads(v, self.n_head)
     a = self._attn(q, k, v)
     a = self.merge_heads(a)
     a = self.conv1d_a(a)
     a = dropout(a, self.resid_pdrop, self.train)
     return a
Example #21
    def forward_fc(self, inp, weights, reuse=False, is_training=False):
        # reuse is for the normalization parameters.
        x = tf.reshape(inp, [-1, 512])
        dense1 = fc(x,
                    weights['dense1_weights'],
                    weights['dense1_biases'],
                    activation=None)
        bn1 = tf.layers.batch_normalization(dense1,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn1',
                                            reuse=tf.AUTO_REUSE)
        relu1 = tf.nn.relu(bn1)
        dropout1 = dropout(relu1, self.KEEP_PROB)

        dense2 = fc(dropout1,
                    weights['dense2_weights'],
                    weights['dense2_biases'],
                    activation=None)
        bn2 = tf.layers.batch_normalization(dense2,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn2',
                                            reuse=tf.AUTO_REUSE)
        relu2 = tf.nn.relu(bn2)
        dropout2 = dropout(relu2, self.KEEP_PROB)

        dense3 = fc(dropout2,
                    weights['dense3_weights'],
                    weights['dense3_biases'],
                    activation=None)
        bn3 = tf.layers.batch_normalization(dense3,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn3',
                                            reuse=tf.AUTO_REUSE)
        relu3 = tf.nn.relu(bn3)
        if self.loss_func == self.additive_angular_margin_softmax:
            return dense2, bn3  # last_layer_linear for angular softmax
        elif self.loss_func == self.softmax:
            return dense2, relu3
Example #22
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # x: n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    y = tensor.matrix('y', dtype='int64')

    n_steps = x.shape[0]
    n_samples = x.shape[1]

    n_x = tparams['Wemb'].shape[1]

    emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x])
    emb = dropout(emb, trng, use_noise)

    h_decoder = decoder_layer(tparams, emb, prefix='decoder_h1')
    h_decoder = dropout(h_decoder, trng, use_noise)

    h_decoder = decoder_layer(tparams, h_decoder, prefix='decoder_h2')
    h_decoder = dropout(h_decoder, trng, use_noise)

    # n_steps * n_samples * n_h
    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2]))

    pred = tensor.dot(h_decoder, tparams['Vhid']) + tparams['bhid']
    pred = tensor.nnet.softmax(pred)

    y_vec = y.reshape((shape[0] * shape[1], ))
    index = tensor.arange(shape[0] * shape[1])
    y_pred = pred[index, y_vec]

    f_pred_prob = theano.function([x, y], y_pred, name='f_pred_prob')
    cost = -tensor.log(y_pred + 1e-6).sum() / n_steps / n_samples

    return use_noise, x, y, f_pred_prob, cost
Example #23
    def forward_fc(self, inp, weights, reuse=False, is_training=False):
        # reuse is for the normalization parameters.
        x = tf.reshape(inp, [-1, 512])
        dense1 = fc(x,
                    weights['dense1_weights'],
                    weights['dense1_biases'],
                    activation=None)
        bn1 = tf.layers.batch_normalization(dense1,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn1',
                                            reuse=tf.AUTO_REUSE)
        relu1 = tf.nn.relu(bn1)
        dropout1 = dropout(relu1, self.KEEP_PROB)

        dense2 = fc(dropout1,
                    weights['dense2_weights'],
                    weights['dense2_biases'],
                    activation=None)
        bn2 = tf.layers.batch_normalization(dense2,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn2',
                                            reuse=tf.AUTO_REUSE)
        relu2 = tf.nn.relu(bn2)
        dropout2 = dropout(relu2, self.KEEP_PROB)

        dense3 = fc(dropout2,
                    weights['dense3_weights'],
                    weights['dense3_biases'],
                    activation=None)
        bn3 = tf.layers.batch_normalization(dense3,
                                            momentum=0.99,
                                            training=is_training,
                                            name='bn3',
                                            reuse=tf.AUTO_REUSE)

        return dense1, bn3
def build_model(tparams,options):
    
    trng = RandomStreams(SEED)
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))
    
    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    # label: (n_samples,)
    y = tensor.vector('y',dtype='int32')
    
    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),dtype='int32')].reshape((x.shape[0],1,x.shape[1],tparams['Wemb'].shape[1])) 
    layer0_input = dropout(layer0_input, trng, use_noise)
 
    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input,filter_shape=filter_shape, pool_size=pool_size,prefix=_p('cnn_encoder',i))                          
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs,1)
    layer1_input = dropout(layer1_input, trng, use_noise) 
    
    # this is the label prediction you made 
    pred = tensor.nnet.softmax(tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])
    
    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # build the expression for the cost function,
    # i.e. the cross-entropy loss
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()                          

    return use_noise, x, y, f_pred_prob, f_pred, cost
def build_multi_dynamic_brnn(args,
                             maxTimeSteps,
                             inputX,
                             cell_fn,
                             seqLengths,
                             time_major=True):
    hid_input = inputX
    for i in range(args.num_layer):
        scope = 'DBRNN_' + str(i + 1)
        forward_cell = cell_fn(args.num_hidden, activation=args.activation)
        backward_cell = cell_fn(args.num_hidden, activation=args.activation)
        # tensor of shape: [max_time, batch_size, input_size]
        outputs, output_states = bidirectional_dynamic_rnn(
            forward_cell,
            backward_cell,
            inputs=hid_input,
            dtype=tf.float32,
            sequence_length=seqLengths,
            time_major=True,
            scope=scope)
        # forward output, backward output
        # tensor of shape: [max_time, batch_size, input_size]
        output_fw, output_bw = outputs
        # forward states, backward states
        output_state_fw, output_state_bw = output_states
        # output_fb = tf.concat(2, [output_fw, output_bw])
        output_fb = tf.concat([output_fw, output_bw], 2)
        shape = output_fb.get_shape().as_list()
        output_fb = tf.reshape(
            output_fb,
            [shape[0], shape[1], 2, int(shape[2] / 2)])
        hidden = tf.reduce_sum(output_fb, 2)
        hidden = dropout(hidden, args.keep_prob, (args.mode == 'train'))

        if i != args.num_layer - 1:
            hid_input = hidden
        else:
            outputXrs = tf.reshape(hidden, [-1, args.num_hidden])
            # output_list = tf.split(0, maxTimeSteps, outputXrs)
            output_list = tf.split(outputXrs, maxTimeSteps, 0)
            fbHrs = [
                tf.reshape(t, [args.batch_size, args.num_hidden])
                for t in output_list
            ]
    return fbHrs
Example #26
def attention_func(input_tensor, attention_mask, hidden_size,
                   hidden_dropout_prob, num_attention_heads,
                   attention_head_size, attention_probs_dropout_prob,
                   initializer_range, batch_size, seq_length):
    attention_heads = []
    with tf.variable_scope("attention") as scope:
        with tf.variable_scope("self"):
            attention_head = attention_layer(
                from_tensor=input_tensor,
                to_tensor=input_tensor,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            attention_heads.append(attention_head)
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)
        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
            attention_output = tf.layers.dense(
                attention_output,
                hidden_size,
                kernel_initializer=utils.create_initializer(initializer_range))
            attention_output = utils.dropout(attention_output,
                                             hidden_dropout_prob)
            attention_output = utils.layer_norm(attention_output +
                                                input_tensor)
    return attention_output, scope
Example #27
def feedforward_func(input_tensor,
                     intermediate_size,
                     initializer_range,
                     hidden_size,
                     hidden_dropout_prob,
                     intermediate_act_fn=utils.gelu):
    # The activation is only applied to the "intermediate" hidden layer.
    with tf.variable_scope("feedforward") as scope:
        intermediate_output = tf.layers.dense(
            input_tensor,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_dense")
        # Down-project back to `hidden_size` then add the residual.
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=utils.create_initializer(initializer_range),
            name="intermediate_output")
        layer_output = utils.dropout(layer_output, hidden_dropout_prob)
        layer_output = utils.layer_norm(layer_output + input_tensor)
    return layer_output, scope
Example #28
def net_1(input, is_train):
    conv1 = conv(input,
                 filter_h=5,
                 filter_w=5,
                 num_filters=32,
                 stride_y=1,
                 stride_x=1,
                 name='conv1')
    pool1 = max_pool(conv1,
                     filter_h=2,
                     filter_w=2,
                     stride_y=2,
                     stride_x=2,
                     name='pool1')
    conv2 = conv(pool1, 5, 5, 64, 1, 1, 'conv2')
    pool2 = max_pool(conv2, 2, 2, 2, 2, 'pool2')
    flattened = flatten_3d(pool2, name='flattening')
    fc3 = fc(flattened, out_neurons=1000, name='fc3')
    dropout3 = dropout(fc3,
                       keep_prob=prob_close(is_train, 0.5),
                       name='dropout3')
    fc4 = fc(dropout3, out_neurons=10, name='fc4', relu=False)

    return fc4
def train_functions(model, datasets, batch_size, learning_rate, annealing_learning_rate,
                    l1_learning_rate, l2_learning_rate, dropout_rate=None, noise_rate=None):
    """
        Generates a function `train` that implements one step of fine-tuning,
        a function `validate` that computes the error on a batch from the validation set
        and a function `test` that computes the error on a batch from the testing set

        :type datasets: Theano shared variable
        :param datasets: Dataset with train, test and valid sets

        :type batch_size: int
        :param batch_size: Size of the batch for train

        :type learning_rate: float
        :param learning_rate: learning rate

        :type annealing_learning_rate: float
        :param annealing_learning_rate: decreasing rate of learning rate

        :type l1_learning_rate: float
        :param l1_learning_rate: L1-norm's weight when added to the cost

        :type l2_learning_rate: float
        :param l2_learning_rate: L2-norm's weight when added to the cost
        """
    train_set_x, train_set_y = datasets['train_set']

    y = T.matrix('y')
    index = T.lscalar()

    # compiling a Theano function that computes the mistakes that are made by the model on a mini batch
    test_model = theano.function(
        inputs=[model.input, y],
        outputs=error_function(model, y)
    )

    validate_model = theano.function(
        inputs=[model.input, y],
        outputs=error_function(model, y)
    )

    # the cost we minimize during training is the model cost plus the regularization terms (L1 and L2)
    loss_function = (
        cost_function(model, y)
        + l1_learning_rate * model.L1
        + l2_learning_rate * model.L2
    )

    # compute the gradient of the cost with respect to the params
    gparams = [T.grad(loss_function, param) for param in model.params]

    #################################################
    # Wudi change the annealing learning rate:
    #################################################
    updates = []
    state_learning_rate = theano.shared(
        numpy.asarray(
            learning_rate,
            dtype=theano.config.floatX
        ),
        borrow=True)
    updates.append((state_learning_rate, annealing_learning_rate * state_learning_rate))

    # compute list of fine-tuning updates
    for param, gparam in zip(model.params, gparams):
        updates.append((param, param - state_learning_rate * gparam))

    model_input = train_set_x[index * batch_size: (index + 1) * batch_size]
    if noise_rate is not None:
        model_input = utils.add_gaussian(input=model_input, noise_level=noise_rate)

    if dropout_rate is not None:
        model_input = utils.dropout(input=model_input, noise_level=dropout_rate, rescale=True)

    train_model = theano.function(
        inputs=[index],
        outputs=loss_function,
        updates=updates,
        givens={
            model.input: model_input,
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # theano.printing.pydotprint(train_model, outfile="s.png", var_with_name_simple=True)
    return train_model, test_model, validate_model
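
Because each call to train_model also multiplies the shared learning-rate variable by annealing_learning_rate, the effective rate after n updates is learning_rate * annealing_learning_rate**n; a quick sanity check with illustrative values:

learning_rate, annealing_learning_rate = 0.1, 0.998
n_updates = 500
effective = learning_rate * annealing_learning_rate ** n_updates
print(effective)  # ~0.037: the step size decays geometrically
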
Example #30
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.
      This is an implementation of multi-headed attention based on "Attention
      Is All You Need". If `from_tensor` and `to_tensor` are the same, then
      this is self-attention. Each timestep in `from_tensor` attends to the
      corresponding sequence in `to_tensor`, and returns a fixed-width vector.
      This function first projects `from_tensor` into a "query" tensor and
      `to_tensor` into "key" and "value" tensors. These are (effectively) a list
      of tensors of length `num_attention_heads`, where each tensor is of shape
      [batch_size, seq_length, size_per_head].
      Then, the query and key tensors are dot-producted and scaled. These are
      softmaxed to obtain attention probabilities. The value tensors are then
      interpolated by these probabilities, then concatenated back to a single
      tensor and returned.
      In practice, the multi-headed attention is done with transposes and
      reshapes rather than actual separate tensors.
      Args:
        from_tensor: float Tensor of shape [batch_size, from_seq_length,
          from_width].
        to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape [batch_size,
          from_seq_length, to_seq_length]. The values should be 1 or 0. The
          attention scores will effectively be set to -infinity for any positions in
          the mask that are 0, and will be unchanged for positions that are 1.
        num_attention_heads: int. Number of attention heads.
        size_per_head: int. Size of each attention head.
        query_act: (optional) Activation function for the query transform.
        key_act: (optional) Activation function for the key transform.
        value_act: (optional) Activation function for the value transform.
        attention_probs_dropout_prob: (optional) float. Dropout probability of the
          attention probabilities.
        initializer_range: float. Range of the weight initializer.
        do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
          * from_seq_length, num_attention_heads * size_per_head]. If False, the
          output will be of shape [batch_size, from_seq_length, num_attention_heads
          * size_per_head].
        batch_size: (Optional) int. If the input is 2D, this might be the batch size
          of the 3D version of the `from_tensor` and `to_tensor`.
        from_seq_length: (Optional) If the input is 2D, this might be the seq length
          of the 3D version of the `from_tensor`.
        to_seq_length: (Optional) If the input is 2D, this might be the seq length
          of the 3D version of the `to_tensor`.
      Returns:
        float Tensor of shape [batch_size, from_seq_length,
          num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
          true, this will be of shape [batch_size * from_seq_length,
          num_attention_heads * size_per_head]).
      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = utils.get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = utils.get_shape_list(to_tensor, expected_rank=[2, 3])
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")
    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")
    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`
    from_tensor_2d = utils.reshape_to_matrix(from_tensor)
    to_tensor_2d = utils.reshape_to_matrix(to_tensor)
    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=utils.create_initializer(initializer_range))
    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)
    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)
    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder
    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = utils.dropout(attention_probs,
                                    attention_probs_dropout_prob)
    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])
    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)
    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(context_layer, [
            batch_size * from_seq_length, num_attention_heads * size_per_head
        ])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])
    return context_layer
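
In equation form, the per-head computation implemented above is standard scaled dot-product attention, with H = size_per_head and M the 0/1 attention mask (the mask term is added only when attention_mask is supplied):

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\Big(\frac{Q K^{\top}}{\sqrt{H}} - 10^{4}\,(1 - M)\Big)\, V
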
 def call(self, inputs):
     hidden1 = self.act(self.conv_fc(inputs))
     hidden2 = self.conv_proj(hidden1)
     hidden2 = dropout(hidden2, self.resid_pdrop, self.train)
     return hidden2
Example #32
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # description string: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    x_mask = tensor.matrix('x_mask', dtype=config.floatX)

    y = tensor.matrix('y', dtype='int32')
    y_mask = tensor.matrix('y_mask', dtype=config.floatX)

    n_steps_x = x.shape[0]
    n_steps_y = y.shape[0]
    n_samples = x.shape[1]

    n_x = tparams['Wemb'].shape[1]

    # n_steps * n_samples * n_x
    x_emb = tparams['Wemb'][x.flatten()].reshape([n_steps_x, n_samples, n_x])
    y_emb = tparams['Wemb'][y.flatten()].reshape([n_steps_y, n_samples, n_x])

    # n_samples * n_h
    h_emb_f_x = encoder(tparams, x_emb, mask=x_mask, prefix='encoder_f')
    h_emb_b_x = encoder(tparams,
                        x_emb[::-1],
                        mask=x_mask[::-1],
                        prefix='encoder_b')

    h_emb_f_y = encoder(tparams, y_emb, mask=y_mask, prefix='encoder_f')
    h_emb_b_y = encoder(tparams,
                        y_emb[::-1],
                        mask=y_mask[::-1],
                        prefix='encoder_b')

    # n_samples * (2*n_h)
    h_emb_x = tensor.concatenate((h_emb_f_x, h_emb_b_x), axis=1)
    h_emb_y = tensor.concatenate((h_emb_f_y, h_emb_b_y), axis=1)
    h_emb_x = dropout(h_emb_x, trng, use_noise)
    h_emb_y = dropout(h_emb_y, trng, use_noise)

    h_emb_x = l2norm(h_emb_x)
    h_emb_y = l2norm(h_emb_y)

    # contrastive strings
    # description string: n_steps * (ncon*n_samples)
    cy = tensor.matrix('cy', dtype='int32')
    cy_mask = tensor.matrix('cy_mask', dtype=config.floatX)

    n_steps_cy = cy.shape[0]
    n_samples_c = cy.shape[1]

    # n_steps * (ncon*n_samples) * n_x
    cy_emb = tparams['Wemb'][cy.flatten()].reshape(
        [n_steps_cy, n_samples_c, n_x])

    # (ncon*n_samples) * n_h
    h_emb_f_cy = encoder(tparams, cy_emb, mask=cy_mask, prefix='encoder_f')
    h_emb_b_cy = encoder(tparams,
                         cy_emb[::-1],
                         mask=cy_mask[::-1],
                         prefix='encoder_b')

    # (ncon*n_samples) * (2*n_h)
    h_emb_cy = tensor.concatenate((h_emb_f_cy, h_emb_b_cy), axis=1)
    h_emb_cy = dropout(h_emb_cy, trng, use_noise)

    h_emb_cy = l2norm(h_emb_cy)

    # Tile by number of contrast terms
    # (ncon*n_samples) * (2*n_h)
    h_emb_x = tensor.tile(h_emb_x, (options['ncon'], 1))
    h_emb_y = tensor.tile(h_emb_y, (options['ncon'], 1))

    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] * ((h_emb_x * h_emb_y).sum(axis=1) -
                                        (h_emb_x * h_emb_cy).sum(axis=1)))))

    return use_noise, [x, x_mask, y, y_mask, cy, cy_mask], cost
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int32')
    y = tensor.matrix('y', dtype='int32')
    cy = tensor.matrix('cy', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),
                                               dtype='int32')].reshape(
                                                   (x.shape[0], 1, x.shape[1],
                                                    tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input_x = tensor.concatenate(layer1_inputs, 1)
    layer1_input_x = dropout(layer1_input_x, trng, use_noise)

    layer0_input = tparams['Wemb'][tensor.cast(y.flatten(),
                                               dtype='int32')].reshape(
                                                   (y.shape[0], 1, y.shape[1],
                                                    tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input_y = tensor.concatenate(layer1_inputs, 1)
    layer1_input_y = dropout(layer1_input_y, trng, use_noise)

    layer0_input = tparams['Wemb'][tensor.cast(
        cy.flatten(), dtype='int32')].reshape(
            (cy.shape[0], 1, cy.shape[1], tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input_cy = tensor.concatenate(layer1_inputs, 1)
    layer1_input_cy = dropout(layer1_input_cy, trng, use_noise)

    layer1_input_x = l2norm(layer1_input_x)
    layer1_input_y = l2norm(layer1_input_y)
    layer1_input_cy = l2norm(layer1_input_cy)

    # Tile by number of contrast terms
    # (ncon*n_samples) * (2*n_h)
    layer1_input_x = tensor.tile(layer1_input_x, (options['ncon'], 1))
    layer1_input_y = tensor.tile(layer1_input_y, (options['ncon'], 1))

    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] *
                   ((layer1_input_x * layer1_input_y).sum(axis=1) -
                    (layer1_input_x * layer1_input_cy).sum(axis=1)))))

    return use_noise, [x, y, cy], cost