示例#1
0
def build_rnn(x_sym, hid_init_sym, hid2_init_sym, seq_length, vocab_size,
              rnn_size):
    """Two-layer LSTM language model with dropout and an NCE output head.

    Returns the two LSTM layers (so their hidden states can be carried
    across batches) and the prediction layer reshaped back to
    (batch, seq_length, vocab_size).

    NOTE: relies on module-level `dropout_prob`, `NCELayer` and `Z`.
    """
    # Symbolic inputs: token ids plus each LSTM's initial hidden state.
    l_tokens = L.InputLayer(input_var=x_sym, shape=(None, seq_length))
    l_hid1 = L.InputLayer(input_var=hid_init_sym, shape=(None, rnn_size))
    l_hid2 = L.InputLayer(input_var=hid2_init_sym, shape=(None, rnn_size))

    l_emb = L.EmbeddingLayer(l_tokens,
                             input_size=vocab_size,
                             output_size=rnn_size)

    l_rnn = L.LSTMLayer(l_emb, num_units=rnn_size, hid_init=l_hid1)
    l_drop1 = L.DropoutLayer(l_rnn, p=dropout_prob)
    l_rnn2 = L.LSTMLayer(l_drop1, num_units=rnn_size, hid_init=l_hid2)
    l_drop2 = L.DropoutLayer(l_rnn2, p=dropout_prob)

    # Fold timesteps into the batch axis so the decoder scores each
    # timestep independently.
    l_flat = L.ReshapeLayer(l_drop2, (-1, rnn_size))

    pred = NCELayer(l_flat, num_units=vocab_size, Z=Z)
    pred = L.ReshapeLayer(pred, (-1, seq_length, vocab_size))
    return l_rnn, l_rnn2, pred
示例#2
0
    def get_emb_layer_from_idx(self, idx_layer, avg):
        suf = '_avg' if avg else ''
        # if not avg:
        # idx_layer = MaskLayer(idx_layer, self.config[self.name]['mask_rate'])
        self.emb = L.EmbeddingLayer(
            idx_layer,
            len(self.map),
            self.config[self.name]['emb_dim'],
            W=Normal(self.args.init_std) if not avg else Constant(),
            name='e%s%s' % (self.name, suf))
        # W=HeNormal('relu') if not avg else Constant(), name = 'e%s%s'%(self.name, suf))
        self.emb.params[self.emb.W].remove('regularizable')
        if self.config[self.name]['freeze']:
            self.emb.params[self.emb.W].remove('trainable')

        # load embedding from external file if available
        if self.name == 'word' and self.args.train and self.args.embw:
            if not avg or self.config['word']['freeze']:
                try:
                    self.load_emb(self.args.embw)
                except:
                    print 'Not able to read pre-trained embeddings, use random initialization instead'

        add_layer = self.additional_layer(idx_layer, self.emb, avg)

        # add noise to embeddings as in Plank tagger
        add_layer = L.GaussianNoiseLayer(add_layer, 0.1)

        return add_layer
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers,
             dropout, batch_size):
    """Stacked-LSTM language model (PTB-style).

    Output is a softmax over the vocabulary, flattened to
    (batch_size * seq_len, vocabulary_size) for per-timestep loss.
    """
    net = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    net = L.EmbeddingLayer(net,
                           vocabulary_size,
                           hidden_size,
                           W=init.Uniform(1.0))

    def make_gate(**extra):
        # Every gate uses Glorot input weights and orthogonal recurrence.
        return L.Gate(W_in=init.GlorotUniform(),
                      W_hid=init.Orthogonal(),
                      **extra)

    for _ in range(num_layers):
        net = L.LSTMLayer(
            net,
            hidden_size,
            ingate=make_gate(),
            # Forget-gate bias of 1.0 biases the cell toward remembering.
            forgetgate=make_gate(b=init.Constant(1.0)),
            cell=make_gate(W_cell=None,
                           nonlinearity=lasagne.nonlinearities.tanh),
            outgate=make_gate())

    net = L.DropoutLayer(net, dropout)
    net = L.DenseLayer(net, num_units=vocabulary_size, num_leading_axes=2)
    shp = net.output_shape
    # Merge batch and time so the softmax sees one row per timestep.
    net = L.ReshapeLayer(net, (shp[0] * shp[1], shp[2]))
    return L.NonlinearityLayer(net,
                               nonlinearity=lasagne.nonlinearities.softmax)
    def build_network(self, vocab_size, input_var, mask_var, W_init):
        """Two stacked bidirectional LSTMs with tied input/output embeddings.

        The summary vector concatenates, for each depth, the sum of the
        forward LSTM's last state and the backward LSTM's first state; a
        tanh projection then feeds a softmax whose weights are the
        transposed embedding matrix.
        """
        l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
        l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
        l_embed = L.EmbeddingLayer(l_in,
                                   input_size=vocab_size,
                                   output_size=EMBED_DIM,
                                   W=W_init)

        def lstm(incoming, go_backwards=False):
            # Shared hyper-parameters for all four recurrent layers.
            return L.LSTMLayer(incoming,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_mask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=go_backwards)

        l_fwd_1 = lstm(l_embed)
        l_bkd_1 = lstm(l_embed, go_backwards=True)
        l_all_1 = L.concat([l_fwd_1, l_bkd_1], axis=2)

        l_fwd_2 = lstm(l_all_1)
        l_bkd_2 = lstm(l_all_1, go_backwards=True)

        # Final state per direction: last step forward, first step backward.
        y_1 = L.ElemwiseSumLayer([L.SliceLayer(l_fwd_1, -1, 1),
                                  L.SliceLayer(l_bkd_1, 0, 1)])
        y_2 = L.ElemwiseSumLayer([L.SliceLayer(l_fwd_2, -1, 1),
                                  L.SliceLayer(l_bkd_2, 0, 1)])

        y = L.concat([y_1, y_2], axis=1)
        g = L.DenseLayer(y,
                         num_units=EMBED_DIM,
                         nonlinearity=lasagne.nonlinearities.tanh)
        # Tie the output projection to the transposed input embeddings.
        return L.DenseLayer(g,
                            num_units=vocab_size,
                            W=l_embed.W.T,
                            nonlinearity=lasagne.nonlinearities.softmax)
示例#5
0
    def __init__(
        self,
        n_words,
        dim_emb,
        num_units,
        n_classes,
        w_emb=None,
        dropout=0.2,
        use_final=False,
        lr=0.001,
        pretrain=None,
    ):
        """LSTM text classifier: embedding -> LSTM encoder -> softmax.

        :param n_words: vocabulary size
        :param dim_emb: embedding dimensionality
        :param num_units: LSTM hidden units
        :param n_classes: number of output classes
        :param w_emb: initial embedding matrix (defaults to Normal init)
        :param dropout: dropout probability after embedding and encoder
            (0 disables both)
        :param use_final: if True, summarize with the final LSTM state;
            otherwise mask-average over all states
        :param lr: learning rate, stored for later use
        :param pretrain: optional pretrained weights to load
        """
        self.n_words = n_words
        self.dim_emb = dim_emb
        self.num_units = num_units
        self.n_classes = n_classes
        self.lr = lr

        if w_emb is None:
            w_emb = init.Normal()

        self.l_x = layers.InputLayer((None, None))
        self.l_m = layers.InputLayer((None, None))
        self.l_emb = layers.EmbeddingLayer(self.l_x, n_words, dim_emb, W=w_emb)
        # Keep a handle on the raw embedding layer before dropout wraps it.
        self.l_ebd = self.l_emb

        if dropout:
            self.l_emb = layers.dropout(self.l_emb, dropout)

        # Single LSTM encoder; only_return_final decides whether it emits
        # just the last hidden state or the full sequence of states.
        self.l_enc = layers.LSTMLayer(self.l_emb,
                                      num_units,
                                      mask_input=self.l_m,
                                      only_return_final=bool(use_final),
                                      grad_clipping=10.0,
                                      gradient_steps=400)
        self.l_rnn = self.l_enc
        if not use_final:
            # Mask-aware mean over time replaces the final-state summary.
            self.l_enc = MeanLayer(self.l_enc, self.l_m)

        if dropout:
            self.l_enc = layers.dropout(self.l_enc, dropout)

        self.l_y = layers.DenseLayer(self.l_enc,
                                     n_classes,
                                     nonlinearity=nonlinearities.softmax)

        if pretrain:
            self.load_pretrain(pretrain)
示例#6
0
 def model(self, query_input, batch_size, query_vocab_size,
           context_vocab_size, emb_dim_size):
     """Embed a batch of query ids and predict context words via softmax.

     Returns both the embedding layer and the output layer.
     """
     embedded = L.EmbeddingLayer(
         L.InputLayer(shape=(batch_size, ), input_var=query_input),
         input_size=query_vocab_size,
         output_size=emb_dim_size)
     # Softmax distribution over the context vocabulary.
     predicted = L.DenseLayer(embedded,
                              num_units=context_vocab_size,
                              nonlinearity=lasagne.nonlinearities.softmax)
     return embedded, predicted
示例#7
0
    def __init__(self, input, input_size, embedding_size):
        """Allocate an embedding layer over `input`.

        :param input: incoming lasagne layer of integer indices
        :param input_size: vocabulary size
        :param embedding_size: dimensionality of each embedding vector
        """
        self.input = input
        # The first element of initialize_parameters() is the embedding
        # weight matrix.
        self.output = layers.EmbeddingLayer(self.input,
                                            input_size,
                                            embedding_size,
                                            W=initialize_parameters()[0])
示例#8
0
def build_cnn():
    data_size = (None, 10, 100)  # Batch size x Img Channels x Height x Width

    input_var = T.tensor3(name="input", dtype='int64')

    values = np.array(np.random.randint(0, 1, (5, 10, 100)))
    input_var.tag.test_value = values
    input_layer = L.InputLayer(data_size, input_var=input_var)

    W = create_char_embedding_matrix()

    embed_layer = L.EmbeddingLayer(input_layer,
                                   input_size=102,
                                   output_size=101,
                                   W=W)

    reshape = L.reshape(embed_layer, (-1, 100, 101))
    dim_shuffle = L.dimshuffle(reshape, (0, 2, 1))
    #conv_layer_1 = L.Conv2DLayer(embed_layer, 4, (1), 1, 0)
    #pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=1)
    print L.get_output(dim_shuffle).tag.test_value.shape

    conv_layer_1 = L.Conv1DLayer(dim_shuffle, 50, 2, 1)

    print L.get_output(conv_layer_1).tag.test_value.shape
    print "TEST"
    pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=99)
    print L.get_output(pool_layer_1).tag.test_value.shape
    reshape_conv_1 = L.reshape(pool_layer_1, (-1, 50))

    conv_layer_2 = L.Conv1DLayer(dim_shuffle, 50, 3, 1)
    pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=98)
    reshape_conv_2 = L.reshape(pool_layer_2, (-1, 50))

    merge_layer = L.ConcatLayer([reshape_conv_1, reshape_conv_2], 1)
    print L.get_output(merge_layer).tag.test_value.shape
    reshape_output = L.reshape(merge_layer, (-1, 10, 100))
    print L.get_output(reshape_output).tag.test_value.shape

    x = T.tensor3(name="testname", dtype='int32')
    #x = T.imatrix()
    #output = L.get_output(conv_layer_1,x)

    #f = theano.function([x],output)

    word = unicode("Tat")
    word_index = np.array([])

    #print word_index

    #x_test = np.array([word_index]).astype('int32')
    #print f(x_test)

    return reshape_output
示例#9
0
 def build_network(self):
     """Character-level bidirectional-GRU word encoder for two sequences.

     Stacks both input sequences along the word axis, embeds their
     characters, runs a forward and a backward GRU over each word,
     projects and sums the two final states, then slices the stacked
     result back into per-sequence word embeddings.

     Returns (l_embed1, l_embed2): char-derived word embeddings for
     self.inps[0] and self.inps[1].
     """
     l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[0])
     l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[1])
     l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[2])
     l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[3])
     # Process both sequences in one pass by stacking them on axis 1.
     l_char_in = L.ConcatLayer([l_char1_in, l_char2_in],
                               axis=1)  # B x (ND+NQ) x L
     l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1)
     shp = (self.inps[0].shape[0],
            self.inps[0].shape[1] + self.inps[1].shape[1],
            self.inps[1].shape[2])
     # Fold words into the batch axis so the GRUs see one word per row.
     l_index_reshaped = L.ReshapeLayer(l_char_in,
                                       (shp[0] * shp[1], shp[2]))  # BN x L
     l_mask_reshaped = L.ReshapeLayer(l_char_mask,
                                      (shp[0] * shp[1], shp[2]))  # BN x L
     l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars,
                                 self.char_dim)  # BN x L x D
     l_fgru = L.GRULayer(l_lookup,
                         2 * self.char_dim,
                         grad_clipping=10,
                         gradient_steps=-1,
                         precompute_input=True,
                         only_return_final=True,
                         mask_input=l_mask_reshaped)
     l_bgru = L.GRULayer(l_lookup,
                         2 * self.char_dim,
                         grad_clipping=10,
                         gradient_steps=-1,
                         precompute_input=True,
                         backwards=True,
                         only_return_final=True,
                         mask_input=l_mask_reshaped)  # BN x 2D
     # NOTE(review): embed_dim / 2 is integer division only under
     # Python 2; on Python 3 it yields a float and breaks these layers.
     l_fwdembed = L.DenseLayer(l_fgru,
                               self.embed_dim / 2,
                               nonlinearity=None)  # BN x DE
     l_bckembed = L.DenseLayer(l_bgru,
                               self.embed_dim / 2,
                               nonlinearity=None)  # BN x DE
     l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
     l_char_embed = L.ReshapeLayer(l_embed,
                                   (shp[0], shp[1], self.embed_dim / 2))
     # Split the stacked result back into the two original sequences.
     l_embed1 = L.SliceLayer(l_char_embed,
                             slice(0, self.inps[0].shape[1]),
                             axis=1)
     l_embed2 = L.SliceLayer(l_char_embed,
                             slice(-self.inps[1].shape[1], None),
                             axis=1)
     return l_embed1, l_embed2
示例#10
0
    def model(self, query_input, batch_size, query_vocab_size,
              context_vocab_size, emb_dim_size):
        """Combine a continuous embedding with a stochastic discrete one.

        Returns the discrete-values embedding layer and the final output
        layer over the context vocabulary.
        """
        l_input = L.InputLayer(shape=(batch_size, ), input_var=query_input)

        # Two parallel embeddings of the same query ids.
        l_embed_continuous = L.EmbeddingLayer(l_input,
                                              input_size=query_vocab_size,
                                              output_size=emb_dim_size)
        l_values_discrete = L.EmbeddingLayer(l_input,
                                             input_size=query_vocab_size,
                                             output_size=emb_dim_size)

        # Discrete path: softmax scores, then a mean-field stochastic sample.
        l_embed_discrete = StochasticLayer(
            L.NonlinearityLayer(l_values_discrete,
                                nonlinearity=lasagne.nonlinearities.softmax),
            estimator='MF')

        l_merge = L.ElemwiseSumLayer([l_embed_continuous, l_embed_discrete])
        l_out = L.DenseLayer(l_merge,
                             num_units=emb_dim_size,
                             nonlinearity=lasagne.nonlinearities.softmax)

        # Gate the dense output by the discrete embedding (elementwise mul).
        l_merge_2 = L.ElemwiseMergeLayer([l_out, l_embed_discrete],
                                         merge_function=T.mul)
        l_final_out = L.DenseLayer(l_merge_2, num_units=context_vocab_size)
        return l_values_discrete, l_final_out
示例#11
0
def build_cnn(input):
    """Character-level CNN over a (sentences x words x chars) index tensor.

    Embeds each character, runs bigram and trigram 1-D convolutions per
    word, max-pools each over time and concatenates the pooled features
    into one 110-dim vector per word.
    """
    #data_size = (None,103,130)  # Batch size x Img Channels x Height x Width

    #input_var = T.tensor3(name = "input",dtype='int64')
    input_var = input

    #values = np.array(np.random.randint(0,102,(1,9,50)))

    #input_var.tag.test_value = values
    #number sentences x words x characters
    input_layer = L.InputLayer((None,9,50), input_var=input)

    W = create_char_embedding_matrix()

    # 103 character ids embedded into 101 dimensions.
    embed_layer = L.EmbeddingLayer(input_layer, input_size=103,output_size=101, W=W)
    #print "EMBED", L.get_output(embed_layer).tag.test_value.shape
    # Fold words into the batch axis: one row of 50 characters per word.
    reshape_embed = L.reshape(embed_layer,(-1,50,101))
    #print "reshap embed", L.get_output(reshape_embed).tag.test_value.shape
    # Bigram and trigram convolutions with 55 filters each.
    conv_layer_1 = L.Conv1DLayer(reshape_embed, 55, 2)
    conv_layer_2 = L.Conv1DLayer(reshape_embed, 55, 3)
    #print "TEST"
    #print "Convolution Layer 1", L.get_output(conv_layer_1).tag.test_value.shape
    #print "Convolution Layer 2", L.get_output(conv_layer_2).tag.test_value.shape

    #flatten_conv_1 = L.flatten(conv_layer_1,3)
    #flatten_conv_2 = L.flatten(conv_layer_2,3)

    #reshape_max_1 = L.reshape(flatten_conv_1,(-1,49))
    #reshape_max_2 = L.reshape(flatten_conv_2, (-1,48))

    #print "OUTPUT Flatten1", L.get_output(flatten_conv_1).tag.test_value.shape
    #print "OUTPUT Flatten2", L.get_output(flatten_conv_2).tag.test_value.shape

    #print "OUTPUT reshape_max_1", L.get_output(reshape_max_1).tag.test_value.shape
    #print "OUTPUT reshape_max_2", L.get_output(reshape_max_2).tag.test_value.shape

    # Max-pool each convolution over every remaining time position.
    pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=54)
    pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=53)


    #print "OUTPUT POOL1", L.get_output(pool_layer_1).tag.test_value.shape
    #print "OUTPUT POOL2",L.get_output(pool_layer_2).tag.test_value.shape

    merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1)

    # Back to (sentences, words, features): 55 + 55 = 110 features per word.
    flatten_merge = L.flatten(merge_layer, 2)
    reshape_merge = L.reshape(flatten_merge, (1,9,110))
    print L.get_output(reshape_embed).shape
    #print L.get_output(reshape_merge).tag.test_value.shape

    # NOTE(review): char_index_lookup is not defined in this function --
    # unless it exists as a module-level global, this return raises
    # NameError; confirm against the rest of the module.
    return reshape_merge, char_index_lookup
    def get_conv_input(self, sidx, tidx, avg=False):
        """Assemble the (batch, window, features) input for the convolution.

        Concatenates per-feature embeddings of the source window, a binary
        indicator marking the window centre, and optionally a learned
        position embedding, then moves features onto the channel axis.

        :param sidx: symbolic batch of source-side index windows
        :param tidx: symbolic target-side indices, passed through to the
            feature embedding builders
        :param avg: if True, request the averaged/frozen embedding variants
        """
        suf = '_avg' if avg else ''

        feat_embs = [
            self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg)
            for name in self.args.source_feats
        ]

        # TODO: change the meaning
        if self.args.lex == 'mix':
            concat_emb = L.ElemwiseSumLayer(feat_embs)  # (100, 15, 256)
        else:
            concat_emb = L.concat(feat_embs, axis=2)  # (100, 15, 256+100)

        # One-hot indicator of the window centre, e.g. [0,...,0,1,0,...,0].
        # NOTE(review): window_size / 2 is integer division only under
        # Python 2 (the file's print statements suggest Python 2) -- confirm.
        pos = np.array([0] * (self.args.window_size / 2) + [1] + [0] *
                       (self.args.window_size / 2)).astype(
                           theano.config.floatX)
        post = theano.shared(pos[np.newaxis, :, np.newaxis],
                             borrow=True)  # (1, 15, 1)
        # Broadcast the indicator across the batch via symbolic repeat.
        posl = L.InputLayer(
            (None, self.args.window_size, 1),
            input_var=T.extra_ops.repeat(post, sidx.shape[0],
                                         axis=0))  # (100, 15, 1)
        conv_in = L.concat([concat_emb, posl], axis=2)  # (100, 15, 256+1)

        if self.args.pos_emb:
            # Reuse the indicator values as integer ids for a learned
            # position embedding (rebuilds conv_in with the extra features).
            posint = L.flatten(
                L.ExpressionLayer(posl,
                                  lambda x: T.cast(x, 'int64')))  # (100, 15)
            pos_emb = L.EmbeddingLayer(
                posint,
                self.args.window_size,
                8,
                name='epos' + suf,
                W=Normal(0.01) if not avg else Constant())  # (100, 15, 8)
            pos_emb.params[pos_emb.W].remove('regularizable')
            conv_in = L.concat([concat_emb, posl, pos_emb],
                               axis=2)  # (100, 15, 256+1+8)

        # # squeeze
        # if self.args.squeeze:
        #     conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf, num_leading_axes=2,
        #                     W=HeNormal('relu')) # (100, 15, 256)

        # Channels-first layout expected by the 1-D convolution.
        conv_in = L.dimshuffle(conv_in, (0, 2, 1))  # (100, 256+1, 15)

        return conv_in
示例#13
0
def integrate_captions(input_var=T.imatrix()):
    '''
            Encode caption token sequences with an LSTM.

            Input has shape (nb_caption, seq_length); the result is a
            (nb_caption, 500, 1, 1) layer ready to merge with image features.
    '''

    ###############################
    # Build Network Configuration #
    ###############################

    print('... Integrating captions to the model')

    # Token-id input followed by a 400-dim embedding.
    network = layers.InputLayer(shape=(None, None), input_var=input_var)
    network = layers.EmbeddingLayer(network,
                                    get_vocab_length(),
                                    output_size=400)

    # Orthogonal gate weights; the cell gate drops the peephole connection
    # and uses tanh.
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=lasagne.init.Constant(0.))
    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=lasagne.init.Constant(0.),
                                  nonlinearity=nonlinearities.tanh)

    # Keep only the final LSTM state as the caption summary.
    network = layers.LSTMLayer(network,
                               num_units=500,
                               ingate=gate_parameters,
                               forgetgate=gate_parameters,
                               cell=cell_parameters,
                               outgate=gate_parameters,
                               grad_clipping=100.,
                               only_return_final=True)

    network = layers.DenseLayer(network, num_units=500)

    # Expand to (nb_caption, 500, 1, 1) for downstream merging.
    return layers.ReshapeLayer(network, (-1, 500, 1, 1))
def makeRNN(xInputRNN, hiddenInitRNN, hidden2InitRNN, sequenceLen, vocabularySize, neuralNetworkSz):
    """Two-layer LSTM language model with dropout and an NCE output head.

    Returns both LSTM layers (for carrying hidden state between batches)
    and the predictions reshaped to (batch, sequenceLen, vocabularySize).

    NOTE: relies on module-level `dropOutProbability`, `NCE` and `Z`.
    """
    tokens = L.InputLayer(input_var=xInputRNN, shape=(None, sequenceLen))
    hid1 = L.InputLayer(input_var=hiddenInitRNN, shape=(None, neuralNetworkSz))
    hid2 = L.InputLayer(input_var=hidden2InitRNN, shape=(None, neuralNetworkSz))
    emb = L.EmbeddingLayer(tokens, input_size=vocabularySize, output_size=neuralNetworkSz)

    lstm1 = L.LSTMLayer(emb, num_units=neuralNetworkSz, hid_init=hid1)
    drop1 = L.DropoutLayer(lstm1, p=dropOutProbability)
    lstm2 = L.LSTMLayer(drop1, num_units=neuralNetworkSz, hid_init=hid2)
    drop2 = L.DropoutLayer(lstm2, p=dropOutProbability)

    # Merge timesteps into the batch axis before the NCE decoder.
    flat = L.ReshapeLayer(drop2, (-1, neuralNetworkSz))

    predictions = NCE(flat, num_units=vocabularySize, Z=Z)
    predictions = L.ReshapeLayer(predictions, (-1, sequenceLen, vocabularySize))
    return lstm1, lstm2, predictions
示例#15
0
    def rnn_encoder(x_sym, x_mask):
        """Bidirectional dropout-LSTM encoder.

        Returns a frozen one-hot view of the input tokens and the encoded
        sequence (forward/backward concatenation, plus optional extra
        unidirectional layers when n_layers > 1).
        """
        name = "Encoder"
        n_layers = 1
        n_units = 128
        emb_size = 128
        rnn = DropoutLSTMLayer

        l_in = L.InputLayer((None, None), input_var=x_sym)
        l_mask = L.InputLayer((None, None), input_var=x_mask)
        l_emb = DropoutEmbeddingLayer(l_in,
                                      dict_size,
                                      emb_size,
                                      name=name + '.Embedding',
                                      dropout=0.25)

        # Identity embedding with frozen weights: a one-hot encoding of
        # the input tokens.
        l_onehot = L.EmbeddingLayer(l_in,
                                    dict_size,
                                    dict_size,
                                    W=np.eye(dict_size, dtype='float32'),
                                    name=name + '.OneHot')
        l_onehot.params[l_onehot.W].remove('trainable')

        fwd = rnn(l_emb,
                  num_units=n_units,
                  mask_input=l_mask,
                  name=name + '.0.Forward')
        bwd = rnn(l_emb,
                  num_units=n_units,
                  mask_input=l_mask,
                  backwards=True,
                  name=name + '.0.Backward')
        l_enc = L.ConcatLayer([fwd, bwd], axis=2)

        # Optional additional (unidirectional) layers stacked on top.
        for depth in range(1, n_layers):
            l_enc = rnn(l_enc,
                        num_units=n_units,
                        mask_input=l_mask,
                        name="%s.%d.Forward" % (name, depth),
                        dropout=0.25)

        return l_onehot, l_enc
示例#16
0
    def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs):
        """Embedding + positional-encoding merge layer.

        Embeds word indices from the first incoming layer, multiplies them
        elementwise with the positional-encoding input, sums over the word
        axis, and optionally applies a temporal encoding matrix WT.
        """
        super(EncodingFullLayer, self).__init__(incomings, **kwargs)
        #        if len(self.input_shapes[0]) == 3:
        #            batch_size, w_count, w_length = self.input_shapes[0]
        shape = tuple(self.input_shapes[0])
        #        else:
        #            shape = tuple(self.input_shapes[0])

        self.WT = None
        #        self.reset_zero()
        # Internal sub-network: indices -> embeddings, merged (by product)
        # with a positional-encoding tensor of shape `shape + (emb_size,)`.
        self.l_in = LL.InputLayer(shape=shape)
        self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, ))
        self.l_emb = LL.EmbeddingLayer(self.l_in,
                                       input_size=vocab_size,
                                       output_size=emb_size,
                                       W=W)
        self.W = self.l_emb.W
        self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe),
                                           merge_function=T.mul)
        # Sum over axis 2 to get one vector per sentence/memory slot.
        self.l_emb_res = LL.ExpressionLayer(self.l_emb,
                                            lambda X: X.sum(2),
                                            output_shape='auto')

        #        self.l_emb_res = SumLayer(self.l_emb, axis=2)
        if np.any(WT):
            self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT)
            self.WT = self.l_emb_res.T
        # Re-register the sub-network's trainable parameters on this layer
        # so lasagne's parameter collection picks them up.
        params = LL.helper.get_all_params(self.l_emb_res, trainable=True)
        values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True)
        for p, v in zip(params, values):
            self.add_param(p, v.shape, name=p.name)

        # Compiled helper that overwrites the last embedding row (presumably
        # the padding id -- TODO confirm) with a provided vector.
        zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX)
        self.set_zero = theano.function(
            [zero_vec_tensor],
            updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor))
                     for x in [self.W]])
示例#17
0
def build_network(W,
                  number_unique_tags,
                  longest_word,
                  longest_sentence,
                  input_var=None):
    print("Building network ...")

    input_layer = L.InputLayer((None, longest_sentence, longest_word),
                               input_var=input_var)

    embed_layer = L.EmbeddingLayer(input_layer,
                                   input_size=103,
                                   output_size=101,
                                   W=W)

    reshape_embed = L.reshape(embed_layer, (-1, longest_word, 101))

    conv_layer_1 = L.Conv1DLayer(reshape_embed, longest_word, 2)
    conv_layer_2 = L.Conv1DLayer(reshape_embed, longest_word, 3)

    pool_layer_1 = L.MaxPool1DLayer(conv_layer_1, pool_size=longest_word - 1)
    pool_layer_2 = L.MaxPool1DLayer(conv_layer_2, pool_size=longest_word - 2)

    merge_layer = L.ConcatLayer([pool_layer_1, pool_layer_2], 1)
    flatten_merge = L.flatten(merge_layer, 2)
    reshape_merge = L.reshape(flatten_merge,
                              (-1, longest_sentence, int(longest_word * 2)))

    l_re = lasagne.layers.RecurrentLayer(
        reshape_merge,
        N_HIDDEN,
        nonlinearity=lasagne.nonlinearities.sigmoid,
        mask_input=None)
    l_out = lasagne.layers.DenseLayer(
        l_re, number_unique_tags, nonlinearity=lasagne.nonlinearities.softmax)

    print "DONE BUILDING NETWORK"
    return l_out
示例#18
0
class decoder_step:
    """One step of an attentive seq2seq decoder.

    The graph is built once at class-creation time; the class attributes
    (encoder, inp, next_token, auto_updates, ...) are the shared nodes
    other code wires into a recurrence.
    """
    #inputs
    # Encoder hidden states: (batch, enc_time, CODE_SIZE).
    encoder = L.InputLayer((None, None, CODE_SIZE), name='encoded sequence')
    # NOTE(review): name 'encoded sequence' is duplicated from `encoder`;
    # presumably this was meant to be something like 'encoder mask'.
    encoder_mask = L.InputLayer((None, None), name='encoded sequence')

    # Previously generated token id, one per batch element.
    inp = L.InputLayer((None, ), name='current character')

    l_target_emb = L.EmbeddingLayer(inp, dst_voc.len, 128)

    #recurrent part

    l_rnn1 = AutoLSTMCell(l_target_emb, 128, name="lstm1")

    # Attention over encoder states, queried by the first LSTM's output.
    query = L.DenseLayer(l_rnn1.out, 128, nonlinearity=None)
    attn = AttentionLayer(encoder, query, 128, mask_input=encoder_mask)['attn']

    # Second LSTM sees attention readout + first LSTM output + embedding.
    l_rnn = L.concat([attn, l_rnn1.out, l_target_emb])

    # NOTE(review): name "lstm1" duplicated from l_rnn1 -- likely intended
    # to be "lstm2"; confirm before renaming (names affect param lookup).
    l_rnn2 = AutoLSTMCell(l_rnn, 128, name="lstm1")

    # Distribution over the destination vocabulary for the next token.
    next_token_probas = L.DenseLayer(l_rnn2.out,
                                     dst_voc.len,
                                     nonlinearity=T.nnet.softmax)

    #pick next token from predicted probas
    next_token = ProbabilisticResolver(next_token_probas)

    # Sampling temperature for the temperature-scaled resolver.
    tau = T.scalar("sample temperature", "float32")

    next_token_temperatured = TemperatureResolver(next_token_probas, tau)
    next_token_greedy = GreedyResolver(next_token_probas)

    # Recurrent-state updates collected from both LSTM cells.
    auto_updates = {
        **l_rnn1.get_automatic_updates(),
        **l_rnn2.get_automatic_updates()
    }
Example #19
0
    def get_char2word(self, ic, avg=False):
        """Compose per-word features from character embeddings.

        Args:
            ic: layer of integer character indices -- assumes shape
                (batch, words, max_len), e.g. (100, 24, 32); TODO confirm.
            avg: if True, build an "averaged" copy of the graph with
                constant-zero initial weights and an '_avg' name suffix.

        Returns:
            CNN model:  (batch, words, nf * len(ngrams)) layer.
            LSTM model: (batch, words, nw) layer.
            Falls through (returns None) for any other char_model value.
        """
        suf = '_avg' if avg else ''
        ec = L.EmbeddingLayer(
            ic,
            self.args.vc,
            self.args.nc,
            name='ec' + suf,
            W=HeNormal() if not avg else Constant())  # (100, 24, 32, 16)
        # Character embeddings are excluded from L2 regularization.
        ec.params[ec.W].remove('regularizable')

        if self.args.char_model == 'CNN':
            # Move the embedding dim onto the channel axis for Conv2D.
            lds = L.dimshuffle(ec, (0, 3, 1, 2))  # (100, 16, 24, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds,
                    self.args.nf, (1, n),
                    untie_biases=True,
                    W=HeNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 24, 32-n+1)
                # Max over all character positions of each word.
                lpool = L.MaxPool2DLayer(
                    lconv, (1, self.args.max_len - n + 1))  # (100, 64, 24, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 24)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 24, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2)  # (100, 24, 64)
            return xc

        elif self.args.char_model == 'LSTM':
            # Mask: nonzero character indices mark valid positions.
            ml = L.ExpressionLayer(
                ic, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_len))  # (2400, 32)

            gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None,
                                           nonlinearity=tanh)

            lstm_in = L.reshape(
                ec, (-1, self.args.max_len, self.args.nc))  # (2400, 32, 16)
            # Each direction gets half of nw units. Fixed: use floor division
            # so num_units stays an int under Python 3 (`/` yielded a float);
            # identical for even int nw under Python 2.
            lstm_f = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                name='forward' + suf)  # (2400, 64)
            lstm_b = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                backwards=True,
                name='backward' + suf)  # (2400, 64)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            # Concatenate final forward/backward states, then restore the
            # (batch, words, features) layout.
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (2400, 128)
            xc = L.reshape(xc,
                           (-1, self.args.sw, self.args.nw))  # (100, 24, 256)
            return xc
    def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, W_init):
        """Build a bidirectional-GRU reader over document and query.

        Args:
            vocab_size: vocabulary size for the shared embedding.
            doc_var, query_var: int tensors of token ids (B x N x 1 each).
            docmask_var, qmask_var: float masks marking valid positions.
            W_init: initial embedding weights, shared by doc and query.

        Returns:
            (final, l_doc, l_q): per-token vocabulary distribution summed
            over document occurrences, plus the doc and query layers.
        """
        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)
        # Query embedding shares weights with the document embedding.
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)

        # Bidirectional GRU over the document tokens.
        l_fwd_doc = L.GRULayer(l_docembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_docembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        # Bidirectional GRU over the query tokens.
        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        # Query vector: last forward state + first backward state.
        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])

        d = L.get_output(l_doc)  # B x N x D
        q = L.get_output(l_q)  # B x D
        # Attention scores of each document position against the query.
        p = T.batched_dot(d, q)  # B x N
        # Softmax over valid positions only: masked-out positions are
        # pinned to -20 so their probability is effectively zero.
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[docmask_var.nonzero()],
                p[docmask_var.nonzero()]))

        # Scatter-add position probabilities onto their token ids,
        # producing an attention-sum distribution over the vocabulary.
        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)
        #qv = T.flatten(query_var,outdim=2)
        #index2 = T.reshape(T.repeat(T.arange(qv.shape[0]),qv.shape[1]),qv.shape)
        #xx = index2[qmask_var.nonzero()]
        #yy = qv[qmask_var.nonzero()]
        #pV = T.set_subtensor(final[xx,yy], T.zeros_like(qv[xx,yy]))

        return final, l_doc, l_q
Example #21
0
    def rnn_decoder(l_input_one_hot,
                    l_encoder_hid,
                    encoder_mask,
                    out_sym,
                    out_mask,
                    out_go_sym,
                    name="Decoder"):
        """Build an attentive RNN decoder and its training expressions.

        Relies on names from the enclosing scope: `dict_size`, `bs`,
        `DropoutLSTMLayer`, `RepeatLayer`, `BahdanauKeyValueAttentionLayer`
        -- assumed defined elsewhere in this file; TODO confirm.

        Returns a dict with 'loss' (masked mean cross-entropy scalar),
        'argmax' (batch x seq greedy predictions) and 'params'.
        """
        n_layers = 1
        n_units = 256
        n_attention_units = 256
        emb_size = 256
        rnn = DropoutLSTMLayer

        # Decoder-side inputs: GO-shifted targets and the two masks.
        l_go_out = L.InputLayer((None, None), input_var=out_go_sym)
        l_out_mask = L.InputLayer((None, None), input_var=out_mask)
        l_in_mask = L.InputLayer((None, None), input_var=encoder_mask)

        l_emb = L.EmbeddingLayer(l_go_out,
                                 dict_size,
                                 emb_size,
                                 name=name + '.Embedding')

        # Summarize the encoder sequence by its last hidden state.
        last_hid_encoded = L.SliceLayer(rnn(l_encoder_hid,
                                            num_units=n_units,
                                            mask_input=l_in_mask,
                                            name=name + '.Summarizer',
                                            dropout=0.25),
                                        indices=-1,
                                        axis=1)
        # Broadcast that summary across every output timestep.
        encoder_last_hid_repeat = RepeatLayer(last_hid_encoded,
                                              n=T.shape(out_go_sym)[1],
                                              axis=1)

        l_dec = L.ConcatLayer([l_emb, encoder_last_hid_repeat], axis=2)
        for i in range(n_layers):
            l_dec = rnn(l_dec,
                        num_units=n_units,
                        mask_input=l_out_mask,
                        name="%s.%d.Forward" % (name, i),
                        learn_init=True,
                        dropout=0.25)

        l_attention = BahdanauKeyValueAttentionLayer(
            [l_encoder_hid, l_input_one_hot, l_in_mask, l_dec],
            n_attention_units,
            name=name + '.Attention')  # (bs, seq_out, dict)
        # Flatten (batch, seq) so each timestep is scored independently.
        l_out = L.ReshapeLayer(l_attention, (-1, [2]))

        out_random = L.get_output(
            l_out, deterministic=False)  # (batch * seq_out) x dict
        out_deterministic = L.get_output(
            l_out, deterministic=True)  # (batch * seq_out) x dict
        params = L.get_all_params([l_out], trainable=True)

        # Epsilon keeps log() finite on exactly-zero probabilities.
        rcrossentropy = T.nnet.categorical_crossentropy(
            out_random + 1e-8, out_sym.flatten())  # (batch * seq) x 1
        crossentropy = T.reshape(rcrossentropy, (bs, -1))  # batch x seq
        # Mask out padding before averaging the per-token losses.
        loss = T.sum(out_mask * crossentropy) / T.sum(out_mask)  # scalar

        argmax = T.argmax(T.reshape(out_deterministic, (bs, -1, dict_size)),
                          axis=-1)  # batch x seq x 1

        return {'loss': loss, 'argmax': argmax, 'params': params}
Example #22
0
def clone(src_net, dst_net, mask_input):
    """
    Clones a lasagne neural network, keeping weights tied.

    For all layers of src_net in turn, starting at the first:
     1. creates a copy of the layer,
     2. reuses the original objects for weights and
     3. appends the new layer to dst_net.

    InputLayers are ignored.
    Recurrent layers (LSTMLayer) are passed mask_input.

    Args:
        src_net: layer whose ancestry is cloned.
        dst_net: layer stack the clones are appended to.
        mask_input: mask layer given to cloned LSTMLayers.

    Returns:
        The top layer of the extended dst_net stack.

    Raises:
        ValueError: on a layer type this function does not handle.
    """
    logger.info("Net to be cloned:")
    for l in layers.get_all_layers(src_net):
        logger.info(" - {} ({}):".format(l.name, l))

    logger.info("Starting to clone..")
    for l in layers.get_all_layers(src_net):
        logger.info("src_net[...]: {} ({}):".format(l.name, l))
        # isinstance() instead of type(...) == ... so subclasses of the
        # handled layer types are cloned too; none of these lasagne layer
        # classes derive from one another, so dispatch order is safe.
        if isinstance(l, layers.InputLayer):
            logger.info(' - skipping')
            continue
        if isinstance(l, layers.DenseLayer):
            # Reuse W and b so the clone stays weight-tied to the source.
            dst_net = layers.DenseLayer(
                dst_net,
                num_units=l.num_units,
                W=l.W,
                b=l.b,
                nonlinearity=l.nonlinearity,
                name=l.name+'2',
            )
        elif isinstance(l, layers.EmbeddingLayer):
            dst_net = layers.EmbeddingLayer(
                dst_net,
                l.input_size,
                l.output_size,
                W=l.W,
                name=l.name+'2',
            )
        elif isinstance(l, layers.LSTMLayer):
            # Rebuild each gate from the source layer's shared parameters.
            dst_net = layers.LSTMLayer(
                dst_net,
                l.num_units,
                ingate=layers.Gate(
                    W_in=l.W_in_to_ingate,
                    W_hid=l.W_hid_to_ingate,
                    W_cell=l.W_cell_to_ingate,
                    b=l.b_ingate,
                    nonlinearity=l.nonlinearity_ingate
                ),
                forgetgate=layers.Gate(
                    W_in=l.W_in_to_forgetgate,
                    W_hid=l.W_hid_to_forgetgate,
                    W_cell=l.W_cell_to_forgetgate,
                    b=l.b_forgetgate,
                    nonlinearity=l.nonlinearity_forgetgate
                ),
                cell=layers.Gate(
                    W_in=l.W_in_to_cell,
                    W_hid=l.W_hid_to_cell,
                    W_cell=None,
                    b=l.b_cell,
                    nonlinearity=l.nonlinearity_cell
                ),
                outgate=layers.Gate(
                    W_in=l.W_in_to_outgate,
                    W_hid=l.W_hid_to_outgate,
                    W_cell=l.W_cell_to_outgate,
                    b=l.b_outgate,
                    nonlinearity=l.nonlinearity_outgate
                ),
                nonlinearity=l.nonlinearity,
                cell_init=l.cell_init,
                hid_init=l.hid_init,
                backwards=l.backwards,
                learn_init=l.learn_init,
                peepholes=l.peepholes,
                gradient_steps=l.gradient_steps,
                grad_clipping=l.grad_clipping,
                unroll_scan=l.unroll_scan,
                precompute_input=l.precompute_input,
                # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input'
                name=l.name+'2',
                mask_input=mask_input,
            )
        elif isinstance(l, layers.SliceLayer):
            dst_net = layers.SliceLayer(
                dst_net,
                indices=l.slice,
                axis=l.axis,
                name=l.name+'2',
            )
        else:
            raise ValueError("Unhandled layer: {}".format(l))
        new_layer = layers.get_all_layers(dst_net)[-1]
        logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name))

    logger.info("Result of cloning:")
    for l in layers.get_all_layers(dst_net):
        logger.info(" - {} ({}):".format(l.name, l))

    return dst_net
    def build_network(self, K, vocab_size, W_init):
        """Build a K-hop gated-attention reader over document and query.

        Args:
            K: number of reasoning hops (K-1 gated layers + final layer).
            vocab_size: size of the shared word embedding table.
            W_init: initial embedding weights.

        Returns:
            (final, final_v, l_doc, l_qs, l_docembed.W): train-time and
            deterministic candidate distributions, the final document
            layer, the per-hop query layers, and the embedding weights.

        Uses self.inps[...] as model inputs -- presumably (doc ids,
        doc tokens, query ids, query tokens, candidates, ..., masks,
        char tokens, char mask, candidate mask, features, cloze pos);
        TODO confirm against the class constructor.
        """
        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        # Word embeddings; query shares the document's weight matrix.
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)
        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        # Binary feature (e.g. query-overlap indicator) embedding.
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        # Optionally freeze the embedding table.
        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        # char embeddings
        if self.use_chars:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                        2 * self.char_dim)  # T x L x D
            # Forward/backward GRUs read each token's character sequence.
            l_fgru = L.GRULayer(l_lookup,
                                self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                only_return_final=True)
            l_bgru = L.GRULayer(l_lookup,
                                2 * self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                backwards=True,
                                only_return_final=True)  # T x 2D
            # NOTE(review): self.embed_dim / 2 is float division on
            # Python 3 -- relies on Python 2 integer division here.
            l_fwdembed = L.DenseLayer(l_fgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            # Gather each word's char vector by its token id.
            l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

            l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        # Bidirectional query encoder (full sequence output).
        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)

        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q])  # B x Q x 2D
        q = L.get_output(l_q)  # B x Q x 2D
        # Select the query state at the cloze position (self.inps[12]).
        q = q[T.arange(q.shape[0]), self.inps[12], :]  # B x 2D

        l_qs = [l_q]
        # K-1 gated-attention hops: each hop re-encodes doc and query,
        # then gates the document by its query-attended representation.
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                    mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                            backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE
            l_qs.append(l_q_c_1)

            # Doc-to-query attention, masked and renormalized over Q.
            qd = L.get_output(l_q_c_1)  # B x Q x DE
            dd = L.get_output(l_doc_1)  # B x N x DE
            M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
            alphas = T.nnet.softmax(
                T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
            alphas_r = T.reshape(alphas, (M.shape[0],M.shape[1],M.shape[2]))* \
                    self.inps[7][:,np.newaxis,:] # B x N x Q
            alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :,
                                                       np.newaxis]  # B x N x Q
            q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

            # Multiplicative gating of the document by the query context.
            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE

        # Final document encoder (optionally with extra features).
        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                        backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        # Train-time scores: mask by candidates (inps[10]), renormalize,
        # then aggregate over candidate occurrences (inps[4]).
        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, self.inps[4])

        # Deterministic (no-dropout) version for evaluation.
        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, self.inps[4])

        return final, final_v, l_doc, l_qs, l_docembed.W
Example #24
0
    def __init__(self):
        """Build a Dynamic-Memory-Network-style graph: an input (context)
        module, a question module, and a multi-hop memory module.
        Python 2 code (uses xrange)."""
        # Model hyperparameters.
        self.batch_size = 32
        self.embedding_size = 50
        self.nb_max_sentences = 10
        self.length_max_sentences = 30
        self.vocab_size = 10000
        self.nb_hidden = 32
        self.nb_hops = 5
        # Dimension of the input context is (batch_size, number of sentences, max size of sentences)
        self.context = T.itensor3('context')
        self.mask_context = T.imatrix('context_mask')
        # Dimension of the question input is (batch_size, max size of sentences)
        # NOTE(review): declared itensor3 although the comment above says
        # the question is 2-D -- verify against how it is fed.
        self.question = T.itensor3('question')
        self.mask_question = T.imatrix('question_mask')
        """
        Building the Input context module

        """
        # One mask row per (batch, sentence) pair, matching the reshape below.
        mask_context = layers.InputLayer(
            (self.batch_size * self.nb_max_sentences,
             self.length_max_sentences),
            input_var=self.mask_context)
        # (batch_size, nb_sentences, length_max_sentences)
        input_module = layers.InputLayer(
            (self.batch_size, self.nb_max_sentences,
             self.length_max_sentences),
            input_var=self.context)
        # (batch_size, nb_sentences * length_max_sentences)
        input_module = layers.ReshapeLayer(input_module, (self.batch_size, -1))

        # (batch_size, nb_sentences * length_max_sequences, embedding_size)
        input_module = layers.EmbeddingLayer(input_module, self.vocab_size,
                                             self.embedding_size)

        # (batch_size, nb_sentences, length_max_sequences, embedding_size)
        input_module = layers.ReshapeLayer(
            input_module, (self.batch_size, self.nb_max_sentences,
                           self.length_max_sentences, self.embedding_size))

        # (batch_size * nb_sentences, length_sentences, embedding_size)
        input_module = layers.ReshapeLayer(
            input_module, (self.batch_size * self.nb_max_sentences,
                           self.length_max_sentences, self.embedding_size))

        # (batch_size * nb_sentences, nb_hidden)
        input_module = layers.GRULayer(input_module,
                                       self.nb_hidden,
                                       mask_input=mask_context,
                                       only_return_final=True)
        context = layers.get_output(input_module)
        # input_module = layers.ReshapeLayer(input_module, (self.batch_size, self.nb_max_sentences, self.nb_hidden))
        """
        Building the Input context module

        """
        # (bach_size, length_sentences)
        mask_question = layers.InputLayer(
            (self.batch_size, self.length_max_sentences),
            input_var=self.mask_question)
        # (batch_size, length_sentences)
        # NOTE(review): no input_var here -- self.question is never wired
        # into the graph, so this input layer is unfed. Likely a bug.
        question_module = layers.InputLayer(
            (self.batch_size, self.length_max_sentences))

        # (batch_size, length_sentences, embedding_size)
        question_module = layers.EmbeddingLayer(question_module,
                                                self.vocab_size,
                                                self.embedding_size)

        # (batch_size, nb_hidden)
        question_module = layers.GRULayer(question_module,
                                          self.nb_hidden,
                                          mask_input=mask_question,
                                          only_return_final=True)
        question = layers.get_output(question_module)
        """
        Building the Memory module

        """
        # Memory is initialized with the question encoding.
        memory = question
        self._M = utils.get_shared('glorot_uniform', self.nb_hidden,
                                   self.nb_hidden)

        for step in xrange(self.nb_hops):
            # Gate feature vector z: similarities between context,
            # question and current memory (DMN-style scoring).
            z_score_vector = T.concatenate([
                context, question, memory, context * question,
                context * memory,
                T.abs_(context - question),
                T.abs_(context - memory),
                T.dot(T.dot(context, self._M), question),
                T.dot(T.dot(context, self._M), memory)
            ])

            # NOTE(review): _M1/_B1/_M2/_B2 are re-created every hop, so
            # each hop gets fresh (unshared) gate parameters and earlier
            # hops' weights are overwritten on self -- confirm intent.
            self._M1 = utils.get_shared('glorot_uniform', self.nb_hidden * 9,
                                        self.nb_hidden)
            self._B1 = utils.get_shared('constant_zero', self.nb_hidden, None)
            z1 = T.tanh(T.dot(self._M1, z_score_vector) + self._B1)

            self._M2 = utils.get_shared('glorot_uniform', self.nb_hidden, 1)
            self._B2 = utils.get_shared('constant_zero', self.nb_hidden, None)
            z2 = T.nnet.sigmoid(T.dot(self._M2, z1) + self._B2)
Example #25
0
    def buildModel(self):
        """Build the joint model and compile its Theano training functions.

        Creates: a supervised "step" classifier over node features and
        embeddings (self.step_train / self.test_fn), a skip-gram-style
        context predictor (self.sup_train), and an LSTM that re-weights
        the unsupervised loss by sub-path importance (self.lstm_fn /
        self.alpha_fn).
        """
        print(' -- Building...')
        # Symbolic inputs: sparse features, labels, node pairs, node
        # indices, sub-paths and their masks.
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        g_init = T.imatrix('g')
        ind_init = T.ivector('ind')
        sub_path_init = T.imatrix('subPathsBatch')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init)
        ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init)
        # g holds (source, target) vertex pairs.
        pair_second = lgl.SliceLayer(g_input, indices=1, axis=1)
        pair_first = lgl.SliceLayer(g_input, indices=0, axis=1)
        pair_first_emd = lgl.EmbeddingLayer(pair_first,
                                            input_size=self.num_ver,
                                            output_size=self.embedding_size)
        # Predict the paired vertex from the source embedding.
        emd_to_numver = layers.DenseLayer(
            pair_first_emd,
            self.num_ver,
            nonlinearity=lg.nonlinearities.softmax)
        # Vertex-index embedding shares weights with pair_first_emd.
        index_emd = lgl.EmbeddingLayer(ind_input,
                                       input_size=self.num_ver,
                                       output_size=self.embedding_size,
                                       W=pair_first_emd.W)
        x_to_ydim = layers.SparseLayer(x_input,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        index_emd = layers.DenseLayer(index_emd,
                                      self.y.shape[1],
                                      nonlinearity=lg.nonlinearities.softmax)
        # Combine feature-based and embedding-based label predictions.
        concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1)
        concat_two = layers.DenseLayer(concat_two,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        concat_two_output = lgl.get_output(concat_two)
        # Supervised loss: combined head plus both individual heads.
        step_loss = lgo.categorical_crossentropy(concat_two_output,
                                                 y_init).mean()
        hid_loss = lgl.get_output(x_to_ydim)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(index_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = [
            index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b, concat_two.W,
            concat_two.b
        ]
        step_updates = lg.updates.sgd(step_loss,
                                      step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init, ind_init],
                                          step_loss,
                                          updates=step_updates,
                                          on_unused_input='ignore')
        self.test_fn = theano.function([x_init, ind_init],
                                       concat_two_output,
                                       on_unused_input='ignore')

        # supervised train
        # Skip-gram-style objective: predict the paired vertex.
        fc_output = lgl.get_output(emd_to_numver)
        pair_second_output = lgl.get_output(pair_second)
        sup_loss = lgo.categorical_crossentropy(fc_output,
                                                pair_second_output).sum()
        sup_params = lgl.get_all_params(emd_to_numver, trainable=True)
        sup_updates = lg.updates.sgd(sup_loss,
                                     sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([g_init],
                                         sup_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        # Per-pair cross-entropy kept as a row vector for re-weighting.
        cross_entropy = lgo.categorical_crossentropy(fc_output,
                                                     pair_second_output)
        cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size),
                                  ndim=None)

        # Sub-path encoder: embed vertex sequences (sharing the vertex
        # embedding) and run them through an LSTM.
        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=sub_path_init)
        sub_path_emd = lgl.EmbeddingLayer(subPath_in,
                                          input_size=self.num_ver,
                                          output_size=self.embedding_size,
                                          W=pair_first_emd.W)

        lstm_layer = lgl.LSTMLayer(sub_path_emd,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        # Scalar importance per sub-path: mean LSTM activation,
        # batch-normalized, squashed, then scaled into [1, 1.3].
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        # NOTE(review): max2_init is immediately rebound -- the fcol above
        # is dead code.
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight

        # unsupervised train
        # Loss = path-weighted sum of per-pair cross-entropies; LSTM and
        # embedding ("alpha") parameters are updated by separate functions.
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params_all = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_params = list(set(lstm_params_all).difference(set(sup_params)))
        lstm_updates = lg.updates.sgd(reweight_loss,
                                      lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([sub_path_init, g_init, mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss,
                                       sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([sub_path_init, g_init, mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')
        print(' -- Done!')
    def build_network(self,
                      vocab_size,
                      input_var,
                      mask_var,
                      docidx_var,
                      docidx_mask,
                      skip_connect=True):
        """Build a two-layer LSTM reader with explicitly shared parameters.

        The input tokens are embedded with the shared table
        ``self.params['W_emb']``, passed through two LSTM layers whose gate
        weights/biases all come from ``self.params`` (so they can be
        initialized or tied elsewhere), and the final hidden states of both
        LSTMs are combined into a score over the vocabulary through a dense
        layer whose weight matrix is the *transposed* embedding table
        (weight tying).

        Parameters:
            vocab_size: number of rows in the embedding table and number of
                output units of the final softmax layer.
            input_var: token-index tensor fed to an InputLayer of shape
                (batch, time, 1).
            mask_var: sequence mask of shape (batch, time).
            docidx_var, docidx_mask: unused on the live code path; only
                referenced by the commented-out scan-based softmax below.
            skip_connect: if True, concatenate the embeddings onto the first
                LSTM's outputs before feeding the second LSTM.

        Returns:
            The output layer producing a softmax over the vocabulary.
        """

        l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)

        l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)

        # embedding table shared via self.params['W_emb'] (also tied to the
        # output layer's weights at the bottom of this method)
        l_embed = L.EmbeddingLayer(l_in,
                                   input_size=vocab_size,
                                   output_size=EMBED_DIM,
                                   W=self.params['W_emb'])

        l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE)

        # NOTE: Moved initialization of forget gate biases to init_params
        #forget_gate_1 = L.Gate(b=lasagne.init.Constant(3))
        #forget_gate_2 = L.Gate(b=lasagne.init.Constant(3))

        # NOTE: LSTM layer provided by Lasagne is slightly different from that used in DeepMind's paper.
        # In the paper the cell-to-* weights are not diagonal.
        # the 1st lstm layer -- all four gates built from externally managed params
        in_gate = L.Gate(W_in=self.params['W_lstm1_xi'],
                         W_hid=self.params['W_lstm1_hi'],
                         W_cell=self.params['W_lstm1_ci'],
                         b=self.params['b_lstm1_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'],
                             W_hid=self.params['W_lstm1_hf'],
                             W_cell=self.params['W_lstm1_cf'],
                             b=self.params['b_lstm1_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm1_xo'],
                          W_hid=self.params['W_lstm1_ho'],
                          W_cell=self.params['W_lstm1_co'],
                          b=self.params['b_lstm1_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'],
                           W_hid=self.params['W_lstm1_hc'],
                           W_cell=None,
                           b=self.params['b_lstm1_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_1 = L.LSTMLayer(l_embed_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # the 2nd lstm layer
        if skip_connect:
            # construct skip connection from the lookup table to the 2nd layer
            batch_size, seq_len, _ = input_var.shape
            # concatenate the last dimension of l_fwd_1 and embed
            l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN))
            l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM))
            to_next_layer = L.ReshapeLayer(
                L.concat([l_fwd_1_shp, l_embed_shp], axis=1),
                (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM))
        else:
            to_next_layer = l_fwd_1

        to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE)

        # gates for the 2nd LSTM layer, again from self.params
        in_gate = L.Gate(W_in=self.params['W_lstm2_xi'],
                         W_hid=self.params['W_lstm2_hi'],
                         W_cell=self.params['W_lstm2_ci'],
                         b=self.params['b_lstm2_i'],
                         nonlinearity=lasagne.nonlinearities.sigmoid)
        forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'],
                             W_hid=self.params['W_lstm2_hf'],
                             W_cell=self.params['W_lstm2_cf'],
                             b=self.params['b_lstm2_f'],
                             nonlinearity=lasagne.nonlinearities.sigmoid)
        out_gate = L.Gate(W_in=self.params['W_lstm2_xo'],
                          W_hid=self.params['W_lstm2_ho'],
                          W_cell=self.params['W_lstm2_co'],
                          b=self.params['b_lstm2_o'],
                          nonlinearity=lasagne.nonlinearities.sigmoid)
        cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'],
                           W_hid=self.params['W_lstm2_hc'],
                           W_cell=None,
                           b=self.params['b_lstm2_c'],
                           nonlinearity=lasagne.nonlinearities.tanh)
        l_fwd_2 = L.LSTMLayer(to_next_layer_noise,
                              NUM_HIDDEN,
                              ingate=in_gate,
                              forgetgate=forget_gate,
                              cell=cell_gate,
                              outgate=out_gate,
                              peepholes=True,
                              grad_clipping=GRAD_CLIP,
                              mask_input=l_mask,
                              gradient_steps=GRAD_STEPS,
                              precompute_input=True)

        # slice final states of both lstm layers
        l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
        l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)

        # g will be used to score the words based on their embeddings
        g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1),
                         num_units=EMBED_DIM,
                         W=self.params['W_dense'],
                         b=self.params['b_dense'],
                         nonlinearity=lasagne.nonlinearities.tanh)

        ## get outputs
        #g_out = L.get_output(g) # B x D
        #g_out_val = L.get_output(g, deterministic=True) # B x D

        ## compute softmax probs
        #probs,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs = probs.reshape(docidx_var.shape) # B x N
        #probs_val,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm),
        #                    outputs_info=None,
        #                    sequences=[g_out_val,docidx_var,docidx_mask],
        #                    non_sequences=self.params['W_emb'])
        #predicted_probs_val = probs_val.reshape(docidx_var.shape) # B x N
        #return predicted_probs, predicted_probs_val

        # W is shared with the lookup table (weight tying via transpose)
        l_out = L.DenseLayer(g,
                             num_units=vocab_size,
                             W=self.params['W_emb'].T,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             b=None)
        return l_out
# Example #27 (scrape-artifact marker; commented out so it cannot raise NameError at import)
# 0
def build_model(vocab_size,
                doc_var,
                qry_var,
                doc_mask_var,
                qry_mask_var,
                W_init=lasagne.init.Normal()):
    """Build a bidirectional-LSTM document/query reader.

    Document and query tokens share one embedding table; each sequence is
    encoded by a forward and a backward LSTM whose terminal states are
    summed and projected, then the two representations are concatenated
    and scored over the vocabulary through a dense layer tied to the
    (transposed) embedding matrix.
    """
    # token-index inputs, shape (batch, time, 1)
    doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var)

    # one lookup table, shared between document and query sides
    doc_emb = L.EmbeddingLayer(doc_in, vocab_size, EMBED_DIM, W=W_init)
    qry_emb = L.EmbeddingLayer(qry_in,
                               vocab_size,
                               EMBED_DIM,
                               W=doc_emb.W)

    doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var)
    qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var)

    # common recurrent settings for all four LSTMs
    rnn_kw = dict(grad_clipping=GRAD_CLIP,
                  gradient_steps=GRAD_STEPS,
                  precompute_input=True)
    doc_fwd = L.LSTMLayer(doc_emb, NUM_HIDDEN, mask_input=doc_mask, **rnn_kw)
    doc_bkd = L.LSTMLayer(doc_emb, NUM_HIDDEN, mask_input=doc_mask,
                          backwards=True, **rnn_kw)
    qry_fwd = L.LSTMLayer(qry_emb, NUM_HIDDEN, mask_input=qry_mask, **rnn_kw)
    qry_bkd = L.LSTMLayer(qry_emb, NUM_HIDDEN, mask_input=qry_mask,
                          backwards=True, **rnn_kw)

    # terminal hidden states: last step of the forward pass,
    # first step of the backward pass
    doc_fwd_last = L.SliceLayer(doc_fwd, -1, 1)
    doc_bkd_first = L.SliceLayer(doc_bkd, 0, 1)
    qry_fwd_last = L.SliceLayer(qry_fwd, -1, 1)
    qry_bkd_first = L.SliceLayer(qry_bkd, 0, 1)

    doc_repr = L.DenseLayer(
        L.ElemwiseSumLayer([doc_fwd_last, doc_bkd_first]),
        num_units=NUM_HIDDEN,
        nonlinearity=lasagne.nonlinearities.tanh)
    qry_repr = L.DenseLayer(
        L.ElemwiseSumLayer([qry_fwd_last, qry_bkd_first]),
        num_units=NUM_HIDDEN,
        nonlinearity=lasagne.nonlinearities.tanh)

    joint = L.DenseLayer(L.concat([doc_repr, qry_repr], axis=1),
                         num_units=EMBED_DIM,
                         W=lasagne.init.GlorotNormal(),
                         nonlinearity=lasagne.nonlinearities.tanh)

    # output weights are the transposed embedding table (weight tying)
    return L.DenseLayer(joint,
                        num_units=vocab_size,
                        W=doc_emb.W.T,
                        nonlinearity=lasagne.nonlinearities.softmax,
                        b=None)
def build_model(hyparams,
                vocab,
                nclasses=2,
                batchsize=None,
                invar=None,
                maskvar=None,
                maxlen=MAXLEN):
    """Build an (optionally bidirectional) two-layer LSTM text classifier.

    Tokens are embedded, encoded by a first LSTM layer (plus a backward
    LSTM averaged element-wise with it when ``hyparams.bidirectional``),
    passed through dropout, a second LSTM that returns only its final
    state, more dropout, and a softmax over ``nclasses``.

    Parameters:
        hyparams: object exposing embedding_dim, nhidden, bidirectional,
            pool, grad_clip and init attributes.
        vocab: vocabulary container; only its length is used.
        nclasses: number of output classes for the softmax layer.
        batchsize, maxlen: symbolic/static input dimensions.
        invar, maskvar: Theano variables for token indices and mask.

    Returns:
        OrderedDict mapping layer names to Lasagne layers; the final
        classifier layer is ``net['softmax']``.
    """

    embedding_dim = hyparams.embedding_dim
    nhidden = hyparams.nhidden
    bidirectional = hyparams.bidirectional
    pool = hyparams.pool
    grad_clip = hyparams.grad_clip
    init = hyparams.init

    net = OrderedDict()

    V = len(vocab)
    W = lasagne.init.Normal()

    # NOTE(review): the same Gate/cell spec objects are reused for every
    # gate of every LSTM layer below; with Lasagne these act as
    # initializer specs, but confirm the sharing is intentional.
    gate_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        b=lasagne.init.Constant(0.)
    )
    cell_params = layer.recurrent.Gate(
        W_in=lasagne.init.Orthogonal(),
        W_hid=lasagne.init.Orthogonal(),
        W_cell=None,
        b=lasagne.init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.tanh
    )

    # define model
    net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar)
    net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar)
    net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W)
    net['fwd1'] = layer.LSTMLayer(
        net['emb'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True
    )
    if bidirectional:
        net['bwd1'] = layer.LSTMLayer(
            net['emb'],
            num_units=nhidden,
            grad_clipping=grad_clip,
            nonlinearity=lasagne.nonlinearities.tanh,
            mask_input=net['mask'],
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            learn_init=True,
            backwards=True
        )

        # element-wise mean of the forward and backward sequences
        def tmean(a, b):
            agg = theano.tensor.add(a, b)
            agg /= 2.
            return agg

        net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean)
    else:
        net['pool'] = layer.ConcatLayer([net['fwd1']])
    net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5)
    net['fwd2'] = layer.LSTMLayer(
        net['dropout1'],
        num_units=nhidden,
        grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=net['mask'],
        ingate=gate_params,
        forgetgate=gate_params,
        cell=cell_params,
        outgate=gate_params,
        learn_init=True,
        only_return_final=True
    )
    net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6)
    net['softmax'] = layer.DenseLayer(
        net['dropout2'],
        num_units=nclasses,
        nonlinearity=lasagne.nonlinearities.softmax
    )
    # assumed shapes used only to pretty-print output shapes below
    ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)}
    logstr = '========== MODEL ========== \n'
    logstr += 'vocab size: %d\n' % V
    logstr += 'embedding dim: %d\n' % embedding_dim
    logstr += 'nhidden: %d\n' % nhidden
    logstr += 'pooling: %s\n' % pool
    for lname, lyr in net.items():
        logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME)))
    logstr += '=========================== \n'
    # fix: was the Python 2 statement `print logstr`, a SyntaxError under
    # Python 3; the function form prints the same text on both versions.
    print(logstr)
    return net
# Example #29 (scrape-artifact marker; commented out so it cannot raise NameError at import)
# 0
    def get_actor(self, avg=False):
        """Assemble the actor network for the parser.

        Embeds words, characters, POS tags and dependency labels
        (selecting feature groups according to ``self.args``), then feeds
        the concatenated token features through a stack of dense layers
        ending in a softmax over the action set.

        When ``avg`` is True the layers are named with an ``_avg`` suffix
        and initialized to constants (averaged-parameter copy of the net).

        Returns:
            (word_input, char_input, tag_input, label_input, softmax_out)
        """
        tag = '_avg' if avg else ''

        # word embeddings
        in_words = L.InputLayer(shape=(None, self.args.sw))
        emb_words = L.EmbeddingLayer(
            in_words,
            self.args.vw,
            self.args.nw,
            name='ew' + tag,
            W=HeNormal() if not avg else Constant())
        emb_words.params[emb_words.W].remove('regularizable')
        if 'w' in self.args.freeze:
            emb_words.params[emb_words.W].remove('trainable')
        if not avg:
            # expose the word-embedding matrix for access from outside
            self.Ew = emb_words.W

        # character-level word representations (CNN/LSTM over characters)
        in_chars = L.InputLayer(shape=(None, self.args.sw,
                                       self.args.max_len))
        emb_chars = self.get_char2word(in_chars, avg)

        # POS-tag embeddings
        in_tags = L.InputLayer(shape=(None, self.args.st))
        emb_tags = L.EmbeddingLayer(in_tags,
                                    self.args.vt,
                                    self.args.nt,
                                    name='et' + tag,
                                    W=HeNormal() if not avg else Constant())
        emb_tags.params[emb_tags.W].remove('regularizable')

        # dependency-label embeddings
        in_labels = L.InputLayer(shape=(None, self.args.sl))
        emb_labels = L.EmbeddingLayer(in_labels,
                                      self.args.vl,
                                      self.args.nl,
                                      name='el' + tag,
                                      W=HeNormal() if not avg else Constant())
        emb_labels.params[emb_labels.W].remove('regularizable')

        # pick the word-level feature groups requested by args.type
        features = []
        if self.args.type == 'word':
            features.append(emb_words)
        elif self.args.type == 'char':
            features.append(emb_chars)
        elif self.args.type == 'both':
            features.extend([emb_words, emb_chars])
        elif self.args.type == 'mix':
            features.append(L.ElemwiseSumLayer([emb_words, emb_chars]))

        if not self.args.untagged:
            features.append(emb_tags)
        if not self.args.unlabeled:
            features.append(emb_labels)

        feats = L.concat(features, axis=2)

        # optionally squeeze each token's concatenated features into a
        # more compact vector before entering the dense stack
        if self.args.squeeze:
            feats = L.DenseLayer(
                feats,
                num_units=self.args.squeeze,
                name='h0' + tag,
                num_leading_axes=2,
                W=HeNormal('relu') if not avg else Constant())

        hid1 = L.DenseLayer(
            feats,
            num_units=self.args.nh1,
            name='h1' + tag,
            W=HeNormal('relu') if not avg else Constant())
        hid1 = L.dropout(hid1, self.args.p1)
        hid2 = L.DenseLayer(
            hid1,
            num_units=self.args.nh2,
            name='h2' + tag,
            W=HeNormal('relu') if not avg else Constant())
        hid2 = L.dropout(hid2, self.args.p2)
        # softmax over the action inventory
        actions = L.DenseLayer(hid2,
                               num_units=self.args.nh3,
                               name='h3' + tag,
                               W=HeNormal() if not avg else Constant(),
                               nonlinearity=softmax)

        return in_words, in_chars, in_tags, in_labels, actions
    def build_network(self, K, vocab_size, W_init):
        """Build a K-hop gated-attention reader over document/query pairs.

        Embeds document and query tokens with a shared table, optionally
        augments them with character-level GRU representations gated via
        ``GateDymLayer``, then runs K-1 rounds of bidirectional GRU
        encoding + matrix attention between document and query, a final
        bidirectional GRU pass on each side, and an attention-sum layer
        producing the answer distribution.

        Parameters:
            K: number of hops; K-1 attention/encoding rounds precede the
                final pass.
            vocab_size: embedding-table row count.
            W_init: initial value/initializer for the shared embedding
                matrix.

        Returns:
            (final, final_v, l_prob, W_emb, attentions) -- train-time and
            deterministic outputs of the probability layer, the layer
            itself, the shared embedding weights, and any saved attention
            matrices.

        NOTE(review): the meaning of each ``self.inps[i]`` index below is
        taken from usage only (0/1 doc tokens, 2/3 query tokens, 6/7
        masks, 8/9 char tokens+mask, 11 features, 13 match features,
        14/15 char-usage flags, 4/10/12 fed to AttentionSumLayer) --
        confirm against the class constructor.
        """

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        # binary match feature embedded to a single scalar per position
        l_match_feat = L.InputLayer(shape=(None, None, None),
                                    input_var=self.inps[13])
        l_match_feat = L.EmbeddingLayer(l_match_feat, 2, 1)
        l_match_feat = L.ReshapeLayer(l_match_feat, (-1, [1], [2]))

        l_use_char = L.InputLayer(shape=(None, None, self.feat_cnt),
                                  input_var=self.inps[14])
        l_use_char_q = L.InputLayer(shape=(None, None, self.feat_cnt),
                                    input_var=self.inps[15])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        # shared word-embedding table for document and query
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)

        # optionally freeze the embedding table
        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')
            l_qembed.params[l_qembed.W].remove('trainable')

        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        # char embeddings -- the commented blocks below are alternative
        # character-integration strategies that were experimented with;
        # only the "gate + concat" variant is live.
        if self.use_chars:
            # ====== concatenation ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2*self.char_dim) # T x L x D
            # l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
            #         mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
            #         only_return_final=True)
            # l_bgru = L.GRULayer(l_lookup, 2*self.char_dim, grad_clipping=GRAD_CLIP,
            #         mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
            #         backwards=True, only_return_final=True) # T x 2D
            # l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            # l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            # l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            # l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2
            # l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2

            # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

            # ====== bidir feat concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

            # ====== char concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_qchar_embed, l_qembed], axis = 2)

            # ====== feat concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

            # ====== gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== tie gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed], W = l_doce.W, b = l_doce.b)

            # ====== scalar gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = ScalarDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = ScalarDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== dibirectional gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== gate + concat ======  (live code path)
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            l_char_gru = L.GRULayer(l_lookup,
                                    self.embed_dim,
                                    grad_clipping=GRAD_CLIP,
                                    mask_input=l_tokmask,
                                    gradient_steps=GRAD_STEPS,
                                    precompute_input=True,
                                    only_return_final=True)
            l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            l_doce = L.ConcatLayer([l_use_char, l_doce], axis=2)
            l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis=2)

            # ====== bidirectional gate + concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # l_doce = L.ConcatLayer([l_use_char, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis = 2)

        attentions = []
        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doce, l_qembed])
            attentions.append(L.get_output(l_m, deterministic=True))

        # K-1 rounds of bidirectional GRU encoding + doc/query attention
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                    mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                            backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE

            l_doce = MatrixAttentionLayer(
                [l_doc_1, l_q_c_1, l_qmask, l_match_feat])
            # l_doce = MatrixAttentionLayer([l_doc_1, l_q_c_1, l_qmask])

            # === begin GA ===
            # l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
            # l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], mask_input=self.inps[7])
            # l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE
            # === end GA ===

            # if self.save_attn:
            #     attentions.append(L.get_output(l_m, deterministic=True))

        # final encoding pass
        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                        backwards=True)
        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)
        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doc, l_q])
            attentions.append(L.get_output(l_m, deterministic=True))

        # attention-sum over candidate answers
        l_prob = AttentionSumLayer([l_doc, l_q],
                                   self.inps[4],
                                   self.inps[12],
                                   mask_input=self.inps[10])
        final = L.get_output(l_prob)
        final_v = L.get_output(l_prob, deterministic=True)

        return final, final_v, l_prob, l_docembed.W, attentions