Example #1
 def build_network(self):
     l_char1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[0])
     l_char2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[1])
     l_mask1_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[2])
     l_mask2_in = L.InputLayer(shape=(None, None, self.max_word_len),
                               input_var=self.inps[3])
     l_char_in = L.ConcatLayer([l_char1_in, l_char2_in],
                               axis=1)  # B x (ND+NQ) x L
     l_char_mask = L.ConcatLayer([l_mask1_in, l_mask2_in], axis=1)
     shp = (self.inps[0].shape[0],
            self.inps[0].shape[1] + self.inps[1].shape[1],
            self.inps[1].shape[2])
     l_index_reshaped = L.ReshapeLayer(l_char_in,
                                       (shp[0] * shp[1], shp[2]))  # BN x L
     l_mask_reshaped = L.ReshapeLayer(l_char_mask,
                                      (shp[0] * shp[1], shp[2]))  # BN x L
     l_lookup = L.EmbeddingLayer(l_index_reshaped, self.num_chars,
                                 self.char_dim)  # BN x L x D
     l_fgru = L.GRULayer(l_lookup,
                         2 * self.char_dim,
                         grad_clipping=10,
                         gradient_steps=-1,
                         precompute_input=True,
                         only_return_final=True,
                         mask_input=l_mask_reshaped)
     l_bgru = L.GRULayer(l_lookup,
                         2 * self.char_dim,
                         grad_clipping=10,
                         gradient_steps=-1,
                         precompute_input=True,
                         backwards=True,
                         only_return_final=True,
                         mask_input=l_mask_reshaped)  # BN x 2D
     l_fwdembed = L.DenseLayer(l_fgru,
                               self.embed_dim / 2,
                               nonlinearity=None)  # BN x DE/2
     l_bckembed = L.DenseLayer(l_bgru,
                               self.embed_dim / 2,
                               nonlinearity=None)  # BN x DE/2
     l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
     l_char_embed = L.ReshapeLayer(l_embed,
                                   (shp[0], shp[1], self.embed_dim / 2))
     l_embed1 = L.SliceLayer(l_char_embed,
                             slice(0, self.inps[0].shape[1]),
                             axis=1)
     l_embed2 = L.SliceLayer(l_char_embed,
                             slice(-self.inps[1].shape[1], None),
                             axis=1)
     return l_embed1, l_embed2
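
The method above depends on the surrounding class (self.inps, self.max_word_len, self.num_chars, ...). Below is a minimal standalone sketch of the same character-level embedding idea with made-up sizes and its own input variables; all names are illustrative, not part of the original class.

import numpy as np
import theano
import theano.tensor as T
import lasagne.layers as L

num_chars, char_dim, embed_dim, max_word_len = 50, 25, 100, 10

chars = T.itensor3('chars')   # B x N x L character indices
mask = T.tensor3('mask')      # B x N x L, 1 for real characters, 0 for padding

l_char_in = L.InputLayer(shape=(None, None, max_word_len), input_var=chars)
l_mask_in = L.InputLayer(shape=(None, None, max_word_len), input_var=mask)

shp = (chars.shape[0], chars.shape[1], chars.shape[2])
l_index = L.ReshapeLayer(l_char_in, (shp[0] * shp[1], shp[2]))   # BN x L
l_mask = L.ReshapeLayer(l_mask_in, (shp[0] * shp[1], shp[2]))    # BN x L

l_lookup = L.EmbeddingLayer(l_index, num_chars, char_dim)        # BN x L x D
l_fgru = L.GRULayer(l_lookup, 2 * char_dim, mask_input=l_mask,
                    only_return_final=True)
l_bgru = L.GRULayer(l_lookup, 2 * char_dim, mask_input=l_mask,
                    backwards=True, only_return_final=True)
l_fwd = L.DenseLayer(l_fgru, embed_dim // 2, nonlinearity=None)
l_bck = L.DenseLayer(l_bgru, embed_dim // 2, nonlinearity=None)
l_embed = L.ElemwiseSumLayer([l_fwd, l_bck])                     # BN x DE/2
l_word = L.ReshapeLayer(l_embed, (shp[0], shp[1], embed_dim // 2))  # B x N x DE/2

f = theano.function([chars, mask], L.get_output(l_word))
out = f(np.random.randint(0, num_chars, (2, 7, max_word_len)).astype('int32'),
        np.ones((2, 7, max_word_len), dtype=theano.config.floatX))
print(out.shape)  # (2, 7, 50)

The final forward and backward GRU states are projected to embed_dim/2 and summed, so each word ends up with a single fixed-size character-derived vector.
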
Example #2
    def __init__(self,
                 insize,
                 vocoder,
                 mlpg_wins=[],
                 hiddensize=256,
                 nonlinearity=lasagne.nonlinearities.very_leaky_rectify,
                 nblayers=3,
                 bn_axes=None,
                 dropout_p=-1.0,
                 grad_clipping=50):
        if bn_axes is None:
            bn_axes = []  # Recurrent nets don't like batch norm [ref. needed]
        model.Model.__init__(self, insize, vocoder, hiddensize)

        if len(bn_axes) > 0:
            warnings.warn(
                'ModelBGRU: You are using bn_axes={}, but batch normalisation is supposed to make Recurrent Neural Networks (RNNs) unstable [ref. needed]'
                .format(bn_axes))

        l_hid = ll.InputLayer(shape=(None, None, insize),
                              input_var=self._input_values,
                              name='input_conditional')

        for layi in xrange(nblayers):
            layerstr = 'l' + str(1 + layi) + '_BGRU{}'.format(hiddensize)

            fwd = ll.GRULayer(l_hid,
                              num_units=hiddensize,
                              backwards=False,
                              name=layerstr + '.fwd',
                              grad_clipping=grad_clipping)
            bck = ll.GRULayer(l_hid,
                              num_units=hiddensize,
                              backwards=True,
                              name=layerstr + '.bck',
                              grad_clipping=grad_clipping)
            l_hid = ll.ConcatLayer((fwd, bck), axis=2)

            # Add batch normalisation
            if len(bn_axes) > 0:
                l_hid = ll.batch_norm(
                    l_hid, axes=bn_axes)  # May not be good for recurrent nets!

            # Add dropout (after batchnorm)
            if dropout_p > 0.0: l_hid = ll.dropout(l_hid, p=dropout_p)

        l_out = layer_final(l_hid, vocoder, mlpg_wins)

        self.init_finish(
            l_out
        )  # Has to be called at the end of the __init__ to print out the architecture, get the trainable params, etc.
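
model.Model, the vocoder argument and layer_final() are not shown in this snippet. Below is a minimal standalone sketch of just the stacked bidirectional-GRU pattern the constructor builds, with illustrative sizes.

import numpy as np
import theano
import theano.tensor as T
import lasagne.layers as ll

insize, hiddensize, nblayers = 40, 64, 3

x = T.tensor3('x')                                     # B x T x insize
l_hid = ll.InputLayer(shape=(None, None, insize), input_var=x)

for layi in range(nblayers):
    fwd = ll.GRULayer(l_hid, num_units=hiddensize, backwards=False,
                      grad_clipping=50)
    bck = ll.GRULayer(l_hid, num_units=hiddensize, backwards=True,
                      grad_clipping=50)
    l_hid = ll.ConcatLayer((fwd, bck), axis=2)          # B x T x 2*hiddensize

f = theano.function([x], ll.get_output(l_hid))
print(f(np.zeros((4, 100, insize), dtype=theano.config.floatX)).shape)  # (4, 100, 128)

Each iteration runs a forward and a backward GRU over the previous layer's output and concatenates them on the feature axis, so every hidden layer exposes 2*hiddensize features per frame; layer_final() would then map these to the vocoder output.
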
Example #3
    def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005, \
            learning_rate_rl=0.005, batch_size=32, ment=0.1):
        # 2-layer MLP
        self.in_size = in_size # x and y coordinate
        self.out_size = out_size # up, down, right, left
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \
                T.itensor3('am'), T.fvector('r')

        in_var = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1],self.in_size))

        l_mask_in = L.InputLayer(shape=(None,None), input_var=turn_mask)

        pol_in = T.fmatrix('pol-h')
        l_in = L.InputLayer(shape=(None,None,self.in_size), input_var=input_var)
        l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in, mask_input=l_mask_in) # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:,-1,:]
        l_den_in = L.ReshapeLayer(l_pol_rnn, (turn_mask.shape[0]*turn_mask.shape[1], n_hid)) # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, nonlinearity=lasagne.nonlinearities.softmax)

        self.network = l_out
        self.params = L.get_all_params(self.network)

        # rl
        probs = L.get_output(self.network) # BH x A
        out_probs = T.reshape(probs, (input_var.shape[0],input_var.shape[1],self.out_size)) # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs*act_mask).sum(axis=2) # B x H
        ep_probs = (act_probs*turn_mask).sum(axis=1) # B
        H_probs = -T.sum(T.sum(out_probs*log_probs,axis=2),axis=1) # B
        self.loss = 0.-T.mean(ep_probs*reward_var + ment*H_probs)

        updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=learning_rate_rl, \
                epsilon=1e-4)

        self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
        self.train_fn = theano.function(self.inps, self.loss, updates=updates)
        self.obj_fn = theano.function(self.inps, self.loss)
        self.act_fn = theano.function([input_var, turn_mask, pol_in], [out_probs, pol_out])

        # sl
        sl_loss = 0.-T.mean(ep_probs)
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, learning_rate=learning_rate_sl, \
                epsilon=1e-4)

        self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss, \
                updates=sl_updates)
        self.sl_obj_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss)
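
A hedged usage sketch of the compiled functions, only to document the tensor shapes they expect; `agent` is a hypothetical instance of the class above and all sizes are made up.

import numpy as np

B, T_turns, in_size, out_size, n_hid = 32, 5, 2, 4, 10

inp = np.zeros((B, T_turns, in_size), dtype='float32')      # 'in'   : B x T x in_size
turn_mask = np.ones((B, T_turns), dtype='int32')            # 'tm'   : 1 for real turns
act_mask = np.zeros((B, T_turns, out_size), dtype='int32')  # 'am'   : one-hot chosen actions
reward = np.zeros((B,), dtype='float32')                    # 'r'    : one return per episode
pol_h0 = np.zeros((B, n_hid), dtype='float32')              # 'pol-h': initial GRU state

# agent = DialogueAgent(...)   # hypothetical instance whose _init_model ran with these sizes
# loss = agent.train_fn(inp, turn_mask, act_mask, reward, pol_h0)   # RL update
# probs, pol_h = agent.act_fn(inp, turn_mask, pol_h0)               # action probabilities
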
    def build_network(self, K, vocab_size, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        l_match_feat = L.InputLayer(shape=(None, None, None),
                                    input_var=self.inps[13])
        l_match_feat = L.EmbeddingLayer(l_match_feat, 2, 1)
        l_match_feat = L.ReshapeLayer(l_match_feat, (-1, [1], [2]))

        l_use_char = L.InputLayer(shape=(None, None, self.feat_cnt),
                                  input_var=self.inps[14])
        l_use_char_q = L.InputLayer(shape=(None, None, self.feat_cnt),
                                    input_var=self.inps[15])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')
            l_qembed.params[l_qembed.W].remove('trainable')

        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        # char embeddings
        if self.use_chars:
            # ====== concatenation ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 2*self.char_dim) # T x L x D
            # l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP,
            #         mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
            #         only_return_final=True)
            # l_bgru = L.GRULayer(l_lookup, 2*self.char_dim, grad_clipping=GRAD_CLIP,
            #         mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
            #         backwards=True, only_return_final=True) # T x 2D
            # l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            # l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            # l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            # l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2
            # l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2

            # l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            # l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

            # ====== bidir feat concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

            # ====== char concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_qchar_embed, l_qembed], axis = 2)

            # ====== feat concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = L.ConcatLayer([l_use_char, l_docchar_embed, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qchar_embed, l_qembed], axis = 2)

            # ====== gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== tie gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed], W = l_doce.W, b = l_doce.b)

            # ====== scalar gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_char_gru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = ScalarDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = ScalarDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== bidirectional gating ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # ====== gate + concat ======
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            l_char_gru = L.GRULayer(l_lookup,
                                    self.embed_dim,
                                    grad_clipping=GRAD_CLIP,
                                    mask_input=l_tokmask,
                                    gradient_steps=GRAD_STEPS,
                                    precompute_input=True,
                                    only_return_final=True)
            l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            l_doce = L.ConcatLayer([l_use_char, l_doce], axis=2)
            l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis=2)

            # ====== bidirectional gate + concat ======
            # l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, 32)
            # l_fgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True)
            # l_bgru = L.GRULayer(l_lookup, self.embed_dim, grad_clipping = GRAD_CLIP, mask_input = l_tokmask, gradient_steps = GRAD_STEPS, precompute_input = True, only_return_final = True, backwards = True)
            # l_char_gru = L.ElemwiseSumLayer([l_fgru, l_bgru])
            # l_docchar_embed = IndexLayer([l_doctokin, l_char_gru])
            # l_qchar_embed = IndexLayer([l_qtokin, l_char_gru])

            # l_doce = GateDymLayer([l_use_char, l_docchar_embed, l_doce])
            # l_qembed = GateDymLayer([l_use_char_q, l_qchar_embed, l_qembed])

            # l_doce = L.ConcatLayer([l_use_char, l_doce], axis = 2)
            # l_qembed = L.ConcatLayer([l_use_char_q, l_qembed], axis = 2)

        attentions = []
        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doce, l_qembed])
            attentions.append(L.get_output(l_m, deterministic=True))

        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE

            l_doce = MatrixAttentionLayer(
                [l_doc_1, l_q_c_1, l_qmask, l_match_feat])
            # l_doce = MatrixAttentionLayer([l_doc_1, l_q_c_1, l_qmask])

            # === begin GA ===
            # l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
            # l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], mask_input=self.inps[7])
            # l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x DE
            # === end GA ===

            # if self.save_attn:
            #     attentions.append(L.get_output(l_m, deterministic=True))

        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)
        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)
        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D

        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doc, l_q])
            attentions.append(L.get_output(l_m, deterministic=True))

        l_prob = AttentionSumLayer([l_doc, l_q],
                                   self.inps[4],
                                   self.inps[12],
                                   mask_input=self.inps[10])
        final = L.get_output(l_prob)
        final_v = L.get_output(l_prob, deterministic=True)

        return final, final_v, l_prob, l_docembed.W, attentions
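
PairwiseInteractionLayer, GatedAttentionLayer, MatrixAttentionLayer and AttentionSumLayer are custom layers that are not defined in this snippet. Below is a rough standalone sketch of the masked pairwise attention step they presumably implement, written with plain Theano ops in the same spirit as the explicit batched_dot/softmax block that appears in a later example on this page; shapes and names are illustrative.

import numpy as np
import theano
import theano.tensor as T

d = T.tensor3('d')         # B x N x H  document token states
q = T.tensor3('q')         # B x Q x H  query token states
qmask = T.matrix('qmask')  # B x Q      1 for real query tokens, 0 for padding

M = T.batched_dot(d, q.dimshuffle((0, 2, 1)))                  # B x N x Q pairwise scores
alphas = T.nnet.softmax(T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
alphas = T.reshape(alphas, (M.shape[0], M.shape[1], M.shape[2])) \
        * qmask[:, np.newaxis, :]                              # zero out padded query tokens
alphas = alphas / alphas.sum(axis=2)[:, :, np.newaxis]         # renormalise: B x N x Q
q_rep = T.batched_dot(alphas, q)                               # B x N x H query summary per token
gated = d * q_rep                                              # gated-attention update of the document

f = theano.function([d, q, qmask], gated)
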
Example #5
    def __init__(self):
        self.batch_size = 32
        self.embedding_size = 50
        self.nb_max_sentences = 10
        self.length_max_sentences = 30
        self.vocab_size = 10000
        self.nb_hidden = 32
        self.nb_hops = 5
        # Dimension of the input context is (batch_size, number of sentences, max size of sentences)
        self.context = T.itensor3('context')
        self.mask_context = T.imatrix('context_mask')
        # Dimension of the question input is (batch_size, max size of sentences)
        self.question = T.imatrix('question')
        self.mask_question = T.imatrix('question_mask')
        """
        Building the Input context module

        """
        mask_context = layers.InputLayer(
            (self.batch_size * self.nb_max_sentences,
             self.length_max_sentences),
            input_var=self.mask_context)
        # (batch_size, nb_sentences, length_max_sentences)
        input_module = layers.InputLayer(
            (self.batch_size, self.nb_max_sentences,
             self.length_max_sentences),
            input_var=self.context)
        # (batch_size, nb_sentences * length_max_sentences)
        input_module = layers.ReshapeLayer(input_module, (self.batch_size, -1))

        # (batch_size, nb_sentences * length_max_sequences, embedding_size)
        input_module = layers.EmbeddingLayer(input_module, self.vocab_size,
                                             self.embedding_size)

        # (batch_size, nb_sentences, length_max_sequences, embedding_size)
        input_module = layers.ReshapeLayer(
            input_module, (self.batch_size, self.nb_max_sentences,
                           self.length_max_sentences, self.embedding_size))

        # (batch_size * nb_sentences, length_sentences, embedding_size)
        input_module = layers.ReshapeLayer(
            input_module, (self.batch_size * self.nb_max_sentences,
                           self.length_max_sentences, self.embedding_size))

        # (batch_size * nb_sentences, nb_hidden)
        input_module = layers.GRULayer(input_module,
                                       self.nb_hidden,
                                       mask_input=mask_context,
                                       only_return_final=True)
        context = layers.get_output(input_module)
        # input_module = layers.ReshapeLayer(input_module, (self.batch_size, self.nb_max_sentences, self.nb_hidden))
        """
        Building the Input context module

        """
        # (bach_size, length_sentences)
        mask_question = layers.InputLayer(
            (self.batch_size, self.length_max_sentences),
            input_var=self.mask_question)
        # (batch_size, length_sentences)
        question_module = layers.InputLayer(
            (self.batch_size, self.length_max_sentences),
            input_var=self.question)

        # (batch_size, length_sentences, embedding_size)
        question_module = layers.EmbeddingLayer(question_module,
                                                self.vocab_size,
                                                self.embedding_size)

        # (batch_size, nb_hidden)
        question_module = layers.GRULayer(question_module,
                                          self.nb_hidden,
                                          mask_input=mask_question,
                                          only_return_final=True)
        question = layers.get_output(question_module)
        """
        Building the Memory module

        """
        memory = question
        self._M = utils.get_shared('glorot_uniform', self.nb_hidden,
                                   self.nb_hidden)
        self._M1 = utils.get_shared('glorot_uniform', self.nb_hidden * 9,
                                    self.nb_hidden)
        self._B1 = utils.get_shared('constant_zero', self.nb_hidden, None)
        self._M2 = utils.get_shared('glorot_uniform', self.nb_hidden, 1)
        self._B2 = utils.get_shared('constant_zero', 1, None)  # bias for the scalar gate

        # Reshape the sentence encodings to (batch_size, nb_sentences, nb_hidden)
        # so that the question and memory vectors can be broadcast over sentences.
        facts = T.reshape(context, (self.batch_size, self.nb_max_sentences,
                                    self.nb_hidden))

        for step in xrange(self.nb_hops):
            q_b = question.dimshuffle(0, 'x', 1)   # (batch_size, 1, nb_hidden)
            m_b = memory.dimshuffle(0, 'x', 1)
            ones = T.ones_like(facts)
            z_score_vector = T.concatenate([
                facts, q_b * ones, m_b * ones,
                facts * q_b, facts * m_b,
                T.abs_(facts - q_b),
                T.abs_(facts - m_b),
                T.dot(facts, self._M) * q_b,
                T.dot(facts, self._M) * m_b
            ], axis=2)  # (batch_size, nb_sentences, 9 * nb_hidden)

            z1 = T.tanh(T.dot(z_score_vector, self._M1) + self._B1)
            z2 = T.nnet.sigmoid(T.dot(z1, self._M2) + self._B2)  # per-sentence gate
Example #6
target_var = T.ivector('targets')
index = T.iscalar("index")
batch_size = 500
n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size

l_in = layers.InputLayer(shape=(None, seq_len, feature_num),
                         input_var=input_var)
#l_rec = layers.RecurrentLayer(incoming=l_in,
#                              num_units=100,
#                              W_hid_to_hid=init_diagnal(100),
#                              b=init_constant(size=(100,)),
#                              nonlinearity=lasagne.nonlinearities.rectify,
#                              grad_clipping=1)
l_rec = layers.GRULayer(incoming=l_in, num_units=hidden_unit)
l_out = layers.DenseLayer(incoming=l_rec,
                          num_units=10,
                          nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
sum = 0
for p in params:
    shape = p.shape.eval()
    print shape
    if len(shape) > 1:
        sum += shape[0] * shape[1]
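
The loop above only counts the entries of 2-D weight matrices and skips biases and other vectors. If the goal is the total number of trainable parameters, Lasagne can report it directly (assuming the same l_out and lasagne import as above):

# Total trainable parameter count, including biases and all GRU gate weights.
print(lasagne.layers.count_params(l_out, trainable=True))
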
Example #7
def make_model():
    image = ll.InputLayer((BS, CH, IH, IW), name='step1.image')

    h_read_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_read_init'),
        name='step1.h_read_init')
    h_read_init.add_param(h_read_init.input_var, (HS, ))

    h_write_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_write_init'),
        name='step1.h_write_init')
    h_write_init.add_param(h_write_init.input_var, (HS, ))

    h_read = ll.ExpressionLayer(h_read_init,
                                lambda t: T.tile(T.reshape(t, (1, HS)),
                                                 (BS, 1)), (BS, HS),
                                name='step1.h_read')

    h_write = ll.ExpressionLayer(h_write_init,
                                 lambda t: T.tile(T.reshape(t, (1, HS)),
                                                  (BS, 1)), (BS, HS),
                                 name='step1.h_write')

    canvas = ll.InputLayer(
        (BS, CH, IH, IW),
        lasagne.utils.create_param(li.Constant(0.0), (BS, CH, IH, IW),
                                   name='step1.tensor.canvas'),
        name='step1.canvas')

    image_prev = ll.NonlinearityLayer(canvas,
                                      ln.sigmoid,
                                      name='step1.image_prev')

    image_error = ll.ElemwiseSumLayer([image, image_prev],
                                      coeffs=[1, -1],
                                      name='step1.image_error')
    image_stack = ll.ConcatLayer([image, image_error],
                                 name='step1.image_stack')

    read_params = ll.DenseLayer(h_write,
                                6,
                                nonlinearity=None,
                                name='step1.read_params')
    read_window = advanced_layers.AttentionLayer([read_params, image_stack],
                                                 (WH, WW),
                                                 name='step1.read_window')

    read_flat = ll.FlattenLayer(read_window, name='step1.read_flat')
    read_code = ll.ConcatLayer([read_flat, h_write], name='step1.read_code')

    read_code_sequence = ll.ReshapeLayer(read_code,
                                         (BS, 1, read_code.output_shape[-1]),
                                         name='step1.read_code_sequence')

    read_rnn = ll.GRULayer(
        read_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_read,
        name='step1.read_rnn',
    )

    sample_mean = ll.DenseLayer(read_rnn,
                                ENC_NDIM,
                                nonlinearity=None,
                                name='step1.sample_mean')
    sample_logvar2 = ll.DenseLayer(read_rnn,
                                   ENC_NDIM,
                                   nonlinearity=None,
                                   name='step1.sample_logvar2')
    sample = advanced_layers.SamplingLayer([sample_mean, sample_logvar2],
                                           ENC_VAR,
                                           name='step1.sample')

    write_code = ll.DenseLayer(sample, HS, name='step1.write_code')
    write_code_sequence = ll.ReshapeLayer(write_code,
                                          (BS, 1, write_code.output_shape[-1]),
                                          name='step1.write_code_sequence')
    write_rnn = ll.GRULayer(
        write_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_write,
        name='step1.write_rnn',
    )
    write_window_flat = ll.DenseLayer(write_rnn,
                                      CH * WH * WW,
                                      name='step1.write_window_flat')
    write_window = ll.ReshapeLayer(write_window_flat, (BS, CH, WH, WW),
                                   name='step1.write_window')

    write_params = ll.DenseLayer(h_write,
                                 6,
                                 nonlinearity=None,
                                 name='step1.write_params')
    write_image = advanced_layers.AttentionLayer([write_params, write_window],
                                                 (IH, IW),
                                                 name='step1.write_image')
    canvas_next = ll.ElemwiseSumLayer([canvas, write_image],
                                      name='step1.canvas_next')

    def rename(name):
        if name is None:
            return None
        step, real_name = name.split('.', 1)
        step = int(step[4:])
        return 'step%d.%s' % (step + 1, real_name)

    for step in xrange(1, TIME_ROUNDS):
        sample_random_variable_next = sample.random_stream.normal(
            sample.input_shapes[0],
            std=sample.variation_coeff,
        )
        sample_random_variable_next.name = 'step%d.sample.random_variable' % \
            (step + 1)

        canvas, canvas_next = (canvas_next,
                               utils.modified_copy(
                                   canvas_next,
                                   modify={
                                       h_read:
                                       read_rnn,
                                       h_write:
                                       write_rnn,
                                       canvas:
                                       canvas_next,
                                       sample.random_stream:
                                       sample.random_stream,
                                       sample.random_variable:
                                       sample_random_variable_next,
                                   },
                                   rename=rename,
                               ))

        h_read = read_rnn
        h_write = write_rnn
        read_rnn = utils.layer_by_name(canvas_next,
                                       'step%d.read_rnn' % (step + 1))
        write_rnn = utils.layer_by_name(canvas_next,
                                        'step%d.write_rnn' % (step + 1))
        sample = utils.layer_by_name(canvas_next, 'step%d.sample' % (step + 1))

    output = ll.NonlinearityLayer(canvas_next, ln.sigmoid, name='output')

    return output
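
The h_read_init / h_write_init blocks above wrap a shared variable in an InputLayer and register it with add_param, which makes the initial recurrent state a trainable parameter. A minimal standalone sketch of that pattern; sizes and names are illustrative.

import theano.tensor as T
import lasagne
import lasagne.init as li
import lasagne.layers as ll

BS, HS = 16, 32   # batch size, hidden size (illustrative)

# A shared vector wrapped in an InputLayer and registered as a parameter,
# so get_all_params() picks it up and an optimizer can update it.
h0 = ll.InputLayer((HS,),
                   lasagne.utils.create_param(li.Uniform(), (HS,), name='h0'))
h0.add_param(h0.input_var, (HS,))

# Tile it across the batch so it can serve as hid_init of a recurrent layer.
h0_batch = ll.ExpressionLayer(h0,
                              lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)),
                              (BS, HS))

x = ll.InputLayer((BS, 5, 8))                  # dummy sequence input
rnn = ll.GRULayer(x, HS, hid_init=h0_batch)    # GRU starts from the learned state

print([p.name for p in ll.get_all_params(rnn, trainable=True)])
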
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        
        print ("==> not used params in DMN class:", kwargs.keys())
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units
        
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')
        
        print ("==> building network")
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) #########
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) #########
       
        network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        
        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        self.params = layers.get_all_params(network, trainable=True)
        
        output = layers.get_output(network)
        num_channels  = 32 
        filter_W = 54
        filter_H = 8
        
        # NOTE: these constants are the shapes of the last pool layer; they could be
        # symbolic, but explicit values are better for optimization
        
        channels = []
        for channel_index in range(num_channels):
            channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))
        
        rnn_network_outputs = []
        W_in_to_updategate = None
        W_hid_to_updategate = None
        b_updategate = None
        W_in_to_resetgate = None
        W_hid_to_resetgate = None
        b_resetgate = None
        W_in_to_hidden_update = None
        W_hid_to_hidden_update = None
        b_hidden_update = None
        
        W_in_to_updategate1 = None
        W_hid_to_updategate1 = None
        b_updategate1 = None
        W_in_to_resetgate1 = None
        W_hid_to_resetgate1 = None
        b_resetgate1 = None
        W_in_to_hidden_update1 = None
        W_hid_to_hidden_update1 = None
        b_hidden_update1 = None
        
        for channel_index in range(num_channels):
            rnn_input_var = channels[channel_index]
            
            # InputLayer       
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            if (channel_index == 0):
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False)
                W_in_to_updategate = network.W_in_to_updategate
                W_hid_to_updategate = network.W_hid_to_updategate
                b_updategate = network.b_updategate
                W_in_to_resetgate = network.W_in_to_resetgate
                W_hid_to_resetgate = network.W_hid_to_resetgate
                b_resetgate = network.b_resetgate
                W_in_to_hidden_update = network.W_in_to_hidden_update
                W_hid_to_hidden_update = network.W_hid_to_hidden_update
                b_hidden_update = network.b_hidden_update
                
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
                W_in_to_updategate1 = network.W_in_to_updategate
                W_hid_to_updategate1 = network.W_hid_to_updategate
                b_updategate1 = network.b_updategate
                W_in_to_resetgate1 = network.W_in_to_resetgate
                W_hid_to_resetgate1 = network.W_hid_to_resetgate
                b_resetgate1 = network.b_resetgate
                W_in_to_hidden_update1 = network.W_in_to_hidden_update
                W_hid_to_hidden_update1 = network.W_hid_to_hidden_update
                b_hidden_update1 = network.b_hidden_update
                        
                # add params 
                self.params += layers.get_all_params(network, trainable=True)

            else:
                # GRULayer, but shared
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate),
                            updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update,
                                                      nonlinearity=lasagne.nonlinearities.tanh))  # Gate defaults to sigmoid; keep tanh as in the unshared GRU above
                            
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                    
                # GRULayer, but shared
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate1, W_hid=W_hid_to_resetgate1, b=b_resetgate1),
                            updategate=layers.Gate(W_in=W_in_to_updategate1, W_hid=W_hid_to_updategate1, b=b_updategate1),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update1, W_hid=W_hid_to_hidden_update1, b=b_hidden_update1,
                                                      nonlinearity=lasagne.nonlinearities.tanh))  # Gate defaults to sigmoid; keep tanh as in the unshared GRU above
                
            
            rnn_network_outputs.append(layers.get_output(network))
        
        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print (all_output_var.eval({self.input_var:example}).shape)
        
        # InputLayer
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)
        
        # Dropout Layer
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        
        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
    
        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
    
        #print "==> param shapes", [x.eval().shape for x in self.params]
        
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, 
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2
        
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
        
        if self.mode == 'train':
            print ("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print ("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])
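
A compact sketch of the weight-sharing trick used for the non-first channels above: the gates of a second GRULayer are built from the parameter variables of the first, so both layers update the same shared weights. The hidden_update nonlinearity is set explicitly because layers.Gate defaults to sigmoid while GRULayer's default hidden update uses tanh. Sizes are illustrative.

import lasagne
import lasagne.layers as layers

num_units = 16
l_in_a = layers.InputLayer(shape=(None, 54, 8))   # channel A
l_in_b = layers.InputLayer(shape=(None, 54, 8))   # channel B

gru_a = layers.GRULayer(incoming=l_in_a, num_units=num_units)

# Re-use channel A's gate parameters for channel B.
gru_b = layers.GRULayer(
    incoming=l_in_b, num_units=num_units,
    resetgate=layers.Gate(W_in=gru_a.W_in_to_resetgate,
                          W_hid=gru_a.W_hid_to_resetgate,
                          b=gru_a.b_resetgate),
    updategate=layers.Gate(W_in=gru_a.W_in_to_updategate,
                           W_hid=gru_a.W_hid_to_updategate,
                           b=gru_a.b_updategate),
    hidden_update=layers.Gate(W_in=gru_a.W_in_to_hidden_update,
                              W_hid=gru_a.W_hid_to_hidden_update,
                              b=gru_a.b_hidden_update,
                              nonlinearity=lasagne.nonlinearities.tanh))

# Both layers refer to the same underlying shared variables.
assert gru_a.W_in_to_resetgate is gru_b.W_in_to_resetgate
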
    def build_network(self, K, vocab_size, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)
        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        # char embeddings
        if self.use_chars:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                        2 * self.char_dim)  # T x L x D
            l_fgru = L.GRULayer(l_lookup,
                                self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                only_return_final=True)
            l_bgru = L.GRULayer(l_lookup,
                                2 * self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                backwards=True,
                                only_return_final=True)  # T x 2D
            l_fwdembed = L.DenseLayer(l_fgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

            l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=False)

        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2)  # B x Q x 2D
        q = L.get_output(l_q)  # B x Q x 2D
        q = q[T.arange(q.shape[0]), self.inps[12], :]  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE
            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)  # B x Q x DE
            dd = L.get_output(l_doc_1)  # B x N x DE
            M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
            alphas = T.nnet.softmax(
                T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
            alphas_r = T.reshape(alphas, (M.shape[0], M.shape[1], M.shape[2])) \
                    * self.inps[7][:, np.newaxis, :]  # B x N x Q
            alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :, np.newaxis]  # B x N x Q
            q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)  # B x N x DE

        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, self.inps[4])

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, self.inps[4])

        return final, final_v, l_doc, l_qs, l_docembed.W
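
The final readout above scores every document position against the query vector and pools a masked softmax over positions into per-candidate probabilities. Below is a small standalone version of that pooling; the candidate tensor shape is an assumption about self.inps[4] (a one-hot map from tokens to answer candidates), not something shown in this snippet.

import numpy as np
import theano
import theano.tensor as T

d = T.tensor3('d')       # B x N x 2D  document states
q = T.matrix('q')        # B x 2D      query vector
mask = T.matrix('mask')  # B x N       1 for real document tokens
cand = T.tensor3('cand') # B x N x C   assumed one-hot map of tokens to candidates

p = T.batched_dot(d, q)                  # B x N  position scores
pm = T.nnet.softmax(p) * mask            # zero out padding...
pm = pm / pm.sum(axis=1)[:, np.newaxis]  # ...and renormalise
probs = T.batched_dot(pm, cand)          # B x C  probability mass per candidate

f = theano.function([d, q, mask, cand], probs)
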
Example #10
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 l2, mode, rnn_num_units, batch_norm, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units
        self.batch_norm = batch_norm

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        # scale inputs to be in [-1, 1]
        input_var_norm = 2 * self.input_var - 1

        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 858, 256),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        # InputLayer
        network = layers.InputLayer(shape=(None, 858, 256),
                                    input_var=input_var_norm)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # GRULayer
        network = layers.GRULayer(incoming=network, num_units=self.num_units)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({
                self.input_var: example
            }).shape

        # GRULayer
        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var: example}).shape

        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
                network, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
Example #11
    # NOTE: these constants are the shapes of the last pool layer; they could be
    # symbolic, but explicit values are better for optimization
    num_channels = 32
    filter_W = 852
    filter_H = 8

    # InputLayer
    network = layers.InputLayer(shape=(None, filter_W,
                                       num_channels * filter_H),
                                input_var=output)
    print layers.get_output(network).eval({input_var: example}).shape

    # GRULayer
    network = layers.GRULayer(incoming=network,
                              num_units=num_units,
                              only_return_final=True)
    print layers.get_output(network).eval({input_var: example}).shape
    if (batch_norm):
        network = layers.BatchNormLayer(incoming=network)
    if (dropout > 0):
        network = layers.dropout(network, dropout)

    # Last layer: classification
    network = layers.DenseLayer(incoming=network,
                                num_units=3,
                                nonlinearity=softmax)
    print layers.get_output(network).eval({input_var: example}).shape

    params += layers.get_all_params(network, trainable=True)
    prediction = layers.get_output(network)
Exemplo n.º 12
0
index = T.iscalar("index")
batch_size = 500
n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size

l_in = layers.InputLayer(shape=(None, seq_len, feature_num),
                         input_var=input_var)
#l_rec = layers.RecurrentLayer(incoming=l_in,
#                              num_units=100,
#                              W_hid_to_hid=init_diagnal(100),
#                              b=init_constant(size=(100,)),
#                              nonlinearity=lasagne.nonlinearities.rectify,
#                              grad_clipping=1)
l_rec = layers.GRULayer(incoming=l_in,
                        num_units=hidden_unit,
                        only_return_final=True)
l_out = layers.DenseLayer(incoming=l_rec,
                          num_units=2,
                          nonlinearity=lasagne.nonlinearities.softmax)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
sum = 0
for p in params:
    shape = p.shape.eval()
    print shape
    if len(shape) > 1:
Exemplo n.º 13
0
    def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, candmask_var, feat_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if not EMB_TRAIN: l_docembed.params[l_docembed.W].remove('trainable')

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
            l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
            l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1,
                                     l_bkd_q_slice_1])  # B x DE

            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)
            q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])),
                              (doc_var.shape[0], doc_var.shape[1],
                               2 * NUM_HIDDEN))  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE)  # B x N x DE

        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final_v = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        return final, final_v, l_doc, l_qs
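The last few lines above do two non-obvious things: the softmax over document positions is restricted to candidate positions and renormalised, and the resulting position probabilities are scattered into a vocabulary-sized distribution, where repeated token ids accumulate mass (which is how T.inc_subtensor behaves with duplicate indices). A small numpy sketch of just that step, with toy shapes chosen here purely for illustration:

import numpy as np

def positions_to_vocab_probs(p, candmask, doc_idx, vocab_size):
    # p: (B, N) scores, candmask: (B, N) 0/1, doc_idx: (B, N) token ids
    e = np.exp(p - p.max(axis=1, keepdims=True))
    pm = e / e.sum(axis=1, keepdims=True)        # softmax over positions
    pm = pm * candmask                           # keep candidate positions only
    pm = pm / pm.sum(axis=1, keepdims=True)      # renormalise
    final = np.zeros((p.shape[0], vocab_size))
    for b in range(p.shape[0]):
        # like T.inc_subtensor: repeated token ids accumulate probability mass
        np.add.at(final[b], doc_idx[b], pm[b])
    return final

probs = positions_to_vocab_probs(np.random.randn(2, 5),
                                 np.array([[1, 1, 0, 1, 0], [1, 0, 1, 1, 1]]),
                                 np.array([[3, 7, 7, 3, 1], [2, 2, 5, 9, 0]]),
                                 vocab_size=10)
print(probs.sum(axis=1))  # each row sums to 1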
Exemplo n.º 14
0
# Recurrent layers expect input of shape
# (batch size, max sequence length, number of features)
l_in = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
# The network also needs a way to provide a mask for each sequence.  We'll
# use a separate input layer for that.  Since the mask only determines
# which indices are part of the sequence for each batch entry, they are
# supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH)
l_mask = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
# We're using a bidirectional network, which means we will combine two
# RecurrentLayers, one with the backwards=True keyword argument.
# Setting a value for grad_clipping will clip the gradients in the layer
# Setting only_return_final=True makes the layers only return their output
# for the final time step, which is all we need for this task
l_forward = layers.GRULayer(l_in,
                            N_HIDDEN,
                            mask_input=l_mask,
                            grad_clipping=GRAD_CLIP,
                            only_return_final=True)
l_backward = layers.GRULayer(l_in,
                             N_HIDDEN,
                             mask_input=l_mask,
                             grad_clipping=GRAD_CLIP,
                             only_return_final=True,
                             backwards=True)
# Now, we'll concatenate the outputs to combine them.
l_concat = layers.ConcatLayer([l_forward, l_backward])
# Our output layer is a simple dense connection, with 1 output unit
l_out = layers.DenseLayer(l_concat,
                          num_units=1,
                          nonlinearity=lasagne.nonlinearities.tanh)
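As a minimal usage sketch (not part of the original snippet): since the input layers above were created without an input_var, the inputs can be supplied to get_output as a dictionary. It assumes the layers defined above plus N_BATCH, MAX_LENGTH and the imports below are in scope.

import numpy as np
import theano
import theano.tensor as T

x_sym = T.tensor3('x')
mask_sym = T.matrix('mask')
# Map each InputLayer to a symbolic variable and compile a forward pass.
pred = layers.get_output(l_out, {l_in: x_sym, l_mask: mask_sym})
predict_fn = theano.function([x_sym, mask_sym], pred)

x = np.random.rand(N_BATCH, MAX_LENGTH, 2).astype(theano.config.floatX)
# Each mask row marks which time steps are real (1.) versus padding (0.).
mask = np.ones((N_BATCH, MAX_LENGTH), dtype=theano.config.floatX)
mask[0, MAX_LENGTH // 2:] = 0.  # pretend the first sequence is half length
print(predict_fn(x, mask).shape)  # (N_BATCH, 1)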
Exemplo n.º 15
0
    def build_network(self, K, vocab_size, doc_var, query_var, cand_var,
                      docmask_var, qmask_var, candmask_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     self.nhidden,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   self.nhidden,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_fwd_q_slice_1 = L.SliceLayer(l_fwd_q_1, -1, 1)
            l_bkd_q_slice_1 = L.SliceLayer(l_bkd_q_1, 0, 1)
            l_q_c_1 = L.ConcatLayer([l_fwd_q_slice_1,
                                     l_bkd_q_slice_1])  # B x DE

            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)
            q_rep = T.reshape(T.tile(qd, (1, doc_var.shape[1])),
                              (doc_var.shape[0], doc_var.shape[1],
                               2 * self.nhidden))  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * self.nhidden),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=self.dropout)

        l_fwd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce,
                               self.nhidden,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, cand_var)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, cand_var)

        return final, final_v, l_doc, l_qs, l_docembed.W
Exemplo n.º 16
0
    def _init_model(self, in_size, out_size, slot_sizes, db, \
            n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \
            inputtype='full', sl='e2e', rl='e2e'):
        self.in_size = in_size
        self.out_size = out_size
        self.slot_sizes = slot_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid
        self.r_hid = self.n_hid
        self.sl = sl
        self.rl = rl

        table = db.table
        counts = db.counts
        m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
        prior = [db.priors[s] for s in dialog_config.inform_slots]
        unknown = [db.unks[s] for s in dialog_config.inform_slots]
        ids = [db.ids[s] for s in dialog_config.inform_slots]

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \
                T.btensor3('am'), T.fvector('r')
        T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(
            counts)
        db_index_var = T.imatrix('db')
        db_index_switch = T.bvector('s')

        l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
        flat_mask = T.reshape(turn_mask,
                              (turn_mask.shape[0] * turn_mask.shape[1], 1))

        def _smooth(p):
            p_n = p + EPS
            return p_n / (p_n.sum(axis=1)[:, np.newaxis])

        def _add_unk(p, m, N):
            # p: B x V, m- num missing, N- total, p0: 1 x V
            t_unk = T.as_tensor_variable(float(m) / N)
            ps = p * (1. - t_unk)
            return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1)

        def kl_divergence(p, q):
            p_n = _smooth(p)
            return -T.sum(q * T.log(p_n), axis=1)

        # belief tracking
        l_in = L.InputLayer(shape=(None, None, self.in_size),
                            input_var=input_var)
        p_vars = []
        pu_vars = []
        phi_vars = []
        p_targets = []
        phi_targets = []
        hid_in_vars = []
        hid_out_vars = []
        bt_loss = T.as_tensor_variable(0.)
        kl_loss = []
        x_loss = []
        self.trackers = []
        for i, s in enumerate(dialog_config.inform_slots):
            hid_in = T.fmatrix('h')
            l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in,  \
                    mask_input=l_mask_in,
                    grad_clipping=10.) # B x H x D
            l_b_in = L.ReshapeLayer(l_rnn,
                                    (input_var.shape[0] * input_var.shape[1],
                                     self.r_hid))  # BH x D
            hid_out = L.get_output(l_rnn)[:, -1, :]

            p_targ = T.ftensor3('p_target_' + s)
            p_t = T.reshape(
                p_targ,
                (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i]))
            phi_targ = T.fmatrix('phi_target' + s)
            phi_t = T.reshape(phi_targ,
                              (phi_targ.shape[0] * phi_targ.shape[1], 1))

            l_b = L.DenseLayer(l_b_in,
                               self.slot_sizes[i],
                               nonlinearity=lasagne.nonlinearities.softmax)
            l_phi = L.DenseLayer(l_b_in,
                                 1,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)

            phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
            p = L.get_output(l_b)
            p_u = _add_unk(p, m_unk[i], db.N)
            kl_loss.append(
                T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) /
                T.sum(flat_mask))
            x_loss.append(
                T.sum(flat_mask *
                      lasagne.objectives.binary_crossentropy(phi, phi_t)) /
                T.sum(flat_mask))
            bt_loss += kl_loss[-1] + x_loss[-1]

            p_vars.append(p)
            pu_vars.append(p_u)
            phi_vars.append(phi)
            p_targets.append(p_targ)
            phi_targets.append(phi_targ)
            hid_in_vars.append(hid_in)
            hid_out_vars.append(hid_out)
            self.trackers.append(l_b)
            self.trackers.append(l_phi)
        self.bt_params = L.get_all_params(self.trackers)

        def check_db(pv, phi, Tb, N):
            O = T.alloc(0., pv[0].shape[0], Tb.shape[0])  # BH x T.shape[0]
            for i, p in enumerate(pv):
                p_dc = T.tile(phi[i], (1, Tb.shape[0]))
                O += T.log(p_dc*(1./db.table.shape[0]) + \
                        (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i]))
            Op = T.exp(O)  #+EPS # BH x T.shape[0]
            Os = T.sum(Op, axis=1)[:, np.newaxis]  # BH x 1
            return Op / Os

        def entropy(p):
            p = _smooth(p)
            return -T.sum(p * T.log(p), axis=-1)

        def weighted_entropy(p, q, p0, unks, idd):
            w = T.dot(idd, q.transpose())  # Pi x BH
            u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis]
                                     )  # BH x Pi
            p_tilde = w.transpose() + u
            return entropy(p_tilde)

        p_db = check_db(pu_vars, phi_vars, T_var, N_var)  # BH x T.shape[0]

        if inputtype == 'entropy':
            H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \
                    for i,pv in enumerate(p_vars)]
            H_db = entropy(p_db)
            phv = [ph[:, 0] for ph in phi_vars]
            t_in = T.stacklists(H_vars + phv + [H_db]).transpose()  # BH x 2M+1
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x 2M+1
            l_in_pol = L.InputLayer(
                    shape=(None,None,2*len(dialog_config.inform_slots)+1), \
                    input_var=t_in_resh)
        else:
            in_reshaped = T.reshape(input_var,
                    (input_var.shape[0]*input_var.shape[1], \
                    input_var.shape[2]))
            prev_act = in_reshaped[:, -len(dialog_config.inform_slots):]
            t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act],
                                 axis=1)  # BH x D-sum+A
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x D-sum
            l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \
                    3*len(dialog_config.inform_slots)+ \
                    table.shape[0]), input_var=t_in_resh)

        pol_in = T.fmatrix('pol-h')
        l_pol_rnn = L.GRULayer(l_in_pol,
                               n_hid,
                               hid_init=pol_in,
                               mask_input=l_mask_in,
                               grad_clipping=10.)  # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:, -1, :]
        l_den_in = L.ReshapeLayer(
            l_pol_rnn,
            (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, \
                nonlinearity=lasagne.nonlinearities.softmax) # BH x A

        self.network = l_out
        self.pol_params = L.get_all_params(self.network)
        self.params = self.bt_params + self.pol_params

        # db loss
        p_db_reshaped = T.reshape(
            p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0]))
        p_db_final = p_db_reshaped[:, -1, :]  # B x T.shape[0]
        p_db_final = _smooth(p_db_final)
        ix = T.tile(T.arange(p_db_final.shape[0]),
                    (db_index_var.shape[1], 1)).transpose()
        sample_probs = p_db_final[ix, db_index_var]  # B x K
        if dialog_config.SUCCESS_MAX_RANK == 1:
            log_db_probs = T.log(sample_probs).sum(axis=1)
        else:
            cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \
                    outputs_info=T.zeros_like(sample_probs[:,0]), \
                    sequences=sample_probs[:,:-1].transpose())
            cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5)  # B x K-1
            log_db_probs = T.log(sample_probs).sum(
                axis=1) - T.log(1. - cum_probs).sum(axis=1)  # B
        log_db_probs = log_db_probs * db_index_switch

        # rl
        probs = L.get_output(self.network)  # BH x A
        probs = _smooth(probs)
        out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1],
                                      self.out_size))  # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
        ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
        H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
        self.act_loss = -T.mean(ep_probs * reward_var)
        self.db_loss = -T.mean(log_db_probs * reward_var)
        self.reg_loss = -T.mean(ment * H_probs)
        self.loss = self.act_loss + self.db_loss + self.reg_loss

        self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \
                pol_in] + hid_in_vars
        self.obj_fn = theano.function(self.inps,
                                      self.loss,
                                      on_unused_input='warn')
        self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \
                [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn')
        self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss],
                                        on_unused_input='warn')
        self._rl_train_fn(self.learning_rate)

        ## sl
        sl_loss = 0. + bt_loss - T.mean(ep_probs)

        if self.sl == 'e2e':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        elif self.sl == 'bel':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        else:
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)

        sl_inps = [input_var, turn_mask, act_mask, pol_in
                   ] + p_targets + phi_targets + hid_in_vars
        self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \
                on_unused_input='warn')
        self.sl_obj_fn = theano.function(sl_inps,
                                         sl_loss,
                                         on_unused_input='warn')
Exemplo n.º 17
0
    def __init__(self,
                 n_inputs,
                 n_outputs,
                 n_components=1,
                 n_filters=[],
                 n_hiddens=[10, 10],
                 n_rnn=None,
                 impute_missing=True,
                 seed=None,
                 svi=True):
        """Initialize a mixture density network with custom layers

        Parameters
        ----------
        n_inputs : int or tuple of ints or list of ints
            Dimensionality of input
        n_outputs : int
            Dimensionality of output
        n_components : int
            Number of components of the mixture density
        n_filters : list of ints
            Number of filters per convolutional layer
        n_hiddens : list of ints
            Number of hidden units per fully connected layer
        n_rnn : None or int
            Number of RNN units
        impute_missing : bool
            If set to True, learns replacement value for NaNs, otherwise those
            inputs are set to zero
        seed : int or None
            If provided, random number generator will be seeded
        svi : bool
            Whether to use SVI version or not
        """
        self.impute_missing = impute_missing
        self.n_components = n_components
        self.n_filters = n_filters
        self.n_hiddens = n_hiddens
        self.n_outputs = n_outputs
        self.svi = svi

        self.iws = tt.vector('iws', dtype=dtype)
        if n_rnn is None:
            self.n_rnn = 0
        else:
            self.n_rnn = n_rnn
        if self.n_rnn > 0 and len(self.n_filters) > 0:
            raise NotImplementedError

        self.seed = seed
        if seed is not None:
            self.rng = np.random.RandomState(seed=seed)
        else:
            self.rng = np.random.RandomState()
        lasagne.random.set_rng(self.rng)

        # cast n_inputs to tuple
        if type(n_inputs) is int:
            self.n_inputs = (n_inputs, )
        elif type(n_inputs) is list:
            self.n_inputs = tuple(n_inputs)
        elif type(n_inputs) is tuple:
            self.n_inputs = n_inputs
        else:
            raise ValueError('n_inputs type not supported')

        # compose layers
        self.layer = collections.OrderedDict()

        # stats : input placeholder, (batch, *self.n_inputs)
        if len(self.n_inputs) + 1 == 2:
            self.stats = tt.matrix('stats', dtype=dtype)
        elif len(self.n_inputs) + 1 == 3:
            self.stats = tt.tensor3('stats', dtype=dtype)
        elif len(self.n_inputs) + 1 == 4:
            self.stats = tt.tensor4('stats', dtype=dtype)
        else:
            raise NotImplementedError

        # input layer
        self.layer['input'] = ll.InputLayer((None, *self.n_inputs),
                                            input_var=self.stats)

        # learn replacement values
        if self.impute_missing:
            self.layer['missing'] = dl.ImputeMissingLayer(
                last(self.layer), n_inputs=self.n_inputs)
        else:
            self.layer['missing'] = dl.ReplaceMissingLayer(
                last(self.layer), n_inputs=self.n_inputs)

        # recurrent neural net
        # expects shape (batch, sequence_length, num_inputs)
        if self.n_rnn > 0:
            if len(self.n_inputs) == 1:
                rs = (-1, *self.n_inputs, 1)
                self.layer['rnn_reshape'] = ll.ReshapeLayer(
                    last(self.layer), rs)

            self.layer['rnn'] = ll.GRULayer(last(self.layer),
                                            n_rnn,
                                            only_return_final=True)

        # convolutional layers
        # expects shape (batch, num_input_channels, input_rows, input_columns)
        if len(self.n_filters) > 0:
            # reshape
            if len(self.n_inputs) == 1:
                raise NotImplementedError
            elif len(self.n_inputs) == 2:
                rs = (-1, 1, *self.n_inputs)
            else:
                rs = None
            if rs is not None:
                self.layer['conv_reshape'] = ll.ReshapeLayer(
                    last(self.layer), rs)

            # add layers
            for l in range(len(n_filters)):
                self.layer['conv_' + str(l + 1)] = ll.Conv2DLayer(
                    name='c' + str(l + 1),
                    incoming=last(self.layer),
                    num_filters=n_filters[l],
                    filter_size=3,
                    stride=(2, 2),
                    pad=0,
                    untie_biases=False,
                    W=lasagne.init.GlorotUniform(),
                    b=lasagne.init.Constant(0.),
                    nonlinearity=lnl.rectify,
                    flip_filters=True,
                    convolution=tt.nnet.conv2d)

        # flatten
        self.layer['flatten'] = ll.FlattenLayer(incoming=last(self.layer),
                                                outdim=2)

        # hidden layers
        for l in range(len(n_hiddens)):
            self.layer['hidden_' + str(l + 1)] = dl.FullyConnectedLayer(
                last(self.layer),
                n_units=n_hiddens[l],
                svi=svi,
                name='h' + str(l + 1))

        last_hidden = last(self.layer)

        # mixture layers
        self.layer['mixture_weights'] = dl.MixtureWeightsLayer(
            last_hidden,
            n_units=n_components,
            actfun=lnl.softmax,
            svi=svi,
            name='weights')
        self.layer['mixture_means'] = dl.MixtureMeansLayer(
            last_hidden,
            n_components=n_components,
            n_dim=n_outputs,
            svi=svi,
            name='means')
        self.layer['mixture_precisions'] = dl.MixturePrecisionsLayer(
            last_hidden,
            n_components=n_components,
            n_dim=n_outputs,
            svi=svi,
            name='precisions')
        last_mog = [
            self.layer['mixture_weights'], self.layer['mixture_means'],
            self.layer['mixture_precisions']
        ]

        # output placeholder
        self.params = tt.matrix('params',
                                dtype=dtype)  # (batch, self.n_outputs)

        # mixture parameters
        # a : weights, matrix with shape (batch, n_components)
        # ms : means, list of len n_components with (batch, n_dim)
        # Us : precision factors, n_components list with (batch, n_dim, n_dim)
        # ldetUs : log determinants of precisions, n_comp list with (batch, )
        self.a, self.ms, precision_out = ll.get_output(last_mog,
                                                       deterministic=False)

        self.Us = precision_out['Us']
        self.ldetUs = precision_out['ldetUs']

        self.comps = {
            **{
                'a': self.a
            },
            **{'m' + str(i): self.ms[i]
               for i in range(self.n_components)},
            **{'U' + str(i): self.Us[i]
               for i in range(self.n_components)}
        }

        # log probability of y given the mixture distribution
        # lprobs_comps : log probs per component, list of len n_components with (batch, )
        # probs : log probs of mixture, (batch, )

        self.lprobs_comps = [
            -0.5 * tt.sum(tt.sum(
                (self.params - m).dimshuffle([0, 'x', 1]) * U, axis=2)**2,
                          axis=1) + ldetU
            for m, U, ldetU in zip(self.ms, self.Us, self.ldetUs)
        ]
        self.lprobs = (MyLogSumExp(tt.stack(self.lprobs_comps, axis=1) + tt.log(self.a), axis=1) \
                      - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

        # the quantities from above again, but with deterministic=True
        # --- in the svi case, this will disable injection of randomness;
        # the mean of weights is used instead
        self.da, self.dms, dprecision_out = ll.get_output(last_mog,
                                                          deterministic=True)
        self.dUs = dprecision_out['Us']
        self.dldetUs = dprecision_out['ldetUs']
        self.dcomps = {
            **{
                'a': self.da
            },
            **{'m' + str(i): self.dms[i]
               for i in range(self.n_components)},
            **{'U' + str(i): self.dUs[i]
               for i in range(self.n_components)}
        }
        self.dlprobs_comps = [
            -0.5 * tt.sum(tt.sum(
                (self.params - m).dimshuffle([0, 'x', 1]) * U, axis=2)**2,
                          axis=1) + ldetU
            for m, U, ldetU in zip(self.dms, self.dUs, self.dldetUs)
        ]
        self.dlprobs = (MyLogSumExp(tt.stack(self.dlprobs_comps, axis=1) + tt.log(self.da), axis=1) \
                       - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

        # parameters of network
        self.aps = ll.get_all_params(last_mog)  # all parameters
        self.mps = ll.get_all_params(last_mog, mp=True)  # means
        self.sps = ll.get_all_params(last_mog, sp=True)  # log stds

        # weight and bias parameter sets as separate lists
        self.mps_wp = ll.get_all_params(last_mog, mp=True, wp=True)
        self.sps_wp = ll.get_all_params(last_mog, sp=True, wp=True)
        self.mps_bp = ll.get_all_params(last_mog, mp=True, bp=True)
        self.sps_bp = ll.get_all_params(last_mog, sp=True, bp=True)

        # theano functions
        self.compile_funs()

        self.iws = tt.vector('iws', dtype=dtype)
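For reference, the mixture log-probability assembled above is the usual Gaussian-mixture log-density with the k-th precision matrix parameterised as Us[k].T dot Us[k]. A plain-numpy sketch of the same formula can help when checking shapes; the names below and the assumption that each U is triangular are mine, not taken from the model code.

import numpy as np

def mog_logprob(x, weights, means, Us):
    # x: (B, D), weights: (B, K), means: K arrays of (B, D), Us: K arrays of (B, D, D)
    D = x.shape[1]
    comp = []
    for m, U in zip(means, Us):
        z = np.einsum('bij,bj->bi', U, x - m)              # U (x - m), shape (B, D)
        ldetU = np.log(np.abs(np.einsum('bii->bi', U))).sum(axis=1)  # log|det U| for triangular U
        comp.append(-0.5 * np.sum(z ** 2, axis=1) + ldetU)
    lp = np.stack(comp, axis=1) + np.log(weights)          # (B, K)
    m_max = lp.max(axis=1, keepdims=True)                  # log-sum-exp, as MyLogSumExp above
    return (m_max[:, 0] + np.log(np.exp(lp - m_max).sum(axis=1))
            - 0.5 * D * np.log(2.0 * np.pi))

B, D, K = 4, 3, 2
x = np.random.randn(B, D)
w = np.full((B, K), 1.0 / K)
means = [np.random.randn(B, D) for _ in range(K)]
Us = [np.tril(np.random.rand(B, D, D)) + np.eye(D) for _ in range(K)]
print(mog_logprob(x, w, means, Us).shape)  # (B,)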
Exemplo n.º 18
0
    def __init__(self, n_inputs=None, n_outputs=None, input_shape=None,
                 n_bypass=0,
                 density='mog',
                 n_hiddens=(10, 10), impute_missing=True, seed=None,
                 n_filters=(), filter_sizes=3, pool_sizes=2,
                 n_rnn=0,
                 **density_opts):

        """Initialize a mixture density network with custom layers

        Parameters
        ----------
        n_inputs : int
            Total input dimensionality (data/summary stats)
        n_outputs : int
            Dimensionality of output (simulator parameters)
        input_shape : tuple
            Size to which data are reshaped before CNN or RNN
        n_bypass : int
            Number of elements at end of input which bypass CNN or RNN
        density : string
            Type of density conditioned on the network; can be 'mog' or 'maf'
        n_components : int
            Number of components of the mixture density
        n_filters : list of ints
            Number of filters per convolutional layer
        n_hiddens : list of ints
            Number of hidden units per fully connected layer
        n_rnn : None or int
            Number of RNN units
        impute_missing : bool
            If set to True, learns replacement value for NaNs, otherwise those
            inputs are set to zero
        seed : int or None
            If provided, random number generator will be seeded
        density_opts : dict
            Options for the density estimator
        """
        if n_rnn > 0 and len(n_filters) > 0:
            raise NotImplementedError
        assert isint(n_inputs) and isint(n_outputs)\
            and n_inputs > 0 and n_outputs > 0

        self.density = density.lower()
        self.impute_missing = impute_missing
        self.n_hiddens = list(n_hiddens)
        self.n_outputs, self.n_inputs = n_outputs, n_inputs
        self.n_bypass = n_bypass

        self.n_rnn = n_rnn

        self.n_filters, self.filter_sizes, self.pool_sizes, n_cnn = \
            list(n_filters), filter_sizes, pool_sizes, len(n_filters)
        if type(self.filter_sizes) is int:
            self.filter_sizes = [self.filter_sizes for _ in range(n_cnn)]
        else:
            assert len(self.filter_sizes) >= n_cnn
        if type(self.pool_sizes) is int:
            self.pool_sizes = [self.pool_sizes for _ in range(n_cnn)]
        else:
            assert len(self.pool_sizes) >= n_cnn

        self.iws = tt.vector('iws', dtype=dtype)

        self.seed = seed
        if seed is not None:
            self.rng = np.random.RandomState(seed=seed)
        else:
            self.rng = np.random.RandomState()
        lasagne.random.set_rng(self.rng)

        self.input_shape = (n_inputs,) if input_shape is None else input_shape
        assert np.prod(self.input_shape) + self.n_bypass == self.n_inputs
        assert 1 <= len(self.input_shape) <= 3

        # params: output placeholder (batch, self.n_outputs)
        self.params = tensorN(2, name='params', dtype=dtype)

        # stats : input placeholder, (batch, self.n_inputs)
        self.stats = tensorN(2, name='stats', dtype=dtype)

        # compose layers
        self.layer = collections.OrderedDict()

        # input layer, None indicates batch size not fixed at compile time
        self.layer['input'] = ll.InputLayer(
            (None, self.n_inputs), input_var=self.stats)

        # learn replacement values
        if self.impute_missing:
            self.layer['missing'] = \
                dl.ImputeMissingLayer(last(self.layer),
                                      n_inputs=(self.n_inputs,))
        else:
            self.layer['missing'] = \
                dl.ReplaceMissingLayer(last(self.layer),
                                       n_inputs=(self.n_inputs,))

        if self.n_bypass > 0 and (self.n_rnn > 0 or n_cnn > 0):
            last_layer = last(self.layer)
            bypass_slice = slice(self.n_inputs - self.n_bypass, self.n_inputs)
            direct_slice = slice(0, self.n_inputs - self.n_bypass)
            self.layer['bypass'] = ll.SliceLayer(last_layer, bypass_slice)
            self.layer['direct'] = ll.SliceLayer(last_layer, direct_slice)

        # reshape inputs prior to RNN or CNN step
        if self.n_rnn > 0 or n_cnn > 0:

            if len(n_filters) > 0 and len(self.input_shape) == 2:  # 1 channel
                rs = (-1, 1, *self.input_shape)
            else:
                if self.n_rnn > 0:
                    assert len(self.input_shape) == 2  # time, dim
                else:
                    assert len(self.input_shape) == 3  # channel, row, col
                rs = (-1, *self.input_shape)

            # last layer is 'missing' or 'direct'
            self.layer['reshape'] = ll.ReshapeLayer(last(self.layer), rs)

        # recurrent neural net, input: (batch, sequence_length, num_inputs)
        if self.n_rnn > 0:
            self.layer['rnn'] = ll.GRULayer(last(self.layer), n_rnn,
                                            only_return_final=True)

        # convolutional net, input: (batch, channels, rows, columns)
        if n_cnn > 0:
            for l in range(n_cnn):  # add layers
                if self.pool_sizes[l] == 1:
                    padding = (self.filter_sizes[l] - 1) // 2
                else:
                    padding = 0
                self.layer['conv_' + str(l + 1)] = ll.Conv2DLayer(
                    name='c' + str(l + 1),
                    incoming=last(self.layer),
                    num_filters=self.n_filters[l],
                    filter_size=self.filter_sizes[l],
                    stride=(1, 1),
                    pad=padding,
                    untie_biases=False,
                    W=lasagne.init.GlorotUniform(),
                    b=lasagne.init.Constant(0.),
                    nonlinearity=lnl.rectify,
                    flip_filters=True,
                    convolution=tt.nnet.conv2d)

                if self.pool_sizes[l] > 1:
                    self.layer['pool_' + str(l + 1)] = ll.MaxPool2DLayer(
                        name='p' + str(l + 1),
                        incoming=last(self.layer),
                        pool_size=self.pool_sizes[l],
                        stride=None,
                        ignore_border=True)

        # flatten
        self.layer['flatten'] = ll.FlattenLayer(
            incoming=last(self.layer),
            outdim=2)

        # incorporate bypass inputs
        if self.n_bypass > 0 and (self.n_rnn > 0 or n_cnn > 0):
            self.layer['bypass_merge'] = lasagne.layers.ConcatLayer(
                [self.layer['bypass'], last(self.layer)], axis=1)

        if self.density == 'mog':
            self.init_mdn(**density_opts)
        elif self.density == 'maf':
            self.init_maf(**density_opts)
        else:
            raise NotImplementedError

        self.compile_funs()  # theano functions
Exemplo n.º 19
0
    def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, candmask_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_fwd_doc_1 = L.GRULayer(l_doce,
                                 NUM_HIDDEN,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce,
                                 NUM_HIDDEN,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True,
                                 backwards=True)

        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)
        l_doc_1 = L.dropout(l_doc_1, p=DROPOUT_RATE)

        l_fwd_q_c = L.GRULayer(l_qembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_q_c = L.GRULayer(l_qembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_qmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_fwd_q_slice_c = L.SliceLayer(l_fwd_q_c, -1, 1)
        l_bkd_q_slice_c = L.SliceLayer(l_bkd_q_c, 0, 1)
        l_q_c = L.ConcatLayer([l_fwd_q_slice_c, l_bkd_q_slice_c])  # B x DE

        qd = L.get_output(l_q_c)
        q_rep = T.reshape(
            T.tile(qd, (1, doc_var.shape[1])),
            (doc_var.shape[0], doc_var.shape[1], 2 * NUM_HIDDEN))  # B x N x DE

        l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                  input_var=q_rep)
        l_doc_gru_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)

        l_fwd_doc = L.GRULayer(l_doc_gru_in,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doc_gru_in,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
                p[candmask_var.nonzero()]))

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[candmask_var.nonzero()],
                p[candmask_var.nonzero()]))

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final_v = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)

        return final, final_v, l_doc, [l_q, l_q_c]
Exemplo n.º 20
0
def build_network():
    batch_norm = False
    num_units = 500  # rnn hidden units number
    l2 = 0.0  # l2 regularization
    dropout = 0.5

    input_var = T.tensor4('input_var')
    answer_var = T.ivector('answer_var')

    print('==> building network')
    example = np.random.uniform(size=(batch_size, 1, 128, 858),
                                low=0.0,
                                high=1.0).astype(np.float32)
    answer = np.random.randint(low=0, high=176, size=(batch_size, ))

    network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=input_var)
    print(layers.get_output(network).eval({input_var: example}).shape)

    # conv-relu-pool 1
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=16,
                                 filter_size=(7, 7),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 2
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(5, 5),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 3
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(5, 5),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 4
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(3, 3),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    params = layers.get_all_params(network, trainable=True)
    output = layers.get_output(network)
    output = output.transpose((0, 3, 1, 2))
    output = output.flatten(ndim=3)

    # These params are important
    num_channels = 32
    filter_w = 54
    filter_h = 8

    network = layers.InputLayer(shape=(None, filter_w,
                                       num_channels * filter_h),
                                input_var=output)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.GRULayer(incoming=network,
                              num_units=num_units,
                              only_return_final=True)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)
    if dropout > 0:
        network = layers.dropout(network, dropout)

    # last layer: classification
    network = layers.DenseLayer(incoming=network,
                                num_units=176,
                                nonlinearity=softmax)
    print(layers.get_output(network).eval({input_var: example}).shape)

    params += layers.get_all_params(network, trainable=True)
    prediction = layers.get_output(network)

    print('==> param shapes', [x.eval().shape for x in params])

    loss_ce = lasagne.objectives.categorical_crossentropy(
        prediction, answer_var).mean()
    if l2 > 0:
        loss_l2 = l2 * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    else:
        loss_l2 = 0
    loss = loss_ce + loss_l2

    # updates = lasagne.updates.adadelta(loss, params)
    updates = lasagne.updates.momentum(loss, params,
                                       learning_rate=0.003)  # good one
    # updates = lasagne.updates.momentum(loss, params, learning_rate=0.0003)  # good one

    print('==> compiling train_fn')
    train_fn = theano.function(inputs=[input_var, answer_var],
                               outputs=[prediction, loss],
                               updates=updates)
    test_fn = theano.function(inputs=[input_var, answer_var],
                              outputs=[prediction, loss])

    return train_fn, test_fn
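A brief usage sketch (not in the original): build_network() reads the global batch_size when it builds its probe example, so that has to exist before the call, and the same imports used by the function above (numpy, theano, lasagne, rectify, softmax) are assumed to be in scope. The compiled functions then take a float32 batch with the probe's shape and int32 labels in [0, 176).

batch_size = 32  # assumed global; build_network() uses it for its probe example

train_fn, test_fn = build_network()

x = np.random.uniform(size=(batch_size, 1, 128, 858)).astype(np.float32)
y = np.random.randint(low=0, high=176, size=(batch_size, )).astype(np.int32)

pred, loss = train_fn(x, y)   # one momentum-SGD step
print(pred.shape, loss)       # (32, 176) and a scalar loss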
Exemplo n.º 21
0
    def build_network(self, K, vocab_size, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None, None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None, None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None, None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None, None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None, MAX_WORD_LEN),
                               input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None, MAX_WORD_LEN),
                                 input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None, None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=self.embed_dim,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed, (doc_shp[0], doc_shp[1], self.embed_dim))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=self.embed_dim,
                                    W=l_docembed.W)
        l_qembed = L.ReshapeLayer(
            l_qembed, (qry_shp[0], qry_shp[1], self.embed_dim))  # B x N x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if self.train_emb == 0:
            l_docembed.params[l_docembed.W].remove('trainable')

        # char embeddings
        if self.use_chars:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars,
                                        2 * self.char_dim)  # T x L x D
            l_fgru = L.GRULayer(l_lookup,
                                self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                only_return_final=True)
            l_bgru = L.GRULayer(l_lookup,
                                2 * self.char_dim,
                                grad_clipping=GRAD_CLIP,
                                mask_input=l_tokmask,
                                gradient_steps=GRAD_STEPS,
                                precompute_input=True,
                                backwards=True,
                                only_return_final=True)  # T x 2D
            l_fwdembed = L.DenseLayer(l_fgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru,
                                      self.embed_dim / 2,
                                      nonlinearity=None)  # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            l_docchar_embed = IndexLayer([l_doctokin, l_embed])  # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed])  # B x Q x DE/2

            l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        l_qembed = L.DropoutLayer(l_qembed, p=self.dropout)
        l_doce = L.DropoutLayer(l_doce, p=self.dropout)

        l_fwd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             only_return_final=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             self.nhidden,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True,
                             only_return_final=True)
        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q])  # B x 2D

        if self.use_feat:
            l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc_1 = L.GRULayer(l_doce,
                                 self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True)
        l_bkd_doc_1 = L.GRULayer(l_doce,
                                 self.nhidden,
                                 grad_clipping=GRAD_CLIP,
                                 mask_input=l_docmask,
                                 gradient_steps=GRAD_STEPS,
                                 precompute_input=True,
                                 backwards=True)
        l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2)  # B x N x 2D

        l_o = BilinearAttentionLayer([l_doc_1, l_q],
                                     2 * self.nhidden,
                                     mask_input=self.inps[6])  # B x 2D
        #odim = self.embed_dim
        #if self.use_chars: odim += self.embed_dim/2
        #if self.use_feat: odim += 2
        #l_od = L.DenseLayer(l_o, odim)
        l_od = l_o

        oo = L.get_output(l_od)  # B x OD
        d = L.get_output(l_doc_1)  # B x N x OD
        p = T.batched_dot(d, oo)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final = T.batched_dot(pm, self.inps[4])

        ov = L.get_output(l_od, deterministic=True)
        dv = L.get_output(l_doc_1, deterministic=True)  # B x N x OD
        p = T.batched_dot(dv, ov)  # B x N
        pm = T.nnet.softmax(p) * self.inps[10]
        pm = pm / pm.sum(axis=1)[:, np.newaxis]
        final_v = T.batched_dot(pm, self.inps[4])

        return final, final_v, l_od, [], l_docembed.W
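The probability computation at the end of this build_network (softmax over document positions, masked to candidate positions, renormalised, then summed per candidate via T.batched_dot) can be illustrated with a small stand-alone numpy sketch; attention_sum and all shapes below are assumptions for illustration, not part of the original class.

# Hypothetical stand-alone sketch of the masked softmax + candidate-sum step
# used above (names and shapes are assumed, not from the original code).
import numpy as np


def attention_sum(scores, cand_mask, cand_onehot):
    """scores: B x N, cand_mask: B x N (0/1), cand_onehot: B x N x C."""
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    pm = e / e.sum(axis=1, keepdims=True)            # softmax over document tokens
    pm = pm * cand_mask                              # keep candidate positions only
    pm = pm / pm.sum(axis=1, keepdims=True)          # renormalise
    return np.einsum('bn,bnc->bc', pm, cand_onehot)  # probability per candidate, B x C


B, N, C = 2, 5, 3
scores = np.random.randn(B, N)
cand_mask = np.array([[1., 0., 1., 1., 0.], [0., 1., 1., 0., 1.]])
cand_onehot = np.eye(C)[np.random.randint(0, C, size=(B, N))]
print(attention_sum(scores, cand_mask, cand_onehot).sum(axis=1))  # ~1.0 per row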
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units

        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        network = layers.InputLayer(shape=(None, 1, 128, 858),
                                    input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=16,
                                     filter_size=(7, 7),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(5, 5),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        self.params = layers.get_all_params(network, trainable=True)

        output = layers.get_output(network)
        output = output.transpose((0, 3, 1, 2))
        output = output.flatten(ndim=3)

        # NOTE: these constants are the shapes of the last pool layer; they could be
        # symbolic, but explicit values are better for optimization
        num_channels = 32
        filter_W = 852
        filter_H = 8

        # InputLayer
        network = layers.InputLayer(shape=(None, filter_W,
                                           num_channels * filter_H),
                                    input_var=output)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # GRULayer
        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var: example}).shape

        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        #print "==> param shapes", [x.eval().shape for x in self.params]

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(
                self.params, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) # good one
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
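The transpose/flatten step in this constructor turns the B x C x H x W output of the last pool layer into a B x W x (C*H) sequence so the width axis can be fed to the GRU as time; the following stand-alone numpy sketch (sizes assumed from the constants above) reproduces the same reshape.

# Hypothetical stand-alone sketch of the conv-to-recurrent reshape above
# (B, C, H, W values are assumed for illustration).
import numpy as np

B, C, H, W = 4, 32, 8, 852
feature_map = np.random.rand(B, C, H, W).astype(np.float32)  # B x C x H x W

sequence = feature_map.transpose((0, 3, 1, 2)).reshape(B, W, C * H)  # B x W x (C*H)
print(sequence.shape)  # (4, 852, 256)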
Exemplo n.º 23
0
    def build_network(self, K, vocab_size, W_init):

        l_docin = L.InputLayer(shape=(None,None,1), input_var=self.inps[0])
        l_doctokin = L.InputLayer(shape=(None,None), input_var=self.inps[1])
        l_qin = L.InputLayer(shape=(None,None,1), input_var=self.inps[2])
        l_qtokin = L.InputLayer(shape=(None,None), input_var=self.inps[3])
        l_docmask = L.InputLayer(shape=(None,None), input_var=self.inps[6])
        l_qmask = L.InputLayer(shape=(None,None), input_var=self.inps[7])
        l_tokin = L.InputLayer(shape=(None,MAX_WORD_LEN), input_var=self.inps[8])
        l_tokmask = L.InputLayer(shape=(None,MAX_WORD_LEN), input_var=self.inps[9])
        l_featin = L.InputLayer(shape=(None,None), input_var=self.inps[11])

        doc_shp = self.inps[1].shape
        qry_shp = self.inps[3].shape

        l_docembed = L.EmbeddingLayer(l_docin, input_size=vocab_size, 
                output_size=self.embed_dim, W=W_init) # B x N x 1 x DE
        l_doce = L.ReshapeLayer(l_docembed, 
                (doc_shp[0],doc_shp[1],self.embed_dim)) # B x N x DE
        l_qemb = L.EmbeddingLayer(l_qin, input_size=vocab_size, 
                output_size=self.embed_dim, W=l_docembed.W)
        l_qembed = L.ReshapeLayer(l_qemb, 
                (qry_shp[0],qry_shp[1],self.embed_dim)) # B x Q x DE
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2

        if self.train_emb==0: 
            l_docembed.params[l_docembed.W].remove('trainable')
            l_qemb.params[l_qemb.W].remove('trainable')

        # char embeddings
        if self.use_chars:
            l_lookup = L.EmbeddingLayer(l_tokin, self.num_chars, self.char_dim) # T x L x D
            l_fgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, 
                    mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True,
                    only_return_final=True)
            l_bgru = L.GRULayer(l_lookup, self.char_dim, grad_clipping=GRAD_CLIP, 
                    mask_input=l_tokmask, gradient_steps=GRAD_STEPS, precompute_input=True, 
                    backwards=True, only_return_final=True) # T x 2D
            l_fwdembed = L.DenseLayer(l_fgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            l_bckembed = L.DenseLayer(l_bgru, self.embed_dim/2, nonlinearity=None) # T x DE/2
            l_embed = L.ElemwiseSumLayer([l_fwdembed, l_bckembed], coeffs=1)
            l_docchar_embed = IndexLayer([l_doctokin, l_embed]) # B x N x DE/2
            l_qchar_embed = IndexLayer([l_qtokin, l_embed]) # B x Q x DE/2

            l_doce = L.ConcatLayer([l_doce, l_docchar_embed], axis=2)
            l_qembed = L.ConcatLayer([l_qembed, l_qchar_embed], axis=2)

        attentions = []
        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doce,l_qembed])
            attentions.append(L.get_output(l_m, deterministic=True))

        for i in range(K-1):
            l_fwd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, 
                    mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                    mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True,
                    backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1], axis=2) # B x N x 2D

            l_fwd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, 
                    mask_input=l_qmask, 
                    gradient_steps=GRAD_STEPS, precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, 
                    mask_input=l_qmask, 
                    gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1], axis=2) # B x Q x 2D

            l_m = PairwiseInteractionLayer([l_doc_1, l_q_c_1])
            l_doc_2_in = GatedAttentionLayer([l_doc_1, l_q_c_1, l_m], 
                    gating_fn=self.gating_fn, 
                    mask_input=self.inps[7])
            l_doce = L.dropout(l_doc_2_in, p=self.dropout) # B x N x 2D
            if self.save_attn: 
                attentions.append(L.get_output(l_m, deterministic=True))

        if self.use_feat: l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2) # B x N x DE+2

        # final layer
        l_fwd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP, 
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce, self.nhidden, grad_clipping=GRAD_CLIP,
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True,
                backwards=True)
        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        l_fwd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, 
                gradient_steps=GRAD_STEPS, precompute_input=True, only_return_final=False)
        l_bkd_q = L.GRULayer(l_qembed, self.nhidden, grad_clipping=GRAD_CLIP, mask_input=l_qmask, 
                gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True, 
                only_return_final=False)
        l_q = L.ConcatLayer([l_fwd_q, l_bkd_q], axis=2) # B x Q x 2D

        if self.save_attn:
            l_m = PairwiseInteractionLayer([l_doc, l_q])
            attentions.append(L.get_output(l_m, deterministic=True))

        l_prob = AttentionSumLayer([l_doc,l_q], self.inps[4], self.inps[12], 
                mask_input=self.inps[10])
        final = L.get_output(l_prob)
        final_v = L.get_output(l_prob, deterministic=True)

        return final, final_v, l_prob, l_docembed.W, attentions
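Each hop in the range(K-1) loop above pairs every document token with every query token, softmax-normalises over the query axis, and gates the document token with the resulting query aggregate; the numpy sketch below (gated_attention and its shapes are assumptions, not the repo's GatedAttentionLayer) shows one such hop with multiplicative gating.

# Hypothetical stand-alone sketch of one gated-attention hop (assumed names and
# shapes; not the original PairwiseInteractionLayer/GatedAttentionLayer code).
import numpy as np


def gated_attention(doc, qry, qry_mask):
    """doc: B x N x D, qry: B x Q x D, qry_mask: B x Q (0/1)."""
    inter = np.einsum('bnd,bqd->bnq', doc, qry)              # pairwise scores
    inter = np.where(qry_mask[:, None, :] > 0, inter, -1e9)  # ignore padded query slots
    e = np.exp(inter - inter.max(axis=2, keepdims=True))
    alphas = e / e.sum(axis=2, keepdims=True)                # softmax over query tokens
    q_tilde = np.einsum('bnq,bqd->bnd', alphas, qry)         # per-token query aggregate
    return doc * q_tilde                                     # multiplicative gating


B, N, Q, D = 2, 6, 4, 8
out = gated_attention(np.random.rand(B, N, D), np.random.rand(B, Q, D), np.ones((B, Q)))
print(out.shape)  # (2, 6, 8)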
    def build_network(self, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)

        l_fwd_doc = L.GRULayer(l_docembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_docembed,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True,
                               backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)  # last time step of the forward GRU: B x D
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)  # final state of the backward GRU (stored at index 0): B x D
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # query encoding: B x 2D

        d = L.get_output(l_doc)  # B x N x D
        q = L.get_output(l_q)  # B x D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(
            T.set_subtensor(
                T.alloc(-20., p.shape[0], p.shape[1])[docmask_var.nonzero()],
                p[docmask_var.nonzero()]))

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(
            T.alloc(0., p.shape[0], vocab_size)[index,
                                                T.flatten(doc_var, outdim=2)],
            pm)
        #qv = T.flatten(query_var,outdim=2)
        #index2 = T.reshape(T.repeat(T.arange(qv.shape[0]),qv.shape[1]),qv.shape)
        #xx = index2[qmask_var.nonzero()]
        #yy = qv[qmask_var.nonzero()]
        #pV = T.set_subtensor(final[xx,yy], T.zeros_like(qv[xx,yy]))

        return final, l_doc, l_q
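The closing T.inc_subtensor expression above scatters the per-token probabilities into a vocabulary-sized vector so that repeated occurrences of the same word accumulate; a minimal numpy sketch of that accumulation (all sizes assumed for illustration) follows.

# Hypothetical stand-alone sketch of the probability-over-vocabulary accumulation
# (sizes are assumed, not from the original code).
import numpy as np

B, N, V = 2, 5, 10
pm = np.random.dirichlet(np.ones(N), size=B)    # B x N token probabilities
doc_ids = np.random.randint(0, V, size=(B, N))  # B x N word ids

final = np.zeros((B, V))
for b in range(B):
    np.add.at(final[b], doc_ids[b], pm[b])      # duplicates of a word add up
print(final.sum(axis=1))                        # ~1.0 per row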
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 l2, mode, rnn_num_units, **kwargs):

        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        ### GRU network architecture

        example = np.random.uniform(size=(self.batch_size, 768, 256),
                                    low=0.0,
                                    high=1.0).astype(np.float32)
        answer = np.random.randint(low=0, high=176, size=(self.batch_size, ))

        # Input layer

        network = layers.InputLayer(shape=(None, 768, 256),
                                    input_var=self.input_var)

        # GRU layer:

        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)

        # Last layer of the network

        network = layers.DenseLayer(incoming=network,
                                    num_units=122,
                                    nonlinearity=softmax)

        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
                network, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.0005)

        if self.mode == 'train':
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
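As a compact, stand-alone illustration of the pipeline this constructor sets up (an InputLayer feeding a GRU that returns only its final state, followed by a softmax DenseLayer compiled with theano.function), here is a minimal sketch with toy sizes; none of the names or dimensions below come from the original class.

# Hypothetical stand-alone sketch of the InputLayer -> GRU -> softmax pipeline
# above, compiled and run on a dummy batch (toy sizes, assumed names).
import numpy as np
import theano
import theano.tensor as T
from lasagne import layers
from lasagne.nonlinearities import softmax

x = T.tensor3('x')
net = layers.InputLayer(shape=(None, 20, 16), input_var=x)  # B x T x F
net = layers.GRULayer(incoming=net, num_units=32, only_return_final=True)
net = layers.DenseLayer(incoming=net, num_units=5, nonlinearity=softmax)

predict_fn = theano.function([x], layers.get_output(net, deterministic=True))
batch = np.random.rand(3, 20, 16).astype(theano.config.floatX)
print(predict_fn(batch).shape)  # (3, 5)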