def _get_normalised_relevance_layer(self, layer, feeder):
        """Divide the relevance carried by ``feeder`` by ``layer``'s
        stabilised linear pre-activations.

        A copy of ``layer`` with its nonlinearity stripped recomputes the
        raw pre-activations Z; the result is ``feeder`` multiplied
        element-wise by 1 / (Z + sign(Z) * epsilon).  Only Dense and
        Conv2D layers are supported.
        """

        def add_epsilon(Zs):
            # Shift Z away from zero in the direction of its sign so the
            # division below cannot blow up for near-zero activations.
            signs = T.cast(Zs >= 0, theano.config.floatX) * 2.0 - 1.0
            return Zs + self.epsilon * signs

        if isinstance(layer, L.DenseLayer):
            linear_copy = L.DenseLayer(layer.input_layer,
                                       layer.num_units,
                                       W=layer.W,
                                       b=layer.b,
                                       nonlinearity=None)
        elif isinstance(layer, L.Conv2DLayer):
            linear_copy = L.Conv2DLayer(layer.input_layer,
                                        num_filters=layer.num_filters,
                                        W=layer.W,
                                        b=layer.b,
                                        stride=layer.stride,
                                        filter_size=layer.filter_size,
                                        flip_filters=layer.flip_filters,
                                        untie_biases=layer.untie_biases,
                                        pad=layer.pad,
                                        nonlinearity=None)
        else:
            raise NotImplementedError()

        inverse_z = L.ExpressionLayer(linear_copy,
                                      lambda x: 1.0 / add_epsilon(x))
        return L.ElemwiseMergeLayer([inverse_z, feeder],
                                    merge_function=T.mul)
    def get_conv_input(self, sidx, tidx, avg=False):
        """Assemble the (batch, channels, window) input for the
        convolutional scorer.

        sidx, tidx: symbolic source/target index matrices fed to the
                    per-feature embedding layers.
        avg:        forwarded to the embedding getters; also switches
                    the position-embedding initialiser to Constant().

        Returns the feature embeddings concatenated (or summed, for
        ``lex == 'mix'``) with a centre-position indicator channel and,
        if ``args.pos_emb`` is set, a learned position embedding, then
        transposed so the feature axis becomes the channel axis.
        """
        suf = '_avg' if avg else ''

        feat_embs = [
            self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg)
            for name in self.args.source_feats
        ]

        # TODO: change the meaning
        if self.args.lex == 'mix':
            concat_emb = L.ElemwiseSumLayer(feat_embs)  # (100, 15, 256)
        else:
            concat_emb = L.concat(feat_embs, axis=2)  # (100, 15, 256+100)

        # One-hot indicator marking the window centre.  Floor division
        # keeps the repeat counts integral on Python 3 as well (plain /
        # would yield a float and break list multiplication).  Assumes
        # window_size is odd -- TODO confirm.
        half = self.args.window_size // 2
        pos = np.array([0] * half + [1] + [0] * half).astype(
            theano.config.floatX)
        post = theano.shared(pos[np.newaxis, :, np.newaxis],
                             borrow=True)  # (1, 15, 1)
        posl = L.InputLayer(
            (None, self.args.window_size, 1),
            input_var=T.extra_ops.repeat(post, sidx.shape[0],
                                         axis=0))  # (100, 15, 1)
        conv_in = L.concat([concat_emb, posl], axis=2)  # (100, 15, 256+1)

        if self.args.pos_emb:
            posint = L.flatten(
                L.ExpressionLayer(posl,
                                  lambda x: T.cast(x, 'int64')))  # (100, 15)
            pos_emb = L.EmbeddingLayer(
                posint,
                self.args.window_size,
                8,
                name='epos' + suf,
                W=Normal(0.01) if not avg else Constant())  # (100, 15, 8)
            # Position embeddings are excluded from weight decay.
            pos_emb.params[pos_emb.W].remove('regularizable')
            conv_in = L.concat([concat_emb, posl, pos_emb],
                               axis=2)  # (100, 15, 256+1+8)

        # # squeeze
        # if self.args.squeeze:
        #     conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf, num_leading_axes=2,
        #                     W=HeNormal('relu')) # (100, 15, 256)

        conv_in = L.dimshuffle(conv_in, (0, 2, 1))  # (100, 256+1, 15)

        return conv_in
# ----- Example 3 -----
    def _invert_GlobalPoolLayer(self, layer, feeder):
        """Invert a mean GlobalPoolLayer: broadcast the pooled relevance
        back over the spatial extent and rescale by 1/(H*W) so totals
        are preserved.  Only T.mean pooling over 4-D inputs is handled.
        """
        assert isinstance(layer, L.GlobalPoolLayer)
        assert layer.pool_function == T.mean
        assert len(L.get_output_shape(layer.input_layer)) == 4

        # Re-attach the two singleton spatial axes the pool removed.
        shape_with_spatial = L.get_output_shape(feeder) + (1, 1)
        if shape_with_spatial[0] is None:
            # ReshapeLayer cannot take None for the batch axis; -1 infers it.
            shape_with_spatial = (-1, ) + shape_with_spatial[1:]
        reshaped = L.ReshapeLayer(feeder, shape_with_spatial)

        spatial = L.get_output_shape(layer.input_layer)[2:]
        upscaled = L.Upscale2DLayer(reshaped, spatial)

        def expression(x):
            # Mean pooling averaged over H*W positions, so each copy
            # receives 1/(H*W) of the relevance.
            return x / np.prod(spatial).astype(theano.config.floatX)

        return L.ExpressionLayer(upscaled, expression)
# ----- Example 4 -----
    def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs):
        """Embed word indices, weight them by a positional-encoding
        tensor, and sum over the word axis (memory-network style input
        encoding).

        incomings:  pair of layers -- (word-index layer, positional
                    encoding layer with a trailing emb_size axis).
        vocab_size: number of embedding rows.
        emb_size:   embedding dimensionality.
        W:          initial embedding matrix (shared as ``self.W``).
        WT:         optional temporal-encoding weights; when truthy a
                    TemporalEncodicgLayer (sic -- name as defined
                    elsewhere in the project) is appended.
        """
        super(EncodingFullLayer, self).__init__(incomings, **kwargs)
        #        if len(self.input_shapes[0]) == 3:
        #            batch_size, w_count, w_length = self.input_shapes[0]
        shape = tuple(self.input_shapes[0])
        #        else:
        #            shape = tuple(self.input_shapes[0])

        self.WT = None
        #        self.reset_zero()
        # Build an internal sub-network; its output is what this merge
        # layer ultimately computes.
        self.l_in = LL.InputLayer(shape=shape)
        self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, ))
        self.l_emb = LL.EmbeddingLayer(self.l_in,
                                       input_size=vocab_size,
                                       output_size=emb_size,
                                       W=W)
        self.W = self.l_emb.W
        # Element-wise product applies the positional-encoding weights.
        self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe),
                                           merge_function=T.mul)
        # Sum over axis 2 (the word-position axis) to get one vector per
        # sentence.
        self.l_emb_res = LL.ExpressionLayer(self.l_emb,
                                            lambda X: X.sum(2),
                                            output_shape='auto')

        #        self.l_emb_res = SumLayer(self.l_emb, axis=2)
        if np.any(WT):
            self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT)
            self.WT = self.l_emb_res.T
        # Re-register the sub-network's trainable parameters on this
        # layer so the outer network's get_all_params() can see them.
        params = LL.helper.get_all_params(self.l_emb_res, trainable=True)
        values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True)
        for p, v in zip(params, values):
            self.add_param(p, v.shape, name=p.name)

        # Compiled function that overwrites the last embedding row --
        # presumably the padding index; verify against callers.
        zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX)
        self.set_zero = theano.function(
            [zero_vec_tensor],
            updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor))
                     for x in [self.W]])
# ----- Example 5 -----
    def get_char2word(self, ic, avg=False):
        """Compose word vectors from character embeddings.

        ic:  index layer of character ids, shaped (batch, words, chars).
        avg: if True, use Constant() initialisers and zero/freeze the
             recurrent weights (averaged-model variant).

        Depending on ``self.args.char_model``, words are encoded either
        by n-gram CNNs with max-pooling over time ('CNN') or by a
        bidirectional LSTM over characters ('LSTM').  Returns the layer
        of per-word representations; note an unrecognised char_model
        falls through and returns None.
        """
        suf = '_avg' if avg else ''
        ec = L.EmbeddingLayer(
            ic,
            self.args.vc,
            self.args.nc,
            name='ec' + suf,
            W=HeNormal() if not avg else Constant())  # (100, 24, 32, 16)
        # Character embeddings are excluded from weight decay.
        ec.params[ec.W].remove('regularizable')

        if self.args.char_model == 'CNN':
            lds = L.dimshuffle(ec, (0, 3, 1, 2))  # (100, 16, 24, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds,
                    self.args.nf, (1, n),
                    untie_biases=True,
                    W=HeNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 24, 32-n+1)
                # Max over all n-gram positions within a word.
                lpool = L.MaxPool2DLayer(
                    lconv, (1, self.args.max_len - n + 1))  # (100, 64, 24, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 24)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 24, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2)  # (100, 24, 64)
            return xc

        elif self.args.char_model == 'LSTM':
            # Mask: nonzero character ids are real, zeros are padding.
            ml = L.ExpressionLayer(
                ic, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_len))  # (2400, 32)

            gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None,
                                           nonlinearity=tanh)

            lstm_in = L.reshape(
                ec, (-1, self.args.max_len, self.args.nc))  # (2400, 32, 16)
            # Half the word dimension per direction.  Floor division so
            # num_units stays an int on Python 3 as well (plain / would
            # pass a float to LSTMLayer).
            lstm_f = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                name='forward' + suf)  # (2400, 64)
            lstm_b = L.LSTMLayer(
                lstm_in,
                self.args.nw // 2,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                backwards=True,
                name='backward' + suf)  # (2400, 64)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (2400, 128)
            xc = L.reshape(xc,
                           (-1, self.args.sw, self.args.nw))  # (100, 24, 256)
            return xc
# ----- Example 6 -----
    def __init__(self,
                 incomings,
                 vocab_size,
                 emb_size,
                 A=lasagne.init.Normal(std=0.1),
                 C=lasagne.init.Normal(std=0.1),
                 AT=lasagne.init.Normal(std=0.1),
                 CT=lasagne.init.Normal(std=0.1),
                 nonlin=lasagne.nonlinearities.softmax,
                 RN=0.,
                 **kwargs):
        """One memory-network hop: attend over encoded context sentences
        with the query and add the attended output back to the query.

        incomings: 3 layers (context indices, context positional
                   encoding, query) or 4 (plus per-sentence
                   probabilities that reweight both encodings).
        A, C:      initialisers for the input/output memory embeddings.
        AT, CT:    initialisers for their temporal encodings.
        nonlin:    attention nonlinearity ('MaxOut' is not implemented).
        RN:        stored but unused here -- TODO confirm purpose.
        """
        super(MemoryLayer, self).__init__(incomings, **kwargs)

        self.vocab_size, self.emb_size = vocab_size, emb_size
        self.nonlin = nonlin
        self.RN = RN
        #        self.A, self.C, self.AT, self.CT = A, C, AT, CT

        batch_size, c_count, c_length = self.input_shapes[0]
        _, q_count, _ = self.input_shapes[2]

        # Internal sub-network mirroring the incoming shapes.
        self.l_c_in = LL.InputLayer(shape=(batch_size, c_count, c_length))
        self.l_c_in_pe = LL.InputLayer(shape=(batch_size, c_count, c_length,
                                              self.emb_size))
        self.l_u_in = LL.InputLayer(shape=(batch_size, q_count, self.emb_size))

        # A: input-memory encoding (used for attention), C: output-memory
        # encoding (used for the hop output).
        self.l_c_A_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                           self.vocab_size, self.emb_size, A,
                                           AT)
        self.l_c_C_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                           self.vocab_size, self.emb_size, C,
                                           CT)
        self.A, self.C = self.l_c_A_enc.W, self.l_c_C_enc.W
        self.AT, self.CT = self.l_c_A_enc.WT, self.l_c_C_enc.WT
        if len(incomings
               ) == 4:  # if there is also the probabilities over sentences
            self.l_in_ac_prob = LL.InputLayer(shape=(batch_size, c_count,
                                                     emb_size))
            self.l_c_A_enc_ = LL.ElemwiseMergeLayer(
                (self.l_c_A_enc, self.l_in_ac_prob), merge_function=T.mul)
            self.l_c_C_enc_ = LL.ElemwiseMergeLayer(
                (self.l_c_C_enc, self.l_in_ac_prob), merge_function=T.mul)

        # Attention scores: batched dot of memory encodings with the
        # transposed query.
        self.l_u_in_tr = LL.DimshuffleLayer(self.l_u_in, pattern=(0, 2, 1))
        if len(incomings) == 4:
            self.l_p = BatchedDotLayer((self.l_c_A_enc_, self.l_u_in_tr))
        else:
            self.l_p = BatchedDotLayer((self.l_c_A_enc, self.l_u_in_tr))

        # Single-query case: drop the trailing singleton axis.
        if self.l_p.output_shape[2] == 1:
            self.l_p = LL.FlattenLayer(self.l_p, outdim=2)


#            self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1))

        if self.nonlin == 'MaxOut':
            raise NotImplementedError
        self.l_p = LL.NonlinearityLayer(self.l_p, nonlinearity=nonlin)
        # Broadcast attention weights across the embedding dimension.
        self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1, 'x'))
        #        self.l_p = LL.ReshapeLayer(self.l_p, self.l_p.output_shape + (1,))
        self.l_p = LL.ExpressionLayer(self.l_p,
                                      lambda X: X.repeat(emb_size, 2),
                                      output_shape='auto')
        ##        self.l_p = RepeatDimLayer(self.l_p, emb_size, axis=2)
        # Weighted sum of output-memory vectors = hop output o.
        if len(incomings) == 4:
            self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc_),
                                              merge_function=T.mul)
        else:
            self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc),
                                              merge_function=T.mul)
        self.l_o = LL.ExpressionLayer(self.l_pc,
                                      lambda X: X.sum(1),
                                      output_shape='auto')
        #        self.l_o = SumLayer(self.l_pc, axis=1)
        self.l_o = LL.DimshuffleLayer(self.l_o, pattern=(0, 'x', 1))
        # Residual connection: o + u.
        self.l_o_u = LL.ElemwiseMergeLayer((self.l_o, self.l_u_in),
                                           merge_function=T.add)

        # Re-register the sub-network's parameters on this layer so the
        # outer network's get_all_params() can see them.
        params = LL.helper.get_all_params(self.l_o_u, trainable=True)
        values = LL.helper.get_all_param_values(self.l_o_u, trainable=True)
        for p, v in zip(params, values):
            self.add_param(p, v.shape, name=p.name)
# ----- Example 7 -----
def conv_concat(_in, _vec):
    """Tile the per-sample vector ``_vec`` over the spatial grid of
    ``_in`` and concatenate it as extra channels (axis 1).

    Assumes ``_in`` has square spatial dims -- only output_shape[2] is
    read; TODO confirm.
    """
    side = _in.output_shape[2]

    def broadcast(x):
        # (batch, feat) -> (batch, feat, side, side)
        return x.dimshuffle(0, 1, 'x', 'x') * T.ones(
            (x.shape[0], x.shape[1], side, side))

    tiled = L.ExpressionLayer(_vec, broadcast, output_shape='auto')
    return L.ConcatLayer([_in, tiled], axis=1)
# ----- Example 8 -----
def create_conditon_slices_from(_cond, ish):
    """Broadcast the conditioning layer ``_cond`` to the spatial shape
    of an input with shape ``ish``.

    For a 2-D target shape the condition passes through unchanged;
    otherwise it is expanded over the last two spatial dims of ``ish``.
    (Function name typo preserved -- callers use this spelling.)
    """
    if len(ish) == 2:
        return _cond

    def expand(x):
        # (batch, feat) -> (batch, feat) + spatial dims of the target
        return x.dimshuffle(0, 1, 'x', 'x') * T.ones(
            (x.shape[0], x.shape[1]) + ish[-2:])

    return L.ExpressionLayer(_cond, expand, output_shape='auto')
# ----- Example 9 -----
def clip(l, b=1):
    """Wrap layer ``l`` so its gradient is clipped to [-b, b] on the
    backward pass (Lasagne provides no built-in gradient-clipping
    wrapper); the forward pass is unchanged.
    """
    def clipped(x):
        return theano.gradient.grad_clip(x, -b, b)

    return L.ExpressionLayer(l, clipped)
# ----- Example 10 -----
    def additional_layer(self, idx_layer, emb_layer, avg=False):
        """Post-process a feature's embeddings according to the feature
        name: compose characters into word vectors ('char', via CNN or
        bi-LSTM), pool morpheme embeddings ('morph', via max or masked
        average), or pass other embeddings through unchanged.

        idx_layer: integer index layer (used for masking / shapes).
        emb_layer: corresponding embedding layer.
        avg:       averaged-model variant -- Constant() initialisers and
                   zeroed/frozen LSTM weights.

        NOTE(review): if name == 'char' with an unrecognised char_model
        this returns None, and for name == 'morph' with an unrecognised
        morph_model ``xm`` would be unbound -- presumably callers only
        pass the configured values; confirm.
        """
        suf = '_avg' if avg else ''
        if self.name == 'char':
            if self.args.char_model == 'cnn':
                lds = L.dimshuffle(emb_layer,
                                   (0, 3, 1, 2))  # (100, 16, 26, 32)
                ls = []
                for n in self.args.ngrams:
                    lconv = L.Conv2DLayer(
                        lds,
                        self.args.conv_dim,
                        (1, n),
                        untie_biases=False,
                        # W=HeNormal('relu') if not avg else Constant(),
                        W=GlorotNormal('relu') if not avg else Constant(),
                        name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)

                    # Max over all n-gram positions within a word.
                    lpool = L.MaxPool2DLayer(lconv,
                                             (1, self.args.max_word_len - n +
                                              1))  # (100, 64, 26, 1)
                    lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                    lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                    ls.append(lpool)
                xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
                # additional
                # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None, name='echar_affine', num_leading_axes=2,
                # W=HeNormal() if not avg else Constant()) # (100, 26, 100)
                return xc
            elif self.args.char_model == 'lstm':
                # Mask: nonzero character ids are real, zeros are padding.
                ml = L.ExpressionLayer(
                    idx_layer,
                    lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
                ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)

                gate_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal())
                cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                               W_hid=Orthogonal(),
                                               W_cell=None,
                                               nonlinearity=tanh)

                # Flatten words into the batch axis so the LSTM runs over
                # characters.
                lstm_in = L.reshape(
                    emb_layer,
                    (-1, self.args.max_word_len,
                     self.config['char']['emb_dim']))  # (1500, 32, 16)
                lstm_f = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    name='forward' + suf)  # (1500, 32)
                lstm_b = L.LSTMLayer(
                    lstm_in,
                    32,
                    mask_input=ml,
                    grad_clipping=10.,
                    learn_init=True,
                    peepholes=False,
                    precompute_input=True,
                    ingate=gate_params,
                    forgetgate=gate_params,
                    cell=cell_params,
                    outgate=gate_params,
                    # unroll_scan=True,
                    only_return_final=True,
                    backwards=True,
                    name='backward' + suf)  # (1500, 32)
                remove_reg(lstm_f)
                remove_reg(lstm_b)
                if avg:
                    set_zero(lstm_f)
                    set_zero(lstm_b)
                xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
                # Restore the word axis; its length depends on the tagger
                # configuration.
                if self.args.lstm_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
                elif self.args.trans_tagger:
                    xc = L.reshape(
                        xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
                else:
                    xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
                return xc

        elif self.name == 'morph':
            # idx (100, 26/161, 16)  emb (100, 26/161, 16, 32)
            if self.args.morph_model == 'max':
                # Max-pool over the morpheme axis.
                xm = L.MaxPool2DLayer(
                    emb_layer,
                    (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
                # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim'])) # (100, 26/161, 32)
                xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
                # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
            elif self.args.morph_model == 'avg':
                # Masked average: ignore zero-padded morpheme slots.
                mask = L.ExpressionLayer(
                    idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
                mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
                mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat(
                    x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 1)
                xm = L.ElemwiseMergeLayer([
                    emb_layer, mask
                ], lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
                # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim'])) # (100, 26, 32)
            return xm
        else:
            return emb_layer
# ----- Example 11 -----
def make_model():
    """Build a DRAW-style recurrent attention autoencoder unrolled over
    TIME_ROUNDS steps.

    Each step reads an attention window from the image (plus the error
    against the current canvas reconstruction), encodes it with a read
    GRU, samples a latent code, decodes via a write GRU, and paints an
    attention window back onto the canvas.  Steps after the first are
    produced by copying step 1's graph with utils.modified_copy.

    Relies on module-level constants (BS, CH, IH, IW, HS, WH, WW,
    ENC_NDIM, ENC_VAR, TIME_ROUNDS) and helpers (advanced_layers,
    utils, ll, li, ln) defined elsewhere in the file.

    Returns the final sigmoid canvas layer (the reconstructed image).
    """
    image = ll.InputLayer((BS, CH, IH, IW), name='step1.image')

    # Learned initial hidden states for the read/write GRUs; registering
    # the input_var as a param makes them trainable.
    h_read_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_read_init'),
        name='step1.h_read_init')
    h_read_init.add_param(h_read_init.input_var, (HS, ))

    h_write_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_write_init'),
        name='step1.h_write_init')
    h_write_init.add_param(h_write_init.input_var, (HS, ))

    # Tile the shared initial states across the batch.
    h_read = ll.ExpressionLayer(h_read_init,
                                lambda t: T.tile(T.reshape(t, (1, HS)),
                                                 (BS, 1)), (BS, HS),
                                name='step1.h_read')

    h_write = ll.ExpressionLayer(h_write_init,
                                 lambda t: T.tile(T.reshape(t, (1, HS)),
                                                  (BS, 1)), (BS, HS),
                                 name='step1.h_write')

    # The canvas accumulates write outputs; starts at zero.
    canvas = ll.InputLayer(
        (BS, CH, IH, IW),
        lasagne.utils.create_param(li.Constant(0.0), (BS, CH, IH, IW),
                                   name='step1.tensor.canvas'),
        name='step1.canvas')

    image_prev = ll.NonlinearityLayer(canvas,
                                      ln.sigmoid,
                                      name='step1.image_prev')

    # Residual between target image and current reconstruction.
    image_error = ll.ElemwiseSumLayer([image, image_prev],
                                      coeffs=[1, -1],
                                      name='step1.image_error')
    image_stack = ll.ConcatLayer([image, image_error],
                                 name='step1.image_stack')

    # Read attention: 6 parameters (presumably an affine window spec --
    # confirm against AttentionLayer).
    read_params = ll.DenseLayer(h_write,
                                6,
                                nonlinearity=None,
                                name='step1.read_params')
    read_window = advanced_layers.AttentionLayer([read_params, image_stack],
                                                 (WH, WW),
                                                 name='step1.read_window')

    read_flat = ll.FlattenLayer(read_window, name='step1.read_flat')
    read_code = ll.ConcatLayer([read_flat, h_write], name='step1.read_code')

    # Single-timestep GRU update of the read state.
    read_code_sequence = ll.ReshapeLayer(read_code,
                                         (BS, 1, read_code.output_shape[-1]),
                                         name='step1.read_code_sequence')

    read_rnn = ll.GRULayer(
        read_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_read,
        name='step1.read_rnn',
    )

    # Variational latent sample from the read state.
    sample_mean = ll.DenseLayer(read_rnn,
                                ENC_NDIM,
                                nonlinearity=None,
                                name='step1.sample_mean')
    sample_logvar2 = ll.DenseLayer(read_rnn,
                                   ENC_NDIM,
                                   nonlinearity=None,
                                   name='step1.sample_logvar2')
    sample = advanced_layers.SamplingLayer([sample_mean, sample_logvar2],
                                           ENC_VAR,
                                           name='step1.sample')

    # Decode the sample and update the write state.
    write_code = ll.DenseLayer(sample, HS, name='step1.write_code')
    write_code_sequence = ll.ReshapeLayer(write_code,
                                          (BS, 1, write_code.output_shape[-1]),
                                          name='step1.write_code_sequence')
    write_rnn = ll.GRULayer(
        write_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_write,
        name='step1.write_rnn',
    )
    write_window_flat = ll.DenseLayer(write_rnn,
                                      CH * WH * WW,
                                      name='step1.write_window_flat')
    write_window = ll.ReshapeLayer(write_window_flat, (BS, CH, WH, WW),
                                   name='step1.write_window')

    # Write attention: paint the window back onto the full canvas.
    write_params = ll.DenseLayer(h_write,
                                 6,
                                 nonlinearity=None,
                                 name='step1.write_params')
    write_image = advanced_layers.AttentionLayer([write_params, write_window],
                                                 (IH, IW),
                                                 name='step1.write_image')
    canvas_next = ll.ElemwiseSumLayer([canvas, write_image],
                                      name='step1.canvas_next')

    def rename(name):
        # 'stepN.xxx' -> 'stepN+1.xxx' for the copied sub-graph.
        if name is None:
            return None
        step, real_name = name.split('.', 1)
        step = int(step[4:])
        return 'step%d.%s' % (step + 1, real_name)

    # Unroll the remaining steps by structurally copying step 1's graph,
    # rewiring recurrent states, the canvas, and a fresh noise sample.
    for step in xrange(1, TIME_ROUNDS):
        sample_random_variable_next = sample.random_stream.normal(
            sample.input_shapes[0],
            std=sample.variation_coeff,
        )
        sample_random_variable_next.name = 'step%d.sample.random_variable' % \
            (step + 1)

        canvas, canvas_next = (canvas_next,
                               utils.modified_copy(
                                   canvas_next,
                                   modify={
                                       h_read:
                                       read_rnn,
                                       h_write:
                                       write_rnn,
                                       canvas:
                                       canvas_next,
                                       sample.random_stream:
                                       sample.random_stream,
                                       sample.random_variable:
                                       sample_random_variable_next,
                                   },
                                   rename=rename,
                               ))

        # Advance the handles so the next iteration rewires against the
        # freshly copied step.
        h_read = read_rnn
        h_write = write_rnn
        read_rnn = utils.layer_by_name(canvas_next,
                                       'step%d.read_rnn' % (step + 1))
        write_rnn = utils.layer_by_name(canvas_next,
                                        'step%d.write_rnn' % (step + 1))
        sample = utils.layer_by_name(canvas_next, 'step%d.sample' % (step + 1))

    output = ll.NonlinearityLayer(canvas_next, ln.sigmoid, name='output')

    return output
# ----- Example 12 -----
    def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

        batch_size = self.mask_context_var.shape[0]
        context_len = self.mask_context_var.shape[1]
        question_len = self.question_var.shape[1]
        context_word_len = self.context_char_var.shape[2]
        question_word_len = self.question_char_var.shape[2]

        self.batch_size = batch_size
        self.context_len = context_len
        ''' Inputs and word embeddings'''

        l_context_char = LL.InputLayer(shape=(None, None, None),
                                       input_var=self.context_char_var)
        l_question_char = LL.InputLayer(shape=(None, None, None),
                                        input_var=self.question_char_var)

        l_c_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_context_var)
        l_q_mask = LL.InputLayer(shape=(None, None),
                                 input_var=self.mask_question_var)

        l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_context_char_var)
        l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                      input_var=self.mask_question_char_var)

        l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.context_var)
        l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                                input_var=self.question_var)

        if self.train_unk:
            l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_context_unk_var)
            l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                         input_var=self.mask_question_unk_var)

            l_c_emb = TrainUnkLayer(l_c_emb,
                                    l_c_unk_mask,
                                    output_size=self.emb_size,
                                    W=self.word_embeddings[0])

            l_q_emb = TrainUnkLayer(l_q_emb,
                                    l_q_unk_mask,
                                    output_size=self.emb_size,
                                    W=l_c_emb.W)

        if self.negative:
            l_c_emb = TrainNAWLayer(l_c_emb,
                                    l_c_mask,
                                    output_size=self.emb_size)
        ''' Char-embeddings '''

        # (batch_size x context_len x context_word_len x emb_char_size)
        l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size)

        l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                         input_size=self.alphabet_size,
                                         output_size=self.emb_char_size,
                                         W=l_c_char_emb.W)

        # here I do multiplication of character embeddings with masks,
        # because I want to pad them with constant zeros

        l_c_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
        l_q_char_mask = ForgetSizeLayer(
            LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

        l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask],
                                             T.mul)
        l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask],
                                             T.mul)

        # convolutions

        l_c_char_emb = LL.dimshuffle(
            LL.reshape(l_c_char_emb, (batch_size * context_len,
                                      context_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       pad=self.conv)
        # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)

        l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_c_char_emb = LL.reshape(
            l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

        l_q_char_emb = LL.dimshuffle(
            LL.reshape(l_q_char_emb, (batch_size * question_len,
                                      question_word_len, self.emb_char_size)),
            (0, 2, 1))
        l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                       num_filters=self.num_emb_char_filters,
                                       filter_size=emb_char_filter_size,
                                       nonlinearity=L.nonlinearities.tanh,
                                       W=l_c_char_conv.W,
                                       b=l_c_char_conv.b,
                                       pad=self.conv)
        # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)

        l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                          lambda X: X.max(2),
                                          output_shape='auto')
        l_q_char_emb = LL.reshape(
            l_q_char_emb,
            (batch_size, question_len, self.num_emb_char_filters))
        ''' Concatenating both embeddings '''

        l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
        l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

        # originally I had dropout here
        ''' Highway layer allowing for interaction between embeddings '''

        l_c_P = LL.reshape(l_c_emb,
                           (batch_size * context_len,
                            self.emb_size + self.num_emb_char_filters))
        l_c_P = LL.DenseLayer(l_c_P,
                              num_units=self.rec_size,
                              b=None,
                              nonlinearity=None)

        l_c_high = HighwayLayer(l_c_P)
        l_c_emb = LL.reshape(l_c_high,
                             (batch_size, context_len, self.rec_size))

        l_q_P = LL.reshape(l_q_emb,
                           (batch_size * question_len,
                            self.emb_size + self.num_emb_char_filters))
        l_q_P = LL.DenseLayer(l_q_P,
                              num_units=self.rec_size,
                              W=l_c_P.W,
                              b=None,
                              nonlinearity=None)

        l_q_high = HighwayLayer(l_q_P,
                                W1=l_c_high.W1,
                                b1=l_c_high.b1,
                                W2=l_c_high.W2,
                                b2=l_c_high.b2)
        l_q_emb = LL.reshape(l_q_high,
                             (batch_size, question_len, self.rec_size))
        ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

        l_weighted_feat = WeightedFeatureLayer(
            [l_c_emb, l_q_emb, l_c_mask, l_q_mask])  # batch_size x context_len
        l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

        # batch_size x context_len
        l_bin_feat = LL.InputLayer(shape=(None, None),
                                   input_var=self.bin_feat_var)
        l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))
        ''' Dropout at the embeddings '''

        if emb_dropout:
            print('Using dropout after wiq calculation.')
            l_c_emb = LL.dropout(l_c_emb)
            l_q_emb = LL.dropout(l_q_emb)
        ''' Here we concatenate wiq features to embeddings'''

        # both features are concatenated to the embeddings
        # for the question we fix the features to 1
        l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
        l_q_emb = LL.pad(l_q_emb,
                         width=[(0, 2)],
                         val=L.utils.floatX(1),
                         batch_ndim=2)
        ''' Context and question encoding using the same BiLSTM for both '''

        # output shape is (batch_size x context_len x rec_size)
        l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask)

        l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                    num_units=self.rec_size,
                                    grad_clipping=100,
                                    mask_input=l_c_mask,
                                    backwards=True)

        # output shape is (batch_size x question_len x rec_size)
        l_q_enc_forw = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                           W_hid=l_c_enc_forw.W_hid_to_ingate,
                           W_cell=l_c_enc_forw.W_cell_to_ingate,
                           b=l_c_enc_forw.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                               W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                               W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                               b=l_c_enc_forw.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                            W_hid=l_c_enc_forw.W_hid_to_outgate,
                            W_cell=l_c_enc_forw.W_cell_to_outgate,
                            b=l_c_enc_forw.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                         W_hid=l_c_enc_forw.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_forw.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        l_q_enc_back = LL.LSTMLayer(
            l_q_emb,
            num_units=self.rec_size,
            grad_clipping=100,
            mask_input=l_q_mask,
            backwards=True,
            ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                           W_hid=l_c_enc_back.W_hid_to_ingate,
                           W_cell=l_c_enc_back.W_cell_to_ingate,
                           b=l_c_enc_back.b_ingate),
            forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                               W_hid=l_c_enc_back.W_hid_to_forgetgate,
                               W_cell=l_c_enc_back.W_cell_to_forgetgate,
                               b=l_c_enc_back.b_forgetgate),
            outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                            W_hid=l_c_enc_back.W_hid_to_outgate,
                            W_cell=l_c_enc_back.W_cell_to_outgate,
                            b=l_c_enc_back.b_outgate),
            cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                         W_hid=l_c_enc_back.W_hid_to_cell,
                         W_cell=None,
                         b=l_c_enc_back.b_cell,
                         nonlinearity=L.nonlinearities.tanh))

        # batch_size x context_len  x 2*rec_size
        l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
        # batch_size x question_len x 2*rec_size
        l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

        def proj_init():
            return np.vstack([
                np.eye(self.rec_size, dtype=theano.config.floatX),
                np.eye(self.rec_size, dtype=theano.config.floatX)
            ])

        # this is H from the paper, shape: (batch_size * context_len x
        # rec_size)
        l_c_proj = LL.reshape(l_c_enc,
                              (batch_size * context_len, 2 * self.rec_size))
        l_c_proj = LL.DenseLayer(l_c_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)

        # this is Z from the paper, shape: (batch_size * question_len x
        # rec_size)
        l_q_proj = LL.reshape(l_q_enc,
                              (batch_size * question_len, 2 * self.rec_size))
        l_q_proj = LL.DenseLayer(l_q_proj,
                                 num_units=self.rec_size,
                                 W=proj_init(),
                                 b=None,
                                 nonlinearity=L.nonlinearities.tanh)
        ''' Additional, weighted question encoding (alphas from paper) '''

        l_alpha = LL.DenseLayer(
            l_q_proj,  # batch_size * question_len x 1
            num_units=1,
            b=None,
            nonlinearity=None)

        # batch_size x question_len
        l_alpha = MaskedSoftmaxLayer(
            LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

        # batch_size x rec_size
        l_z_hat = BatchedDotLayer([
            LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
            l_alpha
        ])

        return l_c_proj, l_z_hat
    def __init__(self,
                 input_shape,
                 output_dim,
                 hidden_sizes,
                 conv_filters,
                 conv_filter_sizes,
                 conv_strides,
                 conv_pads,
                 hidden_W_init=LI.GlorotUniform(),
                 hidden_b_init=LI.Constant(0.),
                 output_W_init=LI.GlorotUniform(),
                 output_b_init=LI.Constant(0.),
                 hidden_nonlinearity=LN.rectify,
                 output_nonlinearity=LN.softmax,
                 name=None,
                 input_var=None):
        """Build a siamese convolutional network.

        The input's leading (channel) axis is split into two halves; both
        halves are pushed through the SAME convolution stack (the second
        branch reuses the first branch's ``W``/``b`` parameters), the two
        feature maps are combined as an element-wise squared difference,
        and the result is fed through dense hidden layers to the output.

        :param input_shape: per-sample shape; a 2-d shape is promoted to
            3-d by prepending a singleton channel axis, and 1-/3-d shapes
            are used as-is.  ``input_shape[0]`` must be even so it can be
            split into the two siamese halves.
        :param output_dim: number of output units.
        :param hidden_sizes: sizes of the dense layers after the conv stack.
        :param conv_filters, conv_filter_sizes, conv_strides, conv_pads:
            per-conv-layer settings (zipped together; the shortest decides
            the depth of the stack).
        :param name: optional prefix for all layer names.
        :param input_var: optional Theano variable to use as input.
        :raises ValueError: if ``input_shape[0]`` is odd.
        """
        if name is None:
            prefix = ""
        else:
            prefix = name + "_"

        # Normalise the input to a 4-d (batch, channel, h, w) layout for
        # the Conv2DLayers below; flat inputs are reshaped, keeping the
        # batch axis symbolic via the [0] placeholder.
        if len(input_shape) == 3:
            l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                                input_var=input_var)
            l_hid = L.reshape(l_in, ([0], ) + input_shape)
        elif len(input_shape) == 2:
            l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                                input_var=input_var)
            input_shape = (1, ) + input_shape
            l_hid = L.reshape(l_in, ([0], ) + input_shape)
        else:
            l_in = L.InputLayer(shape=(None, ) + input_shape,
                                input_var=input_var)
            l_hid = l_in

        # The channel axis carries the two siamese inputs stacked
        # together, so it must split evenly.  An explicit raise (not
        # `assert`) so the check survives `python -O`.
        if input_shape[0] % 2 != 0:
            raise ValueError(
                "input_shape[0] must be even to split into two siamese "
                "halves, got %d" % input_shape[0])
        half = input_shape[0] // 2
        l_hid0 = L.SliceLayer(l_hid, slice(None, half), axis=1)
        l_hid1 = L.SliceLayer(l_hid, slice(half, None), axis=1)
        l_hids = [l_hid0, l_hid1]

        # Shared-weight conv stack: branch 1 reuses branch 0's W and b so
        # both halves are embedded by identical filters.
        for idx, (conv_filter, filter_size, stride, pad) in enumerate(
                zip(conv_filters, conv_filter_sizes, conv_strides,
                    conv_pads)):
            for ihid in range(len(l_hids)):
                if ihid > 0:
                    # l_hids[0] is already this level's Conv2DLayer, so
                    # its W/b are the parameters to share.
                    conv_kwargs = dict(W=l_hids[0].W, b=l_hids[0].b)
                else:
                    conv_kwargs = dict()
                l_hids[ihid] = L.Conv2DLayer(l_hids[ihid],
                                             num_filters=conv_filter,
                                             filter_size=filter_size,
                                             stride=(stride, stride),
                                             pad=pad,
                                             nonlinearity=hidden_nonlinearity,
                                             name="%sconv_hidden_%d_%d" %
                                             (prefix, idx, ihid),
                                             convolution=wrapped_conv,
                                             **conv_kwargs)

        # Element-wise squared difference of the two branch embeddings:
        # (branch1 - branch0) ** 2.
        l_hid = L.ElemwiseSumLayer(l_hids, coeffs=[-1, 1])
        l_hid = L.ExpressionLayer(l_hid, lambda X: X * X)

        # Dense head on top of the squared-difference features.
        for idx, hidden_size in enumerate(hidden_sizes):
            l_hid = L.DenseLayer(
                l_hid,
                num_units=hidden_size,
                nonlinearity=hidden_nonlinearity,
                name="%shidden_%d" % (prefix, idx),
                W=hidden_W_init,
                b=hidden_b_init,
            )
        l_out = L.DenseLayer(
            l_hid,
            num_units=output_dim,
            nonlinearity=output_nonlinearity,
            name="%soutput" % (prefix, ),
            W=output_W_init,
            b=output_b_init,
        )
        self._l_in = l_in
        self._l_out = l_out
        self._input_var = l_in.input_var
# (removed scraper artifact: stray "示例#14" / "0" example/vote markers
#  from the code-aggregation page this snippet was copied from)
def clip(l, b=1):
    """Wrap layer ``l`` so gradients through it are clipped to ``[-b, b]``.

    The forward pass is the identity; only the backward pass is affected,
    via ``theano.gradient.grad_clip``.
    """
    def _clip_grad(x):
        return theano.gradient.grad_clip(x, -b, b)

    return L.ExpressionLayer(l, _clip_grad)