Example No. 1
def get_rnn_unit(l_in,
                 mask,
                 rev_mask,
                 state,
                 rev_state,
                 n_units,
                 prefix,
                 grad_clip=0,
                 context=None,
                 attention=False):

    net = OrderedDict()
    hid = state
    rg = Gate(W_in=input, W_hid=inner, W_cell=None)
    ug = Gate(W_in=input, W_hid=inner, W_cell=None)
    hg = Gate(W_in=input, W_hid=inner, W_cell=None, nonlinearity=tanh)

    net[prefix + 'gru'] = GRULayer(l_in,
                                   num_units=n_units,
                                   resetgate=rg,
                                   updategate=ug,
                                   hidden_update=hg,
                                   mask_input=mask,
                                   hid_init=hid,
                                   learn_init=False,
                                   only_return_final=False,
                                   grad_clipping=grad_clip,
                                   context_input=context,
                                   use_attention=attention,
                                   name='gru')

    if rev_mask is not None and rev_state is not None:
        net[prefix + 'gru_rev'] = GRULayer(l_in,
                                           num_units=n_units,
                                           resetgate=rg,
                                           updategate=ug,
                                           hidden_update=hg,
                                           mask_input=rev_mask,
                                           hid_init=rev_state,
                                           only_return_final=False,
                                           learn_init=False,
                                           grad_clipping=grad_clip,
                                           context_input=context,
                                           backwards=True,
                                           name='gru_rev')

        # Combine the forward and backward GRU outputs
        net['context'] = ElemwiseSumLayer(list(net.values())[-2:],
                                          name='context')

    return net
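
A hedged usage sketch for this helper. The names `input` and `inner` in the Gate constructors are assumed to be module-level weight initializers, and the GRULayer here is assumed to be a custom variant that accepts `context_input` and `use_attention` (the stock Lasagne GRULayer does not); the shapes and the `enc_` prefix are illustrative only.

import lasagne
from lasagne.layers import InputLayer

# Assumed initializers referenced by the Gate(...) calls above
input = lasagne.init.GlorotUniform()
inner = lasagne.init.Orthogonal()

l_seq = InputLayer((None, None, 256))   # (batch, time, features)
l_msk = InputLayer((None, None))        # (batch, time)

net = get_rnn_unit(l_seq, l_msk, rev_mask=None,
                   state=lasagne.init.Constant(0.), rev_state=None,
                   n_units=512, prefix='enc_', grad_clip=5)
l_enc = net['enc_gru']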
Example No. 2
    def get_unidirectional_layer(self,
                                 input_layer,
                                 mask_layer,
                                 n_hidden,
                                 true_input_size,
                                 only_return_final,
                                 backwards=False):
        if true_input_size is not None:
            if self.layer_type == "LSTM":
                layer = LSTMLayerOHEInput
            elif self.layer_type == "GRU":
                layer = GRULayerOHEInput
            elif self.layer_type == "Vanilla":
                layer = VanillaLayerOHEInput
            else:
                raise ValueError('Unknown layer type')

            return layer(input_layer,
                         n_hidden,
                         true_input_size,
                         mask_input=mask_layer,
                         grad_clipping=self.grad_clip,
                         learn_init=True,
                         only_return_final=only_return_final,
                         backwards=backwards,
                         ingate=Gate(nonlinearity=self.act_f_input),
                         forgetgate=Gate(nonlinearity=self.act_f_forget),
                         outgate=Gate(nonlinearity=self.act_f_output),
                         cell=Gate(W_cell=None, nonlinearity=self.act_f_cell),
                         nonlinearity=self.act_f_hidden)
        else:
            if self.layer_type == "LSTM":
                layer = lasagne.layers.LSTMLayer
            elif self.layer_type == "GRU":
                layer = lasagne.layers.GRULayer
            elif self.layer_type == "Vanilla":
                layer = lasagne.layers.RecurrentLayer
            else:
                raise ValueError('Unknown layer type')

            return layer(input_layer,
                         n_hidden,
                         mask_input=mask_layer,
                         grad_clipping=self.grad_clip,
                         learn_init=True,
                         only_return_final=only_return_final,
                         backwards=backwards)
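
For reference, the else-branch above reduces to a plain Lasagne construction. A minimal standalone sketch of that pattern (shapes and unit counts are illustrative; assumes Lasagne and Theano are installed):

import lasagne
from lasagne.layers import InputLayer, LSTMLayer

l_in = InputLayer(shape=(None, None, 40))   # (batch, time, features)
l_mask = InputLayer(shape=(None, None))     # (batch, time)

l_lstm = LSTMLayer(l_in, 100,
                   mask_input=l_mask,
                   grad_clipping=100,
                   learn_init=True,
                   only_return_final=True,
                   backwards=False)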
Example No. 3
    def __init__(self,
                 n_in,
                 n_out,
                 resetgate=Gate(W_cell=None),
                 updategate=Gate(W_cell=None),
                 hidden_update=Gate(W_cell=None,
                                    nonlinearity=nonlinearities.tanh),
                 grad_clipping=100):
        self.n_in = n_in
        self.n_out = n_out
        self.grad_clipping = grad_clipping

        self.params = []

        def create_gate_params(gate):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (gate.W_in.sample((n_in, n_out)),
                    gate.W_hid.sample((n_out, n_out)),
                    gate.b.sample((n_out,)),
                    gate.nonlinearity)

        (W_in_to_updategate, W_hid_to_updategate, b_updategate,
         self.nonlinearity_updategate) = create_gate_params(updategate)

        (W_in_to_resetgate, W_hid_to_resetgate, b_resetgate,
         self.nonlinearity_resetgate) = create_gate_params(resetgate)

        (W_in_to_hidden_update, W_hid_to_hidden_update, b_hidden_update,
         self.nonlinearity_hid) = create_gate_params(hidden_update)

        W_in_stacked = np.concatenate(
            [W_in_to_resetgate, W_in_to_updategate, W_in_to_hidden_update],
            axis=1)
        self.W_in_stacked = shared(W_in_stacked, 'W_in')

        W_hid_stacked = np.concatenate(
            [W_hid_to_resetgate, W_hid_to_updategate, W_hid_to_hidden_update],
            axis=1)
        self.W_hid_stacked = shared(W_hid_stacked, 'W_hid')

        b_stacked = np.concatenate(
            [b_resetgate, b_updategate, b_hidden_update], axis=0)
        self.b_stacked = shared(b_stacked, name='b')

        self.in_params = [self.W_in_stacked, self.b_stacked]
        self.rec_params = [self.W_hid_stacked]
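
The stacked parameters above follow the (reset, update, hidden-update) ordering, so column slices recover the individual gates. A minimal NumPy sketch of the single GRU step that these parameters support (the `gru_step` name is illustrative and not part of the original class):

import numpy as np

def gru_step(x, h_prev, W_in, W_hid, b, n_out):
    sigma = lambda z: 1.0 / (1.0 + np.exp(-z))
    pre_in = x.dot(W_in) + b       # (batch, 3 * n_out)
    pre_hid = h_prev.dot(W_hid)    # (batch, 3 * n_out)
    # Slices follow the stacking order above: reset, update, hidden update
    r = sigma(pre_in[:, :n_out] + pre_hid[:, :n_out])
    u = sigma(pre_in[:, n_out:2 * n_out] + pre_hid[:, n_out:2 * n_out])
    c = np.tanh(pre_in[:, 2 * n_out:] + r * pre_hid[:, 2 * n_out:])
    return (1.0 - u) * h_prev + u * c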
Example No. 4
    def __init__(self,
                 incoming,
                 num_units,
                 ingate=Gate(),
                 forgetgate=Gate(),
                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
                 outgate=Gate(),
                 nonlinearity=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 W_dt=init.GlorotUniform(),
                 b_dt=init.Constant(0.),
                 nonlinearity_dt=nonlinearities.rectify,
                 num_dt_layers=1,
                 **kwargs):

        super(LSTMDTLayer,
              self).__init__(incoming, num_units, ingate, forgetgate, cell,
                             outgate, nonlinearity, cell_init, hid_init,
                             backwards, learn_init, peepholes, gradient_steps,
                             grad_clipping, unroll_scan, precompute_input,
                             mask_input, only_return_final, **kwargs)

        self.nonlinearity_dt = (nonlinearities.identity if
                                nonlinearity_dt is None else nonlinearity_dt)
        self.num_dt_layers = num_dt_layers
        self.W_dt = [
            self.add_param(W_dt, (num_units, num_units), name="W_dt")
            for _ in range(self.num_dt_layers)
        ]
        self.b_dt = [
            self.add_param(b_dt, (1, num_units),
                           name="b_dt",
                           regularizable=False)
            for _ in range(self.num_dt_layers)
        ]
Example No. 5
    def _get_l_out(self, input_vars):
        listener.check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1

        l_hidden = DenseLayer(
            l_rec1_drop,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden,
                                         p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        l_out = DenseLayer(l_hidden_drop,
                           num_units=3,
                           nonlinearity=softmax,
                           name=id_tag + 'scores')

        return l_out, [l_in]
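
This method relies on module-level CELLS and NONLINEARITIES lookup tables that are not shown in the excerpt. A plausible minimal definition, assuming stock Lasagne layers and nonlinearities (the actual tables in the source project may contain more entries):

import lasagne

CELLS = {
    'LSTM': lasagne.layers.LSTMLayer,
    'GRU': lasagne.layers.GRULayer,
    'Vanilla': lasagne.layers.RecurrentLayer,
}
NONLINEARITIES = {
    'tanh': lasagne.nonlinearities.tanh,
    'rectify': lasagne.nonlinearities.rectify,
    'sigmoid': lasagne.nonlinearities.sigmoid,
}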
Example No. 6
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        context_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop,
                      name=id_tag + 'rec2',
                      only_return_final=True,
                      **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2
        # add only_return_final to l_rec1 and uncomment next line to remove second layer
        # l_rec2_drop = l_rec1_drop

        # Context repr has shape (batch_size, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id)
        l_concat = ConcatLayer([l_context_repr, l_rec2_drop],
                               axis=1,
                               name=id_tag + 'concat_context_rec2')
        l_hidden_drop = l_concat
        for i in range(1, self.options.listener_hidden_color_layers + 1):
            l_hidden = NINLayer(l_hidden_drop,
                                num_units=self.options.listener_cell_size,
                                nonlinearity=NONLINEARITIES[
                                    self.options.listener_nonlinearity],
                                name=id_tag + 'hidden_combined%d' % i)
            if self.options.listener_dropout > 0.0:
                l_hidden_drop = DropoutLayer(l_hidden,
                                             p=self.options.listener_dropout,
                                             name=id_tag + 'hidden_drop')
            else:
                l_hidden_drop = l_hidden

        l_scores = DenseLayer(l_hidden_drop,
                              num_units=self.context_len,
                              nonlinearity=softmax,
                              name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
Example No. 7
    def __init__(self, incoming, num_units,
                 resetgate=Gate(W_cell=None),
                 updategate=Gate(W_cell=None),
                 hidden_update=Gate(W_cell=None,
                                    nonlinearity=nonlinearities.tanh),
                 hid_init=init.Constant(0.),
                 cov_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have four
        # inputs - the layer input, the mask, the initial hidden state and the
        # initial coverage state.  We will just provide the layer input as
        # incomings, unless a mask input, initial hidden state or initial
        # coverage state was provided.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        self.cov_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1
        if isinstance(cov_init, Layer):
            incomings.append(cov_init)
            self.cov_init_incoming_index = len(incomings)-1

        # Initialize parent layer
        super(GRULayer, self).__init__(incomings, **kwargs)

        self.learn_init = learn_init
        self.num_units = num_units
        self.grad_clipping = grad_clipping
        self.backwards = backwards
        self.gradient_steps = gradient_steps
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]

        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        # Input dimensionality is the output dimensionality of the input layer
        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units,),
                                   name="b_{}".format(gate_name),
                                   regularizable=False),
                    gate.nonlinearity)

        # Add in all parameters from gates
        (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate,
         self.nonlinearity_updategate) = add_gate_params(updategate,
                                                         'updategate')
        (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate,
         self.nonlinearity_resetgate) = add_gate_params(resetgate, 'resetgate')

        (self.W_in_to_hidden_update, self.W_hid_to_hidden_update,
         self.b_hidden_update, self.nonlinearity_hid) = add_gate_params(
             hidden_update, 'hidden_update')

        # Initialize hidden state
        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1, self.num_units), name="hid_init",
                trainable=learn_init, regularizable=False)

        if isinstance(cov_init, Layer):
            self.cov_init = cov_init
        else:
            self.cov_init = self.add_param(
                cov_init, (1, self.num_units), name="cov_init",
                trainable=learn_init, regularizable=False)
Example No. 8
def get_rnn(input_var,
            mask_var,
            time_var,
            arch_size,
            GRAD_CLIP=100,
            bn=False,
            model_type='plstm'):
    # (batch size, max sequence length, number of features)
    l_in = lasagne.layers.InputLayer(shape=(None, None, 1),
                                     input_var=input_var)  #L0?
    # Mask as matrices of dimensionality (N_BATCH, MAX_LENGTH)
    l_mask = lasagne.layers.InputLayer(shape=(None, None),
                                       input_var=mask_var)  #l6
    # Time as matrices of dimensionality (N_BATCH, MAX_LENGTH)
    l_t = lasagne.layers.InputLayer(shape=(None, None),
                                    input_var=time_var)  #l5

    # Allows arbitrary sizes
    batch_size, seq_len, _ = input_var.shape

    if model_type == 'plstm':
        print('Using PLSTM.')
        # RNN layer 1
        l_forward = PLSTMLayer(
            l_in,
            time_input=l_t,
            num_units=arch_size[1],
            mask_input=l_mask,
            ingate=Gate(b=lasagne.init.Constant(-0.1)),
            forgetgate=Gate(b=lasagne.init.Constant(0),
                            nonlinearity=lasagne.nonlinearities.sigmoid),
            cell=Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=Gate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            grad_clipping=GRAD_CLIP,
            bn=bn,
            learn_time_params=[True, True, True],
            timegate=PLSTMTimeGate(Period=ExponentialUniformInit((1, 3)),
                                   Shift=lasagne.init.Uniform((0., 100)),
                                   On_End=lasagne.init.Constant(0.05)))

    else:
        print('Using LSTM, with BN: {}'.format(bn))
        # RNN layers
        l_forward = LSTMWBNLayer(
            lasagne.layers.ConcatLayer([
                l_in,
                lasagne.layers.ReshapeLayer(l_t, [batch_size, seq_len, 1])
            ],
                                       axis=2),
            num_units=arch_size[1],
            mask_input=l_mask,
            grad_clipping=GRAD_CLIP,
            ingate=Gate(b=lasagne.init.Constant(-0.1)),
            forgetgate=Gate(b=lasagne.init.Constant(0),
                            nonlinearity=lasagne.nonlinearities.sigmoid),
            cell=Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=Gate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=bn)

    # Keep only the final time step of the recurrent output
    l_slice = lasagne.layers.SliceLayer(l_forward, -1, axis=1)  #l11

    # Softmax
    l_dense = lasagne.layers.DenseLayer(
        l_slice,
        num_units=arch_size[2],
        nonlinearity=lasagne.nonlinearities.leaky_rectify)
    l_out = lasagne.layers.NonlinearityLayer(
        l_dense, nonlinearity=lasagne.nonlinearities.softmax)

    return l_out
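
A hedged sketch of how this builder might be driven, assuming the custom PLSTMLayer, PLSTMTimeGate, ExponentialUniformInit and LSTMWBNLayer used above are importable; the architecture sizes follow the signature's (input, hidden, output) convention and are illustrative.

import theano.tensor as T
import lasagne

input_var = T.tensor3('input')  # (batch, time, 1)
mask_var = T.matrix('mask')     # (batch, time)
time_var = T.matrix('time')     # (batch, time)

l_out = get_rnn(input_var, mask_var, time_var,
                arch_size=[1, 110, 2], model_type='plstm')
probs = lasagne.layers.get_output(l_out)  # symbolic softmax over classes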
Example No. 9
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        prev_output_var, mask_var = input_vars[-2:]
        color_input_vars = input_vars[:-2]

        context_len = self.context_len if hasattr(self, 'context_len') else 1
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=self.seq_vec.max_len - 1,
            cell_size=self.options.speaker_cell_size,
            context_len=context_len,
            id=self.id)
        l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
        for i in range(1, self.options.speaker_hidden_color_layers + 1):
            l_hidden_color = NINLayer(
                l_hidden_color,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_color%d' % i)
        l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_hidden_color, l_prev_embed],
                           axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input': (None if self.options.speaker_no_mask
                           else l_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec,
                                          p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag +
                     'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out,
                               num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax,
                               name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_prev_out, l_mask_in]
Example No. 10
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        color_mask_var, prev_output_var, mask_var = input_vars[-3:]
        color_input_vars = input_vars[:-3]

        num_contexts = color_mask_var.shape[0]
        num_colors = color_mask_var.shape[1]
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=0,
            cell_size=self.options.speaker_cell_size,
            context_len=None,
            id=self.id)
        l_color_reshaped = ReshapeLayer(
            l_color_repr,
            (num_contexts, num_colors, self.color_vec.output_size),
            name=id_tag + 'color_reshaped')
        l_color_mask_in = InputLayer(shape=(None, None),
                                     input_var=color_mask_var,
                                     name=id_tag + 'color_mask')

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input': (None if self.options.speaker_no_mask
                           else l_color_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        l_context_out = cell(l_color_reshaped,
                             name=id_tag + 'reccontext',
                             only_return_final=True,
                             **cell_kwargs)
        l_context_tiled = RepeatLayer(l_context_out,
                                      self.seq_vec.max_len - 1,
                                      name=id_tag + 'reccontext_tiled')

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_context_tiled, l_prev_embed],
                           axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell_kwargs['mask_input'] = (None if self.options.speaker_no_mask else
                                     l_mask_in)

        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec,
                                          p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag +
                     'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out,
                               num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax,
                               name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_color_mask_in, l_prev_out, l_mask_in]
Example No. 11
    def __init__(
            self,
            incoming,  # output of the input layer: (batch size, SEQ_LENGTH, num_features)
            num_units,  # number of hidden units (128)
            time_input,  # time input: (batch size, SEQ_LENGTH)
            duration_input,  # duration input: (batch size, SEQ_LENGTH)
            ingate=Gate(),
            cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=nonlinearities.tanh,
            cell_init=init.Constant(0.),
            hid_init=init.Constant(0.),
            hid1_init=init.Constant(0.),
            hid2_init=init.Constant(0.),
            backwards=False,
            learn_init=False,
            peepholes=True,
            gradient_steps=-1,
            grad_clipping=0,
            unroll_scan=False,
            precompute_input=True,
            mask_input=None,  # valid-sequence mask (1 1 1 1 ... 0 0 0 0): (batch size, SEQ_LENGTH)
            only_return_final=False,
            bn=False,
            tgate1=TimeGate(W_t=init.Uniform((-1, 0))),  # add: time gate
            boundary=-0.00001,  # add2: constraint ceil (purpose unclear)
            dgate2=DurationGate(),  # addv
            wgate=WGate(),
            **kwargs):
        # Build incomings as the list of all input layers, with incoming as the first element
        incomings = [incoming]
        # add: time is a required input
        incomings.append(time_input)
        self.time_incoming_index = len(incomings) - 1

        # addv: duration is a required input
        incomings.append(duration_input)
        self.duration_incoming_index = len(incomings) - 1

        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        self.cell_init_incoming_index = -1

        # v: a MergeLayer can have multiple input layers; append them to incomings, then call the parent __init__
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings) - 1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings) - 1
        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings) - 1

        # Initialize parent layer
        super(VDTLSTMEMLayer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        # v: standard Lasagne recurrent-layer options
        self.learn_init = learn_init  # learn the initial state (default: False)
        self.num_units = num_units  # default: 128
        self.backwards = backwards  # process the sequence backwards (default: False)
        self.peepholes = peepholes  # use peephole connections (default: True)
        self.gradient_steps = gradient_steps  # truncated-BPTT steps (default: -1, full backprop)
        self.grad_clipping = grad_clipping  # gradient clipping threshold (default: 0, disabled)
        self.unroll_scan = unroll_scan  # unroll scan into an explicit loop (default: False)
        self.precompute_input = precompute_input  # precompute input-to-hidden products (default: True)
        self.only_return_final = only_return_final  # return only the final time step (default: False)
        self.boundary = boundary  # add2

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Validate the inputs
        # input_shapes (provided by the base class) holds the shapes of the incoming layers
        input_shape = self.input_shapes[0]
        # add
        time_shape = self.input_shapes[1]

        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        # Input dimensionality is the product of the trailing dimensions
        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        def add_outgate_params(gate, gate_name):
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.W_to, (1, num_units),
                                   name="W_to_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # add
        def add_timegate_params(gate, gate_name):
            return (self.add_param(gate.W_t, (1, num_units),
                                   name="W_t_to_{}".format(gate_name)),
                    self.add_param(gate.W_x, (num_inputs, num_units),
                                   name="W_x_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name)),
                    gate.nonlinearity_inside, gate.nonlinearity_outside)

        # addv
        def add_duration_gate_params(gate, gate_name):
            return (self.add_param(gate.W_d, (1, num_units),
                                   name="W_d_to_{}".format(gate_name)),
                    self.add_param(gate.W_x, (num_inputs, num_units),
                                   name="W_x_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name)),
                    gate.nonlinearity, gate.nonlinearity_outside)

        # addvw
        def add_wgate_params(gate, gate_name):
            return (self.add_param(gate.W_x, (num_units, num_units),
                                   name="W_x_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="W_b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # Add the LSTM input gate
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')
        # Add the LSTM cell (candidate) gate
        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')
        # Add the LSTM output gate
        (self.W_in_to_outgate, self.W_hid_to_outgate, self.W_to_to_outgate,
         self.b_outgate,
         self.nonlinearity_outgate) = add_outgate_params(outgate, 'outgate')
        # add
        (self.W_t1_to_tg1, self.W_x1_to_tg1, self.b1_tg1,
         self.nonlinearity_inside_tg1,
         self.nonlinearity_outside_tg1) = add_timegate_params(
             tgate1, 'tgate1')

        (self.W_d2_to_dg2, self.W_x2_to_dg2, self.b2_dg2,
         self.nonlinearity_dg2,
         self.nonlinearity_outside_dg2) = add_duration_gate_params(
             dgate2, 'dgate2')

        # addvw: add a weight w
        (self.W_x_wg, self.b_wg,
         self.nonlinearity_wg) = add_wgate_params(wgate, 'wgate')

        # i.e. the cell state feeds into the input, output and forget gates (peephole connections)
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Initial values for the cell and hidden states
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(cell_init, (1, num_units),
                                            name="cell_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                           name="hid_init",
                                           trainable=learn_init,
                                           regularizable=False)

        if isinstance(hid1_init, Layer):
            self.hid1_init = hid1_init
        else:
            self.hid1_init = self.add_param(hid1_init, (1, self.num_units),
                                            name="hid1_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid2_init, Layer):
            self.hid2_init = hid2_init
        else:
            self.hid2_init = self.add_param(hid2_init, (1, self.num_units),
                                            name="hid2_init",
                                            trainable=learn_init,
                                            regularizable=False)

        # If bn is True, construct a BatchNormLayer (batch normalization of the inputs)
        # and register its parameters with this layer via self.params.update(self.bn.params)
        if bn:
            self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
            self.params.update(self.bn.params)
        else:
            self.bn = False
Example No. 12
    def __init__(self,
                 x,
                 hid_previous,
                 num_units,
                 resetgate=Gate(W_cell=None),
                 updategate=Gate(W_cell=None),
                 hidden_update=Gate(W_cell=None,
                                    nonlinearity=nonlinearities.tanh),
                 hid_init=init.Constant(0.),
                 learn_init=False,
                 grad_clipping=0,
                 **kwargs):

        if hid_previous.output_shape[-1] != num_units:
            raise ValueError('Number of hid_previous inputs should be the '
                             'same as num_units_gru')

        if x.output_shape[0] != hid_previous.output_shape[0]:
            raise ValueError('first dimension output of x and hid_previous '
                             'should be equal')

        # Initialize parent layer
        super(GRUCell, self).__init__([x, hid_previous], **kwargs)
        self.learn_init = learn_init
        self.num_units = num_units  # this could also be inferred?
        self.grad_clipping = grad_clipping
        self.unroll_scan = False  # a single-step cell has no scan to unroll

        # Retrieve the dimensionality of the incoming layer
        input_shape_x = self.input_shapes[0]

        # Input dimensionality is the output dimensionality of the input layer
        num_inputs_x = np.prod(input_shape_x[1:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs_x, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # Add in all parameters from gates
        (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate,
         self.nonlinearity_updategate) = add_gate_params(
             updategate, 'updategate')
        (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate,
         self.nonlinearity_resetgate) = add_gate_params(
             resetgate, 'resetgate')

        (self.W_in_to_hidden_update, self.W_hid_to_hidden_update,
         self.b_hidden_update,
         self.nonlinearity_hid) = add_gate_params(hidden_update,
                                                  'hidden_update')

        # Initialize hidden state
        self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                       name="hid_init",
                                       trainable=learn_init,
                                       regularizable=False)

        # Stack input weight matrices into a (num_inputs, 3*num_units_gru)
        # matrix, which speeds up computation
        self.W_in_stacked = T.concatenate([
            self.W_in_to_resetgate, self.W_in_to_updategate,
            self.W_in_to_hidden_update
        ],
                                          axis=1)

        # Same for hidden weight matrices
        self.W_hid_stacked = T.concatenate([
            self.W_hid_to_resetgate, self.W_hid_to_updategate,
            self.W_hid_to_hidden_update
        ],
                                           axis=1)

        # Stack gate biases into a (3*num_units_gru) vector
        self.b_stacked = T.concatenate(
            [self.b_resetgate, self.b_updategate, self.b_hidden_update],
            axis=0)
Example No. 13
    def __init__(self, incoming, time_input, num_units,
                 ingate=Gate(b=lasagne.init.Constant(0)),
                 forgetgate=Gate(b=lasagne.init.Constant(2), nonlinearity=nonlinearities.sigmoid),
                 timegate=PLSTMTimeGate(),
                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
                 outgate=Gate(),
                 nonlinearity=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 bn=False,
                 learn_time_params=[True, True, False],
                 off_alpha=1e-3,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have several
        # inputs - the layer input, the time input, the mask, the initial
        # hidden state and the initial cell state. We will just provide the
        # layer input and time input as incomings, unless a mask input,
        # initial hidden state or initial cell state was provided.
        incomings = [incoming]
        # TIME STUFF
        incomings.append(time_input)
        self.time_incoming_index = len(incomings)-1

        self.mask_incoming_index = -2
        self.hid_init_incoming_index = -2
        self.cell_init_incoming_index = -2
        #ADD TIME INPUT HERE
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1
        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings)-1

        # Initialize parent layer
        super(PLSTMLayer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        time_shape = self.input_shapes[1]


        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")
        # m
        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units,),
                                   name="b_{}".format(gate_name),
                                   regularizable=False),
                gate.nonlinearity)

        # PHASED LSTM: Initialize params for the time gate
        self.off_alpha = off_alpha
        if timegate is None:
            timegate = PLSTMTimeGate()
        def add_timegate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.Period, (num_units, ),
                                   name="Period_{}".format(gate_name),
                                   trainable=learn_time_params[0]),
                    self.add_param(gate.Shift, (num_units, ),
                                   name="Shift_{}".format(gate_name),
                                   trainable=learn_time_params[1]),
                    self.add_param(gate.On_End, (num_units, ),
                                   name="On_End_{}".format(gate_name),
                                   trainable=learn_time_params[2]))
        print('Learnableness: {}'.format(learn_time_params))
        (self.period_timegate, self.shift_timegate,
         self.on_end_timegate) = add_timegate_params(timegate, 'timegate')

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(forgetgate,
                                                         'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(
                ingate.W_cell, (num_units, ), name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(
                outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(
                cell_init, (1, num_units), name="cell_init",
                trainable=learn_init, regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1, self.num_units), name="hid_init",
                trainable=learn_init, regularizable=False)

        if bn:
            self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0,1))  # create BN layer for correct input shape
            self.params.update(self.bn.params)  # make BN params your params
        else:
            self.bn = False
Example No. 14
    def __init__(self,
                 x,
                 cell_previous,
                 hid_previous,
                 num_units,
                 ingate=Gate(),
                 forgetgate=Gate(),
                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
                 outgate=Gate(),
                 nonlinearity=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 learn_init=False,
                 peepholes=True,
                 grad_clipping=0,
                 **kwargs):

        if hid_previous.output_shape[-1] != num_units:
            raise ValueError('Number of hid_previous inputs should be the '
                             'same as num_units_lstm')

        if cell_previous.output_shape[-1] != num_units:
            raise ValueError('Number of cell_previous inputs should be the '
                             'same as num_units_lstm')

        if x.output_shape[0] != cell_previous.output_shape[0]:
            raise ValueError('first dimension output of x and hid_previous '
                             'should be equal')

        if x.output_shape[0] != hid_previous.output_shape[0]:
            raise ValueError('first dimension output of x and hid_previous '
                             'should be equal')

        # Initialize parent layer
        super(LSTMCell, self).__init__([x, cell_previous, hid_previous],
                                       **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        self.learn_init = learn_init
        self.num_units = num_units
        self.peepholes = peepholes
        self.grad_clipping = grad_clipping
        self.unroll_scan = False  # a single-step cell has no scan to unroll

        # Retrieve the dimensionality of the incoming layer
        input_shape_x = self.input_shapes[0]

        # Input dimensionality is the output dimensionality of the input layer
        num_inputs_x = np.prod(input_shape_x[1:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs_x, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(
             forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        self.cell_init = self.add_param(cell_init, (1, num_units),
                                        name="cell_init",
                                        trainable=learn_init,
                                        regularizable=False)

        self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                       name="hid_init",
                                       trainable=learn_init,
                                       regularizable=False)

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        self.W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,
            self.W_in_to_outgate
        ],
                                          axis=1)

        # Same for hidden weight matrices
        self.W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,
            self.W_hid_to_outgate
        ],
                                           axis=1)

        # Stack biases into a (4*num_units) vector
        self.b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
            axis=0)
Example No. 15
    def __init__(
            self,
            incoming,
            time_input,
            num_units,
            ingate=Gate(),
            forgetgate=Gate(),
            tgate1=TimeGate(W_t=init.Uniform((-1, 0))),
            tgate2=TimeGate(),
            cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=nonlinearities.tanh,
            cell_init=init.Constant(0.),
            hid_init=init.Constant(0.),
            backwards=False,
            learn_init=False,
            peepholes=True,
            gradient_steps=-1,
            grad_clipping=0,
            unroll_scan=False,
            precompute_input=True,
            mask_input=None,
            only_return_final=False,
            bn=False,
            boundary=-0.00001,  # constraint ceil
            **kwargs):

        # This layer inherits from a MergeLayer, because it can have several
        # inputs - the layer input, the time input, the mask, the initial
        # hidden state and the initial cell state. We will just provide the
        # layer input and time input as incomings, unless a mask input,
        # initial hidden state or initial cell state was provided.
        incomings = [incoming]
        incomings.append(time_input)
        self.time_incoming_index = len(incomings) - 1

        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        self.cell_init_incoming_index = -1
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings) - 1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings) - 1
        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings) - 1

        # Initialize parent layer
        super(TLSTM2Layer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final
        self.boundary = boundary

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        time_shape = self.input_shapes[1]

        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        def add_outgate_params(gate, gate_name):
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.W_to, (1, num_units),
                                   name="W_to_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        def add_timegate_params(gate, gate_name):
            return (self.add_param(gate.W_t, (1, num_units),
                                   name="W_t_to_{}".format(gate_name)),
                    self.add_param(gate.W_x, (num_inputs, num_units),
                                   name="W_x_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name)),
                    gate.nonlinearity_inside, gate.nonlinearity_outside)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(
             forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.W_to_to_outgate,
         self.b_outgate,
         self.nonlinearity_outgate) = add_outgate_params(outgate, 'outgate')

        (self.W_t1_to_tg1, self.W_x1_to_tg1, self.b1_tg1,
         self.nonlinearity_inside_tg1,
         self.nonlinearity_outside_tg1) = add_timegate_params(
             tgate1, 'tgate1')

        (self.W_t2_to_tg2, self.W_x2_to_tg2, self.b2_tg2,
         self.nonlinearity_inside_tg2,
         self.nonlinearity_outside_tg2) = add_timegate_params(
             tgate2, 'tgate2')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(cell_init, (1, num_units),
                                            name="cell_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                           name="hid_init",
                                           trainable=learn_init,
                                           regularizable=False)

        if bn:
            self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
            self.params.update(self.bn.params)
        else:
            self.bn = False
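In the peephole variant initialized above, each of the input, forget and output gates receives an elementwise contribution from the cell state, which is why the peephole weights are (num_units,) vectors rather than matrices. A rough numpy sketch of one gate for a single example (purely illustrative; the real computation happens inside the layer's scan over time):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

num_units = 4
x_proj = np.random.randn(num_units)            # input projection, e.g. x_t @ W_in_to_ingate
h_proj = np.random.randn(num_units)            # hidden projection, e.g. h_prev @ W_hid_to_ingate
c_prev = np.random.randn(num_units)            # previous cell state
w_cell_to_ingate = np.random.randn(num_units)  # peephole weights: a vector, not a matrix
b = np.zeros(num_units)

# Peephole connection: an elementwise product with the cell state.
ingate = sigmoid(x_proj + h_proj + w_cell_to_ingate * c_prev + b)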
Exemplo n.º 16
0
def get_rnn(event_var,
            feature_idx,
            feature_value,
            mask_var,
            time_var,
            arch_size,
            num_attention=0,
            embed_size=40,
            init_period=(1, 3),
            seq_len=1000,
            GRAD_CLIP=100,
            bn=False,
            model_type='LSTM'):

    #input layers
    l_in_event = lasagne.layers.InputLayer(shape=(None, seq_len),
                                           input_var=event_var)
    l_in_feature_idx = lasagne.layers.InputLayer(shape=(None, seq_len, 3),
                                                 input_var=feature_idx)
    l_in_feature_value = lasagne.layers.InputLayer(shape=(None, seq_len, 3),
                                                   input_var=feature_value)
    l_mask = lasagne.layers.InputLayer(shape=(None, seq_len),
                                       input_var=mask_var)
    l_t = lasagne.layers.InputLayer(shape=(None, seq_len), input_var=time_var)

    #embed event
    embed_event = lasagne.layers.EmbeddingLayer(l_in_event,
                                                input_size=3418,
                                                output_size=embed_size)
    #embed feature_idx
    embed_feature_idx = lasagne.layers.EmbeddingLayer(l_in_feature_idx,
                                                      input_size=649,
                                                      output_size=embed_size)
    #embed feature_value bias
    embed_feature_b = lasagne.layers.EmbeddingLayer(l_in_feature_idx,
                                                    input_size=649,
                                                    output_size=1)
    #embed feature_value trans
    embed_feature_trans = lasagne.layers.EmbeddingLayer(l_in_feature_idx,
                                                        input_size=649,
                                                        output_size=1)

    embed_params = [
        embed_event.W, embed_feature_idx.W, embed_feature_b.W,
        embed_feature_trans.W
    ]

    #get input_var
    l_in_merge = MergeEmbeddingLayer(embed_event, embed_feature_idx,
                                     embed_feature_b, embed_feature_trans,
                                     l_in_feature_value)

    if model_type == "LSTM":
        l_in_merge = lasagne.layers.ConcatLayer(
            [l_in_merge,
             lasagne.layers.ReshapeLayer(l_t, [-1, seq_len, 1])],
            axis=2)

    l_forward = HELSTMLayer(
        incoming=l_in_merge,
        time_input=l_t,
        event_input=embed_event,
        num_units=arch_size[1],
        num_attention=num_attention,
        model=model_type,
        mask_input=l_mask,
        ingate=Gate(),
        forgetgate=Gate(),
        cell=Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
        outgate=Gate(),
        nonlinearity=lasagne.nonlinearities.tanh,
        grad_clipping=GRAD_CLIP,
        bn=bn,
        only_return_final=True,
        timegate=HELSTMGate(Period=ExponentialUniformInit(init_period),
                            Shift=lasagne.init.Uniform((0., 1000)),
                            On_End=lasagne.init.Constant(0.05)))

    gate_params = []
    if model_type != 'LSTM':
        gate_params = l_forward.get_gate_params()

    # Softmax
    l_dense = lasagne.layers.DenseLayer(
        l_forward,
        num_units=arch_size[2],
        nonlinearity=lasagne.nonlinearities.leaky_rectify)
    l_out = lasagne.layers.NonlinearityLayer(
        l_dense, nonlinearity=lasagne.nonlinearities.softmax)
    return l_out, gate_params, embed_params
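A hedged usage sketch for get_rnn: the symbolic inputs are declared first and the returned output layer is passed to lasagne.layers.get_output. The tensor types follow the input shapes declared inside get_rnn; the arch_size values and the 'HELSTM' model_type string are illustrative assumptions, and the custom layers used inside get_rnn (MergeEmbeddingLayer, HELSTMLayer, HELSTMGate, ExponentialUniformInit) are assumed to be importable from the same module:

import theano.tensor as T
import lasagne

event_var = T.imatrix('events')             # (batch, seq_len) int event ids
feature_idx = T.itensor3('feature_idx')     # (batch, seq_len, 3) int feature ids
feature_value = T.tensor3('feature_value')  # (batch, seq_len, 3) float values
mask_var = T.matrix('mask')                 # (batch, seq_len) 0/1 mask
time_var = T.matrix('time')                 # (batch, seq_len) timestamps

l_out, gate_params, embed_params = get_rnn(
    event_var, feature_idx, feature_value, mask_var, time_var,
    arch_size=[None, 64, 2],                # [input, hidden, output] sizes (assumed)
    num_attention=0,
    model_type='HELSTM')                    # assumed non-'LSTM' model type

probs = lasagne.layers.get_output(l_out)    # (batch, arch_size[2]) softmax scores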
Exemplo n.º 17
0
    def _get_l_out(self, input_vars, multi_utt=None):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        extra_vars = input_vars[1:]

        if multi_utt is None:
            l_in = InputLayer(shape=(None, self.seq_vec.max_len), input_var=input_var,
                              name=id_tag + 'desc_input')
            l_in_flattened = l_in
        else:
            l_in = InputLayer(shape=(None, multi_utt, self.seq_vec.max_len), input_var=input_var,
                              name=id_tag + 'desc_input')
            l_in_flattened = reshape(l_in, (-1, self.seq_vec.max_len),
                                     name=id_tag + 'input_flattened')
        l_in_embed, context_vars = self.get_embedding_layer(l_in_flattened, extra_vars)

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', only_return_final=True, **cell_kwargs)
        if self.options.listener_bidi:
            l_rec1_backwards = cell(l_in_embed, name=id_tag + 'rec1_back', backwards=True,
                                    only_return_final=True, **cell_kwargs)
            l_rec1 = ConcatLayer([l_rec1, l_rec1_backwards], axis=1,
                                 name=id_tag + 'rec1_bidi_concat')
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1, p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1

        # (batch_size [ * multi_utt], repr_size)
        l_pred_mean = DenseLayer(l_rec1_drop, num_units=self.color_vec.output_size,
                                 nonlinearity=None, name=id_tag + 'pred_mean')
        # (batch_size [ * multi_utt], repr_size * repr_size)
        l_pred_covar_vec = DenseLayer(l_rec1_drop, num_units=self.color_vec.output_size ** 2,
                                      # initially produce identity matrix
                                      b=np.eye(self.color_vec.output_size,
                                               dtype=theano.config.floatX).ravel(),
                                      nonlinearity=None, name=id_tag + 'pred_covar_vec')
        # (batch_size [ * multi_utt], repr_size, repr_size)
        l_pred_covar = reshape(l_pred_covar_vec, ([0], self.color_vec.output_size,
                                                  self.color_vec.output_size),
                               name=id_tag + 'pred_covar')
        if multi_utt is not None:
            l_pred_mean = reshape(l_pred_mean, (-1, multi_utt, self.color_vec.output_size),
                                  name=id_tag + 'pred_mean_reshape')
            l_pred_covar = reshape(l_pred_covar, (-1, multi_utt, self.color_vec.output_size,
                                                  self.color_vec.output_size),
                                   name=id_tag + 'pred_covar_reshape')

        # Context repr has shape (batch_size, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id
        )
        l_context_points = reshape(l_context_repr, ([0], self.context_len,
                                                    self.color_vec.output_size))

        # (batch_size, [multi_utt,] context_len)
        l_unnorm_scores = GaussianScoreLayer(l_context_points, l_pred_mean, l_pred_covar,
                                             name=id_tag + 'gaussian_score')

        if multi_utt is not None:
            l_unnorm_scores = reshape(l_unnorm_scores, (-1, self.context_len),
                                      name=id_tag + 'gaussian_score_reshape')
        # (batch_size [ * multi_utt], context_len)
        # XXX: returning probs for normal models, log probs for AC model!
        # This is really surprising and definitely not the best solution.
        # We should be using log probs everywhere for stability...
        final_softmax = (softmax if multi_utt is None else logit_softmax_nd(axis=2))
        l_scores = NonlinearityLayer(l_unnorm_scores, nonlinearity=final_softmax,
                                     name=id_tag + 'scores')
        if multi_utt is not None:
            l_scores = reshape(l_unnorm_scores, (-1, multi_utt, self.context_len),
                               name=id_tag + 'scores_reshape')

        self.gaussian_fn = theano.function(input_vars, [get_output(l_pred_mean,
                                                                   deterministic=True),
                                                        get_output(l_pred_covar,
                                                                   deterministic=True),
                                                        get_output(l_context_points,
                                                                   deterministic=True),
                                                        get_output(l_unnorm_scores,
                                                                   deterministic=True)],
                                           name=id_tag + 'gaussian',
                                           on_unused_input='ignore')

        self.repr_fn = theano.function(input_vars, get_output(l_rec1_drop,
                                                              deterministic=True),
                                       name=id_tag + 'repr',
                                       on_unused_input='ignore')

        return l_scores, [l_in] + context_inputs
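The covariance head above gets a flattened identity matrix as its bias, so at initialization (with near-zero weights) every input is mapped to roughly the identity covariance. A small numpy illustration of that ravel/reshape round trip (sizes are made up):

import numpy as np

repr_size = 3
b = np.eye(repr_size).ravel()                    # bias of l_pred_covar_vec, length repr_size**2
pred_covar_vec = np.zeros(repr_size ** 2) + b    # head output before training, weights ~ 0
pred_covar = pred_covar_vec.reshape(repr_size, repr_size)
assert np.allclose(pred_covar, np.eye(repr_size))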
Exemplo n.º 18
0
    def _get_l_out(self, input_vars, multi_utt='ignored'):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        extra_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len), input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed, context_vars = self.get_embedding_layer(l_in, extra_vars)

        # Context repr has shape (batch_size, seq_len, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            recurrent_length=self.seq_vec.max_len,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id
        )
        l_hidden_context = dimshuffle(l_context_repr, (0, 2, 1))
        for i in range(1, self.options.listener_hidden_color_layers + 1):
            l_hidden_context = NINLayer(
                l_hidden_context, num_units=self.options.listener_cell_size,
                nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
                name=id_tag + 'hidden_context%d' % i)
        l_hidden_context = dimshuffle(l_hidden_context, (0, 2, 1))
        l_concat = ConcatLayer([l_hidden_context, l_in_embed], axis=2,
                               name=id_tag + 'concat_inp_context')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.listener_nonlinearity]

        l_rec1 = cell(l_concat, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1, p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2, p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_hidden = DenseLayer(l_rec2_drop, num_units=self.options.listener_cell_size,
                              nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
                              name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden, p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        l_scores = DenseLayer(l_hidden_drop, num_units=self.context_len, nonlinearity=softmax,
                              name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
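The dimshuffle/NINLayer/dimshuffle sandwich above applies the same dense transform to the context representation at every timestep: NINLayer mixes the channel axis (axis 1), so the sequence axis is temporarily moved out of the way. A shape-only numpy illustration of the two transposes around a per-position dense map (sizes are made up):

import numpy as np

batch, seq_len, in_feats, cell_size = 2, 7, 12, 5
x = np.random.randn(batch, seq_len, in_feats)     # like l_context_repr
W = np.random.randn(in_feats, cell_size)

x_t = x.transpose(0, 2, 1)                        # (batch, in_feats, seq_len): channels first
y_t = np.einsum('bis,ic->bcs', x_t, W)            # same weights applied at every position
y = y_t.transpose(0, 2, 1)                        # (batch, seq_len, cell_size): back to seq-major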
Exemplo n.º 19
0
    def __init__(self, incoming, num_units,
                 ingate=Gate(),
                 forgetgate=Gate(),
                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
                 outgate=Gate(),
                 nonlinearity=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 batch_norm=True,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have four
        # inputs - the layer input, the mask, the initial hidden state and the
        # inital cell state. We will just provide the layer input as incomings,
        # unless a mask input, inital hidden state or initial cell state was
        # provided.
        incomings = [incoming]
        self.mask_incoming_index = -1
        self.hid_init_incoming_index = -1
        self.cell_init_incoming_index = -1
        
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings)-1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings)-1
        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings)-1

        # Initialize parent layer
        super(LSTMLayer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final
        self.batch_norm = batch_norm

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        self.batch_size = input_shape[0]

        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    gate.nonlinearity)
        
        def add_gate_params_b(gate, gate_name):
            return self.add_param(gate.b, (num_units, ),
                                  name="b_{}".format(gate_name),
                                  regularizable=False)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(forgetgate,
                                                         'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        if not self.batch_norm:
            # add b
            self.b_ingate = add_gate_params_b(ingate, 'ingate')
            self.b_forgetgate = add_gate_params_b(forgetgate, 'forgetgate')
            self.b_cell = add_gate_params_b(cell, 'cell')
            self.b_outgate = add_gate_params_b(outgate, 'outgate')
            
        
        if self.batch_norm:
            # add 4 batch norm layers for i, f, c and o
            n_time_step = input_shape[1]
            bn_shape = (n_time_step, self.batch_size, 4*num_units)
            
            self.bn = SequenceBatchNorm(bn_shape, axes=(0,1))  # create BN layer for correct input shape
            self.params.update(self.bn.params)  # make BN params your params
            
        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(
                ingate.W_cell, (num_units, ), name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(
                outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(
                cell_init, (1, num_units), name="cell_init",
                trainable=learn_init, regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1, self.num_units), name="hid_init",
                trainable=learn_init, regularizable=False)
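Note the design choice above: the per-gate biases are only registered when batch_norm is off, since the normalization's own shift parameter can play the same role, and SequenceBatchNorm is applied to the stacked (time, batch, 4*num_units) pre-activations. A hedged usage sketch, assuming this custom LSTMLayer and SequenceBatchNorm are defined in the importing module:

import theano.tensor as T
import lasagne

x = T.tensor3('x')                                   # (batch, seq_len, features)
l_in = lasagne.layers.InputLayer((32, 20, 100), input_var=x)

# With batch_norm=True the gate biases are skipped and the BN parameters
# (registered via self.params.update) become part of this layer.
l_lstm = LSTMLayer(l_in, num_units=128, batch_norm=True)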
Exemplo n.º 20
0
def main(num_epochs=NUM_EPOCHS):
    print("Loading data ...")
    snli = SNLI(batch_size=BATCH_SIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 20), 'int32'), numpy.zeros(
            (50, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                     input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (50, 20), dtype='int32'), numpy.zeros((50, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_mask)

    # output shape (BATCH_SIZE, None, WE_DIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)  # how to set it to be non-trainable?

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP,
        backwards=True)

    # output dim: (BATCH_SIZE, None, 2*LSTM_HIDDEN)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # Attention mechanism to get sentence embedding
    # output dim: (BATCH_SIZE, None, ATTENTION_HIDDEN)
    l_ws1 = DenseLayer3DInput(l_concat, num_units=ATTENTION_HIDDEN)
    # output dim: (BATCH_SIZE, None, N_ROWS)
    l_ws2 = DenseLayer3DInput(l_ws1, num_units=N_ROWS, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 18), 'int32'), numpy.zeros(
            (50, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (50, 18), dtype='int32'), numpy.zeros((50, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 16), 'int32'), numpy.zeros(
            (50, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (50, 16), dtype='int32'), numpy.zeros((50, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                    2 * LSTM_HIDDEN),
                                             input_var=hypothesis_embedding)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BATCH_SIZE, N_ROWS,
                                                   2 * LSTM_HIDDEN),
                                            input_var=premise_embedding)

    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_factors = GatedEncoder3D([l_hypo_embed, l_pre_embed],
                               num_hfactors=2 * LSTM_HIDDEN)

    # Dropout:
    l_factors_noise = lasagne.layers.DropoutLayer(l_factors,
                                                  p=GAEREG,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_noise,
        num_units=OUT_HIDDEN,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Dropout:
    l_outhid_noise = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=GAEREG,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_noise,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * 50, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)

    # penalty term and cost
    attention_penalty = T.mean(
        (
            T.batched_dot(
                hypothesis_annotation,
                # pay attention to this line:
                # T.extra_ops.cpu_contiguous(hypothesis_annotation.dimshuffle(0, 2, 1))
                hypothesis_annotation.dimshuffle(0, 2, 1)) -
            T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)
    ) + T.mean(
        (
            T.batched_dot(
                premise_annotation,
                # T.extra_ops.cpu_contiguous(premise_annotation.dimshuffle(0, 2, 1))  # ditto.
                premise_annotation.dimshuffle(0, 2, 1)  # ditto.
            ) - T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2))

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values) + \
                  ATTENTION_PENALTY * attention_penalty)
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values) + \
                        ATTENTION_PENALTY * attention_penalty)

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}".format(numparams))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # withoutwe_params = all_params + [l_word_embed.W]

    # Compute updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, error_rate],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, error_rate_clean])

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                       1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BATCH_SIZE, LEARNING_RATE,
                           end - start, train_set_cost, train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    test_set_cost, test_set_error = evaluate('test')
                    print("***dev  cost %f, error %f" %
                          (dev_set_cost, dev_set_error))
                    print("***test cost %f, error %f" %
                          (test_set_cost, test_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            # load params
            # all_param_values = cPickle.load(open('params' + os.sep + 'params_' + filename, 'rb'))
            # for p, v in zip(all_params, all_param_values):
            #     p.set_value(v)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
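The attention_penalty above is an orthogonality-style penalty on the annotation matrices: it pushes A A^T toward the identity so that different attention rows attend to different positions, as in structured self-attention sentence encoders. A numpy sketch of the same kind of quantity for a single annotation matrix (shapes are illustrative):

import numpy as np

n_rows, seq_len = 4, 10
A = np.random.rand(n_rows, seq_len)
A = A / A.sum(axis=1, keepdims=True)       # rows behave like softmax attention weights

penalty = np.mean((A.dot(A.T) - np.eye(n_rows)) ** 2)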
Exemplo n.º 21
0
    def __init__(
            self,
            incoming,  # the x input, shape (None, features)
            time_input,  # timestamps for each step of the batch, int
            num_units,  # number of hidden units
            mask_input=None,  # mask selecting which time steps are valid
            cell_init=init.Constant(0.),  # initial cell state
            hid_init=init.Constant(0.),  # initial hidden state
            ingate=Gate(b=lasagne.init.Constant(0)),  # input gate
            forgetgate=Gate(b=lasagne.init.Constant(2),
                            nonlinearity=nonlinearities.sigmoid),  # forget gate
            timegate=PLSTMTimeGate(),  # time gate
            cell=Gate(W_cell=None,
                      nonlinearity=nonlinearities.tanh),  # cell-input gate
            outgate=Gate(),  # output gate
            nonlinearity=nonlinearities.tanh,  # nonlinearity applied to the layer output
            backwards=False,
            learn_init=False,
            peepholes=True,  # whether to use peephole connections
            gradient_steps=-1,
            grad_clipping=0,
            unroll_scan=False,
            precompute_input=True,
            only_return_final=False,
            bn=False,  # whether to use the BN-LSTM variant
            learn_time_params=[True, True, False],  # whether Period/Shift/On_End are trainable
            off_alpha=1e-3,  # leak rate when the time gate is closed
            **kwargs):

        # This layer inherits from a MergeLayer, because it can have four
        # inputs - the layer input, the mask, the initial hidden state and the
        # inital cell state. We will just provide the layer input as incomings,
        # unless a mask input, inital hidden state or initial cell state was
        # provided.

        incomings = [incoming]

        # TIME STUFF: the time input is always the second incoming.
        incomings.append(time_input)
        self.time_incoming_index = len(incomings) - 1

        self.mask_incoming_index = -2
        self.hid_init_incoming_index = -2
        self.cell_init_incoming_index = -2

        # incomings collects all layer inputs: the x input and, optionally,
        # the mask, the initial hidden state and the initial cell state.  The
        # *_incoming_index attributes record where each one sits in that list;
        # the negative defaults mean "not provided".  Optional inputs are only
        # appended below if they were actually passed in.
        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings) - 1  # update the index

        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings) - 1  # update the index

        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings) - 1  # update the index

        # Initialize parent layer
        super(PLSTMLayer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        # Unrolled scans backpropagate through the full sequence, so a
        # truncation length makes no sense in that case.
        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layers
        input_shape = self.input_shapes[0]
        time_shape = self.input_shapes[1]

        # When unrolling the scan, the sequence length must be known at
        # compile time, so it cannot be None.
        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        # num_inputs is the number of features: everything except batch_size
        # (input_shape[0]) and the time steps (input_shape[1]), multiplied
        # together.
        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False),
                    gate.nonlinearity)

        # PHASED LSTM: Initialize params for the time gate
        self.off_alpha = off_alpha  # leak rate while the time gate is closed

        if timegate is None:  # fall back to a default time gate instance
            timegate = PLSTMTimeGate()

        def add_timegate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a
            PLSTMTimeGate instance.  Period, Shift and On_End are per-unit
            vectors; their trainability is set by learn_time_params. """
            return (self.add_param(gate.Period, (num_units, ),
                                   name="Period_{}".format(gate_name),
                                   trainable=learn_time_params[0]),
                    self.add_param(gate.Shift, (num_units, ),
                                   name="Shift_{}".format(gate_name),
                                   trainable=learn_time_params[1]),
                    self.add_param(gate.On_End, (num_units, ),
                                   name="On_End_{}".format(gate_name),
                                   trainable=learn_time_params[2]))

        print('Learnableness: {}'.format(learn_time_params))

        # Initialize the time-gate parameters
        (self.period_timegate, self.shift_timegate,
         self.on_end_timegate) = add_timegate_params(timegate, 'timegate')

        # Add in parameters from the supplied Gate instances
        # (input gate, forget gate, cell input and output gate)
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(
             forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        # Each of the input, forget and output gates has its own W_cell
        # peephole weight vector.

        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units.  If an init
        # is itself a Layer, it is used directly; otherwise it becomes a
        # parameter of this layer (learnable when learn_init is True).
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(cell_init, (1, num_units),
                                            name="cell_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                           name="hid_init",
                                           trainable=learn_init,
                                           regularizable=False)

        if bn:  # optional BN-LSTM variant
            self.bn = lasagne.layers.BatchNormLayer(
                input_shape,
                axes=(0, 1))  # create BN layer for correct input shape
            self.params.update(self.bn.params)  # register BN params with this layer
        else:
            self.bn = False
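The Period, Shift and On_End parameters registered above, together with off_alpha, define the phased-LSTM time gate: each unit is fully updated only during a short "open" fraction of its period and leaks slightly the rest of the time. A hedged numpy sketch of that openness schedule for one unit, following the schedule described in the Phased LSTM paper (the layer's actual recurrence lives in its get_output_for, which is not part of this snippet):

import numpy as np

def time_gate_openness(t, period, shift, r_on, off_alpha=1e-3):
    phi = ((t - shift) % period) / period        # phase within the cycle, in [0, 1)
    rising = 2.0 * phi / r_on                    # first half of the open phase
    falling = 2.0 - 2.0 * phi / r_on             # second half of the open phase
    closed = off_alpha * phi                     # closed phase: small leak
    return np.where(phi < 0.5 * r_on, rising,
                    np.where(phi < r_on, falling, closed))

t = np.linspace(0.0, 10.0, 200)
k = time_gate_openness(t, period=2.0, shift=0.3, r_on=0.05)   # openness in [0, 1]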
Exemplo n.º 22
0
def main(num_epochs=NEPOCH):
    if DSET == 'yelp':
        print("Loading yelp dataset ...")
        loaded_dataset = YELP(
            batch_size=BSIZE,
            datapath="/home/hantek/datasets/NLC_data/yelp/word2vec_yelp.pkl")
    elif DSET == 'age2':
        print("Loading age2 dataset ...")
        loaded_dataset = AGE2(
            batch_size=BSIZE,
            datapath="/home/hantek/datasets/NLC_data/age2/word2vec_age2.pkl")
    else:
        raise ValueError("DSET was set incorrectly. Check your cmd args.")
    #                     yelp     age2
    # train data        500000    68450
    # dev/test data       2000     4000
    # vocab                      ~1.2e5
    #

    train_batches = list(loaded_dataset.train_minibatch_generator())
    dev_batches = list(loaded_dataset.dev_minibatch_generator())
    test_batches = list(loaded_dataset.test_minibatch_generator())
    W_word_embedding = loaded_dataset.weight  # W shape: (# vocab size, WE_DIM)
    del loaded_dataset

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20),
                              'int32'), numpy.zeros(
                                  (BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 20), dtype='int32'), numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        only_return_final=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        only_return_final=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # output dim: (BSIZE, 2*LSTMHID)
    l_maxpool = Maxpooling(l_concat, axis=1)
    l_maxpool_dpout = lasagne.layers.DropoutLayer(l_maxpool,
                                                  p=DPOUT,
                                                  rescale=True)

    l_outhid = lasagne.layers.DenseLayer(
        l_maxpool_dpout,
        num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=DPOUT,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout,
        num_units=5,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() + \
               (l_forward.W_hid_to_ingate ** 2).sum() + \
               (l_forward.W_in_to_forgetgate ** 2).sum() + \
               (l_forward.W_hid_to_forgetgate ** 2).sum() + \
               (l_forward.W_in_to_cell ** 2).sum() + \
               (l_forward.W_hid_to_cell ** 2).sum() + \
               (l_forward.W_in_to_outgate ** 2).sum() + \
               (l_forward.W_hid_to_outgate ** 2).sum() + \
               (l_backward.W_in_to_ingate ** 2).sum() + \
               (l_backward.W_hid_to_ingate ** 2).sum() + \
               (l_backward.W_in_to_forgetgate ** 2).sum() + \
               (l_backward.W_hid_to_forgetgate ** 2).sum() + \
               (l_backward.W_in_to_cell ** 2).sum() + \
               (l_backward.W_hid_to_cell ** 2).sum() + \
               (l_backward.W_in_to_outgate ** 2).sum() + \
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_outputhid = (l_outhid.W**2).sum()
    L2_softmax = (l_output.W**2).sum()
    L2 = L2_lstm + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(network_output,
                                                  target_values)) + \
           L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean,
                                                        target_values)) + \
                 L2REG * L2

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([l_in.input_var, l_mask.input_var, target_values],
                            [cost, accuracy],
                            updates=updates)
    compute_cost = theano.function(
        [l_in.input_var, l_mask.input_var, target_values],
        [cost_clean, accuracy_clean])
    predict = theano.function([l_in.input_var, l_mask.input_var],
                              network_prediction_clean)

    def evaluate(mode, verbose=False):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                           1.0 / batches_seen * _accuracy
        if verbose:
            predicted = []
            truth = []
            for batches_seen, (sent, mask, th) in enumerate(data, 1):
                predicted.append(predict(sent, mask))
                truth.append(th)
            truth = numpy.concatenate(truth)
            predicted = numpy.concatenate(predicted)
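            # Cohen's kappa: pr_a is the observed agreement (overall accuracy),
            # pr_e the agreement expected by chance from the row/column marginals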
            cm = confusion_matrix(truth, predicted)
            pr_a = cm.trace() * 1.0 / truth.size
            pr_e = ((cm.sum(axis=0)*1.0/truth.size) * \
                    (cm.sum(axis=1)*1.0/truth.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    print("BEFORE TRAINING: test cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, truth) in enumerate(train_batches, 1):
                _cost, _accuracy = train(hypo, hm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_accuracy = (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                                     1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    test_set_cost, test_set_accuracy = evaluate('test')
                    print("RECORD: cost: train %f dev %f test %f\n"
                          "        accu: train %f dev %f test %f" %
                          (train_set_cost, dev_set_cost, test_set_cost,
                           train_set_accuracy, dev_set_accuracy,
                           test_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
            print("RECORD:epoch %d, cost: train %f dev %f test %f\n"
                  "         accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
Exemplo n.º 23
0
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        context_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed,
                      name=id_tag + 'rec1',
                      only_return_final=True,
                      **cell_kwargs)
        if self.options.listener_bidi:
            l_rec1_backwards = cell(l_in_embed,
                                    name=id_tag + 'rec1_back',
                                    backwards=True,
                                    only_return_final=True,
                                    **cell_kwargs)
            l_rec1 = ConcatLayer([l_rec1, l_rec1_backwards],
                                 axis=1,
                                 name=id_tag + 'rec1_bidi_concat')
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1

        # (batch_size, repr_size)
        l_pred_mean = DenseLayer(l_rec1_drop,
                                 num_units=self.color_vec.output_size,
                                 nonlinearity=None,
                                 name=id_tag + 'pred_mean')
        # (batch_size, repr_size * repr_size)
        l_pred_covar_vec = DenseLayer(
            l_rec1_drop,
            num_units=self.color_vec.output_size**2,
            # initially produce identity matrix
            b=np.eye(self.color_vec.output_size,
                     dtype=theano.config.floatX).ravel(),
            nonlinearity=None,
            name=id_tag + 'pred_covar_vec')
        # (batch_size, repr_size, repr_size)
        l_pred_covar = reshape(
            l_pred_covar_vec,
            ([0], self.color_vec.output_size, self.color_vec.output_size),
            name=id_tag + 'pred_covar')

        # Context repr has shape (batch_size, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id)
        l_context_points = reshape(
            l_context_repr,
            ([0], self.context_len, self.color_vec.output_size))

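        # GaussianScoreLayer presumably scores each context color under a
        # Gaussian with the predicted mean/covariance; the softmax below turns
        # these unnormalized scores into a distribution over the context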
        l_unnorm_scores = GaussianScoreLayer(l_context_points,
                                             l_pred_mean,
                                             l_pred_covar,
                                             name=id_tag + 'gaussian_score')

        l_scores = NonlinearityLayer(l_unnorm_scores,
                                     nonlinearity=softmax,
                                     name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
    def __init__(self,
                 incoming,
                 num_units,
                 in_dropout=0.0,
                 hid_dropout=0.0,
                 ingate=Gate(),
                 forgetgate=Gate(),
                 cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
                 outgate=Gate(),
                 nonlinearity=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1,
                 grad_clipping=0,
                 unroll_scan=False,
                 precompute_input=True,
                 mask_input=None,
                 only_return_final=False,
                 **kwargs):

        # This layer inherits from a MergeLayer, because it can have two
        # inputs - the layer input, and the mask.  We will just provide the
        # layer input as incomings, unless a mask input was provided.
        incomings = [incoming]
        if mask_input is not None:
            incomings.append(mask_input)

        # Initialize parent layer
        super(LSTMDropoutLayer, self).__init__(incomings, **kwargs)

        # If the provided nonlinearity is None, make it linear
        if nonlinearity is None:
            self.nonlinearity = nonlinearities.identity
        else:
            self.nonlinearity = nonlinearity

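        # random stream, presumably used to sample the input/hidden dropout
        # masks inside the recurrence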
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p_in = in_dropout
        self.p_hid = hid_dropout
        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.precompute_input = precompute_input
        self.only_return_final = only_return_final

        if unroll_scan and gradient_steps != -1:
            raise ValueError(
                "Gradient steps must be -1 when unroll_scan is true.")

        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]

        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

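        # treat everything after (batch, seq_len) as flattened input features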
        num_inputs = np.prod(input_shape[2:])
        self.num_inputs = num_inputs

        def add_gate_params(gate, gate_name):
            """ Convenience function for adding layer parameters from a Gate
            instance. """
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(
             forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, T.TensorVariable):
            if cell_init.ndim != 2:
                raise ValueError(
                    "When cell_init is provided as a TensorVariable, it should"
                    " have 2 dimensions and have shape (num_batch, num_units)")
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(cell_init, (1, num_units),
                                            name="cell_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid_init, T.TensorVariable):
            if hid_init.ndim != 2:
                raise ValueError(
                    "When hid_init is provided as a TensorVariable, it should "
                    "have 2 dimensions and have shape (num_batch, num_units)")
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                           name="hid_init",
                                           trainable=learn_init,
                                           regularizable=False)
Exemplo n.º 25
0
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]
        context_vars = input_vars[1:]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        # Context repr has shape (batch_size, seq_len, context_len * repr_size)
        l_context_repr, context_inputs = self.color_vec.get_input_layer(
            context_vars,
            recurrent_length=self.seq_vec.max_len,
            cell_size=self.options.listener_cell_size,
            context_len=self.context_len,
            id=self.id)
        l_context_repr = reshape(
            l_context_repr,
            ([0], [1], self.context_len, self.color_vec.output_size))
        l_hidden_context = dimshuffle(l_context_repr, (0, 3, 1, 2),
                                      name=id_tag + 'shuffle_in')
        for i in range(1, self.options.listener_hidden_color_layers + 1):
            l_hidden_context = NINLayer(
                l_hidden_context,
                num_units=self.options.listener_cell_size,
                nonlinearity=NONLINEARITIES[
                    self.options.listener_nonlinearity],
                b=Constant(0.1),
                name=id_tag + 'hidden_context%d' % i)
        l_pool = FeaturePoolLayer(l_hidden_context,
                                  pool_size=self.context_len,
                                  axis=3,
                                  pool_function=T.mean,
                                  name=id_tag + 'pool')
        l_pool_squeezed = reshape(l_pool, ([0], [1], [2]),
                                  name=id_tag + 'pool_squeezed')
        l_pool_shuffle = dimshuffle(l_pool_squeezed, (0, 2, 1),
                                    name=id_tag + 'shuffle_out')
        l_concat = ConcatLayer([l_pool_shuffle, l_in_embed],
                               axis=2,
                               name=id_tag + 'concat_inp_context')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        # l_rec1_drop = l_concat
        l_rec1 = cell(l_concat, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop,
                      name=id_tag + 'rec2',
                      only_return_final=True,
                      **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_rec2_drop = NINLayer(l_rec2_drop,
                               num_units=self.options.listener_cell_size,
                               nonlinearity=None,
                               name=id_tag + 'rec2_dense')

        # Context is fed into the RNN as one copy for each time step; just use
        # the first time step for output.
        # Input shape: (batch_size, repr_size, seq_len, context_len)
        # Output shape: (batch_size, repr_size, context_len)
        l_context_nonrec = SliceLayer(l_hidden_context,
                                      indices=0,
                                      axis=2,
                                      name=id_tag + 'context_nonrec')
        l_pool_nonrec = SliceLayer(l_pool_squeezed,
                                   indices=0,
                                   axis=2,
                                   name=id_tag + 'pool_nonrec')

        # Output shape: (batch_size, repr_size, context_len)
        l_sub = broadcast_sub_layer(
            l_pool_nonrec,
            l_context_nonrec,
            feature_dim=self.options.listener_cell_size,
            id_tag=id_tag)
        # Output shape: (batch_size, repr_size * 2, context_len)
        l_concat_sub = ConcatLayer([l_context_nonrec, l_sub],
                                   axis=1,
                                   name=id_tag + 'concat_inp_context')
        # Output shape: (batch_size, cell_size, context_len)
        l_hidden = NINLayer(l_concat_sub,
                            num_units=self.options.listener_cell_size,
                            nonlinearity=None,
                            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden,
                                         p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden

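        # broadcast_dot_layer presumably scores each context color by the dot
        # product between the description representation and that color's
        # hidden representation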
        l_dot = broadcast_dot_layer(
            l_rec2_drop,
            l_hidden_drop,
            feature_dim=self.options.listener_cell_size,
            id_tag=id_tag)
        l_dot_bias = l_dot  # BiasLayer(l_dot, name=id_tag + 'dot_bias')
        l_dot_clipped = NonlinearityLayer(
            l_dot_bias,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'dot_clipped')
        l_scores = NonlinearityLayer(l_dot_clipped,
                                     nonlinearity=softmax,
                                     name=id_tag + 'scores')

        return l_scores, [l_in] + context_inputs
Exemplo n.º 26
0
    def __init__(
            self,
            incoming,
            time_input,
            event_input,
            num_units,
            num_attention,
            model='HELSTM',  #model options: LSTM, PLSTM or HELSTM
            mask_input=None,
            ingate=Gate(),
            forgetgate=Gate(),
            cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
            timegate=HELSTMGate(),
            nonlinearity=nonlinearities.tanh,
            cell_init=init.Constant(0.),
            hid_init=init.Constant(0.),
            outgate=Gate(),
            backwards=False,
            learn_init=False,
            peepholes=True,
            grad_clipping=0,
            bn=False,
            only_return_final=False,
            off_alpha=1e-3,
            **kwargs):
        incomings = [incoming, time_input, event_input]
        self.time_incoming_idx = 1
        self.event_incoming_idx = 2
        self.mask_incoming_index = -2
        self.hid_init_incoming_index = -2
        self.cell_init_incoming_index = -2

        if mask_input is not None:
            incomings.append(mask_input)
            self.mask_incoming_index = len(incomings) - 1
        if isinstance(hid_init, Layer):
            incomings.append(hid_init)
            self.hid_init_incoming_index = len(incomings) - 1
        if isinstance(cell_init, Layer):
            incomings.append(cell_init)
            self.cell_init_incoming_index = len(incomings) - 1

        super(HELSTMLayer, self).__init__(incomings, **kwargs)

        self.nonlinearity = nonlinearity
        self.learn_init = learn_init
        self.num_units = num_units
        self.num_attention = num_attention
        self.peepholes = peepholes
        self.grad_clipping = grad_clipping
        self.backwards = backwards
        self.off_alpha = off_alpha
        self.only_return_final = only_return_final
        self.model = model
        if self.model == 'LSTM':
            print('using LSTM')
        elif self.model == 'PLSTM':
            print('using PLSTM')
        else:
            assert self.model == 'HELSTM'
            print('using HELSTM')

        input_shape = self.input_shapes[0]
        num_inputs = np.prod(input_shape[2:])

        def add_gate_params(gate, gate_name):
            return (self.add_param(gate.W_in, (num_inputs, num_units),
                                   name="W_in_to_{}".format(gate_name)),
                    self.add_param(gate.W_hid, (num_units, num_units),
                                   name="W_hid_to_{}".format(gate_name)),
                    self.add_param(gate.b, (num_units, ),
                                   name="b_{}".format(gate_name),
                                   regularizable=False), gate.nonlinearity)

        # Add in parameters from the supplied Gate instances
        (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
         self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

        (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
         self.nonlinearity_forgetgate) = add_gate_params(
             forgetgate, 'forgetgate')

        (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
         self.nonlinearity_cell) = add_gate_params(cell, 'cell')

        (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
         self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(ingate.W_cell,
                                                   (num_units, ),
                                                   name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(outgate.W_cell,
                                                    (num_units, ),
                                                    name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, Layer):
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(cell_init, (1, num_units),
                                            name="cell_init",
                                            trainable=learn_init,
                                            regularizable=False)

        if isinstance(hid_init, Layer):
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                           name="hid_init",
                                           trainable=learn_init,
                                           regularizable=False)

        if bn:
            self.bn = lasagne.layers.BatchNormLayer(
                input_shape,
                axes=(0, 1))  # create BN layer for correct input shape
            self.params.update(self.bn.params)  # make BN params your params
        else:
            self.bn = False

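        # Phased-LSTM-style time gate parameters (Period, Shift, On_End); the
        # HELSTM variant additionally learns an event-attention projection
        # (Event_W/Event_b followed by out_W/out_b)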
        def add_timegate_params(gate, gate_name, attention=False):
            params = [
                self.add_param(gate.Period, (num_units, ),
                               name="Period_{}".format(gate_name)),
                self.add_param(gate.Shift, (num_units, ),
                               name="Shift_{}".format(gate_name)),
                self.add_param(gate.On_End, (num_units, ),
                               name="On_End_{}".format(gate_name))
            ]
            if attention:
                params += [
                    self.add_param(gate.Event_W, (num_inputs, num_attention),
                                   name="Event_W_{}".format(gate_name)),
                    self.add_param(gate.Event_b, (num_attention, ),
                                   name="Event_b_{}".format(gate_name)),
                    self.add_param(gate.out_W, (num_attention, num_units),
                                   name="out_W_{}".format(gate_name)),
                    self.add_param(gate.out_b, (num_units, ),
                                   name="out_b_{}".format(gate_name))
                ]
            return params

        if model != 'LSTM':
            if model == 'PLSTM':
                (self.period_timegate, self.shift_timegate,
                 self.on_end_timegate) = add_timegate_params(
                     timegate, 'timegate')
            else:
                assert model == 'HELSTM'
                (self.period_timegate, self.shift_timegate,
                 self.on_end_timegate, self.event_w_timegate,
                 self.event_b_timegate, self.out_w_timegate,
                 self.out_b_timegate) = add_timegate_params(timegate,
                                                            'timegate',
                                                            attention=True)
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
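    # synthetic test values so Theano's compute_test_value mode can check shapes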
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20),
                              'int32'), numpy.zeros(
                                  (BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 20), dtype='int32'), numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD),
                    W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD),
                        W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD),
                  W_hid=init.Normal(STD),
                  W_cell=None,
                  nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD),
                     W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
    l_concat_dpout = lasagne.layers.DropoutLayer(
        l_concat, p=DPOUT, rescale=True)  # might not need this line

    # Attention mechanism to get sentence embedding
    # output dim: (BSIZE, None, ATTHID)
    l_ws1 = DenseLayer3DInput(l_concat_dpout, num_units=ATTHID)
    l_ws1_dpout = lasagne.layers.DropoutLayer(l_ws1, p=DPOUT, rescale=True)

    # output dim: (BSIZE, None, NROW)
    l_ws2 = DenseLayer3DInput(l_ws1_dpout, num_units=NROW, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BSIZE, 2*LSTMHID, NROW)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])
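    # each of the NROW annotation rows presumably yields one weighted
    # combination of the BiLSTM states (a matrix sentence embedding)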

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18),
                              'int32'), numpy.zeros(
                                  (BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 18), dtype='int32'), numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16),
                              'int32'), numpy.zeros(
                                  (BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack((numpy.ones(
        (BSIZE, 16), dtype='int32'), numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        })
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        })

    hypothesis_embedding_clean, hypothesis_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_h.input_var,
            l_mask: l_mask_h.input_var
        },
        deterministic=True)
    premise_embedding_clean, premise_annotation_clean = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations], {
            l_in: l_in_p.input_var,
            l_mask: l_mask_p.input_var
        },
        deterministic=True)

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                             input_var=hypothesis_embedding)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(l_hypo_embed,
                                                     p=DPOUT,
                                                     rescale=True)
    l_pre_embed = lasagne.layers.InputLayer(shape=(BSIZE, NROW, 2 * LSTMHID),
                                            input_var=premise_embedding)
    l_pre_embed_dpout = lasagne.layers.DropoutLayer(l_pre_embed,
                                                    p=DPOUT,
                                                    rescale=True)

    # output dim: (BSIZE, NROW, 2*LSTMHID)
    l_factors = GatedEncoder3D([l_hypo_embed_dpout, l_pre_embed_dpout],
                               num_hfactors=2 * LSTMHID)
    l_factors_dpout = lasagne.layers.DropoutLayer(l_factors,
                                                  p=DPOUT,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_dpout,
        num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid,
                                                 p=DPOUT,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([
        1,
    ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output, {
            l_hypo_embed: hypothesis_embedding_clean,
            l_pre_embed: premise_embedding_clean
        },
        deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    # penalty term and cost
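    # mean squared deviation of A.dot(A^T) from the identity, which pushes the
    # NROW attention rows of each sentence to attend to different positions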
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)) + T.mean(
            (T.batched_dot(premise_annotation,
                           premise_annotation.dimshuffle(0, 2, 1)) -
             T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
            axis=(0, 1, 2))

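    # L2 penalties over the LSTM gate weights (both directions), the attention
    # MLPs, the gated-encoder factor weights, and the output layers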
    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() + \
               (l_forward.W_hid_to_ingate ** 2).sum() + \
               (l_forward.W_in_to_forgetgate ** 2).sum() + \
               (l_forward.W_hid_to_forgetgate ** 2).sum() + \
               (l_forward.W_in_to_cell ** 2).sum() + \
               (l_forward.W_hid_to_cell ** 2).sum() + \
               (l_forward.W_in_to_outgate ** 2).sum() + \
               (l_forward.W_hid_to_outgate ** 2).sum() + \
               (l_backward.W_in_to_ingate ** 2).sum() + \
               (l_backward.W_hid_to_ingate ** 2).sum() + \
               (l_backward.W_in_to_forgetgate ** 2).sum() + \
               (l_backward.W_hid_to_forgetgate ** 2).sum() + \
               (l_backward.W_in_to_cell ** 2).sum() + \
               (l_backward.W_hid_to_cell ** 2).sum() + \
               (l_backward.W_in_to_outgate ** 2).sum() + \
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_attention = (l_ws1.W**2).sum() + (l_ws2.W**2).sum()
    L2_gae = (l_factors.Wxf**2).sum() + (l_factors.Wyf**2).sum()
    L2_outputhid = (l_outhid.W**2).sum()
    L2_softmax = (l_output.W**2).sum()
    L2 = L2_lstm + L2_attention + L2_gae + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values)) + \
           L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean, target_values)) + \
                 L2REG * L2
    if ATTPENALTY != 0.:
        cost = cost + ATTPENALTY * attention_penalty
        cost_clean = cost_clean + ATTPENALTY * attention_penalty

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
                 lasagne.layers.get_all_params(l_sentence_embedding)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if a parameter file exists, resume from it
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost, accuracy],
                            updates=updates)
    compute_cost = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var, target_values
    ], [cost_clean, accuracy_clean])
    predict = theano.function([
        l_in_h.input_var, l_mask_h.input_var, l_in_p.input_var,
        l_mask_p.input_var
    ], network_prediction_clean)

    def evaluate(mode, verbose=False):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                           1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            truth = []
            for batches_seen, (hypo, hm, premise, pm,
                               th) in enumerate(data, 1):
                predicted.append(predict(hypo, hm, premise, pm))
                truth.append(th)
            truth = numpy.concatenate(truth)
            predicted = numpy.concatenate(predicted)
            cm = confusion_matrix(truth, predicted)
            pr_a = cm.trace() * 1.0 / truth.size
            pr_e = ((cm.sum(axis=0)*1.0/truth.size) * \
                    (cm.sum(axis=1)*1.0/truth.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    print("BEFORE TRAINING: dev cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm,
                               truth) in enumerate(train_batches, 1):
                _cost, _accuracy = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_accuracy = (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                                     1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    print("***dev cost %f, accuracy %f" %
                          (dev_set_cost, dev_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass