Example #1
    def apply(self, h1, c, p_from_c):
        h_shape = K.shape(h1)
        # time_steps, 1, nb_samples, dim
        p_from_h = K.expand_dims(K.dot(h1, self.B_hp) + self.b_tt, axis=1)
        # 1, time_steps, nb_samples, dim
        p_from_c = K.expand_dims(p_from_c, axis=0).repeat(h_shape[0], axis=0)
        p = p_from_h + p_from_c
        # energy = exp(dot(tanh(p), self.D_pe) + self.c_tt).reshape((source_len, target_num))
        # self.c_tt has no effect on the probs: it contributes a factor e^{c_tt} to both the numerator and the denominator, so it cancels
        # note: self.D_pe has a shape of (hidden_output_dim,1)
        # time_steps, nb_samples, 1
        energy = K.exp(K.dot(K.tanh(p), self.D_pe))

        #normalizer = K.sum(energy, axis=1, keepdims=True)
        print "--sum--attention:normalizer"
        normalizer = sum_op.Sum_op(keepdim=True, dimension=1)(energy)
        probs = energy / normalizer
        probs = K.squeeze(probs, axis=3)

        c = K.expand_dims(c, axis=0).repeat(h_shape[0], axis=0)
        #ctx = K.sum(c * K.expand_dims(probs), axis=1)
        print "--sum--attention:ctx"
        ctx = sum_op.Sum_op(keepdim=True,
                            dimension=1)(c * K.expand_dims(probs))
        ctx = K.squeeze(ctx, axis=1)

        return [ctx, probs]
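
This and the following examples all perform the same substitution: the commented-out K.sum(x, axis=d, keepdims=True) is replaced by the custom sum_op.Sum_op(keepdim=True, dimension=d), followed by K.squeeze to drop the kept dimension. A minimal helper sketch of that pattern (the name mkl_sum is hypothetical; it assumes the surrounding imports and that Sum_op matches the commented-out K.sum calls it replaces):

def mkl_sum(x, axis, keepdims=False):
    # Assumed equivalent to K.sum(x, axis=axis, keepdims=keepdims),
    # but routed through the custom (presumably MKL-backed) Sum_op.
    out = sum_op.Sum_op(keepdim=True, dimension=axis)(x)
    if not keepdims:
        out = K.squeeze(out, axis)
    return out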
Example #2
    def apply(self,
              state_below,
              mask_below=None,
              init_state=None,
              context=None):

        if K.ndim(state_below) == 2:
            state_below = K.expand_dims(state_below, 1)

        if mask_below is None:
            #mask_below = K.ones_like(K.sum(state_below, axis=2, keepdims=True))
            print "--sum--MKL_GRU"
            mask_below = K.ones_like(
                sum_op.Sum_op(keepdim=True, dimension=2)(state_below))

        if K.ndim(mask_below) == 2:
            mask_below = K.expand_dims(mask_below)

        #if init_state is None:
        # nb_samples,n_hids
        #init_state = K.repeat_elements(K.expand_dims(K.zeros_like(K.sum(state_below, axis=[0, 2]))), self.n_hids, axis=1)
        print "--sum--MKL_GRU"
        tmp1 = sum_op.Sum_op(keepdim=True, dimension=0)(state_below)
        tmp2 = K.squeeze(tmp1, 0)
        print "--sum--MKL_GRU"
        tmp2 = sum_op.Sum_op(keepdim=True, dimension=1)(tmp2)
        tmp = K.squeeze(tmp2, 1)
        init_state = K.repeat_elements(K.expand_dims(K.zeros_like(tmp)),
                                       self.n_hids,
                                       axis=1)
        '''
        state_below_xh = K.dot(state_below, self.W_xh)
        state_below_xz = K.dot(state_below, self.W_xz)
        state_below_xr = K.dot(state_below, self.W_xr)
        sequences = [state_below_xh, state_below_xz, state_below_xr, mask_below]
        fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step(x_h, x_z, x_r, x_m, h_tm1)
        rval = K.scan(fn, sequences=sequences, outputs_initials=init_state, name=_p(self.pname, 'layers'))
        self.output = rval
        '''
        #print('1,', K.ndim(init_state))
        #print(init_state.shape)
        #exit()
        self.output = self.GRU_op(state_below, self.W_x, self.W_h, init_state,
                                  self.b)[0]
        #print('2,', K.ndim(self.output))
        return self.output
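
A NumPy sketch of the init_state bookkeeping above, assuming state_below is (time_steps, nb_samples, embed_dim); the two sum/squeeze rounds only serve to produce a (nb_samples,) tensor from which a zero initial state of shape (nb_samples, n_hids) is built:

import numpy as np

state_below = np.zeros((7, 4, 5))                    # time_steps, nb_samples, dim
tmp = state_below.sum(axis=0, keepdims=True)         # (1, 4, 5)
tmp = np.squeeze(tmp, 0).sum(axis=1, keepdims=True)  # (4, 1)
tmp = np.squeeze(tmp, 1)                             # (4,)
n_hids = 3
init_state = np.repeat(np.zeros_like(tmp)[:, None], n_hids, axis=1)
print(init_state.shape)                              # (4, 3): nb_samples, n_hids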
Example #3
import sum_op  # custom module providing Sum_op (import path assumed)
import theano
import theano.tensor as T


def Sum_op(dimension):
    t = T.tensor3("t", dtype='float64')

    r = sum_op.Sum_op(dimension=dimension, keepdim=True)(t)
    loss = r.sum()
    gt = theano.grad(loss, t)

    f = theano.function([t], [r, gt])
    # theano.printing.pydotprint(f, outfile='sum_bw.png', var_with_name_simple=True)
    return f
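
Hypothetical usage of the compiled function above: since loss = r.sum(), the gradient of the loss with respect to t is a tensor of ones, which makes a convenient smoke test:

import numpy as np

f = Sum_op(dimension=1)
x = np.random.rand(4, 5, 6)              # float64, matching the tensor3 dtype
r, gt = f(x)
assert np.allclose(r, x.sum(axis=1, keepdims=True))
assert np.allclose(gt, np.ones_like(x))  # d(sum)/dt == 1 everywhere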
Example #4
def get_category_cross_entropy_from_flat_logits(logits_flat, targets, mask=None):
    assert K.ndim(targets) == 2  # time_steps * nb_samples
    nb_samples = K.cast(K.shape(targets)[1], K.dtype(logits_flat))

    targets_flat = K.flatten(targets)
    if K._BACKEND == 'tensorflow':
        ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits_flat, K.cast(targets_flat, 'int64'))
    else:
        # Theano will internally call one hot version if the two dims do not match
        ce = K.categorical_crossentropy(output=logits_flat, target=targets_flat, from_logits=True)
    if mask is not None:
        mask_flat = K.flatten(mask)
        ce *= mask_flat

    print "--sum--get_category_cross_entropy_from_flat_logits"
    tmp = sum_op.Sum_op(keepdim=True, dimension=0)(ce)
    tmp = K.squeeze(tmp, 0)
    #return K.sum(ce) / nb_samples
    return tmp / nb_samples
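
A small NumPy sketch of the masked averaging performed at the end: masked-out positions contribute zero to the summed cross-entropy, and the total is divided by the batch size, not by the number of unmasked tokens:

import numpy as np

ce = np.array([0.5, 1.0, 2.0, 0.25, 0.75, 1.5])  # flattened, time_steps * nb_samples
mask = np.array([1., 1., 1., 1., 0., 0.])
nb_samples = 3.0
print((ce * mask).sum() / nb_samples)            # 1.25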
Example #5
    def build_trainer(self, src, src_mask, trg, trg_mask, ite,
                      l1_reg_weight=1e-6,
                      l2_reg_weight=1e-6,
                      softmax_output_num_sampled=100000, mlsl_obj=None, dist=None):

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)
        annotations = self.encoder.apply(src, src_mask_3d)
        
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        print "--sum--build_trainer"
        #init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
        tmp1 = K.squeeze(tmp, 0)
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
        tmp2 = K.squeeze(tmp, 0)
        init_context = tmp1 / tmp2

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                               [1, 0, 2])

        hiddens, readout, _ = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                        mask_below=trg_mask_3d,
                                                        init_context=init_context,
                                                        c=annotations,
                                                        c_mask=src_mask_3d)
        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        self.cost = calc_loss_from_readout(readout=readout,
                                           targets=trg,
                                           targets_mask=trg_mask_3d,
                                           logisticRegressionLayer=self.logistic_layer,
                                           softmax_output_num_sampled=softmax_output_num_sampled)
        # for reconstruction
    
        #self.L1 = sum([K.sum(K.abs(param)) for param in self.params])
        #self.L2 = sum([K.sum(K.square(param)) for param in self.params])
        print "--sum--build_trainerL1L2"
        self.L1 = sum([sum_op.Sum_op(keepdim=True)(K.abs(param)) for param in self.params])
        self.L2 = sum([sum_op.Sum_op(keepdim=True)(K.square(param)) for param in self.params])
        
        params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = K.gradients(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(mlsl_obj=mlsl_obj, dist=dist, parameters=self.params, gradients=grads)

        # train function
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = K.function(inps, [train_cost], updates=updates, name='train_func')
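
A NumPy sketch of the mean-pooling that produces init_context above: it is the mask-weighted average of the encoder annotations over the time dimension, so padded time steps are excluded:

import numpy as np

annotations = np.random.rand(7, 4, 8)  # time_steps, nb_samples, n_hids
src_mask_3d = np.ones((7, 4, 1))
src_mask_3d[5:] = 0.                   # last two steps are padding
init_context = (annotations * src_mask_3d).sum(0) / src_mask_3d.sum(0)
assert np.allclose(init_context, annotations[:5].mean(0))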
Example #6
    def sampled_softmax_loss(weights,
                             biases,
                             num_sampled,
                             num_classes,
                             labels,
                             inputs,
                             mask=None,
                             num_true=1,
                             sampled_values=None,
                             remove_accidental_hits=True):
        """Computes and returns the sampled softmax training loss.
        This is a faster way to train a softmax classifier over a huge number of
        classes.
        This operation is for training only.  It is generally an underestimate of
        the full softmax loss.
        At inference time, you can compute full softmax probabilities with the
        expression `tf.nn.softmax(tf.matmul(inputs, tf.transpose(weights)) + biases)`.
        See our [Candidate Sampling Algorithms Reference]
        (../../extras/candidate_sampling.pdf)
        Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
        ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
        Args:
          weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
              objects whose concatenation along dimension 0 has shape
              [num_classes, dim].  The (possibly-sharded) class embeddings.
          biases: A `Tensor` of shape `[num_classes]`.  The class biases.
          inputs: A `Tensor` of shape `[time steps, batch_size, dim]`.  The forward
              activations of the input network.
          mask: A tensor of shape `[time_steps, batch_size, 1]`.
          labels: A `Tensor` of type `int64` and shape `[time_steps, batch_size,
              num_true]`. The target classes.  Note that this format differs from
              the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
          num_sampled: An `int`.  The number of classes to randomly sample per batch.
          num_classes: An `int`. The number of possible classes.
          num_true: An `int`.  The number of target classes per training example.
          sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
              `sampled_expected_count`) returned by a `*_candidate_sampler` function.
              (if None, we default to `log_uniform_candidate_sampler`)
          remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
              where a sampled class equals one of the target classes.  Default is
              `True`.
        Returns:
          A scalar `Tensor`: the masked sum of the per-example sampled softmax
          losses, divided by the batch size.
        """
        assert K.ndim(inputs) == 3  # time_steps, number_samples, input_dim
        nb_samples = K.cast(K.shape(inputs)[1], K.dtype(weights))

        inputs = K.reshape(inputs, (-1, K.shape(inputs)[2]))
        labels = K.reshape(labels, (-1, 1))
        labels = K.cast(labels, 'int64')

        ce = tf.nn.sampled_softmax_loss(weights=weights,
                                        biases=biases,
                                        inputs=inputs,
                                        labels=labels,
                                        num_sampled=num_sampled,
                                        num_classes=num_classes,
                                        num_true=num_true,
                                        sampled_values=sampled_values,
                                        remove_accidental_hits=remove_accidental_hits)
        if mask is not None:
            mask_flat = K.flatten(mask)  # time_steps*nb_samples
            ce *= mask_flat
        print "--sum--sampled_softmax_loss"
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(ce)
        tmp = K.squeeze(tmp, 0)
        #return K.sum(ce) / nb_samples
        return tmp / nb_samples
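
A minimal sketch of the inference-time expression mentioned in the docstring (full softmax rather than the sampled training loss); inputs_2d is assumed to be the already-flattened (time_steps * batch_size, dim) activations:

import tensorflow as tf

def full_softmax_probs(weights, biases, inputs_2d):
    # weights: (num_classes, dim), biases: (num_classes,)
    logits = tf.matmul(inputs_2d, tf.transpose(weights)) + biases
    return tf.nn.softmax(logits)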
Example #7
    def calc_loss_with_model_parallel(self, src, src_mask_3d, trg, trg_mask_3d, ps_device, devices, l1_reg_weight=1e-6,
                                      l2_reg_weight=1e-6):
        assert K._BACKEND == 'tensorflow'

        with tf.device(devices[0]):

            annotations = self.encoder.apply(src, src_mask_3d)

            #init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
            print "--sum--calc_loss_with_model_parallel"
            tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
            tmp1 = K.squeeze(tmp, 0)
            tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
            tmp2 = K.squeeze(tmp, 0)
            init_context = tmp1 / tmp2

            trg_emb = self.table_trg.apply(trg)
            # shift_right assumes a 3D tensor, and time steps is dimension one
            trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                                   [1, 0, 2])
            hiddens, readout, alignment = self.decoder.run_pipeline(
                state_below=trg_emb_shifted,
                mask_below=trg_mask_3d,
                init_context=init_context,
                c=annotations,
                c_mask=src_mask_3d)

            if self.dropout > 0.:
                logger.info('Apply dropout with p = {}'.format(self.dropout))
                readout = Dropout(readout, self.dropout)

        logits = self.logistic_layer.get_logits_with_multiple_devices(readout, ps_device, devices)

        with tf.device(devices[0]):
            logits_flat = K.reshape(logits, shape=(-1, self.logistic_layer.n_out))
            cost = get_category_cross_entropy_from_flat_logits(logits_flat, trg, trg_mask_3d)

        if self.with_reconstruction:
            with tf.device(devices[0]):
                inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
                src_emb = self.table_src.apply(src)
                src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(
                    src_emb, [1, 0, 2])), [1, 0, 2])
                inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                    state_below=src_emb_shifted,
                    mask_below=src_mask_3d,
                    init_context=inverse_init_context,
                    c=hiddens,
                    c_mask=trg_mask_3d)
            with tf.device(devices[0]):
                if self.dropout > 0.:
                    inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = self.inverse_logistic_layer.get_logits_with_multiple_devices(inverse_readout, ps_device,
                                                                                          devices)
            with tf.device(devices[0]):
                inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
                reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)

            with tf.device(devices[0]):
                cost += reconstruction_cost * self.reconstruction_weight

        #L1 = sum([K.sum(K.abs(param)) for param in self.params])
        #L2 = sum([K.sum(K.square(param)) for param in self.params])
        print "--sum--calc_loss_with_model_parallelL1L2"
        L1 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.abs(param)), 0) for param in self.params])
        L2 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.square(param)), 0) for param in self.params])

        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight

        cost += params_regular

        return cost
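
A toy sketch of the device-placement pattern used above (TF graph-style code; the device strings are assumed): compute-heavy encoder/decoder ops are pinned to one device, while the output layer is sharded across devices by get_logits_with_multiple_devices:

import tensorflow as tf

devices = ['/gpu:0', '/gpu:1']  # assumed device names
with tf.device(devices[0]):
    a = tf.constant([[1.0, 2.0]])
with tf.device(devices[1]):
    b = tf.matmul(a, tf.constant([[3.0], [4.0]]))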
Example #8
    def calc_loss(self, src, src_mask_3d, trg, trg_mask_3d,
                  l1_reg_weight=1e-6,
                  l2_reg_weight=1e-6,
                  softmax_output_num_sampled=100000):

        annotations = self.encoder.apply(src, src_mask_3d)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        #init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)
        print "--sum--calc_loss"
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(annotations * src_mask_3d)
        tmp1 = K.squeeze(tmp, 0)
        tmp = sum_op.Sum_op(keepdim=True, dimension=0)(src_mask_3d)
        tmp2 = K.squeeze(tmp, 0)
        init_context = tmp1 / tmp2

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                               [1, 0, 2])

        hiddens, readout, alignment = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                                mask_below=trg_mask_3d,
                                                                init_context=init_context,
                                                                c=annotations,
                                                                c_mask=src_mask_3d)

        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        cost = calc_loss_from_readout(readout=readout,
                                      targets=trg,
                                      targets_mask=trg_mask_3d,
                                      logisticRegressionLayer=self.logistic_layer,
                                      softmax_output_num_sampled=softmax_output_num_sampled)

        if self.with_reconstruction:
            inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
            src_emb = self.table_src.apply(src)
            src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])),
                                                   [1, 0, 2])

            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask_3d,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask_3d)

            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = self.inverse_logistic_layer.get_logits(inverse_readout)
            inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
            reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)

            cost += reconstruction_cost * self.reconstruction_weight

        #L1 = sum([K.sum(K.abs(param)) for param in self.params])
        #L2 = sum([K.sum(K.square(param)) for param in self.params])
        print "--sum--calc_lossL1L2"
        L1 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.abs(param)), 0) for param in self.params])
        L2 = sum([K.squeeze(sum_op.Sum_op(keepdim=True, dimension=0)(K.square(param)), 0) for param in self.params])

        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight

        cost += params_regular

        return cost
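
A NumPy sketch of the reference L1/L2 terms in the commented-out lines: scalar sums of absolute values and of squares over every parameter tensor, scaled by the respective regularization weights:

import numpy as np

params = [np.array([[1., -2.], [3., -4.]]), np.array([0.5, -0.5])]
L1 = sum(np.abs(p).sum() for p in params)     # 11.0
L2 = sum(np.square(p).sum() for p in params)  # 30.5
print(L1 * 1e-6 + L2 * 1e-6)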
Example #9
    def apply(self,
              state_below,
              mask_below=None,
              init_state=None,
              init_context=None,
              c=None,
              c_mask=None,
              one_step=False,
              cov_before=None,
              fertility=None):

        # assert c, 'Context must be provided'
        # assert c.ndim == 3, 'Context must be 3-d: n_seq * batch_size * dim'

        # state_below: n_steps * batch_size/1 * embedding

        # mask
        if mask_below is None:  # sampling or beamsearch
            #mask_below = K.ones_like(K.sum(state_below, axis=-1, keepdims=True))    # nb_samples
            print "--sum--decoder"
            mask_below = K.ones_like(
                sum_op.Sum_op(keepdim=True, dimension=-1)(state_below))

        if K.ndim(mask_below) != K.ndim(state_below):
            mask_below = K.expand_dims(mask_below)

        assert K.ndim(mask_below) == K.ndim(state_below)

        if one_step:
            assert init_state is not None, 'previous state must be provided'

        if init_state is None:
            init_state = self.create_init_state(init_context)

        state_below_xh = K.dot(state_below, self.W_xh)
        state_below_xz = K.dot(state_below, self.W_xz)
        state_below_xr = K.dot(state_below, self.W_xr)

        if self.with_attention:
            # time steps, nb_samples, n_hids
            p_from_c = K.reshape(K.dot(c, self.A_cp),
                                 shape=(K.shape(c)[0], K.shape(c)[1],
                                        self.n_hids))
        else:
            c_z = K.dot(init_context, self.W_cz)
            c_r = K.dot(init_context, self.W_cr)
            c_h = K.dot(init_context, self.W_ch)

        if one_step:
            if self.with_attention:
                return self._step_attention(state_below_xh,
                                            state_below_xz,
                                            state_below_xr,
                                            mask_below,
                                            init_state,
                                            c,
                                            c_mask,
                                            p_from_c,
                                            cov_tm1=cov_before,
                                            fertility=fertility)
            else:
                return self._step_context(state_below_xh, state_below_xz,
                                          state_below_xr, mask_below,
                                          init_state, c_z, c_r, c_h,
                                          init_context)
        else:
            sequences = [
                state_below_xh, state_below_xz, state_below_xr, mask_below
            ]
            # decoder hidden state
            outputs_info = [init_state]
            if self.with_attention:
                # ctx, probs
                if K._BACKEND == 'theano':
                    outputs_info += [None, None]
                else:
                    outputs_info += [
                        K.zeros_like(K.sum(c, axis=0)),
                        K.zeros_like(K.sum(c, axis=-1))
                    ]

                if self.with_coverage:
                    # initialization for coverage
                    # TODO: check c is 3D
                    init_cov = K.repeat_elements(K.expand_dims(
                        K.zeros_like(K.sum(c, axis=2))),
                                                 self.coverage_dim,
                                                 axis=2)
                    outputs_info.append(init_cov)
                    # fertility is not constructed outside when training
                    if self.coverage_type == 'linguistic':
                        fertility = self._get_fertility(c)
                    else:
                        fertility = K.zeros_like(K.sum(c, axis=2))
                    if K._BACKEND == 'theano':
                        fn = lambda x_h, x_z, x_r, x_m, h_tm1, cov_tm1: self._step_attention(
                            x_h,
                            x_z,
                            x_r,
                            x_m,
                            h_tm1,
                            c,
                            c_mask,
                            p_from_c,
                            cov_tm1=cov_tm1,
                            fertility=fertility)
                    else:
                        fn = lambda (h_tm1, ctx_tm1, probs_tm1, cov_tm1), (
                            x_h, x_z, x_r, x_m): self._step_attention(
                                x_h,
                                x_z,
                                x_r,
                                x_m,
                                h_tm1,
                                c,
                                c_mask,
                                p_from_c,
                                cov_tm1=cov_tm1,
                                fertility=fertility)
                else:
                    if K._BACKEND == 'theano':
                        if self.mkl:
                            print('with mkl')
                            #alignment GRU
                            W_x_a = K.concatenate(
                                [self.W_xh, self.W_xz, self.W_xr], axis=0)
                            W_h_a = K.concatenate(
                                [self.W_n1_z, self.W_n1_r, self.W_n1_h],
                                axis=0)
                            b_a = K.concatenate(
                                [self.b_n1_z, self.b_n1_r, self.b_n1_h],
                                axis=0)
                            hidden_alignment = self.GRU_op(
                                state_below, W_x_a, W_h_a, init_state, b_a)[0]
                            #attention
                            ctx, probs = self.attention_.apply(
                                hidden_alignment, c, p_from_c)
                            #decoder GRU
                            W_x_c = K.concatenate(
                                [self.W_cz, self.W_cr, self.W_ch], axis=0)
                            W_h_c = K.concatenate(
                                [self.W_hz, self.W_hr, self.W_hh], axis=0)
                            b_c = K.concatenate([self.b_z, self.b_r, self.b_h],
                                                axis=0)
                            init = hidden_alignment[
                                K.shape(hidden_alignment)[0] - 1, :, :]
                            hidden_decoder = self.GRU_op(
                                ctx, W_x_c, W_h_c, init, b_c)[0]

                            self.output = [hidden_decoder, ctx, probs]
                        else:
                            fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step_attention(
                                x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c)
                    else:
                        fn = lambda (h_tm1, ctx_tm1, probs_tm1), (
                            x_h, x_z, x_r, x_m): self._step_attention(
                                x_h, x_z, x_r, x_m, h_tm1, c, c_mask, p_from_c)

            else:
                if K._BACKEND == 'theano':
                    fn = lambda x_h, x_z, x_r, x_m, h_tm1: self._step_context(
                        x_h, x_z, x_r, x_m, h_tm1, c_z, c_r, c_h, init_context)
                else:
                    fn = lambda (h_tm1, ), (
                        x_h, x_z, x_r, x_m): self._step_context(
                            x_h, x_z, x_r, x_m, h_tm1, c_z, c_r, c_h,
                            init_context)

            if not self.mkl:
                self.output = K.scan(fn,
                                     sequences=sequences,
                                     outputs_initials=outputs_info,
                                     name=_p(self.pname, 'layers'))

            return self.output
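
A NumPy sketch of the weight fusion in the MKL branch: the three per-gate matrices are stacked along axis 0 into a single tensor, which the fused GRU_op is assumed to slice back apart internally:

import numpy as np

n_in, n_hids = 5, 8
W_xh, W_xz, W_xr = (np.zeros((n_in, n_hids)) for _ in range(3))
W_x = np.concatenate([W_xh, W_xz, W_xr], axis=0)
print(W_x.shape)  # (15, 8): one matrix covering all three gates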
Example #10
    def _step_attention(self,
                        x_h,
                        x_z,
                        x_r,
                        x_m,
                        h_tm1,
                        c,
                        c_m,
                        p_from_c,
                        cov_tm1=None,
                        fertility=None):
        '''
        x_h: input at time t
        x_z: update of input
        x_r: reset of input
        x_m: mask of x_t
        h_tm1: previous state
        cov_tm1:  coverage at time (t-1)
        fertility:  fertility of individual source word
        '''

        # h1 combines the previous hidden state and the most recently generated word with a GRU
        # note that this is different from the paper
        z1 = K.sigmoid(K.dot(h_tm1, self.W_n1_z) + x_z + self.b_n1_z)
        r1 = K.sigmoid(K.dot(h_tm1, self.W_n1_r) + x_r + self.b_n1_r)
        h1 = K.tanh(r1 * K.dot(h_tm1, self.W_n1_h) + x_h + self.b_n1_h)
        # nb_samples, n_hids
        h1 = z1 * h_tm1 + (1. - z1) * h1
        h1 = x_m * h1 + (1. - x_m) * h_tm1

        # 1, nb_samples, dim
        p_from_h = K.expand_dims(K.dot(h1, self.B_hp) + self.b_tt, axis=0)
        # time_steps, nb_samples, dim
        p = p_from_h + p_from_c

        if self.with_coverage:
            p_from_cov = K.dot(cov_tm1, self.C_covp)
            p += p_from_cov

        # energy = exp(dot(tanh(p), self.D_pe) + self.c_tt).reshape((source_len, target_num))
        # self.c_tt has no effect on the probs: it contributes a factor e^{c_tt} to both the numerator and the denominator, so it cancels
        # note: self.D_pe has a shape of (hidden_output_dim,1)
        # time_steps, nb_samples, 1
        energy = K.exp(K.dot(K.tanh(p), self.D_pe))

        # c_m: time_steps, nb_samples, 1
        if c_m is not None:
            energy *= c_m

        print "--sum--attention ori:normalizer"
        #normalizer = K.sum(energy, axis=0, keepdims=True)
        normalizer = sum_op.Sum_op(keepdim=True, dimension=0)(energy)
        probs = energy / normalizer
        probs = K.squeeze(probs, axis=2)

        print "--sum--attention ori:ctx"
        #ctx = K.sum(c * K.expand_dims(probs), axis=0)
        ctx = sum_op.Sum_op(keepdim=True,
                            dimension=0)(c * K.expand_dims(probs))
        ctx = K.squeeze(ctx, axis=0)

        # update coverage after producing attention probabilities at time t
        if self.with_coverage:
            cov = self._update_coverage(cov_tm1, probs, c, h_tm1, fertility)

        # this is even more consistent with our context gate
        # h1 corresponds to target context, while ctx corresponds to source context

        if self.with_context_gate:
            gate = K.sigmoid(
                K.dot(h1, self.W_ctx_h) + K.dot(ctx, self.W_ctx_c) +
                self.b_ctx)

            # we directly scale h1, since it is used in computing both can_h_t and h_t
            h1 = h1 * (1. - gate)
        else:
            gate = 1.

        z_t = K.sigmoid(
            K.dot(h1, self.W_hz) + gate * K.dot(ctx, self.W_cz) + self.b_z)
        r_t = K.sigmoid(
            K.dot(h1, self.W_hr) + gate * K.dot(ctx, self.W_cr) + self.b_r)
        h_t = K.tanh(r_t * K.dot(h1, self.W_hh) +
                     gate * K.dot(ctx, self.W_ch) + self.b_h)

        h_t = z_t * h1 + (1. - z_t) * h_t
        h_t = x_m * h_t + (1. - x_m) * h1

        if self.with_coverage:
            return [h_t, ctx, probs, cov]
        else:
            return [h_t, ctx, probs]
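
A NumPy sketch of the masked attention softmax above: exponentiated energies are zeroed at masked source positions, then normalized over the source time dimension, so the probabilities still sum to one per sample:

import numpy as np

energy = np.exp(np.random.rand(7, 4, 1))  # time_steps, nb_samples, 1
c_m = np.ones((7, 4, 1))
c_m[5:] = 0.                              # mask the padded source steps
energy *= c_m
probs = energy / energy.sum(axis=0, keepdims=True)
print(probs.sum(axis=0).ravel())          # all ones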