Exemplo n.º 1
0
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None, **kwargs):
        """Store the configuration and build the child bricks.

        Parameters
        ----------
        state_names
            Stored on the instance and passed to :class:`Parallel` as
            ``input_names``.
        state_dims, sequence_dim, match_dim
            Stored on the instance as given.
        state_transformer
            Prototype handed to :class:`Parallel`; forwarded unchanged,
            including ``None``.
        sequence_transformer
            Brick kept as ``self.sequence_transformer``. A falsy value is
            replaced by ``MLP([Identity()], name="seq_trans")``.
        energy_computer
            Brick kept as ``self.energy_computer``. A falsy value is
            replaced by ``EnergyComputer(name="energy_comp")``.
        **kwargs
            Forwarded to the parent constructor.

        """
        super(SequenceContentAttention, self).__init__(**kwargs)

        parallel_states = Parallel(input_names=state_names,
                                   prototype=state_transformer,
                                   name="state_trans")
        # ``or`` keeps a truthy brick and substitutes the default otherwise.
        sequence_transformer = sequence_transformer or MLP(
            [Identity()], name="seq_trans")
        energy_computer = energy_computer or EnergyComputer(
            name="energy_comp")

        self.state_names, self.state_dims = state_names, state_dims
        self.sequence_dim, self.match_dim = sequence_dim, match_dim
        self.state_transformer = state_transformer
        self.state_transformers = parallel_states
        self.sequence_transformer = sequence_transformer
        self.energy_computer = energy_computer

        self.children = [parallel_states, sequence_transformer,
                         energy_computer]
Exemplo n.º 2
0
    def __init__(self, area_transform, patch_transform, response_transform,
                 n_spatial_dims, batch_normalize,
                 whatwhere_interaction="additive", **kwargs):
        """Wire the three transforms together through a merge layer.

        Parameters
        ----------
        area_transform, patch_transform, response_transform
            Objects exposing a ``brick`` attribute; the bricks' ``input_dim``
            / ``output_dim`` size the merge layer and the bricks become
            children of this brick.
        n_spatial_dims
            Accepted but not used by this constructor.
        batch_normalize
            Forwarded to :class:`NormalizedActivation`.
        whatwhere_interaction : str
            Stored as ``self.whatwhere_interaction``.
        **kwargs
            Forwarded to the parent constructor.

        """
        super(Merger, self).__init__(**kwargs)

        self.area_transform = area_transform
        self.patch_transform = patch_transform
        self.response_transform = response_transform
        self.whatwhere_interaction = whatwhere_interaction

        # Both inputs are projected to the input size of the response
        # transform by bias-free linear maps.
        merge_input_names = ["area", "patch"]
        merge_input_dims = [area_transform.brick.output_dim,
                            patch_transform.brick.output_dim]
        merge_output_dims = [response_transform.brick.input_dim] * 2
        self.response_merge = Parallel(
            input_names=merge_input_names,
            input_dims=merge_input_dims,
            output_dims=merge_output_dims,
            prototype=Linear(use_bias=False),
            child_prefix="response_merge")
        self.response_merge_activation = NormalizedActivation(
            shape=[response_transform.brick.input_dim],
            name="response_merge_activation",
            batch_normalize=batch_normalize)

        self.children = [
            self.response_merge_activation,
            self.response_merge,
            patch_transform.brick,
            area_transform.brick,
            response_transform.brick,
        ]
Exemplo n.º 3
0
    def __init__(self, **kwargs):
        """Build the state and parent transformers.

        All keyword arguments are forwarded to the parent constructor, which
        is expected to provide ``self.state_names`` (read right after the
        ``super`` call).

        """
        super(TreeAttention, self).__init__(**kwargs)

        linear_prototype = Linear()
        state_bricks = Parallel(input_names=self.state_names,
                                prototype=linear_prototype,
                                name="state_trans")
        first_parent = Linear(name="parent1_trans")
        second_parent = Linear(name="parent2_trans")

        self.state_transformers = state_bricks
        self.parent1_transformer = first_parent
        self.parent2_transformer = second_parent
        self.children = [state_bricks, first_parent, second_parent]
    def __init__(self, stack_dim=500, **kwargs):
        """Add the stack-operation bricks on top of the parent attention.

        Args:
            stack_dim (int): Size of vectors on the stack.
            **kwargs: Forwarded to the parent constructor.
        """
        super(PushDownSequenceContentAttention, self).__init__(**kwargs)
        self.stack_dim = stack_dim
        self.max_stack_depth = 25

        # Inputs of every stack operation: all state names plus the
        # 'weighted_averages' entry.
        self.stack_op_names = self.state_names + ['weighted_averages']

        def wrap_in_parallel(prototype, name):
            # Wrap `prototype` in a Parallel brick fed by the stack-op inputs.
            return Parallel(input_names=self.stack_op_names,
                            prototype=prototype,
                            name=name)

        self.stack_pop_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_pop_transformers = wrap_in_parallel(
            self.stack_pop_transformer, "stack_pop")

        self.stack_push_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_push_transformers = wrap_in_parallel(
            self.stack_push_transformer, "stack_push")

        self.stack_input_transformer = Linear()
        self.stack_input_transformers = wrap_in_parallel(
            self.stack_input_transformer, "stack_input")

        # Register the new bricks after the children set up by the parent.
        for stack_brick in (self.stack_pop_transformers,
                            self.stack_push_transformers,
                            self.stack_input_transformers):
            self.children.append(stack_brick)
Exemplo n.º 5
0
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None, **kwargs):
        """Store the arguments and build the child bricks.

        ``sequence_transformer`` and ``energy_computer`` fall back to
        ``MLP([Identity()])`` bricks named ``"seq_trans"`` and
        ``"energy_comp"`` when the stored value is falsy. ``**kwargs`` is
        forwarded to the parent constructor.

        """
        super(SequenceContentAttention, self).__init__(**kwargs)
        # Must stay ahead of any additional local variable: the helper
        # receives every local of this frame. Presumably it stores the
        # arguments as attributes of the same name (they are read back from
        # ``self`` below) -- confirm against ``update_instance``.
        update_instance(self, locals())

        self.state_transformers = Parallel(
            channel_names=state_names,
            prototype=self.state_transformer,
            name="state_trans")

        # Fill in the identity defaults for the bricks that were not given.
        for attribute, default_name in (("sequence_transformer", "seq_trans"),
                                        ("energy_computer", "energy_comp")):
            if not getattr(self, attribute):
                setattr(self, attribute,
                        MLP([Identity()], name=default_name))

        self.children = [self.state_transformers,
                         self.sequence_transformer,
                         self.energy_computer]
Exemplo n.º 6
0
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None, weights_init=None, biases_init=None,
                 **kwargs):
        """Store the arguments and build the child bricks.

        Falsy ``sequence_transformer`` / ``energy_computer`` values are
        replaced by ``MLP([Identity()])`` bricks named ``"seq_trans"`` and
        ``"energy_comp"``. ``**kwargs`` is forwarded to the parent
        constructor.

        """
        super(SequenceContentAttention, self).__init__(**kwargs)
        # Keep this call before any new local is introduced: it is handed
        # all locals of this frame. Presumably it copies the arguments onto
        # ``self`` (they are read back from there below) -- confirm against
        # ``update_instance``.
        update_instance(self, locals())

        def identity_mlp(brick_name):
            # Default brick used when the caller did not provide one.
            return MLP([Identity()], name=brick_name)

        self.state_transformers = Parallel(
            channel_names=state_names,
            prototype=self.state_transformer,
            name="state_trans")
        if not self.sequence_transformer:
            self.sequence_transformer = identity_mlp("seq_trans")
        if not self.energy_computer:
            self.energy_computer = identity_mlp("energy_comp")

        self.children = [
            self.state_transformers,
            self.sequence_transformer,
            self.energy_computer,
        ]
Exemplo n.º 7
0
class Merger(Initializable):
    """Combine a patch ("what") and its location/scale ("where").

    The patch and the concatenated location/scale are each passed through
    their own transform, projected to a common size by bias-free linear
    maps, combined either additively or multiplicatively, activated and
    finally passed through the response transform.

    Parameters
    ----------
    area_transform, patch_transform, response_transform
        Callables that also expose a ``brick`` attribute. The bricks'
        ``input_dim`` / ``output_dim`` size the merge layer and the bricks
        are registered as children.
    n_spatial_dims
        Accepted but not used by this class.
    batch_normalize
        Forwarded to :class:`NormalizedActivation`.
    whatwhere_interaction : str
        ``"additive"`` (default) or ``"multiplicative"``; selects how the
        two projected parts are combined in :meth:`apply`.

    """
    def __init__(self, area_transform, patch_transform, response_transform,
                 n_spatial_dims, batch_normalize, whatwhere_interaction="additive",
                 **kwargs):
        super(Merger, self).__init__(**kwargs)

        self.patch_transform = patch_transform
        self.area_transform = area_transform

        self.whatwhere_interaction = whatwhere_interaction
        self.response_merge = Parallel(
            input_names="area patch".split(),
            input_dims=[area_transform.brick.output_dim,
                        patch_transform.brick.output_dim],
            output_dims=2*[response_transform.brick.input_dim],
            prototype=Linear(use_bias=False),
            child_prefix="response_merge")
        self.response_merge_activation = NormalizedActivation(
            shape=[response_transform.brick.input_dim],
            name="response_merge_activation",
            batch_normalize=batch_normalize)
        self.response_transform = response_transform

        self.children = [self.response_merge_activation,
                         self.response_merge,
                         patch_transform.brick,
                         area_transform.brick,
                         response_transform.brick]

    @application(inputs="patch location scale".split(),
                 outputs=['response'])
    def apply(self, patch, location, scale):
        """Merge a patch with its location and scale into a response.

        Parameters
        ----------
        patch
            Input handed to ``patch_transform``.
        location, scale
            Concatenated along axis 1 and handed to ``area_transform``.

        Returns
        -------
        response
            Output of ``response_transform`` applied to the activated merge.

        Raises
        ------
        ValueError
            If ``whatwhere_interaction`` is neither ``"additive"`` nor
            ``"multiplicative"``.

        """
        # NOTE: the intent was to stop backpropagation through location and
        # scale so the model cannot use them as mere extra hidden units, but
        # the line doing so is disabled:
        #location, scale = list(map(theano.gradient.disconnected_grad, (location, scale)))
        patch = self.patch_transform(patch)
        area = self.area_transform(T.concatenate([location, scale], axis=1))
        parts = self.response_merge.apply(area, patch)
        if self.whatwhere_interaction == "additive":
            response = sum(parts)
        elif self.whatwhere_interaction == "multiplicative":
            response = reduce(operator.mul, parts)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `response`.
            raise ValueError(
                "Unknown whatwhere_interaction: {!r} (expected 'additive' "
                "or 'multiplicative')".format(self.whatwhere_interaction))
        response = self.response_merge_activation.apply(response)
        response = self.response_transform(response)
        return response
Exemplo n.º 8
0
    def __init__(self, match_dim, state_transformer=None,
                 attended_transformer=None, energy_computer=None, **kwargs):
        """Store the configuration and build the child bricks.

        Parameters
        ----------
        match_dim
            Stored on the instance as given.
        state_transformer
            Prototype handed to :class:`Parallel`; forwarded unchanged,
            including ``None``.
        attended_transformer
            Brick kept as ``self.attended_transformer``. A falsy value is
            replaced by ``Linear(name="preprocess")``.
        energy_computer
            Brick kept as ``self.energy_computer``. A falsy value is
            replaced by ``ShallowEnergyComputer(name="energy_comp")``.
        **kwargs
            Forwarded to the parent constructor, which is expected to set
            ``self.state_names`` (read right after the ``super`` call).

        """
        super(SequenceContentAttention, self).__init__(**kwargs)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(
            input_names=self.state_names,
            prototype=state_transformer,
            name="state_trans")
        # ``or`` keeps a truthy brick and substitutes the default otherwise.
        self.attended_transformer = (attended_transformer or
                                     Linear(name="preprocess"))
        self.energy_computer = (energy_computer or
                                ShallowEnergyComputer(name="energy_comp"))

        self.children = [
            self.state_transformers,
            self.attended_transformer,
            self.energy_computer,
        ]
Exemplo n.º 9
0
    def __init__(self, match_dim, conv_n, conv_num_filters=1,
                 state_transformer=None, attended_transformer=None,
                 energy_computer=None, prior=None, energy_normalizer=None,
                 **kwargs):
        """Store the configuration and build the child bricks.

        Parameters
        ----------
        match_dim
            Stored on the instance as given.
        conv_n
            Stored; the :class:`Conv1D` brick is built with filter size
            ``2 * conv_n + 1``.
        conv_num_filters
            Stored and passed to :class:`Conv1D` as its first argument.
        state_transformer
            Prototype for the :class:`Parallel` state transformers; a falsy
            value is replaced by ``Linear(use_bias=False)``.
        attended_transformer
            A falsy value is replaced by ``Linear(name="preprocess")``.
        energy_computer
            A falsy value is replaced by a :class:`ShallowEnergyComputer`
            whose ``use_bias`` is true unless the normalizer is
            ``'softmax'``.
        prior : dict
            A falsy value is replaced by an ``'expanding'`` prior with
            ``initial_begin=0``, ``initial_end=10000`` and zero speeds.
        energy_normalizer : str
            A falsy value is replaced by ``'softmax'``.
        **kwargs
            Forwarded to the parent constructor, which is expected to set
            ``self.state_names`` (read right after the ``super`` call).

        """
        super(SequenceContentAndConvAttention, self).__init__(**kwargs)

        state_transformer = state_transformer or Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer
        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")

        # Only this contributor to the match vector is allowed to have
        # biases.
        self.attended_transformer = (attended_transformer or
                                     Linear(name="preprocess"))

        self.energy_normalizer = energy_normalizer or 'softmax'
        self.energy_computer = energy_computer or ShallowEnergyComputer(
            name="energy_comp",
            use_bias=self.energy_normalizer != 'softmax')
        self.filter_handler = Linear(name="handler", use_bias=False)

        self.prior = prior or dict(type='expanding',
                                   initial_begin=0, initial_end=10000,
                                   min_speed=0, max_speed=0)

        self.conv_n, self.conv_num_filters = conv_n, conv_num_filters
        self.conv = Conv1D(conv_num_filters, 2 * conv_n + 1)

        self.children = [
            self.state_transformers,
            self.attended_transformer,
            self.energy_computer,
            self.filter_handler,
            self.conv,
        ]
Exemplo n.º 10
0
    def __init__(self, match_dim, state_transformer=None,
                 attended_transformer=None, energy_computer=None, **kwargs):
        """Build the child bricks, then call the parent constructor.

        Parameters
        ----------
        match_dim
            Stored on the instance as given.
        state_transformer
            Prototype for the :class:`Parallel` state transformers; a falsy
            value is replaced by ``Linear(use_bias=False)``.
        attended_transformer
            A falsy value is replaced by ``Linear(name="preprocess")``.
        energy_computer
            A falsy value is replaced by
            ``ShallowEnergyComputer(name="energy_comp")``.
        **kwargs
            Forwarded to the parent constructor. Must contain
            ``state_names``; an optional ``children`` list is appended after
            the bricks created here.

        Raises
        ------
        KeyError
            If ``state_names`` is missing from ``kwargs``.

        """
        if not state_transformer:
            state_transformer = Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=kwargs['state_names'],
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            attended_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        # Pop (not get) the caller's children: leaving the key in ``kwargs``
        # would pass ``children`` twice below and raise a TypeError.
        children = [self.state_transformers, attended_transformer,
                    energy_computer] + kwargs.pop('children', [])
        super(SequenceContentAttention, self).__init__(children=children,
                                                       **kwargs)
Exemplo n.º 11
0
class SequenceContentAndConvAttention(GenericSequenceAttention, Initializable):
    """Content-based attention with convolutional features of past weights.

    On top of the usual match vectors (transformed states plus preprocessed
    attended sequence), the previous attention weights are convolved and
    linearly projected to ``match_dim`` before the energies are computed.
    The attention is restricted to a window of the attended sequence chosen
    by ``prior``.

    Parameters
    ----------
    match_dim : int
        The dimension of the match vector.
    conv_n : int
        The convolution filter has size ``2 * conv_n + 1``.
    conv_num_filters : int
        Number of convolution filters.
    state_transformer : brick
        Prototype for the state transformations. If falsy, a bias-free
        :class:`Linear` is used.
    attended_transformer : brick
        Transformation applied to the attended sequence. If falsy,
        ``Linear(name="preprocess")`` is used.
    energy_computer : brick
        Computes energies from match vectors. If falsy, a
        :class:`ShallowEnergyComputer` is used (with biases unless the
        normalizer is ``'softmax'``).
    prior : dict
        Window selection. ``prior['type']`` is ``'expanding'`` (keys
        ``initial_begin``, ``initial_end``, ``min_speed``, ``max_speed``),
        ``'window_around_mean'`` or ``'window_around_median'`` (keys
        ``before``, ``after``). If falsy, an expanding prior covering
        positions 0 to 10000 with zero speeds is used.
    energy_normalizer : str
        ``'softmax'`` (default), ``'logistic'`` or ``'relu'``.

    """
    @lazy()
    def __init__(self, match_dim, conv_n, conv_num_filters=1,
                 state_transformer=None,
                 attended_transformer=None, energy_computer=None,
                 prior=None, energy_normalizer=None, **kwargs):
        super(SequenceContentAndConvAttention, self).__init__(**kwargs)
        if not state_transformer:
            state_transformer = Linear(use_bias=False)

        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            # Only this contributor to the match vector
            # is allowed to have biases
            attended_transformer = Linear(name="preprocess")

        if not energy_normalizer:
            energy_normalizer = 'softmax'
        self.energy_normalizer = energy_normalizer

        if not energy_computer:
            energy_computer = ShallowEnergyComputer(
                name="energy_comp",
                use_bias=self.energy_normalizer != 'softmax')
        self.filter_handler = Linear(name="handler", use_bias=False)
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        if not prior:
            prior = dict(type='expanding', initial_begin=0, initial_end=10000,
                         min_speed=0, max_speed=0)
        self.prior = prior

        self.conv_n = conv_n
        self.conv_num_filters = conv_num_filters
        self.conv = Conv1D(conv_num_filters, 2 * conv_n + 1)

        self.children = [self.state_transformers, self.attended_transformer,
                         self.energy_computer, self.filter_handler, self.conv]

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [self.match_dim
                                               for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1
        self.filter_handler.input_dim = self.conv_num_filters
        self.filter_handler.output_dim = self.match_dim

    @application
    def compute_energies(self, attended, preprocessed_attended,
                         previous_weights, states):
        """Compute energies from states, attended and previous weights.

        ``preprocessed_attended`` is computed with :meth:`preprocess` when
        it is ``None``. ``states`` is a dict passed as keyword arguments to
        the state transformers.

        """
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        conv_result = self.conv.apply(previous_weights)
        # Drop `conv_n` positions on both sides of the convolution output
        # and reorder the axes before/after the projection to `match_dim`.
        match_vectors += self.filter_handler.apply(
            conv_result[:, :, self.conv_n:-self.conv_n]
            .dimshuffle(0, 2, 1)).dimshuffle(1, 0, 2)
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @staticmethod
    def mask_row(offset, length, empty_row):
        """Return `empty_row` with ``[offset:offset+length]`` set to 1."""
        return tensor.set_subtensor(empty_row[offset:offset+length], 1)

    @application(outputs=['weighted_averages', 'weights', 'energies', 'step'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, weights=None, step=None, **states):
        """Compute glimpses inside the window selected by the prior.

        Parameters
        ----------
        attended
            The attended sequence; it is windowed along its first axis.
        preprocessed_attended
            Optional preprocessed sequence, windowed the same way.
        attended_mask
            Optional mask, windowed along its first axis.
        weights
            Previous attention weights, windowed along their second axis.
            Required despite the ``None`` default.
        step
            Step counter; ``step[0]`` drives the expanding prior. Required
            despite the ``None`` default.
        **states
            The states of the network.

        Returns
        -------
        weighted_averages, weights, energies, step
            Weights and energies are pasted back into zero tensors shaped
            like the input ``weights``; ``step`` is incremented by one.

        Raises
        ------
        ValueError
            If ``prior['type']`` is not a supported prior type.

        """
        # Cut the considered window.
        p = self.prior
        length = attended.shape[0]
        prior_type = p.get('type', 'expanding')
        if prior_type == 'expanding':
            begin = p['initial_begin'] + step[0] * p['min_speed']
            end = p['initial_end'] + step[0] * p['max_speed']
            begin = tensor.maximum(0, tensor.minimum(length - 1, begin))
            end = tensor.maximum(0, tensor.minimum(length, end))
            additional_mask = None
        elif prior_type.startswith('window_around'):
            # Check whether we want the mean or the median.
            if prior_type == 'window_around_mean':
                position_in_attended = tensor.arange(length, dtype=floatX)[None, :]
                expected_last_source_pos = (weights * position_in_attended).sum(axis=1)
            elif prior_type == 'window_around_median':
                # First position at which the cumulative weight reaches 0.5.
                ali_to_05 = tensor.extra_ops.cumsum(weights, axis=1) - 0.5
                ali_to_05 = (ali_to_05 >= 0)
                ali_median_pos = ali_to_05[:, 1:] - ali_to_05[:, :-1]
                expected_last_source_pos = tensor.argmax(ali_median_pos, axis=1)
                expected_last_source_pos = theano.gradient.disconnected_grad(
                    expected_last_source_pos)
            else:
                raise ValueError("Unknown prior type: %s" % prior_type)
            # The window taken around each element.
            begins = tensor.floor(expected_last_source_pos - p['before'])
            ends = tensor.ceil(expected_last_source_pos + p['after'])
            # The global window to optimize computations.
            begin = tensor.maximum(0, begins.min()).astype('int64')
            end = tensor.minimum(length, ends.max()).astype('int64')
            # The new mask, already cut to begin:end.
            position_in_attended_cut = tensor.arange(
                begin * 1., end * 1., 1., dtype=floatX)[None, :]
            additional_mask = ((position_in_attended_cut > begins[:, None]) *
                               (position_in_attended_cut < ends[:, None]))
        else:
            # The message used to be passed as a separate argument and was
            # therefore never formatted.
            raise ValueError("Unknown prior type: %s" % prior_type)
        begin = tensor.floor(begin).astype('int64')
        end = tensor.ceil(end).astype('int64')
        attended_cut = attended[begin:end]
        preprocessed_attended_cut = (
            preprocessed_attended[begin:end]
            if preprocessed_attended is not None else None)
        # Combine the user mask with the window mask. Either may be absent;
        # multiplying ``None`` by the window mask used to raise a TypeError.
        attended_mask_cut = (attended_mask[begin:end]
                             if attended_mask is not None else None)
        if additional_mask is not None:
            window_mask = additional_mask.T
            attended_mask_cut = (window_mask if attended_mask_cut is None
                                 else attended_mask_cut * window_mask)
        weights_cut = weights[:, begin:end]

        # Call
        energies_cut = self.compute_energies(attended_cut, preprocessed_attended_cut,
                                             weights_cut, states)
        weights_cut = self.compute_weights(energies_cut, attended_mask_cut)
        weighted_averages = self.compute_weighted_averages(weights_cut, attended_cut)

        # Paste
        new_weights = new_energies = tensor.zeros_like(weights.T)
        new_weights = tensor.set_subtensor(new_weights[begin:end],
                                           weights_cut)
        new_energies = tensor.set_subtensor(new_energies[begin:end],
                                            energies_cut)

        return weighted_averages, new_weights.T, new_energies.T, step + 1

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['attended', 'preprocessed_attended',
                 'attended_mask', 'weights', 'step'] +
                self.state_names)

    @application
    def compute_weights(self, energies, attended_mask):
        """Normalize energies into attention weights along the first axis.

        ``attended_mask`` may be ``None``; otherwise masked positions get
        zero weight, and a column whose mask is all zeros is divided by 1
        instead of 0.

        Raises
        ------
        ValueError
            If ``energy_normalizer`` is not ``'softmax'``, ``'logistic'``
            or ``'relu'``.

        """
        if self.energy_normalizer == 'softmax':
            logger.info("Using softmax attention weights normalization")
            # Subtract the maximum before exponentiating.
            energies = energies - energies.max(axis=0)
            unnormalized_weights = tensor.exp(energies)
        elif self.energy_normalizer == 'logistic':
            logger.info("Using smoothfocus (logistic sigm) "
                        "attention weights normalization")
            unnormalized_weights = tensor.nnet.sigmoid(energies)
        elif self.energy_normalizer == 'relu':
            logger.info("Using ReLU attention weights normalization")
            unnormalized_weights = tensor.maximum(energies/1000., 0.0)
        else:
            # Report the offending normalizer (the energy computer brick
            # used to be printed here by mistake).
            raise ValueError("Unknown energy_normalizer: {}"
                             .format(self.energy_normalizer))

        normalization = unnormalized_weights.sum(axis=0)
        if attended_mask is not None:
            unnormalized_weights = unnormalized_weights * attended_mask
            # If mask consists of all zeros use 1 as the normalization
            # coefficient.
            normalization = (unnormalized_weights.sum(axis=0) +
                             tensor.all(1 - attended_mask, axis=0))
        return unnormalized_weights / normalization

    @application
    def initial_glimpses(self, batch_size, attended):
        """Return the initial glimpses.

        The weighted averages are zeros, the weights and energies put all
        mass on the first position, and the step counter starts at zero.

        """
        return ([tensor.zeros((batch_size, self.attended_dim))]
            + 2 * [tensor.concatenate([
                       tensor.ones((batch_size, 1)),
                       tensor.zeros((batch_size, attended.shape[0] - 1))],
                       axis=1)]
            + [tensor.zeros((batch_size,), dtype='int64')])

    @initial_glimpses.property('outputs')
    def initial_glimpses_outputs(self):
        # Same names as the outputs of `take_glimpses`.
        return ['weighted_averages', 'weights', 'energies', 'step']

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Apply the attended transformer to the attended sequence."""
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights', 'energies', 'step']:
            return 0
        return super(SequenceContentAndConvAttention, self).get_dim(name)
Exemplo n.º 12
0
class SequenceContentAttention(GenericSequenceAttention, Initializable):
    """Attention mechanism that looks for relevant content in a sequence.

    This is the attention mechanism used in [BCB]_. The idea in a nutshell:

    1. The states and the sequence are transformed independently,

    2. The transformed states are summed with every transformed sequence
       element to obtain *match vectors*,

    3. A match vector is transformed into a single number interpreted as
       *energy*,

    4. Energies are normalized in softmax-like fashion. The resulting
       summing to one weights are called *attention weights*,

    5. Weighted average of the sequence elements with attention weights
       is computed.

    In terms of the :class:`AbstractAttention` documentation, the sequence
    is the attended. The weighted averages from 5 and the attention
    weights from 4 form the set of glimpses produced by this attention
    mechanism.

    Parameters
    ----------
    state_names : list of str
        The names of the network states.
    attended_dim : int
        The dimension of the sequence elements.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If ``None``,
        a linear transformation is used.
    attended_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If ``None`` an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If ``None``, an affine
        transformation preceded by :math:`tanh` is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [BCB] Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio. Neural
       Machine Translation by Jointly Learning to Align and Translate.

    """
    @lazy(allocation=['match_dim'])
    def __init__(self, match_dim, state_transformer=None,
                 attended_transformer=None, energy_computer=None, **kwargs):
        if not state_transformer:
            state_transformer = Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=kwargs['state_names'],
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            attended_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        # Pop (not get) the caller's children: leaving the key in ``kwargs``
        # would pass ``children`` twice below and raise a TypeError.
        children = [self.state_transformers, attended_transformer,
                    energy_computer] + kwargs.pop('children', [])
        super(SequenceContentAttention, self).__init__(children=children,
                                                       **kwargs)

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [self.match_dim
                                               for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application
    def compute_energies(self, attended, preprocessed_attended, states):
        """Compute one energy per match vector.

        ``preprocessed_attended`` is computed with :meth:`preprocess` when
        it is ``None``. ``states`` is a dict passed as keyword arguments to
        the state transformers.

        """
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        # The energy computer outputs a trailing axis of size 1; drop it.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_attended : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        attended_mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['attended', 'preprocessed_attended', 'attended_mask'] +
                self.state_names)

    @application(outputs=['weighted_averages', 'weights'])
    def initial_glimpses(self, batch_size, attended):
        """Return all-zero initial weighted averages and weights."""
        return [tensor.zeros((batch_size, self.attended_dim)),
                tensor.zeros((batch_size, attended.shape[0]))]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The attended sequence, time is the 1-st dimension.

        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights']:
            return 0
        return super(SequenceContentAttention, self).get_dim(name)
Exemplo n.º 13
0
class SequenceContentAttention(AbstractAttention, Initializable):
    """Attention mechanism that looks for relevant content in a sequence.

    This is the attention mechanism used in [BCB]_. The idea in a nutshell:

    1. The states and the sequence are transformed independently,

    2. The transformed states are summed with every transformed sequence
       element to obtain *match vectors*,

    3. A match vector is transformed into a single number interpreted as
       *energy*,

    4. Energies are normalized in softmax-like fashion. The resulting
       weights, which sum to one, are called *attention weights*,

    5. A linear combination of the sequence elements with the attention
       weights is computed.

    In terms of the :class:`AbstractAttention` documentation, the sequence
    is the attended. The linear combinations from 5 and the attention
    weights from 4 form the set of glimpses produced by this attention
    mechanism.

    Parameters
    ----------
    state_names : list of str
        The names of the network states.
    state_dims
        The dimensions of the network states. Passed unchanged to the
        ``input_dims`` of the :class:`.Parallel` state transformer.
    sequence_dim : int
        The dimension of the sequence elements.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If ``None``, the default
        transformation from :class:`.Parallel` is used.
    sequence_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If ``None`` an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If ``None``, an affine
        transformation preceded by :math:`tanh` is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [BCB] Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio. Neural
       Machine Translation by Jointly Learning to Align and Translate.

    """
    @lazy
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None,
                 **kwargs):
        super(SequenceContentAttention, self).__init__(**kwargs)
        self.state_names = state_names
        self.state_dims = state_dims
        self.sequence_dim = sequence_dim
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not sequence_transformer:
            sequence_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.sequence_transformer = sequence_transformer
        self.energy_computer = energy_computer

        self.children = [self.state_transformers, sequence_transformer,
                         energy_computer]

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks.

        Every state and the sequence are mapped to ``match_dim``; the energy
        computer maps a match vector to a single number.

        """
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = {name: self.match_dim
                                               for name in self.state_names}
        self.sequence_transformer.input_dim = self.sequence_dim
        self.sequence_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application(outputs=['glimpses', 'weights'])
    def take_glimpses(self, sequence, preprocessed_sequence=None, mask=None,
                      **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_sequence : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake. If ``None``, no masking
            is applied.
        \*\*states
            The states of the network.

        Returns
        -------
        glimpses : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        # Compare against None explicitly: the truth value of a symbolic
        # variable says nothing about whether the argument was supplied and
        # may not even be defined for it.
        if preprocessed_sequence is None:
            preprocessed_sequence = self.preprocess(sequence)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_sequence)
        # The energy computer outputs one number per match vector; the
        # reshape drops that trailing dimension.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        unormalized_weights = tensor.exp(energies)
        if mask is not None:
            unormalized_weights *= mask
        # Normalize over the first (time) axis so the weights sum to one.
        weights = unormalized_weights / unormalized_weights.sum(axis=0)
        glimpses = (tensor.shape_padright(weights) * sequence).sum(axis=0)
        return glimpses, weights.dimshuffle(1, 0)

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Return the input names of :meth:`take_glimpses`."""
        return (['sequence', 'preprocessed_sequence', 'mask'] +
                self.state_names)

    @application
    def initial_glimpses(self, name, batch_size, sequence):
        """Return an all-zero initial value for the glimpse called ``name``.

        Raises
        ------
        ValueError
            If ``name`` is neither ``"glimpses"`` nor ``"weights"``.

        """
        if name == "glimpses":
            return tensor.zeros((batch_size, self.sequence_dim))
        elif name == "weights":
            return tensor.zeros((batch_size, sequence.shape[0]))
        else:
            raise ValueError("Unknown glimpse name {}".format(name))

    @application(inputs=['sequence'], outputs=['preprocessed_sequence'])
    def preprocess(self, sequence):
        """Preprocess a sequence for computing attention weights.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.

        """
        return self.sequence_transformer.apply(sequence)

    def get_dim(self, name):
        """Return the dimension of the variable ``name``.

        Names not handled here are delegated to the parent class.

        """
        if name in ['glimpses', 'sequence', 'preprocessed_sequence']:
            return self.sequence_dim
        if name in ['mask', 'weights']:
            return 0
        return super(SequenceContentAttention, self).get_dim(name)
Exemplo n.º 14
0
    Logistic(name='output')],
                 dims=[
                     32,
                     8,
                     8,
                     2,
                 ],
                 weights_init=IsotropicGaussian(),
                 biases_init=IsotropicGaussian())

# Initialize the output network defined above.
output_mlp.initialize()

# Build a Parallel brick with two named inputs, 'l_x' and 'r_x'.
# ``input_mlp`` is passed as the prototype for both of them and each
# branch is configured with an output dimension of 16.
parallel_nets = Parallel(
    input_names=['l_x', 'r_x'],
    input_dims=[left_dim, right_dim],
    output_dims=[16, 16],
    weights_init=IsotropicGaussian(),
    biases_init=IsotropicGaussian(),
    prototype=input_mlp,
)
parallel_nets.initialize()
# One output variable is returned per named input.
l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x)

# Concatenate the outputs of the two branches along axis 1 into a single
# variable that is fed to the output network.
merge = tensor.concatenate([l_h, r_h], axis=1)

y_hat = output_mlp.apply(merge)

# Define the cost to optimize: cross-entropy between the flattened targets
# ``y`` and the network outputs ``y_hat``.
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
Exemplo n.º 15
0
],
                 dims=[
                     32,
                     8,
                     8,
                     2,
                 ],
                 weights_init=IsotropicGaussian(),
                 biases_init=IsotropicGaussian())

# Initialize the output network defined above.
output_mlp.initialize()

# Build a Parallel brick with two named inputs, 'l_x' and 'r_x'.
# ``input_mlp`` is passed as the prototype for both of them and each
# branch is configured with an output dimension of 16.
parallel_nets = Parallel(
    input_names=['l_x', 'r_x'],
    input_dims=[left_dim, right_dim],
    output_dims=[16, 16],
    weights_init=IsotropicGaussian(),
    biases_init=IsotropicGaussian(),
    prototype=input_mlp,
)
parallel_nets.initialize()
# One output variable is returned per named input.
l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x)

# Concatenate the outputs of the two branches along axis 1 into a single
# variable that is fed to the output network.
merge = tensor.concatenate([l_h, r_h], axis=1)

y_hat = output_mlp.apply(merge)

# Define the cost to optimize: cross-entropy between the flattened targets
# ``y`` and the network outputs ``y_hat``.
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
class SequenceMultiContentAttention(GenericSequenceAttention, Initializable):
    """Content-based sequence attention with a multi-weight energy computer.

    The states and the attended sequence are each transformed to
    ``match_dim``, summed into match vectors, and passed to the energy
    computer. Weights and weighted averages are then derived with
    ``compute_weights`` and ``compute_weighted_averages``, which this class
    does not define itself.

    Parameters
    ----------
    n_att_weights
        Forwarded to the default :class:`MultiShallowEnergyComputer`.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If not given, a
        :class:`Linear` brick without bias is used.
    attended_transformer : :class:`.Brick`
        The transformation applied to the attended sequence. If not given,
        a :class:`Linear` brick is used.
    energy_computer : :class:`.Brick`
        Computes energies from match vectors. If not given, a
        :class:`MultiShallowEnergyComputer` is used.

    Notes
    -----
    ``state_names``, ``state_dims`` and ``attended_dim`` are read from
    ``self`` but never assigned here, so they are expected to be set by the
    parent constructor from ``**kwargs``.

    """
    @lazy(allocation=['match_dim'])
    def __init__(self,
                 n_att_weights,
                 match_dim,
                 state_transformer=None,
                 attended_transformer=None,
                 energy_computer=None,
                 **kwargs):
        # super() must name this class: the two-argument form rejects an
        # instance that does not derive from the class passed to it.
        super(SequenceMultiContentAttention, self).__init__(**kwargs)
        self.n_att_weights = n_att_weights
        if not state_transformer:
            state_transformer = Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            attended_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = MultiShallowEnergyComputer(n_att_weights,
                                                         name="energy_comp")
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        self.children = [
            self.state_transformers, attended_transformer, energy_computer
        ]

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [
            self.match_dim for name in self.state_names
        ]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application
    def compute_energies(self, attended, preprocessed_attended, states):
        """Compute the attention energies from the match vectors.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The attended sequence.
        preprocessed_attended : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, it is computed by
            calling :meth:`preprocess`.
        states : dict
            The states of the network, keyed by name.

        """
        # Compare against None explicitly instead of relying on the truth
        # value of a symbolic variable.
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(), preprocessed_attended)
        # Reshape the energy computer output to the match vectors' shape
        # without its last dimension.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights'])
    def take_glimpses(self,
                      attended,
                      preprocessed_attended=None,
                      attended_mask=None,
                      **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_attended : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        attended_mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Return the input names of :meth:`take_glimpses`."""
        return (['attended', 'preprocessed_attended', 'attended_mask'] +
                self.state_names)

    @application(outputs=['weighted_averages', 'weights'])
    def initial_glimpses(self, batch_size, attended):
        """Return all-zero initial ``weighted_averages`` and ``weights``."""
        return [
            tensor.zeros((batch_size, self.attended_dim)),
            tensor.zeros((batch_size, attended.shape[0]))
        ]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The attended sequence, time is the 1-st dimension.

        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        """Return the dimension of the variable ``name``.

        Names not handled here are delegated to the parent class.

        """
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights']:
            return 0
        return super(SequenceMultiContentAttention, self).get_dim(name)
class CoverageContentAttention(GenericSequenceAttention, Initializable):
    """This is the 'linguistic' coverage model from Tu et al., 2016.
    The fertility of each source annotation is estimated with a linear
    transform followed by a sigmoid times N (N is the maximum fertility).
    The coverage model keeps track of the attention record for each
    annotation and feeds the cumulative record divided by the fertility
    to the match vector which eventually determines the attention
    weight.

    The code base of this implementation is close to
    ``SequenceContentAttention``.
    """
    @lazy(allocation=['match_dim'])
    def __init__(self,
                 match_dim,
                 max_fertility,
                 state_transformer=None,
                 attended_transformer=None,
                 fertility_transformer=None,
                 att_record_transformer=None,
                 energy_computer=None,
                 **kwargs):
        """Creates an attention brick with 'linguistic' coverage.
        Compare with ``SequenceContentAttention``.

        Args:
            match_dim (int): Dimensionality of the match vector
            max_fertility (float): Maximum fertility of a source
                                   annotation (N in Tu et al.). If
                                   this is set to 0 or smaller, we fix
                                   fertilities to 1 and do not estimate
                                   them.
            state_transformer (Brick): Transformation for the decoder
                                       state
            attended_transformer (Brick): Transformation for the source
                                          annotations
            fertility_transformer (Brick): Transformation which
                                           calculates fertilities
            att_record_transformer (Brick): Transformation for the
                                            attentional records
            energy_computer (Brick): Sub network for calculating the
                                     energies from the match vector
        """

        super(CoverageContentAttention, self).__init__(**kwargs)
        # Fertilities are only estimated for a strictly positive maximum;
        # the small threshold guards against float noise around zero.
        self.use_fertility = (max_fertility > 0.0001)
        self.max_fertility = max_fertility
        if not state_transformer:
            state_transformer = Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            attended_transformer = Linear(name="preprocess")

        if not att_record_transformer:
            att_record_transformer = Linear(name="att_record_trans")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.attended_transformer = attended_transformer
        self.att_record_transformer = att_record_transformer
        self.energy_computer = energy_computer

        self.children = [
            self.state_transformers, attended_transformer,
            att_record_transformer, energy_computer
        ]

        # The fertility network only exists when fertilities are estimated.
        if self.use_fertility:
            if not fertility_transformer:
                fertility_transformer = MLP(activations=[Logistic()],
                                            name='fertility_trans')
            self.fertility_transformer = fertility_transformer
            self.children.append(fertility_transformer)

    def _push_allocation_config(self):
        """Propagates the configured dimensions to the child bricks. """
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [
            self.match_dim for name in self.state_names
        ]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        # Each attention record is a single number per source annotation.
        self.att_record_transformer.input_dim = 1
        self.att_record_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1
        if self.use_fertility:
            self.fertility_transformer.dims = [self.attended_dim, 1]

    @application
    def compute_energies(self, attended, preprocessed_attended, att_records,
                         states):
        """Computes the energies from states, source annotations, and
        the attention records.

        Args:
            attended (TensorVariable): Source annotations
            preprocessed_attended (TensorVariable): Transformed source
                                                    annotations. If
                                                    ``None``, computed
                                                    with ``preprocess``
            att_records (TensorVariable): Attention records; the first
                                          two axes are swapped before
                                          the transformation
            states (dict): Decoder states, keyed by name

        Returns:
            TensorVariable. The energies.
        """
        # Compare against None explicitly instead of relying on the truth
        # value of a symbolic variable.
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        transformed_att_records = self.att_record_transformer.apply(
            att_records.dimshuffle((1, 0, 2)))
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(), preprocessed_attended)
        match_vectors = match_vectors + transformed_att_records
        # The energy computer outputs one number per match vector; the
        # reshape drops that trailing dimension.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights', 'att_records'])
    def take_glimpses(self,
                      attended,
                      preprocessed_attended=None,
                      attended_mask=None,
                      att_records=None,
                      **states):
        r"""Computes the attention weights, updates the attention
        records, and produces the weighted averages.

        Args:
            attended (TensorVariable): Source annotations
            preprocessed_attended (TensorVariable): Transformed source
                                                    annotations
            attended_mask (TensorVariable): Source mask
            att_records (TensorVariable): Attention records accumulated
                                          so far
            \*\*states: Decoder states

        Returns:
            Tuple. Weighted averages, transposed attention weights, and
            the updated attention records.
        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         att_records, states)
        weights = self.compute_weights(energies, attended_mask)
        if self.use_fertility:
            fertilities = self.max_fertility * self.fertility_transformer.apply(
                attended)
            # Theano's optimizer ensures that fertilities are computed only
            # once
            att_records = att_records + weights.dimshuffle((1, 0, 'x')) / \
                                        fertilities.dimshuffle((1, 0, 2))
        else:
            att_records = att_records + weights.dimshuffle((1, 0, 'x'))
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T, att_records

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Defines the ``inputs`` decoration for ``take_glimpses``. """
        return ([
            'attended', 'preprocessed_attended', 'attended_mask', 'att_records'
        ] + self.state_names)

    @application(outputs=['weighted_averages', 'weights', 'att_records'])
    def initial_glimpses(self, batch_size, attended):
        """All glimpses, including the attention records, are
        initialized with zero.
        """
        return [
            tensor.zeros((batch_size, self.attended_dim)),
            tensor.zeros((batch_size, attended.shape[0])),
            tensor.zeros((batch_size, attended.shape[0], 1))
        ]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Args:
            attended (TensorVariable): The attended sequence, time is
                                       the 1-st dimension.
        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        """Get dimensions of variables. Delegates to super class if
        ``name`` is not used in this class.
        """
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights', 'att_records']:
            return 0
        return super(CoverageContentAttention, self).get_dim(name)
class PushDownSequenceContentAttention(SequenceContentAttention,
                                       Initializable):
    """Adds an external memory structure in form of a neural stack to
    the decoder. The neural stack is operated through a pop operation,
    a push operation, and an input variable, which all are computed
    from the decoder state. This neural stack implementation is similar
    to Mikolov's model:

    - Apply the (continuous) pop operation if the pop gate is on
    - Read the top element on the stack
    - Push the stack input vector if the push gate is on
    - Concatenate the read element from the stack to the weighted
      averages of source annotations to obtain the final context
      vector

    Note that this implementation realizes a stack with limited depth
    because Blocks didn't allow to have glimpses of varying size. In
    practice, however, we think that a limited size is appropriate for
    machine translation.
    """
    def __init__(self, stack_dim=500, **kwargs):
        """Sole constructor.

        Args:
            stack_dim (int): Size of vectors on the stack.
            **kwargs: Forwarded to the parent constructor.
        """
        super(PushDownSequenceContentAttention, self).__init__(**kwargs)
        self.stack_dim = stack_dim
        self.max_stack_depth = 25

        # The stack operations are computed from the decoder states and
        # the weighted averages of the source annotations.
        self.stack_op_names = self.state_names + ['weighted_averages']

        self.stack_pop_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_pop_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_pop_transformer,
            name="stack_pop")

        self.stack_push_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_push_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_push_transformer,
            name="stack_push")

        self.stack_input_transformer = Linear()
        self.stack_input_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_input_transformer,
            name="stack_input")
        self.children.append(self.stack_pop_transformers)
        self.children.append(self.stack_push_transformers)
        self.children.append(self.stack_input_transformers)

    def _push_allocation_config(self):
        """Sets the dimensions of the stack operation networks """
        super(PushDownSequenceContentAttention, self)._push_allocation_config()
        self.stack_op_dims = self.state_dims + [self.attended_dim]
        n_states = len(self.stack_op_dims)
        # Pop and push gates are scalars per batch entry.
        self.stack_pop_transformers.input_dims = self.stack_op_dims
        self.stack_pop_transformers.output_dims = [1] * n_states

        self.stack_push_transformers.input_dims = self.stack_op_dims
        self.stack_push_transformers.output_dims = [1] * n_states

        self.stack_input_transformers.input_dims = self.stack_op_dims
        self.stack_input_transformers.output_dims = [self.stack_dim] * n_states

    def _allocate(self):
        """Allocates the single parameter of this brick: the initial
        element on the stack.
        """
        self.parameters.append(
            shared_floatx_nans((1, self.stack_dim), name='init_stack'))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        """Initializes the initial element on the stack using the
        ``biases_init`` initialization scheme.
        """
        self.biases_init.initialize(self.parameters[-1], self.rng)

    @application(outputs=['context_vector', 'weights', 'stack'])
    def take_glimpses(self,
                      attended,
                      preprocessed_attended=None,
                      attended_mask=None,
                      stack=None,
                      **states):
        r"""This method is an extension to ``take_glimpses`` in
        ``SequenceContentAttention``. After computing the weighted
        averages of source annotations, it operates the stack, i.e.
        pops the top element, reads out the top of the stack, and
        pushes a new element. The first glimpse ``context_vector`` is
        the concatenation of weighted source annotations and stack
        output.

        Args:
            attended (Variable): Source annotations
            preprocessed_attended (Variable): Transformed source
                                              annotations used to
                                              compute energies
            attended_mask (Variable): Source mask
            stack (Variable): Current state of the stack
            \*\*states (Variable): Decoder state

        Returns:
            Tuple. The first element is used as context vector for the
            decoder state update. ``stack`` is a recurrent glimpse which
            is used in the next ``take_glimpse`` iteration.
        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)

        # Work on a copy so that the caller's ``states`` mapping is not
        # modified when the weighted averages are added.
        stack_op_input = dict(states)
        stack_op_input['weighted_averages'] = weighted_averages

        stack_pop = sum(
            self.stack_pop_transformers.apply(as_dict=True,
                                              **stack_op_input).values())
        stack_push = sum(
            self.stack_push_transformers.apply(as_dict=True,
                                               **stack_op_input).values())
        stack_input = sum(
            self.stack_input_transformers.apply(as_dict=True,
                                                **stack_op_input).values())

        # the stack has shape (batch_size, stack_depth, stack_dim); index 0
        # along the depth axis is the top of the stack
        batch_size = stack.shape[0]
        stack_dim = stack_input.shape[1]
        default_stack_entry = tensor.repeat(self.parameters[-1][None, :, :],
                                            batch_size, 0)

        # Push: the new input becomes the top, every entry moves one
        # position down, and the bottom entry is dropped.
        pushed_stack = tensor.concatenate(
            [stack_input.reshape((batch_size, 1, stack_dim)), stack[:, :-1, :]],
            axis=1)
        # Pop: the top entry is removed, every entry moves one position up,
        # and the bottom is refilled with the default entry.
        popped_stack = tensor.concatenate(
            [stack[:, 1:, :], default_stack_entry], axis=1)
        pop_gate = stack_pop.reshape((batch_size, 1, 1))
        push_gate = stack_push.reshape((batch_size, 1, 1))
        # Continuous pop: interpolate between the popped and the unchanged
        # stack, then read the top element.
        read_stack = pop_gate * popped_stack + (1.0 - pop_gate) * stack
        stack_output = read_stack[:, 0, :]
        # NOTE(review): ``pushed_stack`` is derived from the stack before the
        # pop, so a fully open push gate discards the pop - confirm intent.
        new_stack = push_gate * pushed_stack + (1.0 - push_gate) * read_stack

        context_vector = tensor.concatenate([weighted_averages, stack_output],
                                            axis=1)
        return context_vector, weights.T, new_stack

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Defines the ``inputs`` decoration for ``take_glimpses``. """
        return (
            ['attended', 'preprocessed_attended', 'attended_mask', 'stack'] +
            self.state_names)

    @application(outputs=['context_vector', 'weights', 'stack'])
    def initial_glimpses(self, batch_size, attended):
        """The stack is initialized with the default entry. All other
        glimpses are initialized with zero.
        """
        default_stack_entry = tensor.repeat(self.parameters[-1][None, :, :],
                                            batch_size, 0)
        return [
            tensor.zeros((batch_size, self.attended_dim + self.stack_dim)),
            tensor.zeros((batch_size, attended.shape[0])),
            tensor.repeat(default_stack_entry, self.max_stack_depth, 1)
        ]

    def get_dim(self, name):
        """Get dimensions of variables. Delegates to super class if
        ``name`` is not used in this class.
        """
        if name in ['context_vector']:
            return self.attended_dim + self.stack_dim
        if name in ['weights']:
            return 0
        if name in ['stack']:
            return self.max_stack_depth, self.stack_dim
        return super(PushDownSequenceContentAttention, self).get_dim(name)