class Merger(Initializable):
    """Merge a "what" (patch) pathway with a "where" (location/scale) pathway.

    The patch is passed through ``patch_transform`` and the concatenation of
    location and scale through ``area_transform``. Both results are projected
    by bias-free linear maps onto the input dimension of
    ``response_transform``, combined according to ``whatwhere_interaction``,
    passed through a :class:`NormalizedActivation` and finally through
    ``response_transform``.

    Parameters
    ----------
    area_transform : callable
        Applied to the concatenated location and scale. Must expose a
        ``brick`` attribute with an ``output_dim``.
    patch_transform : callable
        Applied to the patch. Must expose a ``brick`` attribute with an
        ``output_dim``.
    response_transform : callable
        Applied to the merged response. Must expose a ``brick`` attribute
        with an ``input_dim``.
    n_spatial_dims : int
        Accepted for interface compatibility; not used by this brick.
    batch_normalize
        Forwarded unchanged to :class:`NormalizedActivation`.
    whatwhere_interaction : str
        ``"additive"`` (sum of the two projections) or ``"multiplicative"``
        (elementwise product of the two projections).

    """
    def __init__(self, area_transform, patch_transform, response_transform,
                 n_spatial_dims, batch_normalize,
                 whatwhere_interaction="additive", **kwargs):
        super(Merger, self).__init__(**kwargs)

        self.patch_transform = patch_transform
        self.area_transform = area_transform
        self.whatwhere_interaction = whatwhere_interaction

        # Both pathways are mapped to the same dimensionality so that they
        # can be summed or multiplied elementwise.
        self.response_merge = Parallel(
            input_names="area patch".split(),
            input_dims=[area_transform.brick.output_dim,
                        patch_transform.brick.output_dim],
            output_dims=2 * [response_transform.brick.input_dim],
            prototype=Linear(use_bias=False),
            child_prefix="response_merge")
        self.response_merge_activation = NormalizedActivation(
            shape=[response_transform.brick.input_dim],
            name="response_merge_activation",
            batch_normalize=batch_normalize)
        self.response_transform = response_transform

        self.children = [self.response_merge_activation,
                         self.response_merge,
                         patch_transform.brick,
                         area_transform.brick,
                         response_transform.brick]

    @application(inputs="patch location scale".split(),
                 outputs=['response'])
    def apply(self, patch, location, scale):
        """Compute the merged response for a patch and its location/scale.

        Parameters
        ----------
        patch
            Input to ``patch_transform``.
        location, scale
            Concatenated along axis 1 and fed to ``area_transform``.

        Returns
        -------
        response
            Output of ``response_transform`` applied to the activated merge.

        Raises
        ------
        ValueError
            If ``whatwhere_interaction`` is neither ``"additive"`` nor
            ``"multiplicative"``.

        """
        # don't backpropagate through these to avoid the model using
        # the location/scale as merely additional hidden units
        # NOTE: the gradient cut below is currently disabled.
        #location, scale = list(map(theano.gradient.disconnected_grad, (location, scale)))
        patch = self.patch_transform(patch)
        area = self.area_transform(T.concatenate([location, scale], axis=1))
        parts = self.response_merge.apply(area, patch)
        if self.whatwhere_interaction == "additive":
            response = sum(parts)
        elif self.whatwhere_interaction == "multiplicative":
            response = reduce(operator.mul, parts)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # `response` further down.
            raise ValueError("Unknown whatwhere_interaction: {!r}"
                             .format(self.whatwhere_interaction))
        response = self.response_merge_activation.apply(response)
        response = self.response_transform(response)
        return response
class Merger(Initializable):
    """Combine a patch ("what") branch with a location/scale ("where") branch.

    ``patch_transform`` processes the patch and ``area_transform`` processes
    the concatenated location and scale. Each branch output is then mapped by
    a bias-free :class:`Linear` (inside a :class:`Parallel`) to
    ``response_transform.brick.input_dim``; the two maps are merged as
    selected by ``whatwhere_interaction``, activated by a
    :class:`NormalizedActivation`, and fed to ``response_transform``.

    Parameters
    ----------
    area_transform : callable
        Transformation of the concatenated location and scale; its ``brick``
        attribute provides ``output_dim``.
    patch_transform : callable
        Transformation of the patch; its ``brick`` attribute provides
        ``output_dim``.
    response_transform : callable
        Transformation of the merged response; its ``brick`` attribute
        provides ``input_dim``.
    n_spatial_dims : int
        Not used by this brick; kept so existing callers keep working.
    batch_normalize
        Passed through to :class:`NormalizedActivation`.
    whatwhere_interaction : str
        Either ``"additive"`` or ``"multiplicative"``.

    """
    def __init__(self, area_transform, patch_transform, response_transform,
                 n_spatial_dims, batch_normalize,
                 whatwhere_interaction="additive", **kwargs):
        super(Merger, self).__init__(**kwargs)

        self.patch_transform = patch_transform
        self.area_transform = area_transform
        self.whatwhere_interaction = whatwhere_interaction

        # Project both branches to a common size so they can be merged
        # elementwise.
        self.response_merge = Parallel(
            input_names="area patch".split(),
            input_dims=[area_transform.brick.output_dim,
                        patch_transform.brick.output_dim],
            output_dims=2*[response_transform.brick.input_dim],
            prototype=Linear(use_bias=False),
            child_prefix="response_merge")
        self.response_merge_activation = NormalizedActivation(
            shape=[response_transform.brick.input_dim],
            name="response_merge_activation",
            batch_normalize=batch_normalize)
        self.response_transform = response_transform

        self.children = [self.response_merge_activation,
                         self.response_merge,
                         patch_transform.brick,
                         area_transform.brick,
                         response_transform.brick]

    @application(inputs="patch location scale".split(),
                 outputs=['response'])
    def apply(self, patch, location, scale):
        """Return the response for the given patch, location and scale.

        Raises
        ------
        ValueError
            If ``whatwhere_interaction`` is not one of ``"additive"`` or
            ``"multiplicative"``.

        """
        # don't backpropagate through these to avoid the model using
        # the location/scale as merely additional hidden units
        # NOTE: the gradient cut below is currently disabled.
        #location, scale = list(map(theano.gradient.disconnected_grad, (location, scale)))
        patch = self.patch_transform(patch)
        area = self.area_transform(T.concatenate([location, scale], axis=1))
        parts = self.response_merge.apply(area, patch)
        if self.whatwhere_interaction == "additive":
            response = sum(parts)
        elif self.whatwhere_interaction == "multiplicative":
            response = reduce(operator.mul, parts)
        else:
            # Without this branch `response` would be unbound below.
            raise ValueError("Unknown whatwhere_interaction: {!r}"
                             .format(self.whatwhere_interaction))
        response = self.response_merge_activation.apply(response)
        response = self.response_transform(response)
        return response
class SequenceContentAndConvAttention(GenericSequenceAttention, Initializable):
    """Content-based attention with a convolution over the previous weights.

    Match vectors are the sum of the transformed states, the preprocessed
    attended sequence and a linear map of a 1D convolution applied to the
    attention weights of the previous step. Energies computed from the match
    vectors are normalized with a selectable function, and attention is
    restricted to a window of the sequence described by ``prior``.

    Parameters
    ----------
    match_dim : int
        The dimension of the match vector.
    conv_n : int
        Half-width of the convolution filter; the filter length is
        ``2 * conv_n + 1``.
    conv_num_filters : int
        Number of convolution filters.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If not given, a linear
        transformation without bias is used.
    attended_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If not given, an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If not given, a
        :class:`ShallowEnergyComputer` is used; it has a bias only when the
        normalizer is not ``'softmax'``.
    prior : dict
        Window specification. ``prior['type']`` is ``'expanding'`` (keys
        ``initial_begin``, ``initial_end``, ``min_speed``, ``max_speed``),
        ``'window_around_mean'`` or ``'window_around_median'`` (keys
        ``before``, ``after``). If not given, an expanding window covering
        positions 0 to 10000 with zero speed is used.
    energy_normalizer : str
        ``'softmax'`` (default), ``'logistic'`` or ``'relu'``.

    """
    @lazy()
    def __init__(self, match_dim, conv_n, conv_num_filters=1,
                 state_transformer=None,
                 attended_transformer=None, energy_computer=None,
                 prior=None, energy_normalizer=None, **kwargs):
        super(SequenceContentAndConvAttention, self).__init__(**kwargs)
        if not state_transformer:
            state_transformer = Linear(use_bias=False)

        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            # Only this contributor to the match vector
            # is allowed to have biases
            attended_transformer = Linear(name="preprocess")

        if not energy_normalizer:
            energy_normalizer = 'softmax'
        self.energy_normalizer = energy_normalizer

        if not energy_computer:
            energy_computer = ShallowEnergyComputer(
                name="energy_comp",
                use_bias=self.energy_normalizer != 'softmax')
        self.filter_handler = Linear(name="handler", use_bias=False)
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        if not prior:
            prior = dict(type='expanding', initial_begin=0, initial_end=10000,
                         min_speed=0, max_speed=0)
        self.prior = prior

        self.conv_n = conv_n
        self.conv_num_filters = conv_num_filters
        self.conv = Conv1D(conv_num_filters, 2 * conv_n + 1)

        self.children = [self.state_transformers, self.attended_transformer,
                         self.energy_computer, self.filter_handler, self.conv]

    def _push_allocation_config(self):
        """Propagate dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [self.match_dim
                                               for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1
        self.filter_handler.input_dim = self.conv_num_filters
        self.filter_handler.output_dim = self.match_dim

    @application
    def compute_energies(self, attended, preprocessed_attended,
                         previous_weights, states):
        """Compute energies from content and from the previous weights.

        Parameters
        ----------
        attended
            The attended sequence; only used when ``preprocessed_attended``
            is ``None``.
        preprocessed_attended
            Result of :meth:`preprocess`, or ``None`` to compute it here.
        previous_weights
            Attention weights of the previous step, fed to the convolution.
        states : dict
            The states, keyed by name.

        """
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        conv_result = self.conv.apply(previous_weights)
        # Drop conv_n positions on both ends of the convolution output, then
        # move the filter axis last for the linear map and put time first.
        match_vectors += self.filter_handler.apply(
            conv_result[:, :, self.conv_n:-self.conv_n]
            .dimshuffle(0, 2, 1)).dimshuffle(1, 0, 2)
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @staticmethod
    def mask_row(offset, length, empty_row):
        """Return ``empty_row`` with ``[offset:offset+length]`` set to 1."""
        return tensor.set_subtensor(empty_row[offset:offset+length], 1)

    @application(outputs=['weighted_averages', 'weights', 'energies', 'step'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, weights=None, step=None, **states):
        """Compute glimpses inside the window selected by the prior.

        The attended sequence (time is the first axis) is cut to
        ``[begin:end]``, attention is computed on the cut, and the resulting
        weights and energies are pasted back into zero arrays of full length.

        Parameters
        ----------
        attended
            The attended sequence.
        preprocessed_attended
            The preprocessed sequence, or ``None``.
        attended_mask
            A 0/1 mask, or ``None`` when all elements are valid.
        weights
            Attention weights of the previous step (batch first).
        step
            Step counter; its first element drives the expanding window.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages, weights, energies, step
            The new glimpses; ``step`` is incremented by one.

        """
        # Cut the considered window.
        p = self.prior
        length = attended.shape[0]
        prior_type = p.get('type', 'expanding')
        if prior_type == 'expanding':
            begin = p['initial_begin'] + step[0] * p['min_speed']
            end = p['initial_end'] + step[0] * p['max_speed']
            begin = tensor.maximum(0, tensor.minimum(length - 1, begin))
            end = tensor.maximum(0, tensor.minimum(length, end))
            additional_mask = None
        elif prior_type.startswith('window_around'):
            # check whether we want the mean or median!
            if prior_type == 'window_around_mean':
                position_in_attended = tensor.arange(
                    length, dtype=floatX)[None, :]
                expected_last_source_pos = (
                    weights * position_in_attended).sum(axis=1)
            elif prior_type == 'window_around_median':
                # First position at which the cumulative weight reaches 0.5.
                ali_to_05 = tensor.extra_ops.cumsum(weights, axis=1) - 0.5
                ali_to_05 = (ali_to_05 >= 0)
                ali_median_pos = ali_to_05[:, 1:] - ali_to_05[:, :-1]
                expected_last_source_pos = tensor.argmax(ali_median_pos,
                                                         axis=1)
                expected_last_source_pos = theano.gradient.disconnected_grad(
                    expected_last_source_pos)
            else:
                raise ValueError(
                    "Unknown prior type: {}".format(prior_type))
            # the window taken around each element
            begins = tensor.floor(expected_last_source_pos - p['before'])
            ends = tensor.ceil(expected_last_source_pos + p['after'])
            # the global window to optimize computations
            begin = tensor.maximum(0, begins.min()).astype('int64')
            end = tensor.minimum(length, ends.max()).astype('int64')
            # the new mask, already cut to begin:end
            position_in_attended_cut = tensor.arange(
                begin * 1., end * 1., 1., dtype=floatX)[None, :]
            additional_mask = (
                (position_in_attended_cut > begins[:, None]) *
                (position_in_attended_cut < ends[:, None]))
        else:
            raise Exception("Unknown prior type: %s" % (prior_type,))
        begin = tensor.floor(begin).astype('int64')
        end = tensor.ceil(end).astype('int64')
        attended_cut = attended[begin:end]
        preprocessed_attended_cut = (preprocessed_attended[begin:end]
                                     if preprocessed_attended is not None
                                     else None)
        if attended_mask is not None:
            attended_mask_cut = (
                attended_mask[begin:end] *
                (additional_mask.T if additional_mask is not None else 1))
        elif additional_mask is not None:
            # No data mask was given: the window mask is the only mask.
            attended_mask_cut = additional_mask.T
        else:
            attended_mask_cut = None
        weights_cut = weights[:, begin:end]

        # Call
        energies_cut = self.compute_energies(attended_cut,
                                             preprocessed_attended_cut,
                                             weights_cut, states)
        weights_cut = self.compute_weights(energies_cut, attended_mask_cut)
        weighted_averages = self.compute_weighted_averages(weights_cut,
                                                           attended_cut)

        # Paste
        new_weights = new_energies = tensor.zeros_like(weights.T)
        new_weights = tensor.set_subtensor(new_weights[begin:end],
                                           weights_cut)
        new_energies = tensor.set_subtensor(new_energies[begin:end],
                                            energies_cut)

        return weighted_averages, new_weights.T, new_energies.T, step + 1

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['attended', 'preprocessed_attended', 'attended_mask',
                 'weights', 'step'] + self.state_names)

    @application
    def compute_weights(self, energies, attended_mask):
        """Normalize energies into attention weights along axis 0.

        Parameters
        ----------
        energies
            The energies to normalize.
        attended_mask
            A 0/1 mask multiplied into the unnormalized weights, or ``None``.

        Raises
        ------
        Exception
            If ``energy_normalizer`` is not a known normalizer name.

        """
        if self.energy_normalizer == 'softmax':
            logger.info("Using softmax attention weights normalization")
            # Subtracting the maximum keeps exp() from overflowing.
            energies = energies - energies.max(axis=0)
            unnormalized_weights = tensor.exp(energies)
        elif self.energy_normalizer == 'logistic':
            logger.info("Using smoothfocus (logistic sigm) "
                        "attention weights normalization")
            unnormalized_weights = tensor.nnet.sigmoid(energies)
        elif self.energy_normalizer == 'relu':
            logger.info("Using ReLU attention weights normalization")
            unnormalized_weights = tensor.maximum(energies/1000., 0.0)
        else:
            raise Exception("Unknown energy_normalizer: {}"
                            .format(self.energy_normalizer))
        normalization = unnormalized_weights.sum(axis=0)
        if attended_mask is not None:
            unnormalized_weights *= attended_mask
            # If mask consists of all zeros use 1 as the normalization
            # coefficient
            normalization = (unnormalized_weights.sum(axis=0) +
                             tensor.all(1 - attended_mask, axis=0))
        return unnormalized_weights / normalization

    @application
    def initial_glimpses(self, batch_size, attended):
        """Return initial glimpses.

        The weighted averages are zeros, the weights and the energies put
        all mass on the first position, and the step counter is zero.

        """
        return ([tensor.zeros((batch_size, self.attended_dim))] +
                2 * [tensor.concatenate([
                    tensor.ones((batch_size, 1)),
                    tensor.zeros((batch_size, attended.shape[0] - 1))],
                    axis=1)] +
                [tensor.zeros((batch_size,), dtype='int64')])

    @initial_glimpses.property('outputs')
    def initial_glimpses_outputs(self):
        # Names match the outputs of take_glimpses.
        return ['weighted_averages', 'weights', 'energies', 'step']

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Apply the attended transformer to the sequence."""
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights', 'energies', 'step']:
            return 0
        return super(SequenceContentAndConvAttention, self).get_dim(name)
class SequenceContentAttention(GenericSequenceAttention, Initializable):
    """Attention mechanism that looks for relevant content in a sequence.

    This is the attention mechanism used in [BCB]_. The idea in a nutshell:

    1. The states and the sequence are transformed independently,

    2. The transformed states are summed with every transformed sequence
       element to obtain *match vectors*,

    3. A match vector is transformed into a single number interpreted as
       *energy*,

    4. Energies are normalized in softmax-like fashion. The resulting
       summing to one weights are called *attention weights*,

    5. Weighted average of the sequence elements with attention weights
       is computed.

    In terms of the :class:`AbstractAttention` documentation, the sequence
    is the attended. The weighted averages from 5 and the attention
    weights from 4 form the set of glimpses produced by this attention
    mechanism.

    Parameters
    ----------
    state_names : list of str
        The names of the network states.
    attended_dim : int
        The dimension of the sequence elements.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If ``None``,
        a linear transformation is used.
    attended_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If ``None`` an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If ``None``, an affine
        transformation preceded by :math:`tanh` is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [BCB] Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio. Neural
       Machine Translation by Jointly Learning to Align and Translate.

    """
    @lazy(allocation=['match_dim'])
    def __init__(self, match_dim, state_transformer=None,
                 attended_transformer=None, energy_computer=None, **kwargs):
        if not state_transformer:
            state_transformer = Linear(use_bias=False)
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=kwargs['state_names'],
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            attended_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        # Pop (rather than get) caller-supplied children: leaving the key in
        # kwargs would pass `children` twice to the parent constructor.
        children = [self.state_transformers, attended_transformer,
                    energy_computer] + kwargs.pop('children', [])
        super(SequenceContentAttention, self).__init__(children=children,
                                                       **kwargs)

    def _push_allocation_config(self):
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [self.match_dim
                                               for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application
    def compute_energies(self, attended, preprocessed_attended, states):
        """Compute one energy per sequence element from the match vectors.

        Parameters
        ----------
        attended
            The attended sequence; only used when ``preprocessed_attended``
            is not provided.
        preprocessed_attended
            Result of :meth:`preprocess`, or ``None`` to compute it here.
        states : dict
            The states, keyed by name.

        """
        if not preprocessed_attended:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        # The energy computer outputs a trailing axis of size 1; drop it.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_attended : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        attended_mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['attended', 'preprocessed_attended', 'attended_mask'] +
                self.state_names)

    @application(outputs=['weighted_averages', 'weights'])
    def initial_glimpses(self, batch_size, attended):
        """Return all-zero initial weighted averages and weights."""
        return [tensor.zeros((batch_size, self.attended_dim)),
                tensor.zeros((batch_size, attended.shape[0]))]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The attended sequence, time is the 1-st dimension.

        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights']:
            return 0
        return super(SequenceContentAttention, self).get_dim(name)
class SequenceContentAttention(AbstractAttention, Initializable):
    """Attention mechanism that looks for relevant content in a sequence.

    This is the attention mechanism used in [BCB]_. The idea in a nutshell:

    1. The states and the sequence are transformed independently,

    2. The transformed states are summed with every transformed sequence
       element to obtain *match vectors*,

    3. A match vector is transformed into a single number interpreted as
       *energy*,

    4. Energies are normalized in softmax-like fashion. The resulting
       summing to one weights are called *attention weights*,

    5. Linear combination of the sequence elements with attention weights
       is computed.

    In terms of the :class:`AbstractAttention` documentation, the sequence
    is the attended. The linear combinations from 5 and the attention
    weights from 4 form the set of glimpses produced by this attention
    mechanism.

    Parameters
    ----------
    state_names : list of str
        The names of the network states.
    state_dims
        The dimensions of the states; pushed to the state transformers as
        their input dimensions.
    sequence_dim : int
        The dimension of the sequence elements.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If ``None``, the default
        transformation from :class:`.Parallel` is used.
    sequence_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If ``None`` an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If ``None``, an affine
        transformation preceded by :math:`tanh` is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [BCB] Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio. Neural
       Machine Translation by Jointly Learning to Align and Translate.

    """
    @lazy
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None, **kwargs):
        super(SequenceContentAttention, self).__init__(**kwargs)
        self.state_names = state_names
        self.state_dims = state_dims
        self.sequence_dim = sequence_dim
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not sequence_transformer:
            sequence_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.sequence_transformer = sequence_transformer
        self.energy_computer = energy_computer

        self.children = [self.state_transformers, sequence_transformer,
                         energy_computer]

    def _push_allocation_config(self):
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = {name: self.match_dim
                                               for name in self.state_names}
        self.sequence_transformer.input_dim = self.sequence_dim
        self.sequence_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application(outputs=['glimpses', 'weights'])
    def take_glimpses(self, sequence, preprocessed_sequence=None, mask=None,
                      **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_sequence : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the network.

        Returns
        -------
        glimpses : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        if not preprocessed_sequence:
            preprocessed_sequence = self.preprocess(sequence)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_sequence)
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        # Subtracting the per-column maximum leaves the normalized weights
        # unchanged but keeps exp() from overflowing on large energies.
        unormalized_weights = tensor.exp(energies - energies.max(axis=0))
        normalization = unormalized_weights.sum(axis=0)
        if mask is not None:
            unormalized_weights *= mask
            # If mask consists of all zeros use 1 as the normalization
            # coefficient, avoiding a 0/0 division.
            normalization = (unormalized_weights.sum(axis=0) +
                             tensor.all(1 - mask, axis=0))
        weights = unormalized_weights / normalization
        glimpses = (tensor.shape_padright(weights) * sequence).sum(axis=0)
        return glimpses, weights.dimshuffle(1, 0)

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['sequence', 'preprocessed_sequence', 'mask'] +
                self.state_names)

    @application
    def initial_glimpses(self, name, batch_size, sequence):
        """Return an all-zero initial value for the glimpse called `name`.

        Raises
        ------
        ValueError
            If `name` is neither ``"glimpses"`` nor ``"weights"``.

        """
        if name == "glimpses":
            return tensor.zeros((batch_size, self.sequence_dim))
        elif name == "weights":
            return tensor.zeros((batch_size, sequence.shape[0]))
        else:
            raise ValueError("Unknown glimpse name {}".format(name))

    @application(inputs=['sequence'], outputs=['preprocessed_sequence'])
    def preprocess(self, sequence):
        """Preprocess a sequence for computing attention weights.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.

        """
        return self.sequence_transformer.apply(sequence)

    def get_dim(self, name):
        if name in ['glimpses', 'sequence', 'preprocessed_sequence']:
            return self.sequence_dim
        if name in ['mask', 'weights']:
            return 0
        return super(SequenceContentAttention, self).get_dim(name)
class SequenceContentAttention(AbstractAttention, Initializable):
    """Attention mechanism that looks for relevant content in a sequence.

    This is the attention mechanism used in [BCB]_. The idea in a nutshell:

    1. The states and the sequence are transformed independently,

    2. The transformed states are summed with every transformed sequence
       element to obtain *match vectors*,

    3. A match vector is transformed into a single number interpreted as
       *energy*,

    4. Energies are normalized in softmax-like fashion. The resulting
       summing to one weights are called *attention weights*,

    5. Linear combination of the sequence elements with attention weights
       is computed.

    In terms of the :class:`AbstractAttention` documentation, the sequence
    is the attended. The linear combinations from 5 and the attention
    weights from 4 form the set of glimpses produced by this attention
    mechanism.

    Parameters
    ----------
    state_names : list of str
        The names of the agent states.
    state_dims
        The dimensions of the states; pushed to the state transformers as
        their input dimensions.
    sequence_dim : int
        The dimension of the sequence elements.
    match_dim : int
        The dimension of the match vector.
    state_transformer : :class:`.Brick`
        A prototype for state transformations. If ``None``, the default
        transformation from :class:`.Parallel` is used.
    sequence_transformer : :class:`.Feedforward`
        The transformation to be applied to the sequence. If ``None`` an
        affine transformation is used.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector. If ``None``, an affine
        transformation preceded by :math:`tanh` is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [BCB] Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio. Neural
       Machine Translation by Jointly Learning to Align and Translate.

    """
    @lazy
    def __init__(self, state_names, state_dims, sequence_dim, match_dim,
                 state_transformer=None, sequence_transformer=None,
                 energy_computer=None, **kwargs):
        super(SequenceContentAttention, self).__init__(**kwargs)
        self.state_names = state_names
        self.state_dims = state_dims
        self.sequence_dim = sequence_dim
        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not sequence_transformer:
            sequence_transformer = Linear(name="preprocess")
        if not energy_computer:
            energy_computer = ShallowEnergyComputer(name="energy_comp")
        self.sequence_transformer = sequence_transformer
        self.energy_computer = energy_computer

        self.children = [
            self.state_transformers, sequence_transformer, energy_computer
        ]

    def _push_allocation_config(self):
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = {
            name: self.match_dim
            for name in self.state_names
        }
        self.sequence_transformer.input_dim = self.sequence_dim
        self.sequence_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application(outputs=['glimpses', 'weights'])
    def take_glimpses(self, sequence, preprocessed_sequence=None, mask=None,
                      **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_sequence : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the agent.

        Returns
        -------
        glimpses : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        if not preprocessed_sequence:
            preprocessed_sequence = self.preprocess(sequence)
        transformed_states = self.state_transformers.apply(return_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_sequence)
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        # Subtracting the per-column maximum leaves the normalized weights
        # unchanged but keeps exp() from overflowing on large energies.
        unormalized_weights = tensor.exp(energies - energies.max(axis=0))
        normalization = unormalized_weights.sum(axis=0)
        if mask is not None:
            unormalized_weights *= mask
            # If mask consists of all zeros use 1 as the normalization
            # coefficient, avoiding a 0/0 division.
            normalization = (unormalized_weights.sum(axis=0) +
                             tensor.all(1 - mask, axis=0))
        weights = unormalized_weights / normalization
        glimpses = (tensor.shape_padright(weights) * sequence).sum(axis=0)
        return glimpses, weights.dimshuffle(1, 0)

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return (['sequence', 'preprocessed_sequence', 'mask'] +
                self.state_names)

    @application
    def initial_glimpses(self, name, batch_size, sequence):
        """Return an all-zero initial value for the glimpse called `name`.

        Raises
        ------
        ValueError
            If `name` is neither ``"glimpses"`` nor ``"weights"``.

        """
        if name == "glimpses":
            return tensor.zeros((batch_size, self.sequence_dim))
        elif name == "weights":
            return tensor.zeros((batch_size, sequence.shape[0]))
        else:
            raise ValueError("Unknown glimpse name {}".format(name))

    @application(inputs=['sequence'], outputs=['preprocessed_sequence'])
    def preprocess(self, sequence):
        """Preprocess a sequence for computing attention weights.

        Parameters
        ----------
        sequence : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.

        """
        return self.sequence_transformer.apply(sequence)

    def get_dim(self, name):
        if name in ['glimpses', 'sequence', 'preprocessed_sequence']:
            return self.sequence_dim
        if name in ['mask', 'weights']:
            return 0
        return super(SequenceContentAttention, self).get_dim(name)
class SequenceContentAndConvAttention(GenericSequenceAttention, Initializable):
    """Content attention augmented with features of the previous weights.

    The match vector for each sequence element sums three terms: the
    transformed states, the preprocessed attended sequence, and a bias-free
    linear map of a 1D convolution over the previous attention weights.
    Energies are normalized by the function named in ``energy_normalizer``
    and attention is limited to the window of the sequence that ``prior``
    describes.

    Parameters
    ----------
    match_dim : int
        The dimension of the match vector.
    conv_n : int
        Half-width of the convolution filter (filter length is
        ``2 * conv_n + 1``).
    conv_num_filters : int
        Number of convolution filters.
    state_transformer : :class:`.Brick`
        Prototype for the state transformations; defaults to a linear
        transformation without bias.
    attended_transformer : :class:`.Feedforward`
        Transformation applied to the sequence; defaults to an affine
        transformation.
    energy_computer : :class:`.Feedforward`
        Computes energy from the match vector; defaults to a
        :class:`ShallowEnergyComputer` whose bias is enabled only when the
        normalizer is not ``'softmax'``.
    prior : dict
        Window specification. ``prior['type']`` may be ``'expanding'``
        (with ``initial_begin``, ``initial_end``, ``min_speed``,
        ``max_speed``), ``'window_around_mean'`` or
        ``'window_around_median'`` (with ``before`` and ``after``). Defaults
        to an expanding window from 0 to 10000 with zero speed.
    energy_normalizer : str
        ``'softmax'`` (default), ``'logistic'`` or ``'relu'``.

    """
    @lazy()
    def __init__(self, match_dim, conv_n, conv_num_filters=1,
                 state_transformer=None,
                 attended_transformer=None, energy_computer=None,
                 prior=None, energy_normalizer=None, **kwargs):
        super(SequenceContentAndConvAttention, self).__init__(**kwargs)
        if not state_transformer:
            state_transformer = Linear(use_bias=False)

        self.match_dim = match_dim
        self.state_transformer = state_transformer

        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        if not attended_transformer:
            # Only this contributor to the match vector
            # is allowed to have biases
            attended_transformer = Linear(name="preprocess")

        if not energy_normalizer:
            energy_normalizer = 'softmax'
        self.energy_normalizer = energy_normalizer

        if not energy_computer:
            energy_computer = ShallowEnergyComputer(
                name="energy_comp",
                use_bias=self.energy_normalizer != 'softmax')
        self.filter_handler = Linear(name="handler", use_bias=False)
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        if not prior:
            prior = dict(type='expanding', initial_begin=0, initial_end=10000,
                         min_speed=0, max_speed=0)
        self.prior = prior

        self.conv_n = conv_n
        self.conv_num_filters = conv_num_filters
        self.conv = Conv1D(conv_num_filters, 2 * conv_n + 1)

        self.children = [
            self.state_transformers, self.attended_transformer,
            self.energy_computer, self.filter_handler, self.conv
        ]

    def _push_allocation_config(self):
        """Propagate dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [
            self.match_dim for name in self.state_names
        ]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1
        self.filter_handler.input_dim = self.conv_num_filters
        self.filter_handler.output_dim = self.match_dim

    @application
    def compute_energies(self, attended, preprocessed_attended,
                         previous_weights, states):
        """Compute energies from content and from the previous weights.

        Parameters
        ----------
        attended
            The attended sequence; only used when ``preprocessed_attended``
            is ``None``.
        preprocessed_attended
            Result of :meth:`preprocess`, or ``None`` to compute it here.
        previous_weights
            Attention weights of the previous step, fed to the convolution.
        states : dict
            The states, keyed by name.

        """
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        conv_result = self.conv.apply(previous_weights)
        # Drop conv_n positions on both ends of the convolution output, then
        # move the filter axis last for the linear map and put time first.
        match_vectors += self.filter_handler.apply(
            conv_result[:, :, self.conv_n:-self.conv_n].dimshuffle(
                0, 2, 1)).dimshuffle(1, 0, 2)
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @staticmethod
    def mask_row(offset, length, empty_row):
        """Return ``empty_row`` with ``[offset:offset + length]`` set to 1."""
        return tensor.set_subtensor(empty_row[offset:offset + length], 1)

    @application(outputs=['weighted_averages', 'weights', 'energies', 'step'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, weights=None, step=None, **states):
        """Compute glimpses inside the window selected by the prior.

        The attended sequence (time is the first axis) is cut to
        ``[begin:end]``, attention is computed on the cut, and the resulting
        weights and energies are pasted back into zero arrays of full length.

        Parameters
        ----------
        attended
            The attended sequence.
        preprocessed_attended
            The preprocessed sequence, or ``None``.
        attended_mask
            A 0/1 mask, or ``None`` when all elements are valid.
        weights
            Attention weights of the previous step (batch first).
        step
            Step counter; its first element drives the expanding window.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages, weights, energies, step
            The new glimpses; ``step`` is incremented by one.

        """
        # Cut the considered window.
        p = self.prior
        length = attended.shape[0]
        prior_type = p.get('type', 'expanding')
        if prior_type == 'expanding':
            begin = p['initial_begin'] + step[0] * p['min_speed']
            end = p['initial_end'] + step[0] * p['max_speed']
            begin = tensor.maximum(0, tensor.minimum(length - 1, begin))
            end = tensor.maximum(0, tensor.minimum(length, end))
            additional_mask = None
        elif prior_type.startswith('window_around'):
            # check whether we want the mean or median!
            if prior_type == 'window_around_mean':
                position_in_attended = tensor.arange(
                    length, dtype=floatX)[None, :]
                expected_last_source_pos = (
                    weights * position_in_attended).sum(axis=1)
            elif prior_type == 'window_around_median':
                # First position at which the cumulative weight reaches 0.5.
                ali_to_05 = tensor.extra_ops.cumsum(weights, axis=1) - 0.5
                ali_to_05 = (ali_to_05 >= 0)
                ali_median_pos = ali_to_05[:, 1:] - ali_to_05[:, :-1]
                expected_last_source_pos = tensor.argmax(ali_median_pos,
                                                         axis=1)
                expected_last_source_pos = theano.gradient.disconnected_grad(
                    expected_last_source_pos)
            else:
                raise ValueError(
                    "Unknown prior type: {}".format(prior_type))
            # the window taken around each element
            begins = tensor.floor(expected_last_source_pos - p['before'])
            ends = tensor.ceil(expected_last_source_pos + p['after'])
            # the global window to optimize computations
            begin = tensor.maximum(0, begins.min()).astype('int64')
            end = tensor.minimum(length, ends.max()).astype('int64')
            # the new mask, already cut to begin:end
            position_in_attended_cut = tensor.arange(
                begin * 1., end * 1., 1., dtype=floatX)[None, :]
            additional_mask = (
                (position_in_attended_cut > begins[:, None]) *
                (position_in_attended_cut < ends[:, None]))
        else:
            raise Exception("Unknown prior type: %s" % (prior_type,))
        begin = tensor.floor(begin).astype('int64')
        end = tensor.ceil(end).astype('int64')
        attended_cut = attended[begin:end]
        preprocessed_attended_cut = (preprocessed_attended[begin:end]
                                     if preprocessed_attended is not None
                                     else None)
        if attended_mask is not None:
            attended_mask_cut = (
                attended_mask[begin:end] *
                (additional_mask.T if additional_mask is not None else 1))
        elif additional_mask is not None:
            # No data mask was given: the window mask is the only mask.
            attended_mask_cut = additional_mask.T
        else:
            attended_mask_cut = None
        weights_cut = weights[:, begin:end]

        # Call
        energies_cut = self.compute_energies(attended_cut,
                                             preprocessed_attended_cut,
                                             weights_cut, states)
        weights_cut = self.compute_weights(energies_cut, attended_mask_cut)
        weighted_averages = self.compute_weighted_averages(
            weights_cut, attended_cut)

        # Paste
        new_weights = new_energies = tensor.zeros_like(weights.T)
        new_weights = tensor.set_subtensor(new_weights[begin:end],
                                           weights_cut)
        new_energies = tensor.set_subtensor(new_energies[begin:end],
                                            energies_cut)

        return weighted_averages, new_weights.T, new_energies.T, step + 1

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        return ([
            'attended', 'preprocessed_attended', 'attended_mask', 'weights',
            'step'
        ] + self.state_names)

    @application
    def compute_weights(self, energies, attended_mask):
        """Normalize energies into attention weights along axis 0.

        Parameters
        ----------
        energies
            The energies to normalize.
        attended_mask
            A 0/1 mask multiplied into the unnormalized weights, or ``None``.

        Raises
        ------
        Exception
            If ``energy_normalizer`` is not a known normalizer name.

        """
        if self.energy_normalizer == 'softmax':
            logger.info("Using softmax attention weights normalization")
            # Subtracting the maximum keeps exp() from overflowing.
            energies = energies - energies.max(axis=0)
            unnormalized_weights = tensor.exp(energies)
        elif self.energy_normalizer == 'logistic':
            logger.info("Using smoothfocus (logistic sigm) "
                        "attention weights normalization")
            unnormalized_weights = tensor.nnet.sigmoid(energies)
        elif self.energy_normalizer == 'relu':
            logger.info("Using ReLU attention weights normalization")
            unnormalized_weights = tensor.maximum(energies / 1000., 0.0)
        else:
            raise Exception("Unknown energy_normalizer: {}".format(
                self.energy_normalizer))
        normalization = unnormalized_weights.sum(axis=0)
        if attended_mask is not None:
            unnormalized_weights *= attended_mask
            # If mask consists of all zeros use 1 as the normalization
            # coefficient
            normalization = (unnormalized_weights.sum(axis=0) +
                             tensor.all(1 - attended_mask, axis=0))
        return unnormalized_weights / normalization

    @application
    def initial_glimpses(self, batch_size, attended):
        """Return initial glimpses.

        The weighted averages are zeros, the weights and the energies put
        all mass on the first position, and the step counter is zero.

        """
        return ([tensor.zeros((batch_size, self.attended_dim))] + 2 * [
            tensor.concatenate([
                tensor.ones((batch_size, 1)),
                tensor.zeros((batch_size, attended.shape[0] - 1))
            ], axis=1)
        ] + [tensor.zeros((batch_size, ), dtype='int64')])

    @initial_glimpses.property('outputs')
    def initial_glimpses_outputs(self):
        # Names match the outputs of take_glimpses.
        return ['weighted_averages', 'weights', 'energies', 'step']

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Apply the attended transformer to the sequence."""
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights', 'energies', 'step']:
            return 0
        return super(SequenceContentAndConvAttention, self).get_dim(name)
], weights_init=IsotropicGaussian(), biases_init=IsotropicGaussian()) output_mlp.initialize() parallel_nets = Parallel( input_names=['l_x', 'r_x'], input_dims=[left_dim, right_dim], output_dims=[16, 16], weights_init=IsotropicGaussian(), biases_init=IsotropicGaussian(), prototype=input_mlp, ) parallel_nets.initialize() l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x) # Concatenate the inputs from the two hidden subnets into a single variable # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) = 'error' # Need to define the computation graph: graph = ComputationGraph(cost)
class SequenceMultiContentAttention(GenericSequenceAttention, Initializable):
    """Content-based sequence attention with a multi-weight energy computer.

    The states and the attended sequence are each mapped to ``match_dim``
    dimensions, summed into match vectors and handed to the energy
    computer. Unless one is supplied, the energy computer is a
    ``MultiShallowEnergyComputer`` constructed with ``n_att_weights``.

    Parameters
    ----------
    n_att_weights : int
        Passed to the default ``MultiShallowEnergyComputer``.
    match_dim : int
        Dimensionality of the match vectors.
    state_transformer : brick, optional
        Prototype used to transform every state. Defaults to a bias-free
        ``Linear``.
    attended_transformer : brick, optional
        Transformation of the attended sequence. Defaults to ``Linear``.
    energy_computer : brick, optional
        Computes energies from the match vectors.

    """
    @lazy(allocation=['match_dim'])
    def __init__(self, n_att_weights, match_dim, state_transformer=None,
                 attended_transformer=None, energy_computer=None, **kwargs):
        # The original passed ``SequenceContentAttention`` to ``super``;
        # that class is not a base of this one, so the call raised
        # ``TypeError`` on construction.
        super(SequenceMultiContentAttention, self).__init__(**kwargs)
        self.n_att_weights = n_att_weights
        if state_transformer is None:
            state_transformer = Linear(use_bias=False)
        if attended_transformer is None:
            attended_transformer = Linear(name="preprocess")
        if energy_computer is None:
            energy_computer = MultiShallowEnergyComputer(
                n_att_weights, name="energy_comp")

        self.match_dim = match_dim
        self.state_transformer = state_transformer
        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        self.attended_transformer = attended_transformer
        self.energy_computer = energy_computer

        self.children = [self.state_transformers, attended_transformer,
                         energy_computer]

    def _push_allocation_config(self):
        """Propagate the configured dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [
            self.match_dim for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1

    @application
    def compute_energies(self, attended, preprocessed_attended, states):
        """Compute energies from the attended sequence and the states.

        ``preprocessed_attended`` is computed with :meth:`preprocess`
        when it is ``None``. ``states`` is a dict mapping state names to
        variables.
        """
        # Compare against None explicitly instead of truth-testing a
        # symbolic variable.
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        # Drop the trailing axis of the energy computer's output.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, **states):
        r"""Compute attention weights and produce glimpses.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The sequence, time is the 1-st dimension.
        preprocessed_attended : :class:`~tensor.TensorVariable`
            The preprocessed sequence. If ``None``, is computed by calling
            :meth:`preprocess`.
        attended_mask : :class:`~tensor.TensorVariable`
            A 0/1 mask specifying available data. 0 means that the
            corresponding sequence element is fake.
        \*\*states
            The states of the network.

        Returns
        -------
        weighted_averages : :class:`~theano.Variable`
            Linear combinations of sequence elements with the attention
            weights.
        weights : :class:`~theano.Variable`
            The attention weights. The first dimension is batch, the second
            is time.

        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Return the input names of :meth:`take_glimpses`."""
        return (['attended', 'preprocessed_attended', 'attended_mask'] +
                self.state_names)

    @application(outputs=['weighted_averages', 'weights'])
    def initial_glimpses(self, batch_size, attended):
        """Return all-zero glimpses used before the first step."""
        return [tensor.zeros((batch_size, self.attended_dim)),
                tensor.zeros((batch_size, attended.shape[0]))]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Parameters
        ----------
        attended : :class:`~tensor.TensorVariable`
            The attended sequence, time is the 1-st dimension.

        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        """Return the dimension registered for the variable ``name``.

        Names not handled here are delegated to the parent class.
        """
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights']:
            return 0
        return super(SequenceMultiContentAttention, self).get_dim(name)
class CoverageContentAttention(GenericSequenceAttention, Initializable):
    """This is the 'linguistic' coverage model from Tu et al., 2016.

    The fertility of each source annotation is estimated with a linear
    transform followed by a sigmoid times N (N is the maximum fertility).
    The coverage model keeps track of the attention record for each
    annotation and feeds the cumulative record divided by the fertility
    to the match vector which eventually determines the attention weight.

    The code base of this implementation is close to
    ``SequenceContentAttention``.
    """

    @lazy(allocation=['match_dim'])
    def __init__(self,
                 match_dim,
                 max_fertility,
                 state_transformer=None,
                 attended_transformer=None,
                 fertility_transformer=None,
                 att_record_transformer=None,
                 energy_computer=None,
                 **kwargs):
        """Creates an attention brick with 'linguistic' coverage.
        Compare with ``SequenceContentAttention``.

        Args:
            match_dim (int): Dimensionality of the match vector
            max_fertility (float): Maximum fertility of a source
                                   annotation (N in Tu et al.). If this
                                   is set to 0 or smaller, we fix
                                   fertilities to 1 and do not estimate
                                   them.
            state_transformer (Brick): Transformation for the decoder
                                       state
            attended_transformer (Brick): Transformation for the source
                                          annotations
            fertility_transformer (Brick): Transformation which
                                           calculates fertilities
            att_record_transformer (Brick): Transformation for the
                                            attentional records
            energy_computer (Brick): Sub network for calculating the
                                     energies from the match vector
        """
        super(CoverageContentAttention, self).__init__(**kwargs)
        # A small positive threshold decides whether fertilities are
        # estimated at all.
        self.use_fertility = (max_fertility > 0.0001)
        self.max_fertility = max_fertility

        if state_transformer is None:
            state_transformer = Linear(use_bias=False)
        if attended_transformer is None:
            attended_transformer = Linear(name="preprocess")
        if att_record_transformer is None:
            att_record_transformer = Linear(name="att_record_trans")
        if energy_computer is None:
            energy_computer = ShallowEnergyComputer(name="energy_comp")

        self.match_dim = match_dim
        self.state_transformer = state_transformer
        self.state_transformers = Parallel(input_names=self.state_names,
                                           prototype=state_transformer,
                                           name="state_trans")
        self.attended_transformer = attended_transformer
        self.att_record_transformer = att_record_transformer
        self.energy_computer = energy_computer

        self.children = [self.state_transformers,
                         attended_transformer,
                         att_record_transformer,
                         energy_computer]

        # The fertility network only exists when fertilities are estimated.
        if self.use_fertility:
            if fertility_transformer is None:
                fertility_transformer = MLP(activations=[Logistic()],
                                            name='fertility_trans')
            self.fertility_transformer = fertility_transformer
            self.children.append(fertility_transformer)

    def _push_allocation_config(self):
        """Propagates the configured dimensions to the child bricks."""
        self.state_transformers.input_dims = self.state_dims
        self.state_transformers.output_dims = [
            self.match_dim for name in self.state_names]
        self.attended_transformer.input_dim = self.attended_dim
        self.attended_transformer.output_dim = self.match_dim
        self.att_record_transformer.input_dim = 1
        self.att_record_transformer.output_dim = self.match_dim
        self.energy_computer.input_dim = self.match_dim
        self.energy_computer.output_dim = 1
        if self.use_fertility:
            self.fertility_transformer.dims = [self.attended_dim, 1]

    @application
    def compute_energies(self, attended, preprocessed_attended,
                         att_records, states):
        """Computes energies from annotations, states and coverage.

        Args:
            attended (TensorVariable): The attended sequence
            preprocessed_attended (TensorVariable): Transformed attended
                sequence. If ``None``, it is computed by calling
                ``preprocess``.
            att_records (TensorVariable): The attention records
            states (dict): The states of the network, keyed by name

        Returns:
            TensorVariable. The energies.
        """
        # Compare against None explicitly instead of truth-testing a
        # symbolic variable.
        if preprocessed_attended is None:
            preprocessed_attended = self.preprocess(attended)
        transformed_states = self.state_transformers.apply(as_dict=True,
                                                           **states)
        # Swap the first two axes of the records before transforming them.
        transformed_att_records = self.att_record_transformer.apply(
            att_records.dimshuffle((1, 0, 2)))
        # Broadcasting of transformed states should be done automatically
        match_vectors = sum(transformed_states.values(),
                            preprocessed_attended)
        match_vectors = match_vectors + transformed_att_records
        # Drop the trailing axis of the energy computer's output.
        energies = self.energy_computer.apply(match_vectors).reshape(
            match_vectors.shape[:-1], ndim=match_vectors.ndim - 1)
        return energies

    @application(outputs=['weighted_averages', 'weights', 'att_records'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, att_records=None, **states):
        r"""Computes attention weights and updates the attention records.

        Args:
            attended (TensorVariable): The attended sequence
            preprocessed_attended (TensorVariable): Transformed attended
                sequence. If ``None``, it is computed by calling
                ``preprocess``.
            attended_mask (TensorVariable): Mask for the attended sequence
            att_records (TensorVariable): The attention records
                accumulated so far
            \*\*states (TensorVariable): The states of the network

        Returns:
            Tuple. The weighted averages, the transposed attention
            weights, and the updated attention records.
        """
        energies = self.compute_energies(attended,
                                         preprocessed_attended,
                                         att_records,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        record_update = weights.dimshuffle((1, 0, 'x'))
        if self.use_fertility:
            fertilities = (self.max_fertility *
                           self.fertility_transformer.apply(attended))
            # Theanos optimizer ensures that fertilities are computed
            # only once
            record_update = (record_update /
                             fertilities.dimshuffle((1, 0, 2)))
        att_records = att_records + record_update
        weighted_averages = self.compute_weighted_averages(weights, attended)
        return weighted_averages, weights.T, att_records

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Defines the ``inputs`` decoration for ``take_glimpses``."""
        return (['attended', 'preprocessed_attended',
                 'attended_mask', 'att_records'] +
                self.state_names)

    @application(outputs=['weighted_averages', 'weights', 'att_records'])
    def initial_glimpses(self, batch_size, attended):
        """All glimpses, including the attention records, start at zero."""
        return [tensor.zeros((batch_size, self.attended_dim)),
                tensor.zeros((batch_size, attended.shape[0])),
                tensor.zeros((batch_size, attended.shape[0], 1))]

    @application(inputs=['attended'], outputs=['preprocessed_attended'])
    def preprocess(self, attended):
        """Preprocess the sequence for computing attention weights.

        Args:
            attended (TensorVariable): The attended sequence, time is the
                                       1-st dimension.
        """
        return self.attended_transformer.apply(attended)

    def get_dim(self, name):
        """Get dimensions of variables. Delegates to super class if
        ``name`` is not used in this class.
        """
        if name in ['weighted_averages']:
            return self.attended_dim
        if name in ['weights', 'att_records']:
            return 0
        return super(CoverageContentAttention, self).get_dim(name)
class PushDownSequenceContentAttention(SequenceContentAttention,
                                       Initializable):
    """Adds an external memory structure in form of a neural stack to
    the decoder. The neural stack is operated through a pop operation,
    a push operation, and an input variable, which all are computed from
    the decoder state.

    This neural stack implementation is similar to Mikolovs model:

    - Apply the (continuous) pop operation if the pop gate is on
    - Read the top element on the stack
    - Push the stack input vector if the push gate is on
    - Concatenate the read element from the stack to the weighted
      averages of source annotations to obtain the final context vector

    Note that this implementation realizes a stack with limited depth
    because Blocks didn't allow to have glimpses of varying size. In
    practice, however, we think that a limited size is appropriate for
    machine translation.
    """

    def __init__(self, stack_dim=500, **kwargs):
        """Sole constructor.

        Args:
            stack_dim (int): Size of vectors on the stack.
        """
        super(PushDownSequenceContentAttention, self).__init__(**kwargs)
        self.stack_dim = stack_dim
        self.max_stack_depth = 25
        # The stack operations are computed from all states plus the
        # weighted averages of the current step.
        self.stack_op_names = self.state_names + ['weighted_averages']

        self.stack_pop_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_pop_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_pop_transformer,
            name="stack_pop")

        self.stack_push_transformer = MLP(activations=[Logistic()], dims=None)
        self.stack_push_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_push_transformer,
            name="stack_push")

        self.stack_input_transformer = Linear()
        self.stack_input_transformers = Parallel(
            input_names=self.stack_op_names,
            prototype=self.stack_input_transformer,
            name="stack_input")

        self.children.append(self.stack_pop_transformers)
        self.children.append(self.stack_push_transformers)
        self.children.append(self.stack_input_transformers)

    def _push_allocation_config(self):
        """Sets the dimensions of the stack operation networks."""
        super(PushDownSequenceContentAttention,
              self)._push_allocation_config()
        self.stack_op_dims = self.state_dims + [self.attended_dim]
        n_states = len(self.stack_op_dims)
        self.stack_pop_transformers.input_dims = self.stack_op_dims
        self.stack_pop_transformers.output_dims = [1] * n_states
        self.stack_push_transformers.input_dims = self.stack_op_dims
        self.stack_push_transformers.output_dims = [1] * n_states
        self.stack_input_transformers.input_dims = self.stack_op_dims
        self.stack_input_transformers.output_dims = (
            [self.stack_dim] * n_states)

    def _allocate(self):
        """Allocates the single parameter of this brick: the initial
        element on the stack.
        """
        self.parameters.append(shared_floatx_nans((1, self.stack_dim),
                               name='init_stack'))
        add_role(self.parameters[-1], INITIAL_STATE)

    def _initialize(self):
        """Initializes the initial element on the stack with
        ``biases_init``.
        """
        self.biases_init.initialize(self.parameters[-1], self.rng)

    @application(outputs=['context_vector', 'weights', 'stack'])
    def take_glimpses(self, attended, preprocessed_attended=None,
                      attended_mask=None, stack=None, **states):
        r"""This method is an extension to ``take_glimpses`` in
        ``SequenceContentAttention``. After computing the weighted
        averages of source annotations, it operates the stack, i.e.
        pops the top element, reads out the top of the stack, and
        pushes a new element. The first glimpse ``context_vector`` is
        the concatenation of weighted source annotations and stack
        output.

        Args:
            attended (Variable): Source annotations
            preprocessed_attended (Variable): Transformed source
                                              annotations used to
                                              compute energies
            attended_mask (Variable): Source mask
            stack (Variable): Current state of the stack
            \*\*states (Variable): Decoder state

        Returns:
            Tuple. The first element is used as context vector for the
            decoder state update. ``stack`` is a recurrent glimpse which
            is used in the next ``take_glimpse`` iteration.
        """
        energies = self.compute_energies(attended, preprocessed_attended,
                                         states)
        weights = self.compute_weights(energies, attended_mask)
        weighted_averages = self.compute_weighted_averages(weights, attended)

        # Build a fresh dict instead of adding a key to ``states`` in place.
        stack_op_input = dict(states, weighted_averages=weighted_averages)

        stack_pop = sum(self.stack_pop_transformers.apply(
            as_dict=True, **stack_op_input).values())
        stack_push = sum(self.stack_push_transformers.apply(
            as_dict=True, **stack_op_input).values())
        stack_input = sum(self.stack_input_transformers.apply(
            as_dict=True, **stack_op_input).values())

        # the stack has shape (batch_size, stack_depth, stack_dim)
        batch_size = stack.shape[0]
        stack_dim = stack_input.shape[1]
        default_stack_entry = tensor.repeat(self.parameters[-1][None, :, :],
                                            batch_size,
                                            0)

        # The top of the stack is index 0 (see ``stack_output`` below).
        # Pushing therefore shifts every entry one slot down and drops
        # the deepest one; popping shifts every entry one slot up and
        # refills the bottom with the default entry. The original code
        # had the two slices swapped, so a push overwrote the top and a
        # pop left the top untouched.
        pushed_stack = tensor.concatenate(
            [stack_input.reshape((batch_size, 1, stack_dim)),
             stack[:, :-1, :]],
            axis=1)
        popped_stack = tensor.concatenate(
            [stack[:, 1:, :], default_stack_entry],
            axis=1)

        pop_gate = stack_pop.reshape((batch_size, 1, 1))
        push_gate = stack_push.reshape((batch_size, 1, 1))

        # Soft pop, then read the top, then soft push.
        # NOTE(review): ``pushed_stack`` is derived from the stack before
        # the pop, not from ``read_stack`` -- confirm this is intended.
        read_stack = pop_gate * popped_stack + (1.0 - pop_gate) * stack
        stack_output = read_stack[:, 0, :]
        new_stack = push_gate * pushed_stack + (1.0 - push_gate) * read_stack

        context_vector = tensor.concatenate(
            [weighted_averages, stack_output], axis=1)
        return context_vector, weights.T, new_stack

    @take_glimpses.property('inputs')
    def take_glimpses_inputs(self):
        """Defines the ``inputs`` decoration for ``take_glimpses``."""
        return (['attended', 'preprocessed_attended',
                 'attended_mask', 'stack'] +
                self.state_names)

    @application(outputs=['context_vector', 'weights', 'stack'])
    def initial_glimpses(self, batch_size, attended):
        """The stack is initialized with the default entry. All other
        glimpses are initialized with zero.
        """
        default_stack_entry = tensor.repeat(self.parameters[-1][None, :, :],
                                            batch_size,
                                            0)
        return [tensor.zeros((batch_size,
                              self.attended_dim + self.stack_dim)),
                tensor.zeros((batch_size, attended.shape[0])),
                tensor.repeat(default_stack_entry, self.max_stack_depth, 1)]

    def get_dim(self, name):
        """Get dimensions of variables. Delegates to super class if
        ``name`` is not used in this class.
        """
        if name in ['context_vector']:
            return self.attended_dim + self.stack_dim
        if name in ['weights']:
            return 0
        if name in ['stack']:
            return self.max_stack_depth, self.stack_dim
        return super(PushDownSequenceContentAttention, self).get_dim(name)