Example No. 1
    def merge_top_features(self, decoder_output):
        """ Merges features of decoder top layers, as the input
        of softmax layer.
        Features to be merged are as follows:
            1. current decoder RNN state;
            2. current attention context.

        Args:
            decoder_output: An instance of `collections.namedtuple`
              whose element types are defined by `output_dtype`
              property.

        Returns: An instance of `tf.Tensor`, the input to the
          softmax layer.
        """
        assert isinstance(decoder_output, self._DecoderOutputSpec)
        cur_decoder_hidden = decoder_output.cur_decoder_hidden
        prev_input = decoder_output.prev_input
        attention_context = decoder_output.attention_context

        logit_lstm = fflayer(cur_decoder_hidden, output_size=self.params["logits_dimension"], activation=None,
                             dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"], name="ff_logit_lstm")
        # TODO: logit_prev is disabled below to keep compatibility with old-version code
        # logit_prev = fflayer(prev_input, output_size=self.params["logits_dimension"],
        #                      dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"],
        #                      activation=None, name="ff_logit_prev")
        logit_ctx = fflayer(attention_context, output_size=self.params["logits_dimension"], activation=None,
                            dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"], name="ff_logit_ctx")
        # merged_output = tf.tanh(logit_lstm + logit_prev + logit_ctx)
        merged_output = tf.tanh(logit_lstm + logit_ctx)
        return merged_output
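A minimal, self-contained sketch of the same merge pattern (project each feature to a common logits dimension, sum, then tanh), assuming plain TensorFlow 1.x and using tf.layers.dense as a stand-in for the project's fflayer; all sizes below are illustrative.

import tensorflow as tf

batch_size, hidden_dim, context_dim, logits_dim = 4, 512, 1024, 256

# Stand-ins for the decoder RNN state and the attention context.
cur_decoder_hidden = tf.random_normal([batch_size, hidden_dim])
attention_context = tf.random_normal([batch_size, context_dim])

# Project each feature to the shared "logits" dimension without activation
# (tf.layers.dense plays the role of fflayer here).
logit_lstm = tf.layers.dense(cur_decoder_hidden, logits_dim,
                             activation=None, name="ff_logit_lstm")
logit_ctx = tf.layers.dense(attention_context, logits_dim,
                            activation=None, name="ff_logit_ctx")

# Sum the projections and squash with tanh, as in merge_top_features above.
merged_output = tf.tanh(logit_lstm + logit_ctx)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(merged_output).shape)  # (4, 256)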
Example No. 2
    def top(self, top_features):
        """ Computes logits on the top layer.

        Args:
            top_features: A Tensor.

        Returns: A logits Tensor.
        """
        feature_last_dim = top_features.get_shape().as_list()[-1]
        if self.params["share_embedding_and_softmax_weights"]:
            assert feature_last_dim == self._body_input_depth, \
                "when share_embedding_and_softmax_weights is enabled, dim_logits must equal input_depth"
            scope_name = "shared"
            with tf.variable_scope(scope_name, reuse=True):
                var = tf.transpose(self._get_weight(feature_last_dim), [1, 0])
        else:
            scope_name = "softmax"
            var = None
        logits = fflayer(top_features,
                         output_size=self.top_dimension,
                         handle=var, activation=None,
                         dropout_input_keep_prob=self.params["dropout_logit_keep_prob"],
                         followed_by_softmax=True,
                         name=scope_name)
        return logits
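The weight-sharing branch above reuses one matrix both as the embedding table and, transposed, as the softmax projection. A minimal sketch of that tying, assuming plain TensorFlow 1.x and a plain matmul in place of fflayer's handle argument; vocabulary and embedding sizes are illustrative.

import tensorflow as tf

vocab_size, embed_dim = 1000, 64

with tf.variable_scope("shared"):
    # One matrix serves as the embedding table ...
    embedding = tf.get_variable("weights", [vocab_size, embed_dim])

token_ids = tf.constant([3, 17, 42, 7, 0, 1, 2, 5])
top_features = tf.nn.embedding_lookup(embedding, token_ids)  # [batch, embed_dim]

# ... and, transposed, as the softmax weights, mirroring top() above.
with tf.variable_scope("shared", reuse=True):
    softmax_w = tf.transpose(tf.get_variable("weights"), [1, 0])  # [embed_dim, vocab_size]
logits = tf.matmul(top_features, softmax_w)  # [batch, vocab_size]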
Example No. 3
    def prepare(self, encoder_output, bridge, helper):
        """ Prepares for `step()` function.
        Do:
            1. initialize decoder RNN states using `bridge`;
            2. acquire attention information from `encoder_output`;
            3. pre-project the attention keys

        Args:
            encoder_output: An instance of `collections.namedtuple`
              from `Encoder.encode()`.
            bridge: An instance of `Bridge` that initializes the
              decoder states.
            helper: An instance of `Feedback` that samples next
              symbols from logits.
        Returns: A tuple `(init_decoder_states, decoding_params)`.
          `decoding_params` is a tuple containing pre-projected
          attention keys, attention values and attention length,
          and will be passed to `step()` function.
        """
        attention_values = encoder_output.attention_values
        attention_length = encoder_output.attention_length
        with tf.variable_scope(self._attention.name):
            projected_attention_keys = fflayer(
                inputs=attention_values,
                output_size=self._attention.attention_units,
                dropout_input_keep_prob=self.params["dropout_context_keep_prob"],
                activation=None,
                name="ff_att_keys")
        init_rnn_states = bridge(self._r_rnn_cells.state_size)
        decoding_params = (projected_attention_keys, attention_values,
                           attention_length)

        return init_rnn_states, decoding_params
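Pre-projecting the attention keys once in prepare() means the encoder memory is not re-projected at every decoding step. A minimal sketch, assuming plain TensorFlow 1.x with tf.layers.dense standing in for fflayer; shapes are illustrative.

import tensorflow as tf

batch, timesteps, dim_context, attention_units = 4, 11, 128, 32
attention_values = tf.random_normal([batch, timesteps, dim_context])

# Dense applies to the last dimension, so the whole memory is projected in one pass.
projected_attention_keys = tf.layers.dense(
    attention_values, attention_units, activation=None, name="ff_att_keys")
# projected_attention_keys: [batch, timesteps, attention_units]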
Example No. 4
    def top(self, top_features):
        """ Computes logits on the top layer.

        Args:
            top_features: A Tensor.

        Returns: A logits Tensor.
        """
        feature_last_dim = top_features.get_shape().as_list()[-1]
        if self.params["share_embedding_and_softmax_weights"]:
            assert feature_last_dim == self._body_input_depth, \
                "when share_embedding_and_softmax_weights is enabled, dim_logits must equal input_depth"
            scope_name = "shared"
            with tf.variable_scope(scope_name, reuse=True):
                var = tf.transpose(self._get_weight(feature_last_dim), [1, 0])
        else:
            scope_name = "softmax"
            var = None
        logits = fflayer(
            top_features,
            output_size=self.top_dimension,
            handle=var,
            activation=None,
            name=scope_name,
            dropout_input_keep_prob=self.params["dropout_logit_keep_prob"])
        return logits
Example No. 5
    def _create(self, encoder_output, decoder_state_size, **kwargs):
        """ Creates decoder's initial RNN states according to
        `decoder_state_size`.

        Do linear transformations to encoder output/state and map the
        structure to `decoder_state_size`.
        If params[`bridge_input`] == "outputs", first average the encoder
        output tensor over timesteps.
        Args:
            encoder_output: An instance of `collections.namedtuple`
              from `Encoder.encode()`.
            decoder_state_size: RNN decoder state size.
            **kwargs:

        Returns: The decoder states with the structure determined
          by `decoder_state_size`.

        Raises:
            ValueError: if `encoder_output` has no attribute named
              params[`bridge_input`].
        """
        if not hasattr(encoder_output, self.params["bridge_input"]):
            raise ValueError("encoder output has not attribute: {}, "
                             "only final_state and outputs available"
                             .format(self.params["bridge_input"]))
        if self.params["bridge_input"] == "outputs":
            # [batch_size, max_time, num_units]
            context = encoder_output.outputs
            mask = tf.sequence_mask(
                lengths=tf.to_int32(encoder_output.attention_length),
                maxlen=tf.shape(context)[1],
                dtype=tf.float32)
            # [batch_size, num_units]
            bridge_input = tf.truediv(
                tf.reduce_sum(context * tf.expand_dims(mask, 2), axis=1),
                tf.expand_dims(
                    tf.to_float(encoder_output.attention_length), 1))
        elif self.params["bridge_input"] == "final_states":
            bridge_input = nest.flatten(_final_states(encoder_output.final_states))
            bridge_input = tf.concat(bridge_input, 1)
        else:
            raise ValueError("Unrecognized value of bridge_input: {}, "
                             "should be outputs or final_state".format(self.params["bridge_input"]))
        state_size_splits = nest.flatten(decoder_state_size)
        total_decoder_state_size = sum(state_size_splits)
        # [batch_size, total_decoder_state_size]
        init_state = fflayer(inputs=bridge_input,
                             output_size=total_decoder_state_size,
                             activation=self._activation,
                             name="init_state_trans")
        init_state = nest.pack_sequence_as(
            decoder_state_size,
            tf.split(init_state, state_size_splits, axis=1))
        return init_state
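The "outputs" branch of this bridge (masked mean over encoder timesteps, one linear layer, then a split into per-layer decoder state sizes) can be sketched as follows, assuming plain TensorFlow 1.x; all sizes are illustrative and tf.layers.dense stands in for fflayer.

import tensorflow as tf

batch, max_time, num_units = 4, 9, 32
decoder_state_sizes = [64, 64]  # e.g. two stacked RNN layers

outputs = tf.random_normal([batch, max_time, num_units])
lengths = tf.constant([9, 5, 7, 3])

# Masked average over timesteps: sum valid positions, divide by true length.
mask = tf.sequence_mask(lengths, maxlen=max_time, dtype=tf.float32)
bridge_input = tf.truediv(
    tf.reduce_sum(outputs * tf.expand_dims(mask, 2), axis=1),
    tf.expand_dims(tf.to_float(lengths), 1))

# One linear transform to the total decoder state size, then split per layer.
init_state = tf.layers.dense(bridge_input, sum(decoder_state_sizes),
                             activation=tf.tanh, name="init_state_trans")
init_states = tf.split(init_state, decoder_state_sizes, axis=1)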
Example No. 6
    def prepare(self, encoder_output, bridge, helper):
        """ Prepares for `step()` function.
        Do:
            1. initialize decoder RNN states using `bridge`;
            2. acquire attention information from `encoder_output`;
            3. pre-project the attention keys

        Args:
            encoder_output: An instance of `collections.namedtuple`
              from `Encoder.encode()`.
            bridge: An instance of `Bridge` that initializes the
              decoder states.
            helper: An instance of `Feedback` that samples next
              symbols from logits.
        Returns: A dict (the initial decoding cache) containing decoder RNN
          states, the initial attention context, pre-projected attention keys,
          attention values (memory) and the memory bias; it will be passed
          to the `step()` function.
        """
        attention_values = encoder_output.attention_values  # [batch_size, timesteps, dim_context]
        if hasattr(encoder_output, "attention_bias"):
            attention_bias = encoder_output.attention_bias
        else:
            attention_length = encoder_output.attention_length
            attention_bias = getattr(eval(self.params["attention.class"]),
                                     "attention_length_to_bias")(
                                         tf.shape(attention_values)[1],
                                         attention_length)
        with tf.variable_scope(self._attention.name):
            projected_attention_keys = fflayer(
                inputs=attention_values,
                output_size=self._attention.attention_units,
                activation=None,
                dropout_input_keep_prob=self.params["dropout_context_keep_prob"],
                name="ff_att_keys")
        init_rnn_states = bridge(encoder_output, self._rnn_cells.state_size)
        if self._attention.attention_value_depth > 0:
            init_att_context = tf.zeros(
                [tf.shape(attention_values)[0],
                 self._attention.attention_value_depth],
                dtype=tf.float32)
        else:
            init_att_context = tf.zeros_like(attention_values[:, 0, :],
                                             dtype=tf.float32)
        init_cache = initialize_cache(
            decoding_states={"rnn_states": init_rnn_states,
                             "attention_context": init_att_context},
            attention_keys=projected_attention_keys,
            memory=attention_values,
            memory_bias=attention_bias)

        return init_cache
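A rough sketch of the initial decoding cache assembled above, with a plain dict standing in for the project-specific initialize_cache helper (an assumption, since that helper is not shown in this excerpt); plain TensorFlow 1.x, illustrative sizes.

import tensorflow as tf

batch, timesteps, dim_context, attention_units = 4, 11, 128, 32
attention_values = tf.random_normal([batch, timesteps, dim_context])
attention_length = tf.constant([11, 8, 5, 9])

# Pre-projected keys, as in prepare() above (tf.layers.dense stands in for fflayer).
projected_attention_keys = tf.layers.dense(
    attention_values, attention_units, activation=None, name="ff_att_keys")

# Zero attention context for the first decoding step.
init_att_context = tf.zeros_like(attention_values[:, 0, :], dtype=tf.float32)

# Length mask turned into an additive bias over memory positions.
mask = tf.sequence_mask(attention_length, maxlen=timesteps, dtype=tf.float32)
attention_bias = (1.0 - mask) * -1e9

init_cache = {
    "decoding_states": {"attention_context": init_att_context},
    "attention_keys": projected_attention_keys,
    "memory": attention_values,
    "memory_bias": attention_bias,
}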
Example No. 7
    def build(self,
              query,
              memory,
              memory_length=None,
              memory_bias=None,
              cache=None):
        """ Builds attention context via a simple process.

        Args:
            query: Attention query tensor with shape
              [batch_size, channels_query].
            memory: Attention values tensor with shape
              [batch_size, num_of_values, channels_value].
            memory_length: The number of attention values, a
              Tensor with shape [batch_size,].
            memory_bias: The bias tensor for attention values.
            cache: A dictionary that may contain pre-projected attention
              keys (under "attention_keys") and values.

        Returns: A tuple `(attention_scores, attention_context)`. The
          `attention_scores` has shape [batch_size, num_of_values].
          The `attention_context` has shape [batch_size, channels_value].
        """
        _ = cache
        with tf.variable_scope(self.name):
            query = fflayer(query,
                            output_size=self.attention_units,
                            activation=None,
                            name="ff_att_query")
            keys = memory
            if cache is not None and "attention_keys" in cache:
                keys = cache["attention_keys"]

            if memory_bias is None:
                if memory_length is not None:
                    memory_bias = BaseAttention.attention_length_to_bias(
                        tf.shape(memory)[1], memory_length)

            # attention weights: [batch_size, num_of_values]
            attention_weight = self.att_fn(query, keys, memory_bias)

            # Calculate the weighted average of the attention inputs
            # according to the scores
            #   [batch_size, num_of_values, 1] * [batch_size, num_of_values, channels_value]
            context = tf.expand_dims(attention_weight, 2) * memory
            #   [batch_size, channels_value]
            context = tf.reduce_sum(context, 1, name="context")
            context.set_shape([None, memory.get_shape().as_list()[-1]])

            return attention_weight, context
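The weighted-sum step in build() can be sketched standalone. The scoring function att_fn is not shown in this excerpt, so an additive (Bahdanau-style) score stands in for it here as an assumption; plain TensorFlow 1.x, illustrative shapes.

import tensorflow as tf

batch, num_values, units, channels_value = 4, 7, 32, 64
query = tf.random_normal([batch, units])             # already-projected query
keys = tf.random_normal([batch, num_values, units])  # pre-projected attention keys
memory = tf.random_normal([batch, num_values, channels_value])
memory_length = tf.constant([7, 5, 3, 6])

# Length mask turned into an additive bias: 0 for valid steps, -1e9 for padding.
mask = tf.sequence_mask(memory_length, maxlen=num_values, dtype=tf.float32)
memory_bias = (1.0 - mask) * -1e9

# Additive scoring v^T tanh(keys + query), then a masked softmax over positions.
v = tf.get_variable("att_v", [units])
scores = tf.reduce_sum(v * tf.tanh(keys + tf.expand_dims(query, 1)), axis=2)
attention_weight = tf.nn.softmax(scores + memory_bias)  # [batch, num_values]

# Weighted average of the memory, as in the tail of build() above.
context = tf.reduce_sum(tf.expand_dims(attention_weight, 2) * memory, axis=1)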
Example No. 8
    def step(self, decoder_input, decoder_states, decoding_params):
        """ Decodes one step.

        Args:
            decoder_input: The decoder input for this timestep, an
              instance of `tf.Tensor`, [batch_size, dim_word].
            decoder_states: The decoder RNN states at previous timestep.
              Must have the same structure with `init_decoder_states`
              returned from `prepare()` function.
            decoding_params: The same as `decoding_params` returned
              from `prepare()` function.

        Returns: A tuple `(cur_decoder_outputs, cur_decoder_states)`
          at this timestep. The `cur_decoder_outputs` must be an
          instance of `collections.namedtuple` whose element types
          are defined by `output_dtype` property. The
          `cur_decoder_states` must have the same structure with
          `decoder_states`.
        """
        projected_attention_keys, attention_values, attention_length = decoding_params
        # layer0: get hidden1
        cell_output0, cell_state0 = self._cond_rnn_cell(
            decoder_input, decoder_states[0])

        # Compute attention
        # attention_scores: [batch_size, n_timesteps_src]
        # attention_context: [batch_size, dim_context]
        with tf.variable_scope(self._attention.name):
            projected_query = fflayer(
                cell_output0,
                output_size=self._attention.attention_units,
                dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
                activation=None,
                name="ff_att_query")
        # compute attention using hidden1
        # [batch_size, n_timesteps_src]
        attention_scores, attention_context = self._attention.build(
            query=projected_query,
            keys=projected_attention_keys,
            memory=attention_values,
            memory_length=attention_length)
        # hidden1's state is hidden2's initial state
        following_decoder_state = tuple([cell_state0] +
                                        list(decoder_states[1:]))
        cell_output, cell_states = self._r_rnn_cells(attention_context,
                                                     following_decoder_state)

        outputs = self._DecoderOutputSpec(cur_decoder_hidden=cell_output,
                                          prev_input=decoder_input,
                                          attention_context=attention_context,
                                          attention_scores=attention_scores)

        return outputs, cell_states
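The two-stage ("conditional") recurrence in step() can be sketched with plain TensorFlow 1.x GRU cells; the attention computation between the two cells is elided here, and attention_context merely stands in for its result. Sizes are illustrative.

import tensorflow as tf

batch, dim_word, num_units, dim_context = 4, 128, 256, 512
cell0 = tf.nn.rnn_cell.GRUCell(num_units, name="cond_gru_0")
cell1 = tf.nn.rnn_cell.GRUCell(num_units, name="cond_gru_1")

decoder_input = tf.random_normal([batch, dim_word])
prev_state0 = cell0.zero_state(batch, tf.float32)

# Stage 1: advance the first cell on the current input token embedding.
cell_output0, cell_state0 = cell0(decoder_input, prev_state0)

# ... attention would be queried with cell_output0 here ...
attention_context = tf.random_normal([batch, dim_context])

# Stage 2: the second cell consumes the attention context, initialized from
# the first cell's new state, as in step() above.
cell_output, cell_state = cell1(attention_context, cell_state0)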
Example No. 9
    def merge_top_features(self, decoder_output):
        """ Merges features of decoder top layers, as the input
        of softmax layer.
        Features to be merged are as follows:
            1. current decoder RNN state;
            2. current attention context;
            3. previous predicted word.

        Args:
            decoder_output: An instance of `collections.namedtuple`
              whose element types are defined by `output_dtype`
              property.

        Returns: An instance of `tf.Tensor`, the input to the
          softmax layer.
        """
        assert isinstance(decoder_output, self._DecoderOutputSpec)
        cur_decoder_hidden = decoder_output.cur_decoder_hidden
        prev_input = decoder_output.prev_input
        attention_context = decoder_output.attention_context

        logit_lstm = fflayer(
            cur_decoder_hidden,
            output_size=self.params["logits_dimension"],
            dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
            activation=None,
            name="ff_logit_lstm")
        logit_prev = fflayer(
            prev_input,
            output_size=self.params["logits_dimension"],
            dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"],
            activation=None,
            name="ff_logit_prev")
        logit_ctx = fflayer(
            attention_context,
            output_size=self.params["logits_dimension"],
            dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
            activation=None,
            name="ff_logit_ctx")
        merged_output = tf.tanh(logit_lstm + logit_prev + logit_ctx)
        return merged_output
Example No. 10
    def build(self,
              query,
              memory,
              memory_length=None,
              memory_bias=None,
              cache=None):
        """ Builds attention context via a simple process.

        Args:
            query: Attention query tensor with shape
              [batch_size, channels_query].
            memory: Attention values tensor with shape
              [batch_size, num_of_values, channels_value].
            memory_length: The number of attention values, a
              Tensor with shape [batch_size,].
            memory_bias: The bias tensor for attention values.
            cache: A dictionary that may contain pre-projected attention
              keys (under "attention_keys") and values.

        Returns: A tuple `(attention_scores, attention_context)`. The
          `attention_scores` has shape [batch_size, num_of_values].
          The `attention_context` has shape [batch_size, channels_value].
        """
        _ = cache
        with tf.variable_scope(self.name):
            query = fflayer(query, output_size=self.attention_units, activation=None, name="ff_att_query")
            keys = memory
            if cache is not None and "attention_keys" in cache:
                keys = cache["attention_keys"]

            if memory_bias is None:
                if memory_length is not None:
                    memory_bias = BaseAttention.attention_length_to_bias(tf.shape(memory)[1], memory_length)

            # attention weights: [batch_size, num_of_values]
            attention_weight = self.att_fn(query, keys, memory_bias)

            # Calculate the weighted average of the attention inputs
            # according to the scores
            #   [batch_size, num_of_values, 1] * [batch_size, num_of_values, channels_value]
            context = tf.expand_dims(attention_weight, 2) * memory
            #   [batch_size, channels_value]
            context = tf.reduce_sum(context, 1, name="context")
            context.set_shape([None, memory.get_shape().as_list()[-1]])

            return attention_weight, context
Example No. 11
    def merge_top_features(self, decoder_states):
        """ Merges features of decoder top layers, as the input
        of softmax layer.
        Features to be merged are as follows:
            1. current decoder RNN state;
            2. previous predicted word.

        Args:
            decoder_states: An instance of `collections.namedtuple`
              whose element types are defined by `output_dtype`
              property.

        Returns: An instance of `tf.Tensor`, the input to the
          softmax layer.
        """
        cur_decoder_hidden = decoder_states.cur_decoder_hidden
        prev_input = decoder_states.prev_input
        logit_lstm = fflayer(cur_decoder_hidden, output_size=self.params["logits_dimension"], activation=None,
                             dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"], name="ff_logit_lstm")
        logit_prev = fflayer(prev_input, output_size=self.params["logits_dimension"], activation=None,
                             dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"], name="ff_logit_prev")
        merged_output = tf.tanh(logit_lstm + logit_prev)
        return merged_output
Example No. 12
    def prepare(self, encoder_output, bridge, helper):
        """ Prepares for `step()` function.
        Do:
            1. initialize decoder RNN states using `bridge`;
            2. acquire attention information from `encoder_output`;
            3. pre-project the attention keys

        Args:
            encoder_output: An instance of `collections.namedtuple`
              from `Encoder.encode()`.
            bridge: An instance of `Bridge` that initializes the
              decoder states.
            helper: An instance of `Feedback` that samples next
              symbols from logits.
        Returns: A dict (the initial decoding cache) containing decoder RNN
          states, the initial attention context, pre-projected attention keys,
          attention values (memory) and the memory bias; it will be passed
          to the `step()` function.
        """
        attention_values = encoder_output.attention_values  # [batch_size, timesteps, dim_context]
        if hasattr(encoder_output, "attention_bias"):
            attention_bias = encoder_output.attention_bias
        else:
            attention_length = encoder_output.attention_length
            attention_bias = getattr(eval(self.params["attention.class"]),
                                     "attention_length_to_bias")(tf.shape(attention_values)[1], attention_length)
        with tf.variable_scope(self._attention.name):
            projected_attention_keys = fflayer(inputs=attention_values, output_size=self._attention.attention_units,
                                               activation=None,
                                               dropout_input_keep_prob=self.params["dropout_context_keep_prob"],
                                               name="ff_att_keys")
        init_rnn_states = bridge(encoder_output, self._rnn_cells.state_size)
        if self._attention.attention_value_depth > 0:
            init_att_context = tf.zeros([tf.shape(attention_values)[0],
                                         self._attention.attention_value_depth], dtype=tf.float32)
        else:
            init_att_context = tf.zeros_like(attention_values[:, 0, :], dtype=tf.float32)
        init_cache = initialize_cache(
            decoding_states={"rnn_states": init_rnn_states,
                             "attention_context": init_att_context},
            attention_keys=projected_attention_keys,
            memory=attention_values,
            memory_bias=attention_bias)

        return init_cache