def merge_top_features(self, decoder_output):
    """ Merges features of decoder top layers, as the input of softmax layer.

    Features to be merged are as follows:
        1. current decoder RNN state;
        2. current attention context.

    Args:
        decoder_output: An instance of `collections.namedtuple`
          whose element types are defined by `output_dtype` property.

    Returns: An instance of `tf.Tensor`, as the input of softmax layer.
    """
    assert isinstance(decoder_output, self._DecoderOutputSpec)
    cur_decoder_hidden = decoder_output.cur_decoder_hidden
    # retained only for the commented-out old-version feature below
    prev_input = decoder_output.prev_input
    attention_context = decoder_output.attention_context
    logit_lstm = fflayer(cur_decoder_hidden,
                         output_size=self.params["logits_dimension"],
                         activation=None,
                         dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
                         name="ff_logit_lstm")
    # TODO: kept here to match the old version code
    # logit_prev = fflayer(prev_input,
    #                      output_size=self.params["logits_dimension"],
    #                      dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"],
    #                      activation=None, name="ff_logit_prev")
    logit_ctx = fflayer(attention_context,
                        output_size=self.params["logits_dimension"],
                        activation=None,
                        dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
                        name="ff_logit_ctx")
    # merged_output = tf.tanh(logit_lstm + logit_prev + logit_ctx)
    merged_output = tf.tanh(logit_lstm + logit_ctx)
    return merged_output

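# `fflayer` is called throughout this excerpt but its definition is not shown.
# Below is a minimal sketch of what it presumably does, with the signature
# inferred from the call sites: input dropout followed by a dense projection,
# with an optional externally supplied weight matrix via `handle`. This is an
# assumption, not the library's actual implementation.
import tensorflow as tf

def fflayer(inputs, output_size=None, handle=None, activation=None,
            dropout_input_keep_prob=1.0, followed_by_softmax=False,
            name=None):
    _ = followed_by_softmax  # accepted for API compatibility; unused here
    with tf.variable_scope(name, default_name="fflayer"):
        if dropout_input_keep_prob < 1.0:
            inputs = tf.nn.dropout(inputs, keep_prob=dropout_input_keep_prob)
        if handle is not None:
            # reuse a pre-built weight, e.g. the transposed embedding table
            output = tf.tensordot(inputs, handle, axes=[[-1], [0]])
        else:
            output = tf.layers.dense(inputs, output_size, use_bias=True)
        return activation(output) if activation is not None else output
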
def top(self, top_features):
    """ Computes logits on the top layer.

    Args:
        top_features: A Tensor.

    Returns: A logits Tensor.
    """
    feature_last_dim = top_features.get_shape().as_list()[-1]
    if self.params["share_embedding_and_softmax_weights"]:
        assert feature_last_dim == self._body_input_depth, \
            ("when share_embedding_and_softmax_weights is enabled, "
             "dim_logits should be equal to input_depth")
        scope_name = "shared"
        with tf.variable_scope(scope_name, reuse=True):
            # reuse the (transposed) embedding table as the softmax weight
            var = tf.transpose(self._get_weight(feature_last_dim), [1, 0])
    else:
        scope_name = "softmax"
        var = None
    logits = fflayer(top_features,
                     output_size=self.top_dimension,
                     handle=var,
                     activation=None,
                     dropout_input_keep_prob=self.params["dropout_logit_keep_prob"],
                     followed_by_softmax=True,
                     name=scope_name)
    return logits

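# The `shared` branch above reuses the (transposed) target embedding table as
# the softmax weight. A self-contained illustration of the tying idea; the
# variable names and sizes here are illustrative only:
import tensorflow as tf

vocab_size, depth = 1000, 512
with tf.variable_scope("shared"):
    embedding = tf.get_variable("weights", [vocab_size, depth])
with tf.variable_scope("shared", reuse=True):
    # same underlying parameters, viewed as a [depth, vocab_size] projection
    softmax_w = tf.transpose(tf.get_variable("weights", [vocab_size, depth]),
                             [1, 0])
top_features = tf.placeholder(tf.float32, [None, depth])
logits = tf.matmul(top_features, softmax_w)  # [batch_size, vocab_size]
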
def prepare(self, encoder_output, bridge, helper):
    """ Prepares for `step()` function.

    Do:
        1. initialize decoder RNN states using `bridge`;
        2. acquire attention information from `encoder_output`;
        3. pre-project the attention keys.

    Args:
        encoder_output: An instance of `collections.namedtuple`
          from `Encoder.encode()`.
        bridge: An instance of `Bridge` that initializes the
          decoder states.
        helper: An instance of `Feedback` that samples next
          symbols from logits.

    Returns: A tuple `(init_decoder_states, decoding_params)`.
      `decoding_params` is a tuple containing pre-projected attention
      keys, attention values and attention length, and will be passed
      to `step()` function.
    """
    attention_values = encoder_output.attention_values
    attention_length = encoder_output.attention_length
    with tf.variable_scope(self._attention.name):
        projected_attention_keys = fflayer(
            inputs=attention_values,
            output_size=self._attention.attention_units,
            dropout_input_keep_prob=self.params["dropout_context_keep_prob"],
            activation=None,
            name="ff_att_keys")
    init_rnn_states = bridge(self._r_rnn_cells.state_size)
    decoding_params = (projected_attention_keys, attention_values,
                       attention_length)
    return init_rnn_states, decoding_params

def top(self, top_features):
    """ Computes logits on the top layer.

    Args:
        top_features: A Tensor.

    Returns: A logits Tensor.
    """
    feature_last_dim = top_features.get_shape().as_list()[-1]
    if self.params["share_embedding_and_softmax_weights"]:
        assert feature_last_dim == self._body_input_depth, \
            ("when share_embedding_and_softmax_weights is enabled, "
             "dim_logits should be equal to input_depth")
        scope_name = "shared"
        with tf.variable_scope(scope_name, reuse=True):
            # reuse the (transposed) embedding table as the softmax weight
            var = tf.transpose(self._get_weight(feature_last_dim), [1, 0])
    else:
        scope_name = "softmax"
        var = None
    logits = fflayer(
        top_features,
        output_size=self.top_dimension,
        handle=var,
        activation=None,
        name=scope_name,
        dropout_input_keep_prob=self.params["dropout_logit_keep_prob"])
    return logits

def _create(self, encoder_output, decoder_state_size, **kwargs):
    """ Creates decoder's initial RNN states according to
    `decoder_state_size`.

    Do linear transformations to encoder output/state and map the
    structure to `decoder_state_size`. If params["bridge_input"] ==
    "outputs", first average the encoder output tensor over timesteps.

    Args:
        encoder_output: An instance of `collections.namedtuple`
          from `Encoder.encode()`.
        decoder_state_size: RNN decoder state size.
        **kwargs:

    Returns: The decoder states with the structure determined
      by `decoder_state_size`.

    Raises:
        ValueError: if `encoder_output` has no attribute named
          params["bridge_input"].
    """
    if not hasattr(encoder_output, self.params["bridge_input"]):
        raise ValueError("encoder output has no attribute: {}, "
                         "only outputs and final_states available"
                         .format(self.params["bridge_input"]))
    if self.params["bridge_input"] == "outputs":
        # [batch_size, max_time, num_units]
        context = encoder_output.outputs
        mask = tf.sequence_mask(
            lengths=tf.to_int32(encoder_output.attention_length),
            maxlen=tf.shape(context)[1],
            dtype=tf.float32)
        # masked average over timesteps: [batch_size, num_units]
        bridge_input = tf.truediv(
            tf.reduce_sum(context * tf.expand_dims(mask, 2), axis=1),
            tf.expand_dims(
                tf.to_float(encoder_output.attention_length), 1))
    elif self.params["bridge_input"] == "final_states":
        bridge_input = nest.flatten(_final_states(encoder_output.final_states))
        bridge_input = tf.concat(bridge_input, 1)
    else:
        raise ValueError("Unrecognized value of bridge_input: {}, "
                         "should be outputs or final_states"
                         .format(self.params["bridge_input"]))
    state_size_splits = nest.flatten(decoder_state_size)
    total_decoder_state_size = sum(state_size_splits)
    # [batch_size, total_decoder_state_size]
    init_state = fflayer(inputs=bridge_input,
                         output_size=total_decoder_state_size,
                         activation=self._activation,
                         name="init_state_trans")
    init_state = nest.pack_sequence_as(
        decoder_state_size,
        tf.split(init_state, state_size_splits, axis=1))
    return init_state

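# The final split/pack above distributes one big projected vector across an
# arbitrarily nested `decoder_state_size` (e.g. stacked cells or LSTM state
# tuples). A toy illustration with assumed sizes:
import tensorflow as tf
from tensorflow.python.util import nest

decoder_state_size = (256, (256, 256))   # e.g. a GRU plus an LSTM-like pair
state_size_splits = nest.flatten(decoder_state_size)   # [256, 256, 256]
init_state = tf.zeros([8, sum(state_size_splits)])     # [batch_size, 768]
packed = nest.pack_sequence_as(
    decoder_state_size, tf.split(init_state, state_size_splits, axis=1))
# packed == (Tensor[8, 256], (Tensor[8, 256], Tensor[8, 256]))
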
def prepare(self, encoder_output, bridge, helper):
    """ Prepares for `step()` function.

    Do:
        1. initialize decoder RNN states using `bridge`;
        2. acquire attention information from `encoder_output`;
        3. pre-project the attention keys.

    Args:
        encoder_output: An instance of `collections.namedtuple`
          from `Encoder.encode()`.
        bridge: An instance of `Bridge` that initializes the
          decoder states.
        helper: An instance of `Feedback` that samples next
          symbols from logits.

    Returns: A dict containing decoder RNN states, pre-projected
      attention keys, attention values and attention length, which
      will be passed to `step()` function.
    """
    # [batch_size, timesteps, dim_context]
    attention_values = encoder_output.attention_values
    if hasattr(encoder_output, "attention_bias"):
        attention_bias = encoder_output.attention_bias
    else:
        attention_length = encoder_output.attention_length
        attention_bias = getattr(
            eval(self.params["attention.class"]),
            "attention_length_to_bias")(
                tf.shape(attention_values)[1], attention_length)
    with tf.variable_scope(self._attention.name):
        projected_attention_keys = fflayer(
            inputs=attention_values,
            output_size=self._attention.attention_units,
            activation=None,
            dropout_input_keep_prob=self.params["dropout_context_keep_prob"],
            name="ff_att_keys")
    init_rnn_states = bridge(encoder_output, self._rnn_cells.state_size)
    if self._attention.attention_value_depth > 0:
        # the value depth is known statically: build zeros with that shape
        init_att_context = tf.zeros(
            [tf.shape(attention_values)[0],
             self._attention.attention_value_depth],
            dtype=tf.float32)
    else:
        init_att_context = tf.zeros_like(attention_values[:, 0, :],
                                         dtype=tf.float32)
    init_cache = initialize_cache(
        decoding_states={"rnn_states": init_rnn_states,
                         "attention_context": init_att_context},
        attention_keys=projected_attention_keys,
        memory=attention_values,
        memory_bias=attention_bias)
    return init_cache

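# `initialize_cache` is not defined in this excerpt. Presumably it just
# bundles the mutable decoding states with the static attention memory into
# one dict; a minimal sketch under that assumption:
def initialize_cache(decoding_states, **static_entries):
    cache = {"decoding_states": decoding_states}
    # e.g. attention_keys, memory, memory_bias stay fixed across timesteps
    cache.update(static_entries)
    return cache
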
def build(self, query, memory, memory_length=None, memory_bias=None,
          cache=None):
    """ Builds attention context via a simple process.

    Args:
        query: Attention query tensor with shape
          [batch_size, channels_query].
        memory: Attention values tensor with shape
          [batch_size, num_of_values, channels_value].
        memory_length: The number of attention values, a Tensor
          with shape [batch_size,].
        memory_bias: The bias tensor for attention values.
        cache: A dictionary containing pre-projected keys and values.
          This field is specifically for MultiHeadAttention.

    Returns: A tuple `(attention_scores, attention_context)`. The
      `attention_scores` has shape [batch_size, num_of_values]. The
      `attention_context` has shape [batch_size, channels_value].
    """
    with tf.variable_scope(self.name):
        query = fflayer(query, output_size=self.attention_units,
                        activation=None, name="ff_att_query")
        keys = memory
        if cache is not None and "attention_keys" in cache:
            keys = cache["attention_keys"]
        if memory_bias is None:
            if memory_length is not None:
                memory_bias = BaseAttention.attention_length_to_bias(
                    tf.shape(memory)[1], memory_length)
        # attention weights: [batch_size, num_of_values]
        attention_weight = self.att_fn(query, keys, memory_bias)
        # Calculate the weighted average of the attention inputs
        # according to the scores:
        # [batch_size, num_of_values, 1] * [batch_size, num_of_values, channels_value]
        context = tf.expand_dims(attention_weight, 2) * memory
        # [batch_size, channels_value]
        context = tf.reduce_sum(context, 1, name="context")
        context.set_shape([None, memory.get_shape().as_list()[-1]])
        return attention_weight, context

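# Neither `att_fn` nor `attention_length_to_bias` is defined in this excerpt.
# Below is a minimal sketch of both under common assumptions: the bias puts a
# large negative value at padded positions so the softmax ignores them, and
# the score function is additive (Bahdanau-style). Names and details are
# assumptions, not the module's actual implementation.
import tensorflow as tf

def attention_length_to_bias(num_of_values, memory_length):
    # 1.0 at valid positions, 0.0 at padding: [batch_size, num_of_values]
    mask = tf.sequence_mask(lengths=tf.to_int32(memory_length),
                            maxlen=num_of_values, dtype=tf.float32)
    # approximately -inf at padded positions, 0 elsewhere
    return (1.0 - mask) * -1e9

def additive_att_fn(query, keys, memory_bias=None):
    # query: [batch_size, units], keys: [batch_size, num_of_keys, units]
    units = keys.get_shape().as_list()[-1]
    v = tf.get_variable("v_att", shape=[units], dtype=tf.float32)
    # broadcast the query over time and score each key: [batch_size, num_of_keys]
    logits = tf.reduce_sum(v * tf.tanh(keys + tf.expand_dims(query, 1)),
                           axis=2)
    if memory_bias is not None:
        logits += memory_bias
    # normalized attention weights over the memory positions
    return tf.nn.softmax(logits)
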
def step(self, decoder_input, decoder_states, decoding_params):
    """ Decodes one step.

    Args:
        decoder_input: The decoder input for this timestep, an
          instance of `tf.Tensor`, [batch_size, dim_word].
        decoder_states: The decoder RNN states at previous timestep.
          Must have the same structure with `init_decoder_states`
          returned from `prepare()` function.
        decoding_params: The same as `decoding_params` returned
          from `prepare()` function.

    Returns: A tuple `(cur_decoder_outputs, cur_decoder_states)` at
      this timestep. The `cur_decoder_outputs` must be an instance of
      `collections.namedtuple` whose element types are defined by
      `output_dtype` property. The `cur_decoder_states` must have the
      same structure with `decoder_states`.
    """
    projected_attention_keys, attention_values, attention_length = \
        decoding_params
    # layer0: compute hidden1 from the previous target word
    cell_output0, cell_state0 = self._cond_rnn_cell(
        decoder_input, decoder_states[0])
    # Compute attention using hidden1 as the query.
    with tf.variable_scope(self._attention.name):
        projected_query = fflayer(
            cell_output0,
            output_size=self._attention.attention_units,
            dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
            activation=None,
            name="ff_att_query")
        # attention_scores: [batch_size, n_timesteps_src]
        # attention_context: [batch_size, dim_context]
        attention_scores, attention_context = self._attention.build(
            query=projected_query,
            keys=projected_attention_keys,
            memory=attention_values,
            memory_length=attention_length)
    # hidden1's state is hidden2's initial state
    following_decoder_state = tuple([cell_state0] + list(decoder_states[1:]))
    cell_output, cell_states = self._r_rnn_cells(attention_context,
                                                 following_decoder_state)
    outputs = self._DecoderOutputSpec(
        cur_decoder_hidden=cell_output,
        prev_input=decoder_input,
        attention_context=attention_context,
        attention_scores=attention_scores)
    return outputs, cell_states

def merge_top_features(self, decoder_output):
    """ Merges features of decoder top layers, as the input of softmax layer.

    Features to be merged are as follows:
        1. current decoder RNN state;
        2. current attention context;
        3. previous predicted word.

    Args:
        decoder_output: An instance of `collections.namedtuple`
          whose element types are defined by `output_dtype` property.

    Returns: An instance of `tf.Tensor`, as the input of softmax layer.
    """
    assert isinstance(decoder_output, self._DecoderOutputSpec)
    cur_decoder_hidden = decoder_output.cur_decoder_hidden
    prev_input = decoder_output.prev_input
    attention_context = decoder_output.attention_context
    logit_lstm = fflayer(
        cur_decoder_hidden,
        output_size=self.params["logits_dimension"],
        dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
        activation=None,
        name="ff_logit_lstm")
    logit_prev = fflayer(
        prev_input,
        output_size=self.params["logits_dimension"],
        dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"],
        activation=None,
        name="ff_logit_prev")
    logit_ctx = fflayer(
        attention_context,
        output_size=self.params["logits_dimension"],
        dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
        activation=None,
        name="ff_logit_ctx")
    # tanh of the summed linear projections of state, prev word and context
    merged_output = tf.tanh(logit_lstm + logit_prev + logit_ctx)
    return merged_output

def merge_top_features(self, decoder_states):
    """ Merges features of decoder top layers, as the input of softmax layer.

    Features to be merged are as follows:
        1. current decoder RNN state;
        2. previous predicted word.

    Args:
        decoder_states: An instance of `collections.namedtuple`
          whose element types are defined by `output_dtype` property.

    Returns: An instance of `tf.Tensor`, as the input of softmax layer.
    """
    cur_decoder_hidden = decoder_states.cur_decoder_hidden
    prev_input = decoder_states.prev_input
    logit_lstm = fflayer(cur_decoder_hidden,
                         output_size=self.params["logits_dimension"],
                         activation=None,
                         dropout_input_keep_prob=self.params["dropout_hidden_keep_prob"],
                         name="ff_logit_lstm")
    logit_prev = fflayer(prev_input,
                         output_size=self.params["logits_dimension"],
                         activation=None,
                         dropout_input_keep_prob=self.params["dropout_embedding_keep_prob"],
                         name="ff_logit_prev")
    merged_output = tf.tanh(logit_lstm + logit_prev)
    return merged_output
