def call(self, inputs, state):
    """Run one step through the stack, with the attention cell first.

    Cell 0 is fed `[inputs, self.agenda]`. Every later cell is fed
    `[x, self.agenda, attention]`, where `x` is the running layer output
    and `attention` is read from the *incoming* state of cell 0. Later
    cells add their output to their input (residual connection); cell 0
    does not.

    Args:
        inputs: input for this step.
        state: per-cell states; a sequence indexed by layer when
            `self._state_is_tuple` is set, otherwise a single tensor that
            is sliced along axis 1 using each cell's `state_size`.

    Returns:
        A pair `(output, new_states)` where `output` is the tuple
        `(concat([x, new_attention], axis=1), x)` and `new_states` is a
        tuple of per-cell states (tuple mode) or their concatenation
        along axis 1 (flat mode).

    Raises:
        ValueError: if tuple mode is on and `state` is not a sequence, or
            if the state of cell 0 is not an `AttentionWrapperState`.
    """
    layer_out = inputs
    offset = 0
    collected_states = []

    first_cell = self._cells[0]
    with tf.variable_scope("cell_%d" % 0):
        if not self._state_is_tuple:
            # NOTE(review): a sliced tensor presumably never passes the
            # AttentionWrapperState check below, so flat-state mode looks
            # unusable for this cell -- confirm.
            first_state = tf.slice(state, [0, offset],
                                   [-1, first_cell.state_size])
            offset += first_cell.state_size
        else:
            if not nest.is_sequence(state):
                raise ValueError(
                    "Expected state to be a tuple of length %d, but received: %s"
                    % (len(self.state_size), state))
            first_state = state[0]

        if not isinstance(first_state, seq2seq.AttentionWrapperState):
            raise ValueError("First state should be instance of AttentionWrapperState")

        # Attention fed to the upper layers comes from the incoming state.
        attention = first_state.attention

        first_out, first_new_state = first_cell(
            tf.concat([layer_out, self.agenda], axis=-1), first_state)
        collected_states.append(first_new_state)
        # Attention exposed in the output comes from the updated state.
        new_attention = first_new_state.attention

        # No skip connection around the attention cell.
        layer_out = first_out

    for layer_idx, layer_cell in enumerate(self._cells[1:], 1):
        with tf.variable_scope("cell_%d" % (layer_idx)):
            if not self._state_is_tuple:
                layer_state = tf.slice(state, [0, offset],
                                       [-1, layer_cell.state_size])
                offset += layer_cell.state_size
            else:
                if not nest.is_sequence(state):
                    raise ValueError(
                        "Expected state to be a tuple of length %d, but received: %s"
                        % (len(self.state_size), state))
                layer_state = state[layer_idx]

            cell_out, layer_new_state = layer_cell(
                tf.concat([layer_out, self.agenda, attention], axis=-1),
                layer_state)
            collected_states.append(layer_new_state)

            # Residual connection from the cell input to the cell output.
            layer_out = layer_out + cell_out

    output = (tf.concat([layer_out, new_attention], axis=1), layer_out)
    if self._state_is_tuple:
        packed_states = tuple(collected_states)
    else:
        packed_states = tf.concat(collected_states, 1)
    return output, packed_states
def linear(args, output_size, bias, bias_start=0.0):
    """Apply a single linear map to one tensor or to several concatenated ones.

    Computes `concat(args, 1) @ weights (+ biases)`, which equals
    sum_i(args[i] * W[i]) with the W[i] stacked into one `weights` variable.

    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of the weight matrix.
        bias: boolean, whether to add a bias term or not.
        bias_start: starting value to initialize the bias; 0 by default.

    Returns:
        A 2D Tensor with shape [batch x output_size].

    Raises:
        ValueError: if `args` is None or an empty sequence, if any argument
            is not 2D, or if the second dimension of any argument is unknown.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Sum the static width (dimension 1) of every argument.
    shapes = [arg.get_shape() for arg in args]
    total_arg_size = 0
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        width = shape[1].value
        if width is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))
        total_arg_size += width

    # The dtype of the first argument is used for all created variables.
    dtype = args[0].dtype

    current_scope = tf.get_variable_scope()
    with tf.variable_scope(current_scope) as outer_scope:
        weights = tf.get_variable(
            "weights", [total_arg_size, output_size], dtype=dtype)
        stacked = args[0] if len(args) == 1 else tf.concat(args, 1)
        product = tf.matmul(stacked, weights)
        if not bias:
            return product

        with tf.variable_scope(outer_scope) as inner_scope:
            # The bias variable is created with partitioning turned off.
            inner_scope.set_partitioner(None)
            bias_init = tf.constant_initializer(bias_start, dtype=dtype)
            biases = tf.get_variable(
                "biases", [output_size], dtype=dtype, initializer=bias_init)
        return tf.nn.bias_add(product, biases)
def __init__(self, cells):
    """Validate and store the cells that make up the stack.

    Args:
        cells: non-empty list or tuple (anything `nest.is_sequence`
            accepts) of cells; stored as-is on `self._cells`.

    Raises:
        ValueError: if `cells` is empty (or otherwise falsy).
        TypeError: if `cells` is not a sequence per `nest.is_sequence`.
    """
    super(MultiGatedFeedbackRNNCell, self).__init__()
    if not cells:
        raise ValueError("Must specify at least one cell for MultiGatedFeedbackRNNCell")
    if not nest.is_sequence(cells):
        # Message fixed from "list of tuple": either container type is
        # accepted. The operand is wrapped in a tuple so `%` always formats
        # the object itself.
        raise TypeError("cells must be a list or tuple, but saw: %s." % (cells,))
    self._cells = cells
def call(self, inputs, state):
    """Run the cells in order, feeding each one the previous cell's output.

    Every cell receives the *whole* `state` (not just its own slot), and
    each cell is invoked inside its own `cell_<index>` variable scope.

    Args:
        inputs: input to the first cell.
        state: sequence of states handed unchanged to every cell.

    Returns:
        A pair `(output, new_states)`: the output of the last cell (or
        `inputs` when there are no cells) and a tuple with the new state
        returned by each cell.

    Raises:
        ValueError: if `state` is not a sequence per `nest.is_sequence`.
    """
    layer_input = inputs
    collected = []
    for idx, layer_cell in enumerate(self._cells):
        with variable_scope('cell_%d' % idx):
            if not nest.is_sequence(state):
                raise ValueError(
                    'Expected state to be a tuple of length %d, but received: %s'
                    % (len(self.state_size), state))
            layer_input, layer_state = layer_cell(layer_input, state)
            collected.append(layer_state)
    return layer_input, tuple(collected)
def decoder(feed_previous_bool):
    """Build one attention-decoder branch and return a flat list of tensors.

    Relies on names from the enclosing scope (`decoder_inputs`,
    `encoder_state`, `attention_states`, `dec_cell`, ...).

    Args:
        feed_previous_bool: Python bool forwarded as `feed_previous`. When
            true the variable scope's `reuse` is left unset (None);
            otherwise `reuse=True` is forced.

    Returns:
        The decoder outputs followed by the decoder state, flattened with
        `nest.flatten` when the state is a nested sequence.
    """
    branch_reuse = True if not feed_previous_bool else None
    with tf.variable_scope(tf.get_variable_scope(), reuse=branch_reuse):
        outputs, state = embedding_attention_decoder(
            decoder_inputs,
            encoder_state,
            attention_states,
            dec_cell,
            num_decoder_symbols,
            embedding_size,
            num_heads=num_heads,
            output_size=output_size,
            output_projection=output_projection,
            feed_previous=feed_previous_bool,
            update_embedding_for_previous=False,
            initial_state_attention=initial_state_attention)
        if nest.is_sequence(state):
            flat_state = nest.flatten(state)
        else:
            flat_state = [state]
        return outputs + flat_state
def call(self, inputs, state):
    """Compute one step of this layer's gated-feedback LSTM update.

    Args:
        inputs: input to this layer for the current step.
        state: sequence with one `(c, h)` entry per layer; this layer's
            own entry is `state[self._layer_pos]`.

    Returns:
        A pair `(new_h, LSTMStateTuple(new_c, new_h))`.

    Raises:
        ValueError: if `state` is not a sequence per `nest.is_sequence`.
    """
    if not nest.is_sequence(state):
        raise ValueError("Expected state to be a tuple of length %d, but receive: %s"
                         % (len(self.state_size), state))

    n_layer = len(state)
    c, h = state[self._layer_pos]
    # Last element of every layer's state (presumably its hidden output),
    # concatenated along axis 1.
    all_h = array_ops.concat([layer_state[-1] for layer_state in state], axis=1)

    init_kwargs = dict(bias_initializer=self._bias_initializer,
                       kernel_initializer=self._kernel_initializer)

    with variable_scope('input-forget-output-gates'):
        gate_preacts = _linear([inputs, h], 3 * self._num_units, True, **init_kwargs)
        in_gate, forget_gate, out_gate = array_ops.split(gate_preacts, 3, axis=1)

    with variable_scope('scalar-gates'):
        # One value per layer; no squashing non-linearity is applied here.
        gates = _linear([inputs, all_h], n_layer, True, **init_kwargs)

    with variable_scope('gated-inputs'):
        # NOTE(review): each per-layer gate scales this layer's own `h`
        # (broadcast n_layer times), not the hidden state of the
        # corresponding layer -- confirm this is intended.
        scaled_h = array_ops.expand_dims(gates, axis=2) * array_ops.expand_dims(h, axis=1)
        flat_scaled_h = array_ops.reshape(scaled_h, (-1, n_layer * self._num_units))
        gated_h = _linear(flat_scaled_h, self._num_units, True, **init_kwargs)

    with variable_scope('new-inputs'):
        input_proj = _linear(inputs, self._num_units, True, **init_kwargs)
        new_inputs = self._activation(input_proj + gated_h)

    # The activation wraps the whole cell update, so the stored cell state
    # is already activated and new_h needs no further activation.
    kept = c * math_ops.sigmoid(forget_gate + self._forget_bias)
    written = math_ops.sigmoid(in_gate) * new_inputs
    new_c = self._activation(kept + written)
    new_h = new_c * math_ops.sigmoid(out_gate)

    return (new_h, LSTMStateTuple(new_c, new_h))
def embedding_attention_seq2seq(encoder_inputs,
                                decoder_inputs,
                                enc_cell,
                                dec_cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                num_heads=1,
                                output_projection=None,
                                feed_previous=False,
                                dtype=None,
                                scope=None,
                                initial_state_attention=False):
    """Embedding sequence-to-sequence model with attention.

    The encoder wraps `enc_cell` in an embedding wrapper and runs it over
    `encoder_inputs` with a static RNN; the per-step encoder outputs are
    stacked along axis 1 and used as attention states. The decoder is an
    embedding attention decoder built on `dec_cell`, initialized with the
    final encoder state.

    Warning: when output_projection is None, `dec_cell` is wrapped in an
    output projection of size num_decoder_symbols, so the size of the
    attention vectors and variables grows with num_decoder_symbols and can
    be large.

    Args:
        encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        enc_cell: RNN cell used by the encoder.
        dec_cell: RNN cell used by the decoder.
        num_encoder_symbols: Integer; number of symbols on the encoder side.
        num_decoder_symbols: Integer; number of symbols on the decoder side.
        embedding_size: Integer, the length of the embedding vector for
            each symbol.
        num_heads: Number of attention heads that read from
            attention_states.
        output_projection: None or a pair (W, B) of output projection
            weights and biases; W has shape
            [output_size x num_decoder_symbols] and B has shape
            [num_decoder_symbols]; if provided and feed_previous=True, each
            fed previous output will first be multiplied by W and added B.
        feed_previous: Boolean or scalar Boolean Tensor; if True, only the
            first of decoder_inputs will be used (the "GO" symbol), and all
            other decoder inputs will be taken from previous outputs. If
            False, decoder_inputs are used as given. A Tensor value builds
            both decoder variants and selects one with `tf.cond`.
        dtype: The dtype of the initial RNN state (default: tf.float32).
        scope: VariableScope for the created subgraph; defaults to
            "embedding_attention_seq2seq".
        initial_state_attention: If False (default), initial attentions are
            zero. If True, initialize the attentions from the initial state
            and attention states.

    Returns:
        A tuple of the form (outputs, state), where:
            outputs: A list of the same length as decoder_inputs of 2D
                Tensors with shape [batch_size x num_decoder_symbols]
                containing the generated outputs.
            state: The state of the decoder at the final time-step. In the
                Tensor `feed_previous` case it is repacked to the structure
                of the encoder state when that state is a nested sequence.
    """
    with tf.variable_scope(
            scope or "embedding_attention_seq2seq", dtype=dtype) as scope:
        dtype = scope.dtype

        # Encoder: embed the symbols, then unroll the RNN over all steps.
        embedded_encoder = rnn.EmbeddingWrapper(
            enc_cell,
            embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size)
        encoder_outputs, encoder_state = rnn.static_rnn(
            embedded_encoder, encoder_inputs, dtype=dtype)

        # Stack the per-step encoder outputs along a new axis 1 so the
        # decoder can attend over them.
        attention_states = tf.concat(
            [tf.reshape(step_output, [-1, 1, embedded_encoder.output_size])
             for step_output in encoder_outputs],
            1)

        # Decoder: without an explicit projection the cell itself projects
        # to the decoder vocabulary.
        if output_projection is None:
            dec_cell = rnn.OutputProjectionWrapper(dec_cell, num_decoder_symbols)
            output_size = num_decoder_symbols
        else:
            output_size = None

        # Keyword arguments shared by every decoder construction below.
        shared_kwargs = dict(
            num_heads=num_heads,
            output_size=output_size,
            output_projection=output_projection,
            initial_state_attention=initial_state_attention)

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(
                decoder_inputs,
                encoder_state,
                attention_states,
                dec_cell,
                num_decoder_symbols,
                embedding_size,
                feed_previous=feed_previous,
                **shared_kwargs)

        # feed_previous is a Tensor: build both decoders and pick with cond.
        def _build_branch(use_previous):
            # The use_previous=False branch always reuses variables.
            branch_reuse = None if use_previous else True
            with tf.variable_scope(tf.get_variable_scope(), reuse=branch_reuse):
                outputs, state = embedding_attention_decoder(
                    decoder_inputs,
                    encoder_state,
                    attention_states,
                    dec_cell,
                    num_decoder_symbols,
                    embedding_size,
                    feed_previous=use_previous,
                    update_embedding_for_previous=False,
                    **shared_kwargs)
                if nest.is_sequence(state):
                    flat_state = nest.flatten(state)
                else:
                    flat_state = [state]
                # cond branches return flat lists: outputs first, then state.
                return outputs + flat_state

        outputs_and_state = tf.cond(feed_previous,
                                    lambda: _build_branch(True),
                                    lambda: _build_branch(False))

        # Outputs length is the same as decoder inputs; the rest is state.
        n_outputs = len(decoder_inputs)
        flat_final_state = outputs_and_state[n_outputs:]
        state = flat_final_state[0]
        if nest.is_sequence(encoder_state):
            state = nest.pack_sequence_as(structure=encoder_state,
                                          flat_sequence=flat_final_state)
        return outputs_and_state[:n_outputs], state