def __call__(self, inputs, state, scope=None): """Run one step of F-LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: not used Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the F-LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of F-LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ (c_prev, m_prev) = state input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError( "Could not infer input size from inputs.get_shape()[-1]") with vs.variable_scope(scope or "flstm_cell", initializer=self._initializer): with vs.variable_scope("factor"): fact = linear([inputs, m_prev], self._factor_size, False) concat = linear(fact, 4 * self._num_units, True) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) c = math_ops.sigmoid(f + self._forget_bias) * c_prev + math_ops.sigmoid( i) * math_ops.tanh(j) m = math_ops.sigmoid(o) * self._activation(c) if self._num_proj is not None: with vs.variable_scope("projection"): m = linear(m, self._num_proj, bias=False, scope=scope) new_state = LSTMStateTuple(c, m) return m, new_state
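# Hedged usage sketch (not part of the original source): assuming the __call__
# above belongs to an RNNCell subclass, here named `FLSTMCell` for illustration
# (the class name and constructor arguments are assumptions), the cell can be
# driven by tf.nn.dynamic_rnn like any other LSTM variant:
#
#   cell = FLSTMCell(num_units=512, factor_size=128, num_proj=256)  # hypothetical ctor
#   x = tf.placeholder(tf.float32, [None, 20, 64])   # batch x time x features
#   outputs, final_state = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
#   # outputs: batch x time x num_proj (or num_units if num_proj is None);
#   # final_state: the LSTMStateTuple(c, m) returned by __call__ above.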
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" from tensorflow.python.ops import array_ops from tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl import _linear as linear with tf.variable_scope(scope or "basic_lstm_cell"): # Parameters of gates are concatenated into one multiply for efficiency. if self._state_is_tuple: c, h = state else: c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) concat = linear([inputs, h], 4 * self._num_units, True, scope=scope) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) i = ln(i, scope='i/') j = ln(i, scope='j/') f = ln(i, scope='f/') o = ln(i, scope='o/') new_c = (c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * self._activation(j)) new_h = self._activation(new_c) * tf.sigmoid(o) if self._state_is_tuple: new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h) else: new_state = array_ops.concat([new_c, new_h], 1) return new_h, new_state
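# The layer-normalized cell above calls an `ln` helper that is not defined in
# this excerpt. A minimal layer-normalization sketch with the signature the
# snippet assumes (the `scale`/`shift` variable names and the epsilon value
# are illustrative assumptions, not the author's exact implementation):
def ln(tensor, scope=None, epsilon=1e-5):
  """Layer-normalizes a 2-D tensor along its second axis."""
  assert len(tensor.get_shape()) == 2
  mean, variance = tf.nn.moments(tensor, [1], keep_dims=True)
  with tf.variable_scope((scope or '') + 'layer_norm'):
    scale = tf.get_variable('scale', shape=[tensor.get_shape()[1]],
                            initializer=tf.constant_initializer(1.0))
    shift = tf.get_variable('shift', shape=[tensor.get_shape()[1]],
                            initializer=tf.constant_initializer(0.0))
  normalized = (tensor - mean) / tf.sqrt(variance + epsilon)
  return normalized * scale + shift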
def attention(query, prev_alpha):
  """Calculate attention weights."""
  with variable_scope.variable_scope("Attention"):
    y = linear(query, attention_vec_size, True)
    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
    if self.use_conv:
      # Condition the score on the previous attention weights as well.
      conv_features = nn_ops.conv2d(prev_alpha, F, [1, 1, 1, 1], "SAME")
      feat_reshape = nn_ops.conv2d(conv_features, U, [1, 1, 1, 1], "SAME")
      s = math_ops.reduce_sum(
          v * math_ops.tanh(hidden_features + y + feat_reshape), [2, 3])
    else:
      s = math_ops.reduce_sum(
          v * math_ops.tanh(hidden_features + y), [2, 3])

    # Mask out padded positions and renormalize the weights.
    alpha = nn_ops.softmax(s) * attn_mask
    sum_vec = tf.reduce_sum(
        alpha, reduction_indices=[1], keep_dims=True) + 1e-12
    norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
    alpha = alpha / norm_term
    alpha = tf.expand_dims(alpha, 2)
    alpha = tf.expand_dims(alpha, 3)

    # Now calculate the attention-weighted vector d.
    d = math_ops.reduce_sum(alpha * hidden, [1, 2])
    d = array_ops.reshape(d, [-1, attn_size])

    return tuple([d, alpha])
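# Hedged standalone illustration (not from the original source) of the masking
# trick used above: softmax over all positions, zero out padded positions with
# a 0/1 mask, then renormalize so the weights again sum to one. Names are
# illustrative; the epsilon mirrors the 1e-12 constant in the snippet.
def masked_attention_weights(scores, mask, eps=1e-12):
  """scores, mask: [batch, attn_length]; mask is 1.0 for valid positions."""
  alpha = tf.nn.softmax(scores) * mask
  alpha_sum = tf.reduce_sum(alpha, axis=1, keep_dims=True) + eps
  return alpha / alpha_sum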
def testLinear(self):
  with self.test_session() as sess:
    with tf.variable_scope("root", initializer=tf.constant_initializer(1.0)):
      x = tf.zeros([1, 2])
      l = linear([x], 2, False)
      sess.run([tf.global_variables_initializer()])
      res = sess.run([l], {x.name: np.array([[1., 2.]])})
      self.assertAllClose(res[0], [[3.0, 3.0]])

      # Checks prevent you from accidentally creating a shared function.
      with self.assertRaises(ValueError):
        l1 = linear([x], 2, False)

      # But you can create a new one in a new scope and share the variables.
      with tf.variable_scope("l1") as new_scope:
        l1 = linear([x], 2, False)
      with tf.variable_scope(new_scope, reuse=True):
        linear([l1], 2, False)
      self.assertEqual(len(tf.trainable_variables()), 2)
def testLinear(self):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(1.0)):
      x = array_ops.zeros([1, 2])
      l = linear([x], 2, False)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run([l], {x.name: np.array([[1., 2.]])})
      self.assertAllClose(res[0], [[3.0, 3.0]])

      # Checks prevent you from accidentally creating a shared function.
      with self.assertRaises(ValueError):
        l1 = linear([x], 2, False)

      # But you can create a new one in a new scope and share the variables.
      with variable_scope.variable_scope("l1") as new_scope:
        l1 = linear([x], 2, False)
      with variable_scope.variable_scope(new_scope, reuse=True):
        linear([l1], 2, False)
      self.assertEqual(len(variables_lib.trainable_variables()), 2)
def attention(query):
  """Put attention masks on hidden using hidden_features and query."""
  ds = []  # Results of attention reads will be stored here.
  for a in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % a):
      y = linear(query, attention_vec_size, True)
      y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
      # Attention mask is a softmax of v^T * tanh(...).
      s = math_ops.reduce_sum(
          v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
      a = nn_ops.softmax(s)
      # Now calculate the attention-weighted vector d.
      d = math_ops.reduce_sum(
          array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
      ds.append(array_ops.reshape(d, [-1, attn_size]))
  return ds
def basic_rnn_cell(inputs, state, num_units, scope=None):
  if state is None:
    if inputs is not None:
      batch_size = inputs.get_shape()[0]
      dtype = inputs.dtype
    else:
      batch_size = 0
      dtype = tf.float32
    init_output = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
    init_state = tf.zeros(tf.stack([batch_size, num_units]), dtype=dtype)
    init_output.set_shape([batch_size, num_units])
    init_state.set_shape([batch_size, num_units])
    return init_output, init_state
  else:
    with tf.variable_scope(scope, "basic_rnn_cell", [inputs, state]):
      output = tf.tanh(linear([inputs, state], num_units, True))
    return output, output
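# Hedged usage sketch (not part of the original source): one way to unroll
# basic_rnn_cell over a Python list of per-step [batch, depth] tensors with a
# statically known batch size. The function and variable names here are
# illustrative assumptions; weights are shared across steps by reusing the
# enclosing variable scope.
def run_basic_rnn(inputs_by_step, num_units):
  # The state=None call only builds the zero initial output/state pair.
  _, state = basic_rnn_cell(inputs_by_step[0], None, num_units)
  outputs = []
  with tf.variable_scope("rnn") as scope:
    for t, x_t in enumerate(inputs_by_step):
      if t > 0:
        scope.reuse_variables()  # Share the weights created at step 0.
      output, state = basic_rnn_cell(x_t, state, num_units, scope=scope)
      outputs.append(output)
  return outputs, state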
def attn_loop_function(time, cell_output, state, loop_state):

  def attention(query, prev_alpha):
    """Calculate attention weights."""
    with variable_scope.variable_scope("Attention"):
      y = linear(query, attention_vec_size, True)
      y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
      s = math_ops.reduce_sum(
          v * math_ops.tanh(hidden_features + y), [2, 3])

      # Mask out padded positions and renormalize the weights.
      alpha = nn_ops.softmax(s) * attn_mask
      sum_vec = tf.reduce_sum(
          alpha, reduction_indices=[1], keep_dims=True) + 1e-12
      norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
      alpha = alpha / norm_term
      alpha = tf.expand_dims(alpha, 2)
      alpha = tf.expand_dims(alpha, 3)

      # Now calculate the attention-weighted vector d.
      d = math_ops.reduce_sum(alpha * hidden, [1, 2])
      d = array_ops.reshape(d, [-1, attn_size])
      return tuple([d, alpha])

  # If loop_function is set, we use it instead of decoder_inputs.
  elements_finished = (time >= seq_len)
  finished = tf.reduce_all(elements_finished)

  if cell_output is None:
    # First call (time == 0): emit the initial state and the first input.
    next_state = final_state
    output = None
    loop_state = tuple([attn, alpha])
    next_input = inputs_ta.read(time)
  else:
    next_state = state
    loop_state = attention(cell_output, loop_state[1])
    with variable_scope.variable_scope("AttnOutputProjection"):
      output = linear([cell_output, loop_state[0]],
                      self.cell.output_size, True)

    if loop_function is not None:
      simple_input = loop_function(output)
    else:
      simple_input = tf.cond(
          finished,
          lambda: tf.zeros([batch_size, embedding_size], dtype=tf.float32),
          lambda: inputs_ta.read(time))

    # Merge input and previous attentions into one vector of the right size.
    input_size = simple_input.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size")
    with variable_scope.variable_scope("InputProjection"):
      next_input = linear([simple_input, loop_state[0]], input_size, True)

  return (elements_finished, next_input, next_state, output, loop_state)
def beam_attention_decoder(decoder_inputs,
                           initial_state,
                           attention_states,
                           cell,
                           output_size=None,
                           num_heads=1,
                           loop_function=None,
                           dtype=dtypes.float32,
                           scope=None,
                           initial_state_attention=False,
                           output_projection=None,
                           beam_size=10):
  """RNN decoder with attention and beam search for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor.  This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks.  This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details).  It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to the i-th
      output in order to generate the (i+1)-th input, and decoder_inputs will
      be ignored, except for the first element ("GO" symbol). This can be used
      for decoding, but also for training to emulate
      http://arxiv.org/abs/1506.03099. In this beam-search variant the
      signature is
        loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) = next
      where:
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * log_beam_probs, beam_path, beam_symbols are Python lists that the
          function appends its beam bookkeeping tensors to,
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously stored
      decoder state and attention states.
    output_projection: A pair (W, B) of output projection weights and biases;
      each decoder output is passed through xw_plus_b(output, W, B) before the
      argmax over the vocabulary is taken.
    beam_size: Width of the beam used to reshape the returned beam_path and
      beam_symbols tensors (default: 10).

  Returns:
    A tuple of the form (outputs, state, beam_path, beam_symbols), where:
      outputs: A list of the same length as decoder_inputs of 1D int Tensors
        of shape [batch_size] holding the argmax of the projected decoder
        output at each step. Internally, output i is computed by first running
        the cell on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
      beam_path: A Tensor reshaped to [-1, beam_size] holding the beam
        backpointers accumulated by loop_function.
      beam_symbols: A Tensor reshaped to [-1, beam_size] holding the symbols
        chosen by loop_function at each step.

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
""" if not decoder_inputs: raise ValueError("Must provide at least 1 input to attention decoder.") if num_heads < 1: raise ValueError("With less than 1 heads, use a non-attention decoder.") if not attention_states.get_shape()[1:2].is_fully_defined(): raise ValueError("Shape[1] and [2] of attention_states must be known: %s" % attention_states.get_shape()) if output_size is None: output_size = cell.output_size with variable_scope.variable_scope(scope or "attention_decoder"): batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. attn_length = attention_states.get_shape()[1].value attn_size = attention_states.get_shape()[2].value # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. hidden = array_ops.reshape( attention_states, [-1, attn_length, 1, attn_size]) hidden_features = [] v = [] attention_vec_size = attn_size # Size of query vectors for attention. for a in xrange(num_heads): k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size]) hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) print("Initial_state") state = initial_state def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v^T * tanh(...). s = math_ops.reduce_sum( v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3]) a = nn_ops.softmax(s) # Now calculate the attention-weighted vector d. d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) # for c in range(ct): ds.append(array_ops.reshape(d, [-1, attn_size])) return ds outputs = [] prev = None batch_attn_size = tf.stack([batch_size, attn_size]) attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)] for a in attns: # Ensure the second shape of attention vectors is set. a.set_shape([None, attn_size]) if initial_state_attention: attns = [] attns.append(attention(initial_state)) tmp = tf.reshape(tf.concat(axis=0, values=attns), [-1, attn_size]) attns = [] attns.append(tmp) log_beam_probs, beam_path, beam_symbols = [],[],[] for i, inp in enumerate(decoder_inputs): if i > 0: variable_scope.get_variable_scope().reuse_variables() # If loop_function is set, we use it instead of decoder_inputs. if loop_function is not None : with variable_scope.variable_scope("loop_function", reuse=True): if prev is not None: inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols) input_size = inp.get_shape().with_rank(2)[1] x = linear([inp] + attns, input_size, True) cell_output, state = cell(x, state) # Run the attention mechanism. 
      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)
      else:
        attns = attention(state)

      with variable_scope.variable_scope("AttnOutputProjection"):
        output = linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        prev = output
      if i == 0:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns = attention(state)

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), axis=1))

  return (outputs, state,
          tf.reshape(tf.concat(axis=0, values=beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(axis=0, values=beam_symbols), [-1, beam_size]))
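# Hedged usage sketch (not part of the original source): a minimal call into
# beam_attention_decoder. All names below are illustrative assumptions; in
# particular `beam_search_loop_function` stands in for whatever helper the
# surrounding project supplies with the signature
# loop_function(prev, i, log_beam_probs, beam_path, beam_symbols) used above.
#
#   cell = tf.contrib.rnn.GRUCell(128)
#   attention_states = encoder_outputs          # [batch, attn_length, attn_size]
#   decoder_inputs = embedded_decoder_inputs    # list of [batch, input_size] tensors
#   W_out = tf.get_variable("proj_w", [cell.output_size, vocab_size])
#   b_out = tf.get_variable("proj_b", [vocab_size])
#
#   symbols, final_state, beam_path, beam_symbols = beam_attention_decoder(
#       decoder_inputs, encoder_final_state, attention_states, cell,
#       loop_function=beam_search_loop_function,  # hypothetical beam-expansion helper
#       output_projection=(W_out, b_out), beam_size=10)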
def __call__(self, inputs, state, scope=None): """Run one step of G-LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: not used Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the G-LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of G-LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ (c_prev, m_prev) = state input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError( "Could not infer input size from inputs.get_shape()[-1]") dtype = inputs.dtype with vs.variable_scope(scope or "glstm_cell", initializer=self._initializer): i_parts = [] j_parts = [] f_parts = [] o_parts = [] for group_id in xrange(self._number_of_groups): with vs.variable_scope("group%d" % group_id): x_g_id = array_ops.concat([ self._get_input_for_group(inputs, group_id, self._group_shape[0]), self._get_input_for_group(m_prev, group_id, self._group_shape[0]) ], axis=1) R_k = linear(x_g_id, 4 * self._group_shape[1], bias=False, scope=scope) #will add per gate biases later i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1) i_parts.append(i_k) j_parts.append(j_k) f_parts.append(f_k) o_parts.append(o_k) #it is more efficient to have per gate biases then per gate, per group bi = vs.get_variable(name="biases_i", shape=[self._num_units], dtype=dtype, initializer=init_ops.constant_initializer( 0.0, dtype=dtype)) bj = vs.get_variable(name="biases_j", shape=[self._num_units], dtype=dtype, initializer=init_ops.constant_initializer( 0.0, dtype=dtype)) bf = vs.get_variable(name="biases_f", shape=[self._num_units], dtype=dtype, initializer=init_ops.constant_initializer( 0.0, dtype=dtype)) bo = vs.get_variable(name="biases_o", shape=[self._num_units], dtype=dtype, initializer=init_ops.constant_initializer( 0.0, dtype=dtype)) i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi) j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj) f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf) o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo) c = math_ops.sigmoid(f + self._forget_bias) * c_prev + math_ops.sigmoid( i) * math_ops.tanh(j) m = math_ops.sigmoid(o) * self._activation(c) if self._num_proj is not None: with vs.variable_scope("projection"): m = linear(m, self._num_proj, bias=False, scope=scope) new_state = LSTMStateTuple(c, m) return m, new_state
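# The G-LSTM step above relies on self._get_input_for_group, which is not shown
# in this excerpt. A minimal sketch of what such a helper could look like,
# assuming each group owns a contiguous block of `group_size` columns (the
# slice name is illustrative):
def _get_input_for_group(self, inputs, group_id, group_size):
  """Slices out columns [group_id * group_size, (group_id + 1) * group_size)."""
  return array_ops.slice(inputs,
                         begin=[0, group_id * group_size],
                         size=[-1, group_size],
                         name="GLSTM_group%d_input_generation" % group_id)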