def body(i, prev_c, prev_h, actions, log_probs):
  """Runs one decoding step and records the action chosen at step `i`.

  The step input is the tiled "go" embedding on step 0 and the embedding of
  the previously written action otherwise. After the LSTM update, additive
  attention over `attn_mem` yields a context that is concatenated with the
  new hidden state and projected to logits. The next action is then sampled,
  arg-maxed or read from `y`, depending on the enclosing `mode`.

  Args:
    i: Step index; compared against 0 to select the go embedding.
    prev_c: Previous LSTM cell state.
    prev_h: Previous LSTM hidden state.
    actions: Container with `read`/`write` methods holding per-step actions
      (presumably a `TensorArray` -- confirm against the caller).
    log_probs: Running sum of the sparse softmax cross-entropy of the
      chosen actions.

  Returns:
    The tuple `(i + 1, next_c, next_h, actions, log_probs)`.

  Raises:
    NotImplementedError: If `mode` is not "sample", "greedy" or "target".
  """
  hparams = self.hparams
  num_children = hparams.num_children
  hidden_size = hparams.hidden_size

  def _go_signal():
    return array_ops.tile(device_go_embedding, [num_children, 1])

  def _previous_action_signal():
    return embedding_ops.embedding_lookup(device_embeddings,
                                          actions.read(i - 1))

  signal = control_flow_ops.cond(
      math_ops.equal(i, 0), _go_signal, _previous_action_signal)
  if hparams.keep_prob is not None:
    signal = nn_ops.dropout(signal, hparams.keep_prob)

  next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)

  # Additive attention: one score per (child, group) pair.
  projected_h = math_ops.matmul(next_h, attn_w_2)
  projected_h = array_ops.reshape(projected_h,
                                  [num_children, 1, hidden_size])
  energies = math_ops.tanh(projected_h + attn_mem)
  energies = array_ops.reshape(
      energies, [num_children * self.num_groups, hidden_size])
  scores = math_ops.matmul(energies, attn_v)
  scores = array_ops.reshape(scores, [num_children, self.num_groups])
  attn_weights = nn_ops.softmax(scores)
  attn_weights = array_ops.reshape(attn_weights,
                                   [num_children, self.num_groups, 1])
  context = math_ops.reduce_sum(attn_mem * attn_weights, axis=1)
  decoder_features = array_ops.concat([next_h, context], axis=1)

  logits = math_ops.matmul(decoder_features, device_softmax)
  logits = logits / hparams.temperature
  if hparams.tanh_constant > 0:
    logits = math_ops.tanh(logits) * hparams.tanh_constant
  if hparams.logits_std_noise > 0:
    # Noise is scaled by the RMS magnitude of the logits and is only added
    # while global_step has not passed stop_noise_step.
    num_in_logits = math_ops.cast(
        array_ops.size(logits), dtype=dtypes.float32)
    avg_norm = math_ops.divide(
        linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
    logits_noise = random_ops.random_normal(
        array_ops.shape(logits),
        stddev=hparams.logits_std_noise * avg_norm)
    clean_logits = logits
    logits = control_flow_ops.cond(
        self.global_step > hparams.stop_noise_step,
        lambda: clean_logits,
        lambda: clean_logits + logits_noise)

  if mode == "sample":
    next_y = random_ops.multinomial(logits, 1, seed=hparams.seed)
  elif mode == "greedy":
    next_y = math_ops.argmax(logits, 1)
  elif mode == "target":
    next_y = array_ops.slice(y, [0, i], [-1, 1])
  else:
    raise NotImplementedError
  next_y = math_ops.to_int32(next_y)
  next_y = array_ops.reshape(next_y, [num_children])

  actions = actions.write(i, next_y)
  log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=next_y)
  return i + 1, next_c, next_h, actions, log_probs
def LSTMCell(cls, x, mprev, cprev, weights):
  """Computes one LSTM step from a single fused weight matrix.

  `x` and `mprev` are concatenated along axis 1, multiplied by `weights`
  and split into four equal parts along axis 1.

  Args:
    cls: Unused.
    x: Input for this step.
    mprev: Previous output (hidden) state.
    cprev: Previous cell state.
    weights: Matrix mapping the concatenated input to the four gate
      pre-activations.

  Returns:
    A pair `(new_m, new_c)`: the new output state and the new cell state,
    the latter clipped to [-50, 50].
  """
  gate_inputs = math_ops.matmul(array_ops.concat([x, mprev], 1), weights)
  # Note the naming: `i_i` feeds the tanh candidate, `i_g` the input gate.
  i_i, i_g, f_g, o_g = array_ops.split(
      value=gate_inputs, num_or_size_splits=4, axis=1)

  retained = math_ops.sigmoid(f_g) * cprev
  written = math_ops.sigmoid(i_g) * math_ops.tanh(i_i)
  new_c = clip_ops.clip_by_value(retained + written, -50.0, 50.0)
  new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
  return new_m, new_c
def _bahdanau_score(processed_query, keys, normalize):
  """Computes Bahdanau-style (additive) attention scores.

  Two variants are supported. The plain form follows:

  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
  "Neural Machine Translation by Jointly Learning to Align and Translate."
  ICLR 2015. https://arxiv.org/abs/1409.0473

  The normalized form (`normalize=True`) is inspired by:

  Tim Salimans, Diederik P. Kingma.
  "Weight Normalization: A Simple Reparameterization to Accelerate
   Training of Deep Neural Networks."
  https://arxiv.org/abs/1602.07868

  Args:
    processed_query: Tensor, shape `[batch_size, num_units]` to compare to
      keys.
    keys: Processed memory, shape `[batch_size, max_time, num_units]`.
    normalize: Whether to use the weight-normalized form, which also adds
      the `attention_g` scale and `attention_b` bias variables.

  Returns:
    A `[batch_size, max_time]` tensor of unnormalized score values.
  """
  dtype = processed_query.dtype
  # Prefer the static trailing dimension of keys; fall back to the dynamic
  # one when it is unknown.
  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
  # [batch_size, ...] -> [batch_size, 1, ...] so the query broadcasts over
  # the time axis of keys.
  processed_query = array_ops.expand_dims(processed_query, 1)
  v = variable_scope.get_variable("attention_v", [num_units], dtype=dtype)

  if not normalize:
    return math_ops.reduce_sum(
        v * math_ops.tanh(keys + processed_query), [2])

  # Scalar gain used by weight normalization.
  g = variable_scope.get_variable(
      "attention_g", dtype=dtype,
      initializer=math.sqrt((1. / num_units)))
  # Bias added before the nonlinearity.
  b = variable_scope.get_variable(
      "attention_b", [num_units], dtype=dtype,
      initializer=init_ops.zeros_initializer())
  # normed_v = g * v / ||v||
  inverse_norm = math_ops.rsqrt(math_ops.reduce_sum(math_ops.square(v)))
  normed_v = g * v * inverse_norm
  return math_ops.reduce_sum(
      normed_v * math_ops.tanh(keys + processed_query + b), [2])
def attention(decoder_state, coverage=None):
  """Computes the context vector and attention distribution for one step.

  Args:
    decoder_state: State of the decoder; it is passed through `linear` to
      obtain `attention_vec_size` features.
    coverage: Optional. Previous timestep's coverage vector, shape
      (batch_size, attn_len, 1, 1).

  Returns:
    context_vector: Attention-weighted sum of `encoder_states`, reshaped to
      (-1, attn_size).
    attn_dist: Masked and re-normalized attention distribution.
    coverage: New coverage vector, shape (batch_size, attn_len, 1, 1). The
      argument is returned untouched when `use_coverage` is false.
  """
  with variable_scope.variable_scope("Attention"):
    # W_s s_t + b_attn in the paper; shape (batch_size, attention_vec_size).
    decoder_features = linear(decoder_state, attention_vec_size, True)
    # Expand to (batch_size, 1, 1, attention_vec_size) for broadcasting.
    decoder_features = tf.expand_dims(
        tf.expand_dims(decoder_features, 1), 1)

    def masked_attention(e):
      """Softmaxes `e`, applies `enc_padding_mask` and re-normalizes."""
      masked = nn_ops.softmax(e)  # shape (batch_size, attn_length)
      masked *= enc_padding_mask
      masked_sums = tf.reduce_sum(masked, axis=1)  # shape (batch_size)
      return masked / tf.reshape(masked_sums, [-1, 1])

    if use_coverage and coverage is not None:
      # Non-first step with coverage: project the coverage vector with w_c
      # to (batch_size, attn_length, 1, attention_vec_size).
      coverage_features = nn_ops.conv2d(
          coverage, w_c, [1, 1, 1, 1], "SAME")
      # v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
      e = math_ops.reduce_sum(
          v * math_ops.tanh(
              encoder_features + decoder_features + coverage_features),
          [2, 3])  # shape (batch_size, attn_length)
      attn_dist = masked_attention(e)
      # Accumulate this step's attention into the coverage vector.
      coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
    else:
      # v^T tanh(W_h h_i + W_s s_t + b_attn)
      e = math_ops.reduce_sum(
          v * math_ops.tanh(encoder_features + decoder_features), [2, 3])
      attn_dist = masked_attention(e)
      if use_coverage:
        # First step with coverage: start from this step's attention.
        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)

    # Weighted sum of the encoder states, shape (batch_size, attn_size).
    weighted_states = (
        array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
        encoder_states)
    context_vector = math_ops.reduce_sum(weighted_states, [1, 2])
    context_vector = array_ops.reshape(context_vector, [-1, attn_size])

  return context_vector, attn_dist, coverage
def __call__(self, inputs, state, scope=None):
  """Runs one step of the long short-term memory (LSTM) cell.

  Args:
    inputs: Input for this step.
    state: Previous state; it is split in two along axis 1 into the cell
      state and the hidden state.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    A pair `(new_h, new_state)`, where `new_state` is the new cell state
    and `new_h` concatenated along axis 1.
  """
  with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
    prev_c, prev_h = array_ops.split(1, 2, state)
    # All four gates come out of a single multiply for efficiency.
    gates = linear([inputs, prev_h], 4 * self._num_units, True)
    # Order: input gate, new input, forget gate, output gate.
    in_gate, new_input, forget_gate, out_gate = array_ops.split(1, 4, gates)

    retained = prev_c * sigmoid(forget_gate + self._forget_bias)
    written = sigmoid(in_gate) * tanh(new_input)
    new_c = retained + written
    new_h = tanh(new_c) * sigmoid(out_gate)

  return new_h, array_ops.concat(1, [new_c, new_h])
def embed(self, func, embedding_classes, embedding_size, inputs, dtype=None,
          scope=None, keep_prob=1.0, initializer=None):
  """Embeds each input and gates it, without running the RNN cell itself.

  Args:
    func: Callable that wraps `self._cell` into an embedder; called as
      `func(cell, embedding_classes, embedding_size, initializer=...)`.
    embedding_classes: Forwarded to `func`.
    embedding_size: Forwarded to `func`; also the size of both bias
      variables created here.
    inputs: Sequence of per-timestep inputs.
    dtype: Dtype of the returned zero state.
    scope: Optional variable scope name; defaults to "Embedder".
    keep_prob: Dropout keep probability; dropout is applied to the
      embedding only when this is below 1.
    initializer: Forwarded to `func`.

  Returns:
    A pair `(outputs, state)`: the list of gated annotations, one per
    input, and a zero state of `self._cell`.
  """
  embedder_cell = func(self._cell, embedding_classes, embedding_size,
                       initializer=initializer)

  # Like rnn(..) in rnn.py, but only the embedder is called, not the cell.
  outputs = []
  with vs.variable_scope(scope or "Embedder") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    for time, input_ in enumerate(inputs):
      if time > 0:
        # Later timesteps share the variables created at the first one.
        vs.get_variable_scope().reuse_variables()

      embedding = embedder_cell(input_, scope)
      if keep_prob < 1:
        embedding = tf.nn.dropout(embedding, keep_prob)

      # annotation = C~_t = tanh(E(x_t) + b_c)
      b_c = tf.get_variable("annotation_b", [embedding_size])
      annotation = tanh(tf.nn.bias_add(embedding, b_c))

      # input gate i_t = sigmoid(E(x_t) + b_i)
      b_i = tf.get_variable("input_b", [embedding_size])
      input_gate = sigmoid(tf.nn.bias_add(embedding, b_i))

      # weighted annotation = i_t * C~_t
      outputs.append(input_gate * annotation)

  # The returned state is all zeros; the decoder initializes it later.
  batch_size = array_ops.shape(inputs[0])[0]
  state = self._cell.zero_state(batch_size, dtype)
  return (outputs, state)
def testOptimizerInit(self):
  """Checks that a `KfacOptimizer` can be built for a one-layer model."""
  with ops.Graph().as_default():
    layer_collection = lc.LayerCollection()

    # A single fully connected layer with a 1x1 weight matrix.
    inputs = array_ops.ones((2, 1)) * 2
    weights_val = np.ones((1, 1), dtype=np.float32) * 3.
    weights = variable_scope.get_variable(
        'w', initializer=array_ops.constant(weights_val))
    bias = variable_scope.get_variable(
        'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
    preactivation = math_ops.matmul(inputs, weights) + bias
    layer_collection.register_fully_connected(
        (weights, bias), inputs, preactivation)

    logits = math_ops.tanh(preactivation)
    targets = array_ops.constant([[0.], [1.]])
    # The loss value itself is not used; only graph construction is tested.
    unused_loss = math_ops.reduce_mean(
        nn.softmax_cross_entropy_with_logits(logits=logits, labels=targets))
    layer_collection.register_categorical_predictive_distribution(logits)

    optimizer.KfacOptimizer(
        0.1,
        0.2,
        0.3,
        layer_collection,
        momentum=0.5,
        momentum_type='regular')
def attention(query):
  """Reads from `hidden` with every attention head, given `query`.

  Args:
    query: A 2-D tensor, or a nested structure of 2-D tensors that is
      flattened and concatenated along axis 1.

  Returns:
    A list with one attention read per head, each reshaped to
    `[-1, attn_size]`.
  """
  if nest.is_sequence(query):
    # A tuple query is flattened into one tensor.
    flat_queries = nest.flatten(query)
    for part in flat_queries:
      # Only check the rank when it is statically known.
      rank = part.get_shape().ndims
      if rank:
        assert rank == 2
    query = array_ops.concat(1, flat_queries)

  reads = []
  for head in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % head):
      projected = linear(query, attention_vec_size, True)
      projected = array_ops.reshape(projected,
                                    [-1, 1, 1, attention_vec_size])
      # Scores are v^T * tanh(...).
      scores = math_ops.reduce_sum(
          v[head] * math_ops.tanh(hidden_features[head] + projected),
          [2, 3])
      # NOTE(review): the mask is multiplied into the scores *before* the
      # softmax, so a zero in src_mask yields a score of 0 rather than a
      # zero weight -- confirm this is intended.
      if src_mask is not None:
        scores = scores * src_mask
      weights = nn_ops.softmax(scores)
      # Attention-weighted sum over `hidden`.
      read = math_ops.reduce_sum(
          array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
          [1, 2])
      reads.append(array_ops.reshape(read, [-1, attn_size]))
  return reads
def _logits_cumulative(self, inputs, stop_gradient):
  """Evaluates the logits of the cumulative densities.

  Args:
    inputs: The values at which to evaluate the cumulative densities,
      expected to be a `Tensor` of shape `(channels, 1, batch)`.
    stop_gradient: Boolean. When true, every matrix, bias and factor is
      wrapped in `array_ops.stop_gradient`, so gradients do not reach the
      density model parameters; the path through `inputs` is not wrapped.

  Returns:
    A `Tensor` of the same shape as `inputs`, containing the logits of the
    cumulative densities evaluated at the given inputs.
  """

  def _param(tensor):
    """Returns `tensor`, gradient-stopped if requested."""
    return array_ops.stop_gradient(tensor) if stop_gradient else tensor

  num_layers = len(self.filters) + 1
  logits = inputs
  for layer in range(num_layers):
    logits = math_ops.matmul(_param(self._matrices[layer]), logits)
    logits = logits + _param(self._biases[layer])
    # Only layers that have a factor get the gated tanh term.
    if layer < len(self._factors):
      factor = _param(self._factors[layer])
      logits = logits + factor * math_ops.tanh(logits)
  return logits
def call(self, inputs, state):
  """Runs one step of an LSTM whose gate matrix is factored in two.

  The concatenation of `inputs` and the previous output is first mapped to
  `self._fact_size` features without a bias, then to `4 * self._num_units`
  gate pre-activations with a bias. An optional bias-free projection is
  applied to the output when `self._num_proj` is set.

  Args:
    inputs: Input for this step; its leading dimension is stored in
      `self._batch_size`.
    state: Pair `(c_prev, m_prev)` of previous cell and output states.

  Returns:
    A pair `(m, new_state)`, where `new_state` is an `LSTMStateTuple` of
    the new cell state and the (possibly projected) output.
  """
  (c_prev, m_prev) = state
  # Static batch size when known, dynamic otherwise.
  self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]

  scope = vs.get_variable_scope()
  with vs.variable_scope(scope, initializer=self._initializer):
    x = array_ops.concat([inputs, m_prev], axis=1)

    # The linear layers are built once and cached on the instance.
    with vs.variable_scope("first_gemm"):
      if self._linear1 is None:
        # No bias for the bottleneck.
        self._linear1 = _Linear(x, self._fact_size, False)
      R_fact = self._linear1(x)

    with vs.variable_scope("second_gemm"):
      if self._linear2 is None:
        self._linear2 = _Linear(R_fact, 4*self._num_units, True)
      R = self._linear2(R_fact)

    in_gate, new_input, forget_gate, out_gate = array_ops.split(R, 4, 1)
    retained = math_ops.sigmoid(forget_gate + self._forget_bias) * c_prev
    written = math_ops.sigmoid(in_gate) * math_ops.tanh(new_input)
    c = (retained + written)
    m = math_ops.sigmoid(out_gate) * self._activation(c)

    if self._num_proj is not None:
      with vs.variable_scope("projection"):
        if self._linear3 is None:
          self._linear3 = _Linear(m, self._num_proj, False)
        m = self._linear3(m)

  new_state = rnn_cell_impl.LSTMStateTuple(c, m)
  return m, new_state
def __call__(self, inputs, state, scope=None):
  """Runs one step of the cell, updating a running weighted average.

  Args:
    inputs: Input for this step.
    state: Tuple `(h, n, d, a_max)`: previous output, numerator,
      denominator and the running maximum of the attention term.
    scope: Optional variable scope; defaults to "rwa_cell".

  Returns:
    A pair `(h_new, new_state)`, where `new_state` is an `RWACellTuple` of
    `(h_new, n, d, a_newmax)`.
  """
  with _checked_scope(self, scope or "rwa_cell", reuse=self._reuse):
    h, numerator, denominator, a_max = state

    with vs.variable_scope("u"):
      u = _linear(inputs, self._num_units, True)
    with vs.variable_scope("g"):
      g = _linear([inputs, h], self._num_units, True)
    with vs.variable_scope("a"):
      # No bias: factored out of numerator and denominator it cancels.
      a = _linear([inputs, h], self._num_units, False)

    z = tf.multiply(u, tanh(g))

    # Rescale by the running maximum so the exponentials stay bounded.
    a_newmax = tf.maximum(a_max, a)
    exp_diff = tf.exp(a_max - a_newmax)
    exp_scaled = tf.exp(a - a_newmax)

    # Numerically stable updates of numerator and denominator.
    numerator = tf.multiply(numerator, exp_diff) + tf.multiply(z, exp_scaled)
    denominator = tf.multiply(denominator, exp_diff) + exp_scaled

    h_new = self._activation(tf.div(numerator, denominator))
    new_state = RWACellTuple(h_new, numerator, denominator, a_newmax)
  return h_new, new_state
def attention(query, use_attention=False):
  """Reads from `hidden` with every head, by attention or mean pooling.

  Args:
    query: Query that is projected with `rnn_cell._linear`.
    use_attention: When exactly `False`, the softmax weights are replaced
      by uniform weights `1 / sequence_length` (mean pooling); any other
      value selects the softmax over the attention scores.

  Returns:
    A pair `(attn_weights, ds)`: the per-head weights and the per-head
    reads, each read reshaped to `[-1, attn_size]`.
  """
  attn_weights = []
  reads = []
  for head in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % head):
      projected = rnn_cell._linear(query, attention_vec_size, True)
      projected = array_ops.reshape(projected,
                                    [-1, 1, 1, attention_vec_size])
      # Scores are v^T * tanh(...).
      scores = math_ops.reduce_sum(
          v[head] * math_ops.tanh(hidden_features[head] + projected),
          [2, 3])

      if use_attention is False:
        # Mean pooling: every position is weighted by 1 / sequence_length.
        lengths = tf.tile(sequence_length, tf.stack([attn_length]))
        lengths = array_ops.reshape(lengths, tf.shape(scores))
        uniform = array_ops.ones(tf.shape(scores), dtype=dtype)
        weights = uniform / math_ops.to_float(lengths)
      else:
        weights = nn_ops.softmax(scores)
      attn_weights.append(weights)

      # Attention-weighted sum over `hidden`.
      read = math_ops.reduce_sum(
          array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
          [1, 2])
      reads.append(array_ops.reshape(read, [-1, attn_size]))
  return attn_weights, reads
def downscale(self, inp):
  """Projects width-`2 * self.size` rows of `inp` down to `self.size`.

  Args:
    inp: 3-D tensor; its first two axes are swapped before it is flattened
      into rows of width `2 * self.size`.

  Returns:
    The tanh of the linear projection, reshaped to
    `[self.batch_size, -1, self.size]` and with the first two axes swapped
    back.
  """
  with vs.variable_scope("Downscale"):
    swapped = tf.transpose(inp, perm=[1, 0, 2])
    rows = tf.reshape(swapped, [-1, 2 * self.size])
    projected = rnn_cell.linear(rows, self.size, True, 1.0)
    unflattened = tf.reshape(projected, [self.batch_size, -1, self.size])
    unflattened = tf.transpose(unflattened, perm=[1, 0, 2])
    out = tanh(unflattened)
  return out
def __init__(self, num_units, encoder_output, scope=None):
  """Stores the encoder output and precomputes its attention features.

  Args:
    num_units: Number of units; also the width the encoder output is
      flattened to before the linear map.
    encoder_output: Encoder states, kept as `self.hs`.
    scope: Optional variable scope; defaults to the class name.
  """
  self.hs = encoder_output
  with vs.variable_scope(scope or type(self).__name__):
    with vs.variable_scope("Attn1"):
      # phi(hs) = tanh(linear(hs)), computed on a 2-D view and then
      # restored to the shape of hs.
      flat_hs = tf.reshape(self.hs, [-1, num_units])
      flat_phi = tanh(rnn_cell.linear(flat_hs, num_units, True, 1.0))
      self.phi_hs = tf.reshape(flat_phi, tf.shape(self.hs))
  super(GRUCellAttn, self).__init__(num_units)
def attention(query):
  """Scores every position of `hidden_features` against `query`.

  Args:
    query: Query that is projected to `attention_vec_size` features.

  Returns:
    The raw scores `v^T * tanh(hidden_features + projected query)`, summed
    over axes 2 and 3. No softmax is applied here.
  """
  with vs.variable_scope("Attention"):
    projected = rnn_cell.linear(query, attention_vec_size, True)
    projected = array_ops.reshape(projected, [-1, 1, 1, attention_vec_size])
    energies = math_ops.tanh(hidden_features + projected)
    scores = math_ops.reduce_sum(v * energies, [2, 3])
  return scores
def _GenerateOrderedInputs(self, size, n):
  """Builds `n` inputs chained through a capacity-1 FIFO queue.

  Each new input is `tanh(1.0 + dequeue())`, created under a control
  dependency on enqueueing the previous input.

  Args:
    size: Forwarded to `_GenerateUnorderedInputs`.
    n: Total number of inputs to return.

  Returns:
    The list of inputs, starting with the one generated by
    `_GenerateUnorderedInputs(size, 1)`.
  """
  inputs = self._GenerateUnorderedInputs(size, 1)
  first = inputs[0]
  queue = data_flow_ops.FIFOQueue(
      capacity=1, dtypes=[first.dtype], shapes=[first.get_shape()])

  for _ in xrange(n - 1):
    enqueue_previous = queue.enqueue(inputs[-1])
    with ops.control_dependencies([enqueue_previous]):
      dequeued = queue.dequeue()
      inputs.append(math_ops.tanh(1.0 + dequeued))
  return inputs
def __call__(self, query, previous_alignments):
  """Scores the query against the keys and converts scores to alignments.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.
    previous_alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]`
      (`alignments_size` is memory's `max_time`).

  Returns:
    alignments: Tensor of dtype matching `self.values` and shape
      `[batch_size, alignments_size]` (`alignments_size` is memory's
      `max_time`), as produced by `self._probability_fn`.
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    if self.query_layer:
      processed_query = self.query_layer(query)
    else:
      processed_query = query
    dtype = processed_query.dtype
    # [batch_size, ...] -> [batch_size, 1, ...] for broadcasting over time.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = self._keys
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)

    if self._normalize:
      # Scalar gain used by weight normalization.
      g = variable_scope.get_variable(
          "attention_g", dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added before the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units], dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      inverse_norm = math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      normed_v = g * v * inverse_norm
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])
    else:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])

  alignments = self._probability_fn(score, previous_alignments)
  return alignments
def testGradientThroughNewStep(self):
  """Checks a gradient that flows back to a tensor from an earlier step."""
  with imperative_mode.ImperativeMode(self._target) as mode:
    x = constant_op.constant(np.random.rand(3))
    y = math_ops.tanh(x)

    with mode.new_step():
      z = constant_op.constant(np.random.rand(3))
      w = math_ops.multiply(y, z)
      dx = gradients_impl.gradients(w, x)
      # d(tanh(x) * z)/dx = z * (1 - tanh(x)^2)
      expected = z.value * (1.0 - y.value ** 2)
      self.assertAllClose(dx[0].value, expected)
def testIsSequence(self):
  """Checks `nest.is_sequence` on lists, tuples and non-sequences."""
  ones = array_ops.ones([2, 3])

  sequences = [
      [1, 3, [4, 5]],
      ((7, 8), (5, 6)),
      [],
  ]
  non_sequences = [
      "1234",
      {1, 2},
      ones,
      math_ops.tanh(ones),
      np.ones((4, 5)),
  ]

  for value in sequences:
    self.assertTrue(nest.is_sequence(value))
  for value in non_sequences:
    self.assertFalse(nest.is_sequence(value))
def _lstm_cell(prev_c, prev_h, x):
  """Creates an LSTM cell.

  A single `_bias([4])` is added to all four gate pre-activations, and the
  weights come from `_weight([8, 16])`.

  Args:
    prev_c: Previous cell state.
    prev_h: Previous embedding (hidden state).
    x: Input.

  Returns:
    A pair `(next_c, next_h)`.
  """
  bias = _bias([4])
  w = _weight([8, 16])

  stacked = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w)
  # Split order: input gate, forget gate, output gate, cell candidate.
  in_pre, forget_pre, out_pre, cand_pre = array_ops.split(stacked, 4, axis=1)

  in_gate = math_ops.sigmoid(nn.bias_add(in_pre, bias))
  forget_gate = math_ops.sigmoid(nn.bias_add(forget_pre, bias))
  out_gate = math_ops.sigmoid(nn.bias_add(out_pre, bias))
  candidate = math_ops.tanh(nn.bias_add(cand_pre, bias))

  next_c = forget_gate * prev_c + in_gate * candidate
  next_h = out_gate * math_ops.tanh(next_c)
  return next_c, next_h
def __call__(self, inputs, state, scope=None):
  """Runs one step of the gated recurrent unit (GRU).

  Args:
    inputs: Input for this step.
    state: Previous hidden state.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    A pair `(new_h, new_h)`: the output and the new state are the same
    tensor.
  """
  with vs.variable_scope(scope or type(self).__name__):  # "GRUCell"
    with vs.variable_scope("Gates"):
      # Both gates come from one linear map. The bias starts at 1.0 so the
      # cell initially neither resets nor updates.
      gates = linear([inputs, state], 2 * self._num_units, True, 1.0)
      reset_pre, update_pre = array_ops.split(1, 2, gates)
      r, u = sigmoid(reset_pre), sigmoid(update_pre)
    with vs.variable_scope("Candidate"):
      candidate = tanh(linear([inputs, r * state], self._num_units, True))
    new_h = u * state + (1 - u) * candidate
  return new_h, new_h
def decoder_type_1(decoder_hidden, attn_size, initializer=None):
  """Scores each timestep of `decoder_hidden` with a 1x1 convolution.

  Args:
    decoder_hidden: 4-D tensor fed to `conv2d`; its channel dimension must
      match `attn_size`.
    attn_size: Number of input channels of the `[1, 1, attn_size, 1]`
      kernel.
    initializer: Optional initializer used for the scope and the kernel.

  Returns:
    The tanh of the convolved features summed over axes 2 and 3; the
    original comment gives the shape as (?, timesteps).
  """
  with vs.variable_scope("decoder_type_1", initializer=initializer):
    kernel = vs.get_variable(
        "AttnDecW_%d" % 0, [1, 1, attn_size, 1], initializer=initializer)
    hidden_features = nn_ops.conv2d(
        decoder_hidden, kernel, [1, 1, 1, 1], "SAME")
    scores = math_ops.reduce_sum(math_ops.tanh(hidden_features), [2, 3])
    return scores
def __call__(self, query):
  """Scores the query against `self.keys`.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.

  Returns:
    score: Tensor of dtype matching `self.values` and shape
      `[batch_size, max_time]` (`max_time` is memory's `max_time`).
  """
  with ops.name_scope(None, "BahndahauAttentionCall", [query]):
    if self.query_layer:
      processed_query = self.query_layer(query)
    else:
      processed_query = query
    dtype = processed_query.dtype
    # [batch_size, ...] -> [batch_size, 1, ...] for broadcasting over time.
    processed_query = array_ops.expand_dims(processed_query, 1)
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)

    if not self._normalize:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(self.keys + processed_query), [2])
    else:
      # Scalar gain used by weight normalization.
      g = variable_scope.get_variable(
          "attention_g", dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added before the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units], dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # Scalar bias added to the attention scores.
      r = variable_scope.get_variable(
          "attention_r", dtype=dtype,
          initializer=self._attention_r_initializer)
      # normed_v = g * v / ||v||
      inverse_norm = math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      normed_v = g * v * inverse_norm
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(self.keys + processed_query + b),
          [2]) + r

  return score
def __call__(self, query, tiling_factor=1):
  """Scores the query against the (optionally tiled) keys.

  Args:
    query: Tensor of dtype matching `self.values` and shape
      `[batch_size, query_depth]`.
    tiling_factor: An integer factor for which to tile the batch dimension.
      Used with BeamSearchDecoder.

  Returns:
    score: Tensor of dtype matching `self.values` and shape
      `[batch_size, max_time]` (`max_time` is memory's `max_time`).
  """
  with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
    if self.query_layer:
      processed_query = self.query_layer(query)
    else:
      processed_query = query
    dtype = processed_query.dtype
    # [batch_size, ...] -> [batch_size, 1, ...] for broadcasting over time.
    processed_query = array_ops.expand_dims(processed_query, 1)
    keys = _maybe_tile_batch(self.keys, tiling_factor)
    v = variable_scope.get_variable(
        "attention_v", [self._num_units], dtype=dtype)

    if not self._normalize:
      score = math_ops.reduce_sum(
          v * math_ops.tanh(keys + processed_query), [2])
    else:
      # Scalar gain used by weight normalization.
      g = variable_scope.get_variable(
          "attention_g", dtype=dtype,
          initializer=math.sqrt((1. / self._num_units)))
      # Bias added before the nonlinearity.
      b = variable_scope.get_variable(
          "attention_b", [self._num_units], dtype=dtype,
          initializer=init_ops.zeros_initializer())
      # normed_v = g * v / ||v||
      inverse_norm = math_ops.rsqrt(
          math_ops.reduce_sum(math_ops.square(v)))
      normed_v = g * v * inverse_norm
      score = math_ops.reduce_sum(
          normed_v * math_ops.tanh(keys + processed_query + b), [2])

  return score
def __call__(self, inputs, state, episodic_gate, scope=None):
  """Runs one GRU step whose update gate is supplied by the caller.

  Args:
    inputs: Input for this step.
    state: Previous hidden state.
    episodic_gate: Gate that blends the candidate (weight `episodic_gate`)
      with the previous state (weight `1 - episodic_gate`).
    scope: Forwarded to `rnn_cell.linear` for the reset gate only; the
      enclosing variable scope is always "MGRUCell".

  Returns:
    A pair `(new_h, new_h)`: the output and the new state are the same
    tensor.
  """
  with vs.variable_scope("MGRUCell"):
    with vs.variable_scope("Gates"):
      # Only a reset gate is computed here. Its bias starts at 1.0 so the
      # cell initially does not reset.
      reset_pre = rnn_cell.linear([inputs, state], self._num_units, True,
                                  1.0, scope=scope)
      r = sigmoid(reset_pre)
    with vs.variable_scope("Candidate"):
      candidate = tanh(
          rnn_cell.linear([inputs, r * state], self._num_units, True))

    updated = tf.mul(episodic_gate, candidate)
    retained = tf.mul((1 - episodic_gate), state)
    new_h = updated + retained
  return new_h, new_h
def testIsSequence(self):
  """Checks that `nest.is_sequence` accepts only tuples and dicts here."""
  ones = array_ops.ones([2, 3])

  sequences = [
      ((7, 8), (5, 6)),
      {"foo": 1, "bar": 2},
  ]
  # In this variant lists (including the empty list) are not sequences.
  non_sequences = [
      "1234",
      [1, 3, [4, 5]],
      [],
      {1, 2},
      ones,
      math_ops.tanh(ones),
      np.ones((4, 5)),
      sparse_tensor.SparseTensorValue([[0]], [0], [1]),
  ]

  for value in sequences:
    self.assertTrue(nest.is_sequence(value))
  for value in non_sequences:
    self.assertFalse(nest.is_sequence(value))
def attention(query):
  """Computes the attentional hidden state and attention weights for `query`.

  The weights are a softmax over the dot products of `attention_states`
  with the tiled query. The resulting context vector and the query are each
  passed through a bias-free linear map and combined with a tanh; when an
  environment representation `env` is available, a third linear term built
  from it is added.

  Args:
    query: Decoder hidden state; it is tiled `attn_length` times and
      reshaped to `[-1, attn_length, attention_vec_size]`.

  Returns:
    A pair `(h_attn, attn_weight)`: the attentional hidden state and the
    attention weights over the encoder positions.
  """
  with vs.variable_scope("Attention"):
    # Replicate the query so it can be multiplied element-wise with every
    # encoder state.
    dec_hid = array_ops.tile(query, [1, attn_length])
    dec_hid = array_ops.reshape(dec_hid,
                                [-1, attn_length, attention_vec_size])
    # Attention weights: softmax of h_in^T * decoder_hidden.
    attn_weight = nn_ops.softmax(
        math_ops.reduce_sum(attention_states*dec_hid, [2]))
    # Context vector: attention-weighted sum over `hidden`.
    cc = math_ops.reduce_sum(
        array_ops.reshape(attn_weight, [-1, attn_length, 1, 1])*hidden,
        [1, 2])

    with vs.variable_scope("AttnW1"):
      term1 = rnn_cell.linear(query, attn_size, False)
    with vs.variable_scope("AttnW2"):
      term2 = rnn_cell.linear(cc, attn_size, False)

    # `env` is a 2D Tensor of shape [batch_size, env_size]. Its truth value
    # must not be taken (`if env:`): that raises for a tf.Tensor and for
    # multi-element arrays, so presence is tested against None instead.
    if env is not None:
      with vs.variable_scope("Environment"):
        term3 = rnn_cell.linear(math_ops.to_float(env), attn_size, False)
      h_attn = math_ops.tanh(term1 + term2 + term3)
    else:
      h_attn = math_ops.tanh(term1 + term2)
  return h_attn, attn_weight
def __call__(self, inputs, state, scope=None):
  """Runs the parent GRU step, then attends over the encoder states.

  Args:
    inputs: Input for this step.
    state: Previous state, forwarded to the parent cell.
    scope: Optional variable scope; defaults to the class name.

  Returns:
    A pair `(out, out)`, where `out` is a ReLU of a linear map of the
    attention context and the GRU output. The attention weights are also
    stored in `self.attn_map`.
  """
  gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
  with vs.variable_scope(scope or type(self).__name__):
    with vs.variable_scope("Attn2"):
      gamma_h = tanh(rnn_cell.linear(gru_out, self._num_units, True, 1.0))

    # Softmax over axis 0, written out by hand: subtract the maximum before
    # exponentiating and add 1e-6 to the normalizer.
    scores = tf.reduce_sum(self.phi_hs * gamma_h, reduction_indices=2,
                           keep_dims=True)
    max_score = tf.reduce_max(scores, reduction_indices=0, keep_dims=True)
    weights = tf.exp(scores - max_score)
    normalizer = 1e-6 + tf.reduce_sum(weights, reduction_indices=0,
                                      keep_dims=True)
    weights = weights / normalizer
    context = tf.reduce_sum(self.hs * weights, reduction_indices=0)

    with vs.variable_scope("AttnConcat"):
      out = tf.nn.relu(
          rnn_cell.linear([context, gru_out], self._num_units, True, 1.0))

    self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
    return (out, out)
def lstm(x, prev_c, prev_h, w_lstm, forget_bias):
  """Computes one LSTM step from a single fused weight matrix.

  Args:
    x: tensors of size [num_children, hidden_size].
    prev_c: tensors of size [num_children, hidden_size].
    prev_h: same as prev_c.
    w_lstm: Weight matrix applied to the concatenation of `x` and `prev_h`;
      the product is split into four equal parts along axis 1.
    forget_bias: Value added to the forget gate pre-activation before the
      sigmoid.

  Returns:
    next_c: New cell state.
    next_h: New hidden state.
  """
  stacked = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w_lstm)
  # Split order: input gate, forget gate, output gate, candidate.
  in_pre, forget_pre, out_pre, cand_pre = array_ops.split(stacked, 4, axis=1)

  in_gate = math_ops.sigmoid(in_pre)
  forget_gate = math_ops.sigmoid(forget_pre + forget_bias)
  out_gate = math_ops.sigmoid(out_pre)
  candidate = math_ops.tanh(cand_pre)

  next_c = in_gate * candidate + forget_gate * prev_c
  next_h = out_gate * math_ops.tanh(next_c)
  return next_c, next_h
def attention(query):
  """Reads from `hidden` with every attention head, given `query`.

  Args:
    query: Query that is projected to `attention_vec_size` features.

  Returns:
    A list with one attention read per head, each reshaped to
    `[-1, attn_size]`.
  """
  reads = []
  for head in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % head):
      projected = linear(query, attention_vec_size, True)
      projected = array_ops.reshape(projected,
                                    [-1, 1, 1, attention_vec_size])
      # The attention mask is a softmax of v^T * tanh(...).
      scores = math_ops.reduce_sum(
          v[head] * math_ops.tanh(hidden_features[head] + projected),
          [2, 3])
      weights = nn_ops.softmax(scores)
      # Attention-weighted sum over `hidden`.
      read = math_ops.reduce_sum(
          array_ops.reshape(weights, [-1, attn_length, 1, 1]) * hidden,
          [1, 2])
      reads.append(array_ops.reshape(read, [-1, attn_size]))
  return reads
def call(self, inputs, mask=None, training=None, initial_state=None):
  """Runs the layer on an `(objects, reward)` pair and updates its state.

  The method tiles the stored state matrices over the batch, derives a
  corrected object query and an action strategy from the gates, updates
  `self.internal_reward`, and finally writes the new values back into
  `expected_reward`, `S_state`, `O_state`, `A_state`, `object_keys` and
  `action_keys`.

  Args:
    inputs: Two-element sequence `(object_input, reward_input)`.
    mask: Unused in this method.
    training: Unused in this method.
    initial_state: Unused in this method.

  Returns:
    `new_strategy`, the output of `_transformer_shift_actions`.

  Raises:
    ValueError: If `inputs` does not have exactly two elements.
  """
  # The original guard was `not (inputs.shape is 2)`: an identity test
  # against an int literal, which is always true, so every call raised.
  if len(inputs) != 2:
    raise ValueError(
        'The dimension of the inputs vector should be 2: `(input_shape, reward)`'
    )
  object_input = inputs[0]  # (batch_dim, timesteps n_digits)
  reward_input = inputs[1]  # (1,)

  # NOTE(review): these pass tensor *slices* to dimension_value; they look
  # like they were meant to read object_input.shape[...] -- confirm.
  n_digits = tensor_shape.dimension_value(object_input[-1])
  batch_dim = tensor_shape.dimension_value(object_input[0])
  self.units = tensor_shape.dimension_value(object_input[1])

  # Unpacking state matrices: add a leading axis, then tile it over the
  # batch.
  object_queries = tf.tile(
      tf.reshape(self.O_state, (1, ) + self.O_state.shape),
      (batch_dim, ) + self.O_state.shape)  # (batch_dim, timesteps n_digits)
  object_keys = tf.tile(
      tf.reshape(self.object_keys, (1, ) + self.object_keys.shape),
      (batch_dim, ) + self.object_keys.shape)  # (batch_dim, Tk, n_digits)
  # (self.units, n_actions)
  action_queries = tf.tile(
      tf.reshape(self.A_state, (1, ) + self.A_state.shape),
      (batch_dim, ) + self.A_state.shape)
  action_keys = tf.tile(
      tf.reshape(self.action_keys, (1, ) + self.action_keys.shape),
      (batch_dim, ) + self.action_keys.shape)  # (Tk, n_actions)

  # Context generator
  # (batch_dim, timesteps n_digits), (batch_dim, Tk, n_digits)
  #   -> (batch_dim, timesteps n_digits)
  p_object = self.p_gate([object_queries, object_keys])
  shifted_object_sequence = self._transformer_shift_objects(
      object_input, object_queries)  # (batch_dim, timesteps n_digits)
  # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_digits)
  #   -> (batch_dim, timesteps n_digits)
  object_query_corrected = math_ops.multiply(p_object,
                                             shifted_object_sequence)
  # (batch_dim, timesteps n_digits), (batch_dim, Tk, n_digits),
  # (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
  action_by_object = self.an_gate(
      [object_query_corrected, object_keys, action_keys])

  # Sympathetic circuit
  # (batch_dim, timesteps)
  steps = tf.tile(tf.constant(list(range(self.units)), dtype=float),
                  tf.constant([1, batch_dim]))
  # (batch_dim, timesteps), (batch_dim, timesteps) -> (batch_dim, timesteps)
  # NOTE(review): old_reward aliases self.internal_reward; if that is a
  # variable, it will reflect the assign below -- confirm intent.
  old_reward = self.internal_reward
  self.internal_reward.assign(
      self.internal_reward +
      self.w_boost * reward_input * math_ops.exp(steps) -
      self.w_step * math_ops.exp(steps) -
      self.w_amount * math_ops.exp(reward_input))
  # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_actions)
  #   -> (batch_dim, timesteps, n_digits, n_actions)
  corrected_strategy = self.ao_gate(action_queries, action_keys)
  reward_matrix = K.softmax(
      tf.einsum('ijk,ijn->ijkn', object_queries, corrected_strategy) /
      math_ops.sqrt(0.5 * self.n_actions * n_digits))
  # (batch_dim, timesteps n_digits), (batch_dim, timesteps n_actions)
  #   -> (batch_dim, timesteps n_digits, n_actions)
  potential_reward = K.softmax(
      math_ops.tanh(
          tf.einsum('ijk,ijn->ijkn', object_query_corrected,
                    corrected_strategy)))
  # (batch_dim, timesteps n_digits, n_actions) * (batch_dim, timesteps)
  #   -> (batch_dim, timesteps n_digits, n_actions)
  delta_stimuli = potential_reward * self.internal_reward
  # ws(n_digits, n_actions) * (batch_dim, timesteps n_digits, n_actions)
  #   -> (batch_dim, timesteps n_digits, n_actions)
  new_state = self.w_stimuli * delta_stimuli
  # (batch_dim, timesteps n_digits, n_actions),
  # (timesteps, n_digits, n_actions) -> (batch_dim, self.units)
  reward_intersection = tf.einsum('ijkn,ijkn->ij', reward_matrix, new_state)
  # w(1,) * (batch_dim, timesteps) + (batch_dim, timesteps)
  #   -> (batch_dim, timesteps)
  reward_forecast = self.w_rs * reward_intersection + self.internal_reward
  # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_digits),
  # (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
  rewarded_actions = self.SR_gate(
      [action_by_object, new_state, reward_matrix])
  # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_actions),
  # (batch_dim, Tk, n_digits) -> (batch_dim, timesteps n_digits)
  object_forecast = self.f_gate(
      [rewarded_actions, action_keys, object_keys])
  # (batch_dim, timesteps n_digits) -> (batch_dim, timesteps n_digits)
  object_forecast_seq = self._transformer_shift_objects(
      object_forecast, shifted_object_sequence)
  # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_digits),
  # (batch_dim, Tk, n_actions) -> (batch_dim, timesteps n_actions)
  simulated_action = self.d_gate(
      [object_forecast_seq, object_keys, action_keys])

  # Repeater
  # (batch_dim, timesteps n_actions),
  # ((batch_dim, self.units) - (batch_dim, self.units)) ->
  # w(1,), (batch_dim, self.units) -> (batch_dim, timesteps n_actions)
  reward_ratio_action = self.W_R * tf.einsum(
      'ijk,ij->ijk', action_by_object,
      K.softmax(K.abs(self.internal_reward - self.expected_reward)))
  # (batch_dim, timesteps n_actions), (batch_dim, timesteps n_actions)
  #   -> (batch_dim, timesteps n_actions)
  selected_action = reward_ratio_action + \
      K.softmax(K.abs(self.internal_reward - self.expected_reward)) * \
      K.softmax(K.dot(self.W_S, simulated_action)+self.b_S)
  # (batch_dim, timesteps n_actions), (batch_dim, Tk, n_actions)
  #   -> (batch_dim, timesteps n_actions)
  new_strategy = self._transformer_shift_actions(
      selected_action, corrected_strategy)  # (batch_dim, timesteps n_actions)

  # Packing and updating
  new_obj = object_query_corrected[:, -1, ...]
  new_act = new_strategy[:, -1, ...]

  O_c1 = K.dot(object_queries[self.conv_units:], tf.transpose(new_obj))
  O_c2 = K.dot(object_queries[:self.units], tf.transpose(new_obj))

  E_c1 = (1 / self.units) * tf.einsum(
      'ik->', (object_queries[self.conv_units:] - O_c1)**2)  # (timesteps,)
  # NOTE(review): E_c2 slices with conv_units although O_c2 was built from
  # object_queries[:self.units] -- confirm this is not a copy-paste slip.
  E_c2 = (1 / self.units) * tf.einsum(
      'ik->', (object_queries[self.conv_units:] - O_c2)**2)  # (timesteps,)

  P_short = tf.math.softmax(K.dot(E_c1, self.W_Pshort) + self.b_Pshort)
  P_long = tf.math.softmax(K.dot(E_c2, self.W_Plong) + self.b_Plong)

  if (P_short < 0.51) & (P_long < 0.51):
    object_keys, action_keys = self._min_ABdict_replace_op(
        object_keys[-1, ...], action_keys[-1, ...], new_obj, new_act,
        reward_forecast[-1, ...] - self.internal_reward)
  else:
    object_keys, action_keys = self._mean_ABdict_mix_op(
        object_keys[-1, ...], action_keys[-1, ...], new_obj, new_act,
        reward_forecast[-1, ...] - self.internal_reward)
    # NOTE(review): confirm that this second mix (t-1 entry) belongs to the
    # else branch rather than running unconditionally.
    object_keys, action_keys = self._mean_ABdict_mix_op(
        object_keys[-2, ...], action_keys[-2, ...], new_obj, new_act,
        old_reward - self.internal_reward)

  # Write the new values back into the layer state.
  self.expected_reward.assign(reward_forecast[-1, ...])
  self.S_state.assign(
      tf.cumsum(self.S_state, axis=0) + tf.reduce_sum(new_state, axis=0))
  self.O_state.assign(object_query_corrected)
  self.A_state.assign(corrected_strategy)
  self.object_keys.assign(object_keys)
  self.action_keys.assign(action_keys)

  return new_strategy
def attention(decoder_state, coverage=None):
    """Compute the attention distribution and context vector for one decoder step.

    Args:
      decoder_state: state of the decoder.
      coverage: Optional. Previous timestep's coverage vector, shape
        (batch_size, attn_len, 1, 1).

    Returns:
      context_vector: weighted sum of encoder_states.
      attn_dist: attention distribution.
      coverage: new coverage vector, shape (batch_size, attn_len, 1, 1). It is
        returned unchanged when coverage is disabled.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source; the
    # "query" and "query_z" scopes are assumed to be siblings -- confirm.

    def to_4d(features):
        # (batch_size, n) -> (batch_size, 1, 1, n) so it broadcasts over positions.
        return tf.expand_dims(tf.expand_dims(features, 1), 1)

    def masked_attention(e, padding_mask):
        """Softmax over e with masked-out positions pushed to float32.min first.

        Positions where padding_mask is 0 get the most negative float32 score,
        so no re-normalisation is performed after the softmax.
        """
        kept = e * padding_mask
        blocked = (1.0 - padding_mask) * tf.float32.min
        return nn_ops.softmax(kept + blocked)  # shape (batch_size, attn_length)

    with variable_scope.variable_scope("Attention"):
        # W_s s_t + b_attn in the paper, reshaped to
        # (batch_size, 1, 1, attention_vec_size).
        decoder_features = to_4d(
            linear(decoder_state, attention_vec_size, True))

        if use_query:
            with variable_scope.variable_scope("query"):
                # W_s_q s_t + b, reshaped to (batch_size, 1, 1, q_attention_vec_size).
                decoder_q_features = to_4d(
                    linear(decoder_state, query_attn_size, True, name='query'))
                # v^t tanh(W_q q_i + W_s_q s_t + b)
                q_scores = math_ops.reduce_sum(
                    v_q * math_ops.tanh(query_features + decoder_q_features),
                    [2, 3])
                q_dist = masked_attention(q_scores, query_padding_mask)
                q_weights = array_ops.reshape(q_dist, [batch_size, -1, 1, 1])
                # q*: attention-weighted sum of the query states.
                query_vector = array_ops.reshape(
                    math_ops.reduce_sum(q_weights * query_states, [1, 2]),
                    [-1, query_attn_size])
            with variable_scope.variable_scope("query_z"):
                # qz: q* projected to the attention vector size.
                query_z = to_4d(
                    linear(query_vector, attention_vec_size, False,
                           name='query_z'))

        # A coverage term is only available from the second step on.
        has_prev_coverage = use_coverage and coverage is not None
        if has_prev_coverage:
            # Multiply coverage vector by w_c; result has shape
            # (batch_size, attn_length, 1, attention_vec_size).
            coverage_features = nn_ops.conv2d(
                coverage, w_c, [1, 1, 1, 1], "SAME")

        # W_h h_i + W_s s_t (+ qz) (+ w_c c_i^t), accumulated left to right.
        summed = encoder_features + decoder_features
        if use_query:
            summed = summed + query_z
        if has_prev_coverage:
            summed = summed + coverage_features

        # shape (batch_size, attn_length)
        e = math_ops.reduce_sum(v * math_ops.tanh(summed), [2, 3])
        attn_dist = masked_attention(e, enc_padding_mask)

        if has_prev_coverage:
            # Accumulate this step's attention into the coverage vector.
            coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        elif use_coverage:
            # First step: initialise coverage from the attention distribution.
            coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)

        # Context vector: attention-weighted sum of encoder_states,
        # reshaped to (batch_size, attn_size).
        attn_weights = array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        context_vector = array_ops.reshape(
            math_ops.reduce_sum(attn_weights * encoder_states, [1, 2]),
            [-1, attn_size])

    return context_vector, attn_dist, coverage
def loop_fn(i):
    """Return tanh(a * gather(x, i) + gather(y, i)) for index `i`.

    `a`, `x` and `y` are taken from the enclosing scope.
    """
    x_i = array_ops.gather(x, i)
    scaled = a * x_i
    y_i = array_ops.gather(y, i)
    return math_ops.tanh(scaled + y_i)
def _attn_add_fun(v, keys, query):
    """Additive attention score: sum over axis 2 of v * tanh(keys + query)."""
    activated = math_ops.tanh(keys + query)
    weighted = v * activated
    return math_ops.reduce_sum(weighted, [2])
#arrays to save v and h context v_con_array = [] h_con_array = [] whole_con_array = [] cosine_penalty_array = [] #array to save y y_array = [] for i in range(side_len, whole_len - side_len): current_output = outputs_bidirection[i] #multiply output by w_a and add b_a, get inner_sum of size [batch_size, hidden_size] inner_sum = tf.add(tf.matmul(current_output, w_a_1), b_a_1) con_i = tanh(inner_sum) #calculate the vertical (feature) and horizontal (distal) context vectors #shape [batch_size, seq_len] con_v = tf.nn.softmax(tf.add(tf.matmul(con_i, w_a_v), b_a_v)) #shape [batch_size, num_feat] con_h = tf.add(tf.matmul(con_i, w_a_h), b_a_h) v_con_array.append(tf.expand_dims(con_v, 1)) h_con_array.append(tf.expand_dims(con_h, 1)) #tensor product each batch to generate the whole context con_vh tiled_con_v = tf.tile(tf.expand_dims(con_v, 2), tf.stack([1, 1, num_feat])) tiled_con_h = tf.tile(tf.expand_dims(con_h, 1), tf.stack([1, seq_len, 1])) #shape [batch_size, seq_len, num_feat] con_vh = tf.multiply(tiled_con_v, tiled_con_h)
def call(self, inputs, state, scope=None):
    """Run one step of Associative LSTM.

    Args:
      inputs: input Tensor, 2D, batch x cell_size.
      state: a tuple `(c_prev, m_prev)` of state Tensors.
      scope: accepted for interface compatibility; not used by this method.

    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           cell_size otherwise.
      - An `LSTMStateTuple(c, m)` holding the new state after reading
        `inputs` when the previous state was `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    # Output width: fall back to the cell size when no projection was set.
    num_proj = self._cell_size if self._num_proj is None else self._num_proj

    (c_prev, m_prev) = state

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

    # bs x (input_size + num_proj)
    cell_inputs = tf.concat([inputs, m_prev], 1, name = 'concat2')

    # bs x ((2.5 + _input_keys + _output_keys) * cell_size)
    lstm_matrix = tf.matmul(cell_inputs, self._kernel)
    lstm_matrix = tf.nn.bias_add(lstm_matrix, self._bias)

    # Column boundaries of the packed projection: gates | u | input keys | output keys.
    gates_end = int(1.5 * self._cell_size)
    u_end = int(2.5 * self._cell_size)
    input_keys_end = int((2.5 + self._input_keys) * self._cell_size)

    # i = input_gate, f = forget_gate, o = output_gate
    # bs x (cell_size // 2)
    i, f, o = tf.split(value = lstm_matrix[:, :gates_end],
                       axis = 1, num_or_size_splits = 3)

    # u
    # bs x cell_size
    u = tf.split(lstm_matrix[:, gates_end:u_end],
                 axis = 1, num_or_size_splits = 1)[0]

    # ri
    # _input_keys x bs x cell_size
    input_keys = tf.split(lstm_matrix[:, u_end:input_keys_end],
                          axis = 1, num_or_size_splits = 1)[0]
    input_keys = tf.reshape(input_keys, [self._input_keys, -1, self._cell_size])

    # ro
    # _output_keys x bs x cell_size
    output_keys = tf.split(lstm_matrix[:, input_keys_end:],
                           axis = 1, num_or_size_splits = 1)[0]
    output_keys = tf.reshape(output_keys, [self._output_keys, -1, self._cell_size])

    # applying the sigmoid activation function
    # bs x (cell_size // 2)
    i = sigmoid(i)
    f = sigmoid(f)
    o = sigmoid(o)

    # appending gates: each half-width gate is duplicated to full width
    # bs x cell_size
    i = tf.concat([i, i], 1, name = 'concat3')
    f = tf.concat([f, f], 1, name = 'concat4')
    o = tf.concat([o, o], 1, name = 'concat5')

    # applying tanh activation function
    # bs x cell_size
    u = tanh(u)
    # _input_keys x bs x cell_size
    input_keys = tanh(input_keys)
    # _output_keys x bs x cell_size
    output_keys = tanh(output_keys)

    # applying permutations
    # _input_keys x num_copies x batch_size x cell_size
    input_keys = self._permute(input_keys, scope = 'input_keys')
    # _output_keys x num_copies x batch_size x cell_size
    output_keys = self._permute(output_keys, scope = 'output_keys')

    # memory copies update
    # num_copies x bs x cell_size
    memory_update = self._complex_multiplication(
        input_keys, tf.expand_dims(tf.expand_dims(u * i, 0), 0))
    memory_update = tf.reduce_mean(memory_update, 0)

    # memory copies forget
    # num_copies x bs x cell_size
    memory_forget = tf.expand_dims(f, 0) * c_prev

    # updating memory
    # num_copies x bs x cell_size
    c = memory_forget + memory_update

    # reading refers to the reading gate
    # _output_keys x bs x cell_size
    reading_gate = tanh(tf.reduce_mean(
        self._complex_multiplication(output_keys, tf.expand_dims(c, 0)), 1))

    # bs x num_proj
    m = tf.expand_dims(o, 0) * reading_gate
    m = tf.transpose(m, [1,0,2])
    # Use the resolved width: self._num_proj may be None, which tf.reshape
    # cannot accept as a dimension.
    m = tf.reshape(m, [-1, num_proj])

    new_state = rnn_cell.LSTMStateTuple(c, m)
    return m, new_state
def attention(decoder_state, coverage=None):
    """Compute the attention distribution and context vector for one decoder step.

    Args:
      decoder_state: state of the decoder.
      coverage: Optional. Previous timestep's coverage vector, shape
        (batch_size, attn_len, 1, 1).

    Returns:
      context_vector: weighted sum of encoder_states.
      attn_dist: attention distribution.
      coverage: new coverage vector, shape (batch_size, attn_len, 1, 1). It is
        returned unchanged when coverage is disabled.
    """

    def masked_attention(e):
        """Softmax of e, zeroed at padded positions and re-normalised per row."""
        masked = nn_ops.softmax(e) * enc_padding_mask  # (batch_size, attn_length)
        row_totals = tf.reduce_sum(masked, axis=1)  # shape (batch_size)
        return masked / tf.reshape(row_totals, [-1, 1])

    with variable_scope.variable_scope("Attention"):
        # W_s s_t + b_attn in the paper; shape (batch_size, attention_vec_size).
        projected = linear(decoder_state, attention_vec_size, True)
        # Reshape to (batch_size, 1, 1, attention_vec_size) for broadcasting.
        decoder_features = tf.expand_dims(tf.expand_dims(projected, 1), 1)

        # A coverage term is only available from the second step on.
        has_prev_coverage = use_coverage and coverage is not None
        if has_prev_coverage:
            # Multiply coverage vector by w_c; result has shape
            # (batch_size, attn_length, 1, attention_vec_size).
            coverage_features = nn_ops.conv2d(
                coverage, w_c, [1, 1, 1, 1], "SAME")
            # W_h h_i + W_s s_t + w_c c_i^t + b_attn
            summed = encoder_features + decoder_features + coverage_features
        else:
            # W_h h_i + W_s s_t + b_attn
            summed = encoder_features + decoder_features

        # v^T tanh(...), shape (batch_size, attn_length)
        e = math_ops.reduce_sum(v * math_ops.tanh(summed), [2, 3])
        attn_dist = masked_attention(e)

        if has_prev_coverage:
            # Accumulate this step's attention into the coverage vector.
            coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        elif use_coverage:
            # First step: initialise coverage; it becomes
            # batch_size * attn_length * 1 * 1.
            coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)

        # Context vector: attention-weighted sum of encoder_states,
        # reshaped to (batch_size, attn_size).
        attn_weights = array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        context_vector = array_ops.reshape(
            math_ops.reduce_sum(attn_weights * encoder_states, [1, 2]),
            [-1, attn_size])

    return context_vector, attn_dist, coverage
def attention(self, decoder_state, encoder_states, attention_vec_size,
              enc_padding_mask, hps):
    """Calculate the context vector and attention distribution from the decoder state.

    Args:
      decoder_state: state of the decoder, a 2D Tensor (it is fed to
        `tf.nn.xw_plus_b`).
      encoder_states: Tensor that is expanded at axis 2 before the 1x1
        convolution; assumed (batch_size, attn_len, hps.hidden_dim) -- TODO
        confirm against caller.
      attention_vec_size: size of the attention feature vectors.
      enc_padding_mask: mask multiplied element-wise into the softmax output;
        assumed (batch_size, attn_len) with 0 at padded positions -- TODO
        confirm.
      hps: hyper-parameter object; only `hps.hidden_dim` is read.

    Returns:
      context_vector: weighted sum of encoder_states, reshaped to
        (-1, hps.hidden_dim).
      attn_dist: attention distribution.
    """
    with tf.variable_scope('attention'):
        # NOTE(review): xw_plus_b expects weights of shape (in, out) and a bias
        # of shape (out,). With w_dec [attention_vec_size, hidden_dim] and
        # v_dec [attention_vec_size], this looks consistent only when
        # attention_vec_size == hps.hidden_dim -- confirm.
        w_dec = tf.get_variable('w_dec', [attention_vec_size, hps.hidden_dim],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
        v_dec = tf.get_variable('v_dec', [attention_vec_size],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)

        # Pass the decoder state through a linear layer
        # (this is W_s s_t + b_attn in the paper).
        decoder_features = tf.nn.xw_plus_b(decoder_state, w_dec, v_dec)
        # Add two singleton axes so it broadcasts over attention positions.
        decoder_features = tf.expand_dims(
            tf.expand_dims(decoder_features, 1), 1)

        def masked_attention(e):
            """Take softmax of e then apply enc_padding_mask and re-normalize"""
            attn_dist = nn_ops.softmax(e)  # shape (batch_size, attn_length)
            attn_dist *= enc_padding_mask  # apply mask
            masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
            return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize

        # now is shape (batch_size, attn_len, 1, attn_size)
        encoder_states = tf.expand_dims(encoder_states, axis=2)

        # W_h h_i via a 1x1 convolution;
        # shape (batch_size, attn_length, 1, attention_vec_size)
        W_h = tf.get_variable("W_h", [1, 1, hps.hidden_dim, attention_vec_size])
        encoder_features = nn_ops.conv2d(
            encoder_states, W_h, [1, 1, 1, 1], "SAME")

        # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
        v = tf.get_variable("v_h", [attention_vec_size])
        e = math_ops.reduce_sum(
            v * math_ops.tanh(encoder_features + decoder_features), [2, 3])

        # Calculate attention distribution
        attn_dist = masked_attention(e)

        # Calculate the context vector from attn_dist and encoder_states.
        # expand_dims gives (batch_size, attn_len, 1, 1) without relying on a
        # statically configured batch size, so any runtime batch size works.
        attn_weights = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)
        context_vector = math_ops.reduce_sum(
            attn_weights * encoder_states, [1, 2])  # shape (batch_size, attn_size)
        context_vector = array_ops.reshape(context_vector,
                                           [-1, hps.hidden_dim])

    return context_vector, attn_dist
def dynamic_distraction_m2_decoder(decoder_inputs,
                                   initial_state,
                                   distract_initial_state,
                                   attention_states,
                                   attention_states_query,
                                   cell1, cell2,
                                   distraction_cell,
                                   output_size=None,
                                   num_heads=1,
                                   loop_function=None,
                                   dtype=None,
                                   scope=None,
                                   initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1412.7449 (see below for
    details). It is recommended for complex sequence-to-sequence tasks.

    This variant runs two cells per step (`cell2`, then `cell1`), attends over
    both `attention_states` and `attention_states_query`, and subtracts a
    history term (accumulated attention weights times `b_a`) from the
    attention features before scoring.

    Args:
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      distract_initial_state: not used by this function.
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      attention_states_query: 3D Tensor attended over by the query attention;
        its dimensions 1 and 2 are read as attention length and size.
      cell1: cell run after attention on [cell2 output, new context]; its
        state is the returned state.
      cell2: cell run first on a projection of [cell1 state, input].
      distraction_cell: not used by this function.
      output_size: Size of the output vectors; if None, we use
        cell1.output_size.
      num_heads: Number of attention heads that read from attention_states.
      loop_function: If not None, this function will be applied to i-th output
        in order to generate i+1-th input, and decoder_inputs will be ignored,
        except for the first element ("GO" symbol). This can be used for
        decoding, but also for training to emulate
        http://arxiv.org/abs/1506.03099.
        Signature -- loop_function(prev, i) = next
          * prev is a 2D Tensor of shape [batch_size x output_size],
          * i is an integer, the step number (when advanced control is needed),
          * next is a 2D Tensor of shape [batch_size x input_size].
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default:
        "dynamic_distraction_m2_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state and attention
        states -- useful when we wish to resume decoding from a previously
        stored decoder state and attention states.

    Returns:
      A tuple of the form (outputs, state, ctx_vec), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x output_size]. These represent the generated
          outputs. Output i is computed from input i (which is either the i-th
          element of decoder_inputs or loop_function(output {i-1}, i)).
        state: The state of `cell1` at the final time-step.
        ctx_vec: A list with the context vector (`new_ctx`) of every step.

    Raises:
      ValueError: when num_heads is not positive, there are no inputs, shapes
        of attention_states are not set, or input size cannot be inferred
        from the input.
    """
    # `not` also rejects an empty list, which would otherwise fail later on
    # decoder_inputs[0] with an IndexError.
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell1.output_size

    with variable_scope.variable_scope(
            scope or "dynamic_distraction_m2_decoder", dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length_state = attention_states.get_shape()[1].value
        attn_length_query = attention_states_query.get_shape()[1].value

        # Projection of the initial state to cell1's output size.
        dim_1 = initial_state.get_shape()[1].value
        dim_2 = cell1.output_size
        project_initial_state_W = variable_scope.get_variable(
            "Initial_State_W", [dim_1, dim_2])
        project_initial_state_B = variable_scope.get_variable(
            "Initial_State_Bias", [dim_2])

        print ("Preksha " + scope.name)  # debug output kept from the original

        # NOTE(review): `shape` is not defined in this block (presumably
        # array_ops.shape), and the `range(attn_length_state)` loop below needs
        # a static Python int -- confirm this fallback is ever reached.
        if attn_length_state is None:
            attn_length_state = shape(attention_states)[1]
        if attn_length_query is None:
            attn_length_query = shape(attention_states_query)[1]

        attn_size_state = attention_states.get_shape()[2].value
        attn_size_query = attention_states_query.get_shape()[2].value

        # Maps one accumulated attention weight to an attn_size_state vector.
        b_a = variable_scope.get_variable("b_a", [1, attn_size_state])

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden_states = array_ops.reshape(
            attention_states, [-1, attn_length_state, 1, attn_size_state])
        hidden_states_query = array_ops.reshape(
            attention_states_query, [-1, attn_length_query, 1, attn_size_query])

        hidden_features_states = []
        hidden_features_query = []

        v_state = []
        attention_vec_size_state = attn_size_state  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_State_%d" % a,
                [1, 1, attn_size_state, attention_vec_size_state])
            hidden_features_states.append(
                nn_ops.conv2d(hidden_states, k, [1, 1, 1, 1], "SAME"))
            v_state.append(
                variable_scope.get_variable("AttnV_State_%d" % a,
                                            [attention_vec_size_state]))

        v_query = []
        attention_vec_size_query = attn_size_query  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable(
                "AttnW_Query_%d" %a,
                [1, 1, attn_size_query, attention_vec_size_query])
            hidden_features_query.append(
                nn_ops.conv2d(hidden_states_query, k, [1, 1, 1, 1], "SAME"))
            v_query.append(
                variable_scope.get_variable("AttnV_Query_%d" % a,
                                            [attention_vec_size_query]))

        state_1 = (math_ops.matmul(initial_state, project_initial_state_W)
                   + project_initial_state_B)
        state_2 = state_1

        # One running sum of attention weights per attention position.
        prev_states = []
        for i in range(attn_length_state):
            prev_states.append(array_ops.zeros([batch_size]))

        def attention(query, prev_states, b_a):
            """Put attention masks on hidden using hidden_features and query.

            Returns the list of attention reads (one per head) and the updated
            per-position running sums of attention weights.
            """
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            # NOTE(review): prev_states is packed into a single tensor inside
            # this loop, so a second head would index a tensor instead of a
            # list -- looks correct only for num_heads == 1; confirm.
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size_state, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_state])
                    # Attention mask is a softmax of v^T * tanh(...).
                    temp = hidden_features_states[a] + y
                    new_states = array_ops.squeeze(temp, [2])
                    new_states_list = array_ops.unpack(new_states, axis=1)
                    # Subtract the history term (running attention sum * b_a)
                    # from the features of every attention position.
                    distract_states_list = []
                    for i, _ in enumerate(new_states_list):
                        temp = array_ops.reshape(prev_states[i], [-1, 1])
                        t1 = math_ops.matmul(temp, b_a)
                        print ("b_a size and prev_states size", temp.get_shape(), prev_states[i].get_shape(), b_a.get_shape(), t1.get_shape())
                        distract_states_list.append(new_states_list[i] - t1)
                    distract_states = array_ops.pack(distract_states_list, axis=1)
                    print (len(distract_states_list), distract_states.get_shape())
                    s = math_ops.reduce_sum(
                        v_state[a] * math_ops.tanh(distract_states), [2])
                    print(s.get_shape())
                    # Named separately so the head index `a` is not shadowed.
                    attn_weights = nn_ops.softmax(s)
                    prev_states = array_ops.pack(prev_states, axis=1)
                    prev_states = prev_states + attn_weights
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(attn_weights,
                                          [-1, attn_length_state, 1, 1])
                        * hidden_states,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size_state]))
            return ds, array_ops.unpack(prev_states, axis=1)

        def attention_query(query):
            """Put attention masks on the query states; return the first head's read."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_Query_%d" % a):
                    y = linear(query, attention_vec_size_query, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_query])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v_query[a] * math_ops.tanh(hidden_features_query[a] + y),
                        [2, 3])
                    # Named separately so the head index `a` is not shadowed.
                    attn_weights = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(attn_weights,
                                          [-1, attn_length_query, 1, 1])
                        * hidden_states_query,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size_query]))
            return ds[0]

        outputs = []
        ctx_vec = []
        prev = None
        batch_attn_size_state = array_ops.pack([batch_size, attn_size_state])
        batch_attn_size_query = array_ops.pack([batch_size, attn_size_query])
        attns_state = [array_ops.zeros(batch_attn_size_state, dtype=dtype)
                       for _ in xrange(num_heads)]
        attns_query = [array_ops.zeros(batch_attn_size_query, dtype=dtype)
                       for _ in xrange(num_heads)]
        for a in attns_state:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size_state])
        for a in attns_query:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size_query])

        # Running sum of the context vectors produced so far.
        acc_ctx = array_ops.zeros([batch_size, attn_size_state])

        if initial_state_attention:
            attns_query = attention_query(initial_state)
            list_of_queries = [initial_state, attns_query]
            # b_a is a required argument of attention(); the call previously
            # omitted it and raised a TypeError.
            attns_state, prev_states = attention(list_of_queries, prev_states,
                                                 b_a)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s"
                                 % inp.name)

            with variable_scope.variable_scope("Cell2"):
                input_2 = linear([state_1] + [inp], input_size, True)
                output_2, state_2 = cell2(input_2, state_2)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                # The attention variables already exist from the pre-loop call.
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns_query = attention_query(output_2)
                    # Query with output_2 as in the other branch; the name
                    # `state` previously used here is not defined in this
                    # function.
                    list_of_queries = [output_2, attns_query]
                    attns_state, prev_states = attention(
                        list_of_queries, prev_states, b_a)
            else:
                attns_query = attention_query(output_2)
                list_of_queries = [output_2, attns_query]
                attns_state, prev_states = attention(
                    list_of_queries, prev_states, b_a)

            with variable_scope.variable_scope("AttnOutputProjection"):
                W = variable_scope.get_variable("W", [1,attn_size_state])
                U = variable_scope.get_variable("U", [1,attn_size_state])
                # New context = tanh(W * attention read - U * accumulated context).
                new_ctx = (math_ops.mul(W, attns_state[0])
                           - math_ops.mul(U, acc_ctx))
                new_ctx = math_ops.tanh(new_ctx)
                acc_ctx = acc_ctx + new_ctx

            with variable_scope.variable_scope("Cell1"):
                input_1 = linear([output_2] + [new_ctx], input_size, True)
                output_1, state_1 = cell1(input_1, state_1)

            output = math_ops.tanh(
                linear([inp] + [output_1] + [new_ctx], output_size, True))
            if loop_function is not None:
                prev = output
            outputs.append(output)
            ctx_vec.append(new_ctx)

    return outputs, state_1, ctx_vec
def RNN(x, weights, biases, batch_s=100):
    """Bidirectional GRU with attention pooling over both its outputs and its inputs.

    Builds two attention-weighted summaries -- one over the bidirectional GRU
    outputs and one over the raw inputs -- and combines them with the last GRU
    output into the prediction.

    Args:
      x: input Tensor; it is transposed with [1, 0, 2] and flattened to
        [-1, n_input], so presumably (batch, n_steps, n_input) -- TODO confirm.
      weights: not read by this function (the output projection comes from the
        module-level `weights_concat`).
      biases: dict; `biases['out']` is added to the final projection.
      batch_s: static batch size used for the per-example reshape/split. The
        actual batch fed at run time must match it. Defaults to 100, the
        value that used to be hard-coded.

    Returns:
      predict: the output projection of the concatenated hidden features.
      outputs: the list of per-step bidirectional GRU outputs.
    """
    # Turn x into a list of n_steps tensors of shape batch x n_input.
    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, n_input])
    x = tf.split(0, n_steps, x)  # n_steps(list) * batch * n_input

    gru_fw_cell = rnn_cell.GRUCell(n_hidden)
    gru_fw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_fw_cell, output_keep_prob=0.7)
    gru_bw_cell = rnn_cell.GRUCell(n_hidden)
    gru_bw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_bw_cell, output_keep_prob=0.7)

    outputs, _, _ = rnn.bidirectional_rnn(gru_fw_cell, gru_bw_cell, x,dtype=tf.float32)

    outputs_all = tf.concat(0,outputs)  # (N*batch) * 2*n_hidden
    # dropout
    outputs_all = tf.nn.dropout(outputs_all, keep_prob=0.5)

    input_all = tf.concat(0,x)  # (n_steps*batch) * n_input
    # dropout
    input_all = tf.nn.dropout(input_all, keep_prob=0.5)

    # ---- attention over the GRU outputs ----
    M = tanh(tf.matmul(outputs_all,W_h))  # (N*batch) * 2*hidden
    # dropout
    M = tf.nn.dropout(M, keep_prob=0.5)

    a = tf.matmul(M,w)
    a = tf.reshape(a, [n_steps,-1])  # N*batch
    a = tf.transpose(a, [1,0])  # batch*N
    a = tf.nn.softmax(a)
    a = tf.reshape(a, [batch_s,1,n_steps])  # batch*1*N

    outputs_all = tf.reshape(outputs_all, [n_steps,-1, 2*n_hidden])  # N*batch*d
    outputs_all = tf.transpose(outputs_all, [1,0,2])  # batch*N*d

    # One (1 x N) weight row and one (N x d) output matrix per example.
    a = tf.split(0, batch_s, a)
    outputs_all = tf.split(0, batch_s, outputs_all)

    r = []
    for a_i, outputs_i in zip(a, outputs_all):
        a_temp = a_i[0:1,:,:]
        o_temp = outputs_i[0:1,:,:]
        att = tf.reshape(a_temp,[1, n_steps])
        out = tf.reshape(o_temp,[n_steps,2*n_hidden])
        # dropout
        att = tf.nn.dropout(att, keep_prob=0.5)
        out = tf.nn.dropout(out, keep_prob=0.5)
        r.append(tf.matmul(att,out))
    r = tf.concat(0,r)  # batch*d

    # ---- attention over the raw inputs ----
    M_input = tanh(tf.matmul(input_all,W_h_input))
    # dropout
    M_input = tf.nn.dropout(M_input, keep_prob=0.5)

    a_input = tf.matmul(M_input,w_input)
    a_input = tf.reshape(a_input, [n_steps,-1])  # N*batch
    a_input = tf.transpose(a_input, [1,0])  # batch*N
    a_input = tf.nn.softmax(a_input)
    a_input = tf.reshape(a_input, [batch_s,1,n_steps])  # batch*1*N

    input_all = tf.reshape(input_all, [n_steps,-1, n_input])  # N*batch*n_input
    input_all = tf.transpose(input_all, [1,0,2])  # batch*N*n_input

    a_input = tf.split(0, batch_s, a_input)
    input_all = tf.split(0, batch_s, input_all)

    r_input = []
    for a_input_i, input_i in zip(a_input, input_all):
        a_input_temp = a_input_i[0:1,:,:]
        o_input_temp = input_i[0:1,:,:]
        att_input = tf.reshape(a_input_temp,[1, n_steps])
        input_input = tf.reshape(o_input_temp,[n_steps,n_input])
        # dropout
        att_input = tf.nn.dropout(att_input, keep_prob=0.5)
        input_input = tf.nn.dropout(input_input, keep_prob=0.5)
        r_input.append(tf.matmul(att_input,input_input))
    r_input = tf.concat(0,r_input)  # batch*n_input

    # ---- combine both summaries with the last GRU output ----
    _h_temp_1 = tanh(W_p*r + W_x*outputs[-1])
    _h_temp_2 = tanh(W_p_input*r_input)
    _h_concat = tf.concat(1,[_h_temp_1,_h_temp_2])
    # dropout
    _h_concat = tf.nn.dropout(_h_concat, keep_prob=0.25)

    predict = tf.matmul(_h_concat, weights_concat['out_concat']) + biases['out']

    return predict,outputs
def __write_memory(self, his_mem, enc_states, global_trace, step):
    """Write encoder states into the history memory, one state at a time.

    For every state, an attention distribution over the memory slots (plus
    the null slot) picks where to write; the chosen slot is then overwritten
    (softly in 'train' mode, hard in 'decode' mode) with that state.

    Args:
      his_mem: history memory Tensor; dimensions 1 and 2 are read as the
        number of slots and the slot size.
      enc_states: iterable of encoder states to write.
      global_trace: Tensor concatenated with each flattened state to form
        the addressing query.
      step: index into `self.write_masks`.

    Returns:
      The updated history memory.

    Raises:
      ValueError: if `self.mode` is neither 'train' nor 'decode' when a
        state has to be written.
    """
    with variable_scope.variable_scope("write_memory"):
        mem_slots = his_mem.get_shape()[1].value
        mem_size = his_mem.get_shape()[2].value

        for i, state in enumerate(enc_states):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Concatenate history memory with the null slot
            # [batch_size, his_mem_slots+1, his_mem_size]
            tmp_mem = array_ops.concat(
                [his_mem, tf.identity(self.null_mem)], axis=1)

            hidden = array_ops.reshape(tmp_mem, [-1, mem_slots+1, 1, mem_size])
            k = variable_scope.get_variable("AttnW", [1, 1, mem_size, mem_size])
            mem_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable("AttnV", [mem_size])

            mstate = state
            y = linear([flatten_query(mstate), global_trace], mem_size, True,
                       scope = "query_trans")
            y = array_ops.reshape(y, [-1, 1, 1, mem_size])
            # Addressing scores, [batch_size, mem_slots+1]
            s = math_ops.reduce_sum(v * math_ops.tanh(mem_features + y), [2, 3])

            # tf.sign(x) returns 0 if x == 0, -1 if x < 0 and 1 if x > 0, so
            # the mask below is 1 for an all-zero slot and 0 otherwise.
            # [batch_size, his_mem_slots+1]
            random_mask = 1.0 - tf.sign(
                math_ops.reduce_sum(tf.abs(tmp_mem), axis=2))

            # The random_mask shows if a slot is empty, 1 empty, 0 not empty.
            # The null mask is 1 if there is at least 1 empty slot.
            null_mask = random_mask[:, 0:self.hps.his_mem_slots]
            null_mask = math_ops.reduce_sum(null_mask, axis=1)  # [batch_size]
            null_mask = tf.sign(null_mask)

            # random_bias restricted to empty slots and scaled by its row
            # maximum; [batch_size, his_mem_slots+1]
            bias = self.random_bias * random_mask
            max_bias = tf.reduce_max(bias, axis=1)  # [batch_size]
            max_bias = tf.expand_dims(max_bias, axis=1)  # [batch_size, 1]
            bias = tf.divide(bias, max_bias + 1e-12)

            # [batch_size, 1]
            max_s = tf.expand_dims(math_ops.reduce_max(s, axis=1), axis=1)

            # Apply the bias only for rows that still have an empty slot.
            thred1 = tf.ones([self.b_size, self.hps.his_mem_slots+1],
                             dtype=tf.float32)
            thred2 = tf.zeros([self.b_size, self.hps.his_mem_slots+1],
                              dtype=tf.float32)
            thred = tf.where(tf.equal(null_mask, 1), thred1, thred2)

            bias1 = bias * tf.abs(max_s) * thred
            # Original author's open question: why add the bias?
            s1 = s + bias1

            a = nn_ops.softmax(s1)  # [batch_size, his_mem_slots+1]
            max_val = tf.reduce_max(a, axis=1)  # [batch_size]
            max_val = tf.expand_dims(max_val, axis=1)  # [batch_size, 1]

            # 1 at the arg-max slot, close to (train) or exactly (decode) 0
            # elsewhere.
            if self.mode == 'train':
                float_mask0 = tf.tanh(self.gama * (a - max_val)) + 1.0
            elif self.mode == 'decode':
                float_mask0 = tf.sign(a - max_val) + 1.0
            else:
                # Without this branch float_mask0 would be unbound below.
                raise ValueError(
                    "Unsupported mode for memory writing: %r" % (self.mode,))

            mask = self.write_masks[step][i]
            float_mask = tf.multiply(mask, float_mask0)
            # [batch_size, his_mem_slots+1, 1]
            float_mask = tf.expand_dims(float_mask, axis=2)

            # Repeat the state once per slot: [batch_size, 2*hidden_size]
            # becomes [batch_size, mem_slots*2*hidden_size], then
            # [batch_size, mem_slots, mem_size].
            w_states = tf.tile(mstate, [1, mem_slots])
            w_states = array_ops.reshape(w_states, [-1, mem_slots, mem_size])

            # Drop the null slot: [batch_size, his_mem_slots, 1]
            final_mask = float_mask[:, 0:self.hps.his_mem_slots, :]
            his_mem = (1.0 - final_mask) * his_mem + final_mask * w_states

    return his_mem
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look
    up information in the additional tensor attention_states, and it does
    this by focusing on a few entries from the tensor. This model has proven
    to yield especially good results in a number of sequence-to-sequence
    tasks. This implementation is based on http://arxiv.org/abs/1409.0473
    (see below for details).

    Args:
        encoder_mask: the mask of encoder inputs [batch_size x attn_length].
        decoder_inputs: A list of 2D Tensors [batch_size x input_size].
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        beam_size: the beam size of beam search.
        output_size: Size of the output vectors; if None, we use
            cell.output_size.
        num_layers: number of equal pieces the state is split into along
            axis 1; the last piece is used as the attention query.
        loop_function: When decoding, this function will be applied to i-th
            output in order to generate i+1-th input. The generation is by
            beam search.
        dtype: The dtype to use for the RNN initial state (default:
            tf.float32).
        scope: VariableScope for the created subgraph; default:
            "attention_decoder".
        initial_state_attention: If False (default), initial attentions are
            zero. If True, initialize the attentions from the initial state.
            NOTE(review): the "AttnU" weights are first requested inside
            attention(); with False that first call happens after
            reuse_variables() -- presumably callers always pass True; confirm.

    Returns:
        A tuple of the form (outputs, state, symbols), where:
            outputs: A list of the same length as decoder_inputs of 2D
                Tensors of shape [batch_size x output_size]. These represent
                the generated outputs. Output i is computed from input i
                (which is either the i-th element of decoder_inputs or
                loop_function(output {i-1}, i)).
            state: The state of each decoder cell the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].
            symbols: When training, it is []; when decoding, it is the best
                translation generated by beam search.

    Raises:
        ValueError: when shapes of attention_states are not set, or input
            size cannot be inferred from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Both attn_length (dim 1) and attn_size (dim 2) are read statically
    # below, so the guard must cover dims 1 and 2 (the slice used to be
    # [1:2], which silently skipped dim 2).
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # compute the initial hidden state of decoder
        initial_state = math_ops.tanh(
            linear(initial_state, state_size, False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(
                    0, 0.001, seed=SEED))
            # A 1x1 convolution projects every encoder state once, up front.
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(query):
                    # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:
                        # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)
                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query, attention_vec_size, False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # the additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    # Subtract the row-wise max before exponentiating.
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    # softmax with mask
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    a = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        output = None
        state = initial_state
        out_state = array_ops.split(1, num_layers, state)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                # Re-order everything kept per hypothesis by `index`.
                out_state = array_ops.gather(out_state, index)  # update prev state
                state = array_ops.gather(state, index)  # update prev state
                attns = [array_ops.gather(attn, index)
                         for attn in attns]  # update prev attens
                for j, output in enumerate(outputs):
                    outputs[j] = array_ops.gather(
                        output, index)  # update prev outputs
                for j, symbol in enumerate(symbols):
                    symbols[j] = array_ops.gather(
                        symbol, index)  # update prev symbols
                symbols.append(prev_symbol)

            # Run the attention mechanism.
            if i > 0 or (i == 0 and initial_state_attention):
                attns = attention(out_state, scope="attention")

            # Run the RNN.
            # concatenate next input and the context vector
            cinp = array_ops.concat(1, [inp, attns[0]])
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # handle the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output, index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
            symbols.append(prev_symbol)

            # output the best result of beam search (row 0 of every tensor)
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0), 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(
                    array_ops.gather(output, 0), 0)  # update prev outputs

    return outputs, state, symbols
def call(self, inputs, state):
    """
    Advance the cell by one time step.

    The packed `state` is split into the previous cell state and previous
    hidden state, the forget/input/output gates and the candidate are
    computed (the gates also read the cell state through a matrix
    "peephole" term), and the hidden state is passed through the `W_h`
    projection before being returned.

    Matrix products are used for the recurrent terms, element-wise products
    for the input terms and for gating.

    :param inputs: The input at the current time step. The last dimension of
                   it should be 1.
    :param state: The packed state from the previous time step; the first
                  `self.params[0]` columns are the cell state and the next
                  `self.params[1]` columns the hidden state.
    :return: A tuple containing (output, new_state), where new_state is the
             cell state and projected hidden state concatenated on axis 1.
    """
    cell_width = self.params[0]
    hidden_width = self.params[1]
    prev_cell = array_ops.slice(state, [0, 0], [-1, cell_width])
    prev_hidden = array_ops.slice(state, [0, cell_width], [-1, hidden_width])

    weights = self.W
    biases = self.b

    # Recurrent (hidden -> gate) weights.
    rec_f, rec_i, rec_c, rec_o = (
        weights[key] for key in ('W_fh', 'W_ih', 'W_ch', 'W_oh'))
    # Input -> gate weights.
    in_f, in_i, in_c, in_o = (
        weights[key] for key in ('W_fi', 'W_ii', 'W_ci', 'W_oi'))
    # Output projection.
    proj = weights['W_h']
    # Cell -> gate ("peephole") weights.
    peep_f, peep_i, peep_o = (
        weights[key] for key in ('W_fc', 'W_ic', 'W_oc'))
    bias_f, bias_i, bias_c, bias_o = (
        biases[key] for key in ('b_f', 'b_i', 'b_c', 'b_o'))

    forget_gate = math_ops.sigmoid(
        tf.matmul(prev_hidden, rec_f) + tf.multiply(inputs, in_f) + bias_f
        + tf.matmul(prev_cell, peep_f))
    input_gate = math_ops.sigmoid(
        tf.matmul(prev_hidden, rec_i) + tf.multiply(inputs, in_i) + bias_i
        + tf.matmul(prev_cell, peep_i))
    candidate = math_ops.tanh(
        tf.matmul(prev_hidden, rec_c) + tf.multiply(inputs, in_c) + bias_c)

    next_cell = forget_gate * prev_cell + input_gate * candidate

    # The output gate peeks at the *updated* cell state.
    output_gate = math_ops.sigmoid(
        tf.matmul(prev_hidden, rec_o) + tf.multiply(inputs, in_o) + bias_o
        + tf.matmul(next_cell, peep_o))

    next_hidden = tf.matmul(output_gate * math_ops.tanh(next_cell), proj)

    packed_state = array_ops.concat([next_cell, next_hidden], 1)
    return next_hidden, packed_state
# In[6]: # Global variables batches = 1 stime = 500 num_units = 20 num_inputs = 1 rnn_init_state = np.zeros([1, num_units], dtype="float32") rnn_inputs = np.zeros((batches, stime, num_inputs), dtype="float32") rnn_inputs[0, :, 0] = np.sin(np.linspace(0,18*np.pi, stime)) + np.sin(np.linspace(0,5.3*np.pi, stime)) + np.sin(np.linspace(0,2.1*np.pi, stime)) plt.plot(rnn_inputs[0,:,:]) plt.show() activation = lambda x: math_ops.tanh(x) # Implementing a static graph without tensorflow API: # In[7]: tf.reset_default_graph() static_graph = tf.Graph() with static_graph.as_default() as g: rng = np.random.RandomState(random_seed) # Init the ESN cell cell = EchoStateRNNCell(num_units=num_units,
def call(self, inputs, state):
    """Run one step of G-LSTM.

    Each group gets its own slice of the inputs and of the previous output,
    pushes their concatenation through that group's linear map, and
    contributes a quarter of the result to each of the four gates. The
    per-group pieces are concatenated and a single bias per gate is added.

    Args:
        inputs: input Tensor, 2D, [batch x num_units].
        state: this must be a tuple of state Tensors, both `2-D`, with
            column sizes `c_state` and `m_state`.

    Returns:
        A tuple containing:

        - A `2-D, [batch x output_dim]`, Tensor representing the output of
          the G-LSTM after reading `inputs` when previous state was `state`.
          Here output_dim is num_proj if num_proj was set, num_units
          otherwise.
        - LSTMStateTuple representing the new state of G-LSTM cell after
          reading `inputs` when the previous state was `state`.
    """
    c_prev, m_prev = state

    # Prefer static dimensions; fall back to the dynamic shape tensor.
    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
    input_size = inputs.shape[-1].value or array_ops.shape(inputs)[-1]
    dtype = inputs.dtype

    def gate_bias(var_name):
        # One zero-initialised bias per gate, shared across groups.
        return vs.get_variable(
            name=var_name,
            shape=[self._num_units],
            dtype=dtype,
            initializer=init_ops.constant_initializer(0.0, dtype=dtype))

    current_scope = vs.get_variable_scope()
    with vs.variable_scope(current_scope, initializer=self._initializer):
        # Per-gate lists of per-group pieces, in the order i, j, f, o.
        gate_pieces = ([], [], [], [])

        for group_id in range(self._number_of_groups):
            with vs.variable_scope("group%d" % group_id):
                # Slice widths follow the actual input/output sizes rather
                # than assuming the input width equals num_units.
                input_slice = self._get_input_for_group(
                    inputs, group_id,
                    int(input_size / self._number_of_groups))
                output_slice = self._get_input_for_group(
                    m_prev, group_id,
                    int(self._output_size / self._number_of_groups))
                group_input = array_ops.concat(
                    [input_slice, output_slice], axis=1)

                # Build this group's linear map lazily, on first use.
                if self._linear1[group_id] is None:
                    self._linear1[group_id] = _Linear(
                        group_input, 4 * self._group_shape[1], False)
                group_out = self._linear1[group_id](group_input)

                for pieces, piece in zip(
                        gate_pieces, array_ops.split(group_out, 4, 1)):
                    pieces.append(piece)

        gate_biases = [
            gate_bias(var_name)
            for var_name in ("bias_i", "bias_j", "bias_f", "bias_o")
        ]
        i, j, f, o = [
            nn_ops.bias_add(array_ops.concat(pieces, axis=1), bias)
            for pieces, bias in zip(gate_pieces, gate_biases)
        ]

        keep = math_ops.sigmoid(f + self._forget_bias) * c_prev
        write = math_ops.sigmoid(i) * math_ops.tanh(j)
        c = keep + write
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                if self._linear2 is None:
                    self._linear2 = _Linear(m, self._num_proj, False)
                m = self._linear2(m)

    return m, rnn_cell_impl.LSTMStateTuple(c, m)
def Cell(v):
    """Return the row sums of tanh(v + v^T), keeping the reduced axis.

    When `v` is a column vector [n, 1], broadcasting `v + v^T` yields a big
    [n, n] square matrix.
    """
    v_transposed = array_ops.transpose(v, [1, 0])
    pairwise = math_ops.tanh(v + v_transposed)
    return math_ops.reduce_sum(pairwise, 1, keep_dims=True)
def attention(query):
    """
    Put attention masks on hidden using hidden_features and query.

    For every head, attention logits over the encoder positions are computed
    according to `attn_type` ("linear", "bilinear", or anything else for the
    two-layer MLP), turned into a softmax mask, and used to take a weighted
    sum of `hidden`.

    :param query: Vector to compute attention with; a nested structure of
                  2-D tensors is flattened and concatenated on axis 1.
    :return: A tuple (ds, attn_masks, attn_logits) of per-head lists: the
             attention-weighted vectors, the softmax masks, and the raw
             logits.
    """
    # Results of attention reads will be stored here.
    ds = []
    # Will store masks over encoder context
    attn_masks = []
    # Store attention logits
    attn_logits = []

    # If the query is a tuple, flatten it.
    if nest.is_sequence(query):
        query_list = nest.flatten(query)
        # Check that ndims == 2 if specified.
        for q in query_list:
            ndims = q.get_shape().ndims
            if ndims:
                assert ndims == 2
        query = array_ops.concat(1, query_list)

    for head in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % head):
            if attn_type == "linear":
                y = linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(
                    v[head] * math_ops.tanh(hidden_features[head] + y),
                    [2, 3])
            elif attn_type == "bilinear":
                # Use separate names here: overwriting `query` made every
                # head after the first receive an already tiled and
                # projected rank-3 query.
                tiled_query = tf.tile(
                    tf.expand_dims(query, 1), [1, attn_length, 1])
                projected_query = batch_linear(
                    tiled_query, attn_size, bias=True)
                hid = tf.squeeze(hidden, [2])
                s = tf.reduce_sum(tf.mul(projected_query, hid), [2])
            else:
                # Two layer MLP
                y = linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                layer1 = math_ops.tanh(hidden_features[head] + y)
                # NOTE(review): the kernel takes attn_size input channels;
                # whether layer1 has that many channels depends on how
                # hidden_features was built outside this function -- confirm.
                k2 = variable_scope.get_variable(
                    "AttnW_%d" % head,
                    [1, 1, attn_size, attention_vec_size])
                layer2 = nn_ops.conv2d(layer1, k2, [1, 1, 1, 1], "SAME")
                s = math_ops.reduce_sum(
                    v[head] * math_ops.tanh(layer2), [2, 3])

            # Keep the mask in its own name so the head index stays intact.
            attn_mask = nn_ops.softmax(s)
            attn_masks.append(attn_mask)
            attn_logits.append(s)

            # Now calculate the attention-weighted vector d. Hidden is
            # encoder hidden states
            d = math_ops.reduce_sum(
                array_ops.reshape(attn_mask, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))

    return ds, attn_masks, attn_logits
def __call__(self, inputs, state, scope=None):
    """Run one step of G-LSTM.

    Every group reads its own slice of `inputs` and of the previous output,
    applies a bias-free linear map to their concatenation, and contributes
    one quarter of the result to each of the four gates. The per-group
    pieces are concatenated and a single per-gate bias is added.

    Args:
        inputs: input Tensor, 2D, batch x num_units.
        state: this must be a tuple of state Tensors, both `2-D`, with
            column sizes `c_state` and `m_state`.
        scope: optional scope; used for the enclosing variable scope
            (default "glstm_cell") and forwarded to `linear`.

    Returns:
        A tuple containing:

        - A `2-D, [batch x output_dim]`, Tensor representing the output of
          the G-LSTM after reading `inputs` when previous state was `state`.
          Here output_dim is num_proj if num_proj was set, num_units
          otherwise.
        - An LSTMStateTuple holding the new cell state and output.

    Raises:
        ValueError: If input size cannot be inferred from inputs via static
            shape inference.
    """
    c_prev, m_prev = state

    if inputs.get_shape().with_rank(2)[1].value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")
    dtype = inputs.dtype

    def gate_bias(var_name):
        # It is more efficient to have one bias per gate than one per gate
        # and per group.
        return vs.get_variable(
            name=var_name,
            shape=[self._num_units],
            dtype=dtype,
            initializer=init_ops.constant_initializer(0.0, dtype=dtype))

    with vs.variable_scope(scope or "glstm_cell",
                           initializer=self._initializer):
        # Per-gate lists of per-group pieces, in the order i, j, f, o.
        gate_pieces = ([], [], [], [])

        for group_id in xrange(self._number_of_groups):
            with vs.variable_scope("group%d" % group_id):
                group_input = array_ops.concat([
                    self._get_input_for_group(
                        inputs, group_id, self._group_shape[0]),
                    self._get_input_for_group(
                        m_prev, group_id, self._group_shape[0]),
                ], axis=1)
                # No bias here: the per-gate biases are added further down.
                group_out = linear(group_input, 4 * self._group_shape[1],
                                   bias=False, scope=scope)
                for pieces, piece in zip(
                        gate_pieces, array_ops.split(group_out, 4, 1)):
                    pieces.append(piece)

        gate_biases = [
            gate_bias(var_name)
            for var_name in ("biases_i", "biases_j", "biases_f", "biases_o")
        ]
        i, j, f, o = [
            nn_ops.bias_add(array_ops.concat(pieces, axis=1), bias)
            for pieces, bias in zip(gate_pieces, gate_biases)
        ]

        keep = math_ops.sigmoid(f + self._forget_bias) * c_prev
        write = math_ops.sigmoid(i) * math_ops.tanh(j)
        c = keep + write
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                m = linear(m, self._num_proj, bias=False, scope=scope)

    return m, LSTMStateTuple(c, m)
def RNN(x, weights, biases):
    """Bidirectional GRU with two attention read-outs and a linear output.

    The input is unrolled into `n_steps` time slices and run through a
    dropout-wrapped forward and backward GRU. Two attention distributions
    over the time steps are computed from the concatenated outputs, each is
    used to pool the outputs per example, and the pooled vectors are
    combined with the last step's output before the final projection.

    Args:
        x: input tensor; transposed with [1, 0, 2] and reshaped to
            [-1, n_input] before being split into n_steps pieces.
        weights: dict; only weights['out'] is read here.
        biases: dict; only biases['out'] is read here.

    Returns:
        A tuple (predict, outputs): the projected prediction and the list of
        per-step bidirectional RNN outputs.

    NOTE(review): the batch size is hard-coded to 100 and every dropout uses
    a fixed keep_prob, so they are also active at inference -- confirm this
    is intended.
    """
    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, n_input])
    x = tf.split(0, n_steps, x)

    gru_fw_cell = rnn_cell.GRUCell(n_hidden)
    gru_fw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_fw_cell,
                                                output_keep_prob=0.7)
    gru_bw_cell = rnn_cell.GRUCell(n_hidden)
    gru_bw_cell = tf.nn.rnn_cell.DropoutWrapper(gru_bw_cell,
                                                output_keep_prob=0.7)
    outputs, _, _ = rnn.bidirectional_rnn(gru_fw_cell, gru_bw_cell, x,
                                          dtype=tf.float32)

    batch_s = 100

    outputs_all = tf.concat(0, outputs)  # (N*batch) * 2*n_hidden
    # dropout
    outputs_all = tf.nn.dropout(outputs_all, keep_prob=0.5)

    # M and M_2 are not consumed below; the attention scores are computed
    # directly from outputs_all.
    M = tanh(tf.matmul(outputs_all, W_h))      # (N*batch) * 2*hidden
    M_2 = tanh(tf.matmul(outputs_all, W_h_2))  # (N*batch) * 2*hidden
    # dropout
    M = tf.nn.dropout(M, keep_prob=0.5)
    # Fixed copy-paste slip: this used to drop out M again and overwrite M_2.
    M_2 = tf.nn.dropout(M_2, keep_prob=0.5)

    a = tanh(tf.matmul(outputs_all, w))
    a = tf.reshape(a, [n_steps, -1])          # N*batch
    a = tf.transpose(a, [1, 0])               # batch*N
    a = tf.nn.softmax(a)
    a = tf.reshape(a, [batch_s, 1, n_steps])  # batch*1*N

    # NOTE(review): the second attention uses the same `w` as the first, so
    # both distributions come from identical scores -- confirm whether a
    # separate weight was intended.
    a_2 = tanh(tf.matmul(outputs_all, w))
    a_2 = tf.reshape(a_2, [n_steps, -1])          # N*batch
    a_2 = tf.transpose(a_2, [1, 0])               # batch*N
    a_2 = tf.nn.softmax(a_2)
    a_2 = tf.reshape(a_2, [batch_s, 1, n_steps])  # batch*1*N

    outputs_all = tf.reshape(outputs_all,
                             [n_steps, -1, 2 * n_hidden])  # N*batch*d
    outputs_all = tf.transpose(outputs_all, [1, 0, 2])     # batch*N*d

    # Split per example so each one can be pooled with its own attention.
    a = tf.split(0, batch_s, a)
    a_2 = tf.split(0, batch_s, a_2)
    outputs_all = tf.split(0, batch_s, outputs_all)

    r = []
    r_2 = []
    for i in range(batch_s):
        a_temp = a[i][0:1, :, :]
        o_temp = outputs_all[i][0:1, :, :]
        att = tf.reshape(a_temp, [1, n_steps])             # 1*N
        out = tf.reshape(o_temp, [n_steps, 2 * n_hidden])  # N*2*n_hidden

        a_2_temp = a_2[i][0:1, :, :]
        o_2_temp = outputs_all[i][0:1, :, :]
        att_2 = tf.reshape(a_2_temp, [1, n_steps])             # 1*N
        out_2 = tf.reshape(o_2_temp, [n_steps, 2 * n_hidden])  # N*2*n_hidden

        # dropout
        att = tf.nn.dropout(att, keep_prob=0.5)
        out = tf.nn.dropout(out, keep_prob=0.5)
        att_2 = tf.nn.dropout(att_2, keep_prob=0.5)
        out_2 = tf.nn.dropout(out_2, keep_prob=0.5)

        r.append(tf.matmul(att, out))
        r_2.append(tf.matmul(att_2, out_2))

    r = tf.concat(0, r)      # batch*d
    r_2 = tf.concat(0, r_2)  # batch*d

    # NOTE(review): these are element-wise products with W_p / W_x / W_p_2,
    # not matrix products -- confirm against the variables' shapes.
    _h = tanh(W_p * r + W_x * outputs[-1] + W_p_2 * r_2)
    # dropout
    _h = tf.nn.dropout(_h, keep_prob=0.25)

    predict = tf.matmul(_h, weights['out']) + biases['out']
    return predict, outputs
def Foo(x, y, z):
    """Return tanh(x @ y + z)."""
    pre_activation = math_ops.matmul(x, y) + z
    return math_ops.tanh(pre_activation)
def MLP(i, a, ws, bs):
    """Apply layer `i`: tanh(a @ ws[i, :] + bs[i, :]).

    Returns the new activation together with `ws` and `bs` unchanged.
    """
    layer_weights = ws[i, :]
    pre_activation = math_ops.matmul(a, layer_weights) + bs[i, :]
    return math_ops.tanh(pre_activation), ws, bs
def Forward(x):
    """Return the sum of tanh(x) over all elements."""
    squashed = math_ops.tanh(x)
    return math_ops.reduce_sum(squashed)
def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
        inputs: input Tensor, 2D, batch x num_units.
        state: state Tensor, 2D, batch x state_size.
        scope: VariableScope for the created subgraph; defaults to
            "LSTMCell".

    Returns:
        A tuple containing:

        - A 2D, batch x output_dim, Tensor representing the output of the
          LSTM after reading "inputs" when previous state was "state".
          Here output_dim is num_proj if num_proj was set, num_units
          otherwise.
        - A 2D, batch x state_size, Tensor representing the new state of
          LSTM after reading "inputs" when previous state was "state".

    Raises:
        ValueError: if an input_size was specified and the provided inputs
            have a different dimension.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    # The packed state holds the cell state followed by the output.
    c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
    m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    actual_input_size = inputs.get_shape().as_list()[1]
    if self._input_size and self._input_size != actual_input_size:
        # The arguments must be a tuple; without the parentheses `%` saw a
        # single value and raised TypeError instead of this ValueError.
        # %s (rather than %d) also keeps the message printable when the
        # static size is None.
        raise ValueError(
            "Actual input size not same as specified: %s vs %s." %
            (actual_input_size, self._input_size))

    with vs.variable_scope(scope or type(self).__name__,
                           initializer=self._initializer):  # "LSTMCell"
        concat_w = _get_concat_variable(
            "W", [actual_input_size + num_proj, 4 * self._num_units],
            dtype, self._num_unit_shards)

        b = vs.get_variable(
            "B", shape=[4 * self._num_units],
            initializer=array_ops.zeros_initializer, dtype=dtype)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        cell_inputs = array_ops.concat(1, [inputs, m_prev])
        lstm_matrix = nn_ops.bias_add(
            math_ops.matmul(cell_inputs, concat_w), b)
        i, j, f, o = array_ops.split(1, 4, lstm_matrix)

        # Diagonal connections
        if self._use_peepholes:
            w_f_diag = vs.get_variable(
                "W_F_diag", shape=[self._num_units], dtype=dtype)
            w_i_diag = vs.get_variable(
                "W_I_diag", shape=[self._num_units], dtype=dtype)
            w_o_diag = vs.get_variable(
                "W_O_diag", shape=[self._num_units], dtype=dtype)

        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                 sigmoid(i + w_i_diag * c_prev) * tanh(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev +
                 sigmoid(i) * tanh(j))

        if self._cell_clip is not None:
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

        if self._use_peepholes:
            # The output gate peeks at the updated (and clipped) cell state.
            m = sigmoid(o + w_o_diag * c) * tanh(c)
        else:
            m = sigmoid(o) * tanh(c)

        if self._num_proj is not None:
            concat_w_proj = _get_concat_variable(
                "W_P", [self._num_units, self._num_proj],
                dtype, self._num_proj_shards)
            m = math_ops.matmul(m, concat_w_proj)

    return m, array_ops.concat(1, [c, m])
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      encoder_embeds,
                      encoder_probs,
                      encoder_hs,
                      mem_mask,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_heads=1,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look
    up information in the additional tensor attention_states, and it does
    this by focusing on a few entries from the tensor. This model has proven
    to yield especially good results in a number of sequence-to-sequence
    tasks. In addition to the encoder attention, a second "memory" attention
    reads from `encoder_probs` using keys built from `encoder_hs` and
    `encoder_embeds`.

    Args:
        encoder_mask: A 2D Tensor [batch_size x input_size]
        decoder_inputs: A list of 3D Tensors
            [batch_size x input_size x hidden_emb].
            NOTE(review): each element is checked with `with_rank(2)` below,
            so the elements appear to be 2D -- confirm.
        encoder_embeds: A 3D Tensor [batch_size x 2*input_size x hidden_emb]
        encoder_probs: A 3D Tensor
            [batch_size x 2*input_size x target_vocab_size]
        encoder_hs: A 3D Tensor [batch_size x 2*input_size x input_size]
        mem_mask: A 2D Tensor [batch_size x 2*input_size]
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        beam_size: the beam size passed to loop_function.
        output_size: Size of the output vectors; if None, we use
            cell.output_size.
        num_heads: Number of attention heads that read from
            attention_states.
        num_layers: number of equal pieces the state is split into along
            axis 1; the last piece is used as the attention query.
        loop_function: If not None, this function will be applied to i-th
            output in order to generate i+1-th input, and decoder_inputs will
            be ignored, except for the first element ("GO" symbol).
        dtype: The dtype to use for the RNN initial state (default:
            tf.float32).
        scope: VariableScope for the created subgraph; default:
            "attention_decoder".
        initial_state_attention: If False (default), initial attentions are
            zero. If True, initialize the attentions from the initial state
            and attention states -- useful when we wish to resume decoding
            from a previously stored decoder state and attention states.
            NOTE(review): the attention variables are first requested inside
            attention()/attention_mem(); with False the first call happens
            after reuse_variables() -- presumably callers always pass True;
            confirm.

    Returns:
        A tuple of the form (outputs, state, symbols, logits_mem,
        aligns_mem), where:
            outputs: A list of the same length as decoder_inputs of 2D
                Tensors of shape [batch_size x output_size].
            state: The state of each decoder cell the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].
            symbols: A list of target word ids, the best results returned by
                beam search.
            logits_mem: A list of [batch_size x target_vocab_size].
            aligns_mem: A list of memory attention weights.

    Raises:
        ValueError: when num_heads is not positive, there are no inputs,
            shapes of attention_states are not set, or input size cannot be
            inferred from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    # Both attn_length (dim 1) and attn_size (dim 2) are read statically
    # below, so the guard must cover dims 1 and 2 (the slice used to be
    # [1:2], which silently skipped dim 2).
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        embed_size = encoder_embeds.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # memory hidden states based on the probability in encoder_hs
        encoder_hs = math_ops.reduce_sum(
            array_ops.tile(
                array_ops.reshape(attention_states,
                                  [batch_size, 1, attn_length, attn_size]),
                [1, 2 * attn_length, 1, 1]) *
            array_ops.expand_dims(encoder_hs, 3), [2])
        # merged hidden states are concatenated by target word embeddings
        mems = array_ops.concat(2, [encoder_hs, encoder_embeds])
        mems = array_ops.transpose(array_ops.expand_dims(mems, 3), [0, 1, 3, 2])

        hidden_features = []
        v = []
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        initial_state = math_ops.tanh(
            linear(initial_state, state_size, False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                # NOTE: these lists live in the enclosing function and grow
                # on every call; the per-head reads below only ever index the
                # first num_heads entries.
                for a in xrange(num_heads):
                    k = variable_scope.get_variable(
                        "AttnW_%d" % a,
                        [1, 1, attn_size, attention_vec_size],
                        initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    hidden_features.append(
                        nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
                    v.append(variable_scope.get_variable(
                        "AttnV_%d" % a, [attention_vec_size],
                        initializer=init_ops.constant_initializer(0.0)))

                ds = []  # Results of attention reads will be stored here.
                aa = []
                if nest.is_sequence(query):
                    # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:
                        # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y = linear(
                            query, attention_vec_size, False,
                            weight_initializer=init_ops.random_normal_initializer(
                                0, 0.001, seed=SEED))
                        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s = math_ops.reduce_sum(
                            v[a] * math_ops.tanh(hidden_features[a] + y),
                            [2, 3])
                        # Subtract the row-wise max before exponentiating.
                        s = array_ops.transpose(
                            array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                        # softmax with mask
                        s = math_ops.exp(s)
                        s = math_ops.to_float(encoder_mask) * s
                        a = array_ops.transpose(
                            array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                        aa.append(a)
                        d = math_ops.reduce_sum(
                            array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                            [1, 2])
                        ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds, aa

        # memory attention
        def attention_mem(query, scope=None):
            """Attend over the memory keys and read from encoder_probs."""
            with variable_scope.variable_scope(scope or "attention"):
                vt = []
                hidden_targets = []
                for a in xrange(num_heads):
                    vt.append(variable_scope.get_variable(
                        "AttnVt_%d" % a, [attention_vec_size],
                        initializer=init_ops.constant_initializer(0.0)))
                    kt = variable_scope.get_variable(
                        "AttnWt_%d" % a,
                        [1, 1, embed_size + attn_size, attention_vec_size],
                        initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    hidden_targets.append(
                        nn_ops.conv2d(mems, kt, [1, 1, 1, 1], "SAME"))

                ds_mem = []
                as_mem = []
                if nest.is_sequence(query):
                    # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:
                        # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(1, query_list)

                for a in xrange(num_heads):
                    with variable_scope.variable_scope("AttnU_%d" % a):
                        y_mem = linear(
                            query, attention_vec_size, False,
                            weight_initializer=init_ops.random_normal_initializer(
                                0, 0.001, seed=SEED),
                            scope="Linear_mem")
                        y_mem = array_ops.reshape(
                            y_mem, [-1, 1, 1, attention_vec_size])
                        # Attention mask is a softmax of v^T * tanh(...).
                        s_mem = math_ops.reduce_sum(
                            vt[a] * math_ops.tanh(hidden_targets[a] + y_mem),
                            [2, 3])
                        s_mem = array_ops.transpose(
                            array_ops.transpose(s_mem) -
                            math_ops.reduce_max(s_mem, [1]))
                        s_mem = math_ops.exp(s_mem)
                        s_mem = mem_mask * s_mem
                        a_mem = array_ops.transpose(
                            array_ops.transpose(s_mem) /
                            math_ops.reduce_sum(s_mem, [1]))
                        as_mem.append(a_mem)
                        # Now calculate the attention-weighted vector d.
                        d_mem = math_ops.reduce_sum(
                            array_ops.expand_dims(a_mem, 2) * encoder_probs, [1])
                        ds_mem.append(d_mem)
            return ds_mem, as_mem

        outputs = []
        logits_mem = []
        aligns_mem = []
        output = None
        state = initial_state
        out_state = array_ops.split(1, num_layers, state)[-1]
        prev = None
        prev_d_mem = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
                 for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, prev_d_mem, i)
                # Re-order everything kept per hypothesis by `index`.
                out_state = array_ops.gather(out_state, index)  # update prev state
                state = array_ops.gather(state, index)  # update prev state
                attns = [array_ops.gather(attn, index)
                         for attn in attns]  # update prev attens
                for j, output in enumerate(outputs):
                    outputs[j] = array_ops.gather(output, index)  # update prev outputs
                for j, symbol in enumerate(symbols):
                    symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
                for j, logit_mem in enumerate(logits_mem):
                    logits_mem[j] = array_ops.gather(logit_mem, index)  # update prev memory logits
                for j, align_mem in enumerate(aligns_mem):
                    aligns_mem[j] = array_ops.gather(align_mem, index)  # update prev memory alignments
                symbols.append(prev_symbol)

            # Merge input and previous attentions into one vector of the
            # right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError(
                    "Could not infer input size from input: %s" % inp.name)

            # Run the attention mechanism.
            # NOTE(review): the memory attention is kept under the same guard
            # as the encoder attention; the two placements coincide when
            # initial_state_attention is True -- confirm against the
            # original layout.
            if i > 0 or (i == 0 and initial_state_attention):
                attns, aa = attention(out_state, scope="attention")
                query = array_ops.concat(1, [out_state, inp])
                logit_mem, align_mem = attention_mem(query, scope="attention")
                logits_mem.append(logit_mem[0])
                aligns_mem.append(align_mem[0])

            # Run the RNN.
            cinp = array_ops.concat(1, [inp, attns[0]])
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
                prev_d_mem = logits_mem[-1]
            outputs.append(output)

        if loop_function is not None:
            # process the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, prev_d_mem, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output, index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, index)  # update prev symbols
            for j, logit_mem in enumerate(logits_mem):
                logits_mem[j] = array_ops.gather(logit_mem, index)  # update prev memory logits
            for j, align_mem in enumerate(aligns_mem):
                aligns_mem[j] = array_ops.gather(align_mem, index)  # update prev memory alignments
            symbols.append(prev_symbol)

            # output the final best result of beam search (row 0 of every
            # tensor)
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0), 0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(
                    array_ops.gather(output, 0), 0)  # update prev outputs
            for j, logit_mem in enumerate(logits_mem):
                logits_mem[j] = array_ops.expand_dims(
                    array_ops.gather(logit_mem, 0), 0)
            for j, align_mem in enumerate(aligns_mem):
                aligns_mem[j] = array_ops.expand_dims(
                    array_ops.gather(align_mem, 0), 0)

    return outputs, state, symbols, logits_mem, aligns_mem
def __call__(self, input_, state, scope=None):
  """Advance the LSTM by a single time step.

  Args:
    input_: input Tensor, 2D, batch x num_units.
    state: state Tensor, 2D, batch x state_size. The first num_units columns
      are read as the previous cell state and the columns after them as the
      previous output.
    scope: VariableScope for the created subgraph; defaults to the class
      name ("LSTMCell").

  Returns:
    A tuple (output, new_state):
    - output: 2D Tensor, batch x output_dim, where output_dim is num_proj if
      num_proj was set and num_units otherwise.
    - new_state: 2D Tensor obtained by concatenating the new cell state and
      the new output along axis 1.
  """
  units = self._num_units
  out_dim = units if self._num_proj is None else self._num_proj
  peepholes = self._use_peepholes

  # Unpack the packed state: [cell state | previous output].
  c_prev = array_ops.slice(state, [0, 0], [-1, units])
  m_prev = array_ops.slice(state, [0, units], [-1, out_dim])
  dtype = input_.dtype

  with vs.variable_scope(scope or type(self).__name__):  # "LSTMCell"
    weights = _get_sharded_variable(
        "W", [self.input_size + out_dim, 4 * units],
        self._initializer, dtype, self._num_unit_shards)
    bias = vs.get_variable(
        "B", shape=[4 * units],
        initializer=array_ops.zeros_initializer, dtype=dtype)

    # A single affine map produces all four gate pre-activations, which are
    # then split into: input gate, new input, forget gate, output gate.
    stacked_inputs = array_ops.concat(1, [input_, m_prev])
    gate_matrix = nn_ops.bias_add(
        _matmul_with_sharded_variable(stacked_inputs, weights), bias)
    in_gate, new_input, forget_gate, out_gate = array_ops.split(
        1, 4, gate_matrix)

    if peepholes:
      # Diagonal (peephole) connections from the cell state into the gates.
      w_f_diag = vs.get_variable(
          "W_F_diag", shape=[units],
          initializer=self._initializer, dtype=dtype)
      w_i_diag = vs.get_variable(
          "W_I_diag", shape=[units],
          initializer=self._initializer, dtype=dtype)
      w_o_diag = vs.get_variable(
          "W_O_diag", shape=[units],
          initializer=self._initializer, dtype=dtype)
      # The constant 1 added to the forget pre-activation acts as a fixed
      # forget bias.
      keep = sigmoid(forget_gate + 1 + w_f_diag * c_prev)
      retained = keep * c_prev
      admit = sigmoid(in_gate + w_i_diag * c_prev)
      c = retained + admit * tanh(new_input)
    else:
      retained = sigmoid(forget_gate + 1) * c_prev
      c = retained + sigmoid(in_gate) * tanh(new_input)

    if self._cell_clip is not None:
      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)

    out_pre = out_gate + w_o_diag * c if peepholes else out_gate
    m = sigmoid(out_pre) * tanh(c)

    if self._num_proj is not None:
      proj_weights = _get_sharded_variable(
          "W_P", [units, self._num_proj],
          self._initializer, dtype, self._num_proj_shards)
      m = _matmul_with_sharded_variable(m, proj_weights)

  return m, array_ops.concat(1, [c, m])
def __call__(self, inputs, state, scope=None):
  """Run one step of the most basic RNN.

  Computes output = new_state = tanh(W * input + U * state + B).

  Args:
    inputs: input for this step.
    state: previous state.
    scope: VariableScope for the created subgraph; defaults to the class
      name ("BasicRNNCell").

  Returns:
    A pair (output, new_state); both entries are the same object.
  """
  scope_name = scope or type(self).__name__  # "BasicRNNCell"
  with vs.variable_scope(scope_name):
    pre_activation = linear([inputs, state], self._num_units, True)
    new_state = tanh(pre_activation)
  return new_state, new_state
def attention(decoder_state, coverage=None):
  """Compute the encoder context vector and attention weights for one step.

  Relies on names from the enclosing scope: attention_vec_size,
  encoder_features, encoder_states, enc_padding_mask, selector_probs,
  enc_sent_id_mask, batch_size, attn_len, attn_size, use_coverage, w_c, v.

  Args:
    decoder_state: state of the decoder.
    coverage: Optional. Coverage vector from the previous timestep,
      shape (batch_size, attn_len, 1, 1).

  Returns:
    context_vector: attention-weighted sum of encoder_states, reshaped to
      (-1, attn_size).
    attn_dist_norescale: the padding-masked, re-normalized attention
      distribution computed before the selector probabilities are applied;
      None when selector_probs is None.
    attn_dist: the final attention distribution used for the context vector.
    coverage: the updated coverage vector, shape
      (batch_size, attn_len, 1, 1); passed through untouched when
      use_coverage is false.
  """
  with variable_scope.variable_scope("Attention"):
    # W_s s_t + b_attn in the paper, then reshaped to
    # (batch_size, 1, 1, attention_vec_size).
    projected_state = linear(decoder_state, attention_vec_size, True)
    decoder_features = tf.expand_dims(tf.expand_dims(projected_state, 1), 1)

    def _mask_and_renormalize(dist):
      """Zero out pad positions and rescale each row to sum to one."""
      masked = dist * enc_padding_mask
      row_totals = tf.reduce_sum(masked, axis=1, keep_dims=True)
      return masked / row_totals

    def masked_attention(e):
      """Softmax e, apply enc_padding_mask and re-normalize."""
      probs = nn_ops.softmax(e)  # shape (batch_size, attn_length)
      if selector_probs is None:
        return None, _mask_and_renormalize(probs)

      # End-to-end mode: also scale each token's attention by the selector
      # probability of the sentence it belongs to.
      unscaled = _mask_and_renormalize(probs)
      row_ids = tf.expand_dims(tf.range(0, limit=batch_size), 1)
      row_ids = tf.tile(row_ids, [1, attn_len])  # (batch_size, attn_len)
      # (batch_size, attn_len, 2): pairs of (batch index, sentence id).
      lookup = tf.stack((row_ids, enc_sent_id_mask), axis=2)
      sentence_probs = tf.gather_nd(selector_probs, lookup)
      # Pad positions are zeroed by the padding mask inside the helper.
      rescaled = _mask_and_renormalize(probs * sentence_probs)
      return unscaled, rescaled

    has_prev_coverage = use_coverage and coverage is not None
    if has_prev_coverage:
      # Non-first coverage step: w_c c_i^t joins the attention features.
      coverage_features = nn_ops.conv2d(
          coverage, w_c, [1, 1, 1, 1], "SAME")
      pre_activation = (
          encoder_features + decoder_features + coverage_features)
    else:
      pre_activation = encoder_features + decoder_features

    # v^T tanh(W_h h_i + W_s s_t [+ w_c c_i^t] + b_attn)
    e = math_ops.reduce_sum(v * math_ops.tanh(pre_activation), [2, 3])
    attn_dist_norescale, attn_dist = masked_attention(e)

    if has_prev_coverage:
      # Accumulate this step's attention into the running coverage.
      coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
    elif use_coverage:
      # First coverage step: coverage starts as the attention itself.
      coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2), 2)

    # Context vector: attention-weighted sum over the encoder states.
    weights = array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
    context_vector = math_ops.reduce_sum(weights * encoder_states, [1, 2])
    context_vector = array_ops.reshape(context_vector, [-1, attn_size])

  return context_vector, attn_dist_norescale, attn_dist, coverage
def intra_decoder_attention(decoder_state, decoder_history_c,
                            decoder_history_h):
  """Attend over the decoder's own previous states.

  Relies on names from the enclosing scope: linear, decoder_cell_size, v_d,
  batch_size and state (only state.c's static width is read).

  Args:
    decoder_state: current state of the decoder; projected through a linear
      layer of width decoder_cell_size.
    decoder_history_c: Python sequence with one cell-state tensor per
      previous decoder step, each (batch_size, state_size).
    decoder_history_h: Python sequence with one hidden-state tensor per
      previous decoder step, indexed in lockstep with decoder_history_c.

  Returns:
    context_d_vector_c: attention-weighted sum of the history cell states,
      reshaped to (-1, width of state.c).
    context_d_vector_h: attention-weighted sum of the history hidden states,
      reshaped to (-1, width of state.c).
  """
  with variable_scope.variable_scope("Intra_Decoder_Attention"):
    # Pass the decoder state through a linear layer (W_s s_t + b_attn) and
    # reshape to (batch_size, 1, 1, state size) so it broadcasts over time.
    decoder_features = linear(decoder_state, decoder_cell_size, True)
    decoder_features = tf.expand_dims(
        tf.expand_dims(decoder_features, 1), 1)

    # Stack the per-step history into single tensors. TensorArray.write
    # returns a new TensorArray carrying the write dependency, so the result
    # must be kept; otherwise stack() is not ordered after the writes.
    decoder_history_states_c = tf.TensorArray(
        tf.float32, size=0, dynamic_size=True)
    decoder_history_states_h = tf.TensorArray(
        tf.float32, size=0, dynamic_size=True)
    for i, history_c in enumerate(decoder_history_c):
      decoder_history_states_c = decoder_history_states_c.write(
          i, history_c)
      decoder_history_states_h = decoder_history_states_h.write(
          i, decoder_history_h[i])

    # (t, batch, state) -> (batch, t, state) -> (batch, t, 1, state).
    # The singleton axis goes at position 2 so that the reduce_sum over
    # axes [2, 3] below leaves one score per history step; placing it at
    # position 1 would sum the time axis away before the softmax.
    decoder_history_states_c = decoder_history_states_c.stack()
    decoder_history_states_c = tf.transpose(
        decoder_history_states_c, [1, 0, 2])
    decoder_history_states_c = tf.expand_dims(
        decoder_history_states_c, axis=2)

    decoder_history_states_h = decoder_history_states_h.stack()
    decoder_history_states_h = tf.transpose(
        decoder_history_states_h, [1, 0, 2])
    decoder_history_states_h = tf.expand_dims(
        decoder_history_states_h, axis=2)

    W_d_h = variable_scope.get_variable(
        "W_d_h", [1, 1, decoder_cell_size, decoder_cell_size])
    decoder_history_features_h = nn_ops.conv2d(
        decoder_history_states_h, W_d_h, [1, 1, 1, 1],
        "SAME")  # shape (batch_size, t, 1, state size)
    W_d_c = variable_scope.get_variable(
        "W_d_c", [1, 1, decoder_cell_size, decoder_cell_size])
    decoder_history_features_c = nn_ops.conv2d(
        decoder_history_states_c, W_d_c, [1, 1, 1, 1],
        "SAME")  # shape (batch_size, t, 1, state size)

    def masked_d_attention(e):
      """Take softmax of e and re-normalize each row by its own sum."""
      attn_d_dist = nn_ops.softmax(e)  # shape (batch_size, t)
      # NOTE: no decoder padding mask is applied here.
      masked_d_sums = tf.reduce_sum(attn_d_dist, axis=1)  # (batch_size)
      return attn_d_dist / tf.reshape(masked_d_sums, [-1, 1])

    # Calculate v_d^T tanh(W_d_c c_i + W_d_h h_i + W_s s_t + b_attn).
    e = math_ops.reduce_sum(
        v_d * math_ops.tanh(decoder_history_features_c +
                            decoder_history_features_h +
                            decoder_features),
        [2, 3])  # shape (batch_size, t)

    # Calculate attention distribution over the history steps.
    attn_d_dist = masked_d_attention(e)

    # Context vectors: attention-weighted sums over the decoder history.
    context_d_vector_c = math_ops.reduce_sum(
        array_ops.reshape(attn_d_dist, [batch_size, -1, 1, 1]) *
        decoder_history_states_c, [1, 2])  # shape (batch_size, state size)
    context_d_vector_c = array_ops.reshape(
        context_d_vector_c, [-1, state.c.get_shape()[1].value])
    context_d_vector_h = math_ops.reduce_sum(
        array_ops.reshape(attn_d_dist, [batch_size, -1, 1, 1]) *
        decoder_history_states_h, [1, 2])  # shape (batch_size, state size)
    context_d_vector_h = array_ops.reshape(
        context_d_vector_h, [-1, state.c.get_shape()[1].value])

  return context_d_vector_c, context_d_vector_h
def fun(x):
  """Return the product over all elements of tanh(x) squared."""
  squared_tanh = math_ops.tanh(x) ** 2
  return math_ops.reduce_prod(squared_tanh)
def __call__(self, inputs, state):
  """Run one step of a two-layer gated recurrent unit (GRU) with attention.

  The first GRU layer consumes the input embedding, an attention read over
  self.context produces a context vector, and the second GRU layer combines
  that context with the first layer's state.

  Args:
    inputs: sequence whose first element is the embedding for this step.
      When it has exactly two elements, the second is used as the step
      mask; otherwise an all-ones mask is used (decoding).
    state: previous hidden state. Must not be None.

  Returns:
    A pair (output, h2): output is h2 concatenated with the attention
    context along axis 1; h2 is the new hidden state.

  Raises:
    ValueError: if state is None.
  """
  # Fail before touching the variable scope or building any ops.
  if state is None:
    raise ValueError("init state must be provided.")

  embs = inputs[0]
  if len(inputs) == 2:
    mask_slice = inputs[1]
  else:
    mask_slice = None

  context = self.context
  context_mask = self.context_mask
  # presumably the pre-projected context (computed outside this method);
  # verify against the code that sets self.pctx_.
  pctx_ = self.pctx_

  # All parameters are fetched from the current scope in reuse mode; they
  # must already have been created elsewhere.
  tf.get_variable_scope().reuse_variables()
  W = tf.get_variable('W', dtype=self._precision)
  b = tf.get_variable('b', dtype=self._precision)
  U = tf.get_variable('U', dtype=self._precision)
  Wx = tf.get_variable('Wx', dtype=self._precision)
  Ux = tf.get_variable('Ux', dtype=self._precision)
  bx = tf.get_variable('bx', dtype=self._precision)

  U_nl = tf.get_variable('U_nl', dtype=self._precision)
  b_nl = tf.get_variable('b_nl', dtype=self._precision)
  Ux_nl = tf.get_variable('Ux_nl', dtype=self._precision)
  bx_nl = tf.get_variable('bx_nl', dtype=self._precision)
  Wc = tf.get_variable('Wc', dtype=self._precision)
  Wcx = tf.get_variable('Wcx', dtype=self._precision)

  W_comb_att = tf.get_variable('W_comb_att', dtype=self._precision)
  # Wc_att and b_att are not used below; the lookups are kept so that a
  # missing variable still fails here exactly as before.
  Wc_att = tf.get_variable('Wc_att', dtype=self._precision)
  b_att = tf.get_variable('b_att', dtype=self._precision)
  U_att = tf.get_variable('U_att', dtype=self._precision)
  c_tt = tf.get_variable('c_tt', dtype=self._precision)

  # graph build
  emb2hidden = math_ops.matmul(embs, Wx) + bx
  emb2gates = math_ops.matmul(embs, W) + b

  # context is indexed as (nlocation, nsamples, ...).
  nlocation = tf.shape(context)[0]
  nsamples = tf.shape(context)[1]

  if mask_slice is None:
    # for decoding: every sample is active. Built in the configured
    # precision so it can multiply the hidden state without a dtype clash.
    mask_slice = tf.ones([nsamples, self._num_units],
                         dtype=self._precision)

  # gates input for first gru layer
  preAct1 = math_ops.matmul(state, U)
  preAct1 += emb2gates
  preAct1 = math_ops.sigmoid(preAct1)
  r1, u1 = array_ops.split(preAct1, 2, 1)

  # hidden input for first gru layer
  preActx1 = math_ops.matmul(state, Ux)
  preActx1 *= r1
  preActx1 += emb2hidden
  h1 = math_ops.tanh(preActx1)
  h1 = u1 * state + (1. - u1) * h1
  # Where the mask is 0 the previous state is carried through unchanged.
  h1 = mask_slice * h1 + (1. - mask_slice) * state

  # attention
  pstate_ = math_ops.matmul(h1, W_comb_att)
  pctx__ = pctx_ + pstate_[None, :, :]
  pctx__ = math_ops.tanh(pctx__)
  # Flatten to 2D for the matmul, then restore (nlocation, nsamples).
  pctx_2d = tf.reshape(pctx__, [-1, tf.shape(pctx__)[2]])
  alpha = math_ops.matmul(pctx_2d, U_att) + c_tt
  alpha = tf.reshape(alpha, [nlocation, nsamples])
  # Explicit softmax over axis 0 so the context mask can be applied
  # between the exponential and the normalization.
  alpha = math_ops.exp(alpha)
  if context_mask is not None:
    alpha = alpha * context_mask
  alpha = alpha / tf.reduce_sum(alpha, 0, keep_dims=True)
  ctx_ = tf.reduce_sum(context * alpha[:, :, None], 0)

  # gates input for second gru layer
  preAct2 = math_ops.matmul(h1, U_nl) + b_nl
  preAct2 += math_ops.matmul(ctx_, Wc)
  preAct2 = math_ops.sigmoid(preAct2)
  r2, u2 = array_ops.split(preAct2, 2, 1)

  # hidden input for second gru layer
  preActx2 = math_ops.matmul(h1, Ux_nl) + bx_nl
  preActx2 *= r2
  preActx2 += math_ops.matmul(ctx_, Wcx)
  h2 = math_ops.tanh(preActx2)
  h2 = u2 * h1 + (1. - u2) * h2
  h2 = mask_slice * h2 + (1. - mask_slice) * h1

  output = tf.concat(axis=1, values=[h2, ctx_])
  return output, h2