def __call__(self, inputs, state, scope=None): """Simplified Gating LSTM.""" with vs.variable_scope(scope or "simplified_gating_lstm_cell"): c, h = state with vs.variable_scope("gates_0") as gate_scope: if self._architecture in ['LS1', 'LS2']: concat = _linear( [h], 3 * self._num_units, True if self._architecture == 'LS1' else False, scope=scope) i, f, o = array_ops.split(value=concat, num_or_size_splits=3, axis=1) elif self._architecture == 'LS3': dtype = inputs.dtype bias = vs.get_variable("bias", shape=[3 * self._num_units], dtype=dtype) i, f, o = array_ops.split(value=bias, num_or_size_splits=3, axis=0) with vs.variable_scope("gates_1"): j = _linear([inputs, h], self._num_units, True, scope=scope) new_c = c * sigmoid(f + self._forget_bias) + sigmoid( i) * self._activation(j) new_h = self._activation(new_c) * sigmoid(o) new_state = LSTMStateTuple(new_c, new_h) return new_h, new_state
def __call__(self, inputs, state, scope=None): """JZS with num_units cells.""" with vs.variable_scope(scope or "jzs_cell"): # We start with bias of 1.0 to not reset and not update. if self._architecture == 'JZS1': r, u = [inputs, state], [inputs] elif self._architecture == 'JZS2': r, u = [state], [inputs, state] elif self._architecture == 'JZS3': r, u = [inputs, state], [inputs, tanh(state)] with vs.variable_scope("gates_0"): r = _linear(r, self._num_units, True, 1.0, scope=scope) if self._architecture == 'JZS2': r = r + inputs with vs.variable_scope("gates_1"): u = _linear(u, self._num_units, True, 1.0, scope=scope) r, u = sigmoid(r), sigmoid(u) with vs.variable_scope("candidate"): if self._architecture == 'JZS1': c = _linear( [r * state], self._num_units, True, scope=scope) + tanh(inputs) elif self._architecture in ['JZS2', 'JZS3']: c = _linear([inputs, r * state], self._num_units, True, scope=scope) c = self._activation(c) new_h = u * c + (1 - u) * state return new_h, new_h
def call(self, inputs, state):
  """Gated recurrent unit (GRU) with nunits cells."""
  with vs.variable_scope("gates"):  # Reset gate and update gate.
    # We start with bias of 1.0 to not reset and not update.
    bias_ones = self._bias_initializer
    if self._bias_initializer is None:
      dtype = [a.dtype for a in [inputs, state]][0]
      bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
    # value = sigmoid(_linear([inputs, state], 2 * self._num_units, True,
    #                         bias_ones, self._kernel_initializer))
    # r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    value = _linear([inputs, state], 2 * self._num_units, True,
                    bias_ones, self._kernel_initializer)
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    r = ln(r, scope='r/')
    u = ln(u, scope='u/')
    r, u = sigmoid(r), sigmoid(u)
  with vs.variable_scope("candidate"):
    # c = self._activation(_linear([inputs, r * state], self._num_units, True,
    #                              self._bias_initializer, self._kernel_initializer))
    # new_h = u * state + (1 - u) * c
    Cand = _linear([inputs, r * state], self._num_units, True)
    c_pre = ln(Cand, scope='new_h/')
    c = self._activation(c_pre)
    new_h = u * state + (1 - u) * c
  return new_h, new_h
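# Several cells above and below call an ln(x, scope=...) helper that is not
# included in these snippets. The following is a minimal layer-normalization
# sketch, assuming the usual gain/shift parameterization over the last (unit)
# dimension of a 2-D [batch, units] tensor; the variable names ("gain",
# "shift") and the epsilon value are illustrative assumptions, not the
# original code.
def ln(inputs, scope, epsilon=1e-5):
  """Layer-normalize `inputs` over its unit dimension (hedged sketch)."""
  with tf.variable_scope(scope + 'layer_norm'):
    num_units = inputs.get_shape().as_list()[-1]
    gain = tf.get_variable('gain', shape=[num_units],
                           initializer=tf.constant_initializer(1.0))
    shift = tf.get_variable('shift', shape=[num_units],
                            initializer=tf.constant_initializer(0.0))
    # Per-example mean/variance over the unit dimension.
    mean, variance = tf.nn.moments(inputs, axes=[1], keep_dims=True)
    normalized = (inputs - mean) / tf.sqrt(variance + epsilon)
    return normalized * gain + shift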
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" dtype = inputs.dtype batch_size, feature_size = inputs.get_shape().as_list() if self._use_tgate: # Time gate feature_size = feature_size - 1 tvscope = vs.get_variable_scope() with vs.variable_scope(tvscope, initializer=None) as unit_scope: with vs.variable_scope(unit_scope) as time_gate_scope: w_t1 = vs.get_variable( "w_t1", shape=[1, self._num_units], dtype=dtype) bias_t1 = vs.get_variable( "bias_t1", [self._num_units], dtype=dtype, initializer=init_ops.constant_initializer(0.0, dtype=dtype)) w_tx1 = vs.get_variable( "w_tx1", shape=[feature_size, self._num_units], dtype=dtype) seq = tf.slice(inputs, begin=[0, 0], size=[batch_size, feature_size]) delta_t = tf.slice(inputs, begin=[0, 56], size=[batch_size, 1]) t1_act = (self._activation(math_ops.matmul(delta_t, w_t1)) + math_ops.matmul(seq, w_tx1) + bias_t1) t1 = sigmoid(t1_act) inputs = seq # for initial state (state, state_decay) = state with vs.variable_scope("gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. value = sigmoid(_linear( [inputs, state], 2 * self._num_units, True, 1.0)) r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) with vs.variable_scope("candidate"): c = self._activation(_linear([inputs, r * state], self._num_units, True)) new_h = u * state + (1 - u) * c if self._use_tgate: new_h_decay = u * t1 * state_decay + (1 - u * t1) * c new_state = (new_h, new_h_decay) new_state = (TGRUStateTuple(new_h, new_h_decay)) new_h = tf.concat([new_h, new_h_decay], axis=1) else: new_state = (new_h, new_h) new_state = (TGRUStateTuple(new_h, new_h)) return new_h, new_state
def attention(query, use_attention=False):
  """Put attention masks on hidden using hidden_features and query."""
  attn_weights = []
  ds = []  # Results of attention reads will be stored here.
  for i in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % i):
      y = rnn_cell._linear(query, attention_vec_size, True)
      y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
      # Attention mask is a softmax of v^T * tanh(...).
      s = math_ops.reduce_sum(
          v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
      if use_attention is False:  # apply mean pooling
        weights = tf.tile(sequence_length, tf.stack([attn_length]))
        weights = array_ops.reshape(weights, tf.shape(s))
        a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(weights)
        # a = array_ops.ones(tf.shape(s), dtype=dtype) / math_ops.to_float(tf.shape(s)[1])
      else:
        a = nn_ops.softmax(s)
      attn_weights.append(a)
      # Now calculate the attention-weighted vector d.
      d = math_ops.reduce_sum(
          array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
      ds.append(array_ops.reshape(d, [-1, attn_size]))
  return attn_weights, ds
def getMetaResults(self, meta_output, input, dimensions, scope="meta"):
  """Compute the gate pre-activations of the basic LSTM from the meta-LSTM output."""
  # with tf.variable_scope('z_trans'):
  #   meta_output = rnn_cell._linear(meta_output, self._meta_num_units, False)
  with tf.variable_scope(scope):
    W_matrix_list = []
    input_shape = int(input.get_shape()[-1])
    # Generate the parameters of the basic LSTM (one weight matrix per gate).
    for i in np.arange(4):
      P = tf.get_variable('P{}'.format(i),
                          shape=[self._meta_num_units, dimensions],
                          initializer=tf.uniform_unit_scaling_initializer(),
                          dtype=tf.float32)
      Q = tf.get_variable('Q{}'.format(i),
                          shape=[self._meta_num_units, input_shape],
                          initializer=tf.uniform_unit_scaling_initializer(),
                          dtype=tf.float32)
      _W_matrix = tf.matmul(
          tf.reshape(tf.matrix_diag(meta_output), [-1, self._meta_num_units]), P)
      _W_matrix = tf.reshape(_W_matrix, [-1, self._meta_num_units, dimensions])
      _W_matrix = tf.matmul(
          tf.reshape(tf.transpose(_W_matrix, [0, 2, 1]), [-1, self._meta_num_units]), Q)
      _W_matrix = tf.reshape(_W_matrix, [-1, dimensions, input_shape])
      W_matrix_list.append(_W_matrix)
    W_matrix = tf.concat(values=W_matrix_list, axis=1)
    Bias = rnn_cell._linear(meta_output, 4 * dimensions, False)
    result = tf.matmul(W_matrix, tf.expand_dims(input, -1))
    result = tf.add(tf.reshape(result, [-1, 4 * dimensions]), Bias)
    return result
def __call__(self, inputs, state, d_act, scope=None):
  """Long short-term memory cell (LSTM)."""
  with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
    # Parameters of gates are concatenated into one multiply for efficiency.
    if self._state_is_tuple:
      c, h = state
    else:
      # c, h = array_ops.split(1, 2, state)
      c, h = array_ops.split(state, 2, 1)
    concat = _linear([inputs, h], 4 * self._num_units, True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    # i, j, f, o = array_ops.split(1, 4, concat)
    i, j, f, o = array_ops.split(concat, 4, 1)

    w_d = vs.get_variable('w_d', [self.key_words_voc_size, self._num_units])

    new_c = (c * sigmoid(f + self._forget_bias) +
             sigmoid(i) * self._activation(j)) + tf.tanh(tf.matmul(d_act, w_d))
    new_h = self._activation(new_c) * sigmoid(o)

    if self._state_is_tuple:
      new_state = LSTMStateTuple(new_c, new_h)
    else:
      # new_state = array_ops.concat(1, [new_c, new_h])
      new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=True,
           wd=0.0, input_keep_prob=1.0, is_train=True):
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("args must be specified")
  if not nest.is_sequence(args):
    args = [args]

  flat_args = [flatten(arg, 1) for arg in args]
  is_train = tf.convert_to_tensor(is_train, dtype=tf.bool)
  # if input_keep_prob is not None:
  #   assert is_train is not None
  flat_args = [tf.cond(is_train,
                       lambda: tf.nn.dropout(arg, input_keep_prob),
                       lambda: arg)
               for arg in flat_args]
  flat_out = _linear(flat_args, output_size, bias,
                     bias_start=bias_start, scope=scope)
  out = reconstruct(flat_out, args[0], 1)
  shape = out.get_shape().as_list()
  shape.pop()
  print("shape", shape)
  if squeeze:
    # out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
    out = tf.squeeze(out, axis=[3])
    # out = tf.reshape(out, shape=shape)
  if wd:
    add_wd(wd)
  print("out", out)
  return out
def __call__(self, inputs, state, context, scope=None):
  """Gated recurrent unit (GRU) with nunits cells."""
  with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
    with vs.variable_scope("gates"):  # Reset gate and update gate.
      # We start with bias of 1.0 to not reset and not update.
      value = sigmoid(_linear(
          [inputs, state, context], 2 * self._num_units, True, 1.0))
      r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    with vs.variable_scope("candidate"):
      c = self._activation(_linear([inputs, r * state], self._num_units, True))
    new_h = u * state + (1 - u) * c
  return new_h, new_h
def __call__(self, inputs, state):
  """Gated recurrent unit (GRU) with nunits cells."""
  with vs.variable_scope("Gates"):  # Reset gate and update gate.
    # We start with bias of 1.0 to not reset and not update.
    value = _linear([inputs, state], 2 * self._num_units, True, 1.0)
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
    r = ln(r, scope='r/')
    u = ln(u, scope='u/')
    r, u = sigmoid(r), sigmoid(u)
  with vs.variable_scope("Candidate"):
    # with vs.variable_scope("Layer_Parameters"):
    Cand = _linear([inputs, r * state], self._num_units, True)
    c_pre = ln(Cand, scope='new_h/')
    c = self._activation(c_pre)
  new_h = u * state + (1 - u) * c
  return new_h, new_h
def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False,
           wd=0.0, input_keep_prob=1.0, is_train=None):
  if args is None or (nest.is_sequence(args) and not args):
    raise ValueError("`args` must be specified")
  if not nest.is_sequence(args):
    args = [args]

  flat_args = [flatten(arg, 1) for arg in args]
  if input_keep_prob < 1.0:
    assert is_train is not None
    flat_args = [
        tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
        for arg in flat_args
    ]
  with tf.variable_scope(scope or 'Linear'):
    flat_out = _linear(flat_args, output_size, bias, bias_start=bias_start)
  out = reconstruct(flat_out, args[0], 1)
  if squeeze:
    out = tf.squeeze(out, [len(args[0].get_shape().as_list()) - 1])
  if wd:
    add_wd(wd)
  return out
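# The two linear() wrappers above rely on flatten()/reconstruct() helpers that
# are not included in these snippets. The sketch below shows one plausible
# implementation, assuming flatten(x, keep) collapses all but the last `keep`
# dimensions into the batch dimension and reconstruct(x, ref, keep) restores
# the leading dimensions of `ref`; the exact behaviour is an assumption, not
# the original code.
from functools import reduce
from operator import mul

def flatten(tensor, keep):
  """Collapse all but the last `keep` dims of `tensor` into one (hedged sketch)."""
  fixed_shape = tensor.get_shape().as_list()
  start = len(fixed_shape) - keep
  # Use static sizes where known, dynamic sizes otherwise.
  left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)])
  out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i]
                        for i in range(start, len(fixed_shape))]
  return tf.reshape(tensor, out_shape)

def reconstruct(tensor, ref, keep):
  """Restore the leading dims of `ref` onto the flattened `tensor` (hedged sketch)."""
  ref_shape = ref.get_shape().as_list()
  tensor_shape = tensor.get_shape().as_list()
  ref_stop = len(ref_shape) - keep
  tensor_start = len(tensor_shape) - keep
  pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
  keep_shape = [tensor_shape[i] or tf.shape(tensor)[i]
                for i in range(tensor_start, len(tensor_shape))]
  return tf.reshape(tensor, pre_shape + keep_shape)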
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM). @param: inputs (batch,n) @param state: the states and hidden unit of the two cells """ with tf.variable_scope(scope or type(self).__name__): c1, c2, h1, h2 = state # change bias argument to False since LN will add bias via shift concat = _linear([inputs, h1, h2], 5 * self._num_units, False) i, j, f1, f2, o = tf.split(value=concat, num_or_size_splits=5, axis=1) # add layer normalization to each gate i = ln(i, scope='i/') j = ln(j, scope='j/') f1 = ln(f1, scope='f1/') f2 = ln(f2, scope='f2/') o = ln(o, scope='o/') new_c = (c1 * tf.nn.sigmoid(f1 + self._forget_bias) + c2 * tf.nn.sigmoid(f2 + self._forget_bias) + tf.nn.sigmoid(i) * self._activation(j)) # add layer_normalization in calculation of new hidden state new_h = self._activation(ln(new_c, scope='new_h/')) * tf.nn.sigmoid(o) new_state = LSTMStateTuple(new_c, new_h) return new_h, new_state
def decode(self, h_q, h_p, scope=None, reuse=None):
  """Takes in a knowledge representation and outputs an estimation over all
  paragraph tokens of which token should be the start of the answer span,
  and which should be the end of the answer span.

  :param h_q: representation of the question, decided by how you choose to
      implement the encoder
  :param h_p: representation of the paragraph
  :return: logits for the answer-span start (a_s) and end (a_e)
  """
  # Linear mix: h_q * W1 + h_p * W2 + b
  with vs.variable_scope('a_s'):
    a_s = _linear([h_q, h_p], self.output_size, True)
  with vs.variable_scope('a_e'):
    a_e = _linear([h_q, h_p], self.output_size, True)
  return a_s, a_e
def attention(query):
  """Point on hidden using hidden_features and query."""
  with vs.variable_scope("Attention"):
    y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
    # Attention mask is a softmax of v^T * tanh(...).
    s = math_ops.reduce_sum(
        v * math_ops.tanh(hidden_features + y), [2, 3])
    return s
def __init__(self, num_units, encoder_output, scope=None):
  self.hs = encoder_output
  with tf.variable_scope(scope or type(self).__name__):
    with tf.variable_scope("Attn1"):
      hs2d = tf.reshape(self.hs, [-1, num_units])
      phi_hs2d = tf.tanh(core_rnn_cell_impl._linear(hs2d, num_units, True, 1.0))
      self.phi_hs = tf.reshape(phi_hs2d, tf.shape(self.hs))
  super(GRUCellAttn, self).__init__(num_units)
def beam_step(time, beam_probs, beam_seqs, cand_probs, cand_seqs, *states):
  batch_size = tf.shape(beam_probs)[0]
  inputs = tf.reshape(
      tf.slice(beam_seqs, [0, time], [batch_size, 1]), [batch_size])
  decoder_input = tf.nn.embedding_lookup(self.L_dec, inputs)
  decoder_output, state_output = self.decoder_graph(decoder_input, states)

  with tf.variable_scope("Logistic", reuse=True):
    do2d = tf.reshape(decoder_output, [-1, self.size])
    logits2d = core_rnn_cell_impl._linear(do2d, self.vocab_size, True, 1.0)
    logprobs2d = tf.nn.log_softmax(logits2d)

  total_probs = logprobs2d + tf.reshape(beam_probs, [-1, 1])
  total_probs_noEOS = tf.concat([
      tf.slice(total_probs, [0, 0], [batch_size, nlc_data.EOS_ID]),
      tf.tile([[-3e38]], [batch_size, 1]),
      tf.slice(total_probs, [0, nlc_data.EOS_ID + 1],
               [batch_size, self.vocab_size - nlc_data.EOS_ID - 1])
  ], 1)
  flat_total_probs = tf.reshape(total_probs_noEOS, [-1])

  beam_k = tf.minimum(tf.size(flat_total_probs), self.beam_size)
  next_beam_probs, top_indices = tf.nn.top_k(flat_total_probs, k=beam_k)

  next_bases = tf.floordiv(top_indices, self.vocab_size)
  next_mods = tf.mod(top_indices, self.vocab_size)

  next_states = [tf.gather(state, next_bases) for state in state_output]
  next_beam_seqs = tf.concat([tf.gather(beam_seqs, next_bases),
                              tf.reshape(next_mods, [-1, 1])], 1)

  cand_seqs_pad = tf.pad(cand_seqs, [[0, 0], [0, 1]])
  beam_seqs_EOS = tf.pad(beam_seqs, [[0, 0], [0, 1]])
  new_cand_seqs = tf.concat([cand_seqs_pad, beam_seqs_EOS], 0)
  EOS_probs = tf.slice(total_probs, [0, nlc_data.EOS_ID], [batch_size, 1])
  new_cand_probs = tf.concat([cand_probs, tf.reshape(EOS_probs, [-1])], 0)

  cand_k = tf.minimum(tf.size(new_cand_probs), self.beam_size)
  next_cand_probs, next_cand_indices = tf.nn.top_k(new_cand_probs, k=cand_k)
  next_cand_seqs = tf.gather(new_cand_seqs, next_cand_indices)

  return [time + 1, next_beam_probs, next_beam_seqs,
          next_cand_probs, next_cand_seqs] + next_states
def highway(input_, size, layer_size=1, bias=-2, f=tf.nn.relu):
  """Highway Network (cf. http://arxiv.org/abs/1505.00387).

  t = sigmoid(Wy + b)
  z = t * g(Wy + b) + (1 - t) * y
  where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
  """
  output = input_
  for idx in range(layer_size):
    with tf.variable_scope('output_lin_%d' % idx):
      output = f(core_rnn_cell_impl._linear(output, size, 0))
    with tf.variable_scope('transform_lin_%d' % idx):
      transform_gate = tf.sigmoid(
          core_rnn_cell_impl._linear(input_, size, 0) + bias)
      carry_gate = 1. - transform_gate
    output = transform_gate * output + carry_gate * input_
  return output
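# A minimal usage sketch for the highway() helper above: stacking two highway
# layers on a 2-D feature tensor. The placeholder name (char_features) and the
# feature size of 300 are illustrative assumptions; note that `size` must match
# the input width, since each layer mixes its output back with input_.
char_features = tf.placeholder(tf.float32, [None, 300])  # hypothetical input
with tf.variable_scope("highway"):
  hw_out = highway(char_features, size=300, layer_size=2, bias=-2, f=tf.nn.relu)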
def __call__(self, inputs, state, scope=None): """GRU with attention.""" with tf.variable_scope(scope or 'attention_cell_wrapper'): output, _ = self._cell(inputs, state) att = _linear([output, self._attn_vec], self.output_size, bias=True) output = output * tf.sigmoid(att) return output, output
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with vs.variable_scope(scope or type(self).__name__): # Parameters of gates are concatenated into one multiply for efficiency. c, h = state concat = _linear([inputs, h], 4 * self._num_units, True) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) new_c = (c * sigmoid(f) + sigmoid(i) * self._activation(j)) new_h = self._activation(new_c) * sigmoid(o) new_state = (new_c, new_h) return new_h, new_state
def body(previous_finished, time_step, previous_state, running_output,
         running_state, ponder_steps, remainders, running_p_sum):
  current_inputs = tf.where(tf.equal(time_step, 1), inputs_and_one, inputs_and_zero)
  current_output, current_state = self._cell(current_inputs, previous_state)

  if state_is_tuple:
    joint_current_state = tf.concat(current_state, 1)
  else:
    joint_current_state = current_state

  current_h = tf.nn.sigmoid(
      tf.squeeze(_linear([joint_current_state], 1, True, self._init_halting_bias), 1))
  current_h_sum = running_p_sum + current_h

  limit_condition = time_step >= self._ponder_limit
  halting_condition = current_h_sum >= 1.0 - self._epsilon
  current_finished = tf.logical_or(halting_condition, limit_condition)
  just_finished = tf.logical_xor(current_finished, previous_finished)

  current_p = tf.where(current_finished, 1.0 - running_p_sum, current_h)
  expanded_current_p = tf.expand_dims(current_p, 1)

  running_output += expanded_current_p * current_output
  if state_is_tuple:
    running_state += tf.expand_dims(expanded_current_p, 0) * current_state
  else:
    running_state += expanded_current_p * current_state

  ponder_steps = tf.where(just_finished, tf.fill([batch_size], time_step), ponder_steps)
  remainders = tf.where(just_finished, current_p, remainders)
  running_p_sum += current_p

  return (current_finished, time_step + 1, current_state, running_output,
          running_state, ponder_steps, remainders, running_p_sum)
def __call__(self, inputs, state, scope=None):
  gru_out, gru_state = super(GRUCellAttn, self).__call__(inputs, state, scope)
  with tf.variable_scope(scope or type(self).__name__):
    with tf.variable_scope("Attn2"):
      gamma_h = tf.tanh(
          core_rnn_cell_impl._linear(gru_out, self._num_units, True, 1.0))
      weights = tf.reduce_sum(self.phi_hs * gamma_h,
                              reduction_indices=2, keep_dims=True)
      weights = tf.exp(
          weights - tf.reduce_max(weights, reduction_indices=0, keep_dims=True))
      weights = weights / (1e-6 + tf.reduce_sum(
          weights, reduction_indices=0, keep_dims=True))
      context = tf.reduce_sum(self.hs * weights, reduction_indices=0)
    with tf.variable_scope("AttnConcat"):
      out = tf.nn.relu(
          core_rnn_cell_impl._linear([context, gru_out], self._num_units, True, 1.0))
    self.attn_map = tf.squeeze(tf.slice(weights, [0, 0, 0], [-1, -1, 1]))
    return (out, out)
def __call__(self, inputs, state, scope=None): """Long short-term memory cell (LSTM).""" with _checked_scope(self, scope or "basic_lstm_cell", reuse=self._reuse): # Parameters of gates are concatenated into one multiply for efficiency. c, h = state # i = input_gate, j = new_input, f = forget_gate, o = output_gate concat = _linear([inputs, h], 4 * self._num_units, True) i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + \ tf.nn.sigmoid(i) * self._activation(j)) new_h = self._activation(new_c) * tf.nn.sigmoid(o) new_state = (new_c, new_h) return new_h, new_state
def attention(query):
  """Put attention masks on hidden using hidden_features and query."""
  attn_weights = []
  ds = []  # Results of attention reads will be stored here.
  for i in xrange(num_heads):
    with variable_scope.variable_scope("Attention_%d" % i):
      y = rnn_cell._linear(query, attention_vec_size, True)
      y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
      # Attention mask is a softmax of v^T * tanh(...).
      s = math_ops.reduce_sum(
          v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
      a = nn_ops.softmax(s)
      attn_weights.append(a)
      # Now calculate the attention-weighted vector d.
      d = math_ops.reduce_sum(
          array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
      ds.append(array_ops.reshape(d, [-1, attn_size]))
  return attn_weights, ds
def setup_loss(self):
  with tf.variable_scope("Logistic"):
    doshape = tf.shape(self.decoder_output)
    T, batch_size = doshape[0], doshape[1]
    do2d = tf.reshape(self.decoder_output, [-1, self.size])
    logits2d = core_rnn_cell_impl._linear(do2d, self.vocab_size, True, 1.0)
    outputs2d = tf.nn.log_softmax(logits2d)
    self.outputs = tf.reshape(outputs2d, tf.stack([T, batch_size, self.vocab_size]))

    targets_no_GO = tf.slice(self.target_tokens, [1, 0], [-1, -1])
    masks_no_GO = tf.slice(self.target_mask, [1, 0], [-1, -1])
    # easier to pad target/mask than to split decoder input since
    # tensorflow does not support negative indexing
    labels1d = tf.reshape(tf.pad(targets_no_GO, [[0, 1], [0, 0]]), [-1])
    mask1d = tf.reshape(tf.pad(masks_no_GO, [[0, 1], [0, 0]]), [-1])
    losses1d = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits2d, labels=labels1d) * tf.to_float(mask1d)
    losses2d = tf.reshape(losses1d, tf.stack([T, batch_size]))
    self.losses = tf.reduce_sum(losses2d) / tf.to_float(batch_size)
def downscale(self, inp, mask):
  # return inp, mask
  with tf.variable_scope("Downscale"):
    inshape = tf.shape(inp)
    T, batch_size, dim = inshape[0], inshape[1], inshape[2]
    inp2d = tf.reshape(tf.transpose(inp, perm=[1, 0, 2]), [-1, 2 * self.size])
    out2d = core_rnn_cell_impl._linear(inp2d, self.size, True, 1.0)
    out3d = tf.reshape(out2d, tf.stack((batch_size, tf.to_int32(T / 2), dim)))
    out3d = tf.transpose(out3d, perm=[1, 0, 2])
    out3d.set_shape([None, None, self.size])
    out = tf.tanh(out3d)

    mask = tf.transpose(mask)
    mask = tf.reshape(mask, [-1, 2])
    mask = tf.cast(mask, tf.bool)
    mask = tf.reduce_any(mask, reduction_indices=1)
    mask = tf.to_int32(mask)
    mask = tf.reshape(mask, tf.stack([batch_size, -1]))
    mask = tf.transpose(mask)
  return out, mask
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
  if use_attention:
    print('Use the attention RNN model')
    if num_heads < 1:
      raise ValueError("With less than 1 heads, use a non-attention decoder.")

    with variable_scope.variable_scope(scope or "attention_RNN"):
      output_size = encoder_outputs[0].get_shape()[1].value
      top_states = [array_ops.reshape(e, [-1, 1, output_size])
                    for e in encoder_outputs]
      attention_states = array_ops.concat(axis=1, values=top_states)
      if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

      batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value

      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

      def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        attn_weights = []
        ds = []  # Results of attention reads will be stored here.
        for i in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return attn_weights, ds

      batch_attn_size = array_ops.stack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]
      for a in attns:
        # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])

      # loop through the encoder_outputs
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      for i in xrange(len(encoder_outputs)):
        if i > 0:
          variable_scope.get_variable_scope().reuse_variables()
        if i == 0:
          with variable_scope.variable_scope("Initial_Decoder_Attention"):
            initial_state = rnn_cell._linear(encoder_state, output_size, True)
          attn_weights, ds = attention(initial_state)
        else:
          attn_weights, ds = attention(encoder_outputs[i])
        # NOTE: here we temporarily assume num_head = 1
        output = array_ops.concat(axis=1, values=[ds[0], encoder_outputs[i]])
        with variable_scope.variable_scope("AttnRnnOutputProjection"):
          logit = rnn_cell._linear(output, num_decoder_symbols, True)
        attention_encoder_outputs.append(logit)
        # NOTE: here we temporarily assume num_head = 1
        sequence_attention_weights.append(attn_weights[0])
        # NOTE: here we temporarily assume num_head = 1
  else:
    print('Use the NON attention RNN model')
    with variable_scope.variable_scope(scope or "non-attention_RNN"):
      attention_encoder_outputs = list()
      sequence_attention_weights = list()

      # copy over logits once out of sequence_length
      if encoder_outputs[0].get_shape().ndims != 1:
        (fixed_batch_size, output_size) = encoder_outputs[0].get_shape().with_rank(2)
      else:
        fixed_batch_size = encoder_outputs[0].get_shape().with_rank_at_least(1)[0]

      if fixed_batch_size.value:
        batch_size = fixed_batch_size.value
      else:
        batch_size = array_ops.shape(encoder_outputs[0])[0]
      if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length is not None:
        # Prepare variables
        zero_logit = array_ops.zeros(
            array_ops.stack([batch_size, num_decoder_symbols]),
            encoder_outputs[0].dtype)
        zero_logit.set_shape(
            tensor_shape.TensorShape([fixed_batch_size.value, num_decoder_symbols]))
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)

      for time, input_ in enumerate(encoder_outputs):
        if time > 0:
          variable_scope.get_variable_scope().reuse_variables()
        # pylint: disable=cell-var-from-loop
        # call_cell = lambda: cell(input_, state)
        generate_logit = lambda: rnn_cell._linear(
            encoder_outputs[time], num_decoder_symbols, True)
        # pylint: enable=cell-var-from-loop
        if sequence_length is not None:
          logit = _step(
              time, sequence_length, min_sequence_length, max_sequence_length,
              zero_logit, generate_logit)
        else:
          logit = generate_logit()
        attention_encoder_outputs.append(logit)

  return attention_encoder_outputs, sequence_attention_weights
def __call__(self, inputs, state, scope=None): """Run one step of simplified LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: This must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: VariableScope for the created subgraph; defaults to "simplified_lstm_cell". Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ num_proj = self._num_units if self._num_proj is None else self._num_proj (c_prev, m_prev) = state dtype = inputs.dtype input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError( "Could not infer input size from inputs.get_shape()[-1]") with vs.variable_scope(scope or "simplified_lstm_cell", initializer=self._initializer) as unit_scope: n_eqs = 3 if self._architecture in ['NOG', 'NFG', 'NIG', 'CIFG' ] else 4 lstm_matrix = _linear([inputs, m_prev], n_eqs * self._num_units, bias=True, scope=scope) if self._architecture == 'NOG': i, j, f = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1) elif self._architecture in ['NFG', 'CIFG']: i, j, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1) elif self._architecture == 'NIG': j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1) else: i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1) # Diagonal connections if self._use_peepholes: with vs.variable_scope(unit_scope) as projection_scope: if self._num_unit_shards is not None: projection_scope.set_partitioner(None) if self._architecture not in ['NFG', 'CIFG']: w_f_diag = vs.get_variable("w_f_diag", shape=[self._num_units], dtype=dtype) if self._architecture != 'NIG': w_i_diag = vs.get_variable("w_i_diag", shape=[self._num_units], dtype=dtype) if self._architecture != 'NOG': w_o_diag = vs.get_variable("w_o_diag", shape=[self._num_units], dtype=dtype) if self._use_peepholes: if self._architecture == 'NIG': c = sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev + self._activation(j) elif self._architecture == 'NFG': c = c_prev + sigmoid(i + w_i_diag * c_prev) * self._activation(j) elif self._architecture == 'NIAF': c = sigmoid(f + self._forget_bias + w_f_diag * c_prev ) * c_prev + sigmoid(i + w_i_diag * c_prev) * j elif self._architecture == 'CIFG': _i = sigmoid(i + w_i_diag * c_prev) c = (1 - _i) * c_prev + _i * self._activation(j) else: c = sigmoid(f + self._forget_bias + w_f_diag * c_prev ) * c_prev + sigmoid(i + w_i_diag * c_prev ) * self._activation(j) else: if self._architecture == 'NIG': c = sigmoid( f + self._forget_bias) * c_prev + self._activation(j) elif self._architecture == 'NFG': c = c_prev + sigmoid(i) * self._activation(j) elif self._architecture == 'NIAF': c = sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * j elif self._architecture == 'CIFG': _i = sigmoid(i) c = (1 - _i) * c_prev + _i * self._activation(j) else: c = sigmoid(f + self._forget_bias) * c_prev + sigmoid( i) * self._activation(j) if self._cell_clip is not None: # pylint: disable=invalid-unary-operand-type c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip) # pylint: enable=invalid-unary-operand-type if 
self._use_peepholes: if self._architecture == 'NOG': m = self._activation(c) elif self._architecture == 'NOAF': m = sigmoid(o + w_o_diag * c) * c else: m = sigmoid(o + w_o_diag * c) * self._activation(c) else: if self._architecture == 'NOG': m = self._activation(c) elif self._architecture == 'NOAF': m = sigmoid(o) * c else: m = sigmoid(o) * self._activation(c) if self._num_proj is not None: with vs.variable_scope("projection") as proj_scope: m = _linear(m, self._num_proj, bias=False, scope=scope) if self._proj_clip is not None: # pylint: disable=invalid-unary-operand-type m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip) # pylint: enable=invalid-unary-operand-type new_state = LSTMStateTuple(c, m) return m, new_state
def __call__(self, inputs, state, scope=None): """Run one step of LSTM. Args: inputs: input Tensor, 2D, batch x num_units. state: if `state_is_tuple` is False, this must be a state Tensor, `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a tuple of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`. scope: VariableScope for the created subgraph; defaults to "lstm_cell". Returns: A tuple containing: - A `2-D, [batch x output_dim]`, Tensor representing the output of the LSTM after reading `inputs` when previous state was `state`. Here output_dim is: num_proj if num_proj was set, num_units otherwise. - Tensor(s) representing the new state of LSTM after reading `inputs` when the previous state was `state`. Same type and shape(s) as `state`. Raises: ValueError: If input size cannot be inferred from inputs via static shape inference. """ num_proj = self._num_units if self._num_proj is None else self._num_proj if self._state_is_tuple: (c_prev, m_prev) = state else: c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units]) m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj]) dtype = inputs.dtype input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from inputs.get_shape()[-1]") with _checked_scope(self, scope or "lstm_cell", initializer=self._initializer, reuse=self._reuse) as unit_scope: if self._num_unit_shards is not None: unit_scope.set_partitioner( partitioned_variables.fixed_size_partitioner( self._num_unit_shards)) # i = input_gate, j = new_input, f = forget_gate, o = output_gate lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True) i, j, f, o = array_ops.split( value=lstm_matrix, num_or_size_splits=4, axis=1) # Diagonal connections if self._use_peepholes: with vs.variable_scope(unit_scope) as projection_scope: if self._num_unit_shards is not None: projection_scope.set_partitioner(None) w_f_diag = vs.get_variable( "w_f_diag", shape=[self._num_units], dtype=dtype) w_i_diag = vs.get_variable( "w_i_diag", shape=[self._num_units], dtype=dtype) w_o_diag = vs.get_variable( "w_o_diag", shape=[self._num_units], dtype=dtype) if self._use_peepholes: c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev + sigmoid(i + w_i_diag * c_prev) * self._activation(j)) else: c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * self._activation(j)) if self._cell_clip is not None: # pylint: disable=invalid-unary-operand-type c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip) # pylint: enable=invalid-unary-operand-type if self._use_peepholes: m = sigmoid(o + w_o_diag * c) * self._activation(c) else: m = sigmoid(o) * self._activation(c) if self._num_proj is not None: with vs.variable_scope("projection") as proj_scope: if self._num_proj_shards is not None: proj_scope.set_partitioner( partitioned_variables.fixed_size_partitioner( self._num_proj_shards)) m = _linear(m, self._num_proj, bias=False) if self._proj_clip is not None: # pylint: disable=invalid-unary-operand-type m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip) # pylint: enable=invalid-unary-operand-type new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else array_ops.concat([c, m], 1)) return m, new_state
def __call__(self, inputs, state, scope=None): """Run one step of TLSTM. """ sigmoid = math_ops.sigmoid tanh = math_ops.tanh (c_prev, m_prev) = state dtype = inputs.dtype input_size = inputs.get_shape().with_rank(2)[1] if input_size.value is None: raise ValueError("Could not infer input size from inputs.get_shape()[-1]") batch_size, feature_size = inputs.get_shape().as_list() feature_size = feature_size - 1 seq = tf.slice(inputs, begin=[0, 0], size=[batch_size, feature_size]) delta_t = tf.slice(inputs, begin=[0, 48], size=[batch_size, 1]) scope = scope or vs.get_variable_scope() with vs.variable_scope(scope, initializer=self._initializer) as unit_scope: # i = input_gate, j = new_input, f = forget_gate, o = output_gate lstm_matrix = _linear([seq, m_prev], output_size=4 * self._num_units, bias=True) # Time gate with vs.variable_scope(unit_scope) as time_gate_scope: w_t1 = vs.get_variable( "w_t1", shape=[1, self._num_units], dtype=dtype) bias_t1 = vs.get_variable( "bias_t1", [self._num_units], dtype=dtype, initializer=init_ops.constant_initializer(0.0, dtype=dtype)) w_tx1 = vs.get_variable( "w_tx1", shape=[feature_size, self._num_units], dtype=dtype) w_tx2 = vs.get_variable( "w_tx2", shape=[feature_size, self._num_units], dtype=dtype) w_t2 = vs.get_variable( "w_t2", shape=[1, self._num_units], dtype=dtype) bias_t2 = vs.get_variable( "bias_t2", [self._num_units], dtype=dtype, initializer=init_ops.constant_initializer(0.0, dtype=dtype)) w_to = vs.get_variable( "w_to", shape=[1, self._num_units], dtype=dtype) w_t1_with_constraint = tf.minimum(w_t1, 0) t1_act = (self._activation(math_ops.matmul(delta_t, w_t1_with_constraint)) + math_ops.matmul(seq, w_tx1) + bias_t1) t2_act = (self._activation(math_ops.matmul(delta_t, w_t2)) + math_ops.matmul(seq, w_tx2) + bias_t2) t1 = sigmoid(t1_act) t2 = sigmoid(t2_act) i, j, f, o = array_ops.split( value=lstm_matrix, num_or_size_splits=4, axis=1) # Diagonal connections if self._use_peepholes: with vs.variable_scope(unit_scope) as projection_scope: w_f_diag = vs.get_variable( "w_f_diag", shape=[self._num_units], dtype=dtype) w_i_diag = vs.get_variable( "w_i_diag", shape=[self._num_units], dtype=dtype) w_o_diag = vs.get_variable( "w_o_diag", shape=[self._num_units], dtype=dtype) if self._use_peepholes: c_hat = ((1 - sigmoid(i + w_i_diag * c_prev)*t1) * c_prev + sigmoid(i + w_i_diag * c_prev)*t1 * self._activation(j)) c = ((1 - sigmoid(i + w_i_diag * c_prev)) * c_prev + sigmoid(i + w_i_diag * c_prev)*t2 * self._activation(j)) else: c_hat = ((1 - sigmoid(i)) * c_prev + sigmoid(i + w_i_diag * c_prev)*t1 * self._activation(j)) c = ((1 - sigmoid(i)) * c_prev + sigmoid(i + w_i_diag * c_prev)*t2 * self._activation(j)) if self._cell_clip is not None: c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip) if self._use_peepholes: m = (sigmoid(o + math_ops.matmul(delta_t, w_to) + w_o_diag * c) * self._activation(c_hat)) else: m = sigmoid(o + math_ops.matmul(delta_t, w_to)) * self._activation(c_hat) new_state = (LSTMStateTuple(c, m)) return m, new_state
def pointer_decoder(decoder_inputs, initial_state, attention_states, cell,
                    feed_prev=True, dtype=dtypes.float32, scope=None):
  """RNN decoder with pointer net for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "pointer_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (the i-th element of decoder_inputs).
      First, we run the cell on a combination of the input and previous
      attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())

  with vs.variable_scope(scope or "point_decoder"):
    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    input_size = decoder_inputs[0].get_shape()[1].value
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = array_ops.reshape(
        attention_states, [-1, attn_length, 1, attn_size])

    attention_vec_size = attn_size  # Size of query vectors for attention.
    k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
    hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
    v = vs.get_variable("AttnV", [attention_vec_size])

    states = [initial_state]

    def attention(query):
      """Point on hidden using hidden_features and query."""
      with vs.variable_scope("Attention"):
        y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
        y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
        # Attention mask is a softmax of v^T * tanh(...).
        s = math_ops.reduce_sum(
            v * math_ops.tanh(hidden_features + y), [2, 3])
        return s

    outputs = []
    prev = None
    batch_attn_size = array_ops.stack([batch_size, attn_size])
    attns = array_ops.zeros(batch_attn_size, dtype=dtype)
    attns.set_shape([None, attn_size])
    inps = []
    for i in range(len(decoder_inputs)):
      if i > 0:
        vs.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]

      if feed_prev and i > 0:
        inp = tf.stack(decoder_inputs)
        inp = tf.transpose(inp, perm=[1, 0, 2])
        inp = tf.reshape(inp, [-1, attn_length, input_size])
        inp = tf.reduce_sum(
            inp * tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
        inp = tf.stop_gradient(inp)
        inps.append(inp)

      # Use the same inputs in inference, order internally.
      # Merge input and previous attentions into one vector of the right size.
      x = core_rnn_cell_impl._linear([inp, attns], cell.output_size, True)

      # Run the RNN.
      cell_output, new_state = cell(x, states[-1])
      states.append(new_state)

      # Run the attention mechanism.
      output = attention(new_state)
      outputs.append(output)

  return outputs, states, inps
def __call__(self, inputs, state, scope=None): """ Phased long short-term memory cell (P-LSTM).""" with vs.variable_scope(scope or type(self).__name__): # Parameters of gates are concatenated into one multiply for efficiency. c_prev, h_prev = state # (batch_size, seq_len, 2) # NB: here we explicitly give t as input. x = tf.reshape(inputs[:, 0], (-1, 1)) t = inputs[:, 1][ -1] # Now we only accept one id. We have a batch so it's a bit more complex. # maybe the information should come from the outside. To be defined later. concat = _linear([x, h_prev], 4 * self._num_units, True) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1) dtype = inputs.dtype tau = vs.get_variable('tau', shape=[self._num_units], initializer=random_exp_initializer( 0, self.tau_init), dtype=dtype) r_on = vs.get_variable('r_on', shape=[self._num_units], initializer=init_ops.constant_initializer( self.r_on_init), dtype=dtype) s = vs.get_variable( 's', shape=[self._num_units], initializer=init_ops.random_uniform_initializer( 0., tau.initialized_value()), dtype=dtype) times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units]) phase = phi(times, s, tau) kappa = time_gate_fast(phase, r_on, self._leak_rate, self._training_phase) w_o_peephole = None if self._use_peepholes: w_i_peephole = vs.get_variable('W_I_peephole', shape=[self._num_units], dtype=dtype) w_f_peephole = vs.get_variable('W_F_peephole', shape=[self._num_units], dtype=dtype) w_o_peephole = vs.get_variable('W_O_peephole', shape=[self._num_units], dtype=dtype) f += w_f_peephole * c_prev i += w_i_peephole * c_prev new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation( j) if self._use_peepholes: o += w_o_peephole * new_c_tilde new_h_tilde = sigmoid(o) * self._activation(new_c_tilde) """ Hi all, Yes, Philippe, you are correct in that Equation 4 should reference c_tilde and not c. I can add a point to the paper to mention that, and will update Figure 1 so the line is correctly drawn to c_tilde instead. The intuition here is that the gates should be blind to the effect of the khronos gate; input, forget and output gate should all operate as if the cell were a normal LSTM cell, while the khronos gate allows it to either operate or not operate (and then linearly interpolates between these two states). If the output gate is influenced by the khronos gate (if the peepholes reference c instead of c_tilde), then the PLSTM would no longer be a gated LSTM cell, but somehow be self-dependent on the time gate's actual operation. I think everyone's right in that it wouldn't influence much -- but it should be updated in the paper. Thanks very much for pointing out the issue, Philippe! -Danny""" # Apply Khronos gate new_h = kappa * new_h_tilde + (1 - kappa) * h_prev new_c = kappa * new_c_tilde + (1 - kappa) * c_prev new_state = (new_c, new_h) return new_h, new_state