def __call__(self, inputs, state, past_inputs=None, past_states=None, scope=None):
  '''This is a modified GRU that can incorporate an additional past input and/or an
  additional past state. Very useful for skip-connecting RNNs horizontally or vertically.'''
  with tf.device("/gpu:" + str(self._gpu_for_layer)):
    if past_inputs is not None and past_states is None:
      with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate, update gate, and past-input gate.
          r, u, pr = tf.split(1, 3, lfe.enhanced_linear(
              [inputs, state, past_inputs], 3 * self._num_units, True, 1.0,
              weight_initializer=self._weight_initializer,
              orthogonal_scale_factor=self._orthogonal_scale_factor))
          r, u, pr = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr)
        with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
          c = tf.tanh(linear.linear([inputs, r * state, pr * state], self._num_units, True))
        new_h = u * state + (1 - u) * c
      return new_h, new_h
    elif past_states is not None and past_inputs is None:
      with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate, update gate, and past-state gate.
          r, u, ps = tf.split(1, 3, lfe.enhanced_linear(
              [inputs, state, past_states], 3 * self._num_units, True, 1.0,
              weight_initializer=self._weight_initializer,
              orthogonal_scale_factor=self._orthogonal_scale_factor))
          r, u, ps = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(ps)
        with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
          c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
        new_h = u * state + (1 - u) * c + (1 - ps) * c
      return new_h, new_h
    elif past_states is not None and past_inputs is not None:
      with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate, update gate, past-input gate, and past-state gate.
          r, u, pr, ps = tf.split(1, 4, lfe.enhanced_linear(
              [inputs, state, past_inputs, past_states], 4 * self._num_units, True, 1.0,
              weight_initializer=self._weight_initializer,
              orthogonal_scale_factor=self._orthogonal_scale_factor))
          r, u, pr, ps = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr), tf.sigmoid(ps)
        with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
          c = tf.tanh(linear.linear([inputs, r * state, pr * state], self._num_units, True))
        new_h = u * state + ps * past_states + (1 - u) * c + (1 - ps) * c
      return new_h, new_h
    else:
      with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
          # We start with bias of 1.0 to not reset and not update.
          r, u = tf.split(1, 2, lfe.enhanced_linear(
              [inputs, state], 2 * self._num_units, True, 1.0,
              weight_initializer=self._weight_initializer,
              orthogonal_scale_factor=self._orthogonal_scale_factor))
          r, u = tf.sigmoid(r), tf.sigmoid(u)
        with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
          # Notice they have the activation/non-linearity right here!
          c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
        new_h = u * state + (1 - u) * c
      return new_h, new_h
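# A minimal unrolling sketch, not from the repo: it assumes `cell` is an instance of the
# modified GRU class above, that inputs_list is a python list of [batch_size x input_size]
# tensors, and that `skip` (a hypothetical parameter) is how many steps back the
# skip-connection reaches. Early steps feed zeros so every step takes the same
# past_inputs branch and variable shapes stay consistent under reuse.
def unroll_with_input_skip(cell, inputs_list, initial_state, skip=2):
  state = initial_state
  outputs = []
  for t, inp in enumerate(inputs_list):
    if t > 0:
      tf.get_variable_scope().reuse_variables()
    past_inp = inputs_list[t - skip] if t >= skip else tf.zeros_like(inp)
    output, state = cell(inp, state, past_inputs=past_inp)
    outputs.append(output)
  return outputs, state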
def __call__(self, inputs, state, scope=None):
  '''Modifying skip connections part -- added an additional input.
  Nick, in the future, you can also add an additional hidden value input as well!'''
  with tf.device("/gpu:" + str(self._gpu_for_layer)):
    if self._skip_connections:
      with tf.variable_scope("Skip_Connections"):
        timestep_counter.assign(timestep_counter + 1)  # add one to timestep counter
        print('for testing, you added one to the timestep_counter')
        # Note: this is a Python-level comparison on a tensor (development scaffolding);
        # it will not behave like a graph-mode conditional.
        if tf.add_n(previous_inputs) == 0:
          if previous_inputs.shape == 1:
            previous_inputs.assign(tf.zeros(tf.shape(inputs)))
        '''you have modified the gru network to incorporate the previous inputs'''
        with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
          with tf.variable_scope("Gates"):  # Reset gate, update gate, and previous-input gate.
            # We start with bias of 1.0 to not reset and not update.
            r, u, pr = tf.split(1, 3, lfe.enhanced_linear(
                [inputs, state, previous_inputs], 3 * self._num_units, True, 1.0,
                weight_initializer=self._weight_initializer,
                orthogonal_scale_factor=self._orthogonal_scale_factor))
            r, u, pr = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr)
          with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
            # Notice they have the activation/non-linearity right here!
            c = tf.tanh(linear.linear([inputs, r * state, pr * state], self._num_units, True))
          new_h = u * state + (1 - u) * c
          '''need to update inputs if they are available'''
          if timestep_counter / skip_neuron_number == 0:
            previous_inputs.assign(inputs)
            print('you changed the previous inputs')
            # previous_hidden_states.assign(new_h)  # only activate if you need this
          return new_h, new_h
    else:
      """Normal gated recurrent unit (GRU) with num_units cells."""
      with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):  # Reset gate and update gate.
          # We start with bias of 1.0 to not reset and not update.
          r, u = tf.split(1, 2, lfe.enhanced_linear(
              [inputs, state], 2 * self._num_units, True, 1.0,
              weight_initializer=self._weight_initializer,
              orthogonal_scale_factor=self._orthogonal_scale_factor))
          r, u = tf.sigmoid(r), tf.sigmoid(u)
        with tf.variable_scope("Candidate"):  # you need a different scope because you're doing a new linear
          # Notice they have the activation/non-linearity right here!
          c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
        new_h = u * state + (1 - u) * c
      return new_h, new_h
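# Hypothetical module-level bookkeeping that the skip-connection branch above appears to
# rely on -- timestep_counter, previous_inputs, and skip_neuron_number are never defined
# in this snippet, so the names, shapes, and skip interval below are assumptions only:
batch_size, input_size = 32, 128          # example sizes only
skip_neuron_number = 4                    # re-inject the saved inputs every 4 steps (assumed)
timestep_counter = tf.Variable(0, trainable=False, name="timestep_counter")
previous_inputs = tf.Variable(tf.zeros([batch_size, input_size]),
                              trainable=False, name="previous_inputs")
# Keep in mind that .assign(...) only returns an op; in graph mode it has to be run
# (e.g. via session.run or tf.control_dependencies) before the update takes effect.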
def __call__(self, inputs, state, scope=None):
  """Run the cell and output projection on inputs, starting from state."""
  output, res_state = self._cell(inputs, state)
  # Default scope: "OutputProjectionWrapper"
  with tf.variable_scope(scope or type(self).__name__):
    projected = linear.linear(output, self._output_size, True)
  return projected, res_state
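# A minimal usage sketch, not from the repo: it assumes the class this __call__ belongs to
# mirrors the old rnn_cell.OutputProjectionWrapper API (wrap a cell, project its outputs);
# the module path, cell size, and vocabulary size below are illustrative only.
from tensorflow.models.rnn import rnn_cell
cell = rnn_cell.GRUCell(512)
projected_cell = rnn_cell.OutputProjectionWrapper(cell, 10000)   # e.g. vocabulary size
inputs = tf.placeholder(tf.float32, [None, 512])
state = projected_cell.zero_state(32, tf.float32)
output, state = projected_cell(inputs, state)                    # output: [batch_size x 10000]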
def __call__(self, inputs, state, scope=None):
  """JZS3, mutant 3 with num_units cells."""
  with tf.device("/gpu:" + str(self._gpu_for_layer)):
    with tf.variable_scope(scope or type(self).__name__):  # "JZS3Cell"
      with tf.variable_scope("Zinput"):
        # We start with bias of 1.0 to not reset and not update.
        '''equation 1'''
        z = tf.sigmoid(lfe.enhanced_linear(
            [inputs, tf.tanh(state)], self._num_units, True, 1.0,
            weight_initializer=self._weight_initializer,
            orthogonal_scale_factor=self._orthogonal_scale_factor))
      '''equation 2'''
      with tf.variable_scope("Rinput"):
        r = tf.sigmoid(lfe.enhanced_linear(
            [inputs, state], self._num_units, True, 1.0,
            weight_initializer=self._weight_initializer,
            orthogonal_scale_factor=self._orthogonal_scale_factor))
      '''equation 3'''
      with tf.variable_scope("Candidate"):
        component_0 = linear.linear([state * r, inputs], self._num_units, True)
        component_2 = tf.tanh(component_0) * z
        component_3 = state * (1 - z)
      h_t = component_2 + component_3
    return h_t, h_t  # there is only one hidden state output to keep track of.
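# For reference, my reading of the branch above (a reconstruction from the code, not from
# the repo's docs): it appears to implement MUT3 ("JZS3") from Jozefowicz et al. 2015, i.e.
#   z     = sigm(Wxz*Xt + Whz*tanh(h_t) + Bz)                      -- equation 1
#   r     = sigm(Wxr*Xt + Whr*h_t + Br)                            -- equation 2
#   h_t+1 = tanh(Whh*(r*h_t) + Wxh*Xt + Bh)*z + h_t*(1 - z)        -- equation 3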
def __call__(self, inputs, state, scope=None):
  """JZS1, mutant 1 with n units cells."""
  with tf.device("/gpu:" + str(self._gpu_for_layer)):
    with tf.variable_scope(scope or type(self).__name__):  # "JZS1Cell"
      with tf.variable_scope("Zinput"):
        # We start with bias of 1.0 to not reset and not update.
        '''equation 1 z = sigm(WxzXt+Bz), x_t is inputs'''
        z = tf.sigmoid(lfe.enhanced_linear(
            [inputs], self._num_units, True, 1.0,
            weight_initializer=self._weight_initializer,
            orthogonal_scale_factor=self._orthogonal_scale_factor))
      with tf.variable_scope("Rinput"):
        '''equation 2 r = sigm(WxrXt+Whrht+Br), h_t is the previous state'''
        r = tf.sigmoid(lfe.enhanced_linear(
            [inputs, state], self._num_units, True, 1.0,
            weight_initializer=self._weight_initializer,
            orthogonal_scale_factor=self._orthogonal_scale_factor))
      '''equation 3'''
      with tf.variable_scope("Candidate"):
        component_0 = linear.linear([r * state], self._num_units, True)
        component_1 = tf.tanh(tf.tanh(inputs) + component_0)
        component_2 = component_1 * z
        component_3 = state * (1 - z)
      h_t = component_2 + component_3
    return h_t, h_t  # there is only one hidden state output to keep track of.
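# For reference, my reading of the branch above (a reconstruction from the code, not from
# the repo's docs): it appears to implement MUT1 ("JZS1") from Jozefowicz et al. 2015, i.e.
#   z     = sigm(Wxz*Xt + Bz)                                      -- equation 1
#   r     = sigm(Wxr*Xt + Whr*h_t + Br)                            -- equation 2
#   h_t+1 = tanh(Whh*(r*h_t) + tanh(Xt) + Bh)*z + h_t*(1 - z)      -- equation 3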
def attention(query):
  # This is part of the attention_decoder. It is placed outside to avoid re-compile time.
  """Put attention masks on hidden using hidden_features and query."""
  ds = []  # Results of attention reads will be stored here.
  for a in xrange(num_heads):
    with tf.variable_scope("Attention_%d" % a):
      y = linear.linear(query, attention_vec_size, True)
      y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
      # Attention mask is a softmax of v^T * tanh(...).
      s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
      a = tf.nn.softmax(s)
      # Now calculate the attention-weighted vector d.
      d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
      ds.append(tf.reshape(d, [-1, attn_size]))
  return ds
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=tf.float32, scope=None, average_states=False,
                      average_hidden_state_influence=0.5,
                      temperature_decode=False, temperature=1.0):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state).
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:2].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with tf.variable_scope(scope or "attention_decoder"):
    batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = tf.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
      hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

    states = [initial_state]

    def attention(query):
      # This is part of the attention_decoder. It is defined outside the time-step
      # loop to avoid re-compile time.
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % a):
          y = linear.linear(query, attention_vec_size, True)
          y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
          a = tf.nn.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
          ds.append(tf.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    wids = []
    prev = None
    batch_attn_size = tf.pack([batch_size, attn_size])
    attns = [tf.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])

    for i in xrange(len(decoder_inputs)):  # This loops over the decoding time-steps.
      if i > 0:
        tf.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]
      # Nick, you can implement sampling here by changing the input here!
      # Also curriculum learning too!
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with tf.variable_scope("loop_function", reuse=True):
          inp, wid = loop_function(prev, i,
                                   temperature_decode=temperature_decode,
                                   temperature=temperature)
          # Basically, stop_gradient doesn't allow inputs to be taken into account.
          wids.append(wid)
      # This will make an input that is combined with attention.
      # Merge input and previous attentions into one vector of the right size.
      x = linear.linear([inp] + attns, cell.input_size, True)

      hidden_state_input = states[-1]
      if average_states:
        '''implement averaging of states'''
        print('WARNING: YOU HAVE OPTED TO USE THE AVERAGING OF STATES!')
        hidden_state_input = average_hidden_states(states, average_hidden_state_influence)

      # Run the RNN.
      # Right here, you could potentially make the skip-connections: you would have to
      # save the output part here, and then transfer it to the next part.
      cell_output, new_state = cell(x, hidden_state_input)  # Nick, changed this to your hidden state input.
      states.append(new_state)
      # Run the attention mechanism.
      attns = attention(new_state)

      with tf.variable_scope("AttnOutputProjection"):
        output = linear.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = tf.stop_gradient(output)
      outputs.append(output)

    return outputs, states, wids
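# A minimal call sketch (the shapes, sizes, and the encoder producing these tensors are
# assumptions, not from the repo):
#   decoder_inputs    -- python list of [batch_size x cell.input_size] tensors
#   initial_state     -- [batch_size x cell.state_size], usually the encoder's final state
#   attention_states  -- [batch_size x attn_length x attn_size], the encoder outputs
outputs, states, wids = attention_decoder(decoder_inputs, encoder_final_state,
                                          attention_states, cell, num_heads=1)
# outputs[i] is [batch_size x output_size]; wids stays empty unless a loop_function is
# supplied (it collects whatever ids the loop_function returns at each step).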
def __call__(self, inputs, state, scope=None):
  """Run the input projection and then the cell."""
  # Default scope: "InputProjectionWrapper"
  with tf.variable_scope(scope or type(self).__name__):
    projected = linear.linear(inputs, self._cell.input_size, True)
  return self._cell(projected, state)
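# Companion sketch to the OutputProjectionWrapper example above, assuming `proj_cell` is an
# already-constructed instance of this wrapper (its constructor is not shown here, so no
# signature is assumed). One step simply projects the inputs and then runs the wrapped cell:
output, state = proj_cell(inputs, prev_state)
# equivalent to: self._cell(linear(inputs, self._cell.input_size), prev_state)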
def __call__(self, inputs, state, scope=None):
  with tf.device("/gpu:" + str(self._gpu_for_layer)):
    print('testing')
    with tf.variable_scope(scope or type(self).__name__):  # "UnitaryRNNCell"
      with tf.variable_scope("UnitaryGates"):  # Reset gate and update gate.
        '''just for the sake of consistency, we'll keep some var names the same as the authors'''
        n_hidden = self._num_units
        h_prev = state

        '''development nick version here'''
        step1 = unitary_linear.times_diag_tf(h_prev, n_hidden)  # this will create a diagonal tensor with given diagonal values
        # work on times_reflection next

        # NOTE: the next four lines are still the authors' Theano reference code
        # (T.*, dimshuffle) and have not been ported to TensorFlow yet.
        modulus = T.sqrt(lin_output_re ** 2 + lin_output_im ** 2)
        rescale = T.maximum(modulus + hidden_bias.dimshuffle('x', 0), 0.) / (modulus + 1e-5)
        nonlin_output_re = lin_output_re * rescale
        nonlin_output_im = lin_output_im * rescale

        h_t = tf.concat(1, [nonlin_output_re, nonlin_output_im])
        # keep in mind that you can use tf.complex to convert two numbers into a complex number -- this works for tensors!
        return h_t, h_t  # check if h_t is the same as the output?????

        # Everything below this return is unreachable development scratch work.
        '''list of complex number functions in tf
           1. tf.complex -- makes complex number
           2. tf.complex_abs -- finds the absolute value of the tensor
           3. tf.conj -- makes conjugate
           4. tf.imag -- returns imaginary part -- go back and forth between complex and imag
           5. tf.real -- returns real part'''
        # keep in mind that identity matrices are a form of diagonal matrices, but they just have ones.
        '''----------------------------end of unitary rnn cell--------------------------'''

        # We start with bias of 1.0 to not reset and not update.
        '''First, we will start with the hidden linear transform
        W = D3 R2 F^-1 D2 Perm R1 F D1.
        Keep in mind that originally the equation would be W = V D V*, but it leads to
        too much computation/memory: O(n^2).'''
        step1 = times_diag(h_prev, n_hidden, theta[0, :])
        step2 = step1
        # step2 = do_fft(step1, n_hidden)
        step3 = times_reflection(step2, n_hidden, reflection[0, :])
        step4 = vec_permutation(step3, n_hidden, index_permute)
        step5 = times_diag(step4, n_hidden, theta[1, :])
        step6 = step5
        # step6 = do_ifft(step5, n_hidden)
        step7 = times_reflection(step6, n_hidden, reflection[1, :])
        step8 = times_diag(step7, n_hidden, theta[2, :])
        step9 = scale_diag(step8, n_hidden, scale)
        hidden_lin_output = step9

        z = tf.sigmoid(linear.linear([inputs], self._num_units, True, 1.0))
        '''equation 2 r = sigm(WxrXt+Whrht+Br), h_t is the previous state'''
        r = tf.sigmoid(linear.linear([inputs, state], self._num_units, True, 1.0))
        '''equation 3'''
        with tf.variable_scope("Candidate"):
          component_0 = linear.linear([r * state], self._num_units, True)
          component_1 = tf.tanh(tf.tanh(inputs) + component_0)
          component_2 = component_1 * z
          component_3 = state * (1 - z)
        h_t = component_2 + component_3
        h_t = tf.concat(1, [nonlin_output_re, nonlin_output_im])  # I know here you need to concatenate the real and imaginary parts
        return h_t, h_t  # there is only one hidden state output to keep track of.