def __call__(self, inputs, state, past_inputs=None, past_states=None, scope=None):
    with tf.device("/gpu:"+str(self._gpu_for_layer)):

      '''A modified GRU that can incorporate an additional past input and/or an
      additional past state. Useful for skip-connecting RNNs horizontally or vertically.'''
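      # All branches below follow the standard GRU update
      #   h_t = u * h_{t-1} + (1 - u) * c,  with candidate c = tanh(W [x_t, r * h_{t-1}])
      # and extend it with extra sigmoid gates pr / ps computed from the past
      # input / past state, all produced by one enhanced_linear projection.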

      if past_inputs is not None and past_states is None:

          with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
              r, u, pr = tf.split(1, 3, lfe.enhanced_linear([inputs, state, past_inputs],
                                                  3 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
              r, u, pr = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr)
            with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
              c = tf.tanh(linear.linear([inputs, r * state, pr*state], self._num_units, True))
            new_h = u * state + (1 - u) * c
          return new_h, new_h

      elif past_states is not None and past_inputs is None:

          with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
              r, u, ps = tf.split(1, 3, lfe.enhanced_linear([inputs, state, past_states],
                                                  3 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
              r, u, ps = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(ps)
            with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
              c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
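            # In this branch ps gates the past state: the candidate c is scaled
            # by both (1 - u) and (1 - ps) before being added to u * state.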
            new_h = u * state + (1 - u) * c + (1 - ps) * c
          return new_h, new_h

      elif past_states is not None and past_inputs is not None:

          with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
              r, u, pr, ps = tf.split(1, 4, lfe.enhanced_linear([inputs, state, past_inputs, past_states],
                                                  4 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
              r, u, pr, ps = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr), tf.sigmoid(ps)
            with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
              c = tf.tanh(linear.linear([inputs, r * state, pr*state], self._num_units, True))
            new_h = u * state + ps * past_states + (1 - u) * c + (1 - ps) * c
          return new_h, new_h

      else:
          with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
              # We start with bias of 1.0 to not reset and not update.
              r, u = tf.split(1, 2, lfe.enhanced_linear([inputs, state],
                                                  2 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
              r, u = tf.sigmoid(r), tf.sigmoid(u)
            with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
              #notice they have the activation/non-linear step right here! 
              c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
            new_h = u * state + (1 - u) * c
          return new_h, new_h
  def __call__(self, inputs, state, scope=None):
    with tf.device("/gpu:"+str(self._gpu_for_layer)):

      '''Modified GRU with skip connections: an additional past input is incorporated.
      An additional past hidden state could be added in the same way.'''
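      # Skip-connection bookkeeping (assumed to be created elsewhere in the
      # class): timestep_counter tracks the current step, previous_inputs
      # buffers an earlier input, and every skip_neuron_number steps the
      # buffer is refreshed with the current inputs.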
      if self._skip_connections:
        with tf.variable_scope("Skip_Connections"):
          timestep_counter.assign(timestep_counter + 1)  # advance the timestep counter
          print('debug: incremented timestep_counter')
          # if the previous-inputs buffer is still its scalar placeholder, fill it with zeros shaped like inputs
          if tf.add_n(previous_inputs) == 0:
            if previous_inputs.shape == 1:
              previous_inputs.assign(tf.zeros(tf.shape(inputs)))

          # the GRU below incorporates the buffered previous inputs
          with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
            with tf.variable_scope("Gates"):  # Reset gate and update gate.
              # We start with bias of 1.0 to not reset and not update.
              r, u, pr = tf.split(1, 3, lfe.enhanced_linear([inputs, state, previous_inputs],
                                                  3 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
              r, u, pr = tf.sigmoid(r), tf.sigmoid(u), tf.sigmoid(pr)
            with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
              #notice they have the activation/non-linear step right here! 
              c = tf.tanh(linear.linear([inputs, r * state, pr*state], self._num_units, True))
            new_h = u * state + (1 - u) * c

          # refresh the stored previous inputs every skip_neuron_number steps
          if timestep_counter % skip_neuron_number == 0:
            previous_inputs.assign(inputs)
            print('debug: refreshed previous_inputs')
            # previous_hidden_states.assign(new_h)  # enable if the past hidden state is also needed

          return new_h, new_h

          

      else:
        """Normal Gated recurrent unit (GRU) with nunits cells."""
        with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
          with tf.variable_scope("Gates"):  # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            r, u = tf.split(1, 2, lfe.enhanced_linear([inputs, state],
                                                2 * self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
            r, u = tf.sigmoid(r), tf.sigmoid(u)
          with tf.variable_scope("Candidate"): #you need a different one because you're doing a new linear
            #notice they have the activation/non-linear step right here! 
            c = tf.tanh(linear.linear([inputs, r * state], self._num_units, True))
          new_h = u * state + (1 - u) * c
        return new_h, new_h
 def __call__(self, inputs, state, scope=None):
   """Run the cell and output projection on inputs, starting from state."""
   output, res_state = self._cell(inputs, state)
   # Default scope: "OutputProjectionWrapper"
   with tf.variable_scope(scope or type(self).__name__):
     projected = linear.linear(output, self._output_size, True)
   return projected, res_state
  def __call__(self, inputs, state, scope=None):
    with tf.device("/gpu:"+str(self._gpu_for_layer)):
      """JZS3, mutant 2 with n units cells."""
      with tf.variable_scope(scope or type(self).__name__):  # "JZS3Cell"
        with tf.variable_scope("Zinput"):  # Reset gate and update gate.
          # We start with bias of 1.0 to not reset and not update.
          '''equation 1'''

          z = tf.sigmoid(lfe.enhanced_linear([inputs, tf.tanh(state)], 
                            self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))

          '''equation 2'''
        with tf.variable_scope("Rinput"):
          r = tf.sigmoid(lfe.enhanced_linear([inputs, state],
                            self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
          '''equation 3'''
        with tf.variable_scope("Candidate"):
          component_0 = linear.linear([state*r,inputs],
                            self._num_units, True)
          
          component_2 = (tf.tanh(component_0))*z
          component_3 = state*(1 - z)

        h_t = component_2 + component_3

      return h_t, h_t #there is only one hidden state output to keep track of. 
  def __call__(self, inputs, state, scope=None):
    with tf.device("/gpu:"+str(self._gpu_for_layer)):
      """JZS1, mutant 1 with n units cells."""
      with tf.variable_scope(scope or type(self).__name__):  # "JZS1Cell"
        with tf.variable_scope("Zinput"):  # Reset gate and update gate.
          # We start with bias of 1.0 to not reset and not update.
          '''equation 1 z = sigm(WxzXt+Bz), x_t is inputs'''

          z = tf.sigmoid(lfe.enhanced_linear([inputs], 
                            self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor)) 

        with tf.variable_scope("Rinput"):
          '''equation 2 r = sigm(WxrXt+Whrht+Br), h_t is the previous state'''

          r = tf.sigmoid(lfe.enhanced_linear([inputs,state],
                            self._num_units, True, 1.0, weight_initializer = self._weight_initializer, orthogonal_scale_factor = self._orthogonal_scale_factor))
          '''equation 3'''
        with tf.variable_scope("Candidate"):
          component_0 = linear.linear([r*state], 
                            self._num_units, True) 
          component_1 = tf.tanh(tf.tanh(inputs) + component_0)
          component_2 = component_1*z
          component_3 = state*(1 - z)

        h_t = component_2 + component_3

      return h_t, h_t #there is only one hidden state output to keep track of. 
Example #6
 def attention(query):  # part of the attention_decoder; defined outside the decoding loop so it is not rebuilt at every step
   """Put attention masks on hidden using hidden_features and query."""
   ds = []  # Results of attention reads will be stored here.
   for a in xrange(num_heads):
     with tf.variable_scope("Attention_%d" % a):
       y = linear.linear(query, attention_vec_size, True)
       y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
       # Attention mask is a softmax of v^T * tanh(...).
       s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
       a = tf.nn.softmax(s)
       # Now calculate the attention-weighted vector d.
       d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                         [1, 2])
       ds.append(tf.reshape(d, [-1, attn_size]))
   return ds
Example #8
def attention_decoder(decoder_inputs, initial_state, attention_states, cell,
                      output_size=None, num_heads=1, loop_function=None,
                      dtype=tf.float32, scope=None, average_states = False, average_hidden_state_influence = 0.5,
                      temperature_decode = False, temperature = 1.0):
  """RNN decoder with attention for the sequence-to-sequence model.

  Args:
    decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    output_size: size of the output vectors; if None, we use cell.output_size.
    num_heads: number of attention heads that read from attention_states.
    loop_function: if not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x cell.output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x cell.input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    average_states: if True, feed the cell a blend of the previous states
      (computed by average_hidden_states) instead of only the latest state.
    average_hidden_state_influence: weighting factor used for that blend.
    temperature_decode, temperature: forwarded to loop_function to control
      sampling temperature during decoding.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
      [batch_size x output_size]. These represent the generated outputs.
      Output i is computed from input i (which is either i-th decoder_inputs or
      loop_function(output {i-1}, i)) as follows. First, we run the cell
      on a combination of the input and previous attention masks:
        cell_output, new_state = cell(linear(input, prev_attn), prev_state)
      Then, we calculate new attention masks:
        new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
      and then we calculate the output:
        output = linear(cell_output, new_attn).
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    wids: The second value returned by loop_function at each step where it was
      applied (an empty list when loop_function is None).

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, or shapes
      of attention_states are not set.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if not attention_states.get_shape()[1:3].is_fully_defined():
    raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell.output_size

  with tf.variable_scope(scope or "attention_decoder"):
    batch_size = tf.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length = attention_states.get_shape()[1].value
    attn_size = attention_states.get_shape()[2].value

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden = tf.reshape(attention_states, [-1, attn_length, 1, attn_size])
    hidden_features = []
    v = []
    attention_vec_size = attn_size  # Size of query vectors for attention.
    for a in xrange(num_heads):
      k = tf.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
      hidden_features.append(tf.nn.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
      v.append(tf.get_variable("AttnV_%d" % a, [attention_vec_size]))

    states = [initial_state]
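    # Every state is retained (not just the latest) so the states can
    # optionally be blended via average_hidden_states when average_states=True.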

    def attention(query):  # defined outside the decoding loop so it is not rebuilt at every step
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      for a in xrange(num_heads):
        with tf.variable_scope("Attention_%d" % a):
          y = linear.linear(query, attention_vec_size, True)
          y = tf.reshape(y, [-1, 1, 1, attention_vec_size])
          # Attention mask is a softmax of v^T * tanh(...).
          s = tf.reduce_sum(v[a] * tf.tanh(hidden_features[a] + y), [2, 3])
          a = tf.nn.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = tf.reduce_sum(tf.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                            [1, 2])
          ds.append(tf.reshape(d, [-1, attn_size]))
      return ds

    outputs = []
    wids = []
    prev = None
    batch_attn_size = tf.pack([batch_size, attn_size])
    attns = [tf.zeros(batch_attn_size, dtype=dtype)
             for _ in xrange(num_heads)]
    for a in attns:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size])
    for i in xrange(len(decoder_inputs)):  # iterate over the decoding time steps
      if i > 0:
        tf.get_variable_scope().reuse_variables()
      inp = decoder_inputs[i]

      # Sampling (and curriculum learning) could be implemented here by changing the input.
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with tf.variable_scope("loop_function", reuse=True):
          inp, wid = loop_function(prev, i, temperature_decode = temperature_decode,
                      temperature = temperature)  # prev passed through tf.stop_gradient, so no gradient flows through the sampled input
          wids.append(wid)

      #this will make an input that is combined with attention


      # Merge input and previous attentions into one vector of the right size.
      x = linear.linear([inp] + attns, cell.input_size, True)


      hidden_state_input = states[-1]
      if average_states:
        # blend the previous hidden states into the cell input
        print('WARNING: state averaging is enabled for the decoder.')
        hidden_state_input = average_hidden_states(states, average_hidden_state_influence)

      # Run the RNN.

      # Potential place for skip connections: the cell output here could be
      # saved and fed to a later timestep.
      cell_output, new_state = cell(x, hidden_state_input)  # the cell receives hidden_state_input, which may be an averaged state
      states.append(new_state)



      # Run the attention mechanism.
      attns = attention(new_state)
      with tf.variable_scope("AttnOutputProjection"):
        output = linear.linear([cell_output] + attns, output_size, True)
      if loop_function is not None:
        # We do not propagate gradients over the loop function.
        prev = tf.stop_gradient(output)
      outputs.append(output)


  return outputs, states, wids
 def __call__(self, inputs, state, scope=None):
   """Run the input projection and then the cell."""
   # Default scope: "InputProjectionWrapper"
   with tf.variable_scope(scope or type(self).__name__):
     projected = linear.linear(inputs, self._cell.input_size, True)
   return self._cell(projected, state)
  def __call__(self, inputs, state, scope=None):
    with tf.device("/gpu:"+str(self._gpu_for_layer)):
      print('testing')
      with tf.variable_scope(scope or type(self).__name__):  # "UnitaryRNNCell"
        with tf.variable_scope("UnitaryGates"):  # Reset gate and update gate.


          # for consistency, some variable names are kept the same as in the original authors' code

          n_hidden = self._num_units
          h_prev = state


          # development version (work in progress)
          step1 = unitary_linear.times_diag_tf(h_prev, n_hidden) #this will create a diagonal tensor with given diagonal values


          # TODO: apply times_reflection next
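          # NOTE: lin_output_re, lin_output_im and hidden_bias are assumed to
          # be produced elsewhere (the full unitary transform is not wired up
          # here yet); broadcasting replaces Theano's dimshuffle('x', 0).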



          modulus = tf.sqrt(lin_output_re ** 2 + lin_output_im ** 2)
          rescale = tf.maximum(modulus + hidden_bias, 0.) / (modulus + 1e-5)
          nonlin_output_re = lin_output_re * rescale
          nonlin_output_im = lin_output_im * rescale

          h_t = tf.concat(1, [nonlin_output_re, 
                             nonlin_output_im]) 

          #keep in mind that you can use tf.complex to convert two numbers into a complex number -- this works for tensors!

          return h_t, h_t  # TODO: confirm that returning h_t as both the output and the new state is correct
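          # NOTE: everything below this return is unreachable development
          # scratch; the Theano-style helpers (times_diag, times_reflection,
          # vec_permutation, scale_diag) and their parameters are not defined
          # in this snippet. It is kept only as a reference for the full
          # unitary transform and the JZS-style gating experiment.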


          '''Complex-number helpers in tf:

          1. tf.complex     -- build a complex tensor from real and imaginary parts
          2. tf.complex_abs -- absolute value (modulus) of a complex tensor
          3. tf.conj        -- complex conjugate
          4. tf.imag        -- imaginary part
          5. tf.real        -- real part'''

          # identity matrices are a special case of diagonal matrices (all diagonal entries equal to one)


          '''----------------------------end of unitary rnn cell--------------------------'''


          # We start with bias of 1.0 to not reset and not update.
          '''First, the hidden-to-hidden transform is composed as
          W = D3 R2 F^{-1} D2 Perm R1 F D1
          (diagonal, reflection, inverse FFT, diagonal, permutation, reflection, FFT, diagonal).

          The direct parameterization W = V D V* would cost O(n^2) computation/memory, hence the factored form.'''
          step1 = times_diag(h_prev, n_hidden, theta[0,:])
          step2 = step1
  #        step2 = do_fft(step1, n_hidden)
          step3 = times_reflection(step2, n_hidden, reflection[0,:])
          step4 = vec_permutation(step3, n_hidden, index_permute)
          step5 = times_diag(step4, n_hidden, theta[1,:])
          step6 = step5
  #        step6 = do_ifft(step5, n_hidden)
          step7 = times_reflection(step6, n_hidden, reflection[1,:])
          step8 = times_diag(step7, n_hidden, theta[2,:])     
          step9 = scale_diag(step8, n_hidden, scale)

          hidden_lin_output = step9

          z = tf.sigmoid(linear.linear([inputs], 
                            self._num_units, True, 1.0))

          '''equation 2 r = sigm(WxrXt+Whrht+Br), h_t is the previous state'''

          r = tf.sigmoid((linear.linear([inputs,state],
                            self._num_units, True, 1.0)))
          '''equation 3'''
        with tf.variable_scope("Candidate"):
          component_0 = linear.linear([r*state],
                            self._num_units, True)
          component_1 = tf.tanh(tf.tanh(inputs) + component_0)
          component_2 = component_1*z
          component_3 = state*(1 - z)

          h_t = component_2 + component_3

          h_t = tf.concat(1, [nonlin_output_re, nonlin_output_im])  # concatenate the real and imaginary parts


        return h_t, h_t #there is only one hidden state output to keep track of.