Example No. 1
def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  with variable_scope.variable_scope(
      "root",
      initializer=init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)):
    inputs = variable_scope.get_variable(
        "inputs", initializer=random_ops.random_uniform(
            (max_time, batch_size, input_depth), seed=1))
    maybe_xla = lambda c: rnn_cell.CompiledWrapper(c) if compiled else c
    cell = core_rnn_cell_impl.MultiRNNCell(
        [maybe_xla(core_rnn_cell_impl.LSTMCell(num_units))
         for _ in range(num_layers)])
    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell, inputs=inputs, initial_state=initial_state,
        time_major=True)
    flat_final_state = nest.flatten(final_state)
    trainable_variables = variables.trainable_variables()
    outputs_grad = gradients_impl.gradients(
        [outputs],
        trainable_variables + [inputs] + nest.flatten(initial_state))
    final_state_grad = gradients_impl.gradients(
        flat_final_state,
        trainable_variables + [inputs] + nest.flatten(initial_state))

    return {"outputs": outputs,
            "final_state": flat_final_state,
            "outputs_grad": outputs_grad,
            "final_state_grad": final_state_grad}
Example No. 2
 def _Model(x):
   w = variable_scope.get_variable(
       "w", (64, 64),
       initializer=init_ops.random_uniform_initializer(seed=312))
   b = variable_scope.get_variable(
       "b", (64), initializer=init_ops.zeros_initializer()),
   return math_ops.sigmoid(math_ops.matmul(x, w) + b)
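
A usage sketch for _Model above, assuming a [batch, 64] float32 input; random_ops is the same module used in Example No. 1.

x = random_ops.random_uniform((8, 64), seed=1)
y = _Model(x)  # sigmoid activations, shape [8, 64]
# The variables "w" and "b" still need global_variables_initializer()
# before y can be evaluated in a session.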
Example No. 3
def _get_initializer(init_bound, dtype, seed):
  if dtype == dtypes.float16:
    return _MaskedRandomUniformInitializer(
        -init_bound, init_bound, dtype=dtype, seed=seed)
  else:
    return init_ops.random_uniform_initializer(
        -init_bound, init_bound, dtype=dtype, seed=seed)
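
A usage sketch for _get_initializer: for float16 it returns the masked variant (_MaskedRandomUniformInitializer is assumed to be defined in the same file), otherwise the plain uniform initializer shown throughout these examples.

init = _get_initializer(init_bound=0.04, dtype=dtypes.float32, seed=7)
w = variable_scope.get_variable(
    "w", shape=[128, 128], dtype=dtypes.float32, initializer=init)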
Example No. 4
  def testBlockGRUToGRUCellSingleStep(self):
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
      batch_size = 4
      cell_size = 5
      input_size = 6

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        output = rnn_cell.GRUCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([output], {x: x_value, h: h_value})

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        output = gru_ops.GRUBlockCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([output], {x: x_value, h: h_value})

      self.assertEqual(len(block_res), len(basic_res))
      for block, basic in zip(block_res, basic_res):
        self.assertAllClose(block, basic)
Example No. 5
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper2"
      with ops.device("/cpu:0"):
        if self._initializer:
          initializer = self._initializer
        elif vs.get_variable_scope().initializer:
          initializer = vs.get_variable_scope().initializer
        else:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        embeddings = []
        for i in xrange(len(self._embedding_classes)):
          embeddings.append(
              vs.get_variable(
                  "embedding" + str(i),
                  [self._embedding_classes[i], self._embedding_sizes[i]],
                  initializer=initializer))
        embedded = []
        for i in xrange(len(self._embedding_classes)):
          embedded.append(embedding_ops.embedding_lookup(
              embeddings[i], array_ops.reshape(inputs[i], [-1])))

        finalEmbedded = tf.concat(1, embedded)

    return self._cell(finalEmbedded, state)
Example No. 6
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
      with ops.device("/cpu:0"):
        if self._initializer:
          initializer = self._initializer
        elif vs.get_variable_scope().initializer:
          initializer = vs.get_variable_scope().initializer
        else:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

        if type(state) is tuple:
          data_type = state[0].dtype
        else:
          data_type = state.dtype

        embedding = vs.get_variable(
            "embedding", [self._embedding_classes, self._embedding_size],
            initializer=initializer,
            dtype=data_type)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)
Example No. 7
 def model_fn():
   x = variable_scope.get_variable(
       'x',
       shape=(2, 3),
       initializer=init_ops.random_uniform_initializer(
           1.0, 10.0, dtype=dtypes.float32))
   return array_ops.identity(x)
Example No. 8
 def build(self, _):
   self.embedding = self.add_variable(
       'embedding_kernel',
       shape=[self.vocab_size, self.embedding_dim],
       dtype=np.float32,
       initializer=init_ops.random_uniform_initializer(-0.1, 0.1),
       trainable=True)
Example No. 9
 def create_ops():
   with variable_scope.variable_scope(
       "root",
       initializer=init_ops.random_uniform_initializer(
           -0.1, 0.1, seed=2)):
     inputs = variable_scope.get_variable("var", (1,))
     return inputs
Example No. 10
 def create_ops():
   with variable_scope.variable_scope(
       "root",
       initializer=init_ops.random_uniform_initializer(
           -0.1, 0.1, seed=2)):
     inputs = random_ops.random_uniform((1,), seed=1)
     return inputs
Example No. 11
  def benchmarkTfRNNLSTMTraining(self):
    test_configs = self._GetTestConfig()
    for config_name, config in test_configs.items():
      num_layers = config["num_layers"]
      num_units = config["num_units"]
      batch_size = config["batch_size"]
      seq_length = config["seq_length"]

      with ops.Graph().as_default(), ops.device("/gpu:0"):
        inputs = seq_length * [
            array_ops.zeros([batch_size, num_units], dtypes.float32)
        ]
        initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)

        cell = core_rnn_cell_impl.LSTMCell(
            num_units=num_units, initializer=initializer, state_is_tuple=True)
        multi_cell = core_rnn_cell_impl.MultiRNNCell([cell] * num_layers)
        outputs, final_state = core_rnn.static_rnn(
            multi_cell, inputs, dtype=dtypes.float32)
        trainable_variables = ops.get_collection(
            ops.GraphKeys.TRAINABLE_VARIABLES)
        gradients = gradients_impl.gradients([outputs, final_state],
                                             trainable_variables)
        training_op = control_flow_ops.group(*gradients)
        self._BenchmarkOp(training_op, "tf_rnn_lstm %s %s" %
                          (config_name, self._GetConfigDesc(config)))
Example No. 12
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
      with ops.device("/cpu:0"):
        if self._embedding:
          embedding = self._embedding
        else:
          if self._initializer:
            initializer = self._initializer
          elif vs.get_variable_scope().initializer:
            initializer = vs.get_variable_scope().initializer
          else:
            # Default initializer for embeddings should have variance=1.
            sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
          embedding = vs.get_variable("embedding", [self._embedding_classes,
                                                    self._cell.input_size],
                                      initializer=initializer)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))

        """print (embedded)
        print ("{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}")"""

    return self._cell(embedded, state)
Example No. 13
  def testWarmStartInputLayerEmbeddingColumn(self):
    # Create old and new vocabs for embedding column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
        "new_vocab")

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        _ = variable_scope.get_variable(
            "input_layer/sc_vocab_embedding/embedding_weights",
            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # Create feature columns.
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    emb_vocab = fc.embedding_column(
        categorical_column=sc_vocab,
        dimension=2,
        # Can't use constant_initializer with load_and_remap.  In practice,
        # use a truncated normal initializer.
        initializer=init_ops.random_uniform_initializer(
            minval=0.42, maxval=0.42))
    all_deep_cols = [emb_vocab]
    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = {}
        with variable_scope.variable_scope("", partitioner=_partitioner):
          # Create the variables.
          fc.input_layer(
              features=self._create_dummy_inputs(),
              feature_columns=all_deep_cols,
              cols_to_vars=cols_to_vars)
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(), col_to_prev_vocab={
                emb_vocab: prev_vocab_path
            })
        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted. Var corresponding to
        # emb_vocab should be correctly warmstarted after vocab remapping.
        # Missing values are filled in with the EmbeddingColumn's initializer.
        self._assert_cols_to_vars(
            cols_to_vars, {
                emb_vocab: [
                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
                ]
            }, sess)
Example No. 14
    def _build(self):
        """ build embedding table and
        build position embedding table if timing=="emb"

        :return:
        """
        self._embeddings = variable_scope.get_variable(
            name=(self._name or "embedding_table"),
            shape=[self._vocab_size, self._dimension],
            initializer=init_ops.random_uniform_initializer(
                -self._init_scale, self._init_scale))
        if self._timing == "emb":
            self._position_embedding = variable_scope.get_variable(
                name=(self._name or "embedding_table") + "_posi",
                shape=[self._maximum_position, self._dimension],
                initializer=init_ops.random_uniform_initializer(
                    -self._init_scale, self._init_scale))
Example No. 15
 def testRandomInitializer(self):
   # Sanity check that the slices uses a different seed when using a random
   # initializer function.
   with self.test_session():
     var0, var1 = partitioned_variables.create_partitioned_variables(
         [20, 12], [1, 2], init_ops.random_uniform_initializer())
     variables.global_variables_initializer().run()
     val0, val1 = var0.eval().flatten(), var1.eval().flatten()
     self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
   # Negative test that proves that slices have the same values if
   # the random initializer uses a seed.
   with self.test_session():
     var0, var1 = partitioned_variables.create_partitioned_variables(
         [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
     variables.global_variables_initializer().run()
     val0, val1 = var0.eval().flatten(), var1.eval().flatten()
     self.assertAllClose(val0, val1)
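
A sketch of the seeding behaviour the test exercises, assuming TF 1.x graph mode with no graph-level seed set: an initializer carrying an op-level seed produces the same values for every variable built from it, which is why the seeded slices above compare equal.

init = init_ops.random_uniform_initializer(seed=201)
a = variable_scope.get_variable("a", [4], initializer=init)
b = variable_scope.get_variable("b", [4], initializer=init)
# After running global_variables_initializer(), a and b hold identical values.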
Example No. 16
  def testBlockGRUToGRUCellMultiStep(self):
    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
      batch_size = 2
      cell_size = 3
      input_size = 3
      time_steps = 4

      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_values = np.random.rand(time_steps, batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)
        outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        feeds = {concat_x: x_values, h: h_value}
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = rnn_cell.GRUCell(cell_size)
        outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        feeds = {concat_x: x_values, h: h_value}
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

      # Check the lengths of the outputs_dynamic, and states.
      self.assertEqual(len(block_res), len(basic_res))
      self.assertEqual(len(block_res[0]), len(basic_res[0]))
      self.assertEqual(len(block_res[1]), len(basic_res[1]))

      # Check the outputs_dynamic values.
      for block_output, basic_output in zip(block_res[0], basic_res[0]):
        self.assertAllClose(block_output, basic_output)

      # Check the state_dynamic value.
      self.assertAllClose(block_res[1], basic_res[1])
Example No. 17
  def _createStackBidirectionalDynamicRNN(self,
                                          use_gpu,
                                          use_shape,
                                          use_state_tuple,
                                          initial_states_fw=None,
                                          initial_states_bw=None,
                                          scope=None):
    self.layers = [2, 3]
    input_size = 5
    batch_size = 2
    max_length = 8

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=self._seed)
    sequence_length = array_ops.placeholder(dtypes.int64)

    self.cells_fw = [
        core_rnn_cell_impl.LSTMCell(
            num_units,
            input_size,
            initializer=initializer,
            state_is_tuple=False) for num_units in self.layers
    ]
    self.cells_bw = [
        core_rnn_cell_impl.LSTMCell(
            num_units,
            input_size,
            initializer=initializer,
            state_is_tuple=False) for num_units in self.layers
    ]

    inputs = max_length * [
        array_ops.placeholder(
            dtypes.float32,
            shape=(batch_size, input_size) if use_shape else (None, input_size))
    ]
    inputs_c = array_ops.stack(inputs)
    inputs_c = array_ops.transpose(inputs_c, [1, 0, 2])
    outputs, st_fw, st_bw = rnn.stack_bidirectional_dynamic_rnn(
        self.cells_fw,
        self.cells_bw,
        inputs_c,
        initial_states_fw=initial_states_fw,
        initial_states_bw=initial_states_bw,
        dtype=dtypes.float32,
        sequence_length=sequence_length,
        scope=scope)

    # Outputs has shape (batch_size, max_length, 2 * self.layers[-1]).
    output_shape = [None, max_length, 2 * self.layers[-1]]
    if use_shape:
      output_shape[0] = batch_size

    self.assertAllEqual(outputs.get_shape().as_list(), output_shape)

    input_value = np.random.randn(batch_size, input_size)

    return input_value, inputs, outputs, st_fw, st_bw, sequence_length
Example No. 18
  def _createStackBidirectionalRNN(self,
                                   use_gpu,
                                   use_shape,
                                   use_sequence_length,
                                   initial_states_fw=None,
                                   initial_states_bw=None,
                                   scope=None):
    self.layers = [2, 3]
    input_size = 5
    batch_size = 2
    max_length = 8

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=self._seed)
    sequence_length = array_ops.placeholder(
        dtypes.int64) if use_sequence_length else None

    self.cells_fw = [
        core_rnn_cell_impl.LSTMCell(
            num_units,
            input_size,
            initializer=initializer,
            state_is_tuple=False) for num_units in self.layers
    ]
    self.cells_bw = [
        core_rnn_cell_impl.LSTMCell(
            num_units,
            input_size,
            initializer=initializer,
            state_is_tuple=False) for num_units in self.layers
    ]

    inputs = max_length * [
        array_ops.placeholder(
            dtypes.float32,
            shape=(batch_size, input_size) if use_shape else (None, input_size))
    ]
    outputs, state_fw, state_bw = rnn.stack_bidirectional_rnn(
        self.cells_fw,
        self.cells_bw,
        inputs,
        initial_states_fw,
        initial_states_bw,
        dtype=dtypes.float32,
        sequence_length=sequence_length,
        scope=scope)

    self.assertEqual(len(outputs), len(inputs))
    for out in outputs:
      self.assertAlmostEqual(
          out.get_shape().as_list(),
          [batch_size if use_shape else None, 2 * self.layers[-1]])

    input_value = np.random.randn(batch_size, input_size)
    outputs = array_ops.stack(outputs)

    return input_value, inputs, outputs, state_fw, state_bw, sequence_length
Example No. 19
  def testTimeReversedFusedRNN(self):
    with self.test_session() as sess:
      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=19890213)
      fw_cell = core_rnn_cell_impl.BasicRNNCell(10)
      bw_cell = core_rnn_cell_impl.BasicRNNCell(10)
      batch_size = 5
      input_size = 20
      timelen = 15
      inputs = constant_op.constant(
          np.random.randn(timelen, batch_size, input_size))

      # test bi-directional rnn
      with variable_scope.variable_scope("basic", initializer=initializer):
        unpacked_inputs = array_ops.unstack(inputs)
        outputs, fw_state, bw_state = core_rnn.static_bidirectional_rnn(
            fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
        packed_outputs = array_ops.stack(outputs)
        basic_vars = [
            v for v in variables.trainable_variables()
            if v.name.startswith("basic/")
        ]
        sess.run([variables.global_variables_initializer()])
        basic_outputs, basic_fw_state, basic_bw_state = sess.run(
            [packed_outputs, fw_state, bw_state])
        basic_grads = sess.run(gradients_impl.gradients(packed_outputs, inputs))
        basic_wgrads = sess.run(
            gradients_impl.gradients(packed_outputs, basic_vars))

      with variable_scope.variable_scope("fused", initializer=initializer):
        fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
            core_rnn_cell_impl.BasicRNNCell(10))
        fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
            fused_rnn_cell.FusedRNNCellAdaptor(
                core_rnn_cell_impl.BasicRNNCell(10)))
        fw_outputs, fw_state = fused_cell(
            inputs, dtype=dtypes.float64, scope="fw")
        bw_outputs, bw_state = fused_bw_cell(
            inputs, dtype=dtypes.float64, scope="bw")
        outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
        fused_vars = [
            v for v in variables.trainable_variables()
            if v.name.startswith("fused/")
        ]
        sess.run([variables.global_variables_initializer()])
        fused_outputs, fused_fw_state, fused_bw_state = sess.run(
            [outputs, fw_state, bw_state])
        fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
        fused_wgrads = sess.run(gradients_impl.gradients(outputs, fused_vars))

      self.assertAllClose(basic_outputs, fused_outputs)
      self.assertAllClose(basic_fw_state, fused_fw_state)
      self.assertAllClose(basic_bw_state, fused_bw_state)
      self.assertAllClose(basic_grads, fused_grads)
      for basic, fused in zip(basic_wgrads, fused_wgrads):
        self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Example No. 20
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
  """Benchmark inference speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):

      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = rnn_cell.GRUCell(cell_size)
        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        basic_time_inference = benchmarking.seconds_per_run(
            outputs_dynamic, sess, iters)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)
        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        block_time_inference = benchmarking.seconds_per_run(
            outputs_dynamic, sess, iters)

    performance_inference = (basic_time_inference - block_time_inference
                            ) * 100 / basic_time_inference
    print(",".join([
        str(batch_size), str(cell_size), str(input_size), str(time_steps), str(
            use_gpu), str(basic_time_inference), str(block_time_inference), str(
                performance_inference)
    ]))

    return basic_time_inference, block_time_inference
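
A hedged driver sketch for the benchmark above; the configuration values are illustrative.

basic_t, block_t = inference_gru_block_vs_gru_cell(
    batch_size=32, cell_size=128, input_size=128,
    time_steps=50, use_gpu=False, iters=10)
# Prints one CSV row and returns the per-run timings (seconds) for
# GRUCell and GRUBlockCell inference.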
Example No. 21
  def testLSTMBasicToBlockCellPeeping(self):
    with self.test_session(use_gpu=self._use_gpu) as sess:
      x = array_ops.zeros([1, 2])
      x_values = np.random.randn(1, 2)

      m0_val = 0.1 * np.ones([1, 2])
      m1_val = -0.1 * np.ones([1, 2])
      m2_val = -0.2 * np.ones([1, 2])
      m3_val = 0.2 * np.ones([1, 2])

      initializer = init_ops.random_uniform_initializer(
          -0.01, 0.01, seed=19890212)
      with variable_scope.variable_scope("basic", initializer=initializer):
        m0 = array_ops.zeros([1, 2])
        m1 = array_ops.zeros([1, 2])
        m2 = array_ops.zeros([1, 2])
        m3 = array_ops.zeros([1, 2])
        g, ((out_m0, out_m1),
            (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
                [
                    core_rnn_cell_impl.LSTMCell(
                        2, use_peepholes=True, state_is_tuple=True)
                ] * 2,
                state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
            x.name: x_values,
            m0.name: m0_val,
            m1.name: m1_val,
            m2.name: m2_val,
            m3.name: m3_val
        })

      with variable_scope.variable_scope("block", initializer=initializer):
        m0 = array_ops.zeros([1, 2])
        m1 = array_ops.zeros([1, 2])
        m2 = array_ops.zeros([1, 2])
        m3 = array_ops.zeros([1, 2])
        g, ((out_m0, out_m1),
            (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
                [lstm_ops.LSTMBlockCell(
                    2, use_peephole=True)] * 2,
                state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
            x.name: x_values,
            m0.name: m0_val,
            m1.name: m1_val,
            m2.name: m2_val,
            m3.name: m3_val
        })

      self.assertEqual(len(basic_res), len(block_res))
      for basic, block in zip(basic_res, block_res):
        self.assertAllClose(basic, block)
Example No. 22
def glorot_initializer(in_size, out_size):
    """
    Normalized initialization proposed for variance stabilization per layer

    Links:

    Understanding the difficulty of training deep feedforward neural networks
    http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    """
    width = math.sqrt(6.0 / (in_size + out_size))
    return init_ops.random_uniform_initializer(-width, width)
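
A short usage sketch: for a 128-to-64 weight matrix the bound is sqrt(6 / (128 + 64)) ≈ 0.177, and the returned initializer plugs into get_variable exactly like the built-in ones used elsewhere in these examples.

w = variable_scope.get_variable(
    "w", shape=[128, 64], initializer=glorot_initializer(128, 64))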
Example No. 23
def _static_vs_dynamic_rnn_benchmark_dynamic(inputs_t, sequence_length):
  (unused_0, unused_1, input_size) = inputs_t.get_shape().as_list()
  initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
  cell = contrib_rnn.LSTMCell(
      num_units=input_size,
      use_peepholes=True,
      initializer=initializer,
      state_is_tuple=False)
  outputs, final_state = rnn.dynamic_rnn(
      cell, inputs_t, sequence_length=sequence_length, dtype=dtypes.float32)

  trainable_variables = ops_lib.get_collection(
      ops_lib.GraphKeys.TRAINABLE_VARIABLES)
  gradients = gradients_impl.gradients([outputs, final_state],
                                       trainable_variables)

  return control_flow_ops.group(final_state, outputs, *gradients)
Example No. 24
  def build(self, input_shape):
    input_shape = tensor_shape.TensorShape(input_shape)
    # TODO(sibyl-vie3Poto): Allow higher dimension inputs. Currently the input is expected
    # to have shape [batch_size, dimension].
    if input_shape.rank != 2:
      raise ValueError(
          'The rank of the input tensor should be 2. Got {} instead.'.format(
              input_shape.ndims))
    if input_shape.dims[1].value is None:
      raise ValueError(
          'The last dimension of the inputs to `RandomFourierFeatures` '
          'should be defined. Found `None`.')
    self.input_spec = input_spec.InputSpec(
        ndim=2, axes={1: input_shape.dims[1].value})
    input_dim = input_shape.dims[1].value

    kernel_initializer = _get_random_features_initializer(
        self.kernel_initializer, shape=(input_dim, self.output_dim))

    unscaled_kernel = self.add_weight(
        name='unscaled_random_features',
        shape=(input_dim, self.output_dim),
        dtype=dtypes.float32,
        initializer=kernel_initializer,
        trainable=False)

    self.bias = self.add_weight(
        name='random_features_bias',
        shape=(self.output_dim,),
        dtype=dtypes.float32,
        initializer=init_ops.random_uniform_initializer(
            minval=0.0, maxval=2 * np.pi, dtype=dtypes.float32),
        trainable=False)

    if self.scale is None:
      self.scale = _get_default_scale(self.kernel_initializer, input_dim)
    scale = self.add_weight(
        name='random_features_scale',
        shape=(1,),
        dtype=dtypes.float32,
        initializer=init_ops.constant_initializer(self.scale),
        trainable=True,
        constraint='NonNeg')
    self.kernel = (1.0 / scale) * unscaled_kernel
    super(RandomFourierFeatures, self).build(input_shape)
Example No. 25
def _half_seq_len_vs_unroll_half_rnn_benchmark(inputs_list_t, sequence_length):
  (_, input_size) = inputs_list_t[0].get_shape().as_list()
  initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
  cell = rnn_cell_impl.LSTMCell(
      num_units=input_size,
      use_peepholes=True,
      initializer=initializer,
      state_is_tuple=False)
  outputs, final_state = rnn.static_rnn(
      cell,
      inputs_list_t,
      sequence_length=sequence_length,
      dtype=dtypes.float32)

  trainable_variables = ops_lib.get_collection(
      ops_lib.GraphKeys.TRAINABLE_VARIABLES)
  gradients = gradients_impl.gradients(outputs + [final_state],
                                       trainable_variables)

  return control_flow_ops.group(final_state, *(gradients + outputs))
Example No. 26
  def __call__(self, combine_inputs, state, scope=None):
    """Run the cell on embedded inputs."""

    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
      with ops.device("/cpu:0"):

        inputs = combine_inputs[0]
        alphabetEnc = combine_inputs[1]

        print ("************************************************************************")
        print (inputs)
        print ("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print (alphabetEnc)
        print ("========================================================================")

        if self._embedding:
          embedding = self._embedding
        else:
          if self._initializer:
            initializer = self._initializer
          elif vs.get_variable_scope().initializer:
            initializer = vs.get_variable_scope().initializer
          else:
            # Default initializer for embeddings should have variance=1.
            sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
          embedding = vs.get_variable("embedding", [self._embedding_classes,
                                                    self._cell.input_size],
                                      initializer=initializer)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))

        print (embedded)
        print ("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

        combine_embedded = array_ops.concat(1, [embedded, alphabetEnc])

        print (combine_embedded)
        print ("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")

    return self._cell(combine_embedded, state)
Example No. 27
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
  """Benchmark single bprop step speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    with benchmarking.device(use_gpu):
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
      # Inputs
      x = vs.get_variable("x", [batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        output = rnn_cell.GRUCell(cell_size)(array_ops.identity(x),
                                             array_ops.identity(h))
        sess.run([variables.global_variables_initializer()])
        grad_output_wrt_input = gradients_impl.gradients([output], h)
        basic_time_bprop = benchmarking.seconds_per_run(grad_output_wrt_input,
                                                        sess, iters)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        output = gru_ops.GRUBlockCell(cell_size)(array_ops.identity(x),
                                                 array_ops.identity(h))
        sess.run([variables.global_variables_initializer()])
        grad_output_wrt_input = gradients_impl.gradients([output], h)
        block_time_bprop = benchmarking.seconds_per_run(grad_output_wrt_input,
                                                        sess, iters)

  performance_inference = (
      basic_time_bprop - block_time_bprop) * 100 / basic_time_bprop

  print(",".join([
      str(batch_size), str(cell_size), str(input_size), str(use_gpu), str(
          basic_time_bprop), str(block_time_bprop), str(performance_inference)
  ]))

  return basic_time_bprop, block_time_bprop
Example No. 28
def embedding_attention_decoder(decoder_inputs,
                                initial_state,
                                attention_states,
                                cell,
                                num_symbols,
                                batch_size,
                                state_size,
                                decoder_inputs_positions=None,
                                decoder_inputs_maps=None,
                                output_size=None,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None):
    """RNN decoder with embedding and attention and a pure-decoding option.

  Args:
    decoder_inputs: a list of 1D batch-sized int32 Tensors (decoder inputs).
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: rnn_cell.RNNCell defining the cell function.
    num_symbols: integer, how many symbols come into the embedding.
    batch_size: need to clarify for decoding.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3].
    decoder_inputs_maps: a 1D Tensor of length batch_size.
    output_size: size of the output vectors; if None, use cell.output_size.
    feed_previous: Boolean; if True, only the first of decoder_inputs will be
      used (the "GO" symbol), and all other decoder inputs will be generated by:
        next = embedding_lookup(embedding, argmax(previous_output)),
      In effect, this implements a greedy decoder. It can also be used
      during training to emulate http://arxiv.org/pdf/1506.03099v2.pdf.
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype to use for the RNN initial states (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_decoder".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x output_size] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    attentions: a list of 2D Tensors of shape [batch_size, cell.state_size].
    environments: a list of 2D Tensors of shape [batch_size, state_size].

  Raises:
    ValueError: when output_projection has the wrong shape.

  Modification:
    No output projection.
  """
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "embedding_attention_decoder"):
        with ops.device("/cpu:0"):
            embedding = vs.get_variable(
                "embedding",
                shape=[num_symbols, cell.input_size],
                initializer=init_ops.random_uniform_initializer(-0.08, 0.08))

        def extract_argmax_and_embed(prev, _):
            """Loop_function that extracts the symbol from prev and embeds it."""
            prev_symbol = array_ops.stop_gradient(math_ops.argmax(prev, 1))
            emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol)
            return emb_prev

        # beam search

        loop_function = None
        if feed_previous:
            loop_function = extract_argmax_and_embed

        emb_inp = [
            embedding_ops.embedding_lookup(embedding, i)
            for i in decoder_inputs
        ]
        return attention_decoder(
            emb_inp,
            initial_state,
            attention_states,
            cell,
            batch_size,
            state_size,
            decoder_inputs_positions=decoder_inputs_positions,
            decoder_inputs_maps=decoder_inputs_maps,
            output_size=output_size,
            loop_function=loop_function)
Example No. 29
    def testLSTMBasicToBlockPeeping(self):
        with self.test_session(use_gpu=True) as sess:
            batch_size = 2
            input_size = 3
            cell_size = 4
            sequence_length = 5

            inputs = []
            for _ in range(sequence_length):
                inp = ops.convert_to_tensor(np.random.randn(
                    batch_size, input_size),
                                            dtype=dtypes.float32)
                inputs.append(inp)

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                cell = rnn_cell.LSTMCell(cell_size,
                                         use_peepholes=True,
                                         state_is_tuple=True)
                outputs, state = rnn.static_rnn(cell,
                                                inputs,
                                                dtype=dtypes.float32)

                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([outputs, state[0]])
                basic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(outputs,
                                             variables.trainable_variables()))

            with variable_scope.variable_scope("block",
                                               initializer=initializer):
                w = variable_scope.get_variable(
                    "w",
                    shape=[input_size + cell_size, cell_size * 4],
                    dtype=dtypes.float32)
                b = variable_scope.get_variable(
                    "b",
                    shape=[cell_size * 4],
                    dtype=dtypes.float32,
                    initializer=init_ops.zeros_initializer())

                wci = variable_scope.get_variable("wci",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)
                wcf = variable_scope.get_variable("wcf",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)
                wco = variable_scope.get_variable("wco",
                                                  shape=[cell_size],
                                                  dtype=dtypes.float32)

                _, _, _, _, _, _, outputs = block_lstm(ops.convert_to_tensor(
                    sequence_length, dtype=dtypes.int64),
                                                       inputs,
                                                       w,
                                                       b,
                                                       wci=wci,
                                                       wcf=wcf,
                                                       wco=wco,
                                                       cell_clip=0,
                                                       use_peephole=True)

                sess.run([variables.global_variables_initializer()])
                block_outputs = sess.run(outputs)
                block_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                block_wgrads = sess.run(
                    gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

            self.assertAllClose(basic_outputs, block_outputs)
            self.assertAllClose(basic_grads, block_grads)
            for basic, block in zip(basic_wgrads, block_wgrads):
                self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=True)
                outputs, state = cell(inputs, dtype=dtypes.float32)

                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_state = sess.run([outputs, state[0]])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_state, fused_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Example No. 30
    def testBasicRNNFusedWrapper(self):
        """This test checks that using a wrapper for BasicRNN works as expected."""

        with self.test_session() as sess:
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            cell = core_rnn_cell_impl.BasicRNNCell(10)
            batch_size = 5
            input_size = 20
            timelen = 15
            inputs = constant_op.constant(
                np.random.randn(timelen, batch_size, input_size))
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                unpacked_inputs = array_ops.unstack(inputs)
                outputs, state = core_rnn.static_rnn(cell,
                                                     unpacked_inputs,
                                                     dtype=dtypes.float64)
                packed_outputs = array_ops.stack(outputs)
                basic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("basic/")
                ]
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([packed_outputs, state])
                basic_grads = sess.run(
                    gradients_impl.gradients(packed_outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(packed_outputs, basic_vars))

            with variable_scope.variable_scope("fused_static",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    core_rnn_cell_impl.BasicRNNCell(10))
                outputs, state = fused_cell(inputs, dtype=dtypes.float64)
                fused_static_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused_static/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_static_outputs, fused_static_state = sess.run(
                    [outputs, state])
                fused_static_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_static_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_static_vars))

            self.assertAllClose(basic_outputs, fused_static_outputs)
            self.assertAllClose(basic_state, fused_static_state)
            self.assertAllClose(basic_grads, fused_static_grads)
            for basic, fused in zip(basic_wgrads, fused_static_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

            with variable_scope.variable_scope("fused_dynamic",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    core_rnn_cell_impl.BasicRNNCell(10), use_dynamic_rnn=True)
                outputs, state = fused_cell(inputs, dtype=dtypes.float64)
                fused_dynamic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused_dynamic/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_dynamic_outputs, fused_dynamic_state = sess.run(
                    [outputs, state])
                fused_dynamic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_dynamic_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_dynamic_vars))

            self.assertAllClose(basic_outputs, fused_dynamic_outputs)
            self.assertAllClose(basic_state, fused_dynamic_state)
            self.assertAllClose(basic_grads, fused_dynamic_grads)
            for basic, fused in zip(basic_wgrads, fused_dynamic_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Example No. 31
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Benchmark training speed between GRUBlockCell vs GRUCell."""
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Specify the device to be used.
    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):

      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = core_rnn_cell_impl.GRUCell(cell_size)

        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        cost = math_ops.reduce_mean(math_ops.square(outputs_dynamic - y))
        learning_rate = 0.01
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate).minimize(cost)

        # time for a training step.
        basic_time_training = time_taken_by_op(optimizer, sess, iters)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)

        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        sess.run([variables.global_variables_initializer()])
        cost = math_ops.reduce_mean(math_ops.square(outputs_dynamic - y))
        learning_rate = 0.01
        optimizer = gradient_descent.GradientDescentOptimizer(
            learning_rate).minimize(cost)

        # time for a training step.
        block_time_training = time_taken_by_op(optimizer, sess, iters)

    performance_training = (
        basic_time_training - block_time_training) * 100 / basic_time_training

    print(",".join([
        str(batch_size), str(cell_size), str(input_size), str(time_steps), str(
            use_gpu), str(basic_time_training), str(block_time_training), str(
                performance_training)
    ]))

    return basic_time_training, block_time_training
Example No. 32
  def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size = 2
      cell_size = 3
      input_size = 4

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Gradients from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        output = gru_ops.GRUBlockCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])

        all_variables = variables.global_variables()[0:4]
        [w_ru, b_ru, w_c, b_c] = all_variables

        d_new_h_wrt_x = gradients_impl.gradients([output], x)
        d_new_h_wrt_h = gradients_impl.gradients([output], h)
        d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
        d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
        d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
        d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

        d_block_res = sess.run([
            d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru, d_new_h_wrt_w_c,
            d_new_h_wrt_b_ru, d_new_h_wrt_b_c
        ], {x: x_value,
            h: h_value})

      # Gradients from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        output = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
        sess.run([variables.global_variables_initializer()])

        all_variables = variables.global_variables()[4:8]
        [w_ru, b_ru, w_c, b_c] = all_variables

        d_new_h_wrt_x = gradients_impl.gradients([output], x)
        d_new_h_wrt_h = gradients_impl.gradients([output], h)
        d_new_h_wrt_w_ru = gradients_impl.gradients([output], w_ru)
        d_new_h_wrt_w_c = gradients_impl.gradients([output], w_c)
        d_new_h_wrt_b_ru = gradients_impl.gradients([output], b_ru)
        d_new_h_wrt_b_c = gradients_impl.gradients([output], b_c)

        d_basic_res = sess.run([
            d_new_h_wrt_x, d_new_h_wrt_h, d_new_h_wrt_w_ru, d_new_h_wrt_w_c,
            d_new_h_wrt_b_ru, d_new_h_wrt_b_c
        ], {x: x_value,
            h: h_value})

      # Check lengths of derivative results.
      self.assertEqual(len(d_block_res), len(d_basic_res))
      # Check the value of every derivative result.
      for block, basic in zip(d_block_res, d_basic_res):
        self.assertAllClose(block, basic)
Example No. 33
    def __call__(self, inputs, state, scope=None):
        """Run one step of LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: if `state_is_tuple` is False, this must be a state Tensor,
            `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
            tuple of state Tensors, both `2-D`, with column sizes `c_state` and
            `m_state`.
          scope: VariableScope for the created subgraph; defaults to "LSTMCell".

        Returns:
          A tuple containing:
          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        num_proj = self._num_units if self._num_proj is None else self._num_proj
        
        if self._state_is_tuple:
            (c_prev, m_prev) = state
        else:
            c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
            m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
        
        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
        with vs.variable_scope(scope or type(self).__name__,
                               initializer=self._initializer):  # "LSTMCell"
            i_size = input_size.value - 1  # -1 to extract time
            times = array_ops.slice(inputs, [0, i_size], [-1, 1])
            filtered_inputs = array_ops.slice(inputs, [0, 0], [-1, i_size])
            
            # --------------------------------------- #
            # ------------- PHASED LSTM ------------- #
            # ---------------- BEGIN ---------------- #
            # --------------------------------------- #
            
            tau = vs.get_variable(
                "T", shape=[self._num_units],
                initializer=(random_exp_initializer(0, self.tau_init)
                             if not self.manual_set
                             else init_ops.constant_initializer(self.tau_init)),
                trainable=self.trainable, dtype=dtype)
            
            r_on = vs.get_variable(
                "R", shape=[self._num_units],
                initializer=init_ops.constant_initializer(self.r_on_init),
                trainable=self.trainable, dtype=dtype)
            
            s = vs.get_variable(
                "S", shape=[self._num_units],
                initializer=(init_ops.random_uniform_initializer(
                                 0., tau.initialized_value())
                             if not self.manual_set
                             else init_ops.constant_initializer(0.)),
                trainable=self.trainable, dtype=dtype)
            # For backward compatibility (v < 0.12.0) use the following instead:
            # initializer=init_ops.random_uniform_initializer(0., tau), dtype=dtype
            
            tau_broadcast = tf.expand_dims(tau, dim=0)
            r_on_broadcast = tf.expand_dims(r_on, dim=0)
            s_broadcast = tf.expand_dims(s, dim=0)
            
            r_on_broadcast = tf.abs(r_on_broadcast)
            tau_broadcast = tf.abs(tau_broadcast)
            times = tf.tile(times, [1, self._num_units])
            
            # calculate kronos gate
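            # Phased LSTM time gate (Neil et al., 2016): phi is each unit's phase,
            # phi = ((t - s) mod tau) / tau in [0, 1).  The gate k rises linearly to
            # 1 during the first half of the open fraction r_on, falls back to 0
            # during the second half, and otherwise "leaks" with slope alpha so that
            # gradients can still flow while the gate is closed.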
            phi = tf.div(tf.mod(tf.mod(times - s_broadcast, tau_broadcast) + tau_broadcast, tau_broadcast),
                         tau_broadcast)
            is_up = tf.less(phi, (r_on_broadcast * 0.5))
            is_down = tf.logical_and(tf.less(phi, r_on_broadcast), tf.logical_not(is_up))
            
            # when manually setting, hard on over r_on, else as previous
            if self.manual_set:
                k = tf.select(tf.logical_or(is_up, is_down), tf.to_float(is_up), self.alpha * phi)
            else:
                k = tf.select(is_up, phi / (r_on_broadcast * 0.5),
                              tf.select(is_down, 2. - 2. * (phi / r_on_broadcast), self.alpha * phi))
            
            # --------------------------------------- #
            # ------------- PHASED LSTM ------------- #
            # ----------------- END ----------------- #
            # --------------------------------------- #
            
            concat_w = _get_concat_variable(
                "W", [i_size + num_proj, 4 * self._num_units],
                dtype, self._num_unit_shards)
            
            b = vs.get_variable(
                "B", shape=[4 * self._num_units],
                initializer=init_ops.zeros_initializer, dtype=dtype)
            
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = array_ops.concat(1, [filtered_inputs, m_prev])
            lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
            i, j, f, o = array_ops.split(1, 4, lstm_matrix)
            
            # Diagonal connections
            if self._use_peepholes:
                w_f_diag = vs.get_variable(
                    "W_F_diag", shape=[self._num_units], dtype=dtype)
                w_i_diag = vs.get_variable(
                    "W_I_diag", shape=[self._num_units], dtype=dtype)
                w_o_diag = vs.get_variable(
                    "W_O_diag", shape=[self._num_units], dtype=dtype)
            
            if self._use_peepholes:
                c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
                     sigmoid(i + w_i_diag * c_prev) * self._activation(j))
            else:
                c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
                     self._activation(j))
            
            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
                # pylint: enable=invalid-unary-operand-type
            
            if self._use_peepholes:
                m = sigmoid(o + w_o_diag * c) * self._activation(c)
            else:
                m = sigmoid(o) * self._activation(c)
            
            if self._num_proj is not None:
                concat_w_proj = _get_concat_variable(
                    "W_P", [self._num_units, self._num_proj],
                    dtype, self._num_proj_shards)
                
                m = math_ops.matmul(m, concat_w_proj)
                if self._proj_clip is not None:
                    # pylint: disable=invalid-unary-operand-type
                    m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
                    # pylint: enable=invalid-unary-operand-type
            
            # APPLY KRONOS GATE
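            # Leaky state update: where the gate is open (k close to 1) the freshly
            # computed LSTM state is used; where it is closed (k close to 0) the
            # previous state passes through almost unchanged.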
            c = k * c + (1. - k) * c_prev
            m = k * m + (1. - k) * m_prev
            # END KRONOS GATE
        
        new_state = (LSTMStateTuple(c, m) if self._state_is_tuple
                     else array_ops.concat(1, [c, m]))
        return m, new_state
Exemplo n.º 34
0
    def testBlockGRUToGRUCellMultiStep(self):
        with self.session(use_gpu=True, graph=ops.Graph()) as sess:
            batch_size = 2
            cell_size = 3
            input_size = 3
            time_steps = 4

            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = array_ops.placeholder(dtypes.float32,
                                             shape=(time_steps, batch_size,
                                                    input_size))
            h = array_ops.zeros([batch_size, cell_size])

            # Values for the inputs.
            x_values = np.random.rand(time_steps, batch_size, input_size)
            h_value = np.random.rand(batch_size, cell_size)

            # Output from the block GRU cell implementation.
            with vs.variable_scope("block", initializer=initializer):
                cell = gru_ops.GRUBlockCell(cell_size)
                outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=dtypes.float32)
                feeds = {concat_x: x_values, h: h_value}
                sess.run([variables.global_variables_initializer()])
                block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

            # Output from the basic GRU cell implementation.
            with vs.variable_scope("basic", initializer=initializer):
                cell = rnn_cell.GRUCell(cell_size)
                outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=dtypes.float32)
                feeds = {concat_x: x_values, h: h_value}
                sess.run([variables.global_variables_initializer()])
                basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

            # Check the lengths of the outputs_dynamic, and states.
            self.assertEqual(len(block_res), len(basic_res))
            self.assertEqual(len(block_res[0]), len(basic_res[0]))
            self.assertEqual(len(block_res[1]), len(basic_res[1]))

            # Check the outputs_dynamic values.
            for block_output, basic_output in zip(block_res[0], basic_res[0]):
                self.assertAllClose(block_output, basic_output)

            # Check the state_dynamic values.
            self.assertAllClose(block_res[1], basic_res[1])
Exemplo n.º 35
0
def tmp():
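    # Note: this snippet assumes module-level constants RNN_UNIT_SIZE and
    # CONTENT_DIM as well as tensors content_embeddings, content_lengths and
    # dropout_keep_prob defined elsewhere.  Only the first `if True:` branch below
    # runs; the `elif True:` branches are alternative RNN setups kept for reference.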
    initializer = init_ops.random_uniform_initializer(-0.01, 0.01)

    def lstm_cell():
        hidden_size = RNN_UNIT_SIZE
        input_size = CONTENT_DIM
        cell = tf.contrib.rnn.LSTMCell(hidden_size,
                                       input_size,
                                       initializer=initializer,
                                       state_is_tuple=True)
        return cell

    if True:
        attn_length = 16
        cells = [lstm_cell() for _ in range(2)]
        cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        cell = tf.contrib.rnn.AttentionCellWrapper(cell,
                                                   attn_length,
                                                   state_is_tuple=True)
        outputs, states = tf.nn.dynamic_rnn(cell,
                                            content_embeddings,
                                            sequence_length=content_lengths,
                                            dtype=tf.float32)
        #last_outputs = states[0][-1].h
        last_outputs = tf.concat([states[0][-1].h, states[-1]], 1)
    elif True:
        content_embeddings = tf.unstack(content_embeddings, 200, 1)
        cell = lstm_ops.LSTMBlockFusedCell(RNN_UNIT_SIZE)
        content_lengths = tf.cast(content_lengths, tf.int32)
        outputs, state = cell(content_embeddings,
                              sequence_length=content_lengths,
                              dtype=tf.float32)
        last_outputs = state.h
    elif True:
        layer_sizes = [RNN_UNIT_SIZE, RNN_UNIT_SIZE]
        cell = make_rnn_cell(layer_sizes,
                             dropout_keep_prob=dropout_keep_prob,
                             base_cell=lstm_ops.LSTMBlockCell,
                             attn_length=16)
        outputs, final_state = tf.nn.dynamic_rnn(
            cell,
            content_embeddings,
            sequence_length=content_lengths,
            swap_memory=True,
            dtype=tf.float32)
        last_outputs = final_state[-1].h
        #last_outputs = tf.concat([final_state[-1].h, final_state[0][1]], 1)
    elif True:
        cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(2)],
                                           state_is_tuple=True)
        outputs, states = tf.nn.dynamic_rnn(cell,
                                            content_embeddings,
                                            sequence_length=content_lengths,
                                            dtype=tf.float32)
        last_outputs = states[-1].h
    elif True:
        num_hidden = RNN_UNIT_SIZE
        cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=num_hidden,
                                          state_is_tuple=True)
        cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=num_hidden,
                                          state_is_tuple=True)
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            content_embeddings,
            sequence_length=content_lengths,
            dtype=tf.float32)
        output_fw, output_bw = outputs
        output_state_fw, output_state_bw = states
        #last_outputs = tf.concat([output_fw[:, 0], output_state_bw.h], 1)
        last_outputs = tf.concat([output_state_fw.h, output_state_bw.h], 1)
    elif True:
        num_hidden = RNN_UNIT_SIZE
        lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden,
                                                    forget_bias=1.0)
        lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden,
                                                    forget_bias=1.0)
        content_embeddings = tf.unstack(content_embeddings, 200, 1)
        outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
            lstm_fw_cell,
            lstm_bw_cell,
            content_embeddings,
            sequence_length=content_lengths,
            dtype=tf.float32)
        last_outputs = outputs[-1]
Exemplo n.º 36
0
    def __init__(self,
                 imgSize,
                 vocabSize,
                 embedSize,
                 use_lstm,
                 rnnHiddenSize,
                 rnnLayers,
                 start,
                 end,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 min_learning_rate,
                 training_steps_per_epoch,
                 keep_prob=0.5,
                 max_gradient_norm=5.0,
                 is_training=True):

        if is_training:
            self.global_step = tf.Variable(0, trainable=False)
            self.learning_rate = tf.maximum(
                tf.train.exponential_decay(learning_rate,
                                           self.global_step,
                                           training_steps_per_epoch,
                                           learning_rate_decay_factor,
                                           staircase=True), min_learning_rate)
            self.answers_ph = tf.placeholder(tf.int32,
                                             shape=[batch_size, 10, 20],
                                             name="answers")
            self.answer_lengths_ph = tf.placeholder(tf.int32,
                                                    shape=[batch_size, 10],
                                                    name="answer_lengths")
            self.targets_ph = tf.placeholder(tf.int32,
                                             shape=[batch_size, 10, 21],
                                             name="targets")

        self.image_feature_ph = tf.placeholder(tf.float32,
                                               shape=[batch_size, imgSize],
                                               name="image_feature")

        self.caption_ph = tf.placeholder(tf.int32,
                                         shape=[batch_size, 40],
                                         name="caption")
        self.caption_length_ph = tf.placeholder(tf.int32,
                                                shape=[batch_size],
                                                name="caption_length")

        self.questions_ph = tf.placeholder(tf.int32,
                                           shape=[batch_size, 10, 20],
                                           name="questions")
        self.question_lengths_ph = tf.placeholder(tf.int32,
                                                  shape=[batch_size, 10],
                                                  name="question_lengths")

        START = tf.constant(value=[start] * batch_size)
        END = tf.constant(value=[end] * batch_size)

        # Embedding (share)
        with ops.device("/cpu:0"):
            if vs.get_variable_scope().initializer:
                initializer = vs.get_variable_scope().initializer
            else:
                # Default initializer for embeddings should have variance=1.
                sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
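                # A uniform distribution on (-a, a) has variance a**2 / 3, so
                # a = sqrt(3) gives unit variance for the embedding entries.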
                initializer = init_ops.random_uniform_initializer(
                    -sqrt3, sqrt3)
            embedding = vs.get_variable("embedding", [vocabSize, embedSize],
                                        initializer=initializer,
                                        dtype=tf.float32)

        START_EMB = embedding_ops.embedding_lookup(embedding, START)
        END_EMB = embedding_ops.embedding_lookup(embedding, END)

        # split placeholders and embed
        questions = tf.split(
            value=self.questions_ph, num_or_size_splits=10,
            axis=1)  # list with length 10; questions[0]: [batch_size, 1, 20]
        questions = [
            tf.squeeze(input=question, axis=1) for question in questions
        ]  # list with length 10; questions[0]: [batch_size, 20]
        questions = [
            embedding_ops.embedding_lookup(embedding, question)
            for question in questions
        ]  # list with length 10; questions[0]: [batch_size, 20, embedSize]

        question_lengths = tf.split(value=self.question_lengths_ph,
                                    num_or_size_splits=10,
                                    axis=1)
        question_lengths = [
            tf.squeeze(question_length) for question_length in question_lengths
        ]

        if is_training:
            answers = tf.split(value=self.answers_ph,
                               num_or_size_splits=10,
                               axis=1)
            answers = [tf.squeeze(input=answer, axis=1) for answer in answers]
            answers = [
                embedding_ops.embedding_lookup(embedding, answer)
                for answer in answers
            ]

            answer_lengths = tf.split(value=self.answer_lengths_ph,
                                      num_or_size_splits=10,
                                      axis=1)
            answer_lengths = [
                tf.squeeze(answer_length) for answer_length in answer_lengths
            ]

            targets = tf.split(value=self.targets_ph,
                               num_or_size_splits=10,
                               axis=1)
            targets = [tf.squeeze(input=target, axis=1) for target in targets]

            weights = []
            for r in range(10):
                weight = []
                answer_length = answer_lengths[r]
                for i in range(21):
                    weight.append(tf.greater_equal(x=answer_length, y=i))
                weight = tf.cast(x=tf.stack(values=weight, axis=1),
                                 dtype=tf.float32)  # [batch_size, 21]
                weights.append(weight)

        # make RNN cell
        def single_cell():
            return GRUCell(rnnHiddenSize)

        if use_lstm:

            def single_cell():
                return BasicLSTMCell(rnnHiddenSize, state_is_tuple=False)

        make_cell = single_cell
        if rnnLayers > 1:

            def make_cell():
                return MultiRNNCell([single_cell() for _ in range(rnnLayers)],
                                    state_is_tuple=False)

        encoder_cell = make_cell()
        decoder_cell = OutputProjectionWrapper(cell=make_cell(),
                                               output_size=vocabSize,
                                               activation=None)

        # caption feature
        caption = embedding_ops.embedding_lookup(
            embedding, self.caption_ph)  # [batch_size, 40, embedSize]
        caption_length = tf.squeeze(self.caption_length_ph)
        with tf.variable_scope('EncoderRNN') as varscope:
            _, captionState = dynamic_rnn(
                cell=encoder_cell,
                inputs=caption,
                sequence_length=caption_length,
                dtype=tf.float32,
                scope=varscope)  # [batch_size, encoder_cell.state_size]

        if is_training:
            losses = []
        else:
            ans_word_probs = []

        for r in range(10):
            # 1. question
            with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
                _, questionState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=questions[r],
                    sequence_length=question_lengths[r],
                    dtype=tf.float32,
                    scope=varscope)

            # 2. history
            if r == 0:
                historyState = captionState

            # 3. fusion
            concat = tf.concat(
                values=[self.image_feature_ph, questionState, historyState],
                axis=1)
            if is_training:
                concat = tf.nn.dropout(x=concat, keep_prob=keep_prob)
            with tf.variable_scope('Fusion', reuse=(r > 0)) as varscope:
                encoder_state = tf.contrib.layers.fully_connected(
                    inputs=concat,
                    num_outputs=decoder_cell.state_size,
                    activation_fn=tf.nn.tanh,
                    scope=varscope)

            # 4. decoder
            with tf.variable_scope('DecoderRNN', reuse=(r > 0)) as varscope:
                if is_training:
                    answer = [
                        tf.squeeze(input=word, axis=1) for word in tf.split(
                            value=answers[r], num_or_size_splits=20, axis=1)
                    ]
                    decoder_outputs, _ = rnn_decoder(
                        decoder_inputs=[START_EMB] + answer,
                        initial_state=encoder_state,
                        cell=decoder_cell,
                        loop_function=None,
                        scope=varscope)
                else:
                    self_answer = []
                    self_answer_emb = []

                    def loop_function(prev, _):
                        prev_symbol = math_ops.argmax(prev, 1)
                        self_answer.append(
                            tf.cast(x=prev_symbol, dtype=tf.int32))
                        emb_prev = embedding_ops.embedding_lookup(
                            embedding, prev_symbol)
                        self_answer_emb.append(emb_prev)
                        return emb_prev

                    decoder_outputs, _ = rnn_decoder(
                        decoder_inputs=[START_EMB] * 21,
                        initial_state=encoder_state,
                        cell=decoder_cell,
                        loop_function=loop_function,
                        scope=varscope)

            # 5. update history
            with tf.variable_scope('EncoderRNN', reuse=True) as varscope:
                _, historyState = dynamic_rnn(
                    cell=encoder_cell,
                    inputs=questions[r],
                    sequence_length=question_lengths[r],
                    initial_state=historyState,
                    scope=varscope)
                if is_training:
                    _, historyState = dynamic_rnn(
                        cell=encoder_cell,
                        inputs=answers[r],
                        sequence_length=answer_lengths[r],
                        initial_state=historyState,
                        scope=varscope)
                else:
                    self_answer = tf.stack(values=self_answer + [END],
                                           axis=1)  # [batch_size, 21]
                    self_answer_length = tf.argmax(input=tf.cast(
                        x=tf.equal(x=self_answer, y=end), dtype=tf.float32),
                                                   axis=1)
                    self_answer_emb = tf.stack(
                        values=self_answer_emb,
                        axis=1)  # [batch_size, 20, embSize]
                    _, historyState = dynamic_rnn(
                        cell=encoder_cell,
                        inputs=self_answer_emb,
                        sequence_length=self_answer_length,
                        initial_state=historyState,
                        scope=varscope)

            if is_training:
                decoder_outputs = tf.stack(
                    values=decoder_outputs,
                    axis=1)  # [batch_size, 21, vocabSize]
                loss = tf.contrib.seq2seq.sequence_loss(
                    logits=decoder_outputs,
                    targets=targets[r],
                    weights=weights[r],
                    average_across_batch=False)  # [batch_size]
                losses.append(loss)
            else:
                decoder_outputs = [
                    tf.log(tf.nn.softmax(decoder_output))
                    for decoder_output in decoder_outputs
                ]
                ans_word_probs.append(
                    tf.stack(values=decoder_outputs,
                             axis=1))  # [batch_size, 21, vocabSize]
        if is_training:
            losses = tf.stack(values=losses, axis=1)  # [batch_size, 10]
            self.loss = tf.reduce_mean(losses)
            params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.opt_op = tf.train.AdamOptimizer(
                self.learning_rate).apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
        else:
            self.ans_word_probs = tf.stack(
                values=ans_word_probs,
                axis=1)  # [batch_size, 10, 21, vocabSize]

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=99999999)
Exemplo n.º 37
0
    def _init_weights(self, inputs):
        """Initialize the weights.

        Args:
          inputs: `2-D` tensor with shape `[batch_size x input_size]`,
            the input to the cell. Needed for calculating weight shapes.

        Returns:
          A dict of weight name:tensorflow-weight pairs.
        """
        dtype = inputs.dtype
        bias_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, dtype=dtype) \
            if self._bias_initializer is None else self._bias_initializer
        weight_initializer = init_ops.random_uniform_initializer(-0.1, 0.1, dtype=dtype) \
            if self._weight_initializer is None else self._weight_initializer
        forget_bias_initializer = init_ops.constant_initializer(1.0, dtype=dtype) \
            if self._forget_bias_initializer is None else self._forget_bias_initializer

        weight_dict = {}

        # initialize shared weights
        with vs.variable_scope(self._shared_scope) as scope:
            for weight_name in self._shared_weights:
                if weight_name == _B_F:
                    with vs.variable_scope(scope) as bias_scope:
                        bias_scope.set_partitioner(None)
                        weight = vs.get_variable(
                            name=weight_name,
                            shape=self._get_weight_shape(weight_name, inputs),
                            dtype=dtype,
                            initializer=forget_bias_initializer)
                elif weight_name not in _BIASES:
                    weight = vs.get_variable(name=weight_name,
                                             shape=self._get_weight_shape(
                                                 weight_name, inputs),
                                             dtype=dtype,
                                             initializer=weight_initializer)
                else:
                    with vs.variable_scope(scope) as bias_scope:
                        bias_scope.set_partitioner(None)
                        weight = vs.get_variable(name=weight_name,
                                                 shape=self._get_weight_shape(
                                                     weight_name, inputs),
                                                 dtype=dtype,
                                                 initializer=bias_initializer)
                weight_dict[weight_name] = weight

        # initialize local weights
        for weight_name in _WEIGHTS | _UEIGHTS | _NEIGHBOUR_UEIGHTS:
            if weight_name not in self._shared_weights:
                weight = vs.get_variable(name=weight_name,
                                         shape=self._get_weight_shape(
                                             weight_name, inputs),
                                         dtype=dtype,
                                         initializer=weight_initializer)
                weight_dict[weight_name] = weight
        for weight_name in _BIASES:
            if weight_name not in self._shared_weights:
                if weight_name == _B_F:
                    weight = vs.get_variable(
                        name=weight_name,
                        shape=self._get_weight_shape(weight_name, inputs),
                        dtype=dtype,
                        initializer=forget_bias_initializer)
                else:
                    weight = vs.get_variable(name=weight_name,
                                             shape=self._get_weight_shape(
                                                 weight_name, inputs),
                                             dtype=dtype,
                                             initializer=bias_initializer)
                weight_dict[weight_name] = weight

        return weight_dict
Exemplo n.º 38
0
    def build(self, inputs_shape):
        if inputs_shape[1].value is None:
            raise ValueError(
                "Expected inputs.shape[-1] to be known, saw shape: %s" %
                inputs_shape)

        input_depth = inputs_shape[1].value
        self._input_kernel = self.add_variable(
            "input_kernel", shape=[input_depth, self._num_units])
        #initializer=self._input_initializer)

        self._input_kernel_top = self.add_variable(
            "input_kernel_top", shape=[self._num_units, self._num_units])
        #initializer=self._input_initializer)

        self._hierarchy_kernel1 = self.add_variable(
            "hierarchy_kernel1", shape=[self._num_units, self._num_units])

        #if self.topdown:
        #    self._hierarchy_kernel1 = clip_ops.clip_by_norm(self._hierarchy_kernel1, self._recurrent_max_abs, axes=1)
        #    if self._layer_idx > 1:
        #        self._input_kernel = clip_ops.clip_by_norm(self._input_kernel, self._recurrent_max_abs, axes=0)
        #        '''
        #        _input_kernel_top = None
        #        W_l2norm = math_ops.sqrt(math_ops.matmul(self._hierarchy_kernel1, _input_kernel_top))
        #        _input_kernel_top = _input_kernel_top * self._recurrent_max_abs / tf.maximum(self._recurrent_max_abs_tensor, W_l2norm)
        #        self._hierarchy_kernel1 = self._hierarchy_kernel1 * self._recurrent_max_abs / tf.maximum(self._recurrent_max_abs_tensor, W_l2norm)
        #        '''

        if self._recurrent_initializer is None:
            # Initialize the recurrent weights uniformly in [-max_abs, max_abs] or
            # [-1, 1] if max_abs exceeds 1
            init_bound = 1.0
            if self._recurrent_max_abs and self._recurrent_max_abs < init_bound:
                init_bound = self._recurrent_max_abs

            self._recurrent_initializer = init_ops.random_uniform_initializer(
                minval=-init_bound, maxval=init_bound)

        self._recurrent_kernel = self.add_variable(
            "recurrent_kernel",
            shape=[self._num_units],
            initializer=self._recurrent_initializer)

        # Clip the absolute values of the recurrent weights to the specified minimum
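        # math_ops.sign(w) * max(|w|, min_abs) raises each weight's magnitude to
        # at least min_abs while preserving its sign.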
        if self._recurrent_min_abs:
            abs_kernel = math_ops.abs(self._recurrent_kernel)
            min_abs_kernel = math_ops.maximum(abs_kernel,
                                              self._recurrent_min_abs)
            self._recurrent_kernel = math_ops.multiply(
                math_ops.sign(self._recurrent_kernel), min_abs_kernel)

        # Clip the absolute values of the recurrent weights to the specified maximum
        if self._recurrent_max_abs:
            self._recurrent_kernel = clip_ops.clip_by_value(
                self._recurrent_kernel, -self._recurrent_max_abs,
                self._recurrent_max_abs)

        self._hierarchy_kernel = self.add_variable(
            "hierarchy_kernel",
            shape=[self._num_units],
            initializer=self._recurrent_initializer)

        if self._recurrent_min_abs:
            abs_kernel = math_ops.abs(self._hierarchy_kernel)
            min_abs_kernel = math_ops.maximum(abs_kernel,
                                              self._recurrent_min_abs)
            self._hierarchy_kernel = math_ops.multiply(
                math_ops.sign(self._hierarchy_kernel), min_abs_kernel)

        if self._recurrent_max_abs:
            self._hierarchy_kernel = clip_ops.clip_by_value(
                self._hierarchy_kernel, -self._recurrent_max_abs,
                self._recurrent_max_abs)

        self._bias = self.add_variable(
            "bias",
            shape=[self._num_units],
            initializer=init_ops.zeros_initializer(dtype=self.dtype))

        if self._batch_norm:
            self.bn = tf.keras.layers.BatchNormalization(momentum=0.9)

        self.built = True
Exemplo n.º 39
0
 def testInitializerDifferent(self):
   for dtype in [dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64]:
     init1 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
     init2 = init_ops.random_uniform_initializer(0, 7, seed=2, dtype=dtype)
     self.assertFalse(identicaltest(self, init1, init2))
Exemplo n.º 40
0
class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
    def _assert_all_close(self, expected, actual, atol=0.001):
        if not context.executing_eagerly():
            with self.cached_session() as sess:
                keras_backend._initialize_variables(sess)
                self.assertAllClose(expected, actual, atol=atol)
        else:
            self.assertAllClose(expected, actual, atol=atol)

    @test_util.run_in_graph_and_eager_modes()
    def test_invalid_output_dim(self):
        with self.assertRaisesRegexp(
                ValueError,
                r'`output_dim` should be a positive integer. Given: -3.'):
            _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)

    @test_util.run_in_graph_and_eager_modes()
    def test_unsupported_kernel_type(self):
        with self.assertRaisesRegexp(
                ValueError,
                r'Unsupported kernel type: \'unsupported_kernel\'.'):
            _ = kernel_layers.RandomFourierFeatures(3,
                                                    'unsupported_kernel',
                                                    stddev=2.0)

    @test_util.run_in_graph_and_eager_modes()
    def test_invalid_scale(self):
        with self.assertRaisesRegexp(
                ValueError,
                r'When provided, `scale` should be a positive float. Given: 0.0.'
        ):
            _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)

    @test_util.run_in_graph_and_eager_modes()
    def test_invalid_input_shape(self):
        inputs = random_ops.random_uniform((3, 2, 4), seed=1)
        rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10,
                                                        scale=3.0)
        with self.assertRaisesRegexp(
                ValueError,
                r'The rank of the input tensor should be 2. Got 3 instead.'):
            _ = rff_layer(inputs)

    @parameterized.named_parameters(
        ('gaussian', 'gaussian', 10.0, False),
        ('random', init_ops.random_uniform_initializer, 1.0, True))
    @test_util.run_in_graph_and_eager_modes()
    def test_random_features_properties(self, initializer, scale, trainable):
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=10,
            kernel_initializer=initializer,
            scale=scale,
            trainable=trainable)
        self.assertEqual(rff_layer.output_dim, 10)
        self.assertEqual(rff_layer.kernel_initializer, initializer)
        self.assertEqual(rff_layer.scale, scale)
        self.assertEqual(rff_layer.trainable, trainable)

    @parameterized.named_parameters(('gaussian', 'gaussian', False),
                                    ('laplacian', 'laplacian', True),
                                    ('other', init_ops.ones_initializer, True))
    @test_util.run_in_graph_and_eager_modes()
    def test_call(self, initializer, trainable):
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=10,
            kernel_initializer=initializer,
            scale=1.0,
            trainable=trainable,
            name='random_fourier_features')
        inputs = random_ops.random_uniform((3, 2), seed=1)
        outputs = rff_layer(inputs)
        self.assertListEqual([3, 10], outputs.shape.as_list())
        num_trainable_vars = 1 if trainable else 0
        self.assertLen(rff_layer.non_trainable_variables,
                       3 - num_trainable_vars)

    @test_util.assert_no_new_pyobjects_executing_eagerly
    def test_no_eager_Leak(self):
        # Tests that repeatedly constructing and building a Layer does not leak
        # Python objects.
        inputs = random_ops.random_uniform((5, 4), seed=1)
        kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
        kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)

    @test_util.run_in_graph_and_eager_modes()
    def test_output_shape(self):
        inputs = random_ops.random_uniform((3, 2), seed=1)
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=7, name='random_fourier_features', trainable=True)
        outputs = rff_layer(inputs)
        self.assertEqual([3, 7], outputs.shape.as_list())

    @parameterized.named_parameters(
        ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
        ('other', init_ops.random_uniform_initializer))
    def test_call_on_placeholder(self, initializer):
        with ops.Graph().as_default():
            inputs = array_ops.placeholder(dtype=dtypes.float32,
                                           shape=[None, None])
            rff_layer = kernel_layers.RandomFourierFeatures(
                output_dim=5,
                kernel_initializer=initializer,
                name='random_fourier_features')
            with self.assertRaisesRegexp(
                    ValueError, r'The last dimension of the inputs to '
                    '`RandomFourierFeatures` should be defined. Found `None`.'
            ):
                rff_layer(inputs)

            inputs = array_ops.placeholder(dtype=dtypes.float32,
                                           shape=[2, None])
            rff_layer = kernel_layers.RandomFourierFeatures(
                output_dim=5,
                kernel_initializer=initializer,
                name='random_fourier_features')
            with self.assertRaisesRegexp(
                    ValueError, r'The last dimension of the inputs to '
                    '`RandomFourierFeatures` should be defined. Found `None`.'
            ):
                rff_layer(inputs)

            inputs = array_ops.placeholder(dtype=dtypes.float32,
                                           shape=[None, 3])
            rff_layer = kernel_layers.RandomFourierFeatures(
                output_dim=5, name='random_fourier_features')
            rff_layer(inputs)

    @parameterized.named_parameters(
        ('gaussian', 10, 'gaussian', 2.0), ('laplacian', 5, 'laplacian', None),
        ('other', 10, init_ops.ones_initializer, 1.0))
    @test_util.run_in_graph_and_eager_modes()
    def test_compute_output_shape(self, output_dim, initializer, scale):
        rff_layer = kernel_layers.RandomFourierFeatures(output_dim,
                                                        initializer,
                                                        scale=scale,
                                                        name='rff')
        with self.assertRaises(ValueError):
            rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
        with self.assertRaises(ValueError):
            rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
        with self.assertRaises(ValueError):
            rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
        with self.assertRaises(ValueError):
            rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))

        with self.assertRaisesRegexp(
                ValueError,
                r'The innermost dimension of input shape must be defined.'):
            rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))

        self.assertEqual([None, output_dim],
                         rff_layer.compute_output_shape((None, 3)).as_list())
        self.assertEqual([None, output_dim],
                         rff_layer.compute_output_shape(
                             tensor_shape.TensorShape([None, 2])).as_list())
        self.assertEqual([4, output_dim],
                         rff_layer.compute_output_shape((4, 1)).as_list())

    @parameterized.named_parameters(
        ('gaussian', 10, 'gaussian', 3.0, False),
        ('laplacian', 5, 'laplacian', 5.5, True),
        ('other', 7, init_ops.random_uniform_initializer(), None, True))
    @test_util.run_in_graph_and_eager_modes()
    def test_get_config(self, output_dim, initializer, scale, trainable):
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim,
            initializer,
            scale=scale,
            trainable=trainable,
            name='random_fourier_features',
        )
        expected_initializer = initializer
        if isinstance(initializer, init_ops.Initializer):
            expected_initializer = initializers.serialize(initializer)

        expected_dtype = ('float32'
                          if base_layer_utils.v2_dtype_behavior_enabled() else
                          None)
        expected_config = {
            'output_dim': output_dim,
            'kernel_initializer': expected_initializer,
            'scale': scale,
            'name': 'random_fourier_features',
            'trainable': trainable,
            'dtype': expected_dtype,
        }
        self.assertLen(expected_config, len(rff_layer.get_config()))
        self.assertSameElements(list(expected_config.items()),
                                list(rff_layer.get_config().items()))

    @parameterized.named_parameters(
        ('gaussian', 5, 'gaussian', None, True),
        ('laplacian', 5, 'laplacian', 5.5, False),
        ('other', 7, init_ops.ones_initializer(), 2.0, True))
    @test_util.run_in_graph_and_eager_modes()
    def test_from_config(self, output_dim, initializer, scale, trainable):
        model_config = {
            'output_dim': output_dim,
            'kernel_initializer': initializer,
            'scale': scale,
            'trainable': trainable,
            'name': 'random_fourier_features',
        }
        rff_layer = kernel_layers.RandomFourierFeatures.from_config(
            model_config)
        self.assertEqual(rff_layer.output_dim, output_dim)
        self.assertEqual(rff_layer.kernel_initializer, initializer)
        self.assertEqual(rff_layer.scale, scale)
        self.assertEqual(rff_layer.trainable, trainable)

        inputs = random_ops.random_uniform((3, 2), seed=1)
        outputs = rff_layer(inputs)
        self.assertListEqual([3, output_dim], outputs.shape.as_list())
        num_trainable_vars = 1 if trainable else 0
        self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
        if trainable:
            self.assertEqual('random_fourier_features/random_features_scale:0',
                             rff_layer.trainable_variables[0].name)
        self.assertLen(rff_layer.non_trainable_variables,
                       3 - num_trainable_vars)

    @parameterized.named_parameters(
        ('gaussian', 10, 'gaussian', 3.0, True),
        ('laplacian', 5, 'laplacian', 5.5, False),
        ('other', 10, init_ops.random_uniform_initializer(), None, True))
    @test_util.run_in_graph_and_eager_modes()
    def test_same_random_features_params_reused(self, output_dim, initializer,
                                                scale, trainable):
        """Applying the layer on the same input twice gives the same output."""
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=output_dim,
            kernel_initializer=initializer,
            scale=scale,
            trainable=trainable,
            name='random_fourier_features')
        inputs = constant_op.constant(
            np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
        output1 = rff_layer(inputs)
        output2 = rff_layer(inputs)
        self._assert_all_close(output1, output2)

    @parameterized.named_parameters(
        ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0),
        ('other', init_ops.random_uniform_initializer(), 5.0))
    @test_util.run_in_graph_and_eager_modes()
    def test_different_params_similar_approximation(self, initializer, scale):
        random_seed.set_random_seed(12345)
        rff_layer1 = kernel_layers.RandomFourierFeatures(
            output_dim=3000,
            kernel_initializer=initializer,
            scale=scale,
            name='rff1')
        rff_layer2 = kernel_layers.RandomFourierFeatures(
            output_dim=2000,
            kernel_initializer=initializer,
            scale=scale,
            name='rff2')
        # Two distinct inputs.
        x = constant_op.constant([[1.0, -1.0, 0.5]])
        y = constant_op.constant([[-1.0, 1.0, 1.0]])

        # Apply both layers to both inputs.
        output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1(x)
        output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1(y)
        output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2(x)
        output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2(y)

        # Compute the inner products of the outputs (on inputs x and y) for both
        # layers. For any fixed random features layer rff_layer, and inputs x, y,
        # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
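        # With random Fourier features z(x) = cos(W x + b), the estimator
        # (2 / output_dim) * z(x)^T z(y) is an unbiased Monte Carlo approximation
        # of the corresponding shift-invariant kernel K(x, y); the sqrt(2 / D)
        # scaling applied to each layer's output above provides that factor.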
        approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
        approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
        self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)

    @parameterized.named_parameters(
        ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
        ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
    @test_util.run_in_graph_and_eager_modes()
    def test_bad_kernel_approximation(self, initializer, scale,
                                      exact_kernel_fn):
        """Approximation is bad when output dimension is small."""
        # Two distinct inputs.
        x = constant_op.constant([[1.0, -1.0, 0.5]])
        y = constant_op.constant([[-1.0, 1.0, 1.0]])

        small_output_dim = 10
        random_seed.set_random_seed(1234)
        # Initialize layer.
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=small_output_dim,
            kernel_initializer=initializer,
            scale=scale,
            name='random_fourier_features')

        # Apply layer to both inputs.
        output_x = math.sqrt(2.0 / small_output_dim) * rff_layer(x)
        output_y = math.sqrt(2.0 / small_output_dim) * rff_layer(y)

        # The inner product of the outputs (on inputs x and y) approximates the
        # true value of the RBF kernel, but only poorly, since the output
        # dimension of the layer is small.
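        # The Monte Carlo error of the approximation shrinks roughly as
        # O(1 / sqrt(output_dim)), so with only 10 random features a sizable
        # absolute error is expected (and asserted below).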
        exact_kernel_value = exact_kernel_fn(x, y)
        approx_kernel_value = kernelized_utils.inner_product(
            output_x, output_y)
        abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
        if not context.executing_eagerly():
            with self.cached_session() as sess:
                keras_backend._initialize_variables(sess)
                abs_error_eval = sess.run([abs_error])
                self.assertGreater(abs_error_eval[0][0], 0.05)
                self.assertLess(abs_error_eval[0][0], 0.5)
        else:
            self.assertGreater(abs_error, 0.05)
            self.assertLess(abs_error, 0.5)

    @parameterized.named_parameters(
        ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
        ('laplacian', 'laplacian', 10.0, _exact_laplacian(stddev=10.0)))
    @test_util.run_in_graph_and_eager_modes()
    def test_good_kernel_approximation_multiple_inputs(self, initializer,
                                                       scale, exact_kernel_fn):
        # Parameters.
        input_dim = 5
        output_dim = 2000
        x_rows = 20
        y_rows = 30

        x = constant_op.constant(np.random.uniform(size=(x_rows, input_dim)),
                                 dtype=dtypes.float32)
        y = constant_op.constant(np.random.uniform(size=(y_rows, input_dim)),
                                 dtype=dtypes.float32)

        random_seed.set_random_seed(1234)
        rff_layer = kernel_layers.RandomFourierFeatures(
            output_dim=output_dim,
            kernel_initializer=initializer,
            scale=scale,
            name='random_fourier_features')

        # The shapes of output_x and output_y are (x_rows, output_dim) and
        # (y_rows, output_dim) respectively.
        output_x = math.sqrt(2.0 / output_dim) * rff_layer(x)
        output_y = math.sqrt(2.0 / output_dim) * rff_layer(y)

        approx_kernel_matrix = kernelized_utils.inner_product(
            output_x, output_y)
        exact_kernel_matrix = exact_kernel_fn(x, y)
        self._assert_all_close(approx_kernel_matrix,
                               exact_kernel_matrix,
                               atol=0.05)
Exemplo n.º 41
0
  def testWarmStartEmbeddingColumnLinearModel(self):
    # Create old and new vocabs for embedding column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
        "new_vocab")

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_vocab_embedding/embedding_weights",
            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
        variable_scope.get_variable(
            "linear_model/sc_vocab_embedding/weights",
            initializer=[[0.69], [0.71]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # Create feature columns.
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    emb_vocab = fc.embedding_column(
        categorical_column=sc_vocab,
        dimension=2)
    all_deep_cols = [emb_vocab]
    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = {}
        with variable_scope.variable_scope("", partitioner=_partitioner):
          # Create the variables.
          fc.linear_model(
              features=self._create_dummy_inputs(),
              feature_columns=all_deep_cols,
              cols_to_vars=cols_to_vars)

        # Construct the vocab_info for the embedding weight.
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path,
            # Can't use constant_initializer with load_and_remap.  In practice,
            # use a truncated normal initializer.
            backup_initializer=init_ops.random_uniform_initializer(
                minval=0.42, maxval=0.42)
        )
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            vars_to_warmstart=".*sc_vocab.*",
            var_name_to_vocab_info={
                "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
            })
        ws_util._warmstart(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted. Var corresponding to
        # emb_vocab should be correctly warmstarted after vocab remapping.
        # Missing values are filled in with the EmbeddingColumn's initializer.
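        # Rows of the old embedding are remapped by token: "orange", "guava",
        # "banana" and "apple" keep their old vectors ([3., 3.3], [2., 2.2],
        # [1., 1.1], [0.5, 0.4]) at their new positions, while the new tokens
        # "raspberry" and "blueberry" fall back to the backup_initializer
        # value of 0.42.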
        self._assert_cols_to_vars(
            cols_to_vars, {
                emb_vocab: [
                    # embedding_weights part 0.
                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
                    # embedding_weights part 1.
                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
                    # linear weights part 0.
                    np.array([[0.69]]),
                    # linear weights part 1.
                    np.array([[0.71]])
                ]
            }, sess)
Exemplo n.º 43
0
  def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
    batch_size = 2
    cell_size = 3
    input_size = 4
    time_steps = 2
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_values = np.random.rand(time_steps, batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)
      feeds = {concat_x: x_values, h: h_value}

      # Gradients from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)

        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        grad_output_wrt_x = gradients_impl.gradients([outputs_dynamic[0]],
                                                     concat_x)
        grad_output_wrt_h = gradients_impl.gradients([outputs_dynamic[0]], h)

        sess.run([variables.global_variables_initializer()])
        block_grad_res_x, block_grad_res_h = sess.run(
            [grad_output_wrt_x, grad_output_wrt_h], feeds)

      # Gradients from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = core_rnn_cell_impl.GRUCell(cell_size)

        outputs_dynamic, _ = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        grad_output_wrt_x = gradients_impl.gradients([outputs_dynamic[0]],
                                                     concat_x)
        grad_output_wrt_h = gradients_impl.gradients([outputs_dynamic[0]], h)

        sess.run([variables.global_variables_initializer()])
        basic_grad_res_x, basic_grad_res_h = sess.run(
            [grad_output_wrt_x, grad_output_wrt_h], feeds)

    # Check the number of derivative results for the outputs w.r.t. x.
    self.assertEqual(len(block_grad_res_x), len(basic_grad_res_x))

    # Check the derivative values of the outputs w.r.t. x.
    for block, basic in zip(block_grad_res_x, basic_grad_res_x):
      self.assertAllClose(block, basic)

    # Check the number of derivative results for the outputs w.r.t. h.
    self.assertEqual(len(block_grad_res_h), len(basic_grad_res_h))

    # Check the derivative values of the outputs w.r.t. h.
    for block, basic in zip(block_grad_res_h, basic_grad_res_h):
      self.assertAllClose(block, basic)
Exemplo n.º 44
0
    def call(self, inputs, state):
        """Run one step of LSTM.
    
        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: if `state_is_tuple` is False, this must be a state Tensor,
            `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
            tuple of state Tensors, both `2-D`, with column sizes `c_state` and
            `m_state`.
    
        Returns:
          A tuple containing:
    
          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.
    
        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        num_proj = self._num_units if self._num_proj is None else self._num_proj
        
        if self._state_is_tuple:
            (c_prev, m_prev) = state
        else:
            c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
            m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
        
        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]

        if input_size.value is None:
            raise ValueError("Could not infer input size from inputs.get_shape()[-1]")

        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ---------------- BEGIN ---------------- #
        # --------------------------------------- #

        i_size = input_size.value - 1  # -1 to extract time
        times = array_ops.slice(inputs, [0, i_size], [-1, 1])
        filtered_inputs = array_ops.slice(inputs, [0, 0], [-1, i_size])

        tau = vs.get_variable(
            "T", shape=[self._num_units],
            initializer=random_exp_initializer(0, self.tau_init) if not self.manual_set else init_ops.constant_initializer(self.tau_init),
            trainable=self.trainable, dtype=dtype)

        r_on = vs.get_variable(
            "R", shape=[self._num_units],
            initializer=init_ops.constant_initializer(self.r_on_init),
            trainable=self.trainable, dtype=dtype)

        s = vs.get_variable(
            "S", shape=[self._num_units],
            initializer=init_ops.random_uniform_initializer(0., tau.initialized_value()) if not self.manual_set else init_ops.constant_initializer(0.),
            trainable=self.trainable, dtype=dtype)

        tau_broadcast = tf.expand_dims(tau, axis=0)
        r_on_broadcast = tf.expand_dims(r_on, axis=0)
        s_broadcast = tf.expand_dims(s, axis=0)

        r_on_broadcast = tf.abs(r_on_broadcast)
        tau_broadcast = tf.abs(tau_broadcast)
        times = tf.tile(times, [1, self._num_units])

        # calculate kronos gate
        phi = tf.div(tf.mod(tf.mod(times - s_broadcast, tau_broadcast) + tau_broadcast, tau_broadcast), tau_broadcast)
        is_up = tf.less(phi, (r_on_broadcast * 0.5))
        is_down = tf.logical_and(tf.less(phi, r_on_broadcast), tf.logical_not(is_up))

        k = tf.where(is_up, phi / (r_on_broadcast * 0.5), tf.where(is_down, 2. - 2. * (phi / r_on_broadcast), self.alpha * phi))

        lstm_matrix = math_ops.matmul(array_ops.concat([filtered_inputs, m_prev], 1), self._kernel)
        lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)

        # --------------------------------------- #
        # ------------- PHASED LSTM ------------- #
        # ----------------- END ----------------- #
        # --------------------------------------- #

        i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)

        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
                 sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * self._activation(j))

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type

        if self._use_peepholes:
            m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
        else:
            m = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:

            m = math_ops.matmul(m, self._proj_kernel)

            if self._proj_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
                # pylint: enable=invalid-unary-operand-type

        # APPLY KRONOS GATE
        c = k * c + (1. - k) * c_prev
        m = k * m + (1. - k) * m_prev
        # END KRONOS GATE
        
        new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else array_ops.concat([c, m], 1))
        return m, new_state
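
The cell above expects the timestamp to arrive as the last column of `inputs` (it slices that column off before the usual LSTM matmul). A minimal usage sketch, assuming the enclosing class is named `PhasedLSTMCell` (a hypothetical name) and follows the standard `RNNCell` interface:

# Sketch only: `PhasedLSTMCell` is a hypothetical name for the class that owns
# the `call` method above and is assumed to follow the tf.nn.rnn_cell.RNNCell API.
import tensorflow as tf

batch, steps, features, units = 4, 20, 8, 16
x = tf.placeholder(tf.float32, [batch, steps, features], name="features")
t = tf.placeholder(tf.float32, [batch, steps, 1], name="timestamps")

# The cell slices the last column off as time, so append t as the final feature.
inputs = tf.concat([x, t], axis=2)

cell = PhasedLSTMCell(units)  # hypothetical constructor
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
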
Exemplo n.º 45
0
def embedding_attention_seq2seq(encoder_inputs,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                batch_size,
                                state_size,
                                decoder_inputs_positions=None,
                                decoder_inputs_maps=None,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None):
    """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x cell.input_size]). Then it runs an RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  RNN at every step to use for attention later. Next, it embeds decoder_inputs
  by another newly created embedding (of shape [num_decoder_symbols x
  cell.input_size]). Then it runs attention decoder, initialized with the last
  encoder state, on embedded decoder_inputs and attending to encoder outputs.

  Args:
    encoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    decoder_inputs: a list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    batch_size: integer; the batch size, passed on to the decoder.
    decoder_inputs_positions: a list of 2D Tensors of shape [batch_size, 3].
    decoder_inputs_maps: a 1D Tensor of length batch_size.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    attentions: a list of 2D Tensors of shape [batch_size, cell.state_size].
    environments: a list of 2D Tensors of shape [batch_size, state_size].

  """
    with vs.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder.
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell,
            num_encoder_symbols,
            initializer=init_ops.random_uniform_initializer(-0.08, 0.08))
        encoder_outputs, encoder_states = rnn.rnn(encoder_cell,
                                                  encoder_inputs,
                                                  dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [
            array_ops.reshape(e, [-1, 1, cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(1, top_states)

        output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(
                decoder_inputs,
                encoder_states[-1],
                attention_states,
                cell,
                num_decoder_symbols,
                batch_size,
                state_size,
                decoder_inputs_positions=decoder_inputs_positions,
                decoder_inputs_maps=decoder_inputs_maps,
                output_size=output_size,
                feed_previous=feed_previous)
        else:  # If feed_previous is a Tensor, we construct 2 graphs and use cond.
            # We don't consider this case.
            raise ValueError("Incompatible variable feed_previous.")
Exemplo n.º 46
0
    def __call__(
            self,
            inputs,  # the input holds both the x features and the timestamp t
            state,  # the state holds the cell state and the hidden state
            scope=None):
        """
            Phased long short-term memory cell (P-LSTM).

        """

        with vs.variable_scope(scope or type(self).__name__):

            # Parameters of gates are concatenated into one multiply for efficiency.
            # The initial state is a tuple (c, h).

            if isinstance(state, tuple):  # if the state is a tuple, split it directly
                c_prev, h_prev = state
            else:  # otherwise it is a single tensor; split it along the second dimension
                c_prev, h_prev = array_ops.split(value=state,
                                                 num_or_size_splits=2,
                                                 axis=1)

            # (2, batch_size, seq_len)

            # NB: here we explicitly give t as input.

            # The last dimension of `inputs` has length 2: the first column is
            # the x input, the second column is the time variable.

            x = tf.reshape(inputs[:, 0], (-1, 1))  # keep a length-1 second dimension

            # Take the timestamp of the last element in the batch.

            t = inputs[:, 1][
                -1]  # Now we only accept one id. We have a batch so it's a bit more complex.

            # maybe the information should come from the outside. To be defined later.
            # This is just the affine transform of [x, h_prev].

            concat = _linear([x, h_prev], 4 * self._num_units,
                             True)  # no activation has been applied yet
            # Note: stopping at the linear combination here is deliberate, because
            # the peephole contributions are added to these pre-activations below.

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate

            i, j, f, o = array_ops.split(value=concat,
                                         num_or_size_splits=4,
                                         axis=1)

            dtype = inputs.dtype

            # Note: a mask applies to one time step, whereas tau, r_on, s and the
            # resulting k_t apply to individual hidden/cell-state units.
            tau = vs.get_variable(
                'tau',
                shape=[self._num_units],  # each unit gets its own tau (period),
                # r_on (open ratio) and s (phase)
                initializer=random_exp_initializer(0, self.tau_init),
                dtype=dtype)

            r_on = vs.get_variable('r_on',
                                   shape=[self._num_units],
                                   initializer=init_ops.constant_initializer(
                                       self.r_on_init),
                                   dtype=dtype)

            s = vs.get_variable(
                's',
                shape=[self._num_units],
                initializer=init_ops.random_uniform_initializer(
                    0., tau.initialized_value()),
                dtype=dtype)

            # tf.tile repeats the timestamp across every unit.

            times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])

            phase = phi(times, s, tau)  #  element-wise calculation

            kappa = time_gate_fast(phase, r_on, self._leak_rate,
                                   self._training_phase)

            w_o_peephole = None

            # If peephole connections are used, add the cell-state terms to the
            # gate pre-activations computed above.
            if self._use_peepholes:

                w_i_peephole = vs.get_variable('W_I_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)

                w_f_peephole = vs.get_variable('W_F_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)

                w_o_peephole = vs.get_variable('W_O_peephole',
                                               shape=[self._num_units],
                                               dtype=dtype)

                f += w_f_peephole * c_prev
                i += w_i_peephole * c_prev

            new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation(
                j)

            if self._use_peepholes:
                o += w_o_peephole * new_c_tilde

            new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)
            """
            Hi all,
            Yes, Philippe, you are correct in that Equation 4 should reference c_tilde and not c.
            I can add a point to the paper to mention that, and will update Figure 1 so the line is
            correctly drawn to c_tilde instead. The intuition here is that the gates should be blind
            to the effect of the khronos gate; input, forget and output gate should all operate as if
            the cell were a normal LSTM cell, while the khronos gate allows it to either operate or
            not operate (and then linearly interpolates between these two states). If the output gate
            is influenced by the khronos gate (if the peepholes reference c instead of c_tilde), then
            the PLSTM would no longer be a gated LSTM cell, but somehow be self-dependent on the time gate's actual operation.
            I think everyone's right in that it wouldn't influence much -- but it should be updated in
            the paper. Thanks very much for pointing out the issue, Philippe!
            -Danny"""

            # Apply Khronos gate

            new_h = kappa * new_h_tilde + (1 - kappa) * h_prev

            new_c = kappa * new_c_tilde + (1 - kappa) * c_prev

            new_state = (new_c, new_h)

            # Update the state according to the sampling rhythm imposed by the time gate.
            return new_h, new_state
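
Both phased-LSTM snippets rely on helpers that are not shown here (`random_exp_initializer`, `phi`, `time_gate_fast`). The sketch below reconstructs plausible versions from the inline gate computation in the earlier example; the names match the calls above, but the bodies are assumptions, not the original implementations.

# Hypothetical reconstructions of the missing helpers, inferred from the inline
# time-gate math in the earlier phased-LSTM example.
import tensorflow as tf


def random_exp_initializer(minval, maxval, seed=None, dtype=tf.float32):
    """Initializer drawing exp(U(minval, maxval)); used for the period tau."""
    def _initializer(shape, dtype=dtype, partition_info=None):
        return tf.exp(tf.random_uniform(shape, minval, maxval, dtype=dtype, seed=seed))
    return _initializer


def phi(times, s, tau):
    """Phase in [0, 1): position of `times` inside the current period tau."""
    return tf.div(tf.mod(tf.mod(times - s, tau) + tau, tau), tau)


def time_gate_fast(phase, r_on, leak_rate, training_phase):
    """Piecewise-linear openness of the time gate.

    `training_phase` is accepted for signature compatibility; this sketch applies
    the leak unconditionally, whereas the original presumably leaks only during
    training.
    """
    is_up = tf.less(phase, 0.5 * r_on)
    is_down = tf.logical_and(tf.less(phase, r_on), tf.logical_not(is_up))
    return tf.where(is_up, 2. * phase / r_on,
                    tf.where(is_down, 2. - 2. * phase / r_on, leak_rate * phase))
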
Exemplo n.º 47
0
    def testTimeReversedFusedRNN(self):
        with self.test_session() as sess:
            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890213)
            fw_cell = core_rnn_cell_impl.BasicRNNCell(10)
            bw_cell = core_rnn_cell_impl.BasicRNNCell(10)
            batch_size = 5
            input_size = 20
            timelen = 15
            inputs = constant_op.constant(
                np.random.randn(timelen, batch_size, input_size))

            # test bi-directional rnn
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                unpacked_inputs = array_ops.unstack(inputs)
                outputs, fw_state, bw_state = core_rnn.static_bidirectional_rnn(
                    fw_cell, bw_cell, unpacked_inputs, dtype=dtypes.float64)
                packed_outputs = array_ops.stack(outputs)
                basic_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("basic/")
                ]
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_fw_state, basic_bw_state = sess.run(
                    [packed_outputs, fw_state, bw_state])
                basic_grads = sess.run(
                    gradients_impl.gradients(packed_outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(packed_outputs, basic_vars))

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                fused_cell = fused_rnn_cell.FusedRNNCellAdaptor(
                    core_rnn_cell_impl.BasicRNNCell(10))
                fused_bw_cell = fused_rnn_cell.TimeReversedFusedRNN(
                    fused_rnn_cell.FusedRNNCellAdaptor(
                        core_rnn_cell_impl.BasicRNNCell(10)))
                fw_outputs, fw_state = fused_cell(inputs,
                                                  dtype=dtypes.float64,
                                                  scope="fw")
                bw_outputs, bw_state = fused_bw_cell(inputs,
                                                     dtype=dtypes.float64,
                                                     scope="bw")
                outputs = array_ops.concat([fw_outputs, bw_outputs], 2)
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_fw_state, fused_bw_state = sess.run(
                    [outputs, fw_state, bw_state])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_fw_state, fused_fw_state)
            self.assertAllClose(basic_bw_state, fused_bw_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
Exemplo n.º 48
0
def adaptive_softmax_loss(inputs,
                          labels,
                          cutoff,
                          project_factor=4,
                          initializer=None,
                          name=None):
  """Computes and returns the adaptive softmax loss (a improvement of 
  hierarchical softmax).
    
  See [Efficient softmax approximation for GPUs](https://arxiv.org/pdf/1609.04309v2.pdf).
        
  This is a faster way to train a softmax classifier over a huge number of 
  classes, and can be used for **both training and prediction**. For example, it 
  can be used for training a Language Model with a very huge vocabulary, and 
  the trained languaed model can be used in speech recognition, text generation, 
  and machine translation very efficiently.
  
  Args:
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
      activations of the input network.
    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
      `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
    cutoff: A list indicating the limits of the different clusters.
    project_factor: A floating point value greater than or equal to 1.0. The
      projection factor between two neighboring clusters.
    initializer: Initializer for adaptive softmax variables (optional).
    name: A name for the operation (optional).
  Returns:
    loss: A `batch_size` 1-D tensor of the adaptive softmax cross entropy loss.
    training_losses: A list of 1-D tensors of adaptive softmax loss for each 
      cluster, which can be used for calculating the gradients and back 
      propagation when training.
  """
  input_dim = int(inputs.get_shape()[1])
  sample_num = int(inputs.get_shape()[0])
  cluster_num = len(cutoff) - 1
  with ops.name_scope(name, "AdaptiveSoftmax"):
    if initializer is None:
      stdv = math.sqrt(1. / input_dim)
      initializer = init_ops.random_uniform_initializer(-stdv * 0.8, stdv * 0.8)

    head_dim = cutoff[0] + cluster_num
    head_w = variable_scope.get_variable("adaptive_softmax_head_w", 
                             [input_dim, head_dim], initializer=initializer)

    tail_project_factor = project_factor
    tail_w = []
    for i in range(cluster_num):
      project_dim = max(1, input_dim // tail_project_factor)
      tail_dim = cutoff[i + 1] - cutoff[i]
      tail_w.append([
        variable_scope.get_variable("adaptive_softmax_tail{}_proj_w".format(i+1), 
                        [input_dim, project_dim], initializer=initializer),
        variable_scope.get_variable("adaptive_softmax_tail{}_w".format(i+1), 
                        [project_dim, tail_dim], initializer=initializer)
      ])
      tail_project_factor *= project_factor

    # Get tail masks and update head labels
    training_losses = []
    loss = array_ops.zeros([sample_num], dtype=dtypes.float32)
    head_labels = labels
    for i in range(cluster_num):
      mask = math_ops.logical_and(math_ops.greater_equal(labels, cutoff[i]), 
                                  math_ops.less(labels, cutoff[i + 1]))
      
      # Update head labels
      head_labels = tf.where(mask, array_ops.constant([cutoff[0] + i] *
                            sample_num), head_labels)

      # Compute tail loss
      tail_inputs = array_ops.boolean_mask(inputs, mask)
      tail_logits = math_ops.matmul(math_ops.matmul(tail_inputs, tail_w[i][0]), 
                                    tail_w[i][1])
      tail_labels = array_ops.boolean_mask(labels - cutoff[i], mask)
      tail_loss = nn.sparse_softmax_cross_entropy_with_logits(labels=tail_labels, logits=tail_logits)
      training_losses.append(tail_loss)
      aligned_tail_loss = sparse_tensor.SparseTensor(
        array_ops.squeeze(array_ops.where(mask)), tail_loss, [sample_num])
      loss += sparse_ops.sparse_tensor_to_dense(aligned_tail_loss)

    # Compute head loss
    head_logits = math_ops.matmul(inputs, head_w)
    head_loss = nn.sparse_softmax_cross_entropy_with_logits(logits=head_logits, labels=head_labels)
    loss += head_loss
    training_losses.append(head_loss)

    return loss, training_losses
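
A brief usage sketch for the function above, assuming `adaptive_softmax_loss` is importable into a plain TF1 graph; the cutoff values, dimensions, and optimizer below are illustrative only.

# Sketch only: the cutoff, dimensions and learning rate are illustrative.
import tensorflow as tf

batch_size, hidden_dim, vocab_size = 128, 512, 100000
hidden = tf.placeholder(tf.float32, [batch_size, hidden_dim])
targets = tf.placeholder(tf.int32, [batch_size])

# `cutoff` partitions the vocabulary: the head covers [0, 2000), the tails the rest.
cutoff = [2000, 10000, vocab_size]
loss, training_losses = adaptive_softmax_loss(hidden, targets, cutoff)

# Optimize the per-cluster training losses; `loss` is the per-example loss that
# can be used for evaluation (e.g. perplexity).
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    tf.add_n([tf.reduce_sum(l) for l in training_losses]) / batch_size)
eval_loss = tf.reduce_mean(loss)
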
Exemplo n.º 49
0
    def testLSTMBasicToBlockCellPeeping(self):
        with self.test_session(use_gpu=True) as sess:
            x = array_ops.zeros([1, 2])
            x_values = np.random.randn(1, 2)

            m0_val = 0.1 * np.ones([1, 2])
            m1_val = -0.1 * np.ones([1, 2])
            m2_val = -0.2 * np.ones([1, 2])
            m3_val = 0.2 * np.ones([1, 2])

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890212)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                m0 = array_ops.zeros([1, 2])
                m1 = array_ops.zeros([1, 2])
                m2 = array_ops.zeros([1, 2])
                m3 = array_ops.zeros([1, 2])
                g, ((out_m0, out_m1),
                    (out_m2, out_m3)) = rnn_cell.MultiRNNCell(
                        [
                            rnn_cell.LSTMCell(
                                2, use_peepholes=True, state_is_tuple=True)
                            for _ in range(2)
                        ],
                        state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
                sess.run([variables.global_variables_initializer()])
                basic_res = sess.run(
                    [g, out_m0, out_m1, out_m2, out_m3], {
                        x.name: x_values,
                        m0.name: m0_val,
                        m1.name: m1_val,
                        m2.name: m2_val,
                        m3.name: m3_val
                    })

            with variable_scope.variable_scope("block",
                                               initializer=initializer):
                m0 = array_ops.zeros([1, 2])
                m1 = array_ops.zeros([1, 2])
                m2 = array_ops.zeros([1, 2])
                m3 = array_ops.zeros([1, 2])
                g, ((out_m0, out_m1),
                    (out_m2, out_m3)) = rnn_cell.MultiRNNCell(
                        [
                            lstm_ops.LSTMBlockCell(2, use_peephole=True)
                            for _ in range(2)
                        ],
                        state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
                sess.run([variables.global_variables_initializer()])
                block_res = sess.run(
                    [g, out_m0, out_m1, out_m2, out_m3], {
                        x.name: x_values,
                        m0.name: m0_val,
                        m1.name: m1_val,
                        m2.name: m2_val,
                        m3.name: m3_val
                    })

            self.assertEqual(len(basic_res), len(block_res))
            for basic, block in zip(basic_res, block_res):
                self.assertAllClose(basic, block)
Exemplo n.º 50
0
def _eunn_param(hidden_size, capacity=2, fft=False, comp=True):
    """
    Create parameters and do the initial preparations
    """
    theta_phi_initializer = init_ops.random_uniform_initializer(-np.pi, np.pi)
    if fft:
        capacity = int(np.ceil(np.log2(hidden_size)))

        diag_list_0 = []
        off_list_0 = []
        varsize = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2**size)) * (2**(size - 1))
            extra_size = max(0, (hidden_size % (2**size)) - (2**(size - 1)))
            varsize += normal_size + extra_size

        params_theta = vs.get_variable("theta_0", [varsize],
                                       initializer=theta_phi_initializer)
        cos_theta = math_ops.cos(params_theta)
        sin_theta = math_ops.sin(params_theta)

        if comp:
            params_phi = vs.get_variable("phi_0", [varsize],
                                         initializer=theta_phi_initializer)
            cos_phi = math_ops.cos(params_phi)
            sin_phi = math_ops.sin(params_phi)

            cos_list_0 = math_ops.complex(cos_theta,
                                          array_ops.zeros_like(cos_theta))
            cos_list_1 = math_ops.complex(
                math_ops.multiply(cos_theta, cos_phi),
                math_ops.multiply(cos_theta, sin_phi))
            sin_list_0 = math_ops.complex(sin_theta,
                                          array_ops.zeros_like(sin_theta))
            sin_list_1 = math_ops.complex(
                -math_ops.multiply(sin_theta, cos_phi),
                -math_ops.multiply(sin_theta, sin_phi))

        last = 0
        for i in range(capacity):
            size = capacity - i
            normal_size = (hidden_size // (2**size)) * (2**(size - 1))
            extra_size = max(0, (hidden_size % (2**size)) - (2**(size - 1)))

            if comp:
                cos_list_normal = array_ops.concat([
                    array_ops.slice(cos_list_0, [last], [normal_size]),
                    array_ops.slice(cos_list_1, [last], [normal_size])
                ], 0)
                sin_list_normal = array_ops.concat([
                    array_ops.slice(sin_list_0, [last], [normal_size]),
                    -array_ops.slice(sin_list_1, [last], [normal_size])
                ], 0)
                last += normal_size

                cos_list_extra = array_ops.concat([
                    array_ops.slice(cos_list_0, [last], [extra_size]),
                    math_ops.complex(
                        tf.ones(
                            [hidden_size - 2 * normal_size - 2 * extra_size]),
                        tf.zeros(
                            [hidden_size - 2 * normal_size - 2 * extra_size])),
                    array_ops.slice(cos_list_1, [last], [extra_size])
                ], 0)
                sin_list_extra = array_ops.concat([
                    array_ops.slice(sin_list_0, [last], [extra_size]),
                    math_ops.complex(
                        tf.zeros(
                            [hidden_size - 2 * normal_size - 2 * extra_size]),
                        tf.zeros([
                            hidden_size - 2 * normal_size - 2 * extra_size
                        ])), -array_ops.slice(sin_list_1, [last], [extra_size])
                ], 0)
                last += extra_size

            else:
                cos_list_normal = array_ops.slice(cos_theta, [last],
                                                  [normal_size])
                cos_list_normal = array_ops.concat(
                    [cos_list_normal, cos_list_normal], 0)
                cos_list_extra = array_ops.slice(cos_theta,
                                                 [last + normal_size],
                                                 [extra_size])
                cos_list_extra = array_ops.concat([
                    cos_list_extra,
                    tf.ones([hidden_size - 2 * normal_size - 2 * extra_size]),
                    cos_list_extra
                ], 0)

                sin_list_normal = array_ops.slice(sin_theta, [last],
                                                  [normal_size])
                sin_list_normal = array_ops.concat(
                    [sin_list_normal, -sin_list_normal], 0)
                sin_list_extra = array_ops.slice(sin_theta,
                                                 [last + normal_size],
                                                 [extra_size])
                sin_list_extra = array_ops.concat([
                    sin_list_extra,
                    tf.zeros([hidden_size - 2 * normal_size - 2 * extra_size]),
                    -sin_list_extra
                ], 0)

                last += normal_size + extra_size

            if normal_size != 0:
                cos_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(cos_list_normal,
                                          [-1, 2 * normal_size // (2**size)])),
                    [-1])
                sin_list_normal = array_ops.reshape(
                    array_ops.transpose(
                        array_ops.reshape(sin_list_normal,
                                          [-1, 2 * normal_size // (2**size)])),
                    [-1])

            cos_list = array_ops.concat([cos_list_normal, cos_list_extra], 0)
            sin_list = array_ops.concat([sin_list_normal, sin_list_extra], 0)
            diag_list_0.append(cos_list)
            off_list_0.append(sin_list)

        diag_vec = array_ops.stack(diag_list_0, 0)
        off_vec = array_ops.stack(off_list_0, 0)

    else:
        capacity_b = capacity // 2
        capacity_a = capacity - capacity_b

        hidden_size_a = hidden_size // 2
        hidden_size_b = (hidden_size - 1) // 2

        params_theta_0 = vs.get_variable("theta_0",
                                         [capacity_a, hidden_size_a],
                                         initializer=theta_phi_initializer)
        cos_theta_0 = array_ops.reshape(math_ops.cos(params_theta_0),
                                        [capacity_a, -1, 1])
        sin_theta_0 = array_ops.reshape(math_ops.sin(params_theta_0),
                                        [capacity_a, -1, 1])

        params_theta_1 = vs.get_variable("theta_1",
                                         [capacity_b, hidden_size_b],
                                         initializer=theta_phi_initializer)
        cos_theta_1 = array_ops.reshape(math_ops.cos(params_theta_1),
                                        [capacity_b, -1, 1])
        sin_theta_1 = array_ops.reshape(math_ops.sin(params_theta_1),
                                        [capacity_b, -1, 1])

        if comp:
            params_phi_0 = vs.get_variable("phi_0",
                                           [capacity_a, hidden_size_a],
                                           initializer=theta_phi_initializer)
            cos_phi_0 = array_ops.reshape(math_ops.cos(params_phi_0),
                                          [capacity_a, -1, 1])
            sin_phi_0 = array_ops.reshape(math_ops.sin(params_phi_0),
                                          [capacity_a, -1, 1])

            cos_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_0,
                     math_ops.multiply(cos_theta_0, cos_phi_0)], 2),
                [capacity_a, -1])
            cos_list_0_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(cos_theta_0),
                    math_ops.multiply(cos_theta_0, sin_phi_0)
                ], 2), [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                cos_list_0_re = array_ops.concat(
                    [cos_list_0_re, tf.ones([capacity_a, 1])], 1)
                cos_list_0_im = array_ops.concat(
                    [cos_list_0_im, tf.zeros([capacity_a, 1])], 1)
            cos_list_0 = math_ops.complex(cos_list_0_re, cos_list_0_im)

            sin_list_0_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)],
                    2), [capacity_a, -1])
            sin_list_0_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(sin_theta_0),
                    -math_ops.multiply(sin_theta_0, sin_phi_0)
                ], 2), [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                sin_list_0_re = array_ops.concat(
                    [sin_list_0_re, tf.zeros([capacity_a, 1])], 1)
                sin_list_0_im = array_ops.concat(
                    [sin_list_0_im, tf.zeros([capacity_a, 1])], 1)
            sin_list_0 = math_ops.complex(sin_list_0_re, sin_list_0_im)

            params_phi_1 = vs.get_variable("phi_1",
                                           [capacity_b, hidden_size_b],
                                           initializer=theta_phi_initializer)
            cos_phi_1 = array_ops.reshape(math_ops.cos(params_phi_1),
                                          [capacity_b, -1, 1])
            sin_phi_1 = array_ops.reshape(math_ops.sin(params_phi_1),
                                          [capacity_b, -1, 1])

            cos_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [cos_theta_1,
                     math_ops.multiply(cos_theta_1, cos_phi_1)], 2),
                [capacity_b, -1])
            cos_list_1_re = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1_re], 1)
            cos_list_1_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(cos_theta_1),
                    math_ops.multiply(cos_theta_1, sin_phi_1)
                ], 2), [capacity_b, -1])
            cos_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), cos_list_1_im], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                cos_list_1_re = array_ops.concat(
                    [cos_list_1_re, tf.ones([capacity_b, 1])], 1)
                cos_list_1_im = array_ops.concat(
                    [cos_list_1_im, tf.zeros([capacity_b, 1])], 1)
            cos_list_1 = math_ops.complex(cos_list_1_re, cos_list_1_im)

            sin_list_1_re = array_ops.reshape(
                array_ops.concat(
                    [sin_theta_1, -math_ops.multiply(sin_theta_1, cos_phi_1)],
                    2), [capacity_b, -1])
            sin_list_1_re = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_re], 1)
            sin_list_1_im = array_ops.reshape(
                array_ops.concat([
                    array_ops.zeros_like(sin_theta_1),
                    -math_ops.multiply(sin_theta_1, sin_phi_1)
                ], 2), [capacity_b, -1])
            sin_list_1_im = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1_im], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                sin_list_1_re = array_ops.concat(
                    [sin_list_1_re, tf.zeros([capacity_b, 1])], 1)
                sin_list_1_im = array_ops.concat(
                    [sin_list_1_im, tf.zeros([capacity_b, 1])], 1)
            sin_list_1 = math_ops.complex(sin_list_1_re, sin_list_1_im)
        else:
            cos_list_0 = array_ops.reshape(
                array_ops.concat([cos_theta_0, cos_theta_0], 2),
                [capacity_a, -1])
            sin_list_0 = array_ops.reshape(
                array_ops.concat([sin_theta_0, -sin_theta_0], 2),
                [capacity_a, -1])
            if hidden_size_a * 2 != hidden_size:
                cos_list_0 = array_ops.concat(
                    [cos_list_0, tf.ones([capacity_a, 1])], 1)
                sin_list_0 = array_ops.concat(
                    [sin_list_0, tf.zeros([capacity_a, 1])], 1)

            cos_list_1 = array_ops.reshape(
                array_ops.concat([cos_theta_1, cos_theta_1], 2),
                [capacity_b, -1])
            cos_list_1 = array_ops.concat(
                [tf.ones((capacity_b, 1)), cos_list_1], 1)
            sin_list_1 = array_ops.reshape(
                array_ops.concat([sin_theta_1, -sin_theta_1], 2),
                [capacity_b, -1])
            sin_list_1 = array_ops.concat(
                [tf.zeros((capacity_b, 1)), sin_list_1], 1)
            if hidden_size_b * 2 != hidden_size - 1:
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.zeros([capacity_b, 1])], 1)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([capacity_b, 1])], 1)

        if capacity_b != capacity_a:
            if comp:
                cos_list_1 = array_ops.concat([
                    cos_list_1,
                    math_ops.complex(tf.zeros([1, hidden_size]),
                                     tf.zeros([1, hidden_size]))
                ], 0)
                sin_list_1 = array_ops.concat([
                    sin_list_1,
                    math_ops.complex(tf.zeros([1, hidden_size]),
                                     tf.zeros([1, hidden_size]))
                ], 0)
            else:
                cos_list_1 = array_ops.concat(
                    [cos_list_1, tf.zeros([1, hidden_size])], 0)
                sin_list_1 = array_ops.concat(
                    [sin_list_1, tf.zeros([1, hidden_size])], 0)

        diag_vec = tf.reshape(tf.concat([cos_list_0, cos_list_1], 1),
                              [capacity_a * 2, hidden_size])
        off_vec = tf.reshape(tf.concat([sin_list_0, sin_list_1], 1),
                             [capacity_a * 2, hidden_size])

        if capacity_b != capacity_a:
            diag_vec = tf.slice(diag_vec, [0, 0], [capacity, hidden_size])
            off_vec = tf.slice(off_vec, [0, 0], [capacity, hidden_size])

    def _toTensorArray(elems):

        elems = ops.convert_to_tensor(elems)
        n = array_ops.shape(elems)[0]
        elems_ta = tensor_array_ops.TensorArray(dtype=elems.dtype,
                                                size=n,
                                                dynamic_size=False,
                                                infer_shape=True,
                                                clear_after_read=False)
        elems_ta = elems_ta.unstack(elems)
        return elems_ta

    diag_vec = _toTensorArray(diag_vec)
    off_vec = _toTensorArray(off_vec)
    if comp:
        omega = vs.get_variable("omega", [hidden_size],
                                initializer=theta_phi_initializer)
        diag = math_ops.complex(math_ops.cos(omega), math_ops.sin(omega))
    else:
        diag = None

    return diag_vec, off_vec, diag, capacity
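
A quick sketch of how the returned values might be consumed, assuming the TF1 module aliases used above are importable; the EUNN recurrence that walks the two TensorArrays is outside this snippet.

# Sketch only: construct EUNN parameters for a small hidden size.
import tensorflow as tf

with tf.variable_scope("eunn_demo"):
    diag_vec, off_vec, omega_diag, capacity = _eunn_param(
        hidden_size=8, capacity=4, fft=False, comp=True)

# diag_vec and off_vec are TensorArrays holding `capacity` vectors of length
# hidden_size (the cos/sin rotation coefficients); omega_diag is the trailing
# complex diagonal, or None when comp=False.
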
Exemplo n.º 51
0
    def testLSTMFusedSequenceLengths(self):
        """Verify proper support for sequence lengths in LSTMBlockFusedCell."""
        with self.test_session(use_gpu=True) as sess:
            batch_size = 3
            input_size = 4
            cell_size = 5
            max_sequence_length = 6

            inputs = []
            for _ in range(max_sequence_length):
                inp = ops.convert_to_tensor(np.random.randn(
                    batch_size, input_size),
                                            dtype=dtypes.float32)
                inputs.append(inp)
            seq_lengths = constant_op.constant([3, 4, 5])

            initializer = init_ops.random_uniform_initializer(-0.01,
                                                              0.01,
                                                              seed=19890213)
            with variable_scope.variable_scope("basic",
                                               initializer=initializer):
                cell = rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
                outputs, state = rnn.static_rnn(cell,
                                                inputs,
                                                dtype=dtypes.float32,
                                                sequence_length=seq_lengths)
                sess.run([variables.global_variables_initializer()])
                basic_outputs, basic_state = sess.run([outputs, state[0]])
                basic_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                basic_wgrads = sess.run(
                    gradients_impl.gradients(outputs,
                                             variables.trainable_variables()))

            with variable_scope.variable_scope("fused",
                                               initializer=initializer):
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=False)
                outputs, state = cell(inputs,
                                      dtype=dtypes.float32,
                                      sequence_length=seq_lengths)

                sess.run([variables.global_variables_initializer()])
                fused_outputs, fused_state = sess.run([outputs, state[0]])
                fused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                fused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("fused/")
                ]
                fused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, fused_vars))

            self.assertAllClose(basic_outputs, fused_outputs)
            self.assertAllClose(basic_state, fused_state)
            self.assertAllClose(basic_grads, fused_grads)
            for basic, fused in zip(basic_wgrads, fused_wgrads):
                self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)

            # Verify that state propagation works if we turn our sequence into
            # tiny (single-time) subsequences, i.e. unfuse the cell
            with variable_scope.variable_scope("unfused",
                                               initializer=initializer) as vs:
                cell = lstm_ops.LSTMBlockFusedCell(cell_size,
                                                   cell_clip=0,
                                                   use_peephole=False)
                outputs = []
                state = None
                for i, inp in enumerate(inputs):
                    lengths = [int(i < l) for l in seq_lengths.eval()]
                    output, state = cell([inp],
                                         initial_state=state,
                                         dtype=dtypes.float32,
                                         sequence_length=lengths)
                    vs.reuse_variables()
                    outputs.append(output[0])
                outputs = array_ops.stack(outputs)

                sess.run([variables.global_variables_initializer()])
                unfused_outputs, unfused_state = sess.run([outputs, state[0]])
                unfused_grads = sess.run(
                    gradients_impl.gradients(outputs, inputs))
                unfused_vars = [
                    v for v in variables.trainable_variables()
                    if v.name.startswith("unfused/")
                ]
                unfused_wgrads = sess.run(
                    gradients_impl.gradients(outputs, unfused_vars))

            self.assertAllClose(basic_outputs, unfused_outputs)
            self.assertAllClose(basic_state, unfused_state)
            self.assertAllClose(basic_grads, unfused_grads)
            for basic, unfused in zip(basic_wgrads, unfused_wgrads):
                self.assertAllClose(basic, unfused, rtol=1e-2, atol=1e-2)
Exemplo n.º 52
0
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=1,
            dtype=dtypes.float32):
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)

  with variable_scope.variable_scope("test", initializer=initializer):
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_initial_c_op = array_ops.expand_dims(
      initial_c_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial 1st dimension.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
      h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))

  if is_training:
    (inp_grad_op, hgrad_op,
     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op,
     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
    # Remove the trivial 1st dimension
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
    # the two biases are summed into one. Thus here bias gradient should be
    # halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op,
        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict={inputs: inputs_np} if dynamic_shape_input else None)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)
  else:
    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
                                          feed_dict=({
                                              inputs: inputs_np
                                          } if dynamic_shape_input else None))

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
  return outputs, cu_outputs, state_tuple, cu_state_tuple
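
A minimal sketch of how a test might drive `RunLSTM`, assuming the module above is importable and cuDNN is available; the tolerances are illustrative.

# Sketch only: compare the canonical TF LSTM against the cudnn kernel.
import numpy as np
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
  (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad,
   state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM(
       sess, num_units=4, input_size=3, batch_size=2, time=5)

  np.testing.assert_allclose(outputs, cu_outputs, rtol=1e-4, atol=1e-4)
  np.testing.assert_allclose(inp_grad, cu_inp_grad, rtol=1e-4, atol=1e-4)
  np.testing.assert_allclose(bgrad, cu_bgrad, rtol=1e-4, atol=1e-4)
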
Exemplo n.º 53
0
def blocks_match(sess, use_peephole):
  batch_size = 2
  input_size = 3
  cell_size = 4
  sequence_length = 4

  inputs = []
  for _ in range(sequence_length):
    inp = ops.convert_to_tensor(
        np.random.randn(batch_size, input_size), dtype=dtypes.float32)
    inputs.append(inp)
  stacked_inputs = array_ops.stack(inputs)

  initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)

  with variable_scope.variable_scope("test", initializer=initializer):
    # magic naming so that the cells pick up these variables and reuse them
    if use_peephole:
      wci = variable_scope.get_variable(
          "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtypes.float32)
      wcf = variable_scope.get_variable(
          "rnn/lstm_cell/w_f_diag", shape=[cell_size], dtype=dtypes.float32)
      wco = variable_scope.get_variable(
          "rnn/lstm_cell/w_o_diag", shape=[cell_size], dtype=dtypes.float32)

    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + cell_size, cell_size * 4],
        dtype=dtypes.float32)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias",
        shape=[cell_size * 4],
        dtype=dtypes.float32,
        initializer=init_ops.zeros_initializer())

    basic_cell = rnn_cell.LSTMCell(
        cell_size, use_peepholes=use_peephole, state_is_tuple=True, reuse=True)
    basic_outputs_op, basic_state_op = rnn.static_rnn(
        basic_cell, inputs, dtype=dtypes.float32)

    if use_peephole:
      _, _, _, _, _, _, block_outputs_op = block_lstm(
          ops.convert_to_tensor(sequence_length, dtype=dtypes.int64),
          inputs,
          w,
          b,
          wci=wci,
          wcf=wcf,
          wco=wco,
          cell_clip=0,
          use_peephole=True)
    else:
      _, _, _, _, _, _, block_outputs_op = block_lstm(
          ops.convert_to_tensor(sequence_length, dtype=dtypes.int64),
          inputs,
          w,
          b,
          cell_clip=0)

    fused_cell = lstm_ops.LSTMBlockFusedCell(
        cell_size, cell_clip=0, use_peephole=use_peephole, reuse=True,
        name="rnn/lstm_cell")
    fused_outputs_op, fused_state_op = fused_cell(
        stacked_inputs, dtype=dtypes.float32)

    sess.run([variables.global_variables_initializer()])
    basic_outputs, basic_state = sess.run([basic_outputs_op, basic_state_op[0]])
    basic_grads = sess.run(gradients_impl.gradients(basic_outputs_op, inputs))
    xs = [w, b]
    if use_peephole:
      xs += [wci, wcf, wco]
    basic_wgrads = sess.run(gradients_impl.gradients(basic_outputs_op, xs))

    block_outputs = sess.run(block_outputs_op)
    block_grads = sess.run(gradients_impl.gradients(block_outputs_op, inputs))
    block_wgrads = sess.run(gradients_impl.gradients(block_outputs_op, xs))

    xs = [w, b]
    if use_peephole:
      xs += [wci, wcf, wco]
    fused_outputs, fused_state = sess.run([fused_outputs_op, fused_state_op[0]])
    fused_grads = sess.run(gradients_impl.gradients(fused_outputs_op, inputs))
    fused_wgrads = sess.run(gradients_impl.gradients(fused_outputs_op, xs))

    return (basic_state, fused_state, basic_outputs, block_outputs,
            fused_outputs, basic_grads, block_grads, fused_grads, basic_wgrads,
            block_wgrads, fused_wgrads)
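
A similar sketch for `blocks_match`, assuming a TF1 session; it simply checks that the basic, block, and fused LSTM paths agree.

# Sketch only: verify the three implementations produce matching results.
import numpy as np
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
  (basic_state, fused_state, basic_outputs, block_outputs, fused_outputs,
   basic_grads, block_grads, fused_grads, basic_wgrads, block_wgrads,
   fused_wgrads) = blocks_match(sess, use_peephole=False)

  np.testing.assert_allclose(basic_outputs, block_outputs, rtol=1e-4, atol=1e-4)
  np.testing.assert_allclose(basic_outputs, fused_outputs, rtol=1e-4, atol=1e-4)
  np.testing.assert_allclose(basic_state, fused_state, rtol=1e-4, atol=1e-4)
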
Exemplo n.º 54
0
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=1,
           dtype=dtypes.float32):
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=time_major,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)


  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op] + ws + bs)

    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # Cudnn GRU keeps two biases for the reset and update gates; when
    # converting to the tf canonical format the two biases are summed into one,
    # so the corresponding bias gradient must be halved here before comparing
    # it with the tf GRU.
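    # Sketch of the arithmetic (assuming the conversion simply sums the two
    # cudnn gate biases b1 and b2 into the single canonical bias b = b1 + b2):
    # both cudnn biases receive the same gradient dL/db, so the converted
    # canonical gradient comes back as 2 * dL/db, and scaling it by 0.5
    # recovers the gradient of the single tf bias.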
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
        [
            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
        ],
        feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
    # Remove the trivial 1st dimension
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
  else:
    outputs, h = sess.run([outputs_op, h_op])
    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op],
                                feed_dict=({
                                    inputs: inputs_np
                                } if dynamic_shape_input else None))
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
  return outputs, cu_outputs, h, cu_h
Exemplo n.º 55
0
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_op,
                 memory_hops,
                 dropout_rate,
                 q_depth,
                 a_depth,
                 episodic_m_depth,
                 ep_depth,
                 attention_ff_l1_size,
                 max_gradient_norm,
                 maximum_story_length=5,
                 maximum_question_length=20,
                 use_lstm=False,
                 forward_only=False):

        # initialization
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = tf.Variable(
            float(learning_rate_decay_op), trainable=False)
        self.dropout_rate = dropout_rate
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.q_depth = q_depth  # question RNN depth
        self.a_depth = a_depth  # answer RNN depth
        self.m_depth = episodic_m_depth  # memory cell depth
        self.ep_depth = ep_depth  # episodic depth
        self.max_gradient_norm = max_gradient_norm
        self.memory_hops = memory_hops  # number of episodic memory pass
        self.m_input_size = embedding_size * 3
        self.m_size = embedding_size  # memory cell size
        self.attention_ff_l1_size = attention_ff_l1_size
        self.maximum_story_length = maximum_story_length

        print("[*] Creating Dynamic Memory Network ...")
        # Initializing word2vec
        sqrt3 = math.sqrt(3)
        initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        W = tf.get_variable("embedding",
                            [self.vocab_size, self.embedding_size],
                            initializer=initializer)

        # W = tf.Variable(tf.constant(0.0, shape=[vocab_size, embedding_size]),
        #                 trainable=False, name="W")
        # self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
        # self.embedding_init = W.assign(self.embedding_placeholder)

        # Sentence token placeholder
        self.story = []
        story_embedded = []
        for i in range(maximum_story_length):
            self.story.append(
                tf.placeholder(tf.int32, shape=[None, None], name="Story"))
            story_embedded.append(tf.nn.embedding_lookup(W, self.story[i]))
            story_embedded[i] = tf.transpose(story_embedded[i], [1, 0, 2])

        self.story_len = tf.placeholder(tf.int32,
                                        shape=[1],
                                        name="Story_length")

        self.question = tf.placeholder(tf.int32,
                                       shape=[None, None],
                                       name="Question")
        question_embedded = tf.transpose(
            tf.nn.embedding_lookup(W, self.question), [1, 0, 2])
        self.answer = tf.placeholder(tf.int64, name="answer")

        # configuration of attention gate
        answer_weights = tf.Variable(tf.truncated_normal(
            [self.m_size, self.vocab_size], -0.1, 0.1),
                                     name="answer_weights")
        answer_biases = tf.Variable(tf.zeros([self.vocab_size]),
                                    name="answer_biases")

        #------------ question module ------------
        with tf.variable_scope("embedding_rnn"):
            embedding_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
            #embedding_cell = tf.nn.rnn_cell.DropoutWrapper(
            #	embedding_cell, output_keep_prob=dropout_rate)
            _, self.question_state = tf.nn.dynamic_rnn(embedding_cell,
                                                       question_embedded,
                                                       dtype=tf.float32,
                                                       time_major=True)

        #------------ Input module ------------
        # Story_embedding_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
        # Story_embedding_cell = tf.nn.rnn_cell.DropoutWrapper(
        # 		Story_embedding_cell, output_keep_prob=dropout_rate)
        self.story_state_array = []
        # with tf.name_scope("story_embedding_rnn"):
        for i in range(maximum_story_length):
            with tf.variable_scope("embedding_rnn", reuse=True):
                _, story_states = tf.nn.dynamic_rnn(embedding_cell,
                                                    story_embedded[i],
                                                    dtype=tf.float32,
                                                    time_major=True)
                self.story_state_array.append(story_states)
        fusion_fw_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
        #fusion_fw_cell = tf.nn.rnn_cell.DropoutWrapper(
        #		fusion_fw_cell, output_keep_prob=dropout_rate)
        fusion_bw_cell = tf.nn.rnn_cell.GRUCell(self.embedding_size)
        #fusion_bw_cell = tf.nn.rnn_cell.DropoutWrapper(
        #		fusion_bw_cell, output_keep_prob=dropout_rate)
        (self.facts_, _, _) = tf.nn.bidirectional_rnn(fusion_fw_cell,
                                                      fusion_bw_cell,
                                                      self.story_state_array,
                                                      dtype=tf.float32)
        # (self.facts_, _) = tf.nn.rnn(fusion_fw_cell, self.story_state_array, sequence_length=self.story_len, dtype=tf.float32, scope='story_rnn')

        #------------ episodic memory module ------------

        attention_ff_size = z_dim = self.embedding_size * 8
        attention_ff_l2_size = 1
        self.question_state_double = tf.concat(
            1, [self.question_state, self.question_state])
        # -------- multi-layer feedforward for multi-hop propagation -----------
        self.facts = tf.concat(0, self.facts_)
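        # Each fact is a concatenated forward/backward state of width
        # embedding_size * 2, and the attention feature z below stacks four
        # such interaction terms, hence attention_ff_size = embedding_size * 8.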
        # ep_cell = cell.MGRUCell(self.ep_size)
        # mem_cell = cell.MemCell(self.m_size)
        mem_weights = tf.get_variable("mem_weights",
                                      [embedding_size * 4, self.m_size],
                                      initializer=tf.random_normal_initializer(
                                          0.0, 0.5))
        mem_biases = tf.get_variable(
            "mem_biases", [self.m_size],
            initializer=tf.random_normal_initializer())
        l1_weights = tf.get_variable(
            "l1_weights", [attention_ff_size, attention_ff_l1_size],
            initializer=tf.random_normal_initializer())
        l1_biases = tf.get_variable("l1_biases", [attention_ff_l1_size],
                                    initializer=tf.random_normal_initializer())
        l2_weights = tf.get_variable(
            "l2_weights", [attention_ff_l1_size, attention_ff_l2_size],
            initializer=tf.random_normal_initializer())
        l2_biases = tf.get_variable("l2_biases", [attention_ff_l2_size],
                                    initializer=tf.random_normal_initializer())
        mgru_weights = {}
        embedding_size_double = embedding_size * 2
        mgru_weights['ur_weights'] = tf.get_variable(
            'ur_weights', [embedding_size_double, embedding_size_double],
            initializer=tf.random_normal_initializer())
        mgru_weights['wr_weights'] = tf.get_variable(
            'wr_weights', [embedding_size_double, embedding_size_double],
            initializer=tf.random_normal_initializer())
        mgru_weights['wr_bias'] = tf.get_variable(
            'wr_bias', [embedding_size_double],
            initializer=tf.random_normal_initializer())
        mgru_weights['uh_weights'] = tf.get_variable(
            'uh_weights', [embedding_size_double, embedding_size_double],
            initializer=tf.random_normal_initializer())
        mgru_weights['wh_weights'] = tf.get_variable(
            'wh_weights', [embedding_size_double, embedding_size_double],
            initializer=tf.random_normal_initializer())
        mgru_weights['wh_bias'] = tf.get_variable(
            'wh_bias', [embedding_size_double],
            initializer=tf.random_normal_initializer())

        def MGRU(inputs, episodic_gates):
            """Modified GRU: steps over `inputs` (fact vectors, each reshaped
            to [1, embedding_size * 2]) using `episodic_gates` as the per-fact
            update gates, and returns the final hidden state."""
            batch_size = array_ops.shape(inputs[0])[0]
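            # Per-step update implemented below (a sketch; g_t is the
            # episodic/attention gate for fact t):
            #   r_t = sigmoid(x_t @ U_r + h_{t-1} @ W_r + b_r)
            #   c_t = tanh(x_t @ U_h + r_t * (h_{t-1} @ W_h) + b_h)
            #   h_t = g_t * c_t + (1 - g_t) * h_{t-1}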
            state = tf.zeros([1, embedding_size_double], tf.float32)
            for time, (input_, episodic_gate_) in enumerate(
                    zip(inputs, episodic_gates)):
                input_ = tf.reshape(input_, [1, embedding_size_double])
                r = tf.sigmoid(
                    tf.matmul(input_, mgru_weights['ur_weights']) +
                    tf.matmul(state, mgru_weights['wr_weights']) +
                    mgru_weights['wr_bias'])
                c = tf.tanh(
                    tf.matmul(input_, mgru_weights['uh_weights']) +
                    tf.mul(r, tf.matmul(state, mgru_weights['wh_weights'])) +
                    mgru_weights['wh_bias'])
                state = tf.mul(episodic_gate_, c) + tf.mul(
                    (1 - episodic_gate_), state)
            return state

        # episodic_gate_unpacked = []
        # def condition(mem_state_previous, hops):
        # 	mem_state_previous = tf.concat(1, [mem_state_previous, mem_state_previous])
        # 	z = tf.concat(1, [tf.mul(self.facts, self.question_state_double), tf.mul(self.facts, mem_state_previous),
        # 		tf.abs(tf.sub(self.facts, self.question_state_double)), tf.abs(tf.sub(self.facts, mem_state_previous))], name="z")
        # 	episodic_array_reshaped = tf.reshape(tf.matmul(tf.tanh(tf.matmul(z , l1_weights) + l1_biases) , l2_weights)
        # 	 + l2_biases, [1,-1], name="episodic_array_reshaped")
        # 	episodic_gate = tf.nn.softmax(episodic_array_reshaped)
        # 	episodic_gate_unpacked = tf.unpack( tf.reshape(episodic_gate, [maximum_story_length,1]))
        # 	argmax_ep_gate = tf.to_int32(tf.argmax(episodic_gate, 1)) #should be 1
        # 	# return tf.cond(tf.equal(hops,0),lambda: tf.constant(True),
        # 	# 	lambda: tf.logical_and(tf.less(argmax_ep_gate,self.story_len)[0],tf.less(hops,tf.constant(self.memory_hops))))
        # 	# return tf.logical_and(tf.less(argmax_ep_gate,self.story_len)[0],tf.less(hops,tf.constant(self.memory_hops)))
        # 	return tf.less(hops,tf.constant(self.memory_hops))
        # def body(mem_state_previous, hops):

        # 	# attention GRU
        # 	# outputs, context = cell.rnn_ep(ep_cell, self.facts_, episodic_gate_unpacked, dtype=tf.float32)
        # 	# outputs, context = ep_cell(ep_cell, self.facts_, episodic_gate_unpacked)
        # 	context = MGRU(self.facts_, episodic_gate_unpacked)

        # 	# memory updates
        # 	# mem_state_current = mem_cell(mem_state_previous, self.question_state, mem_state_previous, mem_weights, mem_biases, hops)
        # 	#question_state_next = question_state_prev
        # 	#print (self.question_state, mem_state_previous)
        # 	mem_state_current = tf.nn.relu(tf.matmul(tf.concat(1, [mem_state_previous, context, self.question_state]), mem_weights) + mem_biases)

        # 	hops = tf.add(hops,1)
        # 	return  mem_state_current, hops
        mem_state_array = []
        gate_array = []
        mem_state = self.question_state
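        # Episodic memory passes: each hop scores every fact against the
        # question and the current memory (the z feature below), soft-attends
        # over the facts with the modified GRU, then updates the memory with a
        # ReLU layer over [previous memory, context, question state].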
        for i in range(self.memory_hops):
            mem_state_double = tf.concat(1, [mem_state, mem_state])
            z = tf.concat(1, [
                tf.mul(self.facts, self.question_state_double),
                tf.mul(self.facts, mem_state_double),
                tf.abs(tf.sub(self.facts, self.question_state_double)),
                tf.abs(tf.sub(self.facts, mem_state_double))
            ],
                          name="z")
            episodic_array_reshaped = tf.reshape(
                tf.matmul(tf.tanh(tf.matmul(z, l1_weights) + l1_biases),
                          l2_weights) + l2_biases, [1, -1],
                name="episodic_array_reshaped")
            episodic_gate = tf.nn.softmax(episodic_array_reshaped)
            gate_array.append(episodic_gate)
            episodic_gate_unpacked = tf.unpack(
                tf.reshape(episodic_gate, [maximum_story_length, 1]))
            # argmax_ep_gate = tf.to_int32(tf.argmax(episodic_gate, 1)) #should be 1
            context = MGRU(self.facts_, episodic_gate_unpacked)
            mem_state = tf.nn.relu(
                tf.matmul(
                    tf.concat(1, [mem_state, context, self.question_state]),
                    mem_weights) + mem_biases)
            mem_state_array.append(mem_state)
        # initial_argmax_ep_gate = tf.constant(0)
        # initial_hops = tf.constant(0)
        # 	# initial_context = tf.constant([[0.5 for _ in range(50)]])
        # mem_state, self.hops = tf.while_loop(condition,body,[self.question_state, initial_hops], back_prop=True)

        self.gate_array = tf.concat(0, gate_array)
        self.a_state = mem_state_array[-1]

        self.predicted_answer = tf.matmul(self.a_state,
                                          answer_weights) + answer_biases
        self.softmax_answer = tf.nn.softmax(self.predicted_answer)
        self.argmax_answer = tf.argmax(self.softmax_answer, 1)
        answer = tf.reshape(tf.one_hot(self.answer, self.vocab_size, 1.0, 0.0),
                            [1, self.vocab_size])
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            self.predicted_answer, answer)
        # self.loss = tf.nn.softmax_cross_entropy_with_logits(self.predicted_answer, answer)
        params = tf.trainable_variables()

        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            # optimizer = tf.train.AdamOptimizer(self.learning_rate)
            gradients = tf.gradients(self.loss, params)

            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.gradient_norms = norm
            self.updates = optimizer.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
Exemplo n.º 56
0
    def __call__(self, x, state, scope=None):
        with tf.variable_scope(scope or type(self).__name__):
            c, h, step = state
            time = tf.tile(step[0], [self.num_out_ch])
            batch_size = x.get_shape().as_list()[0]

            c = self.shape_in(c)
            h = self.shape_in(h)
            x = self.shape_in(x, True)

            if self.is_training:
                alpha = tf.constant(self.alpha, dtype=tf.float32)
            else:
                alpha = tf.constant(0, dtype=tf.float32)

            bias = tf.get_variable('bias', [4 * self.num_out_ch])

            tau = tf.get_variable("tau", [self.num_out_ch],
                                  initializer=random_exp_initializer(
                                      0, self.tau_init),
                                  dtype=tf.float32)
            s = tf.get_variable(
                "s", [self.num_out_ch],
                initializer=init_ops.random_uniform_initializer(
                    0., tau.initialized_value()),
                dtype=tf.float32)
            r_on = tf.get_variable("r_on", [self.num_out_ch],
                                   initializer=init_ops.constant_initializer(
                                       self.r_on_init),
                                   dtype=tf.float32,
                                   trainable=False)

            phi = dk_mod(dk_mod((time - s), tau) + tau, tau) / tau
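            # phi is each channel's position within its period tau (shifted by
            # s and normalized to [0, 1)). The gate k below ramps 0 -> 1 over
            # the first half of the open fraction r_on, ramps 1 -> 0 over the
            # second half, and otherwise leaks with slope alpha (alpha is set
            # to 0 at inference time above).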

            is_up = tf.less(phi, (r_on * 0.5))
            is_down = tf.logical_and(tf.less(phi, r_on), tf.logical_not(is_up))

            k = tf.where(
                is_up, 2. * (phi / r_on),
                tf.where(is_down, 2. - 2. * (phi / r_on), alpha * phi))

            k = tf.reshape(k, [1, 1, 1, self.num_out_ch])

            xh = conv_linear([x],
                             self.filter_size,
                             self.num_out_ch * 4,
                             False,
                             scope='xh',
                             initializer=conv_orthogonal_initializer,
                             init_param=None)
            hh = conv_linear([h],
                             self.filter_size,
                             self.num_out_ch * 4,
                             False,
                             scope='hh',
                             initializer=conv_identity_initializer,
                             init_param=0.95)

            hidden = xh + hh + bias

            i, j, f, o = tf.split(hidden, 4, axis=3)

            new_c = c * tf.nn.sigmoid(f) + tf.nn.sigmoid(i) * self.activation(
                j)

            phased_new_c = k * new_c + (1 - k) * c

            new_h = tf.nn.tanh(new_c) * tf.nn.sigmoid(o)
            phased_new_h = k * new_h + (1 - k) * h

            phased_new_c = self.shape_out(phased_new_c)
            phased_new_h = self.shape_out(phased_new_h)

            return phased_new_h, (phased_new_c, phased_new_h, step + 1)
Exemplo n.º 57
0
  def testDuplicatedInitializer(self):
    init = init_ops.random_uniform_initializer(0.0, 1.0)
    self.assertFalse(duplicated_initializer(self, init, 1))
Exemplo n.º 59
0
def EUNN_param(hidden_size, capacity=2, FFT=False, comp=False):

    theta_phi_initializer = init_ops.random_uniform_initializer(-np.pi, np.pi)
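    # A brief sketch of what follows: the angles theta (and phases phi when
    # comp=True) parameterize the unitary matrix as `capacity` layers of 2x2
    # rotations. The FFT branch pairs coordinates in an FFT-like butterfly
    # pattern with log2(hidden_size) layers; the tunable branch alternates two
    # fixed pairing schemes (the theta_0 / theta_1 blocks below).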
    if FFT:
        capacity = int(np.log2(hidden_size))

        params_theta_0 = vs.get_variable("theta_0",
                                         [capacity, hidden_size // 2],
                                         initializer=theta_phi_initializer)
        cos_theta_0 = math_ops.cos(params_theta_0)
        sin_theta_0 = math_ops.sin(params_theta_0)

        if comp:

            params_phi_0 = vs.get_variable("phi_0",
                                           [capacity, hidden_size // 2],
                                           initializer=theta_phi_initializer)
            cos_phi_0 = math_ops.cos(params_phi_0)
            sin_phi_0 = math_ops.sin(params_phi_0)

            cos_list_0_re = array_ops.concat(
                [cos_theta_0,
                 math_ops.multiply(cos_theta_0, cos_phi_0)], 1)
            cos_list_0_im = array_ops.concat([
                array_ops.zeros_like(cos_theta_0),
                math_ops.multiply(cos_theta_0, sin_phi_0)
            ], 1)
            sin_list_0_re = array_ops.concat(
                [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)], 1)
            sin_list_0_im = array_ops.concat([
                array_ops.zeros_like(sin_theta_0),
                -math_ops.multiply(sin_theta_0, sin_phi_0)
            ], 1)
            cos_list_0 = array_ops.unstack(
                math_ops.complex(cos_list_0_re, cos_list_0_im))
            sin_list_0 = array_ops.unstack(
                math_ops.complex(sin_list_0_re, sin_list_0_im))

        else:
            cos_list_0 = array_ops.unstack(
                array_ops.concat([cos_theta_0, cos_theta_0], 1))
            sin_list_0 = array_ops.unstack(
                array_ops.concat([sin_theta_0, -sin_theta_0], 1))

        ind, ind1 = permute_FFT(hidden_size)
        ind1_list = array_ops.unstack(ind1)

        diag_list_0 = []
        off_list_0 = []
        for i in range(capacity):
            diag_list_0.append(permute(cos_list_0[i], ind1_list[i]))
            off_list_0.append(permute(sin_list_0[i], ind1_list[i]))
        v1 = array_ops.stack(diag_list_0, 0)
        v2 = array_ops.stack(off_list_0, 0)

    else:

        params_theta_0 = vs.get_variable(
            "theta_0",
            [int(capacity / 2), int(hidden_size / 2)],
            initializer=theta_phi_initializer)
        cos_theta_0 = math_ops.cos(params_theta_0)
        sin_theta_0 = math_ops.sin(params_theta_0)

        if comp:
            params_phi_0 = vs.get_variable(
                "phi_0",
                [int(capacity / 2), int(hidden_size / 2)],
                initializer=theta_phi_initializer)
            cos_phi_0 = math_ops.cos(params_phi_0)
            sin_phi_0 = math_ops.sin(params_phi_0)

            cos_list_0_re = array_ops.concat(
                [cos_theta_0,
                 math_ops.multiply(cos_theta_0, cos_phi_0)], 1)
            cos_list_0_im = array_ops.concat([
                array_ops.zeros_like(cos_theta_0),
                math_ops.multiply(cos_theta_0, sin_phi_0)
            ], 1)
            sin_list_0_re = array_ops.concat(
                [sin_theta_0, -math_ops.multiply(sin_theta_0, cos_phi_0)], 1)
            sin_list_0_im = array_ops.concat([
                array_ops.zeros_like(sin_theta_0),
                -math_ops.multiply(sin_theta_0, sin_phi_0)
            ], 1)
            cos_list_0 = array_ops.unstack(
                math_ops.complex(cos_list_0_re, cos_list_0_im))
            sin_list_0 = array_ops.unstack(
                math_ops.complex(sin_list_0_re, sin_list_0_im))
        else:
            cos_list_0 = array_ops.concat([cos_theta_0, cos_theta_0], 1)
            sin_list_0 = array_ops.concat([sin_theta_0, -sin_theta_0], 1)

        params_theta_1 = vs.get_variable(
            "theta_1",
            [int(capacity / 2), int(hidden_size / 2) - 1],
            initializer=theta_phi_initializer)
        cos_theta_1 = math_ops.cos(params_theta_1)
        sin_theta_1 = math_ops.sin(params_theta_1)

        if comp:
            params_phi_1 = vs.get_variable(
                "phi_1", [int(capacity / 2),
                          int(hidden_size / 2) - 1],
                initializer=theta_phi_initializer)
            cos_phi_1 = math_ops.cos(params_phi_1)
            sin_phi_1 = math_ops.sin(params_phi_1)

            cos_list_1_re = array_ops.concat([
                np.ones((int(capacity / 2), 1)), cos_theta_1,
                math_ops.multiply(cos_theta_1, cos_phi_1),
                np.ones((int(capacity / 2), 1))
            ], 1)
            cos_list_1_im = array_ops.concat([
                np.zeros((int(capacity / 2), 1)),
                array_ops.zeros_like(cos_theta_1),
                math_ops.multiply(cos_theta_1, sin_phi_1),
                np.zeros((int(capacity / 2), 1))
            ], 1)
            sin_list_1_re = array_ops.concat([
                np.zeros((int(capacity / 2), 1)), sin_theta_1,
                -math_ops.multiply(sin_theta_1, cos_phi_1),
                np.zeros((int(capacity / 2), 1))
            ], 1)
            sin_list_1_im = array_ops.concat([
                np.zeros((int(capacity / 2), 1)),
                array_ops.zeros_like(sin_theta_1),
                -math_ops.multiply(sin_theta_1, sin_phi_1),
                np.zeros((int(capacity / 2), 1))
            ], 1)
            cos_list_1 = array_ops.unstack(
                math_ops.complex(cos_list_1_re, cos_list_1_im))
            sin_list_1 = array_ops.unstack(
                math_ops.complex(sin_list_1_re, sin_list_1_im))
        else:
            cos_list_1 = array_ops.concat([
                np.ones((int(capacity / 2), 1)), cos_theta_1, cos_theta_1,
                np.ones((int(capacity / 2), 1))
            ], 1)
            sin_list_1 = array_ops.concat([
                np.zeros((int(capacity / 2), 1)), sin_theta_1, -sin_theta_1,
                np.zeros((int(capacity / 2), 1))
            ], 1)

        ind, ind3, ind4 = permute_tunable(hidden_size, capacity)

        diag_list_0 = permute(cos_list_0, ind3)
        off_list_0 = permute(sin_list_0, ind3)
        diag_list_1 = permute(cos_list_1, ind4)
        off_list_1 = permute(sin_list_1, ind4)

        v1 = tf.reshape(tf.concat([diag_list_0, diag_list_1], 1),
                        [capacity, hidden_size])
        v2 = tf.reshape(tf.concat([off_list_0, off_list_1], 1),
                        [capacity, hidden_size])

    if comp:
        omega = vs.get_variable("omega", [hidden_size],
                                initializer=theta_phi_initializer)
        D = math_ops.complex(math_ops.cos(omega), math_ops.sin(omega))
    else:
        D = None

    v1 = toTensorArray(v1)
    v2 = toTensorArray(v2)
    ind = toTensorArray(ind)
    diag = D

    return v1, v2, ind, diag, capacity
Exemplo n.º 60
0
    def __call__(self, inputs, state, scope=None):
        """ Phased long short-term memory cell (P-LSTM)."""
        with vs.variable_scope(scope or type(self).__name__):
            # Parameters of gates are concatenated into one multiply for efficiency.
            c_prev, h_prev = state

            # (batch_size, seq_len, 2)
            # NB: here we explicitly give t as input.
            x = tf.reshape(inputs[:, 0], (-1, 1))
            t = inputs[:, 1][-1]  # For now only a single time value is used; handling one time per batch element would be more involved.

            # The time signal could instead be supplied from outside the cell; to be decided later.

            concat = _linear([x, h_prev], 4 * self._num_units, True)
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

            dtype = inputs.dtype
            tau = vs.get_variable('tau', shape=[self._num_units],
                                  initializer=random_exp_initializer(0, self.tau_init), dtype=dtype)

            r_on = vs.get_variable('r_on', shape=[self._num_units],
                                   initializer=init_ops.constant_initializer(self.r_on_init), dtype=dtype)

            s = vs.get_variable('s', shape=[self._num_units],
                                initializer=init_ops.random_uniform_initializer(0., tau.initialized_value()),
                                dtype=dtype)

            times = tf.tile(tf.reshape(t, [-1, 1]), [1, self._num_units])
            phase = phi(times, s, tau)
            kappa = time_gate_fast(phase, r_on, self._leak_rate, self._training_phase)

            w_o_peephole = None
            if self._use_peepholes:
                w_i_peephole = vs.get_variable('W_I_peephole', shape=[self._num_units], dtype=dtype)
                w_f_peephole = vs.get_variable('W_F_peephole', shape=[self._num_units], dtype=dtype)
                w_o_peephole = vs.get_variable('W_O_peephole', shape=[self._num_units], dtype=dtype)
                f += w_f_peephole * c_prev
                i += w_i_peephole * c_prev

            new_c_tilde = sigmoid(f) * c_prev + sigmoid(i) * self._activation(j)
            if self._use_peepholes:
                o += w_o_peephole * new_c_tilde

            new_h_tilde = sigmoid(o) * self._activation(new_c_tilde)

            """
            Hi all,
            Yes, Philippe, you are correct in that Equation 4 should reference c_tilde and not c.
            I can add a point to the paper to mention that, and will update Figure 1 so the line is
            correctly drawn to c_tilde instead. The intuition here is that the gates should be blind
            to the effect of the khronos gate; input, forget and output gate should all operate as if
            the cell were a normal LSTM cell, while the khronos gate allows it to either operate or
            not operate (and then linearly interpolates between these two states). If the output gate
            is influenced by the khronos gate (if the peepholes reference c instead of c_tilde), then
            the PLSTM would no longer be a gated LSTM cell, but somehow be self-dependent on the time gate's actual operation.
            I think everyone's right in that it wouldn't influence much -- but it should be updated in
            the paper. Thanks very much for pointing out the issue, Philippe!
            -Danny"""
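            # In equations (kappa is the time gate computed above; the peephole
            # line applies only when peepholes are enabled):
            #   c_tilde = sigmoid(f) * c_prev + sigmoid(i) * act(j)
            #   o      += W_O_peephole * c_tilde     # peephole sees c_tilde, not c
            #   h_tilde = sigmoid(o) * act(c_tilde)
            #   c       = kappa * c_tilde + (1 - kappa) * c_prev
            #   h       = kappa * h_tilde + (1 - kappa) * h_prev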

            # Apply Khronos gate
            new_h = kappa * new_h_tilde + (1 - kappa) * h_prev
            new_c = kappa * new_c_tilde + (1 - kappa) * c_prev
            new_state = (new_c, new_h)
            return new_h, new_state