def testGRUCell(self):
    """Smoke-test GRUCell outputs for two different input widths.

    A 2-unit GRUCell is run once on a 2-wide input and once on a 3-wide
    input (input_size != num_units). Both runs happen under a variable
    scope whose default initializer is the constant 0.5, and the outputs
    are compared against hard-coded expected values.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 2])
            state = array_ops.zeros([1, 2])
            output, _ = core_rnn_cell_impl.GRUCell(2)(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                inputs.name: np.array([[1., 1.]]),
                state.name: np.array([[0.1, 0.1]]),
            }
            (output_value,) = sess.run([output], feed)
            # Smoke test
            self.assertAllClose(output_value, [[0.175991, 0.175991]])

        with variable_scope.variable_scope(
                "other", initializer=init_ops.constant_initializer(0.5)):
            # Test GRUCell with input_size != num_units.
            wide_inputs = array_ops.zeros([1, 3])
            state = array_ops.zeros([1, 2])
            output, _ = core_rnn_cell_impl.GRUCell(2)(wide_inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                wide_inputs.name: np.array([[1., 1., 1.]]),
                state.name: np.array([[0.1, 0.1]]),
            }
            (output_value,) = sess.run([output], feed)
            # Smoke test
            self.assertAllClose(output_value, [[0.156736, 0.156736]])
Exemplo n.º 2
0
  def testMultiRNNCellWithStateTuple(self):
    """Checks MultiRNNCell with state_is_tuple=True.

    A non-tuple state must raise ValueError; a tuple state must run and
    produce the hard-coded smoke-test values for each layer's state.
    """

    def build_stacked_gru():
      # Two stacked 2-unit GRU cells that exchange state as a tuple.
      layers = [core_rnn_cell_impl.GRUCell(2) for _ in range(2)]
      return core_rnn_cell_impl.MultiRNNCell(layers, state_is_tuple=True)

    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        flat_state = array_ops.zeros([1, 4])
        tuple_state = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))

        # Test incorrectness of state
        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
          build_stacked_gru()(x, flat_state)

        _, new_state = build_stacked_gru()(x, tuple_state)

        sess.run([variables.global_variables_initializer()])
        feed = {x.name: np.array([[1., 1.]])}
        for layer_state in tuple_state:
          feed[layer_state.name] = np.array([[0.1, 0.1]])
        state_values = sess.run(new_state, feed)

        # The numbers in results were not calculated, this is just a
        # smoke test.  However, these numbers should match those of
        # the test testMultiRNNCell.
        self.assertAllClose(state_values[0], [[0.175991, 0.175991]])
        self.assertAllClose(state_values[1], [[0.13248, 0.13248]])
Exemplo n.º 3
0
  def testEmbeddingAttentionDecoder(self):
    """Checks the output and state shapes of embedding_attention_decoder."""

    def new_gru_cell():
      return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
        encoder_cell = new_gru_cell()
        enc_outputs, enc_state = core_rnn.static_rnn(
            encoder_cell, encoder_inputs, dtype=dtypes.float32)

        # Stack the per-step encoder outputs along a new middle axis.
        per_step_states = []
        for enc_out in enc_outputs:
          per_step_states.append(
              array_ops.reshape(enc_out, [-1, 1, encoder_cell.output_size]))
        attn_states = array_ops.concat(per_step_states, 1)

        decoder_inputs = []
        for symbol_id in range(3):
          decoder_inputs.append(
              constant_op.constant(symbol_id, dtypes.int32, shape=[2]))

        # Use a new cell instance since the attention decoder uses a
        # different variable scope.
        dec, mem = seq2seq_lib.embedding_attention_decoder(
            decoder_inputs,
            enc_state,
            attn_states,
            new_gru_cell(),
            num_symbols=4,
            embedding_size=2,
            output_size=3)
        sess.run([variables.global_variables_initializer()])

        decoder_outputs = sess.run(dec)
        self.assertEqual(3, len(decoder_outputs))
        self.assertEqual((2, 3), decoder_outputs[0].shape)

        (final_state,) = sess.run([mem])
        self.assertEqual((2, 2), final_state.shape)
Exemplo n.º 4
0
  def testBlockGRUToGRUCellSingleStep(self):
    """Compares one step of GRUBlockCell with GRUCell on identical feeds.

    Both cells are built under their own variable scope with the same
    seeded random-uniform initializer and must produce close results.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size, cell_size, input_size = 4, 5, 6

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)

      # Graph tensors; their values are overridden through the feed dict.
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Random values fed in place of x and h.
      x_feed = np.random.rand(batch_size, input_size)
      h_feed = np.random.rand(batch_size, cell_size)
      feeds = {x: x_feed, h: h_feed}

      def run_single_step(scope_name, cell_cls):
        # Build the cell inside its own scope, initialize, and evaluate it.
        with vs.variable_scope(scope_name, initializer=initializer):
          step_output = cell_cls(cell_size)(x, h)
          sess.run([variables.global_variables_initializer()])
          return sess.run([step_output], feeds)

      # Output from the basic GRU cell implementation.
      basic_res = run_single_step("basic", core_rnn_cell_impl.GRUCell)
      # Output from the block GRU cell implementation.
      block_res = run_single_step("block", gru_ops.GRUBlockCell)

      self.assertEqual(len(block_res), len(basic_res))
      for block_value, basic_value in zip(block_res, basic_res):
        self.assertAllClose(block_value, basic_value)
    def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
        """Checks that DeviceWrapper keeps gru_cell nodes on the GPU.

        The dynamic_rnn is built under a "/cpu:0" device block while the
        wrapped cell requests "/gpu:0"; the traced step stats must show
        "gru_cell" nodes on the GPU device and none on the CPU device.
        Returns without asserting anything when no GPU is available.
        """
        if not test.is_gpu_available():
            # Can't perform this test w/o a GPU
            return

        with self.test_session(use_gpu=True) as sess:
            with variable_scope.variable_scope(
                    "root", initializer=init_ops.constant_initializer(0.5)):
                x = array_ops.zeros([1, 1, 3])
                gpu_cell = core_rnn_cell_impl.DeviceWrapper(
                    core_rnn_cell_impl.GRUCell(3), "/gpu:0")
                with ops.device("/cpu:0"):
                    outputs, _ = rnn.dynamic_rnn(
                        cell=gpu_cell, inputs=x, dtype=dtypes.float32)
                run_metadata = config_pb2.RunMetadata()
                trace_options = config_pb2.RunOptions(
                    trace_level=config_pb2.RunOptions.FULL_TRACE)

                sess.run([variables_lib.global_variables_initializer()])
                sess.run(outputs,
                         options=trace_options,
                         run_metadata=run_metadata)

            dev_stats = run_metadata.step_stats.dev_stats
            # Exactly two device entries are consulted; pick which is the GPU.
            if "gpu" in dev_stats[0].device:
                gpu_index, cpu_index = 0, 1
            else:
                gpu_index, cpu_index = 1, 0
            gpu_stats = dev_stats[gpu_index].node_stats
            cpu_stats = dev_stats[cpu_index].node_stats

            gru_nodes_on_cpu = [
                stat for stat in cpu_stats if "gru_cell" in stat.node_name
            ]
            gru_nodes_on_gpu = [
                stat for stat in gpu_stats if "gru_cell" in stat.node_name
            ]
            self.assertFalse(gru_nodes_on_cpu)
            self.assertTrue(gru_nodes_on_gpu)
Exemplo n.º 6
0
  def testAttentionDecoder1(self):
    """Checks the output and state shapes of attention_decoder."""

    def new_gru_cell():
      return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        encoder_cell = new_gru_cell()
        encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
        enc_outputs, enc_state = core_rnn.static_rnn(
            encoder_cell, encoder_inputs, dtype=dtypes.float32)

        # Stack the per-step encoder outputs along a new middle axis.
        per_step_states = []
        for enc_out in enc_outputs:
          per_step_states.append(
              array_ops.reshape(enc_out, [-1, 1, encoder_cell.output_size]))
        attn_states = array_ops.concat(per_step_states, 1)
        decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3

        # Create a new cell instance for the decoder, since it uses a
        # different variable scope
        dec, mem = seq2seq_lib.attention_decoder(
            decoder_inputs,
            enc_state,
            attn_states,
            new_gru_cell(),
            output_size=4)
        sess.run([variables.global_variables_initializer()])

        decoder_outputs = sess.run(dec)
        self.assertEqual(3, len(decoder_outputs))
        self.assertEqual((2, 4), decoder_outputs[0].shape)

        (final_state,) = sess.run([mem])
        self.assertEqual((2, 2), final_state.shape)
Exemplo n.º 7
0
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
    """Benchmark inference speed between GRUBlockCell vs GRUCell.

    Builds a time-major dynamic_rnn for each cell type over the same input
    and initial-state variables, times each with `time_taken_by_op`, and
    prints one comma-separated line with the sizes, the two timings and the
    relative improvement of the block cell in percent.

    Returns:
      A `(basic_time_inference, block_time_inference)` tuple holding the
      values returned by `time_taken_by_op` for GRUCell and GRUBlockCell.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        device_name = "/gpu:0" if use_gpu else "/cpu:0"
        with ops.device(device_name):

            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = vs.get_variable(
                "concat_x", [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            def timed_inference(scope_name, cell_cls):
                # Build, initialize and time one cell type in its own scope.
                with vs.variable_scope(scope_name, initializer=initializer):
                    rnn_outputs, _ = rnn.dynamic_rnn(
                        cell_cls(cell_size),
                        inputs=concat_x,
                        initial_state=h,
                        time_major=True,
                        dtype=dtypes.float32)
                    sess.run([variables.global_variables_initializer()])
                    return time_taken_by_op(rnn_outputs, sess, iters)

            # Output from the basic GRU cell implementation.
            basic_time_inference = timed_inference(
                "basic", core_rnn_cell_impl.GRUCell)
            # Output from the block GRU cell implementation.
            block_time_inference = timed_inference(
                "block", gru_ops.GRUBlockCell)

        time_saved = basic_time_inference - block_time_inference
        performance_inference = time_saved * 100 / basic_time_inference
        report_fields = (
            batch_size,
            cell_size,
            input_size,
            time_steps,
            use_gpu,
            basic_time_inference,
            block_time_inference,
            performance_inference,
        )
        print(",".join([str(field) for field in report_fields]))

        return basic_time_inference, block_time_inference
Exemplo n.º 8
0
  def testBlockGRUToGRUCellMultiStep(self):
    """Compares GRUBlockCell with GRUCell over several dynamic_rnn steps.

    Both cells are unrolled with a time-major `rnn.dynamic_rnn` on the same
    fed inputs and initial state, each under its own variable scope with the
    same seeded initializer. Per-step outputs and the final state of the two
    implementations must be close.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size = 2
      cell_size = 3
      input_size = 3
      time_steps = 4

      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_values = np.random.rand(time_steps, batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      # Output from the block GRU cell implementation.
      with vs.variable_scope("block", initializer=initializer):
        cell = gru_ops.GRUBlockCell(cell_size)
        outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        feeds = {concat_x: x_values, h: h_value}
        sess.run([variables.global_variables_initializer()])
        block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

      # Output from the basic GRU cell implementation.
      with vs.variable_scope("basic", initializer=initializer):
        cell = core_rnn_cell_impl.GRUCell(cell_size)
        outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
            cell,
            inputs=concat_x,
            initial_state=h,
            time_major=True,
            dtype=dtypes.float32)
        feeds = {concat_x: x_values, h: h_value}
        sess.run([variables.global_variables_initializer()])
        basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

      # Check the lengths of the outputs_dynamic, and states.
      self.assertEqual(len(block_res), len(basic_res))
      self.assertEqual(len(block_res[0]), len(basic_res[0]))
      self.assertEqual(len(block_res[1]), len(basic_res[1]))

      # Check the outputs_dynamic values.
      for block_output, basic_output in zip(block_res[0], basic_res[0]):
        self.assertAllClose(block_output, basic_output)

      # Check the state_dynamic value. The block result must be compared
      # against the basic result; comparing block_res[1] with itself would
      # always pass and verify nothing.
      self.assertAllClose(block_res[1], basic_res[1])
Exemplo n.º 9
0
  def testRNNDecoder(self):
    """Checks rnn_decoder output and state shapes with a projected GRU cell."""
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
        encoder_cell = core_rnn_cell_impl.GRUCell(2)
        _, enc_state = core_rnn.static_rnn(
            encoder_cell, encoder_inputs, dtype=dtypes.float32)

        decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3
        # Project the 2-unit GRU output to 4 values per step.
        decoder_cell = core_rnn_cell_impl.OutputProjectionWrapper(
            core_rnn_cell_impl.GRUCell(2), 4)
        dec, mem = seq2seq_lib.rnn_decoder(
            decoder_inputs, enc_state, decoder_cell)
        sess.run([variables.global_variables_initializer()])

        decoder_outputs = sess.run(dec)
        self.assertEqual(3, len(decoder_outputs))
        self.assertEqual((2, 4), decoder_outputs[0].shape)

        (final_state,) = sess.run([mem])
        self.assertEqual((2, 2), final_state.shape)
Exemplo n.º 10
0
 def testDeviceWrapper(self):
     """Checks that DeviceWrapper places the cell output on its device."""
     device_name = "/cpu:14159"
     with variable_scope.variable_scope(
             "root", initializer=init_ops.constant_initializer(0.5)):
         inputs = array_ops.zeros([1, 3])
         state = array_ops.zeros([1, 3])
         wrapped_cell = core_rnn_cell_impl.DeviceWrapper(
             core_rnn_cell_impl.GRUCell(3), device_name)
         outputs, _ = wrapped_cell(inputs, state)
         # The device string is compared case-insensitively.
         output_device = outputs.device.lower()
         self.assertIn("cpu:14159", output_device)
Exemplo n.º 11
0
 def GRUSeq2Seq(enc_inp, dec_inp):
   """Builds an embedding-attention seq2seq model on a 2-layer GRU stack.

   `classes`, `w` and `b` are taken from the enclosing scope.
   """
   gru_layers = [core_rnn_cell_impl.GRUCell(24) for _ in range(2)]
   stacked_gru = core_rnn_cell_impl.MultiRNNCell(
       gru_layers, state_is_tuple=True)
   projection = (w, b)
   return seq2seq_lib.embedding_attention_seq2seq(
       enc_inp,
       dec_inp,
       stacked_gru,
       num_encoder_symbols=classes,
       num_decoder_symbols=classes,
       embedding_size=24,
       output_projection=projection)
Exemplo n.º 12
0
    def get_rnncell(cell_type, cell_size, keep_prob, num_layer):
        """Build a (possibly stacked) GRU or LSTM cell.

        Args:
          cell_type: "gru" selects `rnn_cell.GRUCell`; any other value selects
            `rnn_cell.LSTMCell` (no peepholes, forget_bias=1.0).
          cell_size: number of units passed to each cell constructor.
          keep_prob: when below 1.0, each layer is wrapped in a
            `DropoutWrapper` with this output keep probability.
          num_layer: when above 1, that many layers are combined in a
            `MultiRNNCell` with `state_is_tuple=True`.

        Returns:
          The single cell, or the `MultiRNNCell` stacking `num_layer` cells.
        """

        def build_layer():
            # Create one independent layer, with optional output dropout.
            if cell_type == "gru":
                layer = rnn_cell.GRUCell(cell_size)
            else:
                layer = rnn_cell.LSTMCell(
                    cell_size, use_peepholes=False, forget_bias=1.0)

            if keep_prob < 1.0:
                layer = rnn_cell.DropoutWrapper(
                    layer, output_keep_prob=keep_prob)
            return layer

        if num_layer > 1:
            # Every layer needs its own cell object: `[cell] * num_layer`
            # repeats one instance, which newer TensorFlow 1.x releases
            # reject (or treat as weight sharing between layers).
            layers = [build_layer() for _ in range(num_layer)]
            return rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

        return build_layer()
Exemplo n.º 13
0
  def testBasicRNNSeq2Seq(self):
    """Checks basic_rnn_seq2seq output and state shapes."""
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
        decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3
        # Project the 2-unit GRU output to 4 values per step.
        projected_cell = core_rnn_cell_impl.OutputProjectionWrapper(
            core_rnn_cell_impl.GRUCell(2), 4)
        dec, mem = seq2seq_lib.basic_rnn_seq2seq(
            encoder_inputs, decoder_inputs, projected_cell)
        sess.run([variables.global_variables_initializer()])

        decoder_outputs = sess.run(dec)
        self.assertEqual(3, len(decoder_outputs))
        self.assertEqual((2, 4), decoder_outputs[0].shape)

        (final_state,) = sess.run([mem])
        self.assertEqual((2, 2), final_state.shape)
Exemplo n.º 14
0
 def testMultiRNNCell(self):
   """Smoke-tests a 2-layer GRU MultiRNNCell using a concatenated state."""
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       inputs = array_ops.zeros([1, 2])
       # With state_is_tuple=False both layers' states share one tensor.
       flat_state = array_ops.zeros([1, 4])
       gru_layers = [core_rnn_cell_impl.GRUCell(2) for _ in range(2)]
       stacked_cell = core_rnn_cell_impl.MultiRNNCell(
           gru_layers, state_is_tuple=False)
       _, new_state = stacked_cell(inputs, flat_state)
       sess.run([variables_lib.global_variables_initializer()])
       feed = {
           inputs.name: np.array([[1., 1.]]),
           flat_state.name: np.array([[0.1, 0.1, 0.1, 0.1]]),
       }
       state_value = sess.run(new_state, feed)
       # The numbers in results were not calculated, this is just a smoke test.
       self.assertAllClose(state_value,
                           [[0.175991, 0.175991, 0.13248, 0.13248]])
Exemplo n.º 15
0
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
    """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.

    For each cell type, one step is built on identity copies of the `x` and
    `h` variables, the gradient of the cell output with respect to `h` is
    timed with `time_taken_by_op`, and a comma-separated summary line is
    printed.

    Returns:
      A `(basic_time_bprop, block_time_bprop)` tuple holding the values
      returned by `time_taken_by_op` for GRUCell and GRUBlockCell.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        device_name = "/gpu:0" if use_gpu else "/cpu:0"
        with ops.device(device_name):
            initializer = init_ops.random_uniform_initializer(-1, 1, seed=1989)
            # Inputs
            x = vs.get_variable("x", [batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            def timed_bprop(scope_name, cell_cls):
                # Build one cell step in its own scope and time d(output)/dh.
                with vs.variable_scope(scope_name, initializer=initializer):
                    step_output = cell_cls(cell_size)(
                        array_ops.identity(x), array_ops.identity(h))
                    sess.run([variables.global_variables_initializer()])
                    grad_wrt_state = gradients_impl.gradients(
                        [step_output], h)
                    return time_taken_by_op(grad_wrt_state, sess, iters)

            # Output from the basic GRU cell implementation.
            basic_time_bprop = timed_bprop(
                "basic", core_rnn_cell_impl.GRUCell)
            # Output from the block GRU cell implementation.
            block_time_bprop = timed_bprop("block", gru_ops.GRUBlockCell)

    time_saved = basic_time_bprop - block_time_bprop
    performance_inference = time_saved * 100 / basic_time_bprop

    report_fields = (
        batch_size,
        cell_size,
        input_size,
        use_gpu,
        basic_time_bprop,
        block_time_bprop,
        performance_inference,
    )
    print(",".join([str(field) for field in report_fields]))

    return basic_time_bprop, block_time_bprop
Exemplo n.º 16
0
 def test_rnn_decoder(self):
   """Checks the lengths and static shapes returned by ops.rnn_decoder."""
   with self.test_session():
     decoder_inputs = []
     for _ in range(3):
       decoder_inputs.append(array_ops.placeholder(dtypes.float32, [2, 2]))
     encoding = array_ops.placeholder(dtypes.float32, [2, 2])
     gru_cell = core_rnn_cell_impl.GRUCell(2)
     decoded = ops.rnn_decoder(decoder_inputs, encoding, gru_cell)
     outputs, states, sampling_outputs, sampling_states = decoded

     # Each returned sequence is paired with its expected length; the
     # first element of every sequence must have static shape [2, 2].
     expectations = (
         (outputs, 3),
         (states, 4),
         (sampling_outputs, 3),
         (sampling_states, 4),
     )
     for sequence, expected_len in expectations:
       self.assertEqual(len(sequence), expected_len)
       self.assertEqual(sequence[0].get_shape(), [2, 2])
Exemplo n.º 17
0
 def testInputProjectionWrapper(self):
   """Smoke-tests InputProjectionWrapper around a 3-unit GRUCell."""
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       inputs = array_ops.zeros([1, 2])
       state = array_ops.zeros([1, 3])
       # The 2-wide input is projected to 3 values before the GRU.
       projected_cell = core_rnn_cell_impl.InputProjectionWrapper(
           core_rnn_cell_impl.GRUCell(3), num_proj=3)
       output, new_state = projected_cell(inputs, state)
       sess.run([variables_lib.global_variables_initializer()])
       feed = {
           inputs.name: np.array([[1., 1.]]),
           state.name: np.array([[0.1, 0.1, 0.1]]),
       }
       output_value, state_value = sess.run([output, new_state], feed)
       self.assertEqual(state_value.shape, (1, 3))
       # The numbers in results were not calculated, this is just a smoke test.
       self.assertAllClose(output_value, [[0.154605, 0.154605, 0.154605]])
Exemplo n.º 18
0
 def testResidualWrapper(self):
   """Checks that ResidualWrapper adds the input to the wrapped output.

   The plain GRU cell and its residual-wrapped version are run with reused
   variables; the wrapped output must equal the plain output plus the
   input, and both new states must match.
   """
   with self.test_session() as sess:
     with variable_scope.variable_scope(
         "root", initializer=init_ops.constant_initializer(0.5)):
       inputs = array_ops.zeros([1, 3])
       state = array_ops.zeros([1, 3])
       base_cell = core_rnn_cell_impl.GRUCell(3)
       plain_out, plain_state = base_cell(inputs, state)
       # Let the wrapped cell share the variables created just above.
       variable_scope.get_variable_scope().reuse_variables()
       residual_cell = core_rnn_cell_impl.ResidualWrapper(base_cell)
       residual_out, residual_state = residual_cell(inputs, state)
       sess.run([variables_lib.global_variables_initializer()])
       fetches = [plain_out, residual_out, plain_state, residual_state]
       feed = {
           inputs: np.array([[1., 1., 1.]]),
           state: np.array([[0.1, 0.1, 0.1]]),
       }
       (plain_out_val, residual_out_val, plain_state_val,
        residual_state_val) = sess.run(fetches, feed)
       # Residual connections
       self.assertAllClose(residual_out_val, plain_out_val + [1., 1., 1.])
       # States are left untouched
       self.assertAllClose(plain_state_val, residual_state_val)
Exemplo n.º 19
0
 def testEmbeddingWrapper(self):
     """Smoke-tests EmbeddingWrapper around a 2-unit GRUCell."""
     with self.test_session() as sess:
         with variable_scope.variable_scope(
                 "root", initializer=init_ops.constant_initializer(0.5)):
             token_ids = array_ops.zeros([1, 1], dtype=dtypes.int32)
             state = array_ops.zeros([1, 2])
             wrapped_cell = core_rnn_cell_impl.EmbeddingWrapper(
                 core_rnn_cell_impl.GRUCell(2),
                 embedding_classes=3,
                 embedding_size=2)
             self.assertEqual(wrapped_cell.output_size, 2)
             output, new_state = wrapped_cell(token_ids, state)
             sess.run([variables_lib.global_variables_initializer()])
             feed = {
                 token_ids.name: np.array([[1]]),
                 state.name: np.array([[0.1, 0.1]]),
             }
             output_value, state_value = sess.run([output, new_state], feed)
             self.assertEqual(state_value.shape, (1, 2))
             # The numbers in results were not calculated, this is just a smoke test.
             self.assertAllClose(output_value, [[0.17139, 0.17139]])
Exemplo n.º 20
0
  def testDynamicAttentionDecoder1(self):
    """Checks attention_decoder shapes when the encoder is a dynamic_rnn."""

    def new_gru_cell():
      return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        encoder_cell = new_gru_cell()
        encoder_input = constant_op.constant(0.5, shape=[2, 2, 2])
        enc_outputs, enc_state = rnn.dynamic_rnn(
            encoder_cell, encoder_input, dtype=dtypes.float32)
        # The dynamic_rnn outputs are used directly as attention states.
        attn_states = enc_outputs
        decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3

        # Use a new cell instance since the attention decoder uses a
        # different variable scope.
        dec, mem = seq2seq_lib.attention_decoder(
            decoder_inputs,
            enc_state,
            attn_states,
            new_gru_cell(),
            output_size=4)
        sess.run([variables.global_variables_initializer()])

        decoder_outputs = sess.run(dec)
        self.assertEqual(3, len(decoder_outputs))
        self.assertEqual((2, 4), decoder_outputs[0].shape)

        (final_state,) = sess.run([mem])
        self.assertEqual((2, 2), final_state.shape)
    def test_attention(self):
        """Checks shapes produced by the attention train/inference decoders.

        Builds a time-major GRU encoder, prepares "luong" attention over its
        outputs, then runs a training decoder and an inference decoder that
        reuse the same "decoder" scope variables. Output and final-state
        shapes of both decoders are asserted.
        """
        with self.test_session() as sess:
            with variable_scope.variable_scope(
                    "root", initializer=init_ops.constant_initializer(0.5)):
                # Define inputs/outputs to model
                batch_size = 2
                encoder_embedding_size = 3
                decoder_embedding_size = 4
                encoder_hidden_size = 5
                decoder_hidden_size = encoder_hidden_size
                input_sequence_length = 6
                decoder_sequence_length = 7
                num_decoder_symbols = 20
                start_of_sequence_id = end_of_sequence_id = 1
                decoder_embeddings = variable_scope.get_variable(
                    "decoder_embeddings",
                    [num_decoder_symbols, decoder_embedding_size],
                    initializer=init_ops.random_normal_initializer(stddev=0.1))
                inputs = constant_op.constant(0.5,
                                              shape=[
                                                  input_sequence_length,
                                                  batch_size,
                                                  encoder_embedding_size
                                              ])
                decoder_inputs = constant_op.constant(
                    0.4,
                    shape=[
                        decoder_sequence_length, batch_size,
                        decoder_embedding_size
                    ])
                decoder_length = constant_op.constant(decoder_sequence_length,
                                                      dtype=dtypes.int32,
                                                      shape=[
                                                          batch_size,
                                                      ])

                # attention
                attention_option = "luong"  # can be "bahdanau"

                with variable_scope.variable_scope("rnn") as scope:
                    # Define model
                    encoder_outputs, encoder_state = rnn.dynamic_rnn(
                        cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                        inputs=inputs,
                        dtype=dtypes.float32,
                        time_major=True,
                        scope=scope)

                    # attention_states: size [batch_size, max_time, num_units]
                    attention_states = array_ops.transpose(
                        encoder_outputs, [1, 0, 2])

                with variable_scope.variable_scope("decoder") as scope:
                    # Prepare attention
                    (attention_keys, attention_values, attention_score_fn,
                     attention_construct_fn) = (
                         attention_decoder_fn.prepare_attention(
                             attention_states, attention_option,
                             decoder_hidden_size))
                    decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn)

                    # setting up weights for computing the final output
                    def create_output_fn():
                        def output_fn(x):
                            return layers.linear(x,
                                                 num_decoder_symbols,
                                                 scope=scope)

                        return output_fn

                    output_fn = create_output_fn()

                    # Train decoder
                    decoder_cell = core_rnn_cell_impl.GRUCell(
                        decoder_hidden_size)
                    (decoder_outputs_train, decoder_state_train,
                     _) = (seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_train,
                         inputs=decoder_inputs,
                         sequence_length=decoder_length,
                         time_major=True,
                         scope=scope))
                    decoder_outputs_train = output_fn(decoder_outputs_train)
                    # Setup variable reuse
                    scope.reuse_variables()

                    # Inference decoder
                    decoder_fn_inference = (
                        attention_decoder_fn.attention_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            attention_keys=attention_keys,
                            attention_values=attention_values,
                            attention_score_fn=attention_score_fn,
                            attention_construct_fn=attention_construct_fn,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32))
                    (decoder_outputs_inference, decoder_state_inference,
                     _) = (seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

                # Run model
                variables.global_variables_initializer().run()
                (decoder_outputs_train_res,
                 decoder_state_train_res) = sess.run(
                     [decoder_outputs_train, decoder_state_train])
                (decoder_outputs_inference_res,
                 decoder_state_inference_res) = sess.run(
                     [decoder_outputs_inference, decoder_state_inference])

                # Assert outputs
                self.assertEqual(
                    (decoder_sequence_length, batch_size, num_decoder_symbols),
                    decoder_outputs_train_res.shape)
                self.assertEqual((batch_size, num_decoder_symbols),
                                 decoder_outputs_inference_res.shape[1:3])
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_train_res.shape)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_inference_res.shape)
                # The dynamic decoder might end earlier than `maximum_length`
                # under inference, so only bound the time dimension (axis 0
                # of the time-major outputs). The state's axis 0 is the batch
                # size, already pinned by the assertion above, so checking it
                # here would verify nothing.
                self.assertGreaterEqual(decoder_sequence_length,
                                        decoder_outputs_inference_res.shape[0])
    def __init__(self,
                 batch_size=100,
                 vocab_size=5620,
                 word_dim=100,
                 lstm_dim=100,
                 num_classes=4,
                 l2_reg_lambda=0.0,
                 lr=0.001,
                 clip=5,
                 init_embedding=None,
                 bi_gram=False,
                 stack=False,
                 lstm_net=False,
                 bi_direction=False):
        """Build the graph: embedding -> RNN layer(s) -> linear scores -> CRF.

        Args:
          batch_size: stored on the instance; not otherwise read here.
          vocab_size: row count of the default (all-zero) embedding matrix.
          word_dim: column count of the default embedding matrix and the
            per-step feature width used when reshaping the embedded input.
          lstm_dim: number of units passed to each RNN cell.
          num_classes: width of the per-step score vector.
          l2_reg_lambda: scale handed to the l2 regularizer on the softmax
            weights.
          lr: learning rate passed to the Adam optimizer.
          clip: threshold passed to `tf.clip_by_global_norm`.
          init_embedding: initial value of the embedding variable; a zero
            matrix of shape [vocab_size, word_dim] is used when None.
          bi_gram: when this is anything other than `False`, the embedded
            input is reshaped to `9 * word_dim` features per step.
          stack: when truthy, a second RNN layer ('layer_2') is applied on
            top of the first one.
          lstm_net: when exactly `False`, GRU cells are created; otherwise
            BasicLSTM cells.
          bi_direction: when truthy, a bidirectional RNN is used and the
            forward/backward outputs are concatenated on the last axis.
        """
        # Hyper-parameters kept on the instance.
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.word_dim = word_dim
        self.lstm_dim = lstm_dim
        self.num_classes = num_classes
        self.l2_reg_lambda = l2_reg_lambda
        self.lr = lr
        self.clip = clip
        self.stack = stack
        self.lstm_net = lstm_net
        self.bi_direction = bi_direction

        self.init_embedding = (
            np.zeros([vocab_size, word_dim], dtype=np.float32)
            if init_embedding is None else init_embedding)

        # Graph inputs.
        self.x = tf.placeholder(tf.int32, [None, None])
        self.y = tf.placeholder(tf.int32, [None, None])
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        with tf.variable_scope("embedding"):
            self.embedding = tf.Variable(self.init_embedding, name="embedding")

        with tf.variable_scope("softmax"):
            # A bidirectional RNN emits forward and backward outputs side by
            # side, so the projection needs twice as many input rows.
            projection_rows = lstm_dim * 2 if self.bi_direction else lstm_dim
            self.W = tf.get_variable(
                shape=[projection_rows, num_classes],
                initializer=tf.truncated_normal_initializer(stddev=0.01),
                name="weights",
                regularizer=tf.contrib.layers.l2_regularizer(
                    self.l2_reg_lambda))

            self.b = tf.Variable(tf.zeros([num_classes], name="bias"))

        with tf.variable_scope("lstm"):
            if self.lstm_net is False:
                cell_cls = core_rnn_cell_impl.GRUCell
            else:
                cell_cls = core_rnn_cell_impl.BasicLSTMCell
            self.fw_cell = cell_cls(self.lstm_dim)
            self.bw_cell = cell_cls(self.lstm_dim)

        with tf.variable_scope("forward"):
            lengths = tf.cast(self.seq_len, tf.int64)
            # batch_size * (sequence*9 or 1) * word_dim
            embedded = tf.nn.embedding_lookup(self.embedding, self.x)
            embedded = tf.nn.dropout(embedded, self.dropout_keep_prob)

            size = tf.shape(embedded)[0]
            step_width = word_dim if bi_gram is False else 9 * word_dim
            embedded = tf.reshape(embedded, [size, -1, step_width])

            def run_layer(layer_inputs, layer_scope):
                """Apply one (bi)directional RNN layer under `layer_scope`."""
                if self.bi_direction:
                    (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                        self.fw_cell,
                        self.bw_cell,
                        layer_inputs,
                        dtype=tf.float32,
                        sequence_length=lengths,
                        scope=layer_scope)
                    return tf.concat(axis=2, values=[fw_out, bw_out])
                fw_out, _ = tf.nn.dynamic_rnn(self.fw_cell,
                                              layer_inputs,
                                              dtype=tf.float32,
                                              sequence_length=lengths,
                                              scope=layer_scope)
                return fw_out

            output = run_layer(embedded, 'layer_1')
            if self.stack:
                output = run_layer(output, 'layer_2')

            # Flatten to 2-D for the matmul, then restore the leading
            # dimension of the embedded input.
            flat_width = (2 * self.lstm_dim
                          if self.bi_direction else self.lstm_dim)
            output = tf.reshape(output, [-1, flat_width])

            matricized_unary_scores = tf.matmul(output, self.W) + self.b

            self.unary_scores = tf.reshape(matricized_unary_scores,
                                           [size, -1, self.num_classes])

        with tf.variable_scope("loss"):
            # CRF log likelihood; the loss is its negated mean.
            crf_result = tf.contrib.crf.crf_log_likelihood(
                self.unary_scores, self.y, self.seq_len)
            log_likelihood, self.transition_params = crf_result

            self.loss = tf.reduce_mean(-log_likelihood)

        with tf.variable_scope("train_ops"):
            self.optimizer = tf.train.AdamOptimizer(self.lr)

            self.global_step = tf.Variable(0,
                                           name="global_step",
                                           trainable=False)

            trainable = tf.trainable_variables()
            raw_grads = tf.gradients(self.loss, trainable)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.clip)
            self.train_op = self.optimizer.apply_gradients(
                zip(clipped_grads, trainable), global_step=self.global_step)
Exemplo n.º 23
0
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
  """Benchmark training speed between GRUBlockCell vs GRUCell.

  Builds the same squared-error training step once with `GRUCell` (scope
  "basic") and once with `GRUBlockCell` (scope "block"), times each with
  `time_taken_by_op`, and prints one comma-separated line holding the
  arguments, both timings and the relative difference in percent.

  Args:
    batch_size: batch dimension of the input, state and target variables.
    cell_size: number of units in each cell.
    input_size: feature dimension of the input variable.
    time_steps: leading (time-major) dimension of the input and target.
    use_gpu: place the ops on "/gpu:0" when truthy, otherwise "/cpu:0".
    iters: forwarded to `time_taken_by_op`.

  Returns:
    A `(basic_time_training, block_time_training)` tuple of the values
    returned by `time_taken_by_op` for the two cells.
  """
  ops.reset_default_graph()
  with session.Session(graph=ops.Graph()) as sess:
    # Pin every op below to the requested device.
    device_name = "/gpu:0" if use_gpu else "/cpu:0"
    with ops.device(device_name):

      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-1, 1, seed=seed)
      np.random.seed(seed)

      # Input sequence, initial state and regression target.
      concat_x = vs.get_variable("concat_x",
                                 [time_steps, batch_size, input_size])
      h = vs.get_variable("h", [batch_size, cell_size])
      y = vs.get_variable("y", [time_steps, batch_size, cell_size])

      def time_training_step(scope_name, cell_cls):
        """Build and time one SGD step for `cell_cls` under `scope_name`."""
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_cls(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          sess.run([variables.global_variables_initializer()])
          cost = math_ops.reduce_mean(math_ops.square(rnn_outputs - y))
          learning_rate = 0.01
          train_step = gradient_descent.GradientDescentOptimizer(
              learning_rate).minimize(cost)

          # time for a training step.
          return time_taken_by_op(train_step, sess, iters)

      # The plain GRUCell first, then the fused GRUBlockCell.
      basic_time_training = time_training_step("basic",
                                               core_rnn_cell_impl.GRUCell)
      block_time_training = time_training_step("block", gru_ops.GRUBlockCell)

    # Relative difference, in percent of the GRUCell timing.
    performance_training = (
        basic_time_training - block_time_training) * 100 / basic_time_training

    report_fields = (batch_size, cell_size, input_size, time_steps, use_gpu,
                     basic_time_training, block_time_training,
                     performance_training)
    print(",".join([str(field) for field in report_fields]))

    return basic_time_training, block_time_training
Exemplo n.º 24
0
  def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
    """GRUBlockCell and GRUCell must give matching gradients in dynamic_rnn.

    Both cells are unrolled over the same two-step input with the same
    seeded initializer; the gradients of the first output step with respect
    to the input and to the initial state are then compared.
    """
    batch_size = 2
    cell_size = 3
    input_size = 4
    time_steps = 2
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      # Random initializers.
      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      concat_x = array_ops.placeholder(
          dtypes.float32, shape=(time_steps, batch_size, input_size))
      h = array_ops.zeros([batch_size, cell_size])

      # Values fed for the input sequence and for the initial state tensor.
      x_values = np.random.rand(time_steps, batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)
      feeds = {concat_x: x_values, h: h_value}

      def first_step_gradients(scope_name, cell_cls):
        """Return [d out[0]/d x, d out[0]/d h] for `cell_cls`."""
        with vs.variable_scope(scope_name, initializer=initializer):
          rnn_outputs, _ = rnn.dynamic_rnn(
              cell_cls(cell_size),
              inputs=concat_x,
              initial_state=h,
              time_major=True,
              dtype=dtypes.float32)
          wrt_x = gradients_impl.gradients([rnn_outputs[0]], concat_x)
          wrt_h = gradients_impl.gradients([rnn_outputs[0]], h)

          sess.run([variables.global_variables_initializer()])
          return sess.run([wrt_x, wrt_h], feeds)

      # Gradients from the block GRU cell implementation.
      block_grad_res_x, block_grad_res_h = first_step_gradients(
          "block", gru_ops.GRUBlockCell)

      # Gradients from the basic GRU cell implementation.
      basic_grad_res_x, basic_grad_res_h = first_step_gradients(
          "basic", core_rnn_cell_impl.GRUCell)

    # Compare the derivatives wrt x first, then the derivatives wrt h: the
    # result lists must have equal length and be element-wise close.
    gradient_pairs = ((block_grad_res_x, basic_grad_res_x),
                      (block_grad_res_h, basic_grad_res_h))
    for block_grads, basic_grads in gradient_pairs:
      self.assertEqual(len(block_grads), len(basic_grads))
      for block, basic in zip(block_grads, basic_grads):
        self.assertAllClose(block, basic)
Exemplo n.º 25
0
  def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
    """GRUBlockCell and GRUCell must give matching single-step gradients.

    Each cell is applied once to the same `x` and `h`; gradients of its
    result with respect to the inputs and to four global variables are
    evaluated and compared between the two implementations.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
      batch_size = 2
      cell_size = 3
      input_size = 4

      seed = 1994
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
      np.random.seed(seed)

      # Inputs
      x = array_ops.zeros([batch_size, input_size])
      h = array_ops.zeros([batch_size, cell_size])

      # Values for the inputs.
      x_value = np.random.rand(batch_size, input_size)
      h_value = np.random.rand(batch_size, cell_size)

      def single_step_gradients(scope_name, cell_cls, variable_slice):
        """Evaluate the six gradients of one `cell_cls` step."""
        with vs.variable_scope(scope_name, initializer=initializer):
          output = cell_cls(cell_size)(x, h)
          sess.run([variables.global_variables_initializer()])

          # Pick this cell's four variables out of the global collection;
          # the slice relies on the order in which the cells were built.
          w_ru, b_ru, w_c, b_c = variables.global_variables()[variable_slice]

          # Same target order as the fetch order: x, h, w_ru, w_c, b_ru, b_c.
          grad_ops = [
              gradients_impl.gradients([output], target)
              for target in (x, h, w_ru, w_c, b_ru, b_c)
          ]
          return sess.run(grad_ops, {x: x_value, h: h_value})

      # Gradients from the block GRU cell implementation.
      d_block_res = single_step_gradients("block", gru_ops.GRUBlockCell,
                                          slice(0, 4))

      # Gradients from the basic GRU cell implementation.
      d_basic_res = single_step_gradients("basic", core_rnn_cell_impl.GRUCell,
                                          slice(4, 8))

      # Check lengths of derivative results.
      self.assertEqual(len(d_block_res), len(d_basic_res))
      # Check the value of every derivative result.
      for block, basic in zip(d_block_res, d_basic_res):
        self.assertAllClose(block, basic)
Exemplo n.º 26
0
def test_dynamic_rnn_decoder():
    """Smoke-run a GRU encoder with a train and an inference decoder.

    Builds a GRU encoder over constant inputs, then runs
    `seq2seq.dynamic_rnn_decoder` twice with shared variables: once with a
    training decoder function and once with an inference decoder function.
    The shapes of both decoder outputs and their argmax over axis 2 are
    printed; nothing is asserted.
    """
    with tf.Session() as sess:
        with tf.variable_scope(
                "root", initializer=tf.constant_initializer(0.5)):
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1

            decoder_embeddings = tf.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.1))

            # Time-major constant encoder input.
            inputs = tf.constant(0.5,
                                 shape=[
                                     input_sequence_length, batch_size,
                                     encoder_embedding_size
                                 ])

            # Time-major constant decoder input used by the train decoder.
            decoder_inputs = tf.constant(0.4,
                                         shape=[
                                             decoder_sequence_length,
                                             batch_size, decoder_embedding_size
                                         ])

            decoder_length = tf.constant(decoder_sequence_length,
                                         dtype=dtypes.int32,
                                         shape=[
                                             batch_size,
                                         ])

            with tf.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                output_fn = lambda x: layers.linear(
                    x, num_decoder_symbols, scope=scope)

                # Define model
                _, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with tf.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)

                decoder_fn_train = _decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))

                (decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train) = seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_train,
                     inputs=decoder_inputs,
                     sequence_length=decoder_length,
                     time_major=True,
                     scope=scope)

                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = _decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))

                (decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

                output_train = tf.argmax(decoder_outputs_train, axis=2)
                output_inference = tf.argmax(decoder_outputs_inference, axis=2)

                # `sess` is the default session inside this `with` block.
                tf.global_variables_initializer().run()
                (decoder_outputs_train_res, decoder_state_train_res,
                 decoder_context_state_train_res) = sess.run([
                     decoder_outputs_train, decoder_state_train,
                     decoder_context_state_train
                 ])

                (decoder_outputs_inference_res, decoder_state_inference_res,
                 decoder_context_state_inference_res) = sess.run([
                     decoder_outputs_inference, decoder_state_inference,
                     decoder_context_state_inference
                 ])

                # Single-argument print(...) is valid on both Python 2 and
                # Python 3 and prints the same text; the former bare
                # `print x` statements were a SyntaxError on Python 3.
                print(np.shape(decoder_outputs_train_res))
                print(np.shape(decoder_outputs_inference_res))
                # Fetch into new names rather than rebinding the tensors.
                output_train_res, output_inference_res = sess.run(
                    [output_train, output_inference])
                print(output_train_res)
                print(output_inference_res)
Exemplo n.º 27
0
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 config=None,
                 corrective_tokens_mask=None):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have longer than I
            or outputs longer than O will be pushed to the next bucket and
            padded accordingly. We assume that the list is sorted, e.g., [(2,
            4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g.,
            for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the
            model.
          config: optional object stored as `self.config`. When the backward
            pass is built, a truthy `config.use_rms_prop` selects RMSProp;
            otherwise (including `config=None`) plain gradient descent with
            `self.learning_rate` is used.
          corrective_tokens_mask: optional values, one per target vocabulary
            entry, used to build a constant float tensor of shape
            [target_vocab_size]; zeros are used when it is None or empty.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.config = config

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # One hot encoding of corrective tokens.
        # Test for None/empty explicitly: truth-testing a multi-element numpy
        # array raises ValueError, so `mask if mask else zeros` broke for
        # array masks.
        if corrective_tokens_mask is None or \
                np.size(corrective_tokens_mask) == 0:
            corrective_tokens_values = np.zeros(self.target_vocab_size)
        else:
            corrective_tokens_values = corrective_tokens_mask
        corrective_tokens_tensor = tf.constant(
            corrective_tokens_values,
            shape=[self.target_vocab_size],
            dtype=tf.float32)
        # NOTE(review): `batched_corrective_tokens` is not referenced again in
        # this constructor; the placeholder below is what the decoding code
        # uses. Kept so the constructed graph is unchanged.
        batched_corrective_tokens = tf.stack([corrective_tokens_tensor] *
                                             self.batch_size)
        self.batch_corrective_tokens_mask = batch_corrective_tokens_mask = \
            tf.placeholder(
            tf.float32,
            shape=[None, None],
            name="corrective_tokens")

        # Our targets are decoder inputs shifted by one.
        targets = self.decoder_inputs[1:]

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])

            output_projection = (w, b)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, labels, inputs,
                                                  num_samples,
                                                  self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        def single_cell():
            """Return a fresh cell so no instance is shared between layers."""
            if use_lstm:
                return core_rnn_cell_impl.BasicLSTMCell(size)
            return core_rnn_cell_impl.GRUCell(size)

        if num_layers > 1:
            # One cell object per layer: `[cell] * num_layers` would place
            # the same instance in every layer, which newer TensorFlow
            # releases reject as reuse of an RNNCell across variable scopes.
            cell = core_rnn_cell_impl.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])
        else:
            cell = single_cell()

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            """Build the embedding-attention seq2seq graph for one bucket.

            :param encoder_inputs: list of length equal to the input bucket
            length of 1-D tensors (of length equal to the batch size) whose
            elements consist of the token index of each sample in the batch
            at a given index in the input.
            :param decoder_inputs: decoder input tensors, passed through to
            `seq2seq.embedding_attention_seq2seq` unchanged.
            :param do_decode: passed as `feed_previous`; when true, an input
            bias is also built and injected through `loop_fn_factory`.
            :return: whatever `seq2seq.embedding_attention_seq2seq` returns.
            """

            if do_decode:
                # Modify bias here to bias the model towards selecting words
                # present in the input sentence.
                input_bias = self.build_input_bias(
                    encoder_inputs, batch_corrective_tokens_mask)

                # Redefined seq2seq to allow for the injection of a special
                # decoding function that applies the input bias before the
                # argmax is extracted.
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode,
                    loop_fn_factory=
                    apply_input_bias_and_extract_argmax_fn_factory(input_bias))
            else:
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Training outputs and losses. Decoding mode (feeding back the
        # previous output) is used exactly when only the forward pass is
        # requested.
        do_decode = bool(forward_only)
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, do_decode),
            softmax_loss_function=softmax_loss_function)

        if forward_only and output_projection is not None:
            # `bucket_id` rather than `b`, so the loop does not rebind the
            # projection bias `b` that `sampled_loss` closes over.
            for bucket_id in range(len(buckets)):
                # We need to apply the same input bias used during model
                # evaluation when decoding.
                input_bias = self.build_input_bias(
                    self.encoder_inputs[:buckets[bucket_id][0]],
                    batch_corrective_tokens_mask)
                self.outputs[bucket_id] = [
                    project_and_apply_input_bias(output, output_projection,
                                                 input_bias)
                    for output in self.outputs[bucket_id]
                ]

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            # `config` defaults to None; fall back to gradient descent then
            # instead of failing on the attribute access.
            use_rms_prop = (self.config is not None
                            and self.config.use_rms_prop)
            if use_rms_prop:
                opt = tf.train.RMSPropOptimizer(0.001)
            else:
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            for bucket_id in range(len(buckets)):
                gradients = tf.gradients(self.losses[bucket_id], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())
Exemplo n.º 28
0
    def _build_graph(self):

        # build the graph
        self.graph = tf.Graph()

        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)

            # DATASET PLACEHOLDERS

            # (batch, time)
            source = tf.placeholder(tf.int32)
            source_mask = tf.placeholder(tf.float32)
            target = tf.placeholder(tf.int32)
            target_mask = tf.placeholder(tf.float32)
            output = tf.placeholder(tf.int32)
            output_mask = tf.placeholder(tf.float32)

            # TODO: add factored contexts (POS, NER, ETC...)
            # ner_context = tf.placeholder(tf.int32)

            # sets the probability of dropping out
            dropout_prob = tf.placeholder(tf.float32)

            with tf.name_scope('embeddings'):
                source_embeddings = tf.get_variable(
                    "source_embeddings",
                    [self.src_vocab_size, self.config['embedding_size']],
                    trainable=True)
                # TODO: support factors for source and target inputs
                # ner_embeddings = tf.get_variable("ner_embeddings", [self.meta['num_ner_tags'], self.meta['ner_embedding_size']],
                #                                   trainable=True)

                # default: just embed the tokens in the source context
                source_embed = tf.nn.embedding_lookup(source_embeddings,
                                                      source)

                if self.use_ner_embeddings:
                    pass
                    # TODO: support factors for source input
                    # ner_embed = tf.nn.embedding_lookup(ner_embeddings, ner_context)
                    # context_embed = tf.concat([context_embed, ner_embed], 2)
                    # context_embed.set_shape([None, None, self.meta['embedding_size'] + self.meta['ner_embedding_size']])
                else:
                    # this is to fix shape inference bug in rnn.py -- see this issue: https://github.com/tensorflow/tensorflow/issues/2938
                    source_embed.set_shape(
                        [None, None, self.config['embedding_size']])

                # TODO: switch this to target language embeddings
                # TODO: support target language factors (POS, NER, etc...)
                target_embeddings = tf.get_variable(
                    "target_embeddings",
                    [self.trg_vocab_size, self.config['embedding_size']])

                # target embeddings - these are the _inputs_ to the decoder
                target_embed = tf.nn.embedding_lookup(target_embeddings,
                                                      target)
                target_embed.set_shape(
                    [None, None, self.config['embedding_size']])

            # Construct input representation that we'll put attention over
            # Note: dropout is turned on/off by `dropout_prob`
            with tf.name_scope('input_representation'):
                lstm_cells = [
                    tf.contrib.rnn.DropoutWrapper(
                        tf.contrib.rnn.LSTMCell(
                            self.config['encoder_hidden_size'],
                            use_peepholes=True,
                            state_is_tuple=True),
                        input_keep_prob=dropout_prob,
                        output_keep_prob=dropout_prob)
                    for _ in range(self.config['lstm_stack_size'])
                ]

                cell = tf.contrib.rnn.MultiRNNCell(lstm_cells,
                                                   state_is_tuple=True)

                # use the description mask to get the sequence lengths
                source_sequence_length = tf.cast(tf.reduce_sum(source_mask, 1),
                                                 tf.int64)

                # BIDIRECTIONAL RNNs
                # Bidir outputs are (output_fw, output_bw)
                bidir_outputs, bidir_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell,
                    cell_bw=cell,
                    inputs=source_embed,
                    sequence_length=source_sequence_length,
                    dtype=tf.float32)
                l_to_r_states, r_to_l_states = bidir_state

                # Transpose to be time-major
                # TODO: do we need to transpose?
                # attention_states = tf.transpose(tf.concat(bidir_outputs, 2), [1, 0, 2])
                attention_states = tf.concat(bidir_outputs, 2)

                # Note: encoder is bidirectional, so we reduce dimensionality by 1/2 to make decoder initial state
                init_state_transformation = tf.get_variable(
                    'decoder_init_transform',
                    (self.config['encoder_hidden_size'] * 2,
                     self.config['decoder_hidden_size']))
                initialization_state = tf.matmul(
                    tf.concat([r_to_l_states[-1][1], l_to_r_states[-1][1]], 1),
                    init_state_transformation)

                # alternatively just use the final l_to_r state
                # initialization_state = l_to_r_states[-1][1]

                # TODO: try with simple L-->R GRU
                # encoder_outputs, encoder_state = rnn.dynamic_rnn(
                #     cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                #     inputs=inputs,
                #     dtype=dtypes.float32,
                #     time_major=False,
                #     scope=scope)

            with tf.name_scope('target_representation'):
                target_lstm_cells = [
                    tf.contrib.rnn.DropoutWrapper(
                        tf.contrib.rnn.LSTMCell(
                            self.config['encoder_hidden_size'],
                            use_peepholes=True,
                            state_is_tuple=True),
                        input_keep_prob=dropout_prob,
                        output_keep_prob=dropout_prob)
                    for _ in range(self.config['lstm_stack_size'])
                ]

                target_cell = tf.contrib.rnn.MultiRNNCell(target_lstm_cells,
                                                          state_is_tuple=True)
                # bidirectional target representation
                target_lengths = tf.cast(tf.reduce_sum(target_mask, axis=1),
                                         dtype=tf.int32)
                target_bidir_outputs, target_bidir_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=target_cell,
                    cell_bw=target_cell,
                    inputs=target_embed,
                    sequence_length=target_lengths,
                    dtype=tf.float32,
                    scope='target_bidir_rnn')
                target_l_to_r_states, target_r_to_l_states = target_bidir_state
                target_representation = tf.concat(target_bidir_outputs, 2)

            # Now construct the decoder
            decoder_hidden_size = self.config['decoder_hidden_size']
            # attention
            attention_option = "bahdanau"  # can be "luong"

            with variable_scope.variable_scope("decoder") as scope:

                # Prepare attention
                (attention_keys, attention_values, attention_score_fn,
                 attention_construct_fn) = (
                     attention_decoder_fn.prepare_attention(
                         attention_states, attention_option,
                         decoder_hidden_size))

                decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                    encoder_state=initialization_state,
                    attention_keys=attention_keys,
                    attention_values=attention_values,
                    attention_score_fn=attention_score_fn,
                    attention_construct_fn=attention_construct_fn)

                # Note: this is different from the "normal" seq2seq encoder-decoder model, because we have different
                # input and output vocabularies for the decoder (target vocab vs. QE symbols)
                # num_decoder_symbols = self.output_vocab_size
                # decoder vocab is characters or sub-words? -- either way, we need to learn the vocab over the entity set
                # setting up weights for computing the final output
                # def create_output_fn():
                #     def output_fn(x):
                #         return layers.linear(x, num_decoder_symbols, scope=scope)
                #     return output_fn

                # output_fn = create_output_fn()

                intermediate_dim = 512
                output_transformation_1 = tf.Variable(
                    tf.random_normal([
                        self.config['decoder_hidden_size'] +
                        self.config['encoder_hidden_size'] * 2,
                        intermediate_dim
                    ]),
                    name='output_transformation_1')
                output_biases_1 = tf.Variable(tf.zeros([intermediate_dim]),
                                              name='output_biases_1')

                output_transformation_2 = tf.Variable(
                    tf.random_normal(
                        [intermediate_dim, self.output_vocab_size]),
                    name='output_transformation_2')
                output_biases_2 = tf.Variable(tf.zeros(
                    [self.output_vocab_size]),
                                              name='output_biases_2')

                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)

                (decoder_outputs_train, decoder_state_train,
                 _) = (seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_train,
                     inputs=target_embed,
                     sequence_length=target_lengths,
                     time_major=False,
                     scope=scope))

                # TODO: for attentive QE, we don't need to separate train and inference decoders
                # TODO: we can directly use train decoder output at both training and prediction time

                # concat with target lm representation
                decoder_outputs_train = tf.concat(
                    [decoder_outputs_train, target_representation], 2)
                decoder_outputs_train = tf.nn.elu(decoder_outputs_train)
                decoder_outputs_train = tf.nn.dropout(decoder_outputs_train,
                                                      keep_prob=dropout_prob)

                output_shape = tf.shape(decoder_outputs_train)

                decoder_outputs_train = tf.matmul(
                    tf.reshape(decoder_outputs_train,
                               [output_shape[0] * output_shape[1], -1]),
                    output_transformation_1)
                decoder_outputs_train += output_biases_1
                decoder_outputs_train = tf.nn.elu(decoder_outputs_train)
                decoder_outputs_train = tf.nn.dropout(decoder_outputs_train,
                                                      keep_prob=dropout_prob)

                # one more linear layer
                decoder_outputs_train = tf.matmul(decoder_outputs_train,
                                                  output_transformation_2)
                decoder_outputs_train += output_biases_2

                decoder_outputs_train = tf.reshape(
                    decoder_outputs_train,
                    [output_shape[0], output_shape[1], -1])

                # DEBUGGING: dump these
                # self.decoder_outputs_train = decoder_outputs_train

            with tf.name_scope('predictions'):
                prediction_logits = decoder_outputs_train
                logit_histo = tf.summary.histogram('prediction_logits',
                                                   prediction_logits)

                predictions = tf.nn.softmax(prediction_logits)
                self.predictions = predictions

                # correct_predictions = tf.equal(tf.cast(tf.argmax(predictions, 1), tf.int32), entity)
                # accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
                # accuracy_summary = tf.summary.scalar('accuracy', accuracy)

            with tf.name_scope('xent'):
                # Note: set output and output_mask shape because they're needed here:
                # https://github.com/tensorflow/tensorflow/blob/r1.0/tensorflow/contrib/seq2seq/python/ops/loss.py#L65-L70
                output.set_shape([None, None])
                output_mask.set_shape([None, None])
                costs = tf.contrib.seq2seq.sequence_loss(
                    logits=decoder_outputs_train,
                    targets=output,
                    weights=output_mask,
                    average_across_timesteps=True)
                cost = tf.reduce_mean(costs)
                cost_summary = tf.summary.scalar('minibatch_cost', cost)

            # expose placeholders and ops on the class
            self.source = source
            self.source_mask = source_mask
            self.target = target
            self.target_mask = target_mask
            self.output = output
            self.output_mask = output_mask
            self.predictions = predictions
            self.cost = cost
            self.dropout_prob = dropout_prob

            # TODO: expose embeddings so that they can be visualized?

            optimizer = tf.train.AdamOptimizer()
            with tf.name_scope('train'):
                gradients = optimizer.compute_gradients(
                    cost, tf.trainable_variables())
                if self.config['max_gradient_norm'] is not None:
                    gradients, variables = zip(*gradients)
                    clipped_gradients, _ = clip_ops.clip_by_global_norm(
                        gradients, self.config['max_gradient_norm'])
                    gradients = list(zip(clipped_gradients, variables))

                for gradient, variable in gradients:
                    if isinstance(gradient, ops.IndexedSlices):
                        grad_values = gradient.values
                    else:
                        grad_values = gradient
                    tf.summary.histogram(variable.name, variable)
                    tf.summary.histogram(variable.name + '/gradients',
                                         grad_values)
                    tf.summary.histogram(variable.name + '/gradient_norm',
                                         clip_ops.global_norm([grad_values]))

                self.full_graph_optimizer = optimizer.apply_gradients(
                    gradients)

                # Optimizer #2 -- updates entity representations only
                # entity_representation_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                #                                                      "representation/entity_lookup")
                # self.entity_representation_optimizer = optimizer.minimize(cost,
                #                                                           var_list=entity_representation_train_vars)

            self.saver = tf.train.Saver()

            # self.accuracy = accuracy
            self.merged = tf.summary.merge_all()

            logger.info('Finished building model graph')
    def test_dynamic_rnn_decoder_time_major(self):
        """Smoke-test `seq2seq.dynamic_rnn_decoder` with time-major tensors.

        Builds a GRU encoder and a GRU decoder, runs the decoder once with a
        training decoder function and once with an inference decoder function
        (re-using the decoder variables), and then checks the shapes of the
        fetched outputs and final states together with the context state
        returned by the decoder.
        """
        with self.test_session() as sess:
            with variable_scope.variable_scope(
                    "root",
                    initializer=init_ops.constant_initializer(0.5)):
                # Define inputs/outputs to model
                batch_size = 2
                encoder_embedding_size = 3
                decoder_embedding_size = 4
                encoder_hidden_size = 5
                decoder_hidden_size = encoder_hidden_size
                input_sequence_length = 6
                decoder_sequence_length = 7
                num_decoder_symbols = 20
                start_of_sequence_id = end_of_sequence_id = 1
                decoder_embeddings = variable_scope.get_variable(
                    "decoder_embeddings",
                    [num_decoder_symbols, decoder_embedding_size],
                    initializer=init_ops.random_normal_initializer(stddev=0.1))
                # All tensors are laid out time-major: [time, batch, depth].
                inputs = constant_op.constant(
                    0.5,
                    shape=[
                        input_sequence_length,
                        batch_size,
                        encoder_embedding_size,
                    ])
                decoder_inputs = constant_op.constant(
                    0.4,
                    shape=[
                        decoder_sequence_length,
                        batch_size,
                        decoder_embedding_size,
                    ])
                # Every sequence in the batch has the full decoder length.
                decoder_length = constant_op.constant(
                    decoder_sequence_length,
                    dtype=dtypes.int32,
                    shape=[batch_size])
                with variable_scope.variable_scope("rnn") as scope:
                    # setting up weights for computing the final output
                    def output_fn(x):
                        # NOTE: `scope` is looked up when output_fn is
                        # *called*, not when it is defined. Every call happens
                        # after `scope` has been rebound to the "decoder"
                        # variable scope below, so that is the scope the
                        # projection is created in.
                        return layers.linear(
                            x, num_decoder_symbols, scope=scope)

                    # Define model
                    _, encoder_state = rnn.dynamic_rnn(
                        cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                        inputs=inputs,
                        dtype=dtypes.float32,
                        time_major=True,
                        scope=scope)

                with variable_scope.variable_scope("decoder") as scope:
                    # Train decoder
                    decoder_cell = core_rnn_cell_impl.GRUCell(
                        decoder_hidden_size)
                    decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_train(
                            encoder_state=encoder_state))
                    (decoder_outputs_train, decoder_state_train,
                     decoder_context_state_train) = (
                         seq2seq.dynamic_rnn_decoder(
                             cell=decoder_cell,
                             decoder_fn=decoder_fn_train,
                             inputs=decoder_inputs,
                             sequence_length=decoder_length,
                             time_major=True,
                             scope=scope))
                    decoder_outputs_train = output_fn(decoder_outputs_train)

                    # Setup variable reuse
                    scope.reuse_variables()

                    # Inference decoder
                    decoder_fn_inference = Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            #TODO: find out why it goes to +1
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32))
                    (decoder_outputs_inference, decoder_state_inference,
                     decoder_context_state_inference) = (
                         seq2seq.dynamic_rnn_decoder(
                             cell=decoder_cell,
                             decoder_fn=decoder_fn_inference,
                             time_major=True,
                             scope=scope))

                # Run model
                variables.global_variables_initializer().run()
                (decoder_outputs_train_res, decoder_state_train_res,
                 decoder_context_state_train_res) = sess.run([
                     decoder_outputs_train, decoder_state_train,
                     decoder_context_state_train
                 ])
                (decoder_outputs_inference_res, decoder_state_inference_res,
                 decoder_context_state_inference_res) = sess.run([
                     decoder_outputs_inference, decoder_state_inference,
                     decoder_context_state_inference
                 ])

                # Assert outputs
                self.assertEqual(
                    (decoder_sequence_length, batch_size, num_decoder_symbols),
                    decoder_outputs_train_res.shape)
                self.assertEqual((batch_size, num_decoder_symbols),
                                 decoder_outputs_inference_res.shape[1:3])
                self.assertEqual(decoder_sequence_length,
                                 decoder_context_state_inference_res)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_train_res.shape)
                self.assertEqual((batch_size, decoder_hidden_size),
                                 decoder_state_inference_res.shape)
                self.assertEqual(decoder_sequence_length,
                                 decoder_context_state_train_res)
                # The dynamic decoder might end earlier than `maximum_length`
                # under inference, so only bound the number of decoded steps.
                # The outputs are time-major, hence axis 0 is the time axis.
                # (The state's axis 0 is the batch axis, which the shape
                # assertion above already pins down, so it is not the right
                # thing to compare against the sequence length.)
                self.assertGreaterEqual(
                    decoder_sequence_length,
                    decoder_outputs_inference_res.shape[0])