def testGRUCell(self):
    """Smoke-test a single GRUCell step under constant 0.5 initialisation.

    ``GRUCell(2)`` is built twice, each time in its own variable scope:
    first with an input width equal to the number of units, then with an
    input width of 3. The output is compared against fixed values.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 2])
            state = array_ops.zeros([1, 2])
            output, _ = core_rnn_cell_impl.GRUCell(2)(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            # The zero tensors are overridden by name through the feed dict.
            feed = {
                inputs.name: np.array([[1., 1.]]),
                state.name: np.array([[0.1, 0.1]]),
            }
            fetched = sess.run([output], feed)
            # Smoke test
            self.assertAllClose(fetched[0], [[0.175991, 0.175991]])

        with variable_scope.variable_scope(
                "other", initializer=init_ops.constant_initializer(0.5)):
            # Test GRUCell with input_size != num_units.
            inputs = array_ops.zeros([1, 3])
            state = array_ops.zeros([1, 2])
            output, _ = core_rnn_cell_impl.GRUCell(2)(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                inputs.name: np.array([[1., 1., 1.]]),
                state.name: np.array([[0.1, 0.1]]),
            }
            fetched = sess.run([output], feed)
            # Smoke test
            self.assertAllClose(fetched[0], [[0.156736, 0.156736]])
def testMultiRNNCellWithStateTuple(self):
    """Check a two-layer GRU MultiRNNCell that takes its state as a tuple.

    A non-tuple state must be rejected with a ValueError; a tuple state must
    produce the fixed smoke-test values for both layers.
    """

    def build_stack():
        # Two fresh GRUCell(2) layers with tuple-state handling enabled.
        return core_rnn_cell_impl.MultiRNNCell(
            [core_rnn_cell_impl.GRUCell(2) for _ in range(2)],
            state_is_tuple=True)

    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 2])
            flat_state = array_ops.zeros([1, 4])
            tuple_state = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))

            # Test incorrectness of state
            with self.assertRaisesRegexp(ValueError,
                                         "Expected state .* a tuple"):
                build_stack()(inputs, flat_state)

            _, new_states = build_stack()(inputs, tuple_state)

            sess.run([variables.global_variables_initializer()])
            feed = {
                inputs.name: np.array([[1., 1.]]),
                tuple_state[0].name: np.array([[0.1, 0.1]]),
                tuple_state[1].name: np.array([[0.1, 0.1]]),
            }
            layer_states = sess.run(new_states, feed)

            # The numbers in results were not calculated, this is just a
            # smoke test.  However, these numbers should match those of
            # the test testMultiRNNCell.
            self.assertAllClose(layer_states[0], [[0.175991, 0.175991]])
            self.assertAllClose(layer_states[1], [[0.13248, 0.13248]])
def testEmbeddingAttentionDecoder(self):
    """Check output and state shapes of ``embedding_attention_decoder``.

    A GRU encoder is run over two constant inputs; its outputs are stacked
    into attention states that feed the decoder together with three integer
    decoder inputs.
    """

    def make_cell():
        return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
            cell = make_cell()
            enc_outputs, enc_state = core_rnn.static_rnn(
                cell, encoder_inputs, dtype=dtypes.float32)

            # Give every encoder output a length-1 middle axis and join the
            # results along that axis.
            per_step = []
            for enc_out in enc_outputs:
                per_step.append(
                    array_ops.reshape(enc_out, [-1, 1, cell.output_size]))
            attn_states = array_ops.concat(per_step, 1)

            decoder_inputs = [
                constant_op.constant(symbol, dtypes.int32, shape=[2])
                for symbol in range(3)
            ]

            # Use a new cell instance since the attention decoder uses a
            # different variable scope.
            dec, mem = seq2seq_lib.embedding_attention_decoder(
                decoder_inputs,
                enc_state,
                attn_states,
                make_cell(),
                num_symbols=4,
                embedding_size=2,
                output_size=3)
            sess.run([variables.global_variables_initializer()])

            decoded = sess.run(dec)
            self.assertEqual(3, len(decoded))
            self.assertEqual((2, 3), decoded[0].shape)

            final_state = sess.run([mem])
            self.assertEqual((2, 2), final_state[0].shape)
def testBlockGRUToGRUCellSingleStep(self):
    """Compare one step of GRUBlockCell against GRUCell on the same inputs.

    Both cells are built with the same seeded uniform initializer, each in
    its own variable scope, and are fed identical random input and state
    values. Their fetched results must be close.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
        batch_size = 4
        cell_size = 5
        input_size = 6

        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        # Seed NumPy too, so the fed values below are reproducible from run
        # to run (the other block-vs-basic tests seed it the same way).
        np.random.seed(seed)

        # Inputs
        x = array_ops.zeros([batch_size, input_size])
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_value = np.random.rand(batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)

        # Output from the basic GRU cell implementation.
        with vs.variable_scope("basic", initializer=initializer):
            output = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
            sess.run([variables.global_variables_initializer()])
            basic_res = sess.run([output], {x: x_value, h: h_value})

        # Output from the block GRU cell implementation.
        with vs.variable_scope("block", initializer=initializer):
            output = gru_ops.GRUBlockCell(cell_size)(x, h)
            sess.run([variables.global_variables_initializer()])
            block_res = sess.run([output], {x: x_value, h: h_value})

        self.assertEqual(len(block_res), len(basic_res))
        for block, basic in zip(block_res, basic_res):
            self.assertAllClose(block, basic)
def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
    """Check from a full trace that "gru_cell" nodes ran only on the GPU.

    The GRU cell is wrapped for "/gpu:0" while ``dynamic_rnn`` itself is
    built under "/cpu:0". Returns early when no GPU is available.
    """
    if not test.is_gpu_available():
        # Can't perform this test w/o a GPU
        return

    def gru_nodes(node_stats):
        # Traced nodes whose name mentions the GRU cell.
        return [stat for stat in node_stats if "gru_cell" in stat.node_name]

    with self.test_session(use_gpu=True) as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 1, 3])
            gru = core_rnn_cell_impl.GRUCell(3)
            cell = core_rnn_cell_impl.DeviceWrapper(gru, "/gpu:0")
            with ops.device("/cpu:0"):
                outputs, _ = rnn.dynamic_rnn(
                    cell=cell, inputs=inputs, dtype=dtypes.float32)
            run_metadata = config_pb2.RunMetadata()
            trace_opts = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)

            sess.run([variables_lib.global_variables_initializer()])
            _ = sess.run(
                outputs, options=trace_opts, run_metadata=run_metadata)

        device_stats = run_metadata.step_stats.dev_stats
        # Work out which of the first two device entries is the GPU; the
        # other one is treated as the CPU.
        if "gpu" in device_stats[0].device:
            gpu_index = 0
        else:
            gpu_index = 1
        gpu_stats = device_stats[gpu_index].node_stats
        cpu_stats = device_stats[1 - gpu_index].node_stats
        self.assertFalse(gru_nodes(cpu_stats))
        self.assertTrue(gru_nodes(gpu_stats))
def testAttentionDecoder1(self):
    """Check output and state shapes of ``attention_decoder``.

    Attention states are built from the outputs of a statically unrolled
    GRU encoder; the decoder runs over three constant inputs with
    ``output_size=4``.
    """

    def make_cell():
        return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            cell = make_cell()
            encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
            enc_outputs, enc_state = core_rnn.static_rnn(
                cell, encoder_inputs, dtype=dtypes.float32)

            # Give every encoder output a length-1 middle axis and join the
            # results along that axis.
            per_step = []
            for enc_out in enc_outputs:
                per_step.append(
                    array_ops.reshape(enc_out, [-1, 1, cell.output_size]))
            attn_states = array_ops.concat(per_step, 1)

            decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3

            # Create a new cell instance for the decoder, since it uses a
            # different variable scope
            dec, mem = seq2seq_lib.attention_decoder(
                decoder_inputs,
                enc_state,
                attn_states,
                make_cell(),
                output_size=4)
            sess.run([variables.global_variables_initializer()])

            decoded = sess.run(dec)
            self.assertEqual(3, len(decoded))
            self.assertEqual((2, 4), decoded[0].shape)

            final_state = sess.run([mem])
            self.assertEqual((2, 2), final_state[0].shape)
def inference_gru_block_vs_gru_cell(batch_size,
                                    cell_size,
                                    input_size,
                                    time_steps,
                                    use_gpu=False,
                                    iters=30):
    """Benchmark inference speed between GRUBlockCell vs GRUCell.

    Each cell is unrolled with ``dynamic_rnn`` over the same variable-backed
    inputs and timed with ``time_taken_by_op``. One comma-separated line with
    the configuration, both timings and the relative difference in percent
    is printed.

    Returns:
        Tuple ``(basic_time_inference, block_time_inference)``.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        if use_gpu:
            device_name = "/gpu:0"
        else:
            device_name = "/cpu:0"

        with ops.device(device_name):
            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(
                -1, 1, seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = vs.get_variable(
                "concat_x", [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            def time_inference(scope_name, cell_cls):
                # Build the cell in its own scope, initialise, then time it.
                with vs.variable_scope(scope_name, initializer=initializer):
                    cell = cell_cls(cell_size)
                    outputs_dynamic, _ = rnn.dynamic_rnn(
                        cell,
                        inputs=concat_x,
                        initial_state=h,
                        time_major=True,
                        dtype=dtypes.float32)
                    sess.run([variables.global_variables_initializer()])
                    return time_taken_by_op(outputs_dynamic, sess, iters)

            # Output from the basic GRU cell implementation.
            basic_time_inference = time_inference(
                "basic", core_rnn_cell_impl.GRUCell)
            # Output from the block GRU cell implementation.
            block_time_inference = time_inference(
                "block", gru_ops.GRUBlockCell)

        performance_inference = (basic_time_inference - block_time_inference
                                 ) * 100 / basic_time_inference
        report = (batch_size, cell_size, input_size, time_steps, use_gpu,
                  basic_time_inference, block_time_inference,
                  performance_inference)
        print(",".join(str(field) for field in report))

        return basic_time_inference, block_time_inference
def testBlockGRUToGRUCellMultiStep(self):
    """Compare GRUBlockCell with GRUCell over several ``dynamic_rnn`` steps.

    Both cells use the same seeded initializer and are fed the same random
    inputs and initial state. Their per-step outputs and their final states
    must be close.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
        batch_size = 2
        cell_size = 3
        input_size = 3
        time_steps = 4

        # Random initializers.
        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Inputs
        concat_x = array_ops.placeholder(
            dtypes.float32, shape=(time_steps, batch_size, input_size))
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_values = np.random.rand(time_steps, batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)
        feeds = {concat_x: x_values, h: h_value}

        # Output from the block GRU cell implementation.
        with vs.variable_scope("block", initializer=initializer):
            cell = gru_ops.GRUBlockCell(cell_size)
            outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                cell,
                inputs=concat_x,
                initial_state=h,
                time_major=True,
                dtype=dtypes.float32)
            sess.run([variables.global_variables_initializer()])
            block_res = sess.run([outputs_dynamic, state_dynamic], feeds)

        # Output from the basic GRU cell implementation.
        with vs.variable_scope("basic", initializer=initializer):
            cell = core_rnn_cell_impl.GRUCell(cell_size)
            outputs_dynamic, state_dynamic = rnn.dynamic_rnn(
                cell,
                inputs=concat_x,
                initial_state=h,
                time_major=True,
                dtype=dtypes.float32)
            sess.run([variables.global_variables_initializer()])
            basic_res = sess.run([outputs_dynamic, state_dynamic], feeds)

        # Check the lengths of the outputs_dynamic, and states.
        self.assertEqual(len(block_res), len(basic_res))
        self.assertEqual(len(block_res[0]), len(basic_res[0]))
        self.assertEqual(len(block_res[1]), len(basic_res[1]))

        # Check the outputs_dynamic values.
        for block_output, basic_output in zip(block_res[0], basic_res[0]):
            self.assertAllClose(block_output, basic_output)

        # Check the state_dynamic value.  This used to compare the block
        # state with itself, which can never fail; compare block vs basic.
        self.assertAllClose(block_res[1], basic_res[1])
def testRNNDecoder(self):
    """Check output and state shapes produced by ``rnn_decoder``.

    The decoder cell is a GRU wrapped with a 4-wide output projection and is
    started from the final state of a separate GRU encoder.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
            encoder_cell = core_rnn_cell_impl.GRUCell(2)
            _, enc_state = core_rnn.static_rnn(
                encoder_cell, encoder_inputs, dtype=dtypes.float32)

            decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3
            decoder_gru = core_rnn_cell_impl.GRUCell(2)
            decoder_cell = core_rnn_cell_impl.OutputProjectionWrapper(
                decoder_gru, 4)
            dec, mem = seq2seq_lib.rnn_decoder(
                decoder_inputs, enc_state, decoder_cell)
            sess.run([variables.global_variables_initializer()])

            decoded = sess.run(dec)
            self.assertEqual(3, len(decoded))
            self.assertEqual((2, 4), decoded[0].shape)

            final_state = sess.run([mem])
            self.assertEqual((2, 2), final_state[0].shape)
def testDeviceWrapper(self):
    """Check that a DeviceWrapper-ed cell's output carries the device name."""
    with variable_scope.variable_scope(
            "root", initializer=init_ops.constant_initializer(0.5)):
        inputs = array_ops.zeros([1, 3])
        state = array_ops.zeros([1, 3])
        gru = core_rnn_cell_impl.GRUCell(3)
        wrapped = core_rnn_cell_impl.DeviceWrapper(gru, "/cpu:14159")
        outputs, _ = wrapped(inputs, state)
        # Only graph placement is inspected; nothing is executed.
        placement = outputs.device.lower()
        self.assertTrue("cpu:14159" in placement)
def GRUSeq2Seq(enc_inp, dec_inp):
    """Build an embedding-attention seq2seq model on a two-layer GRU cell.

    ``classes``, ``w`` and ``b`` are taken from the enclosing scope:
    ``classes`` is used as both the encoder and decoder symbol count and
    ``(w, b)`` is passed as the output projection.

    Returns:
        Whatever ``seq2seq_lib.embedding_attention_seq2seq`` returns.
    """
    gru_layers = [core_rnn_cell_impl.GRUCell(24) for _ in range(2)]
    stacked_cell = core_rnn_cell_impl.MultiRNNCell(
        gru_layers, state_is_tuple=True)
    return seq2seq_lib.embedding_attention_seq2seq(
        enc_inp,
        dec_inp,
        stacked_cell,
        num_encoder_symbols=classes,
        num_decoder_symbols=classes,
        embedding_size=24,
        output_projection=(w, b))
def get_rnncell(cell_type, cell_size, keep_prob, num_layer):
    """Build a (possibly stacked) recurrent cell.

    Args:
        cell_type: "gru" selects ``rnn_cell.GRUCell``; any other value
            selects ``rnn_cell.LSTMCell`` (no peepholes, forget bias 1.0).
        cell_size: number of units passed to each cell.
        keep_prob: when below 1.0 every layer is wrapped in a
            ``DropoutWrapper`` with this output keep probability.
        num_layer: when greater than 1 the layers are stacked in a
            ``MultiRNNCell`` with ``state_is_tuple=True``.

    Returns:
        A single cell when ``num_layer`` is 1 or less, otherwise the
        ``MultiRNNCell`` stack.
    """

    def _build_layer():
        # One independent cell, with dropout applied when requested.
        if cell_type == "gru":
            layer = rnn_cell.GRUCell(cell_size)
        else:
            layer = rnn_cell.LSTMCell(
                cell_size, use_peepholes=False, forget_bias=1.0)
        if keep_prob < 1.0:
            layer = rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob)
        return layer

    if num_layer > 1:
        # Every layer needs its own cell object.  `[cell] * num_layer` puts
        # the very same instance in each slot, which newer TensorFlow 1.x
        # releases either reject or treat as weight sharing between layers.
        return rnn_cell.MultiRNNCell(
            [_build_layer() for _ in range(num_layer)], state_is_tuple=True)
    return _build_layer()
def testBasicRNNSeq2Seq(self):
    """Check output and state shapes produced by ``basic_rnn_seq2seq``.

    The shared cell is a GRU wrapped with a 4-wide output projection.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            encoder_inputs = [constant_op.constant(0.5, shape=[2, 2])] * 2
            decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3
            gru = core_rnn_cell_impl.GRUCell(2)
            projected_cell = core_rnn_cell_impl.OutputProjectionWrapper(
                gru, 4)
            dec, mem = seq2seq_lib.basic_rnn_seq2seq(
                encoder_inputs, decoder_inputs, projected_cell)
            sess.run([variables.global_variables_initializer()])

            decoded = sess.run(dec)
            self.assertEqual(3, len(decoded))
            self.assertEqual((2, 4), decoded[0].shape)

            final_state = sess.run([mem])
            self.assertEqual((2, 2), final_state[0].shape)
def testMultiRNNCell(self):
    """Smoke-test a two-layer GRU MultiRNNCell using a concatenated state.

    With ``state_is_tuple=False`` the states of both layers travel in one
    ``[1, 4]`` tensor.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 2])
            state = array_ops.zeros([1, 4])
            gru_layers = [core_rnn_cell_impl.GRUCell(2) for _ in range(2)]
            stacked = core_rnn_cell_impl.MultiRNNCell(
                gru_layers, state_is_tuple=False)
            _, new_state = stacked(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                inputs.name: np.array([[1., 1.]]),
                state.name: np.array([[0.1, 0.1, 0.1, 0.1]]),
            }
            state_value = sess.run(new_state, feed)
            # The numbers in results were not calculated, this is just a
            # smoke test.
            self.assertAllClose(
                state_value, [[0.175991, 0.175991, 0.13248, 0.13248]])
def single_bprop_step_gru_block_vs_gru_cell(batch_size,
                                            cell_size,
                                            input_size,
                                            use_gpu=False,
                                            iters=30):
    """Benchmark single bprop step speed between GRUBlockCell vs GRUCell.

    For each cell the gradient of a single cell call with respect to ``h``
    is built and timed with ``time_taken_by_op``. One comma-separated line
    with the configuration, both timings and the relative difference in
    percent is printed.

    Returns:
        Tuple ``(basic_time_bprop, block_time_bprop)``.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        if use_gpu:
            device_name = "/gpu:0"
        else:
            device_name = "/cpu:0"

        with ops.device(device_name):
            initializer = init_ops.random_uniform_initializer(
                -1, 1, seed=1989)
            # Inputs
            x = vs.get_variable("x", [batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])

            def time_bprop(scope_name, cell_cls):
                # Build one cell call in its own scope and time d(output)/dh.
                with vs.variable_scope(scope_name, initializer=initializer):
                    cell = cell_cls(cell_size)
                    output = cell(
                        array_ops.identity(x), array_ops.identity(h))
                    sess.run([variables.global_variables_initializer()])
                    grad_output_wrt_input = gradients_impl.gradients(
                        [output], h)
                    return time_taken_by_op(
                        grad_output_wrt_input, sess, iters)

            # Output from the basic GRU cell implementation.
            basic_time_bprop = time_bprop(
                "basic", core_rnn_cell_impl.GRUCell)
            # Output from the block GRU cell implementation.
            block_time_bprop = time_bprop("block", gru_ops.GRUBlockCell)

        performance_inference = (
            basic_time_bprop - block_time_bprop) * 100 / basic_time_bprop

        report = (batch_size, cell_size, input_size, use_gpu,
                  basic_time_bprop, block_time_bprop, performance_inference)
        print(",".join(str(field) for field in report))

        return basic_time_bprop, block_time_bprop
def test_rnn_decoder(self):
    """Check the lengths and element shapes returned by ``ops.rnn_decoder``.

    Three decoder inputs are expected to give three outputs and four states,
    for both the regular and the sampling results.
    """
    with self.test_session():
        decoder_inputs = []
        for _ in range(3):
            decoder_inputs.append(
                array_ops.placeholder(dtypes.float32, [2, 2]))
        encoding = array_ops.placeholder(dtypes.float32, [2, 2])
        cell = core_rnn_cell_impl.GRUCell(2)

        decoded = ops.rnn_decoder(decoder_inputs, encoding, cell)
        outputs, states, sampling_outputs, sampling_states = decoded

        expectations = (
            (outputs, 3),
            (states, 4),
            (sampling_outputs, 3),
            (sampling_states, 4),
        )
        for tensors, expected_count in expectations:
            self.assertEqual(len(tensors), expected_count)
            self.assertEqual(tensors[0].get_shape(), [2, 2])
def testInputProjectionWrapper(self):
    """Smoke-test a GRUCell(3) behind an InputProjectionWrapper.

    A width-2 input goes through the wrapper (``num_proj=3``) into the GRU;
    the new state shape and the output values are checked.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 2])
            state = array_ops.zeros([1, 3])
            gru = core_rnn_cell_impl.GRUCell(3)
            wrapped = core_rnn_cell_impl.InputProjectionWrapper(
                gru, num_proj=3)
            output, new_state = wrapped(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                inputs.name: np.array([[1., 1.]]),
                state.name: np.array([[0.1, 0.1, 0.1]]),
            }
            fetched = sess.run([output, new_state], feed)
            self.assertEqual(fetched[1].shape, (1, 3))
            # The numbers in results were not calculated, this is just a
            # smoke test.
            self.assertAllClose(
                fetched[0], [[0.154605, 0.154605, 0.154605]])
def testResidualWrapper(self):
    """Check that ResidualWrapper adds the input to the wrapped cell's output.

    The same GRU cell is called directly and through the wrapper, with
    variable reuse switched on in between. The wrapped output must equal the
    plain output plus the input, and both states must match.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            inputs = array_ops.zeros([1, 3])
            state = array_ops.zeros([1, 3])
            base_cell = core_rnn_cell_impl.GRUCell(3)
            plain_out, plain_state = base_cell(inputs, state)
            variable_scope.get_variable_scope().reuse_variables()
            residual_cell = core_rnn_cell_impl.ResidualWrapper(base_cell)
            residual_out, residual_state = residual_cell(inputs, state)
            sess.run([variables_lib.global_variables_initializer()])

            fetches = [plain_out, residual_out, plain_state, residual_state]
            feed = {
                inputs: np.array([[1., 1., 1.]]),
                state: np.array([[0.1, 0.1, 0.1]]),
            }
            (plain_out_value, residual_out_value, plain_state_value,
             residual_state_value) = sess.run(fetches, feed)

            # Residual connections
            self.assertAllClose(
                residual_out_value, plain_out_value + [1., 1., 1.])
            # States are left untouched
            self.assertAllClose(plain_state_value, residual_state_value)
def testEmbeddingWrapper(self):
    """Smoke-test a GRUCell(2) behind an EmbeddingWrapper.

    The wrapper takes integer inputs (3 classes, embedding size 2). Its
    ``output_size``, the new state shape and the output values are checked.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            token_ids = array_ops.zeros([1, 1], dtype=dtypes.int32)
            state = array_ops.zeros([1, 2])
            gru = core_rnn_cell_impl.GRUCell(2)
            embedding_cell = core_rnn_cell_impl.EmbeddingWrapper(
                gru, embedding_classes=3, embedding_size=2)
            self.assertEqual(embedding_cell.output_size, 2)

            output, new_state = embedding_cell(token_ids, state)
            sess.run([variables_lib.global_variables_initializer()])
            feed = {
                token_ids.name: np.array([[1]]),
                state.name: np.array([[0.1, 0.1]]),
            }
            fetched = sess.run([output, new_state], feed)
            self.assertEqual(fetched[1].shape, (1, 2))
            # The numbers in results were not calculated, this is just a
            # smoke test.
            self.assertAllClose(fetched[0], [[0.17139, 0.17139]])
def testDynamicAttentionDecoder1(self):
    """Check ``attention_decoder`` shapes with a ``dynamic_rnn`` encoder.

    The encoder outputs are used directly as the attention states.
    """

    def make_cell():
        return core_rnn_cell_impl.GRUCell(2)

    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            cell = make_cell()
            encoder_input = constant_op.constant(0.5, shape=[2, 2, 2])
            enc_outputs, enc_state = rnn.dynamic_rnn(
                cell, encoder_input, dtype=dtypes.float32)
            attn_states = enc_outputs
            decoder_inputs = [constant_op.constant(0.4, shape=[2, 2])] * 3

            # Use a new cell instance since the attention decoder uses a
            # different variable scope.
            dec, mem = seq2seq_lib.attention_decoder(
                decoder_inputs,
                enc_state,
                attn_states,
                make_cell(),
                output_size=4)
            sess.run([variables.global_variables_initializer()])

            decoded = sess.run(dec)
            self.assertEqual(3, len(decoded))
            self.assertEqual((2, 4), decoded[0].shape)

            final_state = sess.run([mem])
            self.assertEqual((2, 2), final_state[0].shape)
def test_attention(self):
    """Check shapes produced by the attention decoder functions.

    A GRU encoder is unrolled with ``dynamic_rnn`` (time-major); its outputs
    are transposed into attention states. A training decoder and an
    inference decoder are then built in the same "decoder" scope with
    variable reuse, and the shapes of their outputs and final states are
    asserted.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1
            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            inputs = constant_op.constant(
                0.5,
                shape=[
                    input_sequence_length, batch_size, encoder_embedding_size
                ])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[
                    decoder_sequence_length, batch_size,
                    decoder_embedding_size
                ])
            decoder_length = constant_op.constant(
                decoder_sequence_length,
                dtype=dtypes.int32,
                shape=[
                    batch_size,
                ])

            # attention
            attention_option = "luong"  # can be "bahdanau"

            with variable_scope.variable_scope("rnn") as scope:
                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

                # attention_states: size [batch_size, max_time, num_units]
                attention_states = array_ops.transpose(
                    encoder_outputs, [1, 0, 2])

            with variable_scope.variable_scope("decoder") as scope:
                # Prepare attention
                (attention_keys, attention_values, attention_score_fn,
                 attention_construct_fn) = (
                     attention_decoder_fn.prepare_attention(
                         attention_states, attention_option,
                         decoder_hidden_size))
                decoder_fn_train = (
                    attention_decoder_fn.attention_decoder_fn_train(
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn))

                # setting up weights for computing the final output
                def create_output_fn():

                    def output_fn(x):
                        return layers.linear(
                            x, num_decoder_symbols, scope=scope)

                    return output_fn

                output_fn = create_output_fn()

                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)
                (decoder_outputs_train, decoder_state_train, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_train,
                        inputs=decoder_inputs,
                        sequence_length=decoder_length,
                        time_major=True,
                        scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = (
                    attention_decoder_fn.attention_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        attention_keys=attention_keys,
                        attention_values=attention_values,
                        attention_score_fn=attention_score_fn,
                        attention_construct_fn=attention_construct_fn,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference, decoder_state_inference, _) = (
                    seq2seq.dynamic_rnn_decoder(
                        cell=decoder_cell,
                        decoder_fn=decoder_fn_inference,
                        time_major=True,
                        scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res) = sess.run(
                [decoder_outputs_train, decoder_state_train])
            (decoder_outputs_inference_res,
             decoder_state_inference_res) = sess.run(
                 [decoder_outputs_inference, decoder_state_inference])

            # Assert outputs
            self.assertEqual(
                (decoder_sequence_length, batch_size, num_decoder_symbols),
                decoder_outputs_train_res.shape)
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference.  The number of decoded steps is the leading
            # (time) axis of the time-major outputs; the state's leading
            # axis is the batch size, already asserted above, so checking
            # it here would be vacuous.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_outputs_inference_res.shape[0])
def __init__(self,
             batch_size=100,
             vocab_size=5620,
             word_dim=100,
             lstm_dim=100,
             num_classes=4,
             l2_reg_lambda=0.0,
             lr=0.001,
             clip=5,
             init_embedding=None,
             bi_gram=False,
             stack=False,
             lstm_net=False,
             bi_direction=False):
    """Build the graph of a recurrent sequence tagger with a CRF loss.

    The constructor stores the hyper-parameters and then creates, in order:
    the placeholders, the embedding variable, the softmax weights, the
    recurrent cells, the forward pass producing ``self.unary_scores``, the
    CRF loss, and the Adam training op with global-norm gradient clipping.

    Args:
        batch_size: stored on the instance; not otherwise used here.
        vocab_size: row count of the zero embedding that is created when
            ``init_embedding`` is None.
        word_dim: column count of that embedding; also used when reshaping
            the embedded input.
        lstm_dim: number of units of every recurrent cell.
        num_classes: width of the softmax projection.
        l2_reg_lambda: scale handed to the L2 regularizer attached to the
            softmax weights.
        lr: learning rate handed to ``AdamOptimizer``.
        clip: norm handed to ``clip_by_global_norm``.
        init_embedding: initial embedding value; zeros when None.
        bi_gram: when not False, the embedded input is reshaped to a last
            dimension of ``9 * word_dim`` instead of ``word_dim``.
        stack: when true, a second recurrent layer ("layer_2") is applied.
        lstm_net: when False, GRU cells are used; otherwise BasicLSTM cells.
        bi_direction: when true, bidirectional layers are used and the
            softmax weights take ``lstm_dim * 2`` inputs.
    """
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.word_dim = word_dim
    self.lstm_dim = lstm_dim
    self.num_classes = num_classes
    self.l2_reg_lambda = l2_reg_lambda
    self.lr = lr
    self.clip = clip
    self.stack = stack
    self.lstm_net = lstm_net
    self.bi_direction = bi_direction

    # Fall back to an all-zero embedding table when none is supplied.
    if init_embedding is None:
        self.init_embedding = np.zeros([vocab_size, word_dim],
                                       dtype=np.float32)
    else:
        self.init_embedding = init_embedding

    # Placeholders: inputs, labels, per-sequence lengths and dropout keep
    # probability.
    self.x = tf.placeholder(tf.int32, [None, None])
    self.y = tf.placeholder(tf.int32, [None, None])
    self.seq_len = tf.placeholder(tf.int32, [None])
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")

    with tf.variable_scope("embedding"):
        self.embedding = tf.Variable(self.init_embedding, name="embedding")

    with tf.variable_scope("softmax"):
        # A bidirectional network concatenates both directions, so the
        # projection takes twice as many inputs.
        if self.bi_direction:
            self.W = tf.get_variable(
                shape=[lstm_dim * 2, num_classes],
                initializer=tf.truncated_normal_initializer(stddev=0.01),
                name="weights",
                regularizer=tf.contrib.layers.l2_regularizer(
                    self.l2_reg_lambda))
        else:
            self.W = tf.get_variable(
                shape=[lstm_dim, num_classes],
                initializer=tf.truncated_normal_initializer(stddev=0.01),
                name="weights",
                regularizer=tf.contrib.layers.l2_regularizer(
                    self.l2_reg_lambda))

        # NOTE(review): the name "bias" is given to the zeros op, not to the
        # Variable itself - confirm this is intended.
        self.b = tf.Variable(tf.zeros([num_classes], name="bias"))

    with tf.variable_scope("lstm"):
        # Despite the scope name, GRU cells are used unless lstm_net is set.
        if self.lstm_net is False:
            self.fw_cell = core_rnn_cell_impl.GRUCell(self.lstm_dim)
            self.bw_cell = core_rnn_cell_impl.GRUCell(self.lstm_dim)
        else:
            self.fw_cell = core_rnn_cell_impl.BasicLSTMCell(self.lstm_dim)
            self.bw_cell = core_rnn_cell_impl.BasicLSTMCell(self.lstm_dim)

    with tf.variable_scope("forward"):
        seq_len = tf.cast(self.seq_len, tf.int64)
        # Per the original note: batch_size * (sequence*9 or 1) * word_dim
        x = tf.nn.embedding_lookup(self.embedding, self.x)
        x = tf.nn.dropout(x, self.dropout_keep_prob)

        # Dynamic size of the first axis of the embedded input.
        size = tf.shape(x)[0]
        if bi_gram is False:
            # batch * sequence * word_dim
            x = tf.reshape(x, [size, -1, word_dim])
        else:
            x = tf.reshape(x, [size, -1, 9 * word_dim])

        # NOTE(review): when `stack` is set, "layer_2" is built with the
        # same self.fw_cell / self.bw_cell objects as "layer_1"; newer
        # TensorFlow releases may reject reusing a cell across scopes -
        # confirm against the targeted TF version.
        if self.bi_direction:
            (forward_output,
             backward_output), _ = tf.nn.bidirectional_dynamic_rnn(
                 self.fw_cell,
                 self.bw_cell,
                 x,
                 dtype=tf.float32,
                 sequence_length=seq_len,
                 scope='layer_1')
            output = tf.concat(axis=2,
                               values=[forward_output, backward_output])
            if self.stack:
                (forward_output,
                 backward_output), _ = tf.nn.bidirectional_dynamic_rnn(
                     self.fw_cell,
                     self.bw_cell,
                     output,
                     dtype=tf.float32,
                     sequence_length=seq_len,
                     scope='layer_2')
                output = tf.concat(
                    axis=2, values=[forward_output, backward_output])
        else:
            forward_output, _ = tf.nn.dynamic_rnn(self.fw_cell,
                                                  x,
                                                  dtype=tf.float32,
                                                  sequence_length=seq_len,
                                                  scope='layer_1')
            output = forward_output
            if self.stack:
                forward_output, _ = tf.nn.dynamic_rnn(
                    self.fw_cell,
                    output,
                    dtype=tf.float32,
                    sequence_length=seq_len,
                    scope='layer_2')
                output = forward_output

        # Flatten to two dimensions for the matmul, then restore the
        # leading axis for the CRF.
        if self.bi_direction:
            output = tf.reshape(output, [-1, 2 * self.lstm_dim])
        else:
            output = tf.reshape(output, [-1, self.lstm_dim])

        matricized_unary_scores = tf.matmul(output, self.W) + self.b

        self.unary_scores = tf.reshape(matricized_unary_scores,
                                       [size, -1, self.num_classes])

    with tf.variable_scope("loss") as scope:
        # CRF log likelihood
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(
            self.unary_scores, self.y, self.seq_len)

        # NOTE(review): only the mean negative log-likelihood is used; the
        # L2 regularizer attached to W does not appear to be added to the
        # loss in this block - confirm.
        self.loss = tf.reduce_mean(-log_likelihood)

    with tf.variable_scope("train_ops") as scope:
        self.optimizer = tf.train.AdamOptimizer(self.lr)

        self.global_step = tf.Variable(0, name="global_step",
                                       trainable=False)

        # Clip the gradients by global norm before applying them.
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          self.clip)
        self.train_op = self.optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)
def training_gru_block_vs_gru_cell(batch_size,
                                   cell_size,
                                   input_size,
                                   time_steps,
                                   use_gpu=False,
                                   iters=30):
    """Benchmark training speed between GRUBlockCell vs GRUCell.

    For each cell a ``dynamic_rnn`` is built, a mean-squared-error cost
    against the variable ``y`` is minimised with gradient descent, and one
    training step is timed with ``time_taken_by_op``. One comma-separated
    line with the configuration, both timings and the relative difference
    in percent is printed.

    Returns:
        Tuple ``(basic_time_training, block_time_training)``.
    """
    ops.reset_default_graph()
    with session.Session(graph=ops.Graph()) as sess:
        # Specify the device which is been used.
        if use_gpu:
            device_name = "/gpu:0"
        else:
            device_name = "/cpu:0"

        with ops.device(device_name):
            # Random initializers.
            seed = 1994
            initializer = init_ops.random_uniform_initializer(
                -1, 1, seed=seed)
            np.random.seed(seed)

            # Inputs
            concat_x = vs.get_variable(
                "concat_x", [time_steps, batch_size, input_size])
            h = vs.get_variable("h", [batch_size, cell_size])
            y = vs.get_variable("y", [time_steps, batch_size, cell_size])

            def time_training_step(scope_name, cell_cls):
                # Build model, cost and optimiser in one scope and time the
                # resulting training op.
                with vs.variable_scope(scope_name, initializer=initializer):
                    cell = cell_cls(cell_size)
                    outputs_dynamic, _ = rnn.dynamic_rnn(
                        cell,
                        inputs=concat_x,
                        initial_state=h,
                        time_major=True,
                        dtype=dtypes.float32)
                    sess.run([variables.global_variables_initializer()])
                    squared_error = math_ops.square(outputs_dynamic - y)
                    cost = math_ops.reduce_mean(squared_error)
                    learning_rate = 0.01
                    train_step = gradient_descent.GradientDescentOptimizer(
                        learning_rate).minimize(cost)

                    # time for a training step.
                    return time_taken_by_op(train_step, sess, iters)

            # Output from the basic GRU cell implementation.
            basic_time_training = time_training_step(
                "basic", core_rnn_cell_impl.GRUCell)
            # Output from the block GRU cell implementation.
            block_time_training = time_training_step(
                "block", gru_ops.GRUBlockCell)

        performance_training = (
            basic_time_training - block_time_training
        ) * 100 / basic_time_training
        report = (batch_size, cell_size, input_size, time_steps, use_gpu,
                  basic_time_training, block_time_training,
                  performance_training)
        print(",".join(str(field) for field in report))

        return basic_time_training, block_time_training
def testDerivativeOfBlockGRUToGRUCellMultiSteps(self):
    """Compare multi-step gradients of GRUBlockCell with those of GRUCell.

    For each cell the gradient of the first ``dynamic_rnn`` output slice is
    taken with respect to the inputs and to the initial state. The values
    from both implementations must be close.
    """
    batch_size = 2
    cell_size = 3
    input_size = 4
    time_steps = 2

    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
        # Random initializers.
        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Inputs
        concat_x = array_ops.placeholder(
            dtypes.float32, shape=(time_steps, batch_size, input_size))
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_values = np.random.rand(time_steps, batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)
        feeds = {concat_x: x_values, h: h_value}

        def gradient_values(scope_name, cell_cls):
            # Build the RNN in its own scope and evaluate both gradients.
            with vs.variable_scope(scope_name, initializer=initializer):
                cell = cell_cls(cell_size)
                outputs_dynamic, _ = rnn.dynamic_rnn(
                    cell,
                    inputs=concat_x,
                    initial_state=h,
                    time_major=True,
                    dtype=dtypes.float32)
                wrt_x = gradients_impl.gradients(
                    [outputs_dynamic[0]], concat_x)
                wrt_h = gradients_impl.gradients([outputs_dynamic[0]], h)
                sess.run([variables.global_variables_initializer()])
                return sess.run([wrt_x, wrt_h], feeds)

        # Gradients from the block GRU cell implementation.
        block_grad_res_x, block_grad_res_h = gradient_values(
            "block", gru_ops.GRUBlockCell)
        # Gradients from the basic GRU cell implementation.
        basic_grad_res_x, basic_grad_res_h = gradient_values(
            "basic", core_rnn_cell_impl.GRUCell)

        # Check the derivatives of the outputs wrt x: count, then values.
        self.assertEqual(len(block_grad_res_x), len(basic_grad_res_x))
        for block, basic in zip(block_grad_res_x, basic_grad_res_x):
            self.assertAllClose(block, basic)

        # Check the derivatives of the outputs wrt h: count, then values.
        self.assertEqual(len(block_grad_res_h), len(basic_grad_res_h))
        for block, basic in zip(block_grad_res_h, basic_grad_res_h):
            self.assertAllClose(block, basic)
def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
    """Compare single-step gradients of GRUBlockCell with those of GRUCell.

    Gradients of one cell call are taken with respect to the input, the
    state and four variables picked by position from the global variable
    list (the first four for the block cell, the next four for the basic
    cell). All six results must be close between the two implementations.
    """
    with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()) as sess:
        batch_size = 2
        cell_size = 3
        input_size = 4

        seed = 1994
        initializer = init_ops.random_uniform_initializer(
            -0.01, 0.01, seed=seed)
        np.random.seed(seed)

        # Inputs
        x = array_ops.zeros([batch_size, input_size])
        h = array_ops.zeros([batch_size, cell_size])

        # Values for the inputs.
        x_value = np.random.rand(batch_size, input_size)
        h_value = np.random.rand(batch_size, cell_size)

        def gradient_ops(cell_output, cell_variables):
            # Gradient ops in the order: x, h, w_ru, w_c, b_ru, b_c.
            [w_ru, b_ru, w_c, b_c] = cell_variables
            targets = (x, h, w_ru, w_c, b_ru, b_c)
            return [
                gradients_impl.gradients([cell_output], target)
                for target in targets
            ]

        # Gradients from the block GRU cell implementation.
        with vs.variable_scope("block", initializer=initializer):
            output = gru_ops.GRUBlockCell(cell_size)(x, h)
            sess.run([variables.global_variables_initializer()])

            block_variables = variables.global_variables()[0:4]
            d_block_res = sess.run(
                gradient_ops(output, block_variables),
                {x: x_value, h: h_value})

        # Gradients from the basic GRU cell implementation.
        with vs.variable_scope("basic", initializer=initializer):
            output = core_rnn_cell_impl.GRUCell(cell_size)(x, h)
            sess.run([variables.global_variables_initializer()])

            basic_variables = variables.global_variables()[4:8]
            d_basic_res = sess.run(
                gradient_ops(output, basic_variables),
                {x: x_value, h: h_value})

        # Check lengths of derivative results.
        self.assertEqual(len(d_block_res), len(d_basic_res))
        # Check the value of every derivative result.
        for block, basic in zip(d_block_res, d_basic_res):
            self.assertAllClose(block, basic)
def test_dynamic_rnn_decoder():
    """Build a small time-major GRU encoder/decoder and print its outputs.

    A training decoder and an inference decoder are built on the same
    decoder cell. After initializing all variables, the shapes of both
    decoder outputs and the argmax (over axis 2) of both outputs are
    printed. Nothing is asserted and nothing is returned.
    """
    with tf.Session() as sess:
        with tf.variable_scope(
                "root", initializer=tf.constant_initializer(0.5)):
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1

            decoder_embeddings = tf.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.1))
            inputs = tf.constant(
                0.5,
                shape=[
                    input_sequence_length, batch_size,
                    encoder_embedding_size
                ])
            decoder_inputs = tf.constant(
                0.4,
                shape=[
                    decoder_sequence_length, batch_size,
                    decoder_embedding_size
                ])
            decoder_length = tf.constant(
                decoder_sequence_length,
                dtype=dtypes.int32,
                shape=[
                    batch_size,
                ])

            with tf.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                # NOTE: `scope` is looked up when the lambda is *called*,
                # so it resolves to whatever `scope` is bound to at that
                # point (the "decoder" scope below), not to this "rnn"
                # scope. Do not bind it eagerly without checking callers.
                output_fn = lambda x: layers.linear(
                    x, num_decoder_symbols, scope=scope)

                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with tf.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(
                    decoder_hidden_size)
                decoder_fn_train = _decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))
                (decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train) = seq2seq.dynamic_rnn_decoder(
                     cell=decoder_cell,
                     decoder_fn=decoder_fn_train,
                     inputs=decoder_inputs,
                     sequence_length=decoder_length,
                     time_major=True,
                     scope=scope)
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = _decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_inference(
                        output_fn=output_fn,
                        encoder_state=encoder_state,
                        embeddings=decoder_embeddings,
                        start_of_sequence_id=start_of_sequence_id,
                        end_of_sequence_id=end_of_sequence_id,
                        maximum_length=decoder_sequence_length - 1,
                        num_decoder_symbols=num_decoder_symbols,
                        dtype=dtypes.int32))
                (decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

            output_train = tf.argmax(decoder_outputs_train, axis=2)
            output_inference = tf.argmax(decoder_outputs_inference, axis=2)

            tf.global_variables_initializer().run()

            (decoder_outputs_train_res, decoder_state_train_res,
             decoder_context_state_train_res) = sess.run([
                 decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train
             ])
            (decoder_outputs_inference_res, decoder_state_inference_res,
             decoder_context_state_inference_res) = sess.run([
                 decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference
             ])

            # Single-argument print(...) behaves the same as a statement
            # under Python 2 and as a function call under Python 3.
            print(np.shape(decoder_outputs_train_res))
            print(np.shape(decoder_outputs_inference_res))

            # Keep the evaluated arrays under their own names so the graph
            # tensors are not overwritten.
            output_train_res, output_inference_res = sess.run(
                [output_train, output_inference])
            print(output_train_res)
            print(output_inference_res)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             config=None,
             corrective_tokens_mask=None):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input
        length that will be processed in that bucket, and O specifies
        maximum output length. Training instances that have inputs longer
        than I or outputs longer than O will be pushed to the next bucket
        and padded accordingly. We assume that the list is sorted, e.g.,
        [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for
        decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the
        model.
      config: optional configuration object, stored as `self.config`. The
        only attribute read here is `use_rms_prop`, which selects RMSProp
        over plain gradient descent. When `config` is None, gradient
        descent is used.
      corrective_tokens_mask: optional sequence of `target_vocab_size`
        values turned into a float32 constant. When None or empty, zeros
        are used instead.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.config = config

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
        self.encoder_inputs.append(
            tf.placeholder(
                tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(buckets[-1][1] + 1):
        self.decoder_inputs.append(
            tf.placeholder(
                tf.int32, shape=[None], name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(
                tf.float32, shape=[None], name="weight{0}".format(i)))

    # One hot encoding of corrective tokens.
    # An explicit None/empty check is used instead of truthiness, because
    # evaluating a multi-element numpy array in a boolean context raises
    # ValueError.
    if corrective_tokens_mask is None or len(corrective_tokens_mask) == 0:
        corrective_tokens_values = np.zeros(self.target_vocab_size)
    else:
        corrective_tokens_values = corrective_tokens_mask
    corrective_tokens_tensor = tf.constant(
        corrective_tokens_values,
        shape=[self.target_vocab_size],
        dtype=tf.float32)
    # NOTE(review): this stacked tensor is not referenced again in this
    # constructor; it is kept so that the constructed graph is unchanged.
    batched_corrective_tokens = tf.stack(
        [corrective_tokens_tensor] * self.batch_size)
    self.batch_corrective_tokens_mask = batch_corrective_tokens_mask = \
        tf.placeholder(
            tf.float32, shape=[None, None], name="corrective_tokens")

    # Our targets are decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary
    # size.
    if 0 < num_samples < self.target_vocab_size:
        proj_w = tf.get_variable("proj_w", [size, self.target_vocab_size])
        proj_w_t = tf.transpose(proj_w)
        proj_b = tf.get_variable("proj_b", [self.target_vocab_size])
        output_projection = (proj_w, proj_b)

        def sampled_loss(labels, inputs):
            """Sampled softmax loss over the output projection."""
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(proj_w_t, proj_b, labels,
                                              inputs, num_samples,
                                              self.target_vocab_size)

        softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    def single_cell():
        """Return a new cell of the configured type (LSTM or GRU)."""
        if use_lstm:
            return core_rnn_cell_impl.BasicLSTMCell(size)
        return core_rnn_cell_impl.GRUCell(size)

    if num_layers > 1:
        # One cell object per layer: `[cell] * num_layers` would hand the
        # same instance to every layer, which newer TensorFlow releases
        # reject or treat as weight sharing.
        cell = core_rnn_cell_impl.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])
    else:
        cell = single_cell()

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        """Build the embedding attention seq2seq model for one bucket.

        Args:
          encoder_inputs: list of length equal to the input bucket length
            of 1-D tensors (of length equal to the batch size) whose
            elements consist of the token index of each sample in the
            batch at a given index in the input.
          decoder_inputs: decoder inputs, passed through unchanged to
            `seq2seq.embedding_attention_seq2seq`.
          do_decode: when true, `feed_previous` is enabled and an input
            bias is injected through `loop_fn_factory`.

        Returns:
          Whatever `seq2seq.embedding_attention_seq2seq` returns.
        """
        if do_decode:
            # Modify bias here to bias the model towards selecting words
            # present in the input sentence.
            input_bias = self.build_input_bias(
                encoder_inputs, batch_corrective_tokens_mask)

            # Redefined seq2seq to allow for the injection of a special
            # decoding function that applies the input bias (passed as
            # `loop_fn_factory`).
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                loop_fn_factory=(
                    apply_input_bias_and_extract_argmax_fn_factory(
                        input_bias)))
        else:
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode)

    # Training outputs and losses.
    if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)

        if output_projection is not None:
            for bucket_id in range(len(buckets)):
                # We need to apply the same input bias used during model
                # evaluation when decoding.
                input_bias = self.build_input_bias(
                    self.encoder_inputs[:buckets[bucket_id][0]],
                    batch_corrective_tokens_mask)
                self.outputs[bucket_id] = [
                    project_and_apply_input_bias(output, output_projection,
                                                 input_bias)
                    for output in self.outputs[bucket_id]
                ]
    else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
        self.gradient_norms = []
        self.updates = []
        # `config` defaults to None, so guard the attribute access and fall
        # back to gradient descent in that case.
        if self.config is not None and self.config.use_rms_prop:
            opt = tf.train.RMSPropOptimizer(0.001)
        else:
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        # opt = tf.train.AdamOptimizer()

        for bucket_id in range(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def _build_graph(self):
    """Build the model graph and expose its placeholders and ops on `self`.

    A new `tf.Graph` is created and stored as `self.graph`. Inside it the
    method builds, in order: source/target embeddings, a stacked
    bidirectional LSTM over the source (used as attention states), a
    stacked bidirectional LSTM over the target, an attention GRU decoder,
    a two-layer output transformation, a sequence loss and an Adam training
    op with optional global-norm gradient clipping.

    Attributes set: `graph`, `source`, `source_mask`, `target`,
    `target_mask`, `output`, `output_mask`, `predictions`, `cost`,
    `dropout_prob`, `full_graph_optimizer`, `saver`, `merged`.

    Reads `self.random_seed`, `self.src_vocab_size`, `self.trg_vocab_size`,
    `self.output_vocab_size`, `self.use_ner_embeddings` and `self.config`
    keys 'embedding_size', 'encoder_hidden_size', 'decoder_hidden_size',
    'lstm_stack_size' and 'max_gradient_norm'.
    """
    # build the graph
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.set_random_seed(self.random_seed)

        # DATASET PLACEHOLDERS
        # (batch, time)
        source = tf.placeholder(tf.int32)
        source_mask = tf.placeholder(tf.float32)
        target = tf.placeholder(tf.int32)
        target_mask = tf.placeholder(tf.float32)
        output = tf.placeholder(tf.int32)
        output_mask = tf.placeholder(tf.float32)

        # TODO: add factored contexts (POS, NER, ETC...)
        # ner_context = tf.placeholder(tf.int32)

        # NOTE: despite its name, this value is passed below as
        # `input_keep_prob` / `output_keep_prob` / `keep_prob`, i.e. it is
        # the probability of *keeping* a unit, not of dropping it.
        dropout_prob = tf.placeholder(tf.float32)

        def stacked_lstm_cell():
            """Return a new stack of dropout-wrapped peephole LSTM cells."""
            stack = [
                tf.contrib.rnn.DropoutWrapper(
                    tf.contrib.rnn.LSTMCell(
                        self.config['encoder_hidden_size'],
                        use_peepholes=True,
                        state_is_tuple=True),
                    input_keep_prob=dropout_prob,
                    output_keep_prob=dropout_prob)
                for _ in range(self.config['lstm_stack_size'])
            ]
            return tf.contrib.rnn.MultiRNNCell(stack, state_is_tuple=True)

        with tf.name_scope('embeddings'):
            source_embeddings = tf.get_variable(
                "source_embeddings",
                [self.src_vocab_size, self.config['embedding_size']],
                trainable=True)

            # TODO: support factors for source and target inputs
            # ner_embeddings = tf.get_variable("ner_embeddings", [self.meta['num_ner_tags'], self.meta['ner_embedding_size']],
            #                                  trainable=True)

            # default: just embed the tokens in the source context
            source_embed = tf.nn.embedding_lookup(source_embeddings, source)

            if self.use_ner_embeddings:
                pass
                # TODO: support factors for source input
                # ner_embed = tf.nn.embedding_lookup(ner_embeddings, ner_context)
                # context_embed = tf.concat([context_embed, ner_embed], 2)
                # context_embed.set_shape([None, None, self.meta['embedding_size'] + self.meta['ner_embedding_size']])
            else:
                # this is to fix shape inference bug in rnn.py -- see this issue: https://github.com/tensorflow/tensorflow/issues/2938
                source_embed.set_shape(
                    [None, None, self.config['embedding_size']])

            # TODO: switch this to target language embeddings
            # TODO: support target language factors (POS, NER, etc...)
            target_embeddings = tf.get_variable(
                "target_embeddings",
                [self.trg_vocab_size, self.config['embedding_size']])

            # target embeddings - these are the _inputs_ to the decoder
            target_embed = tf.nn.embedding_lookup(target_embeddings, target)
            target_embed.set_shape(
                [None, None, self.config['embedding_size']])

        # Construct input representation that we'll put attention over
        # Note: dropout is turned on/off by `dropout_prob`
        with tf.name_scope('input_representation'):
            # One cell object per direction: handing the same instance to
            # both directions is rejected (or treated as weight sharing)
            # by newer TensorFlow releases.
            source_cell_fw = stacked_lstm_cell()
            source_cell_bw = stacked_lstm_cell()

            # use the description mask to get the sequence lengths
            source_sequence_length = tf.cast(
                tf.reduce_sum(source_mask, 1), tf.int64)

            # BIDIRECTIONAL RNNs
            # Bidir outputs are (output_fw, output_bw)
            bidir_outputs, bidir_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=source_cell_fw,
                cell_bw=source_cell_bw,
                inputs=source_embed,
                sequence_length=source_sequence_length,
                dtype=tf.float32)
            l_to_r_states, r_to_l_states = bidir_state

            # Transpose to be time-major
            # TODO: do we need to transpose?
            # attention_states = tf.transpose(tf.concat(bidir_outputs, 2), [1, 0, 2])
            attention_states = tf.concat(bidir_outputs, 2)

            # Note: the encoder is bidirectional, so the concatenated final
            # states (2 * encoder_hidden_size) are projected down to
            # decoder_hidden_size to make the decoder initial state.
            init_state_transformation = tf.get_variable(
                'decoder_init_transform',
                (self.config['encoder_hidden_size'] * 2,
                 self.config['decoder_hidden_size']))
            # [-1] selects the last layer of each stack; [1] selects the
            # second element of that layer's state tuple (presumably the
            # LSTM hidden state `h` -- TODO confirm).
            initialization_state = tf.matmul(
                tf.concat([r_to_l_states[-1][1], l_to_r_states[-1][1]], 1),
                init_state_transformation)

            # alternatively just use the final l_to_r state
            # initialization_state = l_to_r_states[-1][1]

            # TODO: try with simple L-->R GRU
            # encoder_outputs, encoder_state = rnn.dynamic_rnn(
            #     cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
            #     inputs=inputs,
            #     dtype=dtypes.float32,
            #     time_major=False,
            #     scope=scope)

        with tf.name_scope('target_representation'):
            target_cell_fw = stacked_lstm_cell()
            target_cell_bw = stacked_lstm_cell()

            # bidirectional target representation
            target_lengths = tf.cast(
                tf.reduce_sum(target_mask, axis=1), dtype=tf.int32)
            target_bidir_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=target_cell_fw,
                cell_bw=target_cell_bw,
                inputs=target_embed,
                sequence_length=target_lengths,
                dtype=tf.float32,
                scope='target_bidir_rnn')
            target_representation = tf.concat(target_bidir_outputs, 2)

        # Now construct the decoder
        decoder_hidden_size = self.config['decoder_hidden_size']

        # attention
        attention_option = "bahdanau"  # can be "luong"

        with variable_scope.variable_scope("decoder") as scope:
            # Prepare attention
            (attention_keys, attention_values, attention_score_fn,
             attention_construct_fn) = (
                 attention_decoder_fn.prepare_attention(
                     attention_states, attention_option,
                     decoder_hidden_size))

            decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                encoder_state=initialization_state,
                attention_keys=attention_keys,
                attention_values=attention_values,
                attention_score_fn=attention_score_fn,
                attention_construct_fn=attention_construct_fn)

            # Note: this is different from the "normal" seq2seq encoder-decoder model, because we have different
            # input and output vocabularies for the decoder (target vocab vs. QE symbols)
            # num_decoder_symbols = self.output_vocab_size

            # decoder vocab is characters or sub-words? -- either way, we need to learn the vocab over the entity set
            # setting up weights for computing the final output
            # def create_output_fn():
            #     def output_fn(x):
            #         return layers.linear(x, num_decoder_symbols, scope=scope)
            #     return output_fn
            # output_fn = create_output_fn()

            # Input width of the first output layer: decoder output
            # concatenated with the bidirectional target representation.
            intermediate_dim = 512
            output_transformation_1 = tf.Variable(
                tf.random_normal([
                    self.config['decoder_hidden_size'] +
                    self.config['encoder_hidden_size'] * 2,
                    intermediate_dim
                ]),
                name='output_transformation_1')
            output_biases_1 = tf.Variable(
                tf.zeros([intermediate_dim]), name='output_biases_1')

            output_transformation_2 = tf.Variable(
                tf.random_normal(
                    [intermediate_dim, self.output_vocab_size]),
                name='output_transformation_2')
            output_biases_2 = tf.Variable(
                tf.zeros([self.output_vocab_size]), name='output_biases_2')

            # Train decoder
            decoder_cell = core_rnn_cell_impl.GRUCell(decoder_hidden_size)

            (decoder_outputs_train, decoder_state_train, _) = (
                seq2seq.dynamic_rnn_decoder(
                    cell=decoder_cell,
                    decoder_fn=decoder_fn_train,
                    inputs=target_embed,
                    sequence_length=target_lengths,
                    time_major=False,
                    scope=scope))

            # TODO: for attentive QE, we don't need to separate train and inference decoders
            # TODO: we can directly use train decoder output at both training and prediction time

            # concat with target lm representation
            decoder_outputs_train = tf.concat(
                [decoder_outputs_train, target_representation], 2)

            decoder_outputs_train = tf.nn.elu(decoder_outputs_train)
            decoder_outputs_train = tf.nn.dropout(
                decoder_outputs_train, keep_prob=dropout_prob)

            # Flatten the two leading dimensions so the dense layers can be
            # applied with plain matmuls; the original leading dimensions
            # are restored at the end.
            output_shape = tf.shape(decoder_outputs_train)
            decoder_outputs_train = tf.matmul(
                tf.reshape(decoder_outputs_train,
                           [output_shape[0] * output_shape[1], -1]),
                output_transformation_1)
            decoder_outputs_train += output_biases_1

            decoder_outputs_train = tf.nn.elu(decoder_outputs_train)
            decoder_outputs_train = tf.nn.dropout(
                decoder_outputs_train, keep_prob=dropout_prob)

            # one more linear layer
            decoder_outputs_train = tf.matmul(decoder_outputs_train,
                                              output_transformation_2)
            decoder_outputs_train += output_biases_2

            decoder_outputs_train = tf.reshape(
                decoder_outputs_train,
                [output_shape[0], output_shape[1], -1])

            # DEBUGGING: dump these
            # self.decoder_outputs_train = decoder_outputs_train

        with tf.name_scope('predictions'):
            prediction_logits = decoder_outputs_train
            tf.summary.histogram('prediction_logits', prediction_logits)
            predictions = tf.nn.softmax(prediction_logits)

            # correct_predictions = tf.equal(tf.cast(tf.argmax(predictions, 1), tf.int32), entity)
            # accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
            # accuracy_summary = tf.summary.scalar('accuracy', accuracy)

        with tf.name_scope('xent'):
            # Note: set output and output_mask shape because they're needed here:
            # https://github.com/tensorflow/tensorflow/blob/r1.0/tensorflow/contrib/seq2seq/python/ops/loss.py#L65-L70
            output.set_shape([None, None])
            output_mask.set_shape([None, None])
            costs = tf.contrib.seq2seq.sequence_loss(
                logits=decoder_outputs_train,
                targets=output,
                weights=output_mask,
                average_across_timesteps=True)
            cost = tf.reduce_mean(costs)
            tf.summary.scalar('minibatch_cost', cost)

        # expose placeholders and ops on the class
        self.source = source
        self.source_mask = source_mask
        self.target = target
        self.target_mask = target_mask
        self.output = output
        self.output_mask = output_mask
        self.predictions = predictions
        self.cost = cost
        self.dropout_prob = dropout_prob

        # TODO: expose embeddings so that they can be visualized?

        optimizer = tf.train.AdamOptimizer()
        with tf.name_scope('train'):
            gradients = optimizer.compute_gradients(
                cost, tf.trainable_variables())
            if self.config['max_gradient_norm'] is not None:
                # Local names chosen so the `variables` module is not
                # shadowed inside this method.
                raw_gradients, grad_variables = zip(*gradients)
                clipped_gradients, _ = clip_ops.clip_by_global_norm(
                    raw_gradients, self.config['max_gradient_norm'])
                gradients = list(zip(clipped_gradients, grad_variables))

            for gradient, variable in gradients:
                tf.summary.histogram(variable.name, variable)
                if gradient is None:
                    # compute_gradients yields None for variables the cost
                    # does not depend on; there is nothing to summarize.
                    continue
                if isinstance(gradient, ops.IndexedSlices):
                    grad_values = gradient.values
                else:
                    grad_values = gradient
                tf.summary.histogram(variable.name + '/gradients',
                                     grad_values)
                tf.summary.histogram(variable.name + '/gradient_norm',
                                     clip_ops.global_norm([grad_values]))

            self.full_graph_optimizer = optimizer.apply_gradients(
                gradients)

            # Optimizer #2 -- updates entity representations only
            # entity_representation_train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
            #                                                      "representation/entity_lookup")
            # self.entity_representation_optimizer = optimizer.minimize(cost,
            #                                                          var_list=entity_representation_train_vars)

        self.saver = tf.train.Saver()

        # self.accuracy = accuracy
        self.merged = tf.summary.merge_all()

        logger.info('Finished building model graph')
def test_dynamic_rnn_decoder_time_major(self):
    """Check output/state shapes of the time-major dynamic RNN decoder.

    A GRU encoder feeds a GRU decoder that is built twice on the same cell:
    once with the training decoder function and once, after enabling
    variable reuse, with the inference decoder function. The evaluated
    outputs, final states and context states of both are then checked
    against the expected shapes and step counts.
    """
    with self.test_session() as sess:
        with variable_scope.variable_scope(
                "root", initializer=init_ops.constant_initializer(0.5)):
            # Define inputs/outputs to model
            batch_size = 2
            encoder_embedding_size = 3
            decoder_embedding_size = 4
            encoder_hidden_size = 5
            decoder_hidden_size = encoder_hidden_size
            input_sequence_length = 6
            decoder_sequence_length = 7
            num_decoder_symbols = 20
            start_of_sequence_id = end_of_sequence_id = 1

            decoder_embeddings = variable_scope.get_variable(
                "decoder_embeddings",
                [num_decoder_symbols, decoder_embedding_size],
                initializer=init_ops.random_normal_initializer(stddev=0.1))
            inputs = constant_op.constant(
                0.5,
                shape=[
                    input_sequence_length, batch_size,
                    encoder_embedding_size
                ])
            decoder_inputs = constant_op.constant(
                0.4,
                shape=[
                    decoder_sequence_length, batch_size,
                    decoder_embedding_size
                ])
            decoder_length = constant_op.constant(
                decoder_sequence_length,
                dtype=dtypes.int32,
                shape=[
                    batch_size,
                ])

            with variable_scope.variable_scope("rnn") as scope:
                # setting up weights for computing the final output
                # NOTE: `scope` is looked up when the lambda is *called*,
                # so it resolves to whatever `scope` is bound to at that
                # point (the "decoder" scope below), not to this "rnn"
                # scope. Do not bind it eagerly without checking callers.
                output_fn = lambda x: layers.linear(
                    x, num_decoder_symbols, scope=scope)

                # Define model
                encoder_outputs, encoder_state = rnn.dynamic_rnn(
                    cell=core_rnn_cell_impl.GRUCell(encoder_hidden_size),
                    inputs=inputs,
                    dtype=dtypes.float32,
                    time_major=True,
                    scope=scope)

            with variable_scope.variable_scope("decoder") as scope:
                # Train decoder
                decoder_cell = core_rnn_cell_impl.GRUCell(
                    decoder_hidden_size)
                decoder_fn_train = Seq2SeqTest._decoder_fn_with_context_state(
                    decoder_fn_lib.simple_decoder_fn_train(
                        encoder_state=encoder_state))
                (decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_train,
                         inputs=decoder_inputs,
                         sequence_length=decoder_length,
                         time_major=True,
                         scope=scope))
                decoder_outputs_train = output_fn(decoder_outputs_train)

                # Setup variable reuse
                scope.reuse_variables()

                # Inference decoder
                decoder_fn_inference = (
                    Seq2SeqTest._decoder_fn_with_context_state(
                        decoder_fn_lib.simple_decoder_fn_inference(
                            output_fn=output_fn,
                            encoder_state=encoder_state,
                            embeddings=decoder_embeddings,
                            start_of_sequence_id=start_of_sequence_id,
                            end_of_sequence_id=end_of_sequence_id,
                            # TODO: find out why it goes to +1
                            maximum_length=decoder_sequence_length - 1,
                            num_decoder_symbols=num_decoder_symbols,
                            dtype=dtypes.int32)))
                (decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference) = (
                     seq2seq.dynamic_rnn_decoder(
                         cell=decoder_cell,
                         decoder_fn=decoder_fn_inference,
                         time_major=True,
                         scope=scope))

            # Run model
            variables.global_variables_initializer().run()
            (decoder_outputs_train_res, decoder_state_train_res,
             decoder_context_state_train_res) = sess.run([
                 decoder_outputs_train, decoder_state_train,
                 decoder_context_state_train
             ])
            (decoder_outputs_inference_res, decoder_state_inference_res,
             decoder_context_state_inference_res) = sess.run([
                 decoder_outputs_inference, decoder_state_inference,
                 decoder_context_state_inference
             ])

            # Assert outputs
            self.assertEqual(
                (decoder_sequence_length, batch_size, num_decoder_symbols),
                decoder_outputs_train_res.shape)
            self.assertEqual((batch_size, num_decoder_symbols),
                             decoder_outputs_inference_res.shape[1:3])
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_inference_res)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_train_res.shape)
            self.assertEqual((batch_size, decoder_hidden_size),
                             decoder_state_inference_res.shape)
            self.assertEqual(decoder_sequence_length,
                             decoder_context_state_train_res)
            # The dynamic decoder might end earlier than `maximal_length`
            # under inference. The number of decoded steps is the leading
            # (time) dimension of the time-major inference *outputs*; the
            # state's leading dimension is the batch size, already checked
            # above.
            self.assertGreaterEqual(decoder_sequence_length,
                                    decoder_outputs_inference_res.shape[0])