def _createStackBidirectionalDynamicRNN(self,
                                        use_gpu,
                                        use_shape,
                                        use_state_tuple,
                                        initial_states_fw=None,
                                        initial_states_bw=None,
                                        scope=None):
  self.layers = [2, 3]
  input_size = 5
  batch_size = 2
  max_length = 8

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, seed=self._seed)
  sequence_length = array_ops.placeholder(dtypes.int64)

  self.cells_fw = [
      core_rnn_cell_impl.LSTMCell(
          num_units, input_size, initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]
  self.cells_bw = [
      core_rnn_cell_impl.LSTMCell(
          num_units, input_size, initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]

  inputs = max_length * [
      array_ops.placeholder(
          dtypes.float32,
          shape=(batch_size, input_size) if use_shape else (None, input_size))
  ]
  inputs_c = array_ops.stack(inputs)
  # Convert to batch-major: (max_length, batch, depth) -> (batch, max_length, depth).
  inputs_c = array_ops.transpose(inputs_c, [1, 0, 2])
  outputs, st_fw, st_bw = rnn.stack_bidirectional_dynamic_rnn(
      self.cells_fw,
      self.cells_bw,
      inputs_c,
      initial_states_fw=initial_states_fw,
      initial_states_bw=initial_states_bw,
      dtype=dtypes.float32,
      sequence_length=sequence_length,
      scope=scope)

  # Outputs have shape (batch_size, max_length, 2 * layers[-1]).
  output_shape = [None, max_length, 2 * self.layers[-1]]
  if use_shape:
    output_shape[0] = batch_size
  self.assertAllEqual(outputs.get_shape().as_list(), output_shape)

  input_value = np.random.randn(batch_size, input_size)
  return input_value, inputs, outputs, st_fw, st_bw, sequence_length
def _createStackBidirectionalRNN(self,
                                 use_gpu,
                                 use_shape,
                                 use_sequence_length,
                                 initial_states_fw=None,
                                 initial_states_bw=None,
                                 scope=None):
  self.layers = [2, 3]
  input_size = 5
  batch_size = 2
  max_length = 8

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, seed=self._seed)
  sequence_length = array_ops.placeholder(
      dtypes.int64) if use_sequence_length else None

  self.cells_fw = [
      core_rnn_cell_impl.LSTMCell(
          num_units, input_size, initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]
  self.cells_bw = [
      core_rnn_cell_impl.LSTMCell(
          num_units, input_size, initializer=initializer,
          state_is_tuple=False) for num_units in self.layers
  ]

  inputs = max_length * [
      array_ops.placeholder(
          dtypes.float32,
          shape=(batch_size, input_size) if use_shape else (None, input_size))
  ]
  outputs, state_fw, state_bw = rnn.stack_bidirectional_rnn(
      self.cells_fw,
      self.cells_bw,
      inputs,
      initial_states_fw,
      initial_states_bw,
      dtype=dtypes.float32,
      sequence_length=sequence_length,
      scope=scope)

  self.assertEqual(len(outputs), len(inputs))
  for out in outputs:
    # Shape lists need an exact comparison; assertAlmostEqual only works
    # on scalars.
    self.assertAllEqual(
        out.get_shape().as_list(),
        [batch_size if use_shape else None, 2 * self.layers[-1]])

  input_value = np.random.randn(batch_size, input_size)
  outputs = array_ops.stack(outputs)
  return input_value, inputs, outputs, state_fw, state_bw, sequence_length
def testUsingSecondCellInScopeWithExistingVariablesFails(self):
  # This test should go away when this behavior is no longer an
  # error (approx. May 2017).
  cell1 = core_rnn_cell_impl.LSTMCell(3)
  cell2 = core_rnn_cell_impl.LSTMCell(3)
  x = array_ops.zeros([1, 3])
  m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
  cell1(x, m)
  with self.assertRaisesRegexp(ValueError, r"LSTMCell\(..., reuse=True\)"):
    cell2(x, m)
def testCompatibleNames(self):
  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
    cell = core_rnn_cell_impl.LSTMCell(10)
    pcell = core_rnn_cell_impl.LSTMCell(10, use_peepholes=True)
    inputs = [array_ops.zeros([4, 5])] * 6
    core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope="basic")
    core_rnn.static_rnn(pcell, inputs, dtype=dtypes.float32, scope="peephole")
    basic_names = {
        v.name: v.get_shape() for v in variables.trainable_variables()
    }

  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
    cell = lstm_ops.LSTMBlockCell(10)
    pcell = lstm_ops.LSTMBlockCell(10, use_peephole=True)
    inputs = [array_ops.zeros([4, 5])] * 6
    core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope="basic")
    core_rnn.static_rnn(pcell, inputs, dtype=dtypes.float32, scope="peephole")
    block_names = {
        v.name: v.get_shape() for v in variables.trainable_variables()
    }

  with self.test_session(use_gpu=self._use_gpu, graph=ops.Graph()):
    cell = lstm_ops.LSTMBlockFusedCell(10)
    pcell = lstm_ops.LSTMBlockFusedCell(10, use_peephole=True)
    inputs = [array_ops.zeros([4, 5])] * 6
    cell(inputs, dtype=dtypes.float32, scope="basic/lstm_cell")
    pcell(inputs, dtype=dtypes.float32, scope="peephole/lstm_cell")
    fused_names = {
        v.name: v.get_shape() for v in variables.trainable_variables()
    }

  self.assertEqual(basic_names, block_names)
  self.assertEqual(basic_names, fused_names)
def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
                                num_layers, max_time, compiled):
  with variable_scope.variable_scope(
      "root",
      initializer=init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)):
    inputs = variable_scope.get_variable(
        "inputs",
        initializer=random_ops.random_uniform(
            (max_time, batch_size, input_depth), seed=1))
    maybe_xla = lambda c: rnn_cell.CompiledWrapper(c) if compiled else c
    cell = core_rnn_cell_impl.MultiRNNCell(
        [maybe_xla(core_rnn_cell_impl.LSTMCell(num_units))
         for _ in range(num_layers)])
    initial_state = cell.zero_state(
        batch_size=batch_size, dtype=dtypes.float32)
    outputs, final_state = rnn.dynamic_rnn(
        cell=cell,
        inputs=inputs,
        initial_state=initial_state,
        time_major=True)
    flat_final_state = nest.flatten(final_state)
    trainable_variables = variables.trainable_variables()
    outputs_grad = gradients_impl.gradients(
        [outputs],
        trainable_variables + [inputs] + nest.flatten(initial_state))
    final_state_grad = gradients_impl.gradients(
        flat_final_state,
        trainable_variables + [inputs] + nest.flatten(initial_state))

    return {"outputs": outputs,
            "final_state": flat_final_state,
            "outputs_grad": outputs_grad,
            "final_state_grad": final_state_grad}
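# A minimal usage sketch, assuming the usual test imports (a `session`
# module for tf.Session alongside the `variables` module used above);
# the sizes are hypothetical. It builds the op dict once, then evaluates
# the outputs and their gradients in a single session call.
ops_dict = _create_multi_lstm_cell_ops(
    batch_size=8, num_units=32, input_depth=16,
    num_layers=2, max_time=10, compiled=False)
with session.Session() as sess:
  sess.run(variables.global_variables_initializer())
  sess.run([ops_dict["outputs"], ops_dict["outputs_grad"]])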
def testLSTMCell(self):
  with self.test_session() as sess:
    num_units = 8
    num_proj = 6
    state_size = num_units + num_proj
    batch_size = 3
    input_size = 2
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([batch_size, input_size])
      m = array_ops.zeros([batch_size, state_size])
      cell = core_rnn_cell_impl.LSTMCell(
          num_units=num_units,
          num_proj=num_proj,
          forget_bias=1.0,
          state_is_tuple=False)
      output, state = cell(x, m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run(
          [output, state], {
              x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
              m.name: 0.1 * np.ones((batch_size, state_size))
          })
      self.assertEqual(len(res), 2)
      # The numbers in results were not calculated, this is mostly just a
      # smoke test.
      self.assertEqual(res[0].shape, (batch_size, num_proj))
      self.assertEqual(res[1].shape, (batch_size, state_size))
      # Different inputs so different outputs and states
      for i in range(1, batch_size):
        self.assertTrue(
            float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
        self.assertTrue(
            float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
def benchmarkTfRNNLSTMTraining(self):
  test_configs = self._GetTestConfig()
  for config_name, config in test_configs.items():
    num_layers = config["num_layers"]
    num_units = config["num_units"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    with ops.Graph().as_default(), ops.device("/gpu:0"):
      inputs = seq_length * [
          array_ops.zeros([batch_size, num_units], dtypes.float32)
      ]
      initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)

      # Build one fresh LSTMCell per layer; calling the constructor per
      # layer avoids sharing a single cell's variables across layers.
      def cell():
        return core_rnn_cell_impl.LSTMCell(
            num_units=num_units, initializer=initializer,
            state_is_tuple=True)

      multi_cell = core_rnn_cell_impl.MultiRNNCell(
          [cell() for _ in range(num_layers)])
      outputs, final_state = core_rnn.static_rnn(
          multi_cell, inputs, dtype=dtypes.float32)
      trainable_variables = ops.get_collection(
          ops.GraphKeys.TRAINABLE_VARIABLES)
      gradients = gradients_impl.gradients([outputs, final_state],
                                           trainable_variables)
      training_op = control_flow_ops.group(*gradients)
      self._BenchmarkOp(training_op, "tf_rnn_lstm %s %s" %
                        (config_name, self._GetConfigDesc(config)))
def _testDropoutWrapper(self,
                        batch_size=None,
                        time_steps=None,
                        parallel_iterations=None,
                        **kwargs):
  with self.test_session() as sess:
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      if batch_size is None and time_steps is None:
        # 2 time steps, batch size 1, depth 3
        batch_size = 1
        time_steps = 2
        x = constant_op.constant(
            [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
        m = core_rnn_cell_impl.LSTMStateTuple(
            *[constant_op.constant([[0.1, 0.1, 0.1]],
                                   dtype=dtypes.float32)] * 2)
      else:
        x = constant_op.constant(
            np.random.randn(time_steps, batch_size, 3).astype(np.float32))
        m = core_rnn_cell_impl.LSTMStateTuple(
            *[constant_op.constant([[0.1, 0.1, 0.1]] * batch_size,
                                   dtype=dtypes.float32)] * 2)
      outputs, final_state = rnn.dynamic_rnn(
          cell=core_rnn_cell_impl.DropoutWrapper(
              core_rnn_cell_impl.LSTMCell(3), dtype=x.dtype, **kwargs),
          time_major=True,
          parallel_iterations=parallel_iterations,
          inputs=x,
          initial_state=m)
      sess.run([variables_lib.global_variables_initializer()])
      res = sess.run([outputs, final_state])
      self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
      self.assertEqual(res[1].c.shape, (batch_size, 3))
      self.assertEqual(res[1].h.shape, (batch_size, 3))
      return res
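# A minimal usage sketch (the keep probabilities are hypothetical): from
# inside a test method, exercise the helper with input and output dropout.
# All other arguments keep their defaults, so the fixed 2-step, batch-1
# constants above are used.
res = self._testDropoutWrapper(input_keep_prob=0.5, output_keep_prob=0.5)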
def testLSTMBasicToBlockCellPeeping(self):
  with self.test_session(use_gpu=self._use_gpu) as sess:
    x = array_ops.zeros([1, 2])
    x_values = np.random.randn(1, 2)
    m0_val = 0.1 * np.ones([1, 2])
    m1_val = -0.1 * np.ones([1, 2])
    m2_val = -0.2 * np.ones([1, 2])
    m3_val = 0.2 * np.ones([1, 2])
    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890212)

    with variable_scope.variable_scope("basic", initializer=initializer):
      m0 = array_ops.zeros([1, 2])
      m1 = array_ops.zeros([1, 2])
      m2 = array_ops.zeros([1, 2])
      m3 = array_ops.zeros([1, 2])
      g, ((out_m0, out_m1),
          (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
              [
                  core_rnn_cell_impl.LSTMCell(
                      2, use_peepholes=True, state_is_tuple=True)
              ] * 2,
              state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
      sess.run([variables.global_variables_initializer()])
      basic_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
          x.name: x_values,
          m0.name: m0_val,
          m1.name: m1_val,
          m2.name: m2_val,
          m3.name: m3_val
      })

    with variable_scope.variable_scope("block", initializer=initializer):
      m0 = array_ops.zeros([1, 2])
      m1 = array_ops.zeros([1, 2])
      m2 = array_ops.zeros([1, 2])
      m3 = array_ops.zeros([1, 2])
      g, ((out_m0, out_m1),
          (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
              [lstm_ops.LSTMBlockCell(2, use_peephole=True)] * 2,
              state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
      sess.run([variables.global_variables_initializer()])
      block_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
          x.name: x_values,
          m0.name: m0_val,
          m1.name: m1_val,
          m2.name: m2_val,
          m3.name: m3_val
      })

    self.assertEqual(len(basic_res), len(block_res))
    for basic, block in zip(basic_res, block_res):
      self.assertAllClose(basic, block)
def testUsingCellInDifferentScopeFromFirstCallFails(self):
  # This test should go away when this behavior is no longer an
  # error (approx. May 2017).
  cell = core_rnn_cell_impl.LSTMCell(3)
  x = array_ops.zeros([1, 3])
  m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
  with variable_scope.variable_scope("scope1"):
    cell(x, m)
  with variable_scope.variable_scope("scope2"):
    with self.assertRaisesRegexp(ValueError, r"Attempt to reuse RNNCell"):
      cell(x, m)
def get_rnncell(cell_type, cell_size, keep_prob, num_layer):
  def single_cell():
    if cell_type == "gru":
      cell = rnn_cell.GRUCell(cell_size)
    else:
      cell = rnn_cell.LSTMCell(
          cell_size, use_peepholes=False, forget_bias=1.0)
    if keep_prob < 1.0:
      cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return cell

  cell = single_cell()
  if num_layer > 1:
    # Build a fresh cell per layer; reusing one cell object across layers
    # triggers the variable-reuse error exercised by the tests above.
    cell = rnn_cell.MultiRNNCell(
        [single_cell() for _ in range(num_layer)], state_is_tuple=True)
  return cell
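# A minimal usage sketch (the sizes and the `inputs`/`lengths` tensors are
# hypothetical): build a 2-layer GRU stack with output dropout and unroll
# it dynamically over a batch of variable-length sequences.
cell = get_rnncell("gru", cell_size=128, keep_prob=0.8, num_layer=2)
outputs, state = tf.nn.dynamic_rnn(
    cell, inputs, sequence_length=lengths, dtype=tf.float32)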
def testStateTupleDictConversion(self):
  """Test `state_tuple_to_dict` and `dict_to_state_tuple`."""
  cell_sizes = [5, 3, 7]
  # A MultiRNNCell of LSTMCells is both a common choice and an interesting
  # test case, because it has two levels of nesting, with an inner class
  # that is not a plain tuple.
  cell = core_rnn_cell_impl.MultiRNNCell(
      [core_rnn_cell_impl.LSTMCell(i) for i in cell_sizes])
  state_dict = {
      dynamic_rnn_estimator._get_state_name(i):
      array_ops.expand_dims(math_ops.range(cell_size), 0)
      for i, cell_size in enumerate([5, 5, 3, 3, 7, 7])
  }
  expected_state = (core_rnn_cell_impl.LSTMStateTuple(
      np.reshape(np.arange(5), [1, -1]), np.reshape(np.arange(5), [1, -1])),
                    core_rnn_cell_impl.LSTMStateTuple(
                        np.reshape(np.arange(3), [1, -1]),
                        np.reshape(np.arange(3), [1, -1])),
                    core_rnn_cell_impl.LSTMStateTuple(
                        np.reshape(np.arange(7), [1, -1]),
                        np.reshape(np.arange(7), [1, -1])))
  actual_state = dynamic_rnn_estimator.dict_to_state_tuple(state_dict, cell)
  flattened_state = dynamic_rnn_estimator.state_tuple_to_dict(actual_state)

  with self.test_session() as sess:
    (state_dict_val, actual_state_val, flattened_state_val) = sess.run(
        [state_dict, actual_state, flattened_state])

  def _recursive_assert_equal(x, y):
    self.assertEqual(type(x), type(y))
    if isinstance(x, (list, tuple)):
      self.assertEqual(len(x), len(y))
      for i, _ in enumerate(x):
        _recursive_assert_equal(x[i], y[i])
    elif isinstance(x, np.ndarray):
      np.testing.assert_array_equal(x, y)
    else:
      self.fail('Unexpected type: {}'.format(type(x)))

  for k in state_dict_val.keys():
    np.testing.assert_array_almost_equal(
        state_dict_val[k],
        flattened_state_val[k],
        err_msg='Wrong value for state component {}.'.format(k))
  _recursive_assert_equal(expected_state, actual_state_val)
def testLSTMCellVariables(self):
  with self.test_session():
    num_units = 8
    num_proj = 6
    state_size = num_units + num_proj
    batch_size = 3
    input_size = 2
    with variable_scope.variable_scope(
        "root", initializer=init_ops.constant_initializer(0.5)):
      x = array_ops.zeros([batch_size, input_size])
      m = array_ops.zeros([batch_size, state_size])
      cell = core_rnn_cell_impl.LSTMCell(
          num_units=num_units,
          num_proj=num_proj,
          forget_bias=1.0,
          state_is_tuple=False)
      cell(x, m)  # Execute to create variables
    variables = variables_lib.global_variables()
    self.assertEqual(variables[0].op.name, "root/lstm_cell/weights")
    self.assertEqual(variables[1].op.name, "root/lstm_cell/biases")
    self.assertEqual(variables[2].op.name,
                     "root/lstm_cell/projection/weights")
def testLSTMBasicToBlockPeeping(self):
  with self.test_session(use_gpu=self._use_gpu) as sess:
    batch_size = 2
    input_size = 3
    cell_size = 4
    sequence_length = 5

    inputs = []
    for _ in range(sequence_length):
      inp = ops.convert_to_tensor(
          np.random.randn(batch_size, input_size), dtype=dtypes.float32)
      inputs.append(inp)

    initializer = init_ops.random_uniform_initializer(
        -0.01, 0.01, seed=19890212)
    with variable_scope.variable_scope("basic", initializer=initializer):
      cell = core_rnn_cell_impl.LSTMCell(
          cell_size, use_peepholes=True, state_is_tuple=True)
      outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)

      sess.run([variables.global_variables_initializer()])
      basic_outputs, basic_state = sess.run([outputs, state[0]])
      basic_grads = sess.run(gradients_impl.gradients(outputs, inputs))
      basic_wgrads = sess.run(
          gradients_impl.gradients(outputs, variables.trainable_variables()))

    with variable_scope.variable_scope("block", initializer=initializer):
      w = variable_scope.get_variable(
          "w",
          shape=[input_size + cell_size, cell_size * 4],
          dtype=dtypes.float32)
      b = variable_scope.get_variable(
          "b",
          shape=[cell_size * 4],
          dtype=dtypes.float32,
          initializer=init_ops.zeros_initializer())
      wci = variable_scope.get_variable(
          "wci", shape=[cell_size], dtype=dtypes.float32)
      wcf = variable_scope.get_variable(
          "wcf", shape=[cell_size], dtype=dtypes.float32)
      wco = variable_scope.get_variable(
          "wco", shape=[cell_size], dtype=dtypes.float32)

      _, _, _, _, _, _, outputs = block_lstm(
          ops.convert_to_tensor(sequence_length, dtype=dtypes.int64),
          inputs,
          w,
          b,
          wci=wci,
          wcf=wcf,
          wco=wco,
          cell_clip=0,
          use_peephole=True)

      sess.run([variables.global_variables_initializer()])
      block_outputs = sess.run(outputs)
      block_grads = sess.run(gradients_impl.gradients(outputs, inputs))
      block_wgrads = sess.run(
          gradients_impl.gradients(outputs, [w, b, wci, wcf, wco]))

    self.assertAllClose(basic_outputs, block_outputs)
    self.assertAllClose(basic_grads, block_grads)
    for basic, block in zip(basic_wgrads, block_wgrads):
      self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2)

    with variable_scope.variable_scope("fused", initializer=initializer):
      cell = lstm_ops.LSTMBlockFusedCell(
          cell_size, cell_clip=0, use_peephole=True)
      outputs, state = cell(inputs, dtype=dtypes.float32)

      sess.run([variables.global_variables_initializer()])
      fused_outputs, fused_state = sess.run([outputs, state[0]])
      fused_grads = sess.run(gradients_impl.gradients(outputs, inputs))
      fused_vars = [
          v for v in variables.trainable_variables()
          if v.name.startswith("fused/")
      ]
      fused_wgrads = sess.run(gradients_impl.gradients(outputs, fused_vars))

    self.assertAllClose(basic_outputs, fused_outputs)
    self.assertAllClose(basic_state, fused_state)
    self.assertAllClose(basic_grads, fused_grads)
    for basic, fused in zip(basic_wgrads, fused_wgrads):
      self.assertAllClose(basic, fused, rtol=1e-2, atol=1e-2)
batch_size = 2
display_step = 10

# Network Parameters
n_input = 8  # data input
n_hidden = 5  # hidden layer num of features

# tf Graph input
x = tf.placeholder("int32", [None, word_lenght, n_input])
y = tf.placeholder("int32", [None, n_input])

with tf.variable_scope("train_test", reuse=None):
    # Unstack along the time axis; bind the result to a new name so the
    # placeholder `x` can still be fed at run time.
    x_seq = tf.unstack(x, word_lenght, 1)
    outputs, states = embedding_rnn_seq2seq(
        encoder_inputs=x_seq,
        # Decoder inputs must be int32 id tensors, not Python ints; feed
        # GO-token zeros of shape [batch_size] here.
        decoder_inputs=[tf.zeros([batch_size], dtype=tf.int32)] * 20,
        cell=core_rnn_cell_impl.LSTMCell(n_hidden),
        num_encoder_symbols=256,
        num_decoder_symbols=256,
        embedding_size=100,
        output_projection=None,
        feed_previous=False)

# Define loss and optimizer (`learning_rate` is assumed to be defined
# elsewhere in the script).
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(outputs, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
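# A minimal training-loop sketch under the snippet's assumptions;
# `training_steps` and the `next_batch` data feeder are hypothetical.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1, training_steps + 1):
        batch_x, batch_y = next_batch(batch_size)  # hypothetical feeder
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})
        if step % display_step == 0:
            loss, acc = sess.run([cost, accuracy],
                                 feed_dict={x: batch_x, y: batch_y})
            print("step %d: loss %.4f, accuracy %.4f" % (step, loss, acc))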
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             use_rnn=False,
             use_mtgru=False,
             use_mtlstm=False,
             num_samples=512,
             forward_only=False,
             dtype=tf.float32):
  """Create the model.

  Args:
    source_vocab_size: size of the source vocabulary.
    target_vocab_size: size of the target vocabulary.
    buckets: a list of pairs (I, O), where I specifies maximum input
      length that will be processed in that bucket, and O specifies
      maximum output length. Training instances that have inputs longer
      than I or outputs longer than O will be pushed to the next bucket
      and padded accordingly. We assume that the list is sorted, e.g.,
      [(2, 4), (8, 16)].
    size: number of units in each layer of the model.
    num_layers: number of layers in the model.
    max_gradient_norm: gradients will be clipped to maximally this norm.
    batch_size: the size of the batches used during training;
      the model construction is independent of batch_size, so it can be
      changed after initialization if this is convenient, e.g., for
      decoding.
    learning_rate: learning rate to start with.
    learning_rate_decay_factor: decay learning rate by this much when
      needed.
    use_lstm: if true, we use LSTM cells instead of GRU cells.
    use_rnn: if true, we use basic RNN cells instead of GRU cells.
    use_mtgru: if true, we use a stack of MTGRUCells with per-layer time
      constants (tau) instead of GRU cells.
    use_mtlstm: if true, we use a stack of LSTM cells with per-layer time
      constants (tau) instead of GRU cells.
    num_samples: number of samples for sampled softmax.
    forward_only: if set, we do not construct the backward pass in the
      model.
    dtype: the data type to use to store internal variables.
  """
  self.source_vocab_size = source_vocab_size
  self.target_vocab_size = target_vocab_size
  self.buckets = buckets
  self.batch_size = batch_size
  self.learning_rate = tf.Variable(
      float(learning_rate), trainable=False, dtype=dtype)
  self.learning_rate_decay_op = self.learning_rate.assign(
      self.learning_rate * learning_rate_decay_factor)
  self.global_step = tf.Variable(0, trainable=False)

  # If we use sampled softmax, we need an output projection.
  output_projection = None
  softmax_loss_function = None
  # Sampled softmax only makes sense if we sample less than vocabulary size.
  if num_samples > 0 and num_samples < self.target_vocab_size:
    w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                          dtype=dtype)
    w = tf.transpose(w_t)
    b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
    output_projection = (w, b)

    def sampled_loss(labels, inputs):
      labels = tf.reshape(labels, [-1, 1])
      # We need to compute the sampled_softmax_loss using 32bit floats to
      # avoid numerical instabilities.
      local_w_t = tf.cast(w_t, tf.float32)
      local_b = tf.cast(b, tf.float32)
      local_inputs = tf.cast(inputs, tf.float32)
      return tf.cast(
          tf.nn.sampled_softmax_loss(
              weights=local_w_t,
              biases=local_b,
              labels=labels,
              inputs=local_inputs,
              num_sampled=num_samples,
              num_classes=self.target_vocab_size), dtype)

    softmax_loss_function = sampled_loss

  # Create the internal multi-layer cell for our RNN.
  if use_mtgru:
    cell1 = core_rnn_cell_impl.MTGRUCell(size)
    cell2 = core_rnn_cell_impl.MTGRUCell(size, tau=0.999)
    cell3 = core_rnn_cell_impl.MTGRUCell(size, tau=0.998)
    cell = core_rnn_cell_impl.MultiMTRNNCell([cell1, cell2, cell3])
  elif use_mtlstm:
    cell1 = core_rnn_cell_impl.LSTMCell(size)
    cell2 = core_rnn_cell_impl.LSTMCell(size, tau=0.999)
    cell = core_rnn_cell_impl.MultiMTRNNCell([cell1, cell2])
  else:
    def single_cell():
      return tf.contrib.rnn.GRUCell(size)
    if use_lstm:
      def single_cell():
        return tf.contrib.rnn.BasicLSTMCell(size)
    if use_rnn:
      def single_cell():
        return tf.contrib.rnn.BasicRNNCell(size)
    cell = single_cell()
    if num_layers > 1:
      cell = tf.contrib.rnn.MultiRNNCell(
          [single_cell() for _ in range(num_layers)])

  # The seq2seq function: we use embedding for the input and attention.
  def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
    return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell,
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        embedding_size=size,
        output_projection=output_projection,
        feed_previous=do_decode,
        dtype=dtype)

  # Feeds for inputs.
  self.encoder_inputs = []
  self.decoder_inputs = []
  self.target_weights = []
  for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
    self.encoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
  for i in xrange(buckets[-1][1] + 1):
    self.decoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    self.target_weights.append(
        tf.placeholder(dtype, shape=[None], name="weight{0}".format(i)))

  # Our targets are decoder inputs shifted by one.
  targets = [
      self.decoder_inputs[i + 1]
      for i in xrange(len(self.decoder_inputs) - 1)
  ]

  # Training outputs and losses.
  if forward_only:
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, True),
        softmax_loss_function=softmax_loss_function)
    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
      for b in xrange(len(buckets)):
        self.outputs[b] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[b]
        ]
  else:
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, False),
        softmax_loss_function=softmax_loss_function)

  # Gradients and SGD update operation for training the model.
  params = tf.trainable_variables()
  if not forward_only:
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    for b in xrange(len(buckets)):
      gradients = tf.gradients(self.losses[b], params)
      clipped_gradients, norm = tf.clip_by_global_norm(
          gradients, max_gradient_norm)
      self.gradient_norms.append(norm)
      self.updates.append(
          opt.apply_gradients(
              zip(clipped_gradients, params), global_step=self.global_step))

  self.saver = tf.train.Saver(tf.global_variables())
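# A minimal construction sketch. The enclosing class name is not shown
# above, so `Seq2SeqModel` and every hyperparameter value here are
# hypothetical; the bucket list follows the docstring's convention of
# sorted (input, output) length pairs.
model = Seq2SeqModel(
    source_vocab_size=40000,
    target_vocab_size=40000,
    buckets=[(5, 10), (10, 15), (20, 25), (40, 50)],
    size=256,
    num_layers=2,
    max_gradient_norm=5.0,
    batch_size=64,
    learning_rate=0.5,
    learning_rate_decay_factor=0.99,
    use_lstm=True)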