def build_autoencoder(dpg):
    """Add a sequence decoder, its loss and an Adam training op to the graph.

    The decoder is a GRU cell wrapped in an output projection to
    ``FLAGS.vocab_size`` and is started from ``dpg.encoder_states[-1]``.
    Its per-step inputs are all-zero tensors shaped like
    ``dpg.input_tokens[0]``; with ``feed_previous=True`` the decoder is
    presumably driven by its own previous predictions (TODO confirm against
    ``util.embedding_rnn_decoder``).

    Args:
        dpg: model object; this function reads ``spec.policy_dims``,
            ``input_tokens``, ``seq_length``, ``encoder_states`` and
            ``embeddings`` from it.

    Returns:
        A tuple ``(labels, loss, train_op)``: the list of per-step int32
        label placeholders, the sequence loss tensor, and the op that
        minimizes it.
    """
    num_steps = dpg.seq_length
    state_dim = dpg.spec.policy_dims[0]

    # GRU decoder whose outputs are projected onto the vocabulary.
    decoder_cell = rnn_cell.OutputProjectionWrapper(
        util.GRUCell(FLAGS.embedding_dim, state_dim), FLAGS.vocab_size)

    # One dummy (all-zero) input per time step.
    decoder_inputs = []
    for step in range(num_steps):
        decoder_inputs.append(
            tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % step))

    decoder_outputs, _ = util.embedding_rnn_decoder(
        decoder_inputs,
        dpg.encoder_states[-1],
        decoder_cell,
        FLAGS.vocab_size,
        feed_previous=True,
        embedding=dpg.embeddings,
        scope="adec")

    # Targets are fed at run time; every position is weighted equally.
    labels = []
    for step in range(num_steps):
        labels.append(
            tf.placeholder(tf.int32, shape=(None, ), name="labels%i" % step))
    weights = [tf.ones_like(step_labels, dtype=tf.float32)
               for step_labels in labels]

    loss = seq2seq.sequence_loss(
        decoder_outputs, labels, weights, FLAGS.vocab_size)

    # TODO wrt what?  No var_list is passed, so minimize() presumably
    # updates every trainable variable in the graph (encoder included) --
    # confirm that this is intended.
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)

    return labels, loss, train_op
def testTiedRNNSeq2Seq(self):
    """Smoke-test the shapes returned by seq2seq.tied_rnn_seq2seq.

    Two encoder steps and three decoder steps of shape (2, 2) are run
    through a GRU(2) cell projected to 4 outputs.  The test expects three
    (2, 4) outputs and a state list with four (2, 2) entries.
    """
    with self.test_session() as sess, \
            tf.variable_scope("root",
                              initializer=tf.constant_initializer(0.5)):
        encoder_steps = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
        decoder_steps = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
        gru = rnn_cell.GRUCell(2)
        projected_cell = rnn_cell.OutputProjectionWrapper(gru, 4)
        outputs, states = seq2seq.tied_rnn_seq2seq(
            encoder_steps, decoder_steps, projected_cell)
        sess.run([tf.initialize_all_variables()])

        # One projected output per decoder step.
        output_values = sess.run(outputs)
        self.assertEqual(len(output_values), 3)
        self.assertEqual(output_values[0].shape, (2, 4))

        # States keep the un-projected cell width.
        state_values = sess.run(states)
        self.assertEqual(len(state_values), 4)
        self.assertEqual(state_values[0].shape, (2, 2))
def testRNNDecoder(self):
    """Smoke-test seq2seq.rnn_decoder seeded with the last encoder state.

    A GRU(2) encoder is run over two (2, 2) inputs; its final state starts
    a decoder (GRU(2) projected to 4 outputs) over three (2, 2) inputs.
    The test expects three (2, 4) outputs and a state list with four
    (2, 2) entries.
    """
    with self.test_session() as sess, \
            tf.variable_scope("root",
                              initializer=tf.constant_initializer(0.5)):
        encoder_steps = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
        encoder_cell = rnn_cell.GRUCell(2)
        _, encoder_states = rnn.rnn(
            encoder_cell, encoder_steps, dtype=tf.float32)

        decoder_steps = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
        decoder_cell = rnn_cell.OutputProjectionWrapper(
            rnn_cell.GRUCell(2), 4)
        outputs, states = seq2seq.rnn_decoder(
            decoder_steps, encoder_states[-1], decoder_cell)
        sess.run([tf.initialize_all_variables()])

        # One projected output per decoder step.
        output_values = sess.run(outputs)
        self.assertEqual(len(output_values), 3)
        self.assertEqual(output_values[0].shape, (2, 4))

        # States keep the un-projected cell width.
        state_values = sess.run(states)
        self.assertEqual(len(state_values), 4)
        self.assertEqual(state_values[0].shape, (2, 2))
def testOutputProjectionWrapper(self):
    """Smoke-test one step of a GRU(3) cell projected down to 2 outputs.

    The zero-valued input and state tensors are overridden through the
    feed dict.  The new state must keep shape (1, 3) and the projected
    output must match a previously recorded value.
    """
    with self.test_session() as sess, \
            tf.variable_scope("root",
                              initializer=tf.constant_initializer(0.5)):
        x = tf.zeros([1, 3])
        m = tf.zeros([1, 3])
        wrapped = rnn_cell.OutputProjectionWrapper(rnn_cell.GRUCell(3), 2)
        g, new_m = wrapped(x, m)
        sess.run([tf.variables.initialize_all_variables()])

        # Feed concrete values in place of the zero tensors, keyed by name.
        feed = {
            x.name: np.array([[1., 1., 1.]]),
            m.name: np.array([[0.1, 0.1, 0.1]]),
        }
        projected, next_state = sess.run([g, new_m], feed)

        self.assertEqual(next_state.shape, (1, 3))
        # The numbers in results were not calculated, this is just a smoke
        # test.
        self.assertAllClose(projected, [[0.231907, 0.231907]])
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=tf.float32,
                                scope=None):
    """Embedding sequence-to-sequence model with attention.

    The encoder inputs are embedded with a newly created embedding (of
    shape [num_encoder_symbols x cell.input_size]) and run through an RNN;
    the output of every encoder step is kept so the decoder can attend to
    it.  The decoder inputs are embedded with a second new embedding (of
    shape [num_decoder_symbols x cell.input_size]) and fed to an attention
    decoder that starts from the last encoder state.

    Args:
      encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        NOTE(review): the inputs are embedded before use, so they are
        presumably symbol-id tensors rather than dense vectors -- confirm.
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]
        (same caveat as encoder_inputs).
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: integer; number of symbols on the encoder side.
      num_decoder_symbols: integer; number of symbols on the decoder side.
      num_heads: number of attention heads that read from attention_states.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [cell.output_size x num_decoder_symbols] and
        B has shape [num_decoder_symbols]; if provided and
        feed_previous=True, each fed previous output will first be
        multiplied by W and added B.  When None, the cell itself is wrapped
        in an output projection to num_decoder_symbols.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the
        first of decoder_inputs will be used (the "GO" symbol), and all
        other decoder inputs will be taken from previous outputs (as in
        embedding_rnn_decoder).  If False, decoder_inputs are used as given
        (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors
        with shape [batch_size x num_decoder_symbols] containing the
        generated outputs.
      states: The state of each decoder cell in each time-step. This is a
        list with length len(decoder_inputs) -- one item for each
        time-step.  Each item is a 2D Tensor of shape
        [batch_size x cell.state_size].
    """
    with tf.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder: embed the inputs and run the RNN over them.
        embedding_cell = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
        encoder_outputs, encoder_states = rnn.rnn(
            embedding_cell, encoder_inputs, dtype=dtype)

        # Stack the per-step encoder outputs along a new axis 1 so the
        # decoder can put attention on them.
        reshaped_outputs = []
        for step_output in encoder_outputs:
            reshaped_outputs.append(
                tf.reshape(step_output, [-1, 1, cell.output_size]))
        attention_states = tf.concat(1, reshaped_outputs)

        # Decoder: without an explicit projection the cell projects its own
        # outputs onto the decoder vocabulary.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        initial_state = encoder_states[-1]

        def decode(use_previous):
            """Build one attention decoder with the given feeding mode."""
            return embedding_attention_decoder(
                decoder_inputs, initial_state, attention_states, cell,
                num_decoder_symbols, num_heads, output_size,
                output_projection, use_previous)

        if isinstance(feed_previous, bool):
            return decode(feed_previous)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        fed_outputs, fed_states = decode(True)
        tf.get_variable_scope().reuse_variables()
        given_outputs, given_states = decode(False)

        outputs = tf.control_flow_ops.cond(
            feed_previous, lambda: fed_outputs, lambda: given_outputs)
        states = tf.control_flow_ops.cond(
            feed_previous, lambda: fed_states, lambda: given_states)
        return outputs, states
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                               num_symbols, output_projection=None,
                               feed_previous=False, dtype=tf.float32,
                               scope=None):
    """Embedding RNN sequence-to-sequence model with tied (shared) parameters.

    A single new embedding (of shape [num_symbols x cell.input_size]) is
    looked up for both encoder_inputs and decoder_inputs.  The embedded
    sequences are then handed to tied_rnn_seq2seq, which runs the encoder
    and a decoder initialized with the last encoder state.

    Args:
      encoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
        NOTE(review): each element is passed to tf.nn.embedding_lookup as
        ids, so these are presumably integer symbol tensors -- confirm.
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size]
        (same caveat as encoder_inputs).
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_symbols: integer; number of symbols for both encoder and decoder.
      output_projection: None or a pair (W, B) of output projection weights
        and biases; W has shape [cell.output_size x num_symbols] and B has
        shape [num_symbols]; if provided and feed_previous=True, each fed
        previous output will first be multiplied by W and added B.  When
        None, the cell itself is wrapped in an output projection to
        num_symbols.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the
        first of decoder_inputs will be used (the "GO" symbol), and all
        other decoder inputs will be taken from previous outputs (as in
        embedding_rnn_decoder).  If False, decoder_inputs are used as given
        (the standard decoder case).
      dtype: The dtype to use for the initial RNN states (default:
        tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_tied_rnn_seq2seq".

    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors
        with shape [batch_size x num_symbols] containing the generated
        outputs.
      states: The state of each decoder cell in each time-step. This is a
        list with length len(decoder_inputs) -- one item for each
        time-step.  Each item is a 2D Tensor of shape
        [batch_size x cell.state_size].

    Raises:
      ValueError: when output_projection has the wrong shape.
    """
    # Validate the projection shapes up front; the converted tensors are
    # only used for this check.
    if output_projection is not None:
        weights = tf.convert_to_tensor(output_projection[0], dtype=dtype)
        weights.get_shape().assert_is_compatible_with(
            [cell.output_size, num_symbols])
        biases = tf.convert_to_tensor(output_projection[1], dtype=dtype)
        biases.get_shape().assert_is_compatible_with([num_symbols])

    with tf.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [num_symbols, cell.input_size])

        # The same embedding is shared by both sides.
        emb_encoder_inputs = []
        for symbols in encoder_inputs:
            emb_encoder_inputs.append(
                tf.nn.embedding_lookup(embedding, symbols))
        emb_decoder_inputs = []
        for symbols in decoder_inputs:
            emb_decoder_inputs.append(
                tf.nn.embedding_lookup(embedding, symbols))

        def extract_argmax_and_embed(prev, _):
            """Loop_function that extracts the symbol from prev and embeds it."""
            if output_projection is not None:
                prev = tf.nn.xw_plus_b(
                    prev, output_projection[0], output_projection[1])
            # No gradient flows back through the argmax choice.
            best_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, best_symbol)

        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols)

        if isinstance(feed_previous, bool):
            if feed_previous:
                loop_function = extract_argmax_and_embed
            else:
                loop_function = None
            return tied_rnn_seq2seq(
                emb_encoder_inputs, emb_decoder_inputs, cell,
                loop_function=loop_function, dtype=dtype)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        fed_outputs, fed_states = tied_rnn_seq2seq(
            emb_encoder_inputs, emb_decoder_inputs, cell,
            loop_function=extract_argmax_and_embed, dtype=dtype)
        tf.get_variable_scope().reuse_variables()
        given_outputs, given_states = tied_rnn_seq2seq(
            emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype)

        outputs = tf.control_flow_ops.cond(
            feed_previous, lambda: fed_outputs, lambda: given_outputs)
        states = tf.control_flow_ops.cond(
            feed_previous, lambda: fed_states, lambda: given_states)
        return outputs, states
def __init__(self, input_size, args):
    """Build the tied seq2seq graph, its loss, summaries and training op.

    Args:
        input_size: width of every input vector.  It is also used as the
            input/output projection size of the cell and as the size of
            the identity matrix used to embed predicted symbols.
        args: options object.  This method reads ``batch_size``, ``cell``,
            ``state_size``, ``n_layers``, ``max_seq_length``, ``predict``
            and ``max_gradient_norm``.

    Raises:
        ValueError: if ``args.cell`` is not one of 'lstm', 'gru' or
            'lstm-basic'.
    """
    self.input_size = input_size
    self.args = args
    self.batch_size = args.batch_size

    # Reject unknown cell types before any graph construction.  Previously
    # this case surfaced as an UnboundLocalError on `cell_class`.
    if args.cell not in ('lstm', 'gru', 'lstm-basic'):
        raise ValueError(
            "unknown cell type {!r}; expected 'lstm', 'gru' or "
            "'lstm-basic'".format(args.cell))

    if args.cell == 'lstm':
        cell = rnn_cell.LSTMCell(
            args.state_size, input_size, num_proj=input_size)
    else:
        if args.cell == 'gru':
            cell_class = rnn_cell.GRUCell
        else:  # 'lstm-basic'
            cell_class = rnn_cell.BasicLSTMCell
        basic_cell = cell_class(args.state_size)
        # TODO - do bulk input/output projections
        cell = rnn_cell.InputProjectionWrapper(
            rnn_cell.OutputProjectionWrapper(basic_cell, input_size),
            input_size)
    if args.n_layers > 1:
        cell = rnn_cell.MultiRNNCell([cell] * args.n_layers)

    def make_inputs(prefix):
        """Create one float placeholder per time step, named prefix0..N."""
        return [
            tf.placeholder(tf.float32, shape=[None, input_size],
                           name='{}{}'.format(prefix, i))
            for i in xrange(self.args.max_seq_length)]

    # Encoder placeholders are created first, then decoder placeholders.
    self.encoder_inputs = make_inputs('encoder')
    self.decoder_inputs = make_inputs('decoder')

    # TODO - maybe also use during training,
    # to avoid building one-hot representation (just an optimization).
    # Another (maybe better) way to do is described here
    # https://www.tensorflow.org/versions/master/tutorials/mnist/tf/index.html#loss
    embeddings = tf.constant(np.eye(input_size), dtype=tf.float32)

    loop_function = None
    if args.predict:
        def loop_function(prev, _):
            # Feed back the one-hot row of the most likely symbol.
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embeddings, prev_symbol)

    self.decoder_outputs, _ = seq2seq.tied_rnn_seq2seq(
        self.encoder_inputs, self.decoder_inputs, cell,
        loop_function=loop_function)

    # TODO - add weights
    # The target for each step is the decoder input of the following step;
    # zip() below drops the last output, which has no target.
    targets = self.decoder_inputs[1:]
    step_losses = [
        tf.nn.softmax_cross_entropy_with_logits(
            logits, target, name='seq_loss_{}'.format(i))
        for i, (logits, target) in enumerate(
            zip(self.decoder_outputs, targets))]
    # FIXME - this scaling by max_seq_length does not take
    # padding into account (see also weights)
    self.decoder_loss = (1. / self.args.max_seq_length) * \
        tf.reduce_mean(tf.add_n(step_losses))
    tf.scalar_summary('train loss', self.decoder_loss)
    self.valid_loss = 1.0 * self.decoder_loss  # FIXME
    tf.scalar_summary('valid loss', self.valid_loss)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer()
    params = tf.trainable_variables()
    gradients = tf.gradients(self.decoder_loss, params)
    # Clip by global norm before applying the update.
    clipped_gradients, _norm = tf.clip_by_global_norm(
        gradients, self.args.max_gradient_norm)
    # TODO - monitor norm
    self.train_op = optimizer.apply_gradients(
        zip(clipped_gradients, params), global_step=self.global_step)
    self.summary_op = tf.merge_all_summaries()
def __init__(self, is_training, config, decode_only=False):
    """Build the recurrent VAE graph: encoder, sampling, decoder and losses.

    Args:
        is_training: when true, dropout is applied (if
            ``config.keep_prob < 1``) and the gradient/optimizer ops are
            built.
        config: hyper-parameter object.  This method reads ``batch_size``,
            ``num_steps``, ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers``, ``z_dim`` and ``max_grad_norm``; ``config`` is
            also passed through to ``vae_decoder_argmax``.
        decode_only: when true, the latent samples are drawn from a
            standard normal of shape [1, z_dim] instead of from the
            encoder, ``self._logits`` is exposed, and no training ops are
            built.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    self.is_training = is_training
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    with tf.variable_scope("cell_encoder"):
        lstm_encoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_encoder_cell = rnn_cell.DropoutWrapper(
                lstm_encoder_cell, output_keep_prob=config.keep_prob)
        cell_encoder = rnn_cell.MultiRNNCell(
            [lstm_encoder_cell] * config.num_layers)
        # this is the linear projection layer down to
        # num_encoder_symbols = 2*config.z_dim
        cell_encoder = rnn_cell.OutputProjectionWrapper(
            cell_encoder, 2 * config.z_dim)
        self._initial_state_encoder = cell_encoder.zero_state(
            batch_size, tf.float32)

    with tf.variable_scope("cell_decoder"):
        lstm_decoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_decoder_cell = rnn_cell.DropoutWrapper(
                lstm_decoder_cell, output_keep_prob=config.keep_prob)
        cell_decoder = rnn_cell.MultiRNNCell(
            [lstm_decoder_cell] * config.num_layers)
        self._initial_state_decoder = cell_decoder.zero_state(
            batch_size, tf.float32)

    with tf.device("/cpu:0"):
        with tf.variable_scope("embedding"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            # Split the embedded batch into one tensor per time step.
            inputs = tf.split(
                1, num_steps,
                tf.nn.embedding_lookup(embedding, self._input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    if is_training and config.keep_prob < 1:
        inputs = [tf.nn.dropout(input_, config.keep_prob)
                  for input_ in inputs]

    # initial inputs
    inputs_encoder = inputs
    outputs_encoder, states_encoder = rnn.rnn(
        cell_encoder, inputs_encoder,
        initial_state=self._initial_state_encoder)

    # split the outputs to mu and log_sigma
    mu_and_log_sigmas = [tf.split(1, 2, output_encoder)
                         for output_encoder in outputs_encoder]
    mus = [mu_and_log_sigma[0] for mu_and_log_sigma in mu_and_log_sigmas]
    log_sigmas = [mu_and_log_sigma[1]
                  for mu_and_log_sigma in mu_and_log_sigmas]

    # epsilon is sampled from N(0,1) for location-scale transform
    epsilons = [tf.random_normal([config.batch_size, config.z_dim],
                                 dtype=tf.float32)
                for i in range(len(log_sigmas))]

    # do the location-scale transform: z = mu + exp(log_sigma) * epsilon
    z_samples = [tf.add(mu, tf.mul(tf.exp(log_sigma), epsilon))
                 for mu, log_sigma, epsilon
                 in zip(mus, log_sigmas, epsilons)]
    if decode_only:
        # if we're decoding, just sample from a random normal
        z_samples = [tf.random_normal([1, config.z_dim], dtype=tf.float32)
                     for i in range(len(z_samples))]

    # calculate KL. equation 10 from kingma - auto-encoding variational
    # bayes: 1 + log(sigma^2) - mu^2 - sigma^2, per latent dimension.
    neg_KL_list = [tf.add_n([tf.ones_like(mu),
                             tf.log(tf.square(tf.exp(log_sigma))),
                             tf.neg(tf.square(mu)),
                             tf.neg(tf.square(tf.exp(log_sigma)))])
                   for mu, log_sigma in zip(mus, log_sigmas)]

    # multiply by 0.5
    neg_KL_list = [tf.mul(tf.constant(0.5, shape=[1, config.z_dim]), KL_term)
                   for KL_term in neg_KL_list]

    # merge the list like we merge the outputs
    neg_KL = tf.reshape(tf.concat(1, neg_KL_list), [-1, config.z_dim])

    # no pure decoding opt
    # outputs_decoder, states_decoder = rnn_decoder(decoder_inputs, self._initial_state_decoder, cell_decoder)

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])

    # concatenate z_samples with previous timesteps
    # decoder_inputs = [tf.concat(1, [single_input, z_sample]) for single_input, z_sample in zip(inputs_encoder, z_samples)]
    # outputs_decoder, states_decoder = rnn_decoder_argmax(decoder_inputs, self._initial_state_decoder, cell_decoder, vocab_size,
    #                                                      output_projection=[softmax_w, softmax_b],
    #                                                      feed_previous=True,
    #                                                      config=config)
    # refactored to be like sam's
    # The results are not used further in this method; the call is kept
    # because it adds the decoder ops to the graph.
    outputs_decoder, states_decoder = vae_decoder_argmax(
        inputs_encoder, z_samples, self._initial_state_decoder,
        cell_decoder, vocab_size,
        output_projection=[softmax_w, softmax_b],
        feed_previous=True,
        config=config)

    # final output
    # change to vanilla lstm
    # NOTE(review): the logits below are computed from the *encoder*
    # outputs, although the following comment talks about decoder outputs
    # and the encoder is projected to 2 * z_dim rather than `size` --
    # confirm this is the intended experiment.
    outputs = outputs_encoder
    # do a softmax over the vocabulary using the decoder outputs!
    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    NLL = seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])],
        vocab_size)
    NLL_scalar = tf.reduce_sum(NLL)
    KL_scalar = tf.neg(tf.reduce_sum(neg_KL))

    # here we compute the *NEGATIVE* ELBO (because we don't know how the
    # optimizer deals with negative learning rates / gradients)
    # the loss in seq2seq.sequence_loss_by_example is the cross-entropy,
    # which is the *negative* log-likelihood, so we can add it.
    neg_ELBO = KL_scalar + NLL_scalar  # / batch_size

    def normalize(tensor):
        """Scale ``tensor`` by 1 / (batch_size * num_steps) and sum it."""
        # Float literal so the factor is not truncated to 0 by integer
        # division on Python 2.
        scale = tf.constant(1.0 / (batch_size * num_steps),
                            shape=tensor.get_shape())
        return tf.reduce_sum(tf.mul(scale, tensor))

    # summaries (registered as graph ops; collected by merge_all_summaries)
    neg_ELBO_normalized = normalize(neg_ELBO)
    KL_normalized = normalize(KL_scalar)
    NLL_normalized = normalize(NLL_scalar)
    tf.scalar_summary("neg_ELBO_normalized", neg_ELBO_normalized)
    tf.scalar_summary('KL_normalized', KL_normalized)
    tf.scalar_summary('NLL_normalized', NLL_normalized)

    # expose costs, h
    self._neg_ELBO = neg_ELBO
    self._KL_scalar = KL_scalar
    self._NLL_scalar = NLL_scalar
    self._final_state = states_encoder[-1]

    if decode_only:
        self._logits = logits
        return

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False, name='learning_rate')
    tvars = tf.trainable_variables()
    grads_unclipped = tf.gradients(neg_ELBO, tvars)
    grads, _ = tf.clip_by_global_norm(grads_unclipped, config.max_grad_norm)

    # Histogram every unclipped gradient; variables without a gradient
    # (None) are skipped.
    for tvar, grad in zip(tvars, grads_unclipped):
        if grad is not None:
            tf.histogram_summary(tvar.name, grad)

    # optimizer = tf.train.GradientDescentOptimizer(self.lr)
    # NB: for adam, need to set epsilon to other than the default 1e-8,
    # otherwise get nans!
    # NOTE(review): `self.lr` is not assigned in this method (only
    # `self._lr` is); presumably a property on the class returns
    # `self._lr` -- confirm.
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, epsilon=1e-1)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    merged = tf.merge_all_summaries()
    self._merged = merged