예제 #1
0
def build_autoencoder(dpg):
    """Attach an autoencoding decoder, its loss and a training op to `dpg`.

    A GRU cell (input size `FLAGS.embedding_dim`, hidden size
    `dpg.spec.policy_dims[0]`) is wrapped with an output projection to
    `FLAGS.vocab_size` and decoded for `dpg.seq_length` steps, starting from
    the last entry of `dpg.encoder_states` and using `dpg.embeddings`.

    Args:
        dpg: model object; this function reads `spec.policy_dims`,
            `input_tokens`, `seq_length`, `encoder_states` and `embeddings`.

    Returns:
        A `(labels, loss, train_op)` tuple: the list of per-step int32 label
        placeholders, the sequence loss tensor and the Adam minimization op.
    """
    num_steps = dpg.seq_length
    vocab_size = FLAGS.vocab_size

    # Decoder cell: GRU followed by a projection onto the vocabulary.
    gru = util.GRUCell(FLAGS.embedding_dim, dpg.spec.policy_dims[0])
    decoder_cell = rnn_cell.OutputProjectionWrapper(gru, vocab_size)

    # One all-zero input per step, shaped like the first input-token tensor.
    decoder_inputs = []
    for step in range(num_steps):
        decoder_inputs.append(
            tf.zeros_like(dpg.input_tokens[0], name="adec_inp%i" % step))

    decoder_outputs, _ = util.embedding_rnn_decoder(
        decoder_inputs,
        dpg.encoder_states[-1],
        decoder_cell,
        vocab_size,
        feed_previous=True,
        embedding=dpg.embeddings,
        scope="adec")

    # Per-step targets to be fed by the caller; every step gets weight 1.
    labels = []
    for step in range(num_steps):
        labels.append(
            tf.placeholder(tf.int32, shape=(None, ), name="labels%i" % step))
    weights = []
    for step_labels in labels:
        weights.append(tf.ones_like(step_labels, dtype=tf.float32))

    loss = seq2seq.sequence_loss(decoder_outputs, labels, weights, vocab_size)

    # TODO (from the original author): "wrt what?" -- no var_list is passed,
    # so the optimizer picks the variables itself.
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)

    return labels, loss, train_op
예제 #2
0
  def testTiedRNNSeq2Seq(self):
    """Checks the count and shapes of tied_rnn_seq2seq outputs and states."""
    with self.test_session() as session:
      root_init = tf.constant_initializer(0.5)
      with tf.variable_scope("root", initializer=root_init):
        # 2 encoder steps and 3 decoder steps, each a constant [2, 2] tensor.
        encoder_in = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
        decoder_in = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
        projected_gru = rnn_cell.OutputProjectionWrapper(
            rnn_cell.GRUCell(2), 4)
        outputs, states = seq2seq.tied_rnn_seq2seq(
            encoder_in, decoder_in, projected_gru)
        session.run([tf.initialize_all_variables()])

        # One output per decoder input, projected to 4 units.
        output_values = session.run(outputs)
        self.assertEqual(len(output_values), 3)
        self.assertEqual(output_values[0].shape, (2, 4))

        # The test expects 4 state entries, each of the GRU size 2.
        state_values = session.run(states)
        self.assertEqual(len(state_values), 4)
        self.assertEqual(state_values[0].shape, (2, 2))
예제 #3
0
  def testRNNDecoder(self):
    """Checks the count and shapes of rnn_decoder outputs and states."""
    with self.test_session() as session:
      root_init = tf.constant_initializer(0.5)
      with tf.variable_scope("root", initializer=root_init):
        # Encode 2 constant [2, 2] steps; only the encoder states are kept.
        encoder_in = [tf.constant(0.5, shape=[2, 2]) for _ in xrange(2)]
        _, encoder_states = rnn.rnn(
            rnn_cell.GRUCell(2), encoder_in, dtype=tf.float32)

        # Decode 3 constant steps starting from the last encoder state.
        decoder_in = [tf.constant(0.4, shape=[2, 2]) for _ in xrange(3)]
        projected_gru = rnn_cell.OutputProjectionWrapper(
            rnn_cell.GRUCell(2), 4)
        outputs, states = seq2seq.rnn_decoder(
            decoder_in, encoder_states[-1], projected_gru)
        session.run([tf.initialize_all_variables()])

        # One output per decoder input, projected to 4 units.
        output_values = session.run(outputs)
        self.assertEqual(len(output_values), 3)
        self.assertEqual(output_values[0].shape, (2, 4))

        # The test expects 4 state entries, each of the GRU size 2.
        state_values = session.run(states)
        self.assertEqual(len(state_values), 4)
        self.assertEqual(state_values[0].shape, (2, 2))
예제 #4
0
 def testOutputProjectionWrapper(self):
     """Smoke test for a GRUCell(3) wrapped with a projection to 2 outputs."""
     with self.test_session() as session:
         with tf.variable_scope("root",
                                initializer=tf.constant_initializer(0.5)):
             inputs = tf.zeros([1, 3])
             state = tf.zeros([1, 3])
             wrapped = rnn_cell.OutputProjectionWrapper(
                 rnn_cell.GRUCell(3), 2)
             output, next_state = wrapped(inputs, state)
             session.run([tf.variables.initialize_all_variables()])

             # Override the zero tensors by feeding values under their names.
             feed = {
                 inputs.name: np.array([[1., 1., 1.]]),
                 state.name: np.array([[0.1, 0.1, 0.1]]),
             }
             output_value, state_value = session.run(
                 [output, next_state], feed)

             self.assertEqual(state_value.shape, (1, 3))
             # The expected numbers were not derived by hand; this is only a
             # smoke test.
             self.assertAllClose(output_value, [[0.231907, 0.231907]])
예제 #5
0
def embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=tf.float32,
                                scope=None):
  """Embedding sequence-to-sequence model with attention.

  The encoder wraps `cell` in an rnn_cell.EmbeddingWrapper over
  num_encoder_symbols symbols and runs it over encoder_inputs. The encoder
  output of every step is kept and stacked into the attention states. The
  decoder is an embedding_attention_decoder over num_decoder_symbols symbols,
  initialized with the last encoder state and attending to those stacked
  encoder outputs.

  Args:
    encoder_inputs: a list of Tensors, one per time step, that are embedded by
      the encoder (presumably 1D integer symbol ids of shape [batch_size] --
      confirm against rnn_cell.EmbeddingWrapper).
    decoder_inputs: a list of Tensors, one per time step, passed to
      embedding_attention_decoder (presumably integer symbol ids as well --
      confirm against that function).
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: integer; number of symbols on the encoder side.
    num_decoder_symbols: integer; number of symbols on the decoder side.
    num_heads: number of attention heads that read from attention_states.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_decoder_symbols] and B has
      shape [num_decoder_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B. If None,
      the cell is wrapped in an OutputProjectionWrapper instead.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial encoder RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_decoder_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  with tf.variable_scope(scope or "embedding_attention_seq2seq"):
    # Encoder: embed the inputs and run the RNN over them.
    embedding_encoder = rnn_cell.EmbeddingWrapper(cell, num_encoder_symbols)
    enc_outputs, enc_states = rnn.rnn(
        embedding_encoder, encoder_inputs, dtype=dtype)

    # Give every encoder output a middle axis of size 1, then join them along
    # that axis to form the tensor the decoder attends to.
    per_step_outputs = []
    for enc_output in enc_outputs:
      per_step_outputs.append(
          tf.reshape(enc_output, [-1, 1, cell.output_size]))
    attention_states = tf.concat(1, per_step_outputs)

    # Decoder: without an explicit projection, project inside the cell.
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols
    else:
      output_size = None

    def build_decoder(use_previous):
      """Builds the attention decoder with the given feed_previous value."""
      return embedding_attention_decoder(
          decoder_inputs, enc_states[-1], attention_states, cell,
          num_decoder_symbols, num_heads, output_size, output_projection,
          use_previous)

    if isinstance(feed_previous, bool):
      return build_decoder(feed_previous)

    # feed_previous is a Tensor: build both decoder variants on shared
    # variables and select between them with cond.
    fed_outputs, fed_states = build_decoder(True)
    tf.get_variable_scope().reuse_variables()
    given_outputs, given_states = build_decoder(False)

    outputs = tf.control_flow_ops.cond(
        feed_previous, lambda: fed_outputs, lambda: given_outputs)
    states = tf.control_flow_ops.cond(
        feed_previous, lambda: fed_states, lambda: given_states)
    return outputs, states
예제 #6
0
def embedding_tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                               num_symbols, output_projection=None,
                               feed_previous=False, dtype=tf.float32,
                               scope=None):
  """Embedding RNN sequence-to-sequence model with tied (shared) parameters.

  A single embedding variable of shape [num_symbols x cell.input_size] is
  created and used to look up both encoder_inputs and decoder_inputs. The
  embedded sequences are then handed to tied_rnn_seq2seq together with `cell`.

  Args:
    encoder_inputs: a list of Tensors of symbol ids, one per time step; each
      is passed to tf.nn.embedding_lookup (presumably 1D int32 of shape
      [batch_size] -- confirm with callers).
    decoder_inputs: a list of Tensors of symbol ids, one per time step,
      embedded with the same embedding as encoder_inputs.
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_symbols: integer; number of symbols for both encoder and decoder.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [cell.output_size x num_symbols] and B has
      shape [num_symbols]; if provided and feed_previous=True, each
      fed previous output will first be multiplied by W and added B. If None,
      the cell is wrapped in an OutputProjectionWrapper instead.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
      of decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype to use for the initial RNN states (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_tied_rnn_seq2seq".

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors with
      shape [batch_size x num_symbols] containing the generated outputs.
    states: The state of each decoder cell in each time-step. This is a list
      with length len(decoder_inputs) -- one item for each time-step.
      Each item is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when output_projection has the wrong shape.
  """
  # Validate the projection shapes up front, before any variable is created.
  if output_projection is not None:
    proj_weights = tf.convert_to_tensor(output_projection[0], dtype=dtype)
    expected_weights_shape = [cell.output_size, num_symbols]
    proj_weights.get_shape().assert_is_compatible_with(expected_weights_shape)
    proj_biases = tf.convert_to_tensor(output_projection[1], dtype=dtype)
    proj_biases.get_shape().assert_is_compatible_with([num_symbols])

  with tf.variable_scope(scope or "embedding_tied_rnn_seq2seq"):
    # The shared embedding is pinned to the CPU.
    with tf.device("/cpu:0"):
      embedding = tf.get_variable("embedding", [num_symbols, cell.input_size])

    emb_encoder_inputs = []
    for symbols in encoder_inputs:
      emb_encoder_inputs.append(tf.nn.embedding_lookup(embedding, symbols))
    emb_decoder_inputs = []
    for symbols in decoder_inputs:
      emb_decoder_inputs.append(tf.nn.embedding_lookup(embedding, symbols))

    def embed_previous_argmax(prev, _):
      """Loop_function that extracts the symbol from prev and embeds it."""
      if output_projection is not None:
        prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1])
      # No gradient flows back through the argmax selection.
      best_symbol = tf.stop_gradient(tf.argmax(prev, 1))
      return tf.nn.embedding_lookup(embedding, best_symbol)

    # Without an explicit projection, project inside the cell.
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_symbols)

    if isinstance(feed_previous, bool):
      if feed_previous:
        loop_function = embed_previous_argmax
      else:
        loop_function = None
      return tied_rnn_seq2seq(emb_encoder_inputs, emb_decoder_inputs, cell,
                              loop_function=loop_function, dtype=dtype)

    # feed_previous is a Tensor: build both variants on shared variables and
    # select between them with cond.
    fed_outputs, fed_states = tied_rnn_seq2seq(
        emb_encoder_inputs, emb_decoder_inputs, cell,
        loop_function=embed_previous_argmax, dtype=dtype)
    tf.get_variable_scope().reuse_variables()
    given_outputs, given_states = tied_rnn_seq2seq(
        emb_encoder_inputs, emb_decoder_inputs, cell, dtype=dtype)

    outputs = tf.control_flow_ops.cond(
        feed_previous, lambda: fed_outputs, lambda: given_outputs)
    states = tf.control_flow_ops.cond(
        feed_previous, lambda: fed_states, lambda: given_states)
    return outputs, states
예제 #7
0
    def __init__(self, input_size, args):
        """Build the tied seq2seq graph: cell, placeholders, loss, train op.

        Args:
            input_size: width of every per-step input placeholder; also the
                projection size of the cell and the size of the identity
                matrix used to turn a predicted index back into an input.
            args: options object; this constructor reads `batch_size`,
                `cell` (one of 'lstm', 'gru', 'lstm-basic'), `state_size`,
                `n_layers`, `max_seq_length`, `predict` and
                `max_gradient_norm`.

        Raises:
            ValueError: if `args.cell` is not one of the supported values.
        """
        self.input_size = input_size
        self.args = args
        self.batch_size = args.batch_size

        if args.cell == 'lstm':
            # LSTMCell does its own output projection via num_proj.
            cell = rnn_cell.LSTMCell(
                args.state_size, input_size, num_proj=input_size)
        else:
            if args.cell == 'gru':
                cell_class = rnn_cell.GRUCell
            elif args.cell == 'lstm-basic':
                cell_class = rnn_cell.BasicLSTMCell
            else:
                # Without this branch an unknown value left `cell_class`
                # unbound and failed below with an obscure UnboundLocalError.
                raise ValueError(
                    "Unsupported cell type {!r}; expected 'lstm', 'gru' "
                    "or 'lstm-basic'".format(args.cell))
            basic_cell = cell_class(args.state_size)
            # TODO - do bulk input/output projections
            cell = rnn_cell.InputProjectionWrapper(
                rnn_cell.OutputProjectionWrapper(basic_cell, input_size),
                input_size)
        if args.n_layers > 1:
            cell = rnn_cell.MultiRNNCell([cell] * args.n_layers)

        def make_placeholders(prefix):
            # One float placeholder per time step, named '<prefix><step>'.
            return [
                tf.placeholder(tf.float32, shape=[None, input_size],
                               name='{}{}'.format(prefix, i))
                for i in xrange(self.args.max_seq_length)]

        # Encoder placeholders are created before decoder ones, as before.
        self.encoder_inputs = make_placeholders('encoder')
        self.decoder_inputs = make_placeholders('decoder')

        # TODO - maybe also use during training,
        # to avoid building one-hot representation (just an optimization).
        # Another (maybe better) way to do is described here
        # https://www.tensorflow.org/versions/master/tutorials/mnist/tf/index.html#loss
        # Identity matrix: looking up index k yields the one-hot vector for k.
        embeddings = tf.constant(np.eye(input_size), dtype=tf.float32)
        loop_function = None
        if args.predict:
            def loop_function(prev, _):
                # Feed the one-hot vector of the previous argmax back in;
                # no gradient flows through the argmax selection.
                prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
                return tf.nn.embedding_lookup(embeddings, prev_symbol)
        self.decoder_outputs, _ = seq2seq.tied_rnn_seq2seq(
            self.encoder_inputs, self.decoder_inputs, cell,
            loop_function=loop_function)
        # TODO - add weights
        # The target for step i is the decoder input of step i + 1; zip below
        # therefore drops the last decoder output.
        targets = self.decoder_inputs[1:]
        # FIXME - this scaling by max_seq_length does not take
        # padding into account (see also weights)
        self.decoder_loss = (1. / self.args.max_seq_length) * \
            tf.reduce_mean(tf.add_n([
                tf.nn.softmax_cross_entropy_with_logits(
                    logits, target, name='seq_loss_{}'.format(i))
                for i, (logits, target) in enumerate(
                    zip(self.decoder_outputs, targets))]))
        tf.scalar_summary('train loss', self.decoder_loss)
        self.valid_loss = 1.0 * self.decoder_loss  # FIXME
        tf.scalar_summary('valid loss', self.valid_loss)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer()
        params = tf.trainable_variables()
        gradients = tf.gradients(self.decoder_loss, params)
        # Clip by global norm before applying the update.
        clipped_gradients, _norm = tf.clip_by_global_norm(
            gradients, self.args.max_gradient_norm)
        # TODO - monitor norm
        self.train_op = optimizer.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step)
        self.summary_op = tf.merge_all_summaries()
예제 #8
0
  def __init__(self, is_training, config, decode_only=False):
    """Build the variational encoder/decoder LSTM graph.

    Args:
      is_training: if true and config.keep_prob < 1, dropout is applied; if
        true and decode_only is false, the gradient and optimizer ops are
        built as well.
      config: hyper-parameter object; this constructor reads batch_size,
        num_steps, hidden_size, vocab_size, keep_prob, num_layers, z_dim and
        (when training) max_grad_norm.
      decode_only: if true, the z samples are drawn from a standard normal
        with a batch of 1 instead of from the encoder, the logits are exposed
        as self._logits, and no training ops are built.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    self.is_training = is_training
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    with tf.variable_scope("cell_encoder"):
      lstm_encoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
      if is_training and config.keep_prob < 1:
        lstm_encoder_cell = rnn_cell.DropoutWrapper(
            lstm_encoder_cell, output_keep_prob=config.keep_prob)
      cell_encoder = rnn_cell.MultiRNNCell([lstm_encoder_cell] * config.num_layers)

      # this is the linear projection layer down to num_encoder_symbols = 2*config.z_dim
      cell_encoder = rnn_cell.OutputProjectionWrapper(cell_encoder, 2 * config.z_dim)

      self._initial_state_encoder = cell_encoder.zero_state(batch_size, tf.float32)

    with tf.variable_scope("cell_decoder"):
      lstm_decoder_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
      if is_training and config.keep_prob < 1:
        lstm_decoder_cell = rnn_cell.DropoutWrapper(
            lstm_decoder_cell, output_keep_prob=config.keep_prob)
      cell_decoder = rnn_cell.MultiRNNCell([lstm_decoder_cell] * config.num_layers)

      self._initial_state_decoder = cell_decoder.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
      with tf.variable_scope("embedding"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
      # Split the embedded batch into one tensor per time step.
      inputs = tf.split(
          1, num_steps, tf.nn.embedding_lookup(embedding, self._input_data))
      inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    if is_training and config.keep_prob < 1:
      inputs = [tf.nn.dropout(input_, config.keep_prob) for input_ in inputs]

    # initial inputs
    inputs_encoder = inputs

    outputs_encoder, states_encoder = rnn.rnn(
        cell_encoder, inputs_encoder,
        initial_state=self._initial_state_encoder)

    # split the outputs to mu and log_sigma
    mu_and_log_sigmas = [tf.split(1, 2, output_encoder)
                         for output_encoder in outputs_encoder]
    mus = [mu_and_log_sigma[0] for mu_and_log_sigma in mu_and_log_sigmas]
    log_sigmas = [mu_and_log_sigma[1] for mu_and_log_sigma in mu_and_log_sigmas]

    # epsilon is sampled from N(0,1) for location-scale transform
    epsilons = [
        tf.random_normal([config.batch_size, config.z_dim], dtype=tf.float32)
        for _ in log_sigmas]

    # do the location-scale transform
    z_samples = [
        tf.add(mu, tf.mul(tf.exp(log_sigma), epsilon))
        for mu, log_sigma, epsilon in zip(mus, log_sigmas, epsilons)]
    if decode_only:
      # if we're decoding, just sample from a random normal
      z_samples = [tf.random_normal([1, config.z_dim], dtype=tf.float32)
                   for _ in z_samples]

    # calculate KL. equation 10 from kingma - auto-encoding variational bayes.
    # Per element: 1 + log(sigma^2) - mu^2 - sigma^2, with sigma = exp(log_sigma).
    neg_KL_list = [
        tf.add_n([tf.ones_like(mu),
                  tf.log(tf.square(tf.exp(log_sigma))),
                  tf.neg(tf.square(mu)),
                  tf.neg(tf.square(tf.exp(log_sigma)))])
        for mu, log_sigma in zip(mus, log_sigmas)]

    # multiply by 0.5
    neg_KL_list = [
        tf.mul(tf.constant(0.5, shape=[1, config.z_dim]), KL_term)
        for KL_term in neg_KL_list]

    # merge the list like we merge the outputs
    neg_KL = tf.reshape(tf.concat(1, neg_KL_list), [-1, config.z_dim])

    # no pure decoding opt
    # outputs_decoder, states_decoder = rnn_decoder(decoder_inputs, self._initial_state_decoder, cell_decoder)

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])

    # concatenate z_samples with previous timesteps
    # decoder_inputs = [tf.concat(1, [single_input, z_sample]) for single_input, z_sample in zip(inputs_encoder, z_samples)]
    # outputs_decoder, states_decoder = rnn_decoder_argmax(decoder_inputs, self._initial_state_decoder, cell_decoder, vocab_size,
    #   output_projection=[softmax_w, softmax_b],
    #   feed_previous=True,
    #   config=config)

    # refactored to be like sam's
    outputs_decoder, states_decoder = vae_decoder_argmax(
      inputs_encoder, z_samples, self._initial_state_decoder, cell_decoder, vocab_size,
      output_projection=[softmax_w, softmax_b],
      feed_previous=True,
      config=config)

    # final output
    # change to vanilla lstm
    # NOTE(review): the logits below are computed from the *encoder* outputs
    # (which are projected to 2 * z_dim), not from outputs_decoder, although
    # they are reshaped to width `size`. This looks like an experiment left in
    # place -- confirm whether outputs_decoder was intended.
    outputs = outputs_encoder

    # do a softmax over the vocabulary using the decoder outputs!
    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    logits = tf.nn.xw_plus_b(output,
                             softmax_w,
                             softmax_b)

    NLL = seq2seq.sequence_loss_by_example([logits],
                                            [tf.reshape(self._targets, [-1])],
                                            [tf.ones([batch_size * num_steps])],
                                            vocab_size)

    NLL_scalar = tf.reduce_sum(NLL)
    KL_scalar = tf.neg(tf.reduce_sum(neg_KL))

    # here we compute the *NEGATIVE* ELBO (because we don't know how the optimizer deals with negative learning rates / gradients)
    # the loss in seq2seq.sequence_loss_by_example is the cross-entropy, which is the *negative* log-likelihood, so we can add it.
    neg_ELBO = KL_scalar + NLL_scalar# / batch_size

    # grads_unclipped = tf.gradients(neg_ELBO, tvars)
    # grads, _ = tf.clip_by_global_norm(grads_unclipped,
    #                                   config.max_grad_norm)

    def normalize(tensor):
      """Scale `tensor` by 1 / (batch_size * num_steps) and sum it up."""
      # Float numerator on purpose: with integer operands this is floor
      # division under Python 2 and the scale factor would be 0.
      scale = 1.0 / (batch_size * self.num_steps)
      return tf.reduce_sum(
          tf.mul(tf.constant(scale, shape=tensor.get_shape()), tensor))

    # summaries (the ops register themselves; the return values are not
    # needed here because tf.merge_all_summaries collects them)
    neg_ELBO_normalized = normalize(neg_ELBO)
    KL_normalized = normalize(KL_scalar)
    NLL_normalized = normalize(NLL_scalar)
    tf.scalar_summary("neg_ELBO_normalized", neg_ELBO_normalized)
    tf.scalar_summary('KL_normalized', KL_normalized)
    tf.scalar_summary('NLL_normalized', NLL_normalized)

    # expose costs, h
    self._neg_ELBO = neg_ELBO
    self._KL_scalar = KL_scalar
    self._NLL_scalar = NLL_scalar
    self._final_state = states_encoder[-1]

    if decode_only:
      self._logits = logits
      return

    if not is_training:
      return

    self._lr = tf.Variable(0.0, trainable=False, name='learning_rate')
    tvars = tf.trainable_variables()
    tvar_names = [tvar.name for tvar in tvars]

    grads_unclipped = tf.gradients(neg_ELBO, tvars)
    grads, _ = tf.clip_by_global_norm(grads_unclipped,
                                      config.max_grad_norm)

    # One histogram per variable that actually receives a gradient.
    for tvar_name, grad in zip(tvar_names, grads_unclipped):
      if grad is not None:
        tf.histogram_summary(tvar_name, grad)

    # optimizer = tf.train.GradientDescentOptimizer(self.lr)
    #NB: for adam, need to set epsilon to other than the default 1e-8, otherwise get nans!
    # NOTE(review): `self.lr` is not set in this method; presumably it is a
    # property returning self._lr defined elsewhere in the class -- confirm.
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, epsilon=1e-1)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    merged = tf.merge_all_summaries()
    self._merged = merged