Example 1
  def construct_latent_tower(self, images):
    """Builds convolutional latent tower for stochastic model.

    At training time this tower generates a latent distribution (mean and std)
    conditioned on the entire video. This latent variable will be fed to the
    main tower as an extra variable to be used for future frames prediction.
    At inference time, the tower is disabled and only returns latents sampled
    from N(0,1).
    If the multi_latent flag is on, a different latent for every timestep would
    be generated.

    Args:
      images: tensor of ground truth image sequences
    Returns:
      latent_mean: predicted latent mean
      latent_std: predicted latent standard deviation
      latent_loss: loss of the latent tower
      samples: random samples drawn from a standard Gaussian
    """
    conv_size = self.tinyify([32, 64, 64])
    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
      # this allows more predicted frames at inference time
      latent_num_frames = self.hparams.latent_num_frames
      if latent_num_frames == 0:  # use all frames by default.
        latent_num_frames = (self.hparams.video_num_input_frames +
                             self.hparams.video_num_target_frames)
      tf.logging.info(
          "Creating latent tower with %d frames." % latent_num_frames)
      latent_images = tf.unstack(images[:latent_num_frames], axis=0)
      images = tf.concat(latent_images, 3)

      x = images
      x = common_layers.make_even_size(x)
      x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                     padding="SAME", activation=tf.nn.relu, name="latent_conv1")
      x = tfcl.batch_norm(x, updates_collections=None,
                          is_training=self.is_training, scope="latent_bn1")
      x = common_layers.make_even_size(x)
      x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                     padding="SAME", activation=tf.nn.relu, name="latent_conv2")
      x = tfcl.batch_norm(x, updates_collections=None,
                          is_training=self.is_training, scope="latent_bn2")
      x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                     padding="SAME", activation=tf.nn.relu, name="latent_conv3")
      x = tfcl.batch_norm(x, updates_collections=None,
                          is_training=self.is_training, scope="latent_bn3")

      nc = self.hparams.latent_channels
      mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                        padding="SAME", activation=None, name="latent_mean")
      std = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                       padding="SAME", activation=tf.nn.relu, name="latent_std")
      std += self.hparams.latent_std_min

      # No latent tower at inference time, just standard gaussian.
      if self.hparams.mode != tf.estimator.ModeKeys.TRAIN:
        return tf.zeros_like(mean), tf.zeros_like(std)

      return mean, std
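The pair returned above is consumed by the main tower with the reparameterization trick; later examples use `latent_mean + tf.exp(latent_std / 2.0) * latent` with standard-normal samples. A minimal sketch of that sampling step, assuming TF 1.x and treating `std` as a log-variance-style quantity (as that usage suggests):

import tensorflow as tf

def sample_latent(mean, std):
  # Reparameterization: draw eps ~ N(0, 1) and shift/scale it with the
  # predicted statistics, so gradients flow back into mean and std.
  eps = tf.random_normal(tf.shape(mean), 0.0, 1.0, dtype=tf.float32)
  return mean + tf.exp(std / 2.0) * eps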
Example 2
    def construct_latent_tower(self, images):
        """Builds convolutional latent tower for stochastic model.

    At training time this tower generates a latent distribution (mean and std)
    conditioned on the entire video. This latent variable will be fed to the
    main tower as an extra variable to be used for future frames prediction.
    At inference time, the tower is disabled and only returns latents sampled
    from N(0,1).
    If the multi_latent flag is on, a different latent for every timestep would
    be generated.

    Args:
      images: tensor of ground truth image sequences
    Returns:
      latent_mean: predicted latent mean
      latent_std: predicted latent standard deviation
      latent_loss: loss of the latent tower
      samples: random samples drawn from a standard Gaussian
    """
        sequence_length = len(images)

        with tf.variable_scope("latent"):
            images = tf.concat(images, 3)

            x = images
            x = common_layers.make_even_size(x)
            x = slim.conv2d(x, 32, [3, 3], stride=2, scope="latent_conv1")
            x = slim.batch_norm(x, scope="latent_bn1")
            x = common_layers.make_even_size(x)
            x = slim.conv2d(x, 64, [3, 3], stride=2, scope="latent_conv2")
            x = slim.batch_norm(x, scope="latent_bn2")
            x = slim.conv2d(x, 64, [3, 3], stride=1, scope="latent_conv3")
            x = slim.batch_norm(x, scope="latent_bn3")

            nc = self.hparams.latent_channels
            mean = slim.conv2d(x,
                               nc, [3, 3],
                               stride=2,
                               activation_fn=None,
                               scope="latent_mean")
            std = slim.conv2d(x, nc, [3, 3], stride=2, scope="latent_std")
            std += self.hparams.latent_std_min

        if self.hparams.multi_latent:
            # timestep x batch_size x latent_size
            samples = tf.random_normal([sequence_length - 1] + mean.shape,
                                       0,
                                       1,
                                       dtype=tf.float32)
        else:
            # batch_size x latent_size
            samples = tf.random_normal(tf.shape(mean), 0, 1, dtype=tf.float32)

        if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
            return mean, std, samples
        else:
            # No latent tower at inference time, just standard gaussian.
            return None, None, samples
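The docstring mentions a latent_loss, but this snippet never computes one. As an assumption about what such a regularizer usually looks like for a Gaussian posterior pulled toward N(0, 1) (treating `std` as a log-variance, consistent with the sampling used elsewhere in these examples), a short KL-divergence sketch:

import tensorflow as tf

def kl_to_standard_normal(mean, log_var):
  # KL(N(mean, exp(log_var)) || N(0, 1)) summed over the latent map and
  # averaged over the batch. Illustrative only; not taken from the snippet.
  kl = 0.5 * tf.reduce_sum(
      tf.exp(log_var) + tf.square(mean) - 1.0 - log_var, axis=[1, 2, 3])
  return tf.reduce_mean(kl)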
Example 3
    def encode(self,
               inputs,
               target_space,
               hparams,
               features=None,
               losses=None):
        """Add layers of strided convolutions on top of encoder."""
        with tf.variable_scope("downstride"):
            hparams = self.hparams
            kernel, strides = (4, 4), (2, 2)
            x = inputs
            # Down-convolutions.
            for i in range(hparams.num_compress_steps):
                x = common_layers.make_even_size(x)
                x = tf.layers.conv2d(x,
                                     hparams.hidden_size,
                                     kernel,
                                     strides=strides,
                                     padding="SAME",
                                     activation=common_layers.belu,
                                     name="conv_%d" % i)
                x = common_layers.layer_norm(x)

        encoder_output, encoder_decoder_attention_bias = super(
            TransformerSketch, self).encode(x,
                                            target_space,
                                            hparams,
                                            features=features,
                                            losses=losses)
        return encoder_output, encoder_decoder_attention_bias
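Each down-stride pads the spatial dimensions to an even size and then halves them with a stride-2 "SAME" convolution, so the output is roughly the input size divided by 2 per step. A small arithmetic sketch of that shrinkage, using a 105x80 Atari-style frame (a shape mentioned later in these examples) purely as an illustrative assumption:

def downstrided_size(dim, num_compress_steps):
  # Mirrors make_even_size followed by a stride-2 "SAME" convolution.
  for _ in range(num_compress_steps):
    dim = dim + dim % 2  # pad to even
    dim //= 2            # halve spatially
  return dim

print(downstrided_size(105, 3), downstrided_size(80, 3))  # -> 14 10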
Example 4
def conv_latent_tower(images, time_axis, latent_channels=1, min_logvar=-5,
                      is_training=False, random_latent=False, tiny_mode=False):
  """Builds convolutional latent tower for stochastic model.

  At training time this tower generates a latent distribution (mean and std)
  conditioned on the entire video. This latent variable will be fed to the
  main tower as an extra variable to be used for future frames prediction.
  At inference time, the tower is disabled and only returns latents sampled
  from N(0,1).
  If the multi_latent flag is on, a different latent for every timestep would
  be generated.

  Args:
    images: tensor of ground truth image sequences
    time_axis: the time axis in the images tensor
    latent_channels: number of latent channels
    min_logvar: minimum value for log_var
    is_training: whether or not it is training mode
    random_latent: whether or not generate random latents
    tiny_mode: whether or not it is tiny_mode
  Returns:
    latent_mean: predicted latent mean
    latent_logvar: predicted latent log variance
  """
  conv_size = tinyify([32, 64, 64], tiny_mode)
  with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
    images = tf.to_float(images)
    images = tf.unstack(images, axis=time_axis)
    images = tf.concat(images, axis=3)

    x = images
    x = common_layers.make_even_size(x)
    x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                   padding="SAME", activation=tf.nn.relu, name="latent_conv1")
    x = tfcl.layer_norm(x)
    x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                   padding="SAME", activation=tf.nn.relu, name="latent_conv2")
    x = tfcl.layer_norm(x)
    x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                   padding="SAME", activation=tf.nn.relu, name="latent_conv3")
    x = tfcl.layer_norm(x)

    nc = latent_channels
    mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                      padding="SAME", activation=None, name="latent_mean")
    logv = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                      padding="SAME", activation=tf.nn.relu, name="latent_std")
    logvar = logv + min_logvar

    # No latent tower at inference time, just standard gaussian.
    if not is_training:
      return tf.zeros_like(mean), tf.zeros_like(logvar)

    # No latent in the first phase
    ret_mean, ret_logvar = tf.cond(
        random_latent,
        lambda: (tf.zeros_like(mean), tf.zeros_like(logvar)),
        lambda: (mean, logvar))

    return ret_mean, ret_logvar
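A hedged usage sketch for the tower above, assuming a [batch, time, height, width, channels] video tensor, TF 1.x, and that the snippet's own imports (tfl, tfcl, common_layers, tinyify) are in scope; the sampling step mirrors how mean/log-variance pairs are consumed elsewhere in these examples:

import tensorflow as tf

videos = tf.placeholder(tf.float32, [8, 10, 64, 64, 3])  # illustrative shape
mean, logvar = conv_latent_tower(
    videos, time_axis=1, latent_channels=1, min_logvar=-5,
    is_training=True, random_latent=tf.constant(False), tiny_mode=False)
eps = tf.random_normal(tf.shape(mean), 0.0, 1.0, dtype=tf.float32)
latent = mean + tf.exp(logvar / 2.0) * eps  # zeros are returned when is_training=False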
Example 5
  def construct_latent_tower(self, images):
    """Builds convolutional latent tower for stochastic model.

    At training time this tower generates a latent distribution (mean and std)
    conditioned on the entire video. This latent variable will be fed to the
    main tower as an extra variable to be used for future frames prediction.
    At inference time, the tower is disabled and only returns latents sampled
    from N(0,1).
    If the multi_latent flag is on, a different latent for every timestep would
    be generated.

    Args:
      images: tensor of ground truth image sequences
    Returns:
      latent_mean: predicted latent mean
      latent_std: predicted latent standard deviation
      latent_loss: loss of the latent tower
      samples: random samples drawn from a standard Gaussian
    """
    conv_size = self.tinyify([32, 64, 64])
    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
      # this allows more predicted frames at inference time
      latent_images = images[:self.hparams.latent_num_frames]
      images = tf.concat(latent_images, 3)

      x = images
      x = common_layers.make_even_size(x)
      x = slim.conv2d(x, conv_size[0], [3, 3], stride=2, scope="latent_conv1")
      x = slim.batch_norm(x, scope="latent_bn1")
      x = common_layers.make_even_size(x)
      x = slim.conv2d(x, conv_size[1], [3, 3], stride=2, scope="latent_conv2")
      x = slim.batch_norm(x, scope="latent_bn2")
      x = slim.conv2d(x, conv_size[2], [3, 3], stride=1, scope="latent_conv3")
      x = slim.batch_norm(x, scope="latent_bn3")

      nc = self.hparams.latent_channels
      mean = slim.conv2d(
          x, nc, [3, 3], stride=2, activation_fn=None, scope="latent_mean")
      std = slim.conv2d(x, nc, [3, 3], stride=2, scope="latent_std")
      std += self.hparams.latent_std_min

      # No latent tower at inference time, just standard gaussian.
      if self.hparams.mode != tf.estimator.ModeKeys.TRAIN:
        return tf.zeros_like(mean), tf.zeros_like(std)

      return mean, std
Example 6
    def inject_latent(self, layer, features, filters):
        """Inject a deterministic latent based on the target frame."""
        del filters
        hparams = self.hparams
        final_filters = common_layers.shape_list(layer)[-1]
        filters = hparams.hidden_size
        kernel = (4, 4)

        if hparams.mode == tf.estimator.ModeKeys.PREDICT:
            layer_shape = common_layers.shape_list(layer)
            if hparams.full_latent_tower:
                rand = tf.random_uniform(layer_shape[:-1] +
                                         [hparams.bottleneck_bits])
            else:
                rand = tf.random_uniform(layer_shape[:-3] +
                                         [1, 1, hparams.bottleneck_bits])
            d = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
            z = tf.layers.dense(d, final_filters, name="unbottleneck")
            return layer + z, 0.0

        # Embed.
        frames = tf.concat([features["cur_target_frame"], features["inputs"]],
                           axis=-1)
        x = tf.layers.dense(
            frames,
            filters,
            name="latent_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        if hparams.full_latent_tower:
            for i in range(hparams.num_compress_steps):
                with tf.variable_scope("latent_downstride%d" % i):
                    x = common_layers.make_even_size(x)
                    if i < hparams.filter_double_steps:
                        filters *= 2
                    x = common_attention.add_timing_signal_nd(x)
                    x = tf.layers.conv2d(x,
                                         filters,
                                         kernel,
                                         activation=common_layers.belu,
                                         strides=(2, 2),
                                         padding="SAME")
                    x = common_layers.layer_norm(x)
        else:
            x = common_layers.double_discriminator(x)
            x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)
        x = tf.tanh(
            tf.layers.dense(x, hparams.bottleneck_bits, name="bottleneck"))
        d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            noise = tf.random_uniform(common_layers.shape_list(x))
            noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise,
                                              noise)) - 1.0
            d *= noise

        z = tf.layers.dense(d, final_filters, name="unbottleneck")
        return layer + z, 0.0
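The line `d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)` is a straight-through binarization: the forward value is sign(x) in {-1, +1}, while the backward pass behaves as if d were x. A standalone sketch of the trick (illustrative, TF 1.x):

import tensorflow as tf

x = tf.constant([-0.7, 0.2, 1.5])
hard = 2.0 * tf.to_float(tf.less(0.0, x)) - 1.0  # sign(x) in {-1, +1}
d = x + tf.stop_gradient(hard - x)               # forward: hard, backward: identity
grad = tf.gradients(tf.reduce_sum(d), x)[0]      # gradient passes straight through

with tf.Session() as sess:
  print(sess.run([d, grad]))  # [-1., 1., 1.] and [1., 1., 1.]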
Example 7
  def make_even_size(self, x):
    if not self.is1d:
      return common_layers.make_even_size(x)
    shape1 = x.get_shape().as_list()[1]
    if shape1 is not None and shape1 % 2 == 0:
      return x
    x, _ = common_layers.pad_to_same_length(
        x, x, final_length_divisible_by=2, axis=1)
    return x
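A brief illustration of what the 1-D branch accomplishes: an odd length-1 axis is padded by one position so that later stride-2 operations divide evenly. The tf.pad call below is an assumption standing in for common_layers.pad_to_same_length, shown only to make the effect concrete:

import tensorflow as tf

x = tf.zeros([4, 5, 128])                     # length 5 is odd
x_even = tf.pad(x, [[0, 0], [0, 1], [0, 0]])  # -> shape [4, 6, 128]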
Example 8
  def basic_conv_net(self, images, conv_size, scope):
    """Simple multi conv ln relu."""
    conv_size = self.tinyify(conv_size)
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
      x = images
      for i, c in enumerate(conv_size):
        if i > 0:
          x = tf.nn.relu(x)
        x = common_layers.make_even_size(x)
        x = tfl.conv2d(x, c, [3, 3], strides=(2, 2),
                       activation=None, padding="SAME", name="conv%d" % i)
        x = tfcl.layer_norm(x)
    return x
Example 9
  def encode(self, inputs, target_space, hparams, features=None, losses=None):
    """Add layers of strided convolutions on top of encoder."""
    with tf.variable_scope("downstride"):
      hparams = self.hparams
      kernel, strides = (4, 4), (2, 2)
      x = inputs
      # Down-convolutions.
      for i in range(hparams.num_compress_steps):
        x = common_layers.make_even_size(x)
        x = tf.layers.conv2d(
            x, hparams.hidden_size, kernel, strides=strides,
            padding="SAME", activation=common_layers.belu, name="conv_%d" % i)
        x = common_layers.layer_norm(x)

    encoder_output, encoder_decoder_attention_bias = super(
        TransformerSketch, self).encode(
            x, target_space, hparams, features=features, losses=losses)
    return encoder_output, encoder_decoder_attention_bias
Example 10
    def construct_model(self,
                        images,
                        actions,
                        rewards,
                        k=-1,
                        num_masks=10,
                        cdna=True,
                        dna=False,
                        context_frames=2):
        """Build convolutional lstm video predictor using CDNA, or DNA.

    Args:
      images: list of tensors of ground truth image sequences
              there should be a 4D image ?xWxHxC for each timestep
      actions: list of action tensors
               each action should be in the shape ?x1xZ
      rewards: list of reward tensors
               each reward should be in the shape ?x1xZ
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      num_masks: the number of different pixel motion predictions (and
                 the number of masks for each of those predictions)
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
                      feeding in own predictions
    Returns:
      gen_images: predicted future image frames
      gen_rewards: predicted future rewards
      latent_mean: mean of approximated posterior
      latent_std: std of approximated posterior

    Raises:
      ValueError: if more than one network option specified or more than 1 mask
      specified for DNA model.
    """
        # Each image is being used twice, in latent tower and main tower.
        # This is to make sure we are using the *same* image for both, ...
        # ... given how TF queues work.
        images = [tf.identity(image) for image in images]

        if cdna + dna != 1:
            raise ValueError("More than one, or no network option specified.")

        img_height, img_width, color_channels = self.hparams.problem.frame_shape
        batch_size = common_layers.shape_list(images[0])[0]

        # Predicted images and rewards.
        gen_rewards, gen_images = [], []

        if k == -1:
            feedself = True
        else:
            # Scheduled sampling:
            # Calculate number of ground-truth frames to pass in.
            iter_num = tf.train.get_global_step()
            # TODO(mbz): what should it be if it's undefined?
            if iter_num is None:
                iter_num = _LARGE_STEP_NUMBER
            num_ground_truth = tf.to_int32(
                tf.round(
                    tf.to_float(batch_size) *
                    (k /
                     (k + tf.exp(tf.to_float(iter_num) / tf.to_float(k))))))
            feedself = False

        # LSTM state sizes and states.
        lstm_size = np.array([32, 32, 64, 64, 128, 64, 32], dtype=np.int32)
        lstm_state = [None] * 7

        # Latent tower
        if self.hparams.stochastic_model:
            latent_tower_outputs = self.construct_latent_tower(images)
            latent_mean, latent_std, samples = latent_tower_outputs

        # Main tower
        layer_norm = tf.contrib.layers.layer_norm
        lstm_func = self.conv_lstm_2d

        for timestep, image, action, reward in zip(range(len(images) - 1),
                                                   images[:-1], actions[:-1],
                                                   rewards[:-1]):

            done_warm_start = len(gen_images) > context_frames - 1
            with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
                if feedself and done_warm_start:
                    # Feed in generated image.
                    prev_image = gen_images[-1]
                    prev_reward = gen_rewards[-1]
                elif done_warm_start:
                    # Scheduled sampling
                    prev_image = self.scheduled_sample(image, gen_images[-1],
                                                       batch_size,
                                                       num_ground_truth)
                    prev_reward = self.scheduled_sample(
                        reward, gen_rewards[-1], batch_size, num_ground_truth)
                else:
                    # Always feed in ground_truth
                    prev_image = image
                    prev_reward = reward

                prev_image = common_layers.make_even_size(prev_image)
                enc0 = slim.layers.conv2d(
                    prev_image,
                    32, [5, 5],
                    stride=2,
                    scope="scale1_conv1",
                    normalizer_fn=layer_norm,
                    normalizer_params={"scope": "layer_norm1"})

                hidden1, lstm_state[0] = lstm_func(enc0,
                                                   lstm_state[0],
                                                   lstm_size[0],
                                                   scope="state1")
                hidden1 = layer_norm(hidden1, scope="layer_norm2")
                hidden2, lstm_state[1] = lstm_func(hidden1,
                                                   lstm_state[1],
                                                   lstm_size[1],
                                                   scope="state2")
                hidden2 = layer_norm(hidden2, scope="layer_norm3")
                hidden2 = common_layers.make_even_size(hidden2)
                enc1 = slim.layers.conv2d(hidden2,
                                          hidden2.get_shape()[3], [3, 3],
                                          stride=2,
                                          scope="conv2")

                hidden3, lstm_state[2] = lstm_func(enc1,
                                                   lstm_state[2],
                                                   lstm_size[2],
                                                   scope="state3")
                hidden3 = layer_norm(hidden3, scope="layer_norm4")
                hidden4, lstm_state[3] = lstm_func(hidden3,
                                                   lstm_state[3],
                                                   lstm_size[3],
                                                   scope="state4")
                hidden4 = layer_norm(hidden4, scope="layer_norm5")
                hidden4 = common_layers.make_even_size(hidden4)
                enc2 = slim.layers.conv2d(hidden4,
                                          hidden4.get_shape()[3], [3, 3],
                                          stride=2,
                                          scope="conv3")

                # Pass in reward and action.
                emb_action = self.encode_to_shape(action, enc2.get_shape())
                emb_reward = self.encode_to_shape(prev_reward,
                                                  enc2.get_shape())
                enc2 = tf.concat(axis=3, values=[enc2, emb_action, emb_reward])

                # Setup latent
                if self.hparams.stochastic_model:
                    latent = samples
                    if self.hparams.multi_latent:
                        latent = samples[timestep]
                    if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
                        # TODO(mbz): put 1st stage of training back in if necessary
                        latent = latent_mean + tf.exp(
                            latent_std / 2.0) * latent
                    with tf.control_dependencies([latent]):
                        enc2 = tf.concat([enc2, latent], 3)

                enc3 = slim.layers.conv2d(enc2,
                                          hidden4.get_shape()[3], [1, 1],
                                          stride=1,
                                          scope="conv4")

                hidden5, lstm_state[4] = lstm_func(enc3,
                                                   lstm_state[4],
                                                   lstm_size[4],
                                                   scope="state5")  # last 8x8
                hidden5 = layer_norm(hidden5, scope="layer_norm6")
                enc4 = slim.layers.conv2d_transpose(hidden5,
                                                    hidden5.get_shape()[3],
                                                    3,
                                                    stride=2,
                                                    scope="convt1")

                enc1_shape = common_layers.shape_list(enc1)
                enc4 = enc4[:, :enc1_shape[1], :
                            enc1_shape[2], :]  # Cut to shape.
                hidden6, lstm_state[5] = lstm_func(enc4,
                                                   lstm_state[5],
                                                   lstm_size[5],
                                                   scope="state6")  # 16x16
                hidden6 = layer_norm(hidden6, scope="layer_norm7")
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6,
                                                    enc1])  # both 16x16

                enc5 = slim.layers.conv2d_transpose(hidden6,
                                                    hidden6.get_shape()[3],
                                                    3,
                                                    stride=2,
                                                    scope="convt2")
                enc0_shape = common_layers.shape_list(enc0)
                enc5 = enc5[:, :enc0_shape[1], :
                            enc0_shape[2], :]  # Cut to shape.
                hidden7, lstm_state[6] = lstm_func(enc5,
                                                   lstm_state[6],
                                                   lstm_size[6],
                                                   scope="state7")  # 32x32
                hidden7 = layer_norm(hidden7, scope="layer_norm8")

                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7,
                                                    enc0])  # both 32x32

                enc6 = slim.layers.conv2d_transpose(
                    hidden7,
                    hidden7.get_shape()[3],
                    3,
                    stride=2,
                    scope="convt3",
                    activation_fn=None,
                    normalizer_fn=layer_norm,
                    normalizer_params={"scope": "layer_norm9"})

                if dna:
                    # Using largest hidden state for predicting untied conv kernels.
                    enc7 = slim.layers.conv2d_transpose(
                        enc6,
                        self.hparams.dna_kernel_size**2,
                        1,
                        stride=1,
                        scope="convt4",
                        activation_fn=None)
                else:
                    # Using largest hidden state for predicting a new image layer.
                    enc7 = slim.layers.conv2d_transpose(enc6,
                                                        color_channels,
                                                        1,
                                                        stride=1,
                                                        scope="convt4",
                                                        activation_fn=None)
                    # This allows the network to also generate one image from scratch,
                    # which is useful when regions of the image become unoccluded.
                    transformed = [tf.nn.sigmoid(enc7)]

                if cdna:
                    # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                    cdna_input = tf.contrib.layers.flatten(hidden5)
                    transformed += self.cdna_transformation(
                        prev_image, cdna_input, num_masks, int(color_channels))
                elif dna:
                    # Only one mask is supported (more should be unnecessary).
                    if num_masks != 1:
                        raise ValueError(
                            "Only one mask is supported for DNA model.")
                    transformed = [self.dna_transformation(prev_image, enc7)]

                masks = slim.layers.conv2d_transpose(enc6,
                                                     num_masks + 1,
                                                     1,
                                                     stride=1,
                                                     scope="convt7",
                                                     activation_fn=None)
                masks = tf.reshape(
                    tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                        batch_size,
                        int(img_height),
                        int(img_width), num_masks + 1
                    ])
                mask_list = tf.split(axis=3,
                                     num_or_size_splits=num_masks + 1,
                                     value=masks)
                output = mask_list[0] * prev_image
                for layer, mask in zip(transformed, mask_list[1:]):
                    output += layer * mask
                gen_images.append(output)

                p_reward = self.reward_prediction(hidden5)
                p_reward = self.decode_to_shape(p_reward, reward.shape)

                gen_rewards.append(p_reward)

        return gen_images, gen_rewards, latent_mean, latent_std
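The scheduled-sampling expression above, `num_ground_truth = round(batch_size * k / (k + exp(iter_num / k)))`, is an inverse-sigmoid decay: early in training nearly every input frame is ground truth, and the count falls toward zero as the global step grows. A plain-Python sketch with illustrative values (the k and steps below are assumptions, not hparams from the snippet):

import math

def num_ground_truth(batch_size, k, iter_num):
  # Inverse-sigmoid decay used for scheduled sampling above.
  return int(round(batch_size * k / (k + math.exp(iter_num / k))))

for step in [0, 1000, 5000, 10000, 20000]:
  print(step, num_ground_truth(batch_size=32, k=900.0, iter_num=step))
# Stays near the full batch size early on, then decays toward 0.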
Example 11
    def next_frame(self, frames, actions, rewards, target_frame,
                   internal_states, video_extra):
        del rewards, video_extra

        hparams = self.hparams
        filters = hparams.hidden_size
        kernel2 = (4, 4)

        # Embed the inputs.
        stacked_frames = tf.concat(frames, axis=-1)
        inputs_shape = common_layers.shape_list(stacked_frames)
        # Using non-zero bias initializer below for edge cases of uniform inputs.
        x = tf.layers.dense(
            stacked_frames,
            filters,
            name="inputs_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        # Down-stride.
        layer_inputs = [x]
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("downstride%d" % i):
                layer_inputs.append(x)
                x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tf.layers.conv2d(x,
                                     filters,
                                     kernel2,
                                     activation=common_layers.belu,
                                     strides=(2, 2),
                                     padding="SAME")
                x = common_layers.layer_norm(x)

        # Add embedded action if present.
        if self.has_actions:
            action = actions[-1]
            x = common_video.inject_additional_input(x, action, "action_enc",
                                                     hparams.action_injection)

        # Inject latent if present. Only for stochastic models.
        x, extra_loss = self.inject_latent(x, frames, target_frame)

        x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
        x, internal_states = self.middle_network(x, internal_states)

        # Up-convolve.
        layer_inputs = list(reversed(layer_inputs))
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("upstride%d" % i):
                x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
                if self.has_actions:
                    x = common_video.inject_additional_input(
                        x, action, "action_enc", hparams.action_injection)
                if i >= hparams.num_compress_steps - hparams.filter_double_steps:
                    filters //= 2
                x = tf.layers.conv2d_transpose(x,
                                               filters,
                                               kernel2,
                                               activation=common_layers.belu,
                                               strides=(2, 2),
                                               padding="SAME")
                y = layer_inputs[i]
                shape = common_layers.shape_list(y)
                x = x[:, :shape[1], :shape[2], :]
                x = common_layers.layer_norm(x + y)
                x = common_attention.add_timing_signal_nd(x)

        # Cut down to original size.
        x = x[:, :inputs_shape[1], :inputs_shape[2], :]
        x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
        if self.is_per_pixel_softmax:
            x = tf.layers.dense(x,
                                hparams.problem.num_channels * 256,
                                name="logits")
        else:
            x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

        # No reward prediction if not needed.
        if not self.has_rewards:
            return x, None, extra_loss, internal_states

        # Reward prediction based on middle and final logits.
        reward_pred = tf.concat([x_mid, x_fin], axis=-1)
        reward_pred = tf.nn.relu(
            tf.layers.dense(reward_pred, 128, name="reward_pred"))
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
        return x, reward_pred, extra_loss, internal_states
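In the reward head above, x_mid and x_fin are spatial means of shape [batch, 1, 1, filters]; after the concat, dense layer, and two squeezes the prediction is a [batch, 128] feature. A shape walkthrough with an assumed filter count of 96 (TF 1.x):

import tensorflow as tf

x_mid = tf.zeros([4, 1, 1, 96])
x_fin = tf.zeros([4, 1, 1, 96])
reward_pred = tf.concat([x_mid, x_fin], axis=-1)             # [4, 1, 1, 192]
reward_pred = tf.nn.relu(tf.layers.dense(reward_pred, 128))  # [4, 1, 1, 128]
reward_pred = tf.squeeze(tf.squeeze(reward_pred, axis=1), axis=1)
print(reward_pred.shape)  # (4, 128)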
Example 12
    def body_single(self, features):
        hparams = self.hparams
        filters = hparams.hidden_size
        kernel1, kernel2 = (3, 3), (4, 4)

        # Embed the inputs.
        inputs_shape = common_layers.shape_list(features["inputs"])
        # Using non-zero bias initializer below for edge cases of uniform inputs.
        x = tf.layers.dense(
            features["inputs"],
            filters,
            name="inputs_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        # Down-stride.
        layer_inputs = [x]
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("downstride%d" % i):
                layer_inputs.append(x)
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tf.layers.conv2d(x,
                                     filters,
                                     kernel2,
                                     activation=common_layers.belu,
                                     strides=(2, 2),
                                     padding="SAME")
                x = common_layers.layer_norm(x)

        # Add embedded action if present.
        if "input_action" in features:
            action = features["input_action"][:, -1, :]
            x = self.inject_additional_input(x, action, "action_enc",
                                             hparams.action_injection)

        x, extra_loss = self.inject_latent(x, features, filters)

        # Run a stack of convolutions.
        for i in range(hparams.num_hidden_layers):
            with tf.variable_scope("layer%d" % i):
                y = tf.nn.dropout(x, 1.0 - hparams.dropout)
                y = tf.layers.conv2d(y,
                                     filters,
                                     kernel1,
                                     activation=common_layers.belu,
                                     strides=(1, 1),
                                     padding="SAME")
                if i == 0:
                    x = y
                else:
                    x = common_layers.layer_norm(x + y)

        # Up-convolve.
        layer_inputs = list(reversed(layer_inputs))
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("upstride%d" % i):
                if "input_action" in features:
                    x = self.inject_additional_input(x, action, "action_enc",
                                                     hparams.action_injection)
                if i >= hparams.num_compress_steps - hparams.filter_double_steps:
                    filters //= 2
                x = tf.layers.conv2d_transpose(x,
                                               filters,
                                               kernel2,
                                               activation=common_layers.belu,
                                               strides=(2, 2),
                                               padding="SAME")
                y = layer_inputs[i]
                shape = common_layers.shape_list(y)
                x = x[:, :shape[1], :shape[2], :]
                x = common_layers.layer_norm(x + y)
                x = common_attention.add_timing_signal_nd(x)

        # Cut down to original size.
        x = x[:, :inputs_shape[1], :inputs_shape[2], :]
        if self.is_per_pixel_softmax:
            x = tf.layers.dense(x,
                                hparams.problem.num_channels * 256,
                                name="logits")
        else:
            x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

        # Reward prediction if needed.
        if "target_reward" not in features:
            return x
        reward_pred = tf.expand_dims(  # Add a fake channels dim.
            tf.reduce_mean(x, axis=[1, 2], keepdims=True),
            axis=3)
        return {"targets": x, "target_reward": reward_pred}, extra_loss
Example 13
  def next_frame(self, frames, actions, rewards, target_frame,
                 internal_states, video_extra):
    del rewards, video_extra

    hparams = self.hparams
    filters = hparams.hidden_size
    kernel2 = (4, 4)
    action = actions[-1]

    # Stack the inputs.
    if internal_states is not None and hparams.concat_internal_states:
      # Use the first part of the first internal state if asked to concatenate.
      batch_size = common_layers.shape_list(frames[0])[0]
      internal_state = internal_states[0][0][:batch_size, :, :, :]
      stacked_frames = tf.concat(frames + [internal_state], axis=-1)
    else:
      stacked_frames = tf.concat(frames, axis=-1)
    inputs_shape = common_layers.shape_list(stacked_frames)

    # Update internal states early if requested.
    if hparams.concat_internal_states:
      internal_states = self.update_internal_states_early(
          internal_states, frames)

    # Using non-zero bias initializer below for edge cases of uniform inputs.
    x = tf.layers.dense(
        stacked_frames, filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Down-stride.
    layer_inputs = [x]
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("downstride%d" % i):
        layer_inputs.append(x)
        x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
        x = common_layers.make_even_size(x)
        if i < hparams.filter_double_steps:
          filters *= 2
        x = common_attention.add_timing_signal_nd(x)
        x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu,
                             strides=(2, 2), padding="SAME")
        x = common_layers.layer_norm(x)

    # Add embedded action if present.
    if self.has_actions:
      x = common_video.inject_additional_input(
          x, action, "action_enc", hparams.action_injection)

    # Inject latent if present. Only for stochastic models.
    x, extra_loss = self.inject_latent(x, frames, target_frame, action)

    x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    x, internal_states = self.middle_network(x, internal_states)

    # Up-convolve.
    layer_inputs = list(reversed(layer_inputs))
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("upstride%d" % i):
        x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
        if self.has_actions:
          x = common_video.inject_additional_input(
              x, action, "action_enc", hparams.action_injection)
        if i >= hparams.num_compress_steps - hparams.filter_double_steps:
          filters //= 2
        x = tf.layers.conv2d_transpose(
            x, filters, kernel2, activation=common_layers.belu,
            strides=(2, 2), padding="SAME")
        y = layer_inputs[i]
        shape = common_layers.shape_list(y)
        x = x[:, :shape[1], :shape[2], :]
        x = common_layers.layer_norm(x + y)
        x = common_attention.add_timing_signal_nd(x)

    # Cut down to original size.
    x = x[:, :inputs_shape[1], :inputs_shape[2], :]
    x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    if self.is_per_pixel_softmax:
      x = tf.layers.dense(x, hparams.problem.num_channels * 256, name="logits")
    else:
      x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

    # No reward prediction if not needed.
    if not self.has_rewards:
      return x, None, extra_loss, internal_states

    # Reward prediction based on middle and final logits.
    reward_pred = tf.concat([x_mid, x_fin], axis=-1)
    reward_pred = tf.nn.relu(tf.layers.dense(
        reward_pred, 128, name="reward_pred"))
    reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
    reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
    return x, reward_pred, extra_loss, internal_states
Example 14
  def construct_model(self,
                      images,
                      actions,
                      states,
                      k=-1,
                      use_state=False,
                      num_masks=10,
                      cdna=True,
                      dna=False,
                      context_frames=2):
    """Build convolutional lstm video predictor using CDNA, or DNA.

    Args:
      images: tensor of ground truth image sequences
      actions: tensor of action sequences
      states: tensor of ground truth state sequences
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to include state and action in prediction
      num_masks: the number of different pixel motion predictions (and
                 the number of masks for each of those predictions)
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
                      feeding in own predictions
    Returns:
      gen_images: predicted future image frames
      gen_states: predicted future states

    Raises:
      ValueError: if more than one network option specified or more than 1 mask
      specified for DNA model.
    """
    # Each image is being used twice, in latent tower and main tower.
    # This is to make sure we are using the *same* image for both, ...
    # ... given how TF queues work.
    images = [tf.identity(image) for image in images]

    if cdna + dna != 1:
      raise ValueError("More than one, or no network option specified.")

    img_height, img_width, color_channels = self.hparams.problem.frame_shape
    batch_size = common_layers.shape_list(images[0])[0]
    lstm_func = self.basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images = [], []
    current_state = states[0]

    if k == -1:
      feedself = True
    else:
      # Scheduled sampling:
      # Calculate number of ground-truth frames to pass in.
      iter_num = tf.train.get_or_create_global_step()
      num_ground_truth = tf.to_int32(
          tf.round(
              tf.to_float(batch_size) *
              (k / (k + tf.exp(tf.to_float(iter_num) / tf.to_float(k))))))
      feedself = False

    # LSTM state sizes and states.
    lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    # Latent tower
    if self.hparams.stochastic_model:
      latent_tower_outputs = self.construct_latent_tower(images)
      latent_mean, latent_std, samples = latent_tower_outputs

    # Main tower
    timestep = 0
    layer_norm = tf.contrib.layers.layer_norm

    for image, action in zip(images[:-1], actions[:-1]):
      # Reuse variables after the first timestep.
      reuse = bool(gen_images)

      done_warm_start = len(gen_images) > context_frames - 1
      with slim.arg_scope(
          [
              lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
              layer_norm, slim.layers.conv2d_transpose
          ],
          reuse=reuse):

        if feedself and done_warm_start:
          # Feed in generated image.
          prev_image = gen_images[-1]
        elif done_warm_start:
          # Scheduled sampling
          prev_image = self.scheduled_sample(
              image, gen_images[-1], self.hparams.batch_size, num_ground_truth)
        else:
          # Always feed in ground_truth
          prev_image = image

        # Predicted state is always fed back in
        state_action = tf.concat(axis=1, values=[action, current_state])

        prev_image = common_layers.make_even_size(prev_image)
        enc0 = slim.layers.conv2d(
            prev_image,
            32, [5, 5],
            stride=2,
            scope="scale1_conv1",
            normalizer_fn=layer_norm,
            normalizer_params={"scope": "layer_norm1"})

        hidden1, lstm_state1 = lstm_func(
            enc0, lstm_state1, lstm_size[0], scope="state1")
        hidden1 = layer_norm(hidden1, scope="layer_norm2")
        hidden2, lstm_state2 = lstm_func(
            hidden1, lstm_state2, lstm_size[1], scope="state2")
        hidden2 = layer_norm(hidden2, scope="layer_norm3")
        hidden2 = common_layers.make_even_size(hidden2)
        enc1 = slim.layers.conv2d(
            hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope="conv2")

        hidden3, lstm_state3 = lstm_func(
            enc1, lstm_state3, lstm_size[2], scope="state3")
        hidden3 = layer_norm(hidden3, scope="layer_norm4")
        hidden4, lstm_state4 = lstm_func(
            hidden3, lstm_state4, lstm_size[3], scope="state4")
        hidden4 = layer_norm(hidden4, scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        enc2 = slim.layers.conv2d(
            hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope="conv3")

        # Pass in state and action.
        smear = tf.reshape(
            state_action,
            [-1, 1, 1, int(common_layers.shape_list(state_action)[1])])
        enc2_shape = common_layers.shape_list(enc2)
        smear = tf.tile(
            smear, [1, enc2_shape[1], enc2_shape[2], 1])
        if use_state:
          enc2 = tf.concat(axis=3, values=[enc2, smear])

        # Setup latent
        if self.hparams.stochastic_model:
          latent = samples
          if self.hparams.multi_latent:
            latent = samples[timestep]
          if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
            # TODO(mbz): put 1st stage of training back in if necessary
            latent = latent_mean + tf.exp(latent_std / 2.0) * latent
          with tf.control_dependencies([latent]):
            enc2 = tf.concat([enc2, latent], 3)

        enc3 = slim.layers.conv2d(
            enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

        hidden5, lstm_state5 = lstm_func(
            enc3, lstm_state5, lstm_size[4], scope="state5")  # last 8x8
        hidden5 = layer_norm(hidden5, scope="layer_norm6")
        enc4 = slim.layers.conv2d_transpose(
            hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")

        enc1_shape = common_layers.shape_list(enc1)
        enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]  # Cut to shape.
        hidden6, lstm_state6 = lstm_func(
            enc4, lstm_state6, lstm_size[5], scope="state6")  # 16x16
        hidden6 = layer_norm(hidden6, scope="layer_norm7")
        # Skip connection.
        hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

        enc5 = slim.layers.conv2d_transpose(
            hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")
        enc0_shape = common_layers.shape_list(enc0)
        enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]  # Cut to shape.
        hidden7, lstm_state7 = lstm_func(
            enc5, lstm_state7, lstm_size[6], scope="state7")  # 32x32
        hidden7 = layer_norm(hidden7, scope="layer_norm8")

        # Skip connection.
        hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

        enc6 = slim.layers.conv2d_transpose(
            hidden7,
            hidden7.get_shape()[3],
            3,
            stride=2,
            scope="convt3",
            activation_fn=None,
            normalizer_fn=layer_norm,
            normalizer_params={"scope": "layer_norm9"})

        if dna:
          # Using largest hidden state for predicting untied conv kernels.
          enc7 = slim.layers.conv2d_transpose(
              enc6,
              self.hparams.dna_kernel_size**2,
              1,
              stride=1,
              scope="convt4",
              activation_fn=None)
        else:
          # Using largest hidden state for predicting a new image layer.
          enc7 = slim.layers.conv2d_transpose(
              enc6,
              color_channels,
              1,
              stride=1,
              scope="convt4",
              activation_fn=None)
          # This allows the network to also generate one image from scratch,
          # which is useful when regions of the image become unoccluded.
          transformed = [tf.nn.sigmoid(enc7)]

        if cdna:
          # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
          cdna_input = tf.contrib.layers.flatten(hidden5)
          transformed += self.cdna_transformation(
              prev_image, cdna_input, num_masks, int(color_channels))
        elif dna:
          # Only one mask is supported (more should be unnecessary).
          if num_masks != 1:
            raise ValueError("Only one mask is supported for DNA model.")
          transformed = [self.dna_transformation(prev_image, enc7)]

        masks = slim.layers.conv2d_transpose(
            enc6, num_masks + 1, 1,
            stride=1, scope="convt7", activation_fn=None)
        masks = tf.reshape(
            tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
            [batch_size,
             int(img_height),
             int(img_width), num_masks + 1])
        mask_list = tf.split(
            axis=3, num_or_size_splits=num_masks + 1, value=masks)
        output = mask_list[0] * prev_image
        for layer, mask in zip(transformed, mask_list[1:]):
          output += layer * mask
        gen_images.append(output)

        current_state = slim.layers.fully_connected(
            state_action,
            int(current_state.get_shape()[1]),
            scope="state_pred",
            activation_fn=None)
        gen_states.append(current_state)
        timestep += 1

    return gen_images, gen_states, latent_mean, latent_std
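The smear step above broadcasts the concatenated state/action vector across the spatial grid of enc2 so it can be concatenated channel-wise. A standalone sketch of that reshape-and-tile pattern with illustrative shapes:

import tensorflow as tf

state_action = tf.zeros([8, 12])                  # [batch, state + action dims]
enc2 = tf.zeros([8, 8, 8, 64])                    # [batch, height, width, channels]
smear = tf.reshape(state_action, [-1, 1, 1, 12])  # [8, 1, 1, 12]
smear = tf.tile(smear, [1, 8, 8, 1])              # [8, 8, 8, 12]
enc2 = tf.concat(axis=3, values=[enc2, smear])    # [8, 8, 8, 76]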
Example 15
    def next_frame(self, frames, actions, rewards, target_frame,
                   internal_states, video_extra):
        del rewards, video_extra

        hparams = self.hparams
        filters = hparams.hidden_size
        kernel2 = (4, 4)
        action = actions[-1]
        activation_fn = common_layers.belu
        if self.hparams.activation_fn == "relu":
            activation_fn = tf.nn.relu

        # Normalize frames.
        frames = [common_layers.standardize_images(f) for f in frames]

        # Stack the inputs.
        if internal_states is not None and hparams.concat_internal_states:
            # Use the first part of the first internal state if asked to concatenate.
            batch_size = common_layers.shape_list(frames[0])[0]
            internal_state = internal_states[0][0][:batch_size, :, :, :]
            stacked_frames = tf.concat(frames + [internal_state], axis=-1)
        else:
            stacked_frames = tf.concat(frames, axis=-1)
        inputs_shape = common_layers.shape_list(stacked_frames)

        # Update internal states early if requested.
        if hparams.concat_internal_states:
            internal_states = self.update_internal_states_early(
                internal_states, frames)

        # Using non-zero bias initializer below for edge cases of uniform inputs.
        x = tf.layers.dense(
            stacked_frames,
            filters,
            name="inputs_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        # Down-stride.
        layer_inputs = [x]
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("downstride%d" % i):
                layer_inputs.append(x)
                x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tf.layers.conv2d(x,
                                     filters,
                                     kernel2,
                                     activation=activation_fn,
                                     strides=(2, 2),
                                     padding="SAME")
                x = common_layers.layer_norm(x)

        if self.has_actions:
            with tf.variable_scope("policy"):
                x_flat = tf.layers.flatten(x)
                policy_pred = tf.layers.dense(x_flat,
                                              self.hparams.problem.num_actions)
                value_pred = tf.layers.dense(x_flat, 1)
                value_pred = tf.squeeze(value_pred, axis=-1)
        else:
            policy_pred, value_pred = None, None

        # Add embedded action if present.
        if self.has_actions:
            x = common_video.inject_additional_input(x, action, "action_enc",
                                                     hparams.action_injection)

        # Inject latent if present. Only for stochastic models.
        norm_target_frame = common_layers.standardize_images(target_frame)
        x, extra_loss = self.inject_latent(x, frames, norm_target_frame,
                                           action)

        x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
        x, internal_states = self.middle_network(x, internal_states)

        # Up-convolve.
        layer_inputs = list(reversed(layer_inputs))
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("upstride%d" % i):
                x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
                if self.has_actions:
                    x = common_video.inject_additional_input(
                        x, action, "action_enc", hparams.action_injection)
                if i >= hparams.num_compress_steps - hparams.filter_double_steps:
                    filters //= 2
                x = tf.layers.conv2d_transpose(x,
                                               filters,
                                               kernel2,
                                               activation=activation_fn,
                                               strides=(2, 2),
                                               padding="SAME")
                y = layer_inputs[i]
                shape = common_layers.shape_list(y)
                x = x[:, :shape[1], :shape[2], :]
                x = common_layers.layer_norm(x + y)
                x = common_attention.add_timing_signal_nd(x)

        # Cut down to original size.
        x = x[:, :inputs_shape[1], :inputs_shape[2], :]
        x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
        if hparams.do_autoregressive_rnn:
            # If enabled, we predict the target frame autoregressively using RNNs.
            # To this end, the current prediction is flattened into one long sequence
            # of sub-pixels, and so is the target frame. Each sub-pixel (an RGB value
            # from 0 to 255) is predicted with an RNN. To avoid doing as many steps
            # as width * height * channels, we only condition on a limited number of
            # preceding pixels, given by hparams.autoregressive_rnn_lookback.
            with tf.variable_scope("autoregressive_rnn"):
                batch_size = common_layers.shape_list(frames[0])[0]
                # Height, width, channels and lookback are the constants we need.
                h, w = inputs_shape[1], inputs_shape[2]  # 105, 80 on Atari games
                c = hparams.problem.num_channels
                lookback = hparams.autoregressive_rnn_lookback
                assert (h * w) % lookback == 0, (
                    "Lookback must divide the number of pixels.")
                m = (h * w) // lookback  # Batch size multiplier for the RNN.
                # These are logits that will be used as inputs to the RNN.
                rnn_inputs = tf.layers.dense(x, c * 64, name="rnn_inputs")
                # They are of shape [batch_size, h, w, c, 64], reshaping now.
                rnn_inputs = tf.reshape(rnn_inputs,
                                        [batch_size * m, lookback * c, 64])
                # Same for the target frame.
                rnn_target = tf.reshape(target_frame,
                                        [batch_size * m, lookback * c])
                # Construct rnn starting state: flatten rnn_inputs, apply a relu layer.
                rnn_start_state = tf.nn.relu(
                    tf.layers.dense(tf.nn.relu(tf.layers.flatten(rnn_inputs)),
                                    256,
                                    name="rnn_start_state"))
                # Our RNN function API works on bits; each sub-pixel has 8 bits.
                total_num_bits = lookback * c * 8
                # We need to provide RNN targets as bits (due to the API).
                rnn_target_bits = discretization.int_to_bit(rnn_target, 8)
                rnn_target_bits = tf.reshape(rnn_target_bits,
                                             [batch_size * m, total_num_bits])
                if self.is_training:
                    # Run the RNN in training mode and add its loss to the losses.
                    rnn_predict, rnn_loss = discretization.predict_bits_with_lstm(
                        rnn_start_state,
                        128,
                        total_num_bits,
                        target_bits=rnn_target_bits,
                        extra_inputs=rnn_inputs)
                    extra_loss += rnn_loss
                    # We still use non-RNN predictions too in order to guide the network.
                    x = tf.layers.dense(x, c * 256, name="logits")
                    x = tf.reshape(x, [batch_size, h, w, c, 256])
                    rnn_predict = tf.reshape(rnn_predict,
                                             [batch_size, h, w, c, 256])
                    # Mix non-RNN and RNN predictions so that after warmup the RNN contributes 90%.
                    x = tf.reshape(tf.nn.log_softmax(x),
                                   [batch_size, h, w, c * 256])
                    rnn_predict = tf.nn.log_softmax(rnn_predict)
                    rnn_predict = tf.reshape(rnn_predict,
                                             [batch_size, h, w, c * 256])
                    alpha = 0.9 * common_layers.inverse_lin_decay(
                        hparams.autoregressive_rnn_warmup_steps)
                    x = alpha * rnn_predict + (1.0 - alpha) * x
                else:
                    # In prediction mode, run the RNN without any targets.
                    bits, _ = discretization.predict_bits_with_lstm(
                        rnn_start_state,
                        128,
                        total_num_bits,
                        extra_inputs=rnn_inputs,
                        temperature=0.0
                    )  # No sampling from this RNN, just greedy.
                    # The output is in bits, get back the predicted pixels.
                    bits = tf.reshape(bits, [batch_size * m, lookback * c, 8])
                    ints = discretization.bit_to_int(tf.maximum(bits, 0), 8)
                    ints = tf.reshape(ints, [batch_size, h, w, c])
                    x = tf.reshape(tf.one_hot(ints, 256),
                                   [batch_size, h, w, c * 256])
        elif self.is_per_pixel_softmax:
            x = tf.layers.dense(x,
                                hparams.problem.num_channels * 256,
                                name="logits")
        else:
            x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

        reward_pred = None
        if self.has_rewards:
            # Reward prediction based on middle and final logits.
            reward_pred = tf.concat([x_mid, x_fin], axis=-1)
            reward_pred = tf.nn.relu(
                tf.layers.dense(reward_pred, 128, name="reward_pred"))
            reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove height dim.
            reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove width dim (now axis 1).

        return x, reward_pred, policy_pred, value_pred, extra_loss, internal_states
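
The lookback reshaping in the autoregressive branch above is easiest to see with
concrete numbers. The following standalone numpy sketch (illustrative only, with
made-up sizes) shows how a [batch, h, w, c] frame is regrouped into
[batch * m, lookback * c] sub-pixel rows, and approximates the warmup-scheduled
mixing weight, assuming inverse_lin_decay ramps roughly linearly from 0 to 1 over
the given number of steps.

import numpy as np

# Illustrative only: the reshape arithmetic used by the autoregressive RNN above.
batch, h, w, c = 2, 8, 10, 3        # toy sizes; the comment above mentions 105x80 for Atari
lookback = 16                        # example value for hparams.autoregressive_rnn_lookback
assert (h * w) % lookback == 0, "Lookback must divide the number of pixels."
m = (h * w) // lookback              # batch-size multiplier for the RNN

frame = np.random.randint(0, 256, size=(batch, h, w, c))
rnn_target = frame.reshape(batch * m, lookback * c)   # each RNN row covers `lookback` pixels
print(rnn_target.shape)                                # (10, 48)

# Approximation of alpha = 0.9 * inverse_lin_decay(warmup): 0 at step 0, 0.9 after warmup.
def mix_weight(step, warmup_steps):
    return 0.9 * min(1.0, step / float(warmup_steps))

print(mix_weight(0, 8000), mix_weight(4000, 8000), mix_weight(8000, 8000))  # 0.0 0.45 0.9
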
Exemplo n.º 17
0
    def inject_latent(self, layer, features, filters):
        """Inject a deterministic latent based on the target frame."""
        del filters
        hparams = self.hparams
        final_filters = common_layers.shape_list(layer)[-1]
        filters = hparams.hidden_size
        kernel = (4, 4)
        layer_shape = common_layers.shape_list(layer)
        batch_size = layer_shape[0]
        state_size = hparams.latent_predictor_state_size
        lstm_cell = tf.contrib.rnn.LSTMCell(state_size)
        discrete_predict = tf.layers.Dense(256, name="discrete_predict")
        discrete_embed = tf.layers.Dense(state_size, name="discrete_embed")

        def add_d(layer, d):
            z_mul = tf.layers.dense(d, final_filters, name="unbottleneck_mul")
            if not hparams.complex_addn:
                return layer + z_mul
            layer *= tf.nn.sigmoid(z_mul)
            z_add = tf.layers.dense(d, final_filters, name="unbottleneck_add")
            layer += z_add
            return layer

        if self.is_predicting:
            if hparams.full_latent_tower:
                rand = tf.random_uniform(layer_shape[:-1] +
                                         [hparams.bottleneck_bits])
            else:
                layer_pred = tf.reshape(
                    layer, [batch_size, prod(layer_shape[1:])])
                prediction = tf.layers.dense(layer_pred,
                                             state_size,
                                             name="istate")
                c_state = tf.layers.dense(layer_pred,
                                          state_size,
                                          name="cstate")
                m_state = tf.layers.dense(layer_pred,
                                          state_size,
                                          name="mstate")
                state = (c_state, m_state)
                outputs = []
                for i in range(hparams.bottleneck_bits // 8):
                    output, state = lstm_cell(prediction, state)
                    discrete_logits = discrete_predict(output)
                    discrete_samples = common_layers.sample_with_temperature(
                        discrete_logits, hparams.latent_predictor_temperature)
                    outputs.append(tf.expand_dims(discrete_samples, axis=1))
                    prediction = discrete_embed(
                        tf.one_hot(discrete_samples, 256))
                outputs = tf.concat(outputs, axis=1)
                outputs = discretization.int_to_bit(outputs, 8)
                rand = tf.reshape(outputs,
                                  [batch_size, 1, 1, hparams.bottleneck_bits])
            d = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
            return add_d(layer, d), 0.0

        # Embed.
        frames = tf.concat([features["cur_target_frame"], features["inputs"]],
                           axis=-1)
        x = tf.layers.dense(
            frames,
            filters,
            name="latent_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        if hparams.full_latent_tower:
            for i in range(hparams.num_compress_steps):
                with tf.variable_scope("latent_downstride%d" % i):
                    x = common_layers.make_even_size(x)
                    if i < hparams.filter_double_steps:
                        filters *= 2
                    x = common_attention.add_timing_signal_nd(x)
                    x = tf.layers.conv2d(x,
                                         filters,
                                         kernel,
                                         activation=common_layers.belu,
                                         strides=(2, 2),
                                         padding="SAME")
                    x = common_layers.layer_norm(x)
        else:
            x = common_layers.double_discriminator(x)
            x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)
        x = tf.layers.dense(x, hparams.bottleneck_bits, name="bottleneck")
        x0 = tf.tanh(x)
        d = x0 + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0 -
                                  x0)
        pred_loss = 0.0
        if not hparams.full_latent_tower:
            d_pred = tf.reshape(tf.maximum(tf.stop_gradient(d), 0),
                                [batch_size, hparams.bottleneck_bits // 8, 8])
            d_int = discretization.bit_to_int(d_pred, 8)
            tf.summary.histogram("d_int", tf.reshape(d_int, [-1]))
            d_hot = tf.one_hot(d_int, 256, axis=-1)
            d_pred = discrete_embed(d_hot)
            layer_pred = tf.reshape(layer, [batch_size, prod(layer_shape[1:])])
            prediction0 = tf.layers.dense(layer_pred,
                                          state_size,
                                          name="istate")
            c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
            m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
            pred = tf.concat([tf.expand_dims(prediction0, axis=1), d_pred],
                             axis=1)
            state = (c_state, m_state)
            outputs = []
            for i in range(hparams.bottleneck_bits // 8):
                output, state = lstm_cell(pred[:, i, :], state)
                outputs.append(tf.expand_dims(output, axis=1))
            outputs = tf.concat(outputs, axis=1)
            d_int_pred = discrete_predict(outputs)
            pred_loss = tf.losses.sparse_softmax_cross_entropy(
                logits=d_int_pred, labels=d_int)
            pred_loss = tf.reduce_mean(pred_loss)
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            x += tf.truncated_normal(common_layers.shape_list(x),
                                     mean=0.0,
                                     stddev=0.2)
            x = tf.tanh(x)
            noise = tf.random_uniform(common_layers.shape_list(x))
            noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise,
                                              noise)) - 1.0
            x *= noise
            d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 -
                                     x)
            p = common_layers.inverse_lin_decay(hparams.discrete_warmup_steps)
            d = tf.where(tf.less(tf.random_uniform([batch_size]), p), d, x)
        return add_d(layer, d), pred_loss
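
The inject_latent above combines a straight-through binarization (the forward pass
sees sign(x) while gradients flow through x) with grouping every 8 bits into one
integer so the LSTM predictor can treat each byte as a 256-way classification. The
numpy sketch below illustrates only the byte-packing step; the exact bit ordering of
discretization.bit_to_int is an assumption here.

import numpy as np

# Illustrative byte packing, mirroring the bottleneck_bits // 8 grouping above.
bottleneck_bits = 16
d = np.sign(np.random.randn(4, bottleneck_bits))               # +/-1 bits from the bottleneck
bits01 = np.maximum(d, 0).reshape(4, bottleneck_bits // 8, 8)  # {-1, 1} -> {0, 1}, grouped by 8
weights = 2 ** np.arange(8)                                     # assumed bit order (endianness
d_int = (bits01 * weights).sum(axis=-1).astype(np.int64)        # of bit_to_int may differ)
print(d_int.shape, d_int.min() >= 0, d_int.max() <= 255)        # (4, 2) True True
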
Exemplo n.º 18
0
    def bottom_part_tower(self,
                          input_image,
                          input_reward,
                          action,
                          latent,
                          lstm_state,
                          lstm_size,
                          conv_size,
                          concat_latent=False):
        """The bottom part of predictive towers.

    With the current (early) design, the main prediction tower and
    the reward prediction tower share the same architecture. The TF scope can be
    adjusted as required to either share or not share the weights between
    the two towers.

    Args:
      input_image: the current image.
      input_reward: the current reward.
      action: the action taken by the agent.
      latent: the latent vector.
      lstm_state: the current internal states of conv lstms.
      lstm_size: the size of lstms.
      conv_size: the size of convolutions.
      concat_latent: whether or not to concatenate the latent at every step.

    Returns:
      - the output of the partial network.
      - intermediate outputs for skip connections.
    """
        lstm_func = common_video.conv_lstm_2d
        tile_and_concat = common_video.tile_and_concat

        input_image = common_layers.make_even_size(input_image)
        concat_input_image = tile_and_concat(input_image,
                                             latent,
                                             concat_latent=concat_latent)

        enc0 = tfl.conv2d(concat_input_image,
                          conv_size[0], [5, 5],
                          strides=(2, 2),
                          activation=tf.nn.relu,
                          padding="SAME",
                          name="scale1_conv1")
        enc0 = tfcl.layer_norm(enc0, scope="layer_norm1")

        hidden1, lstm_state[0] = lstm_func(enc0,
                                           lstm_state[0],
                                           lstm_size[0],
                                           name="state1")
        hidden1 = tile_and_concat(hidden1, latent, concat_latent=concat_latent)
        hidden1 = tfcl.layer_norm(hidden1, scope="layer_norm2")
        hidden2, lstm_state[1] = lstm_func(hidden1,
                                           lstm_state[1],
                                           lstm_size[1],
                                           name="state2")
        hidden2 = tfcl.layer_norm(hidden2, scope="layer_norm3")
        hidden2 = common_layers.make_even_size(hidden2)
        enc1 = tfl.conv2d(hidden2,
                          hidden2.get_shape()[3], [3, 3],
                          strides=(2, 2),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv2")
        enc1 = tile_and_concat(enc1, latent, concat_latent=concat_latent)

        hidden3, lstm_state[2] = lstm_func(enc1,
                                           lstm_state[2],
                                           lstm_size[2],
                                           name="state3")
        hidden3 = tile_and_concat(hidden3, latent, concat_latent=concat_latent)
        hidden3 = tfcl.layer_norm(hidden3, scope="layer_norm4")
        hidden4, lstm_state[3] = lstm_func(hidden3,
                                           lstm_state[3],
                                           lstm_size[3],
                                           name="state4")
        hidden4 = tile_and_concat(hidden4, latent, concat_latent=concat_latent)
        hidden4 = tfcl.layer_norm(hidden4, scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        enc2 = tfl.conv2d(hidden4,
                          hidden4.get_shape()[3], [3, 3],
                          strides=(2, 2),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv3")

        if action is not None:
            enc2 = self.inject_additional_input(
                enc2, action, "action_enc", self.hparams.concatenate_actions)
        if input_reward is not None:
            enc2 = self.inject_additional_input(enc2, input_reward,
                                                "reward_enc")
        if latent is not None and not concat_latent:
            with tf.control_dependencies([latent]):
                enc2 = tf.concat([enc2, latent], axis=3)

        enc3 = tfl.conv2d(enc2,
                          hidden4.get_shape()[3], [1, 1],
                          strides=(1, 1),
                          padding="SAME",
                          activation=tf.nn.relu,
                          name="conv4")

        hidden5, lstm_state[4] = lstm_func(enc3,
                                           lstm_state[4],
                                           lstm_size[4],
                                           name="state5")  # last 8x8
        hidden5 = tfcl.layer_norm(hidden5, scope="layer_norm6")
        hidden5 = tile_and_concat(hidden5, latent, concat_latent=concat_latent)
        return hidden5, (enc0, enc1)
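
The tower above conditions on the latent at several scales via
common_video.tile_and_concat. The numpy sketch below shows the general idea of such
conditioning, broadcasting a per-example latent over the spatial grid and appending
it as extra channels; the exact tiling scheme inside tile_and_concat may differ.

import numpy as np

# Illustrative latent conditioning: broadcast spatially, then concatenate on channels.
batch, h, w, c, latent_c = 2, 16, 16, 32, 8
feature_map = np.random.randn(batch, h, w, c)
latent = np.random.randn(batch, 1, 1, latent_c)

tiled = np.tile(latent, (1, h, w, 1))                     # [batch, h, w, latent_c]
conditioned = np.concatenate([feature_map, tiled], -1)    # [batch, h, w, c + latent_c]
print(conditioned.shape)                                   # (2, 16, 16, 40)
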
Exemplo n.º 19
0
  def construct_predictive_tower(
      self, input_image, input_reward, action, lstm_state, latent):
    # Main tower
    layer_norm = tf.contrib.layers.layer_norm
    lstm_func = self.conv_lstm_2d
    batch_size = common_layers.shape_list(input_image)[0]
    # the number of different pixel motion predictions
    # and the number of masks for each of those predictions
    num_masks = self.hparams.num_masks

    lstm_size = self.tinyify([32, 32, 64, 64, 128, 64, 32])
    conv_size = self.tinyify([32])

    img_height, img_width, color_channels = self.hparams.problem.frame_shape

    with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
      input_image = common_layers.make_even_size(input_image)
      enc0 = slim.layers.conv2d(
          input_image,
          conv_size[0], [5, 5],
          stride=2,
          scope="scale1_conv1",
          normalizer_fn=layer_norm,
          normalizer_params={"scope": "layer_norm1"})

      hidden1, lstm_state[0] = lstm_func(
          enc0, lstm_state[0], lstm_size[0], scope="state1")
      hidden1 = layer_norm(hidden1, scope="layer_norm2")
      hidden2, lstm_state[1] = lstm_func(
          hidden1, lstm_state[1], lstm_size[1], scope="state2")
      hidden2 = layer_norm(hidden2, scope="layer_norm3")
      hidden2 = common_layers.make_even_size(hidden2)
      enc1 = slim.layers.conv2d(
          hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope="conv2")

      hidden3, lstm_state[2] = lstm_func(
          enc1, lstm_state[2], lstm_size[2], scope="state3")
      hidden3 = layer_norm(hidden3, scope="layer_norm4")
      hidden4, lstm_state[3] = lstm_func(
          hidden3, lstm_state[3], lstm_size[3], scope="state4")
      hidden4 = layer_norm(hidden4, scope="layer_norm5")
      hidden4 = common_layers.make_even_size(hidden4)
      enc2 = slim.layers.conv2d(
          hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope="conv3")

      # Pass in reward and action.
      emb_action = self.encode_to_shape(action, enc2.get_shape(), "action_enc")
      emb_reward = self.encode_to_shape(
          input_reward, enc2.get_shape(), "reward_enc")
      enc2 = tf.concat(axis=3, values=[enc2, emb_action, emb_reward])

      if latent is not None:
        with tf.control_dependencies([latent]):
          enc2 = tf.concat([enc2, latent], 3)

      enc3 = slim.layers.conv2d(
          enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

      hidden5, lstm_state[4] = lstm_func(
          enc3, lstm_state[4], lstm_size[4], scope="state5")  # last 8x8
      hidden5 = layer_norm(hidden5, scope="layer_norm6")
      enc4 = slim.layers.conv2d_transpose(
          hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")

      enc1_shape = common_layers.shape_list(enc1)
      enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]  # Cut to shape.
      hidden6, lstm_state[5] = lstm_func(
          enc4, lstm_state[5], lstm_size[5], scope="state6")  # 16x16
      hidden6 = layer_norm(hidden6, scope="layer_norm7")
      # Skip connection.
      hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

      enc5 = slim.layers.conv2d_transpose(
          hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")
      enc0_shape = common_layers.shape_list(enc0)
      enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]  # Cut to shape.
      hidden7, lstm_state[6] = lstm_func(
          enc5, lstm_state[6], lstm_size[6], scope="state7")  # 32x32
      hidden7 = layer_norm(hidden7, scope="layer_norm8")

      # Skip connection.
      hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

      enc6 = slim.layers.conv2d_transpose(
          hidden7,
          hidden7.get_shape()[3],
          3,
          stride=2,
          scope="convt3",
          activation_fn=None,
          normalizer_fn=layer_norm,
          normalizer_params={"scope": "layer_norm9"})

      if self.hparams.model_options == "DNA":
        # Using largest hidden state for predicting untied conv kernels.
        enc7 = slim.layers.conv2d_transpose(
            enc6,
            self.hparams.dna_kernel_size**2,
            1,
            stride=1,
            scope="convt4",
            activation_fn=None)
      else:
        # Using largest hidden state for predicting a new image layer.
        enc7 = slim.layers.conv2d_transpose(
            enc6,
            color_channels,
            1,
            stride=1,
            scope="convt4",
            activation_fn=None)
        # This allows the network to also generate one image from scratch,
        # which is useful when regions of the image become unoccluded.
        transformed = [tf.nn.sigmoid(enc7)]

      if self.hparams.model_options == "CDNA":
        # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
        cdna_input = tf.contrib.layers.flatten(hidden5)
        transformed += self.cdna_transformation(
            input_image, cdna_input, num_masks, int(color_channels))
      elif self.hparams.model_options == "DNA":
        # Only one mask is supported (more should be unnecessary).
        if num_masks != 1:
          raise ValueError("Only one mask is supported for DNA model.")
        transformed = [self.dna_transformation(input_image, enc7)]

      masks = slim.layers.conv2d_transpose(
          enc6, num_masks + 1, 1,
          stride=1, scope="convt7", activation_fn=None)
      masks = tf.reshape(
          tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
          [batch_size,
           int(img_height),
           int(img_width), num_masks + 1])
      mask_list = tf.split(
          axis=3, num_or_size_splits=num_masks + 1, value=masks)
      output = mask_list[0] * input_image
      for layer, mask in zip(transformed, mask_list[1:]):
        output += layer * mask

      p_reward = self.reward_prediction(hidden5)
      p_reward = self.decode_to_shape(
          p_reward, input_reward.shape, "reward_dec")

      return output, p_reward, lstm_state
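
The final compositing step above blends the previous frame with the transformed
candidate images using softmax masks that sum to one at every pixel. A small
standalone numpy illustration of that blending (all shapes and values are arbitrary):

import numpy as np

# Illustrative pixel-wise compositing, as in the mask_list loop above.
batch, h, w, c, num_masks = 1, 4, 4, 3, 2
logits = np.random.randn(batch, h, w, num_masks + 1)
masks = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)   # softmax over masks

prev_frame = np.random.rand(batch, h, w, c)
transformed = [np.random.rand(batch, h, w, c) for _ in range(num_masks)]

output = masks[..., :1] * prev_frame                  # mask 0 keeps parts of the input frame
for k, candidate in enumerate(transformed):
    output += masks[..., k + 1:k + 2] * candidate     # remaining masks select the predictions
print(output.shape)                                    # (1, 4, 4, 3); weights sum to 1 per pixel
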
Exemplo n.º 20
0
  def body(self, features):
    hparams = self.hparams
    filters = hparams.hidden_size
    kernel1, kernel2 = (3, 3), (4, 4)

    # Embed the inputs.
    inputs_shape = common_layers.shape_list(features["inputs"])
    # Using non-zero bias initializer below for edge cases of uniform inputs.
    x = tf.layers.dense(
        features["inputs"], filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Down-stride.
    layer_inputs = [x]
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("downstride%d" % i):
        layer_inputs.append(x)
        x = common_layers.make_even_size(x)
        if i < hparams.filter_double_steps:
          filters *= 2
        x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu,
                             strides=(2, 2), padding="SAME")
        x = common_layers.layer_norm(x)

    # Add embedded action if present.
    if "input_action" in features:
      action = tf.reshape(features["input_action"][:, -1, :],
                          [-1, 1, 1, hparams.hidden_size])
      action_mask = tf.layers.dense(action, filters, name="action_mask")
      zeros_mask = tf.zeros(common_layers.shape_list(x)[:-1] + [filters],
                            dtype=tf.float32)
      x *= action_mask + zeros_mask

    # Run a stack of convolutions.
    for i in range(hparams.num_hidden_layers):
      with tf.variable_scope("layer%d" % i):
        y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu,
                             strides=(1, 1), padding="SAME")
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        if i == 0:
          x = y
        else:
          x = common_layers.layer_norm(x + y)

    # Up-convolve.
    layer_inputs = list(reversed(layer_inputs))
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("upstride%d" % i):
        if i >= hparams.num_compress_steps - hparams.filter_double_steps:
          filters //= 2
        x = tf.layers.conv2d_transpose(
            x, filters, kernel2, activation=common_layers.belu,
            strides=(2, 2), padding="SAME")
        y = layer_inputs[i]
        shape = common_layers.shape_list(y)
        x = x[:, :shape[1], :shape[2], :]
        x = common_layers.layer_norm(x + y)
        x = common_attention.add_timing_signal_nd(x)

    # Cut down to original size.
    x = x[:, :inputs_shape[1], :inputs_shape[2], :]

    # Reward prediction if needed.
    if "target_reward" not in features:
      return x
    reward_pred = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    return {"targets": x, "target_reward": reward_pred}
Exemplo n.º 21
0
def conv_latent_tower(images, time_axis, latent_channels=1, min_logvar=-5,
                      is_training=False, random_latent=False,
                      tiny_mode=False, small_mode=False):
  """Builds convolutional latent tower for stochastic model.

  At training time this tower generates a latent distribution (mean and std)
  conditioned on the entire video. This latent variable will be fed to the
  main tower as an extra variable to be used for future frames prediction.
  At inference time, the tower is disabled and only returns latents sampled
  from N(0,1).
  If the multi_latent flag is on, a different latent for every timestep would
  be generated.

  Args:
    images: tensor of ground truth image sequences
    time_axis: the time axis in the images tensor
    latent_channels: number of latent channels
    min_logvar: minimum value for log_var
    is_training: whether or not it is training mode
    random_latent: whether or not to generate random latents
    tiny_mode: whether or not it is tiny_mode. tiny_mode sets the number
        of conv channels to 1 at each layer. Useful for integration tests.
    small_mode: whether or not it is small_mode. small_mode is the same model
        with fewer conv and LSTM layers and a lower number of channels,
        suitable for less complex videos and for testing.
  Returns:
    latent_mean: predicted latent mean
    latent_logvar: predicted latent log variance
  """
  conv_size = tinyify([32, 64, 64], tiny_mode, small_mode)
  with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
    images = tf.to_float(images)
    images = tf.unstack(images, axis=time_axis)
    images = tf.concat(images, axis=3)

    x = images
    x = common_layers.make_even_size(x)
    x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                   padding="SAME", activation=tf.nn.relu, name="latent_conv1")
    x = tfcl.layer_norm(x)
    if not small_mode:
      x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                     padding="SAME", activation=tf.nn.relu, name="latent_conv2")
      x = tfcl.layer_norm(x)
    x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                   padding="SAME", activation=tf.nn.relu, name="latent_conv3")
    x = tfcl.layer_norm(x)

    nc = latent_channels
    mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                      padding="SAME", activation=None, name="latent_mean")
    logv = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                      padding="SAME", activation=tf.nn.relu, name="latent_std")
    logvar = logv + min_logvar

    # No latent tower at inference time, just standard gaussian.
    if not is_training:
      return tf.zeros_like(mean), tf.zeros_like(logvar)

    # No latent in the first phase
    ret_mean, ret_logvar = tf.cond(
        random_latent,
        lambda: (tf.zeros_like(mean), tf.zeros_like(logvar)),
        lambda: (mean, logvar))

    return ret_mean, ret_logvar
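
conv_latent_tower returns a mean and a log-variance rather than a sample; a caller
would typically draw the latent with the standard reparameterization trick. The
sketch below is that standard formulation in numpy, not a quote of the actual
calling code:

import numpy as np

# Reparameterized sampling from the (mean, logvar) pair returned above.
mean = np.zeros((2, 4, 4, 1))            # example shape; last dim is latent_channels
logvar = np.full((2, 4, 4, 1), -5.0)     # min_logvar keeps the variance bounded below
eps = np.random.randn(*mean.shape)
latent = mean + np.exp(logvar / 2.0) * eps   # z = mu + sigma * eps
print(latent.shape)                           # (2, 4, 4, 1)
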
Exemplo n.º 22
0
  def body(self, features):
    hparams = self.hparams
    filters = hparams.hidden_size
    kernel1, kernel2 = (3, 3), (4, 4)

    # Embed the inputs.
    inputs_shape = common_layers.shape_list(features["inputs"])
    # Using non-zero bias initializer below for edge cases of uniform inputs.
    x = tf.layers.dense(
        features["inputs"], filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Down-stride.
    layer_inputs = [x]
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("downstride%d" % i):
        layer_inputs.append(x)
        x = common_layers.make_even_size(x)
        if i < hparams.filter_double_steps:
          filters *= 2
        x = tf.layers.conv2d(x, filters, kernel2, activation=common_layers.belu,
                             strides=(2, 2), padding="SAME")
        x = common_layers.layer_norm(x)

    # Add embedded action if present.
    if "input_action" in features:
      action = tf.reshape(features["input_action"][:, -1, :],
                          [-1, 1, 1, hparams.hidden_size])
      action_mask = tf.layers.dense(action, filters, name="action_mask")
      zeros_mask = tf.zeros(common_layers.shape_list(x)[:-1] + [filters],
                            dtype=tf.float32)
      x *= action_mask + zeros_mask

    # Run a stack of convolutions.
    for i in range(hparams.num_hidden_layers):
      with tf.variable_scope("layer%d" % i):
        y = tf.layers.conv2d(x, filters, kernel1, activation=common_layers.belu,
                             strides=(1, 1), padding="SAME")
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        if i == 0:
          x = y
        else:
          x = common_layers.layer_norm(x + y)

    # Up-convolve.
    layer_inputs = list(reversed(layer_inputs))
    for i in range(hparams.num_compress_steps):
      with tf.variable_scope("upstride%d" % i):
        if i >= hparams.num_compress_steps - hparams.filter_double_steps:
          filters //= 2
        x = tf.layers.conv2d_transpose(
            x, filters, kernel2, activation=common_layers.belu,
            strides=(2, 2), padding="SAME")
        y = layer_inputs[i]
        shape = common_layers.shape_list(y)
        x = x[:, :shape[1], :shape[2], :]
        x = common_layers.layer_norm(x + y)
        x = common_attention.add_timing_signal_nd(x)

    # Cut down to original size.
    x = x[:, :inputs_shape[1], :inputs_shape[2], :]

    # Reward prediction if needed.
    if "target_reward" not in features:
      return x
    reward_pred = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    return {"targets": x, "target_reward": reward_pred}
Exemplo n.º 23
0
  def construct_predictive_tower(
      self, input_image, input_reward, action, lstm_state, latent):
    # Main tower
    layer_norm = tf.contrib.layers.layer_norm
    lstm_func = self.conv_lstm_2d
    batch_size = common_layers.shape_list(input_image)[0]
    # the number of different pixel motion predictions
    # and the number of masks for each of those predictions
    num_masks = self.hparams.num_masks

    lstm_size = self.tinyify([32, 32, 64, 64, 128, 64, 32])
    conv_size = self.tinyify([32])

    img_height, img_width, color_channels = self.hparams.problem.frame_shape

    with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
      input_image = common_layers.make_even_size(input_image)
      enc0 = slim.layers.conv2d(
          input_image,
          conv_size[0], [5, 5],
          stride=2,
          scope="scale1_conv1",
          normalizer_fn=layer_norm,
          normalizer_params={"scope": "layer_norm1"})

      hidden1, lstm_state[0] = lstm_func(
          enc0, lstm_state[0], lstm_size[0], scope="state1")
      hidden1 = layer_norm(hidden1, scope="layer_norm2")
      hidden2, lstm_state[1] = lstm_func(
          hidden1, lstm_state[1], lstm_size[1], scope="state2")
      hidden2 = layer_norm(hidden2, scope="layer_norm3")
      hidden2 = common_layers.make_even_size(hidden2)
      enc1 = slim.layers.conv2d(
          hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope="conv2")

      hidden3, lstm_state[2] = lstm_func(
          enc1, lstm_state[2], lstm_size[2], scope="state3")
      hidden3 = layer_norm(hidden3, scope="layer_norm4")
      hidden4, lstm_state[3] = lstm_func(
          hidden3, lstm_state[3], lstm_size[3], scope="state4")
      hidden4 = layer_norm(hidden4, scope="layer_norm5")
      hidden4 = common_layers.make_even_size(hidden4)
      enc2 = slim.layers.conv2d(
          hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope="conv3")

      # Pass in reward and action.
      emb_action = self.encode_to_shape(action, enc2.get_shape())
      emb_reward = self.encode_to_shape(input_reward, enc2.get_shape())
      enc2 = tf.concat(axis=3, values=[enc2, emb_action, emb_reward])

      if latent is not None:
        with tf.control_dependencies([latent]):
          enc2 = tf.concat([enc2, latent], 3)

      enc3 = slim.layers.conv2d(
          enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

      hidden5, lstm_state[4] = lstm_func(
          enc3, lstm_state[4], lstm_size[4], scope="state5")  # last 8x8
      hidden5 = layer_norm(hidden5, scope="layer_norm6")
      enc4 = slim.layers.conv2d_transpose(
          hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")

      enc1_shape = common_layers.shape_list(enc1)
      enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]  # Cut to shape.
      hidden6, lstm_state[5] = lstm_func(
          enc4, lstm_state[5], lstm_size[5], scope="state6")  # 16x16
      hidden6 = layer_norm(hidden6, scope="layer_norm7")
      # Skip connection.
      hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

      enc5 = slim.layers.conv2d_transpose(
          hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")
      enc0_shape = common_layers.shape_list(enc0)
      enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]  # Cut to shape.
      hidden7, lstm_state[6] = lstm_func(
          enc5, lstm_state[6], lstm_size[6], scope="state7")  # 32x32
      hidden7 = layer_norm(hidden7, scope="layer_norm8")

      # Skip connection.
      hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

      enc6 = slim.layers.conv2d_transpose(
          hidden7,
          hidden7.get_shape()[3],
          3,
          stride=2,
          scope="convt3",
          activation_fn=None,
          normalizer_fn=layer_norm,
          normalizer_params={"scope": "layer_norm9"})

      if self.hparams.model_options == "DNA":
        # Using largest hidden state for predicting untied conv kernels.
        enc7 = slim.layers.conv2d_transpose(
            enc6,
            self.hparams.dna_kernel_size**2,
            1,
            stride=1,
            scope="convt4",
            activation_fn=None)
      else:
        # Using largest hidden state for predicting a new image layer.
        enc7 = slim.layers.conv2d_transpose(
            enc6,
            color_channels,
            1,
            stride=1,
            scope="convt4",
            activation_fn=None)
        # This allows the network to also generate one image from scratch,
        # which is useful when regions of the image become unoccluded.
        transformed = [tf.nn.sigmoid(enc7)]

      if self.hparams.model_options == "CDNA":
        # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
        cdna_input = tf.contrib.layers.flatten(hidden5)
        transformed += self.cdna_transformation(
            input_image, cdna_input, num_masks, int(color_channels))
      elif self.hparams.model_options == "DNA":
        # Only one mask is supported (more should be unnecessary).
        if num_masks != 1:
          raise ValueError("Only one mask is supported for DNA model.")
        transformed = [self.dna_transformation(input_image, enc7)]

      masks = slim.layers.conv2d_transpose(
          enc6, num_masks + 1, 1,
          stride=1, scope="convt7", activation_fn=None)
      masks = tf.reshape(
          tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
          [batch_size,
           int(img_height),
           int(img_width), num_masks + 1])
      mask_list = tf.split(
          axis=3, num_or_size_splits=num_masks + 1, value=masks)
      output = mask_list[0] * input_image
      for layer, mask in zip(transformed, mask_list[1:]):
        output += layer * mask

      p_reward = self.reward_prediction(hidden5)
      p_reward = self.decode_to_shape(p_reward, input_reward.shape)

      return output, p_reward, lstm_state
Exemplo n.º 24
0
  def inject_latent(self, layer, inputs, target, action):
    """Inject a deterministic latent based on the target frame."""
    hparams = self.hparams
    final_filters = common_layers.shape_list(layer)[-1]
    filters = hparams.hidden_size
    kernel = (4, 4)
    layer_shape = common_layers.shape_list(layer)

    def add_bits(layer, bits):
      z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
      if not hparams.complex_addn:
        return layer + z_mul
      layer *= tf.nn.sigmoid(z_mul)
      z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
      layer += z_add
      return layer

    if not self.is_training:
      if hparams.full_latent_tower:
        rand = tf.random_uniform(layer_shape[:-1] + [hparams.bottleneck_bits])
        bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
      else:
        bits, _ = discretization.predict_bits_with_lstm(
            layer, hparams.latent_predictor_state_size, hparams.bottleneck_bits,
            temperature=hparams.latent_predictor_temperature)
        bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
      return add_bits(layer, bits), 0.0

    # Embed.
    frames = tf.concat(inputs + [target], axis=-1)
    x = tfl.dense(
        frames, filters, name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Add embedded action if present.
    if action is not None:
      x = common_video.inject_additional_input(
          x, action, "action_enc_latent", hparams.action_injection)

    if hparams.full_latent_tower:
      for i in range(hparams.num_compress_steps):
        with tf.variable_scope("latent_downstride%d" % i):
          x = common_layers.make_even_size(x)
          if i < hparams.filter_double_steps:
            filters *= 2
          x = common_attention.add_timing_signal_nd(x)
          x = tfl.conv2d(x, filters, kernel,
                         activation=common_layers.belu,
                         strides=(2, 2), padding="SAME")
          x = common_layers.layer_norm(x)
    else:
      x = common_layers.double_discriminator(x)
      x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

    bits, bits_clean = discretization.tanh_discrete_bottleneck(
        x, hparams.bottleneck_bits, hparams.bottleneck_noise,
        hparams.discretize_warmup_steps, hparams.mode)
    if not hparams.full_latent_tower:
      _, pred_loss = discretization.predict_bits_with_lstm(
          layer, hparams.latent_predictor_state_size, hparams.bottleneck_bits,
          target_bits=bits_clean)
      # Mix bits from the latent with predicted bits on the forward pass, as noise.
      if hparams.latent_rnn_max_sampling > 0.0:
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          bits_pred, _ = discretization.predict_bits_with_lstm(
              layer, hparams.latent_predictor_state_size,
              hparams.bottleneck_bits,
              temperature=hparams.latent_predictor_temperature)
          bits_pred = tf.expand_dims(tf.expand_dims(bits_pred, axis=1), axis=2)
        # Use bits_pred on the forward pass but bits on the backward one.
        bits_pred = bits_clean + tf.stop_gradient(bits_pred - bits_clean)
        # Select which bits to take from the predictor, each with probability bit_p.
        which_bit = tf.random_uniform(common_layers.shape_list(bits))
        bit_p = common_layers.inverse_lin_decay(hparams.latent_rnn_warmup_steps)
        bit_p *= hparams.latent_rnn_max_sampling
        bits = tf.where(which_bit < bit_p, bits_pred, bits)

    res = add_bits(layer, bits)
    # During training, sometimes skip the latent to help action-conditioning.
    res_p = common_layers.inverse_lin_decay(hparams.latent_rnn_warmup_steps / 2)
    res_p *= hparams.latent_use_max_probability
    res_rand = tf.random_uniform([layer_shape[0]])
    res = tf.where(res_rand < res_p, res, layer)
    return res, pred_loss
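
Two scheduled-sampling tricks close out the training path above: individual bits are
swapped for LSTM-predicted bits with probability bit_p, and whole examples
occasionally bypass the latent with probability 1 - res_p, where both probabilities
ramp up via inverse_lin_decay. The numpy sketch below illustrates only the per-bit
swap; the decay value is an example number, not a real schedule.

import numpy as np

# Illustrative per-bit scheduled sampling, mirroring tf.where(which_bit < bit_p, ...).
batch, n_bits = 4, 16
bits = np.sign(np.random.randn(batch, 1, 1, n_bits))        # bits from the encoder bottleneck
bits_pred = np.sign(np.random.randn(batch, 1, 1, n_bits))   # bits sampled from the LSTM prior

bit_p = 0.3 * 0.9          # e.g. inverse_lin_decay(...) * latent_rnn_max_sampling
which_bit = np.random.rand(*bits.shape)
swapped = which_bit < bit_p
mixed = np.where(swapped, bits_pred, bits)    # each bit independently taken from the predictor
print(swapped.mean())                          # fraction swapped, roughly bit_p
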
Exemplo n.º 25
0
    def inject_latent(self, layer, inputs, target, action):
        """Inject a deterministic latent based on the target frame."""
        hparams = self.hparams
        final_filters = common_layers.shape_list(layer)[-1]
        filters = hparams.hidden_size
        kernel = (4, 4)
        layer_shape = common_layers.shape_list(layer)
        activation_fn = common_layers.belu
        if hparams.activation_fn == "relu":
            activation_fn = tf.nn.relu

        def add_bits(layer, bits):
            z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
            if not hparams.complex_addn:
                return layer + z_mul
            layer *= tf.nn.sigmoid(z_mul)
            z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
            layer += z_add
            return layer

        if not self.is_training:
            if hparams.full_latent_tower:
                rand = tf.random_uniform(layer_shape[:-1] +
                                         [hparams.bottleneck_bits])
                bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
            else:
                bits, _ = discretization.predict_bits_with_lstm(
                    layer,
                    hparams.latent_predictor_state_size,
                    hparams.bottleneck_bits,
                    temperature=hparams.latent_predictor_temperature)
                bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
            return add_bits(layer, bits), 0.0

        # Embed.
        frames = tf.concat(inputs + [target], axis=-1)
        x = tfl.dense(
            frames,
            filters,
            name="latent_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        # Add embedded action if present.
        if action is not None:
            x = common_video.inject_additional_input(x, action,
                                                     "action_enc_latent",
                                                     hparams.action_injection)

        if hparams.full_latent_tower:
            for i in range(hparams.num_compress_steps):
                with tf.variable_scope("latent_downstride%d" % i):
                    x = common_layers.make_even_size(x)
                    if i < hparams.filter_double_steps:
                        filters *= 2
                    x = common_attention.add_timing_signal_nd(x)
                    x = tfl.conv2d(x,
                                   filters,
                                   kernel,
                                   activation=activation_fn,
                                   strides=(2, 2),
                                   padding="SAME")
                    x = common_layers.layer_norm(x)
        else:
            x = common_layers.double_discriminator(x)
            x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

        bits, bits_clean = discretization.tanh_discrete_bottleneck(
            x, hparams.bottleneck_bits, hparams.bottleneck_noise,
            hparams.discretize_warmup_steps, hparams.mode)
        if not hparams.full_latent_tower:
            _, pred_loss = discretization.predict_bits_with_lstm(
                layer,
                hparams.latent_predictor_state_size,
                hparams.bottleneck_bits,
                target_bits=bits_clean)
            # Mix bits from the latent with predicted bits on the forward pass, as noise.
            if hparams.latent_rnn_max_sampling > 0.0:
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    bits_pred, _ = discretization.predict_bits_with_lstm(
                        layer,
                        hparams.latent_predictor_state_size,
                        hparams.bottleneck_bits,
                        temperature=hparams.latent_predictor_temperature)
                    bits_pred = tf.expand_dims(tf.expand_dims(bits_pred,
                                                              axis=1),
                                               axis=2)
                # Use bits_pred on the forward pass but bits on the backward one.
                bits_pred = bits_clean + tf.stop_gradient(bits_pred -
                                                          bits_clean)
                # Select which bits to take from the predictor, each with probability bit_p.
                which_bit = tf.random_uniform(common_layers.shape_list(bits))
                bit_p = common_layers.inverse_lin_decay(
                    hparams.latent_rnn_warmup_steps)
                bit_p *= hparams.latent_rnn_max_sampling
                bits = tf.where(which_bit < bit_p, bits_pred, bits)

        res = add_bits(layer, bits)
        # During training, sometimes skip the latent to help action-conditioning.
        res_p = common_layers.inverse_lin_decay(
            hparams.latent_rnn_warmup_steps / 2)
        res_p *= hparams.latent_use_max_probability
        res_rand = tf.random_uniform([layer_shape[0]])
        res = tf.where(res_rand < res_p, res, layer)
        return res, pred_loss
Exemplo n.º 26
0
  def bottom_part_tower(self, input_image, input_reward, action, latent,
                        lstm_state, lstm_size, conv_size, concat_latent=False):
    """The bottom part of predictive towers.

    With the current (early) design, the main prediction tower and
    the reward prediction tower share the same architecture. The TF scope can be
    adjusted as required to either share or not share the weights between
    the two towers.

    Args:
      input_image: the current image.
      input_reward: the current reward.
      action: the action taken by the agent.
      latent: the latent vector.
      lstm_state: the current internal states of conv lstms.
      lstm_size: the size of lstms.
      conv_size: the size of convolutions.
      concat_latent: whether or not to concatenate the latent at every step.

    Returns:
      - the output of the partial network.
      - intermediate outputs for skip connections.
    """
    lstm_func = common_video.conv_lstm_2d
    tile_and_concat = common_video.tile_and_concat

    input_image = common_layers.make_even_size(input_image)
    concat_input_image = tile_and_concat(
        input_image, latent, concat_latent=concat_latent)

    layer_id = 0
    enc0 = tfl.conv2d(
        concat_input_image,
        conv_size[0], [5, 5],
        strides=(2, 2),
        activation=tf.nn.relu,
        padding="SAME",
        name="scale1_conv1")
    enc0 = tfcl.layer_norm(enc0, scope="layer_norm1")

    hidden1, lstm_state[layer_id] = lstm_func(
        enc0, lstm_state[layer_id], lstm_size[layer_id], name="state1")
    hidden1 = tile_and_concat(hidden1, latent, concat_latent=concat_latent)
    hidden1 = tfcl.layer_norm(hidden1, scope="layer_norm2")
    layer_id += 1

    hidden2, lstm_state[layer_id] = lstm_func(
        hidden1, lstm_state[layer_id], lstm_size[layer_id], name="state2")
    hidden2 = tfcl.layer_norm(hidden2, scope="layer_norm3")
    hidden2 = common_layers.make_even_size(hidden2)
    enc1 = tfl.conv2d(hidden2, hidden2.get_shape()[3], [3, 3], strides=(2, 2),
                      padding="SAME", activation=tf.nn.relu, name="conv2")
    enc1 = tile_and_concat(enc1, latent, concat_latent=concat_latent)
    layer_id += 1

    if self.hparams.small_mode:
      hidden4, enc2 = hidden2, enc1
    else:
      hidden3, lstm_state[layer_id] = lstm_func(
          enc1, lstm_state[layer_id], lstm_size[layer_id], name="state3")
      hidden3 = tile_and_concat(hidden3, latent, concat_latent=concat_latent)
      hidden3 = tfcl.layer_norm(hidden3, scope="layer_norm4")
      layer_id += 1

      hidden4, lstm_state[layer_id] = lstm_func(
          hidden3, lstm_state[layer_id], lstm_size[layer_id], name="state4")
      hidden4 = tile_and_concat(hidden4, latent, concat_latent=concat_latent)
      hidden4 = tfcl.layer_norm(hidden4, scope="layer_norm5")
      hidden4 = common_layers.make_even_size(hidden4)
      enc2 = tfl.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], strides=(2, 2),
                        padding="SAME", activation=tf.nn.relu, name="conv3")
      layer_id += 1

    if action is not None:
      enc2 = common_video.inject_additional_input(
          enc2, action, "action_enc", self.hparams.action_injection)
    if input_reward is not None:
      enc2 = common_video.inject_additional_input(
          enc2, input_reward, "reward_enc")
    if latent is not None and not concat_latent:
      with tf.control_dependencies([latent]):
        enc2 = tf.concat([enc2, latent], axis=3)

    enc3 = tfl.conv2d(enc2, hidden4.get_shape()[3], [1, 1], strides=(1, 1),
                      padding="SAME", activation=tf.nn.relu, name="conv4")

    hidden5, lstm_state[layer_id] = lstm_func(
        enc3, lstm_state[layer_id], lstm_size[layer_id], name="state5")
    hidden5 = tfcl.layer_norm(hidden5, scope="layer_norm6")
    hidden5 = tile_and_concat(hidden5, latent, concat_latent=concat_latent)
    layer_id += 1
    return hidden5, (enc0, enc1), layer_id
Exemplo n.º 27
0
    def inject_latent(self, layer, inputs, target):
        """Inject a deterministic latent based on the target frame."""
        hparams = self.hparams
        final_filters = common_layers.shape_list(layer)[-1]
        filters = hparams.hidden_size
        kernel = (4, 4)
        layer_shape = common_layers.shape_list(layer)

        def add_bits(layer, bits):
            z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
            if not hparams.complex_addn:
                return layer + z_mul
            layer *= tf.nn.sigmoid(z_mul)
            z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
            layer += z_add
            return layer

        if not self.is_training:
            if hparams.full_latent_tower:
                rand = tf.random_uniform(layer_shape[:-1] +
                                         [hparams.bottleneck_bits])
                bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
            else:
                bits, _ = discretization.predict_bits_with_lstm(
                    layer,
                    hparams.latent_predictor_state_size,
                    hparams.bottleneck_bits,
                    temperature=hparams.latent_predictor_temperature)
                bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
            return add_bits(layer, bits), 0.0

        # Embed.
        frames = tf.concat(inputs + [target], axis=-1)
        x = tfl.dense(
            frames,
            filters,
            name="latent_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        if hparams.full_latent_tower:
            for i in range(hparams.num_compress_steps):
                with tf.variable_scope("latent_downstride%d" % i):
                    x = common_layers.make_even_size(x)
                    if i < hparams.filter_double_steps:
                        filters *= 2
                    x = common_attention.add_timing_signal_nd(x)
                    x = tfl.conv2d(x,
                                   filters,
                                   kernel,
                                   activation=common_layers.belu,
                                   strides=(2, 2),
                                   padding="SAME")
                    x = common_layers.layer_norm(x)
        else:
            x = common_layers.double_discriminator(x)
            x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

        bits, bits_clean = discretization.tanh_discrete_bottleneck(
            x, hparams.bottleneck_bits, hparams.bottleneck_noise,
            hparams.discretize_warmup_steps, hparams.mode)
        if not hparams.full_latent_tower:
            _, pred_loss = discretization.predict_bits_with_lstm(
                layer,
                hparams.latent_predictor_state_size,
                hparams.bottleneck_bits,
                target_bits=bits_clean)

        return add_bits(layer, bits), pred_loss
Exemplo n.º 28
0
    def network(self):
        def middle_network(layer):
            # Run a stack of convolutions.
            x = layer
            kernel1 = (3, 3)
            filters = common_layers.shape_list(x)[-1]
            for i in range(2):
                with tf.variable_scope("layer%d" % i):
                    y = tf.nn.dropout(x, 1.0 - 0.5)
                    y = tf.layers.conv2d(y,
                                         filters,
                                         kernel1,
                                         activation=self.activation_fn,
                                         strides=(1, 1),
                                         padding="SAME")
                    if i == 0:
                        x = y
                    else:
                        x = common_layers.layer_norm(x + y)
            return x

        batch_size = tf.shape(self.states_ph)[0]

        filters = self.hidden_size
        kernel2 = (4, 4)
        action = self.actions_oph  #[0] NOTE - might remove this

        # Normalize states
        if self.n_envs > 1:
            states = [
                common_layers.standardize_images(self.states_ph[i, :, :, :])
                for i in range(self.n_envs)
            ]
            stacked_states = tf.stack(states)
        else:
            stacked_states = common_layers.standardize_images(self.states_ph)
        inputs_shape = common_layers.shape_list(stacked_states)

        # Using non-zero bias initializer below for edge cases of uniform inputs.
        x = tf.layers.dense(
            stacked_states,
            filters,
            name="inputs_embed",
            bias_initializer=tf.random_normal_initializer(stddev=0.01))
        x = common_attention.add_timing_signal_nd(x)

        # Down-stride.
        layer_inputs = [x]
        for i in range(self.layers):
            with tf.variable_scope("downstride%d" % i):
                layer_inputs.append(x)
                x = tf.nn.dropout(x, 1.0 - self.dropout_p)
                x = common_layers.make_even_size(x)
                if i < 2:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tf.layers.conv2d(x,
                                     filters,
                                     kernel2,
                                     activation=self.activation_fn,
                                     strides=(2, 2),
                                     padding="SAME")
                x = common_layers.layer_norm(x)

        if self.is_policy:
            with tf.variable_scope("policy"):
                x_flat = tf.layers.flatten(x)
                policy_pred = tf.layers.dense(x_flat, self.action_dim)
                value_pred = tf.layers.dense(x_flat, 1)
                value_pred = tf.squeeze(value_pred, axis=-1)
        else:
            policy_pred, value_pred = None, None

        #if self.has_actions:
        x = inject_additional_input(x, action, "action_enc", "multi_additive")

        # Normalize target states. No latent is injected here; that is only
        # done in the stochastic models.
        target_states = common_layers.standardize_images(self.target_states)

        x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
        x = middle_network(x)

        # Up-convolve.
        layer_inputs = list(reversed(layer_inputs))
        for i in range(self.layers):
            with tf.variable_scope("upstride%d" % i):
                x = tf.nn.dropout(x, 1.0 - 0.1)
                if i >= self.layers - 2:
                    filters //= 2
                x = tf.layers.conv2d_transpose(x,
                                               filters,
                                               kernel2,
                                               activation=self.activation_fn,
                                               strides=(2, 2),
                                               padding="SAME")
                y = layer_inputs[i]
                shape = common_layers.shape_list(y)
                x = x[:, :shape[1], :shape[2], :]
                x = common_layers.layer_norm(x + y)
                x = common_attention.add_timing_signal_nd(x)

        # Cut down to original size.
        x = x[:, :inputs_shape[1], :inputs_shape[2], :]
        x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)

        x = tf.layers.dense(x, self.depth, name="logits")

        reward_pred = None
        if self.has_rewards:
            # Reward prediction based on middle and final logits.
            reward_pred = tf.concat([x_mid, x_fin], axis=-1)
            reward_pred = tf.nn.relu(
                tf.layers.dense(reward_pred, 128, name="reward_pred"))
            reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove height dim.
            reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove width dim (now axis 1).

        return x, reward_pred, policy_pred, value_pred