Example No. 1
  def test_encoder_decoder(self):
    with tf.Graph().as_default():
      hparams = glow.glow_hparams()
      hparams.n_levels = 3
      hparams.depth = 6
      rng = np.random.RandomState(0)
      x_np = rng.rand(1, 64, 64, 4)
      x_t = tf.convert_to_tensor(x_np, dtype=tf.float32)
      init_ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
      with arg_scope(init_ops, init=True):
        x_inv, _, eps, z_levels, _ = glow_ops.encoder_decoder(
            "encoder_decoder", x_t, hparams, reverse=False)
      x_inv_inv, _, z_inv_levels, _ = glow_ops.encoder_decoder(
          "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)

      with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        x_inv_np = session.run(x_inv)
        z_levels_np, z_inv_levels_np, x_inv_inv_np = session.run(
            [z_levels, z_inv_levels, x_inv_inv])
        diff = x_inv_inv_np - x_np
        self.assertLen(z_levels_np, 2)
        self.assertLen(z_inv_levels_np, 2)
        # (h_i, w_i, c_i) = (h_{i-1}/f, w_{i-1}/f, c_{i-1}*(2f)/2) where (f=2)
        self.assertEqual(z_levels_np[0].shape, (1, 32, 32, 8))
        self.assertEqual(z_levels_np[1].shape, (1, 16, 16, 16))
        self.assertEqual(z_inv_levels_np[0].shape, (1, 32, 32, 8))
        self.assertEqual(z_inv_levels_np[1].shape, (1, 16, 16, 16))
        self.assertEqual(x_inv_np.shape, (1, 8, 8, 64))
        self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
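A note on the shape comment above (the same recursion also governs Examples 2 and 3): a minimal sketch, plain Python and not part of glow_ops, that walks the per-level rule with squeeze factor f = 2:

def level_shapes(h, w, c, n_levels, f=2):
  # (h_i, w_i, c_i) = (h_{i-1} / f, w_{i-1} / f, c_{i-1} * (2 * f) / 2)
  shapes = []
  for _ in range(n_levels - 1):
    h, w, c = h // f, w // f, c * (2 * f) // 2
    shapes.append((h, w, c))
  return shapes

print(level_shapes(64, 64, 4, 3))  # [(32, 32, 8), (16, 16, 16)]

For the (1, 64, 64, 4) input above with n_levels = 3, this reproduces the two z_levels shapes asserted in the test.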
Example No. 2
    def test_encoder_decoder(self):
        with tf.Graph().as_default():
            hparams = glow.glow_hparams()
            hparams.n_levels = 3
            hparams.depth = 2

            x = tf.random_uniform(shape=(16, 64, 64, 4), seed=0)
            x_inv, _, eps, z_levels, _ = glow_ops.encoder_decoder(
                "encoder_decoder", x, hparams, reverse=False)
            x_inv_inv, _, z_inv_levels, _ = glow_ops.encoder_decoder(
                "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                diff, x_inv_np, z_levels_np, z_inv_levels_np = session.run(
                    [x - x_inv_inv, x_inv, z_levels, z_inv_levels])

                self.assertLen(z_levels_np, 2)
                self.assertLen(z_inv_levels_np, 2)
                # (h_i, w_i, c_i) = (h_{i-1}/f, w_{i-1}/f, c_{i-1}*(2f)/2) where (f=2)
                self.assertEqual(z_levels_np[0].shape, (16, 32, 32, 8))
                self.assertEqual(z_levels_np[1].shape, (16, 16, 16, 16))
                self.assertEqual(z_inv_levels_np[0].shape, (16, 32, 32, 8))
                self.assertEqual(z_inv_levels_np[1].shape, (16, 16, 16, 16))
                self.assertEqual(x_inv_np.shape, (16, 8, 8, 64))
                self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
Example No. 3
  def test_encoder_decoder(self):
    with tf.Graph().as_default():
      hparams = glow.glow_hparams()
      hparams.n_levels = 3
      hparams.depth = 2

      x = tf.random_uniform(shape=(16, 64, 64, 4), seed=0)
      x_inv, _, eps, z_levels, _ = glow_ops.encoder_decoder(
          "encoder_decoder", x, hparams, reverse=False)
      x_inv_inv, _, z_inv_levels, _ = glow_ops.encoder_decoder(
          "encoder_decoder", x_inv, hparams, eps=eps, reverse=True)

      with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        diff, x_inv_np, z_levels_np, z_inv_levels_np = session.run(
            [x - x_inv_inv, x_inv, z_levels, z_inv_levels])

        self.assertLen(z_levels_np, 2)
        self.assertLen(z_inv_levels_np, 2)
        # (h_i, w_i, c_i) = (h_{i-1}/f, w_{i-1}/f, c_{i-1}*(2f)/2) where (f=2)
        self.assertEqual(z_levels_np[0].shape, (16, 32, 32, 8))
        self.assertEqual(z_levels_np[1].shape, (16, 16, 16, 16))
        self.assertEqual(z_inv_levels_np[0].shape, (16, 32, 32, 8))
        self.assertEqual(z_inv_levels_np[1].shape, (16, 16, 16, 16))
        self.assertEqual(x_inv_np.shape, (16, 8, 8, 64))
        self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
Example No. 4
    def test_encoder_decoder_practical_usage(self):
        """Tests the following sequence of operations.

    1. Define forward network with arg_scope(init=True).
    2. Run one-forward pass to do data-dependent initialization and save.
    3. Define forward and reverse network with arg_scope(init=False)
    4. Check that reverse(forward(x)) == x
    """
        hparams = glow.glow_hparams()
        hparams.n_levels = 2
        hparams.depth = 12

        with tf.Graph().as_default():
            rng = np.random.RandomState(0)
            x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
            x_t = tf.convert_to_tensor(x_rand)

            ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
            with arg_scope(ops, init=True):
                x_inv, _, _, _ = glow_ops.encoder_decoder("revnet",
                                                          x_t,
                                                          hparams,
                                                          reverse=False)
            curr_dir = tempfile.mkdtemp()
            model_path = os.path.join(curr_dir, "model")

            with tf.Session() as session:
                saver = tf.train.Saver()
                session.run(tf.global_variables_initializer())
                session.run(x_inv)
                saver.save(session, model_path)

        with tf.Graph().as_default():
            rng = np.random.RandomState(0)
            x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
            x_t = tf.convert_to_tensor(x_rand)
            ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
            with arg_scope(ops, init=False):
                x_inv2, _, all_eps, _ = glow_ops.encoder_decoder("revnet",
                                                                 x_t,
                                                                 hparams,
                                                                 reverse=False)
                x_inv_inv_, _ = glow_ops.encoder_decoder("revnet",
                                                         x_inv2,
                                                         hparams,
                                                         eps=all_eps,
                                                         reverse=True)

            with tf.Session() as session:
                saver = tf.train.Saver()
                saver.restore(session, model_path)
                x_inv_inv_np = session.run(x_inv_inv_)
                diff = np.abs(x_inv_inv_np - x_rand)
                self.assertTrue(np.allclose(diff, 0.0, atol=1e-3))
Example No. 5
    def glow_encoder(self,
                     frame,
                     condition=False,
                     cond_latents=None,
                     init=False):
        """Glow network that encodes frame to a hierarchy of latents.

    Args:
      frame: 5-D Tensor of shape (batch_size, 1, height, width, channels).
      condition: Whether or not to condition on cond_latents.
      cond_latents: optional, list of tensors with length equal to
                    hparams.n_levels - 1. If provided, the latent at level l is
                    conditioned on the cond_latent at level l.
      init: Whether the given batch is an "init" batch or a "train" batch.
    Returns:
      objective: log-likelihood of the frame per the model.
      z_top: top-level latent.
      z_levels: a list of tensors with latents at all levels.
    """
        frame = self.squeeze_video(frame, init=init)
        frame = self.preprocess(frame)
        frame, objective = glow_ops.uniform_binning_correction(frame)

        glow_vals = glow_ops.encoder_decoder("codec",
                                             frame,
                                             self.hparams,
                                             eps=None,
                                             reverse=False,
                                             cond_latents=cond_latents,
                                             states=self.level_states,
                                             condition=condition)
        z_top, encoder_objective, self.eps, z_levels, self.level_states = glow_vals
        objective += encoder_objective
        return objective, z_top, z_levels
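A hypothetical usage sketch of the conditioning contract described in the docstring (the model and frame names are assumed for illustration, not taken from the source). With hparams.n_levels = 3, cond_latents must be a list of n_levels - 1 = 2 tensors, one per non-top level:

# Hypothetical: condition the current frame's latents on the per-level
# latents of the previous frame.
# _, _, prev_z_levels = model.glow_encoder(prev_frame, init=False)
# objective, z_top, z_levels = model.glow_encoder(
#     frame, condition=True, cond_latents=prev_z_levels)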
Example No. 6
    def body(self, features):
        x = features["inputs"]

        # Scale x such that the pixels lie between -0.5 and 0.5.
        x = self.preprocess(x)
        x, objective = glow_ops.uniform_binning_correction(x)

        # The arg_scope call ensures that the actnorm parameters are set such that
        # the per-channel output activations have zero mean and unit variance
        # ONLY during the first step. After that the parameters are learned
        # through optimisation.
        global_step = tf.train.get_or_create_global_step()
        init_op = tf.logical_and(tf.equal(global_step, 0), self.is_training)
        ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
        with arg_scope(ops, init=init_op):
            self.z, encoder_objective, self.eps, _ = glow_ops.encoder_decoder(
                "codec", x, self.hparams, eps=None, reverse=False)
            objective += encoder_objective

            prior_objective, prior_dist = self.top_prior(self.z)
            tf.summary.scalar("top_prior", tf.reduce_mean(prior_objective))
            self.z_sample = prior_dist.sample()
            objective += prior_objective

        # bits per pixel
        _, h, w, c = common_layers.shape_list(x)
        objective = -objective / (np.log(2) * h * w * c)
        return tf.zeros_like(features["targets"]), {"training": objective}
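The actnorm comment above can be illustrated outside TensorFlow. A minimal NumPy sketch of what data-dependent initialization computes; this is the idea only, not the glow_ops implementation:

import numpy as np

def actnorm_ddi(x, eps=1e-6):
  # Pick bias and scale from the first batch so that post-actnorm
  # activations have zero mean and unit variance per channel.
  mean = x.mean(axis=(0, 1, 2))
  std = x.std(axis=(0, 1, 2))
  return -mean, 1.0 / (std + eps)

x = np.random.RandomState(0).rand(16, 8, 8, 4)
bias, scale = actnorm_ddi(x)
y = (x + bias) * scale
print(y.mean(axis=(0, 1, 2)).round(6))  # ~[0. 0. 0. 0.]
print(y.std(axis=(0, 1, 2)).round(3))   # ~[1. 1. 1. 1.]

After this first-step initialization, bias and scale are trained like ordinary variables, which is exactly what the init_op gating above arranges.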
Example No. 7
def latents_to_frames(z_top_interp, level_eps_interp, hparams):
  """Decodes latents to frames."""
  # Decode [z^1_t, z^2_t .. z^l_t] to [X_t]
  images, _, _, _ = glow_ops.encoder_decoder(
      "codec", z_top_interp, hparams, eps=level_eps_interp, reverse=True)
  images = glow_ops.postprocess(images)
  return images
Example No. 8
  def test_encoder_decoder_practical_usage(self):
    """Tests the following sequence of operations.

    1. Define forward network with arg_scope(init=True).
    2. Run one forward pass to do data-dependent initialization and save.
    3. Define forward and reverse network with arg_scope(init=False).
    4. Check that reverse(forward(x)) == x.
    """
    hparams = glow.glow_hparams()
    hparams.n_levels = 2
    hparams.depth = 12

    with tf.Graph().as_default():
      rng = np.random.RandomState(0)
      x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
      x_t = tf.convert_to_tensor(x_rand)

      ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
      with arg_scope(ops, init=True):
        x_inv, _, _, _, _ = glow_ops.encoder_decoder(
            "revnet", x_t, hparams, reverse=False)
      curr_dir = tempfile.mkdtemp()
      model_path = os.path.join(curr_dir, "model")

      with tf.Session() as session:
        saver = tf.train.Saver()
        session.run(tf.global_variables_initializer())
        session.run(x_inv)
        saver.save(session, model_path)

    with tf.Graph().as_default():
      rng = np.random.RandomState(0)
      x_rand = np.asarray(rng.rand(1, 4, 4, 4), dtype=np.float32)
      x_t = tf.convert_to_tensor(x_rand)
      ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
      with arg_scope(ops, init=False):
        x_inv2, _, all_eps, _, _ = glow_ops.encoder_decoder(
            "revnet", x_t, hparams, reverse=False)
        x_inv_inv_, _, _, _ = glow_ops.encoder_decoder(
            "revnet", x_inv2, hparams, eps=all_eps, reverse=True)

      with tf.Session() as session:
        saver = tf.train.Saver()
        saver.restore(session, model_path)
        x_inv_inv_np = session.run(x_inv_inv_)
        diff = np.abs(x_inv_inv_np - x_rand)
        self.assertTrue(np.allclose(diff, 0.0, atol=1e-3))
Example No. 9
def frame_to_latents(frame, hparams):
  """Encode frames to latents."""
  # Preprocess
  frame = preprocess_frame(frame)

  # Encode [X_t] to [z^1_t, z^2_t .. z^l_t]
  glow_vals = glow_ops.encoder_decoder(
      "codec", frame, hparams, eps=None, reverse=False)
  z_top, _, level_eps, _, _ = glow_vals
  return z_top, level_eps
Example No. 10
    def test_encoder_decoder(self):
        with tf.Graph().as_default():
            hparams = glow_ops.glow_hparams()
            hparams.n_levels = 2
            hparams.depth = 2

            x = tf.random_uniform(shape=(16, 64, 64, 4), seed=0)
            x_inv, _, eps = glow_ops.encoder_decoder("encoder_decoder",
                                                     x,
                                                     hparams,
                                                     reverse=False)
            x_inv_inv, _ = glow_ops.encoder_decoder("encoder_decoder",
                                                    x_inv,
                                                    hparams,
                                                    eps=eps,
                                                    reverse=True)

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                diff, x_inv_np = session.run([x - x_inv_inv, x_inv])
                self.assertEqual(x_inv_np.shape, (16, 8, 8, 64))
                self.assertTrue(np.allclose(diff, 0.0, atol=1e-2))
Example No. 11
  def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
    del args, kwargs
    x = features["inputs"]
    batch_size = common_layers.shape_list(x)[0]
    features["targets"] = tf.zeros(shape=(batch_size, 1, 1, 1))
    _, _ = self(features)  # pylint: disable=not-callable

    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    var_scope = tf.variable_scope("glow/body", reuse=True)
    # If eps=None, images are sampled from the prior.
    with arg_scope(ops, init=False), var_scope:
      predictions, _, _, _ = glow_ops.encoder_decoder(
          "codec", self.z_sample, self.hparams, eps=None, reverse=True)

    return self.scale(predictions)
Example No. 12
  def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
    del args, kwargs
    x = features["inputs"]
    batch_size = common_layers.shape_list(x)[0]
    features["targets"] = tf.zeros(shape=(batch_size, 1, 1, 1))
    _, _ = self(features)  # pylint: disable=not-callable

    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    var_scope = tf.variable_scope("glow/body", reuse=True)
    # If eps=None, images are sampled from the prior.
    with arg_scope(ops, init=False), var_scope:
      predictions, _, _, _ = glow_ops.encoder_decoder(
          "codec", self.z_sample, self.hparams, eps=None, reverse=True,
          temperature=self.temperature)

    return self.scale(predictions)
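A minimal sketch of what the temperature argument does, assuming the usual Glow convention of scaling the prior's standard deviation (NumPy illustration, not the glow_ops implementation):

import numpy as np

def sample_prior(mu, sigma, temperature, rng):
  # Lower temperature shrinks the prior's spread: less sample
  # diversity, typically higher fidelity.
  return mu + sigma * temperature * rng.standard_normal(mu.shape)

rng = np.random.RandomState(0)
mu, sigma = np.zeros((1, 8, 8, 64)), np.ones((1, 8, 8, 64))
z = sample_prior(mu, sigma, temperature=0.7, rng=rng)
print(round(float(z.std()), 2))  # ~0.7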
Example No. 13
    def objective_tower(self, features, init=True):
        """Objective in terms of bits-per-pixel.

    Args:
      features: dict of tensors with "features" and "targets" keys.
      init: Whether or not to run data-dependent init.
    Returns:
      objective: float, bits-per-pixel.
    """
        x = features["inputs"]

        # Scale x such that the pixels lie between -0.5 and 0.5.
        x = self.preprocess(x)
        x, objective = glow_ops.uniform_binning_correction(x)

        # The arg_scope call ensures that the actnorm parameters are set such that
        # the per-channel output activations have zero mean and unit variance
        # ONLY during the first step. After that the parameters are learned
        # through optimisation.
        ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
        with arg_scope(ops, init=init):
            self.z, encoder_objective, self.eps, _, _ = glow_ops.encoder_decoder(
                "codec", x, self.hparams, eps=None, reverse=False)
            objective += encoder_objective

            self.z_top_shape = common_layers.shape_list(self.z)
            prior_dist = self.top_prior()
            prior_objective = tf.reduce_sum(prior_dist.log_prob(self.z),
                                            axis=[1, 2, 3])
            self.z_sample = prior_dist.sample()
            objective += prior_objective

        # bits per pixel
        _, h, w, c = common_layers.shape_list(x)
        objective = -objective / (np.log(2) * h * w * c)
        return objective
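A worked instance of the bits-per-pixel conversion at the end of objective_tower (numbers hypothetical): the log-likelihood in nats is divided by log(2) to convert to bits, then by h * w * c to normalize per sub-pixel.

import numpy as np

log_likelihood_nats = -10000.0  # hypothetical per-image log-likelihood
h, w, c = 32, 32, 3
bits_per_pixel = -log_likelihood_nats / (np.log(2) * h * w * c)
print(round(bits_per_pixel, 2))  # 4.7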
Example No. 14
    def body(self, features):
        x = features["inputs"]

        # Scale x such that the pixels lie between -0.5 and 0.5.
        x = self.preprocess(x)

        n_bins = 2**self.hparams.n_bits_x
        batch_size, height, width, n_channels = common_layers.shape_list(x)
        hwc = float(height * width * n_channels)

        x = x + tf.random_uniform(
            shape=(batch_size, height, width, n_channels),
            minval=0.0,
            maxval=1.0 / n_bins)
        objective = -np.log(n_bins) * hwc * tf.ones(batch_size)

        # The arg_scope call ensures that the actnorm parameters are set such that
        # the per-channel output activations have zero mean and unit variance
        # ONLY during the first step. After that the parameters are learned
        # through optimisation.
        global_step = tf.train.get_or_create_global_step()
        init_op = tf.logical_and(tf.equal(global_step, 0), self.is_training)
        ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
        with arg_scope(ops, init=init_op):
            self.z, encoder_objective, self.eps = glow_ops.encoder_decoder(
                "codec", x, self.hparams, eps=None, reverse=False)
            objective += encoder_objective

            prior_objective, prior_dist = glow_ops.top_prior(
                "top_prior", self.z, learn_prior=self.hparams.learn_prior)
            self.z_sample = prior_dist.sample()
            objective += prior_objective

        # bits per pixel
        objective = -objective / (np.log(2) * hwc)
        return tf.zeros_like(features["targets"]), {"training": objective}
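The dequantization step in this body can be sketched in NumPy (illustration only): adding uniform noise of width 1/n_bins turns the discrete pixel values into a continuous density, and the change of variables contributes -log(n_bins) per dimension to the objective.

import numpy as np

n_bins = 2 ** 5  # n_bits_x = 5
rng = np.random.RandomState(0)
x = np.floor(rng.rand(2, 4, 4, 3) * n_bins) / n_bins  # quantized pixels
x = x + rng.uniform(0.0, 1.0 / n_bins, size=x.shape)  # dequantized
hwc = float(4 * 4 * 3)
print(-np.log(n_bins) * hwc)  # ~-166.4 nats per image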
Example No. 15
  def objective_tower(self, features, init=True):
    """Objective in terms of bits-per-pixel.

    Args:
      features: dict of tensors with "inputs" and "targets" keys.
      init: Whether or not to run data-dependent init.
    Returns:
      objective: float, bits-per-pixel.
    """
    x = features["inputs"]

    # Scale x such that the pixels lie between -0.5 and 0.5.
    x = self.preprocess(x)
    x, objective = glow_ops.uniform_binning_correction(x)

    # The arg_scope call ensures that the actnorm parameters are set such that
    # the per-channel output activations have zero mean and unit variance
    # ONLY during the first step. After that the parameters are learned
    # through optimisation.
    ops = [glow_ops.get_variable_ddi, glow_ops.actnorm]
    with arg_scope(ops, init=init):
      self.z, encoder_objective, self.eps, _, _ = glow_ops.encoder_decoder(
          "codec", x, self.hparams, eps=None, reverse=False)
      objective += encoder_objective

      self.z_top_shape = common_layers.shape_list(self.z)
      prior_dist = self.top_prior()
      prior_objective = tf.reduce_sum(
          prior_dist.log_prob(self.z), axis=[1, 2, 3])
      self.z_sample = prior_dist.sample()
      objective += prior_objective

    # bits per pixel
    _, h, w, c = common_layers.shape_list(x)
    objective = -objective / (np.log(2) * h * w * c)
    return objective
Example No. 16
    def infer(self, features, *args, **kwargs):  # pylint: disable=arguments-differ
        del args, kwargs

        # Make a copy of features that can be used in the call to self
        # that builds the graph.
        new_features = {}
        new_features["inputs"] = features["inputs"]
        new_features["targets"] = features["infer_targets"]
        _, _ = self(new_features)  # pylint: disable=not-callable

        if self.hparams.gen_mode == "unconditional":
            num_target_frames = 1
        else:
            num_target_frames = self.hparams.video_num_target_frames

        ops = [
            glow_ops.get_variable_ddi, glow_ops.actnorm, glow_ops.get_dropout
        ]
        var_scope = tf.variable_scope("next_frame_glow/body", reuse=True)
        all_frames = []

        # If eps=None, images are sampled from the prior.
        with arg_scope(ops, init=False), var_scope:
            for target_frame in range(1, num_target_frames + 1):

                # subscript -> timestep, superscript -> level.
                # self.z_sample equals z^0_{t} (top-level latent)
                # (X_{t}, z^{1..l}_{t}) = Glow(z^0_{t}, z^{1..l}_{t-1})
                # Get current set of cond_latents.
                cond_level, cond_level_latents = get_cond_latents(
                    self.all_level_latents, self.hparams)

                glow_vals = glow_ops.encoder_decoder(
                    "codec",
                    self.z_sample,
                    self.hparams,
                    eps=None,
                    reverse=True,
                    cond_latents=cond_level_latents,
                    states=self.level_states,
                    condition=cond_level,
                    temperature=self.temperature)
                predicted_frame, _, curr_latents, self.level_states = glow_vals
                all_frames.append(predicted_frame)
                self.all_level_latents.append(curr_latents)

                # Compute z^0_{t+1} = f(z^0_{t})
                if target_frame < num_target_frames:
                    cond_top, cond_top_latents = get_cond_latents(
                        self.all_top_latents, self.hparams)
                    prior_dist = self.top_prior(condition=cond_top,
                                                cond_latents=cond_top_latents)
                    self.z_sample = prior_dist.sample()
                    self.all_top_latents.append(self.z_sample)

        all_frames = tf.stack(all_frames)
        predicted_video = common_video.swap_time_and_batch_axes(all_frames)

        # The video-decode API requires the predicted video to be the same shape
        # as the target-video. Hence, for unconditional generation,
        # tile across time to ensure same shape.
        if self.hparams.gen_mode == "unconditional":
            predicted_video = tf.tile(
                predicted_video,
                [1, self.hparams.video_num_target_frames, 1, 1, 1])
        predicted_video = glow_ops.postprocess(predicted_video)

        # Output of a single decode / sample.
        output_features = {}
        output_features["targets"] = tf.zeros_like(predicted_video)
        output_features["outputs"] = predicted_video
        output_features["scores"] = tf.zeros_like(predicted_video)
        return output_features
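The loop above chains the top-level prior through time: decode a frame from the current top latent, then sample the next top latent conditioned on latent history. A hedged structural sketch of the same control flow (plain Python; function names hypothetical):

def sample_video(num_frames, sample_top_prior, decode_frame):
  # Each step decodes a frame from the current top-level latent, then
  # samples the next top-level latent from the history-conditioned prior.
  frames, top_latents = [], [sample_top_prior(history=[])]
  for t in range(num_frames):
    frames.append(decode_frame(top_latents[-1]))
    if t + 1 < num_frames:
      top_latents.append(sample_top_prior(history=top_latents))
  return frames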