Example No. 1
  def model_fn(self, features, labels, params, mode):
    """Constructs the model for the given features and mode.

    Args:
      features: A dictionary with the feature tensors.
      labels: Tensor with labels. Will be None if mode is PREDICT.
      params: Dictionary with hyperparameters passed to TPUEstimator.
          Additionally, TPUEstimator will set 3 keys: `batch_size`, `use_tpu`,
          `tpu_context`. `batch_size` is the batch size for this core.
      mode: `tf.estimator.ModeKeys` value (TRAIN, EVAL, PREDICT). The mode
          should be passed to the TPUEstimatorSpec and your model should be
          built for this mode.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec`.
    """
    logging.info("model_fn(): features=%s, labels=%s,mode=%s, params=%s",
                 features, labels, mode, params)
    if mode != tf.estimator.ModeKeys.TRAIN:
      raise ValueError("Only training mode is supported.")

    use_tpu = params["use_tpu"]
    unroll_graph = self._experimental_force_graph_unroll or use_tpu
    num_sub_steps = self._get_num_sub_steps(unroll_graph=unroll_graph)
    if unroll_graph:
      logging.warning("Graph will be unrolled.")
    if self._experimental_joint_gen_for_disc and not unroll_graph:
      raise ValueError("Joining G forward passes is only supported for ",
                       "unrolled graphs.")

    # Clean old summaries from previous calls to model_fn().
    self._tpu_summary = tpu_summaries.TpuSummaries(self._model_dir)

    # Get features for each sub-step.
    fs, ls = self._split_inputs_and_generate_samples(
        features, labels, num_sub_steps=num_sub_steps)

    disc_optimizer = self.get_disc_optimizer(params["use_tpu"])
    disc_step = tf.get_variable(
        "global_step_disc", [], dtype=tf.int32, trainable=False)
    train_disc_fn = functools.partial(
        self._train_discriminator,
        step=disc_step,
        optimizer=disc_optimizer,
        params=params)

    gen_optimizer = self.get_gen_optimizer(params["use_tpu"])
    gen_step = tf.train.get_or_create_global_step()
    train_gen_fn = functools.partial(
        self._train_generator,
        features=fs[-1],
        labels=ls[-1],
        step=gen_step,
        optimizer=gen_optimizer,
        params=params)

    if not unroll_graph and self._disc_iters != 1:
      train_fn = train_gen_fn
      train_gen_fn = lambda: tf.cond(
          tf.equal(disc_step % self._disc_iters, 0), train_fn, lambda: 0.0)

    # Train D.
    d_losses = []
    d_steps = self._disc_iters if unroll_graph else 1
    for i in range(d_steps):
      with tf.name_scope("disc_step_{}".format(i + 1)):
        with tf.control_dependencies(d_losses):
          d_losses.append(train_disc_fn(features=fs[i], labels=ls[i]))

    # Train G.
    with tf.control_dependencies(d_losses):
      with tf.name_scope("gen_step"):
        g_loss = train_gen_fn()

    for i, d_loss in enumerate(d_losses):
      self._tpu_summary.scalar("loss/d_{}".format(i), d_loss)
    self._tpu_summary.scalar("loss/g", g_loss)
    self._add_images_to_summary(fs[0]["generated"], "fake_images", params)
    self._add_images_to_summary(fs[0]["images"], "real_images", params)

    self._check_variables()
    utils.log_parameter_overview(self.generator.trainable_variables,
                                 msg="Generator variables:")
    utils.log_parameter_overview(self.discriminator.trainable_variables,
                                 msg="Discriminator variables:")

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        host_call=self._tpu_summary.get_host_call(),
        # Estimator requires a loss which gets displayed on TensorBoard.
        # The given Tensor is evaluated but not used to create gradients.
        loss=d_losses[0],
        train_op=g_loss.op)
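
The tf.cond gating near the end of this model_fn (used when the graph is not unrolled) can be shown in isolation. A minimal sketch, assuming TensorFlow 1.x; `train_gen_fn` and the step counter are illustrative stand-ins for the partials built above:

import tensorflow as tf  # assumes TensorFlow 1.x

def gated_generator_step(train_gen_fn, disc_step, disc_iters):
  """Runs `train_gen_fn` only on every `disc_iters`-th discriminator step.

  Both branches of tf.cond must return tensors of matching dtype/shape, so
  the skip branch returns a constant 0.0 in place of the generator loss.
  """
  return tf.cond(
      tf.equal(disc_step % disc_iters, 0),
      true_fn=train_gen_fn,
      false_fn=lambda: tf.constant(0.0))

# Hypothetical usage mirroring the pattern above:
# disc_step = tf.get_variable("global_step_disc", [], dtype=tf.int32,
#                             trainable=False)
# g_loss = gated_generator_step(lambda: tf.constant(1.23), disc_step, 5)
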
Example No. 2
  def __init__(self,
               dataset,
               parameters,
               model_dir,
               deprecated_split_disc_calls=False,
               experimental_joint_gen_for_disc=False,
               experimental_force_graph_unroll=False,
               g_use_ema=False,
               ema_decay=0.9999,
               ema_start_step=40000,
               g_optimizer_fn=tf.train.AdamOptimizer,
               d_optimizer_fn=None,
               g_lr=0.0002,
               d_lr=None,
               conditional=False,
               fit_label_distribution=False):
    """ModularGAN  is a Gin configurable implementation of AbstractGAN.

    Graph Unrolling:
    For better performance TPUs perform multiple training steps in a single
    session run call. To utilize this we perform both D and G training in a
    single training step. The inputs to model_fn are split into multiple
    sub-steps:
    One sub-step for each discriminator training step (disc_iters) and a
    separate sub-step (with new inputs) for the generator training step.
    The configured batch size is the batch size used in a sub-step.

    Warning: Graph unrolling can increase the memory requirement and lead to
    memory issues on GPUs. Therefore it is turned off when running on GPUs, but
    can be forced on with `experimental_force_graph_unroll`.

    Args:
      dataset: `ImageDataset` object. If `conditional` the dataset must provide
        labels and the number of classes must be known.
      parameters: Legacy Python dictionary with additional parameters. This must
        have the keys 'architecture', 'z_dim' and 'lambda'.
      model_dir: Directory path for storing summary files.
      deprecated_split_disc_calls: If True pass fake and real images separately
        through the discriminator network.
      experimental_joint_gen_for_disc: If True generate fake images for all D
        iterations jointly. This increases the batch size in G when generating
        fake images for D. The G step stays the same.
      experimental_force_graph_unroll: Force unrolling of the graph as described
        above. When running on TPU the graph is always unrolled.
      g_use_ema: If True keep moving averages for weights in G and use them in
        the TF-Hub module.
      ema_decay: Decay rate for moving averages for G's weights.
      ema_start_step: Start step for keeping moving averages. Before this the
        decay rate is 0.
      g_optimizer_fn: Function (or constructor) to return an optimizer for G.
      d_optimizer_fn: Function (or constructor) to return an optimizer for D.
        If None will call `g_optimizer_fn`.
      g_lr: Learning rate for G.
      d_lr: Learning rate for D. Defaults to `g_lr`.
      conditional: Whether the GAN is conditional. If True both G and D will
        get passed labels.
      fit_label_distribution: Whether to fit the label distribution.
    """
    super(ModularGAN, self).__init__(
        dataset=dataset, parameters=parameters, model_dir=model_dir)
    self._deprecated_split_disc_calls = deprecated_split_disc_calls
    self._experimental_joint_gen_for_disc = experimental_joint_gen_for_disc
    self._experimental_force_graph_unroll = experimental_force_graph_unroll
    self._g_use_ema = g_use_ema
    self._ema_decay = ema_decay
    self._ema_start_step = ema_start_step
    self._g_optimizer_fn = g_optimizer_fn
    self._d_optimizer_fn = d_optimizer_fn
    if self._d_optimizer_fn is None:
      self._d_optimizer_fn = g_optimizer_fn
    self._g_lr = g_lr
    self._d_lr = g_lr if d_lr is None else d_lr

    if conditional and not self._dataset.num_classes:
      raise ValueError(
          "Option 'conditional' selected but dataset {} does not have "
          "labels".format(self._dataset.name))
    self._conditional = conditional
    self._fit_label_distribution = fit_label_distribution

    self._tpu_summary = tpu_summaries.TpuSummaries(model_dir)

    # Parameters that have not been ported to Gin.
    self._architecture = parameters["architecture"]
    self._z_dim = parameters["z_dim"]
    self._lambda = parameters["lambda"]

    # Number of discriminator iterations per one iteration of the generator.
    self._disc_iters = parameters.get("disc_iters", 1)
    self._force_graph_unroll = parameters.get("force_graph_unroll")

    # Will be set by create_loss().
    self.d_loss = None
    self.g_loss = None
    self.penalty_loss = None

    # Cache for discriminator and generator objects.
    self._discriminator = None
    self._generator = None
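
The "one sub-step per D iteration plus one for G" layout described in the docstring means a single model_fn call receives disc_iters + 1 batches worth of data. A minimal sketch of that splitting, assuming TensorFlow 1.x and feature tensors whose leading dimension is divisible by the number of sub-steps; it mirrors only the splitting part of _split_inputs_and_generate_samples (sample generation omitted):

import tensorflow as tf  # assumes TensorFlow 1.x

def split_into_sub_steps(features, labels, num_sub_steps):
  """Splits each feature tensor and the labels along the batch axis.

  Returns one feature dict and one label tensor per sub-step.
  """
  split = {k: tf.split(v, num_sub_steps) for k, v in features.items()}
  fs = [{k: v[i] for k, v in split.items()} for i in range(num_sub_steps)]
  ls = tf.split(labels, num_sub_steps)
  return fs, ls
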
Example No. 3
    def model_fn(self, features, labels, params, mode):
        """Constructs the model for the given features and mode.

    Args:
      features: A dictionary with the feature tensors.
      labels: Tensor with labels. Will be None if mode is PREDICT.
      params: Dictionary with hyperparameters passed to TPUEstimator.
          Additionally, TPUEstimator will set 3 keys: `batch_size`, `use_tpu`,
          `tpu_context`. `batch_size` is the batch size for this core.
      mode: `tf.estimator.ModeKeys` value (TRAIN, EVAL, PREDICT). The mode
          should be passed to the TPUEstimatorSpec and your model should be
          built for this mode.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec`.
    """
        logging.info("model_fn(): features=%s, labels=%s,mode=%s, params=%s",
                     features, labels, mode, params)

        if mode != tf.estimator.ModeKeys.TRAIN:
            raise ValueError("Only training mode is supported.")

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        global_step = tf.train.get_or_create_global_step()
        # Variable to count discriminator steps
        global_step_disc = tf.get_variable("global_step_discriminator",
                                           dtype=tf.int32,
                                           initializer=tf.constant(0),
                                           trainable=False)

        # Create ops for first D steps here to create the variables.
        with tf.name_scope("disc_step"):
            self.create_loss(features,
                             labels,
                             params,
                             is_training=is_training,
                             reuse=False)

        # Divide trainable variables into a group for D and group for G.
        t_vars = tf.trainable_variables()
        d_vars = [var for var in t_vars if "discriminator" in var.name]
        g_vars = [var for var in t_vars if "generator" in var.name]
        if len(t_vars) != len(d_vars) + len(g_vars):
            logging.error("There variables that neither part of G or D.")
        self._check_variables(t_vars, d_vars, g_vars)

        d_optimizer = self.d_optimizer(params["use_tpu"])
        g_optimizer = self.g_optimizer(params["use_tpu"])

        # In the following each sub-step (disc_iters steps on D + one step on G)
        # depends on previous sub-steps. The optimizer op for each step
        # depends on all the update ops (from batch norm etc.). Each update op
        # will still only be executed once.
        deps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Discriminator training.
        with tf.control_dependencies(deps):
            deps.append(
                d_optimizer.minimize(self.d_loss,
                                     var_list=d_vars,
                                     global_step=global_step_disc))

        # Clean old summaries from previous calls to model_fn().
        self._tpu_summary = tpu_summaries.TpuSummaries(self._model_dir)
        self._tpu_summary.scalar("loss/d", self.d_loss)
        with tf.name_scope("fake_images"):
            z = features["z"]
            sampled_y = None
            if self.conditional:
                sampled_y = self._get_one_hot_labels(
                    features["sampled_labels"])
            fake_images = self.generator(z,
                                         y=sampled_y,
                                         is_training=True,
                                         reuse=True)
        self._add_images_to_summary(fake_images, "fake_images", params)
        self._add_images_to_summary(features["images"], "real_images", params)

        # Generator training.
        with tf.name_scope("gen_step"):
            with tf.control_dependencies(deps):
                self._tpu_summary.scalar("loss/g", self.g_loss)
                deps.extend(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
            with tf.control_dependencies(deps):
                if self._disc_iters == 1:
                    train_op = g_optimizer.minimize(self.g_loss,
                                                    var_list=g_vars,
                                                    global_step=global_step)
                else:
                    # We should only train the generator every self._disc_iters steps.
                    # We can do this using `tf.cond`. Both paths must return a tensor.
                    # Our true_fn will return a tensor that depends on training the
                    # generator, while the tensor from false_fn depends on nothing.
                    def do_train_generator():
                        actual_train_op = g_optimizer.minimize(
                            self.g_loss,
                            var_list=g_vars,
                            global_step=global_step)
                        with tf.control_dependencies([actual_train_op]):
                            return tf.constant(0)

                    def do_not_train_generator():
                        return tf.constant(0)

                    train_op = tf.cond(tf.equal(
                        global_step_disc % self._disc_iters, 0),
                                       true_fn=do_train_generator,
                                       false_fn=do_not_train_generator,
                                       name="").op
                loss = self.g_loss

        if self._g_use_ema:
            with tf.name_scope("generator_ema"):
                logging.info("Creating moving averages of weights: %s", g_vars)

                def do_update_ema():
                    # The decay value is set to 0 if we're before the moving-average start
                    # point, so that the EMA vars will be the normal vars.
                    decay = self._ema_decay * tf.cast(
                        tf.greater_equal(global_step, self._ema_start_step),
                        tf.float32)
                    ema = tf.train.ExponentialMovingAverage(decay=decay)
                    return ema.apply(g_vars)

                def do_not_update_ema():
                    return tf.constant(0).op

                with tf.control_dependencies([train_op]):
                    train_op = tf.cond(tf.equal(
                        global_step_disc % self._disc_iters, 0),
                                       true_fn=do_update_ema,
                                       false_fn=do_not_update_ema,
                                       name="")

        d_param_overview = utils.get_parameter_overview(d_vars, limit=None)
        g_param_overview = utils.get_parameter_overview(g_vars, limit=None)
        logging.info("Discriminator variables:\n%s", d_param_overview)
        logging.info("Generator variables:\n%s", g_param_overview)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            host_call=self._tpu_summary.get_host_call(),
            # Estimator requires a loss which gets displayed on TensorBoard.
            # The given Tensor is evaluated but not used to create gradients.
            loss=loss,
            train_op=train_op)
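
The EMA branch above gates two things: the decay rate (zero before ema_start_step) and whether the update runs at all (only on generator steps). A standalone sketch of that conditional update, assuming TensorFlow 1.x; the function and its arguments are illustrative, not part of the class above:

import tensorflow as tf  # assumes TensorFlow 1.x

def conditional_ema_update(g_vars, global_step, global_step_disc,
                           disc_iters, ema_decay, ema_start_step):
    """Applies an EMA update to g_vars only on generator training steps."""
    def do_update_ema():
        # Decay is 0 before ema_start_step, so the EMA vars track the raw vars.
        decay = ema_decay * tf.cast(
            tf.greater_equal(global_step, ema_start_step), tf.float32)
        ema = tf.train.ExponentialMovingAverage(decay=decay)
        with tf.control_dependencies([ema.apply(g_vars)]):
            return tf.constant(True)

    def skip_update():
        return tf.constant(False)

    # Both branches return a bool tensor; the True branch additionally carries
    # a control dependency on the EMA apply op.
    return tf.cond(
        tf.equal(global_step_disc % disc_iters, 0),
        true_fn=do_update_ema,
        false_fn=skip_update)
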
Example No. 4
def model_fn(features, labels, mode, params, vocab):
    """Model function that satisfies the Estimator API.

  Args:
    features: Dictionary of model input tensors.
    labels: Unused.
    mode: A tf.estimator.ModeKeys value.
    params: Dictionary of model parameters.
    vocab: A utils.text_utils.Vocab instance.

  Returns:
    spec: A tf.estimator.TPUEstimatorSpec.
  """
    del labels

    # ----------------------------------------------------------------------------
    # INITIALIZATION.
    # ----------------------------------------------------------------------------

    # Update model config from the pre-trained checkpoint.
    model = transformer_utils.TransformerModel(
        config=transformer_utils.TransformerConfig.from_dict(params),
        is_training=(mode == tf_estimator.ModeKeys.TRAIN))

    # Initialize QA model.
    rc_model = hub.Module(params["rc_model"])

    # image_features: [batch_size, num_regions, feature_size]
    # image_positions: [batch_size, num_regions]
    # image_mask: [batch_size, num_regions]
    image_features = features["object_features"].features
    image_positions = features["object_features"].positions
    image_mask = features["object_features"].mask

    # Expand mask by 1 to account for the leading [IMG] token.
    # [batch_size, num_regions + 1]
    batch_size = tensor_utils.shape(image_mask, 0)
    input_mask = tf.pad(image_mask, [[0, 0], [1, 0]], constant_values=1)

    # Encode the image and store the cached transformer values.
    # [batch_size, num_regions + 1, num_layers, num_heads, head_size]
    _, input_cache = model.compute_image_transformer(
        input_ids=tf.fill([batch_size, 1], vocab.t2i(vocab.IMG)),
        input_image=image_features,
        input_image_mask=input_mask,
        input_positions=image_positions)

    # ----------------------------------------------------------------------------
    # TRAINING
    # ----------------------------------------------------------------------------

    if mode == tf_estimator.ModeKeys.TRAIN:
        # MIXER-style training objective consists of two parts:
        #   1) Policy gradient on rewarded rollouts.
        #   2) MLE regularization on references.
        # The full loss is L_total = L_pg + L_mle.

        # Step 1: Policy gradient.
        # Compute and score policy rollouts (multiple per image).
        rollouts = reward_utils.compute_rollouts(model=model,
                                                 rc_model=rc_model,
                                                 features=features,
                                                 encoder_cache=input_cache,
                                                 encoder_cache_mask=input_mask,
                                                 vocab=vocab,
                                                 params=params)

        # Using a self-critical baseline, R'(y) = R(y) - b, where b is the
        # reward of the greedy rollout argmax p(y|x), sample a single rollout
        # with non-zero reward.
        rollout, reward = reward_utils.sample_from_rollouts(
            rollouts=rollouts,
            baseline=rollouts.rewards[params["reward"]][:, 0],
            reward_type=params["reward"])

        # Compute the probability of the rollout (backprop-able).
        # [batch_size, decode_length, input_length + decode_length]
        rollout_attention_mask = transformer_utils.compute_attention_mask(
            token_mask=rollout.mask[:, :-1], input_mask=input_mask)

        # [batch_size, decode_length, vocab_size]
        rollout_emb, _ = model.compute_transformer(
            input_ids=rollout.token_ids[:, :-1],
            input_segment_id=rollout.segment_ids[:, :-1],
            input_positions=rollout.positions[:, :-1],
            attention_mask=rollout_attention_mask,
            input_cache=input_cache,
            reuse=tf.AUTO_REUSE)

        # [batch_size, decode_length, vocab_size]
        rollout_logits = model.compute_logits(rollout_emb, reuse=tf.AUTO_REUSE)

        # Compute the RL loss, -R(y) * log p(y|x)
        # Some elements in this batch are MLE only, mask those out from the loss.
        rollout_mask = tf.cast(rollout.mask[:, 1:], tf.float32)
        pg_mask = tf.equal(features["input_type"], datasets.DatasetTypes.VQA)
        rollout_mask *= tf.expand_dims(tf.cast(pg_mask, tf.float32), 1)
        rl_loss = tf.losses.sparse_softmax_cross_entropy(
            labels=rollout.token_ids[:, 1:],
            logits=rollout_logits,
            weights=tf.expand_dims(reward, 1) * rollout_mask,
            reduction=tf.losses.Reduction.SUM)
        rl_loss = tf.math.divide_no_nan(rl_loss, tf.reduce_sum(rollout_mask))

        # Step 2: MLE on references.
        # [batch_size, decode_length, input_length + decode_length]
        reference_attention_mask = transformer_utils.compute_attention_mask(
            token_mask=features["token_inputs"].mask, input_mask=input_mask)

        # [batch_size, decode_length, hidden_size]
        target_emb, _ = model.compute_transformer(
            input_ids=features["token_inputs"].token_ids,
            input_segment_id=features["token_inputs"].segment_ids,
            input_positions=features["token_inputs"].positions,
            attention_mask=reference_attention_mask,
            input_cache=input_cache,
            reuse=tf.AUTO_REUSE)

        # [batch_size, decode_length, vocab_size]
        target_logits = model.compute_logits(target_emb, reuse=tf.AUTO_REUSE)

        # Compute the MLE objective (cross-entropy loss).
        weights = features["token_outputs"].mask
        ref_mask = tf.equal(features["input_type"],
                            datasets.DatasetTypes.REFERENCE)
        weights *= tf.expand_dims(tf.cast(ref_mask, tf.int32), 1)
        reference_loss = tf.losses.sparse_softmax_cross_entropy(
            labels=features["token_outputs"].token_ids,
            logits=target_logits,
            weights=weights)

        # Add both losses together.
        loss = rl_loss + reference_loss

        # BERT-style optimization with linear warmup.
        train_op = optimization.create_optimizer(
            loss=loss,
            init_lr=params["learning_rate"],
            num_train_steps=params["num_train_steps"],
            num_warmup_steps=params["num_warmup_steps"],
            use_tpu=params.get("use_tpu"))

        # Book-keeping.
        summaries = tpu_summaries.TpuSummaries(params["model_dir"])
        summaries.scalar("loss", loss)

        # Check what percentage of examples have non-zero reward.
        total_vqa = tf.reduce_sum(tf.cast(pg_mask, tf.float32))
        nonzero = tf.cast(tf.not_equal(reward, 0), tf.float32)
        nonzero *= tf.cast(pg_mask, tf.float32)
        total_nonzero = tf.reduce_sum(nonzero)
        summaries.scalar("density", tf.div_no_nan(total_nonzero, total_vqa))

        # Total (non-normalized) reward.
        reward = rollouts.rewards[params["reward"]][:, 0]
        reward *= tf.cast(pg_mask, tf.float32)
        total_reward = tf.reduce_sum(reward)
        summaries.scalar("reward", tf.div_no_nan(total_reward, total_vqa))
        host_call = summaries.get_host_call()
    else:
        loss = None
        train_op = None
        host_call = None

    # ----------------------------------------------------------------------------
    # TESTING.
    # ----------------------------------------------------------------------------

    if mode == tf_estimator.ModeKeys.PREDICT:
        decode_output = transformer_utils.beam_search_decode(
            model=model,
            encoder_cache=input_cache,
            encoder_cache_mask=input_mask,
            start_id=vocab.t2i(vocab.CLS),
            stop_id=vocab.t2i(vocab.SEP),
            segment_id=0,
            num_steps=params["decode_length"],
            beam_size=params["beam_size"],
            alpha=params["beam_length_penalty"],
            reuse=tf.AUTO_REUSE)
        predictions = dict(image_id=features.get("image_id", -1),
                           question_id=features.get("question_id", -1),
                           token_ids=decode_output.token_ids[:, :, 1:])
    else:
        predictions = None

    # ----------------------------------------------------------------------------
    # WARM-START.
    # ----------------------------------------------------------------------------

    # Initialize from pretrained model.
    def scaffold_fn():
        """Init op run on host."""
        checkpoint = params["base_model"]
        if params["warm_start_path"]:
            checkpoint = params["warm_start_path"]
        if checkpoint:
            checkpoint_utils.init_from_checkpoint(checkpoint)
        return tf.train.Scaffold()

    return tf_estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions=predictions,
        scaffold_fn=scaffold_fn,
        host_call=host_call,
    )
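
The self-critical part of the MIXER objective above centers each sampled rollout's reward on the reward of the greedy (argmax) rollout before it weights the log-likelihood. A minimal numeric sketch of that baseline subtraction, independent of the surrounding model code (plain NumPy, illustrative values):

import numpy as np

def self_critical_advantage(rollout_rewards, greedy_rewards):
    """Computes R'(y) = R(y) - b, with b the reward of the greedy decode.

    rollout_rewards: [batch_size, num_samples] rewards of sampled rollouts.
    greedy_rewards:  [batch_size] rewards of the argmax baseline rollouts.
    """
    return rollout_rewards - greedy_rewards[:, None]

# Illustrative values: a sample that beats the greedy baseline gets a positive
# advantage, a tie gets zero, and a worse sample gets a negative advantage.
adv = self_critical_advantage(
    rollout_rewards=np.array([[1.0, 0.5, 0.0]]),
    greedy_rewards=np.array([0.5]))
# adv == [[0.5, 0.0, -0.5]]
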
Example No. 5
def model_fn(features, labels, mode, params, vocab):
    """Model function that satisfies the Estimator API.

  Args:
    features: Dictionary of model input tensors.
    labels: Unused.
    mode: A tf.estimator.ModeKeys value.
    params: Dictionary of model parameters.
    vocab: A utils.text_utils.Vocab instance.

  Returns:
    spec: A tf.estimator.TPUEstimatorSpec.
  """
    del labels

    # ----------------------------------------------------------------------------
    # INITIALIZATION.
    # ----------------------------------------------------------------------------

    model = transformer_utils.TransformerModel(
        config=transformer_utils.TransformerConfig.from_dict(params),
        is_training=(mode == tf_estimator.ModeKeys.TRAIN))

    # image_features: [batch_size, num_regions, feature_size]
    # image_positions: [batch_size, num_regions]
    # image_mask: [batch_size, num_regions]
    image_features = features["object_features"].features
    image_positions = features["object_features"].positions
    image_mask = features["object_features"].mask

    # Expand mask by 1 to account for the leading [IMG] token.
    # [batch_size, num_regions + 1]
    batch_size = tensor_utils.shape(image_mask, 0)
    input_mask = tf.pad(image_mask, [[0, 0], [1, 0]], constant_values=1)

    # Encode the image and store the cached transformer values.
    # [batch_size, num_regions + 1, num_layers, num_heads, head_size]
    _, input_cache = model.compute_image_transformer(
        input_ids=tf.fill([batch_size, 1], vocab.t2i(vocab.IMG)),
        input_image=image_features,
        input_image_mask=input_mask,
        input_positions=image_positions)

    if params.get("conditional_decoding"):
        # Add additional (text) conditioning information to the input cache.
        # The conditioning information gets to see the image information.
        # The new input consists of both the image and the extra encoded text.
        # This is used for the LEARN function of Alg. 1 in the paper.

        # [batch_size, num_regions + condition_length + 1]
        input_mask = tf.concat([input_mask, features["condition_inputs"].mask],
                               1)

        # [batch_size, condition_length, num_layers, num_heads, head_size]
        _, condition_cache = model.compute_transformer(
            input_ids=features["condition_inputs"].token_ids,
            input_segment_id=features["condition_inputs"].segment_ids,
            input_positions=features["condition_inputs"].positions,
            attention_mask=tf.expand_dims(input_mask, 1),
            input_cache=input_cache,
            reuse=tf.AUTO_REUSE,
            conditional=True)

        # [batch_size, input_length, num_layers, num_heads, head_size]
        input_cache = transformer_utils.TransformerCache(
            keys=tf.concat([input_cache.keys, condition_cache.keys], 1),
            values=tf.concat([input_cache.values, condition_cache.values], 1))

    # ----------------------------------------------------------------------------
    # TRAINING
    # ----------------------------------------------------------------------------

    if mode == tf_estimator.ModeKeys.TRAIN:
        # During training, apply forced decoding with a diagonal attention mask.
        # [batch_size, caption_length - 1, input_length + caption_length - 1]
        attention_mask = transformer_utils.compute_attention_mask(
            token_mask=features["token_inputs"].mask, input_mask=input_mask)

        # [batch_size, caption_length - 1, hidden_size]
        target_emb, _ = model.compute_transformer(
            input_ids=features["token_inputs"].token_ids,
            input_segment_id=features["token_inputs"].segment_ids,
            input_positions=features["token_inputs"].positions,
            attention_mask=attention_mask,
            input_cache=input_cache,
            reuse=tf.AUTO_REUSE)

        # [batch_size, caption_length - 1, vocab_size]
        target_logits = model.compute_logits(target_emb, reuse=tf.AUTO_REUSE)

        # Compute the MLE objective (cross-entropy loss).
        loss = tf.losses.sparse_softmax_cross_entropy(
            labels=features["token_outputs"].token_ids,
            logits=target_logits,
            weights=features["token_outputs"].mask)

        # BERT-style optimization with linear warmup.
        train_op = optimization.create_optimizer(
            loss=loss,
            init_lr=params["learning_rate"],
            num_train_steps=params["num_train_steps"],
            num_warmup_steps=params["num_warmup_steps"],
            use_tpu=params.get("use_tpu"))

        summaries = tpu_summaries.TpuSummaries(params["model_dir"])
        summaries.scalar("loss", loss)
        host_call = summaries.get_host_call()
    else:
        loss = None
        train_op = None
        host_call = None

    # ----------------------------------------------------------------------------
    # TESTING.
    # ----------------------------------------------------------------------------

    if mode == tf_estimator.ModeKeys.PREDICT:
        decode_output = transformer_utils.beam_search_decode(
            model=model,
            encoder_cache=input_cache,
            encoder_cache_mask=input_mask,
            start_id=vocab.t2i(vocab.CLS),
            stop_id=vocab.t2i(vocab.SEP),
            segment_id=0,
            num_steps=params["decode_length"],
            beam_size=params["beam_size"],
            alpha=params["beam_length_penalty"],
            reuse=tf.AUTO_REUSE)
        predictions = dict(image_id=features.get("image_id", -1),
                           question_id=features.get("question_id", -1),
                           token_ids=decode_output.token_ids[:, :, 1:])
    else:
        predictions = None

    # ----------------------------------------------------------------------------
    # WARM-START.
    # ----------------------------------------------------------------------------

    # Initialize from pretrained model.
    def scaffold_fn():
        """Init op run on host."""
        checkpoint = params.get("warm_start_path")
        if checkpoint:
            checkpoint_utils.init_from_checkpoint(checkpoint)
        return tf.train.Scaffold()

    return tf_estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions=predictions,
        scaffold_fn=scaffold_fn,
        host_call=host_call,
    )
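
When conditional_decoding is enabled above, the extra text is encoded once and its per-layer keys and values are appended to the image cache, so later decoding attends over both image and condition. A minimal sketch of that cache concatenation, assuming TensorFlow 1.x; the namedtuple is a stand-in for transformer_utils.TransformerCache, assuming it is a simple (keys, values) container:

import collections

import tensorflow as tf  # assumes TensorFlow 1.x

# Stand-in for transformer_utils.TransformerCache.
TransformerCache = collections.namedtuple("TransformerCache",
                                          ["keys", "values"])

def concat_caches(image_cache, condition_cache):
    """Appends the condition cache to the image cache along the sequence axis
    so that subsequent decoding steps attend over image + condition."""
    return TransformerCache(
        keys=tf.concat([image_cache.keys, condition_cache.keys], 1),
        values=tf.concat([image_cache.values, condition_cache.values], 1))
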
Example No. 6
    def __init__(self,
                 dataset,
                 parameters,
                 model_dir,
                 deprecated_split_disc_calls=False,
                 experimental_joint_gen_for_disc=False,
                 g_use_ema=False,
                 ema_decay=0.9999,
                 ema_start_step=40000,
                 g_optimizer_fn=tf.train.AdamOptimizer,
                 d_optimizer_fn=None,
                 g_lr=0.0002,
                 d_lr=None,
                 conditional=False,
                 fit_label_distribution=False):
        """ModularGAN  is a Gin configurable implementation of AbstractGAN.

    Args:
      dataset: `ImageDataset` object. If `conditional` the dataset must provide
        labels and the number of classes must be known.
      parameters: Legacy Python dictionary with additional parameters. This must
        have the keys 'architecture', 'z_dim' and 'lambda'.
      model_dir: Directory path for storing summary files.
      deprecated_split_disc_calls: If True pass fake and real images separately
        through the discriminator network.
      experimental_joint_gen_for_disc: If True generate fake images for all D
        iterations jointly. This increases the batch size in G when generating
        fake images for D. The G step stays the same.
      g_use_ema: If True keep moving averages for weights in G and use them in
        the TF-Hub module.
      ema_decay: Decay rate for moving averages for G's weights.
      ema_start_step: Start step for keeping moving averages. Before this the
        decay rate is 0.
      g_optimizer_fn: Function (or constructor) to return an optimizer for G.
      d_optimizer_fn: Function (or constructor) to return an optimizer for D.
        If None will call `g_optimizer_fn`.
      g_lr: Learning rate for G.
      d_lr: Learning rate for D. Defaults to `g_lr`.
      conditional: Whether the GAN is conditional. If True both G and D will
        get passed labels.
      fit_label_distribution: Whether to fit the label distribution.
    """
        super(ModularGAN, self).__init__(dataset=dataset,
                                         parameters=parameters,
                                         model_dir=model_dir)
        self._deprecated_split_disc_calls = deprecated_split_disc_calls
        self._experimental_joint_gen_for_disc = experimental_joint_gen_for_disc
        self._g_use_ema = g_use_ema
        self._ema_decay = ema_decay
        self._ema_start_step = ema_start_step
        self._g_optimizer_fn = g_optimizer_fn
        self._d_optimizer_fn = d_optimizer_fn
        if self._d_optimizer_fn is None:
            self._d_optimizer_fn = g_optimizer_fn
        self._g_lr = g_lr
        self._d_lr = g_lr if d_lr is None else d_lr

        if conditional and not self._dataset.num_classes:
            raise ValueError(
                "Option 'conditional' selected but dataset {} does not have "
                "labels".format(self._dataset.name))
        self._conditional = conditional
        self._fit_label_distribution = fit_label_distribution

        self._tpu_summary = tpu_summaries.TpuSummaries(model_dir)

        # Parameters that have not been ported to Gin.
        self._architecture = parameters["architecture"]
        self._z_dim = parameters["z_dim"]
        self._lambda = parameters["lambda"]

        # Number of discriminator iterations per one iteration of the generator.
        self._disc_iters = parameters.get("disc_iters", 1)

        # Will be set by create_loss().
        self.d_loss = None
        self.g_loss = None
        self.penalty_loss = None
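
The constructor's defaulting logic (D falls back to G's optimizer and learning rate when none is given) is easy to miss when skimming. A minimal sketch of just that fallback, outside the class and with hypothetical names, assuming TensorFlow 1.x for the default optimizer:

import tensorflow as tf  # assumes TensorFlow 1.x

def resolve_gan_optimizers(g_optimizer_fn=tf.train.AdamOptimizer,
                           d_optimizer_fn=None,
                           g_lr=0.0002,
                           d_lr=None):
    """Returns (g_optimizer_fn, d_optimizer_fn, g_lr, d_lr), with the
    discriminator settings defaulting to the generator's, as above."""
    if d_optimizer_fn is None:
        d_optimizer_fn = g_optimizer_fn
    if d_lr is None:
        d_lr = g_lr
    return g_optimizer_fn, d_optimizer_fn, g_lr, d_lr
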
Example No. 7
    def model_fn(self, features, labels, params, mode):
        """Constructs the model for the given features and mode.

    Args:
      features: A dictionary with the feature tensors.
      labels: Tensor with labels. Will be None if mode is PREDICT.
      params: Dictionary with hyperparameters passed to TPUEstimator.
          Additionally, TPUEstimator will set 3 keys: `batch_size`, `use_tpu`,
          `tpu_context`. `batch_size` is the batch size for this core.
      mode: `tf.estimator.ModeKeys` value (TRAIN, EVAL, PREDICT). The mode
          should be passed to the TPUEstimatorSpec and your model should be
          built for this mode.

    Returns:
      A `tf.contrib.tpu.TPUEstimatorSpec`.
    """
        logging.info("model_fn(): features=%s, labels=%s,mode=%s, params=%s",
                     features, labels, mode, params)

        if mode != tf.estimator.ModeKeys.TRAIN:
            raise ValueError("Only training mode is supported.")

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        global_step = tf.train.get_or_create_global_step()

        def _create_sub_step_loss(sub_step_idx=0, reuse=True):
            """Creates the loss for a slice of the current batch.

      Args:
        sub_step_idx: Index of the slice of the batch to use to construct the
            loss. If self.unroll_disc_iters is True this must be 0 and the whole
            batch will be used.
        reuse: Bool, whether to reuse existing variables for the models.
            Should be False for the first call and True on all other calls.

      Returns:
        Fake images created by the generator.
      """
            logging.info("sub_step_idx: %s, params: %s", sub_step_idx, params)
            # Set the random offset tensor for operations in tpu_random.py.
            tpu_random.set_random_offset_from_features(fs[sub_step_idx])
            self.create_loss(fs[sub_step_idx],
                             ls[sub_step_idx],
                             params,
                             is_training=is_training,
                             reuse=reuse)

        # Split inputs for sub steps.
        fs = [(k, tf.split(features[k], self.num_sub_steps)) for k in features]
        fs = [{k: v[i] for k, v in fs} for i in range(self.num_sub_steps)]
        ls = tf.split(labels, self.num_sub_steps)

        # Only the last sub-step changes the generator weights. Thus we can
        # combine all forward passes through G to achieve better efficiency. The
        # forward pass for G's step needs to be separate since we compute
        # gradients for it.
        if self._experimental_joint_gen_for_disc:
            logging.info("Running generator forward pass for all D steps.")
            with tf.name_scope("gen_for_disc"):
                bs = params["batch_size"] // self.num_sub_steps
                # D steps.
                z = features["z"][:-bs]
                sampled_y = None
                if self.conditional:
                    sampled_y = self._get_one_hot_labels(
                        features["sampled_labels"][:-bs])
                generated = tf.stop_gradient(
                    self.generator(z,
                                   y=sampled_y,
                                   is_training=is_training,
                                   reuse=False))
                assert self.num_sub_steps - 1 == self._disc_iters
                generated = tf.split(generated, self._disc_iters)
                for i in range(self._disc_iters):
                    fs[i]["generated"] = generated[i]
                    del fs[i]["z"]
                # G step.
                z = features["z"][-bs:]
                sampled_y = None
                if self.conditional:
                    sampled_y = self._get_one_hot_labels(
                        features["sampled_labels"][-bs:])
                fs[-1]["generated"] = self.generator(z,
                                                     y=sampled_y,
                                                     is_training=is_training,
                                                     reuse=True)
                del fs[-1]["z"]

        logging.info("fs=%s, ls=%s", fs, ls)
        # Create ops for first D steps here to create the variables.
        with tf.name_scope("disc_step_1"):
            _create_sub_step_loss(0, reuse=tf.AUTO_REUSE)
            d_losses = [self.d_loss]

        # Divide trainable variables into a group for D and group for G.
        t_vars = tf.trainable_variables()
        d_vars = [var for var in t_vars if "discriminator" in var.name]
        g_vars = [var for var in t_vars if "generator" in var.name]
        if len(t_vars) != len(d_vars) + len(g_vars):
            logging.error("There variables that neither part of G or D.")
        self._check_variables(t_vars, d_vars, g_vars)

        d_optimizer = self.d_optimizer(params["use_tpu"])
        g_optimizer = self.g_optimizer(params["use_tpu"])

        # In the following each sub-step (disc_iters steps on D + one step on G)
        # depends on previous sub-steps. The optimizer op for each step
        # depends on all the update ops (from batch norm etc.). Each update op
        # will still only be executed once.
        deps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Discriminator training.
        with tf.control_dependencies(deps):
            deps.append(d_optimizer.minimize(self.d_loss, var_list=d_vars))

        for sub_step_idx in range(1, self._disc_iters):
            with tf.name_scope("disc_step_{}".format(sub_step_idx + 1)):
                with tf.control_dependencies(deps):
                    _create_sub_step_loss(sub_step_idx)
                    deps.extend(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
                with tf.control_dependencies(deps):
                    d_losses.append(self.d_loss)
                    deps.append(
                        d_optimizer.minimize(self.d_loss, var_list=d_vars))

        # Clean old summaries from previous calls to model_fn().
        self._tpu_summary = tpu_summaries.TpuSummaries(self._model_dir)
        for i, d_loss in enumerate(d_losses):
            self._tpu_summary.scalar("loss/d_{}".format(i), d_loss)
        if self._experimental_joint_gen_for_disc:
            fake_images = fs[0]["generated"]
        else:
            with tf.name_scope("fake_images"):
                z = fs[0]["z"]
                sampled_y = None
                if self.conditional:
                    sampled_y = self._get_one_hot_labels(
                        fs[0]["sampled_labels"])
                fake_images = self.generator(z,
                                             y=sampled_y,
                                             is_training=True,
                                             reuse=True)
        self._add_images_to_summary(fake_images, "fake_images", params)
        self._add_images_to_summary(fs[0]["images"], "real_images", params)

        # Generator training.
        with tf.name_scope("gen_step"):
            with tf.control_dependencies(deps):
                # This will use the same inputs as the last discriminator step
                # above, but the new sub-graph will depend on the updates of the
                # discriminator steps.
                _create_sub_step_loss(self.num_sub_steps - 1)
                self._tpu_summary.scalar("loss/g", self.g_loss)
                deps.extend(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
            with tf.control_dependencies(deps):
                train_op = g_optimizer.minimize(self.g_loss,
                                                var_list=g_vars,
                                                global_step=global_step)
                loss = self.g_loss

        if self._g_use_ema:
            with tf.name_scope("generator_ema"):
                logging.info("Creating moving averages of weights: %s", g_vars)
                # The decay value is set to 0 if we're before the moving-average start
                # point, so that the EMA vars will be the normal vars.
                decay = self._ema_decay * tf.cast(
                    tf.greater_equal(global_step, self._ema_start_step),
                    tf.float32)
                ema = tf.train.ExponentialMovingAverage(decay=decay)
                with tf.control_dependencies([train_op]):
                    train_op = ema.apply(g_vars)

        d_param_overview = utils.get_parameter_overview(d_vars, limit=None)
        g_param_overview = utils.get_parameter_overview(g_vars, limit=None)
        logging.info("Discriminator variables:\n%s", d_param_overview)
        logging.info("Generator variables:\n%s", g_param_overview)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            host_call=self._tpu_summary.get_host_call(),
            # Estimator requires a loss which gets displayed on TensorBoard.
            # The given Tensor is evaluated but not used to create gradients.
            loss=loss,
            train_op=train_op)
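
The unrolled graph above serializes disc_iters discriminator steps and one generator step by threading a growing deps list through tf.control_dependencies, so each optimizer op waits for all previously created optimizer and update ops. A minimal sketch of that chaining pattern, assuming TensorFlow 1.x; make_step_op is a hypothetical callable standing in for an optimizer.minimize call:

import tensorflow as tf  # assumes TensorFlow 1.x

def chain_steps(make_step_op, num_steps):
    """Builds num_steps ops where step i runs only after steps 0..i-1.

    The growing deps list plays the same role as in the unrolled model_fn
    above: each new op is created under control_dependencies on all of its
    predecessors.
    """
    deps = []
    for i in range(num_steps):
        with tf.name_scope("step_{}".format(i + 1)):
            with tf.control_dependencies(deps):
                deps.append(make_step_op())
    # Group everything so a single op triggers the whole chain.
    return tf.group(*deps)

# Hypothetical usage: three increments of a counter, executed strictly in
# sequence within one session.run call.
# counter = tf.get_variable("counter", [], dtype=tf.int32, trainable=False)
# train_op = chain_steps(lambda: tf.assign_add(counter, 1), 3)
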