Example #1
File: optimizer.py Project: hhuang97/keras
    def _clip_gradients(self, grads):
        if not self._gradients_clip_option:
            return grads

        clipped_grads = []
        if self._gradients_clip_option.clipnorm:
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(
                        tf.clip_by_norm(g,
                                        self._gradients_clip_option.clipnorm))
            return clipped_grads

        if self._gradients_clip_option.global_clipnorm:
            return tf.clip_by_global_norm(
                grads, self._gradients_clip_option.global_clipnorm)[0]

        if self._gradients_clip_option.clipvalue:
            clipvalue = self._gradients_clip_option.clipvalue
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(
                        tf.clip_by_value(g,
                                         clip_value_min=-clipvalue,
                                         clip_value_max=clipvalue))
            return clipped_grads

        return grads
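A runnable sketch (illustrative values only, not from the project above) contrasting the three clipping modes the method dispatches between: per-tensor tf.clip_by_norm, tf.clip_by_global_norm, and element-wise tf.clip_by_value.

import tensorflow as tf

grads = [tf.constant([3.0, 4.0]), None, tf.constant([6.0, 8.0])]

# Per-tensor: rescale each gradient so its own L2 norm is at most 1.0;
# None gradients pass through, mirroring `_clip_gradients` above.
per_tensor = [g if g is None else tf.clip_by_norm(g, 1.0) for g in grads]

# Global: rescale all gradients by a single factor so their combined L2
# norm is at most 1.0 (tf.clip_by_global_norm ignores None entries).
global_clipped, global_norm = tf.clip_by_global_norm(grads, 1.0)

# Element-wise: clamp every component into [-1.0, 1.0].
clipped_values = [g if g is None else tf.clip_by_value(g, -1.0, 1.0)
                  for g in grads]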
Example #2
    def _clip_gradients(self, grads):
        clipped_grads = []
        if self.clipnorm and self.clipnorm > 0:
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(tf.clip_by_norm(g, self.clipnorm))
            return clipped_grads

        if self.global_clipnorm and self.global_clipnorm > 0:
            return tf.clip_by_global_norm(grads, self.global_clipnorm)[0]

        if self.clipvalue and self.clipvalue > 0:
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(
                        tf.clip_by_value(
                            g,
                            clip_value_min=-self.clipvalue,
                            clip_value_max=self.clipvalue,
                        ))
            return clipped_grads

        return grads
Example #3
  def _clip_gradients(self, grads):
    clipped_grads = []
    if self.clipnorm and self.clipnorm > 0:
      for g in grads:
        if g is None:
          clipped_grads.append(g)
        else:
          clipped_grads.append(tf.clip_by_norm(g, self.clipnorm))
      return clipped_grads

    if self.global_clipnorm and self.global_clipnorm > 0:
      return tf.clip_by_global_norm(grads, self.global_clipnorm)[0]

    if self.clipvalue and self.clipvalue > 0:
      for g in grads:
        if g is None:
          clipped_grads.append(g)
        else:
          clipped_grads.append(
              tf.clip_by_value(
                  g,
                  clip_value_min=-self.clipvalue,  # pylint: disable=invalid-unary-operand-type
                  clip_value_max=self.clipvalue))
      return clipped_grads

    return grads
Example #4
  def step_fn(self, batch):
    """Per-Replica training step."""
    with tf.GradientTape() as tape:
      _, losses = self.model(batch, return_losses=True, training=True)
    # Clip and apply gradients.
    grads = tape.gradient(losses['total_loss'], self.model.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_clip_norm)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
    return losses
Example #5
    def step_fn(batch):
      """Per-Replica training step."""
      with tf.GradientTape() as tape:
        _ = self.model(batch, training=True)
        total_loss = tf.reduce_sum(self.model.losses)

      grads = tape.gradient(total_loss, self.model.trainable_variables)
      grads, _ = tf.clip_by_global_norm(grads, self.grad_clip_norm)
      self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
      return self.model.losses_dict
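The two step functions above share the standard tape -> clip -> apply loop. Here is a self-contained toy version of that pattern; the model, data, and the clip norm of 1.0 are illustrative assumptions, not taken from either project.

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
x = tf.random.normal([8, 4])
y = tf.random.normal([8, 1])

with tf.GradientTape() as tape:
  loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
grads = tape.gradient(loss, model.trainable_variables)
grads, _ = tf.clip_by_global_norm(grads, 1.0)
optimizer.apply_gradients(zip(grads, model.trainable_variables))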
Example #6
  def gradient_clipnorm_fn(grads_and_vars):

    if isinstance(tf.distribute.get_strategy(),
                  (tf.distribute.experimental.CentralStorageStrategy,
                   tf.compat.v1.distribute.experimental.CentralStorageStrategy)):
      raise ValueError(
          "`global_clipnorm` is not supported with `CentralStorageStrategy`")

    grads, variables = zip(*grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(grads, clipnorm)
    clipped_grads_and_vars = list(zip(clipped_grads, variables))
    return clipped_grads_and_vars
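A minimal runnable sketch of how a transformer like gradient_clipnorm_fn operates on (gradient, variable) pairs; the variables and clipnorm value below are illustrative assumptions.

import tensorflow as tf

clipnorm = 1.0
v1, v2 = tf.Variable([1.0, 2.0]), tf.Variable([3.0])
grads_and_vars = [(tf.constant([3.0, 4.0]), v1), (tf.constant([12.0]), v2)]

# Unzip, clip by the combined norm, and re-pair with the variables.
grads, variables = zip(*grads_and_vars)
clipped_grads, _ = tf.clip_by_global_norm(list(grads), clipnorm)
clipped_grads_and_vars = list(zip(clipped_grads, variables))
# The result can be fed straight to optimizer.apply_gradients(...).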
Example #7
    def _get_gradients(self):
        """Calculate and apply gradients for this step"""
        with tf.GradientTape() as tape:
            normalized_nll = self._normalized_nll()
        grads = tape.gradient(normalized_nll, self.trainables)

        if self.grad_clip:
            grads, _ = tf.clip_by_global_norm(grads, self.grad_clip)

        self.optimizer.apply_gradients(zip(grads, self.trainables))

        return normalized_nll, grads, tf.math.abs(tf.reduce_max(grads))
Example #8
  def train_step(self, target_audio, f0, amp):
    f0 = tf.Variable(f0, dtype=tf.float32)
    amp = tf.Variable(amp, dtype=tf.float32)
    synth_audio = self.synth(f0, amp)
    target_mag = self.spec_layer(target_audio)[:, :-100, :, :]
    true_synth_mag = self.spec_layer(synth_audio)[:, :-100, :, :]
    true_loss = self.get_loss(target_mag, true_synth_mag)

    with tf.GradientTape() as tape:
      estimated_synth_mag = self.estimate_spec(f0, amp)[:, :-100, :, :]
      estimated_loss = self.get_loss(target_mag, estimated_synth_mag)
      end2end = tf.reduce_mean(tf.square(true_loss - estimated_loss))
      spec_mse = tf.reduce_mean(tf.square(target_mag - estimated_synth_mag))
      # Anneal the spectrogram-MSE weight (50 * 0.9**t) as training progresses.
      J = end2end + spec_mse * 50 * 0.9 ** self.optimizer.iterations.numpy()
    grads = tape.gradient(J, self.trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, 0.1)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
Example #9
  def update_model(self, x_pos, x_neg, t):
    """
    Update the model parameters in a iteration
    """
    with tf.GradientTape() as tape:
      tape.watch(self.diffusion.trainable_variables)
      loss, loss_ts, f_ts = self.diffusion.training_losses(x_pos, x_neg, t, dropout=self.hps.dropout)

    variables = self.diffusion.trainable_variables
    grads = tape.gradient(loss, variables)
    if self.hps.grad_clip:
      grads, gnorm = tf.clip_by_global_norm(grads, 1. / float(num_device()[0]))
    grads_and_vars = list(zip(grads, variables))
    grads_mean = tf.reduce_mean(tf.stack([tf.reduce_mean(tf.abs(grad)) for grad in grads], axis=0))
    grads_max = tf.reduce_max(tf.stack([tf.reduce_max(tf.abs(grad)) for grad in grads], axis=0))
    self.opt.apply_gradients(grads_and_vars)
    self.ema.apply(self.diffusion)

    return loss, grads_mean, grads_max, loss_ts, f_ts
Example #10
    def _step(self, trajectory: sequence.Trajectory):
        """Do a batch of SGD on actor + critic loss on a sequence of experience."""
        observations, actions, rewards, discounts = trajectory

        # Add dummy batch dimensions.
        actions = tf.expand_dims(actions, axis=-1)  # [T, 1]
        rewards = tf.expand_dims(rewards, axis=-1)  # [T, 1]
        discounts = tf.expand_dims(discounts, axis=-1)  # [T, 1]
        observations = tf.expand_dims(observations, axis=1)  # [T+1, 1, ...]

        # Extract final observation for bootstrapping.
        observations, final_observation = observations[:-1], observations[-1]

        with tf.GradientTape() as tape:
            # Build actor and critic losses.
            (logits, values), state = snt.dynamic_unroll(
                self._network, observations, self._rollout_initial_state)
            (_, bootstrap_value), state = self._network(
                final_observation, state)
            values = tf.squeeze(values, axis=-1)
            bootstrap_value = tf.squeeze(bootstrap_value, axis=-1)
            critic_loss, (advantages, _) = trfl.td_lambda(
                state_values=values,
                rewards=rewards,
                pcontinues=self._discount * discounts,
                bootstrap_value=bootstrap_value,
                lambda_=self._td_lambda)
            actor_loss = trfl.discrete_policy_gradient_loss(
                logits, actions, advantages)
            entropy_loss = trfl.discrete_policy_entropy_loss(logits).loss
            loss = actor_loss + critic_loss + self._entropy_cost * entropy_loss
            loss = tf.reduce_mean(loss)

        gradients = tape.gradient(loss, self._network.trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 5.)
        self._optimizer.apply(gradients, self._network.trainable_variables)

        return state
Example #11
    def _step(self, sequence: Sequence[tf.Tensor]):
        """Do a batch of SGD on actor + critic loss on a sequence of experience."""
        (observations, actions, rewards, discounts, masks, final_obs,
         final_mask) = sequence
        masks = tf.expand_dims(masks, axis=-1)

        with tf.GradientTape() as tape:
            # Build actor and critic losses.
            state = self._rollout_initial_state
            logits_sequence = []
            values = []
            for t in range(self._sequence_length):
                (logits, value), state = self._network(
                    (observations[t], masks[t]), state)
                logits_sequence.append(logits)
                values.append(value)
            (_, bootstrap_value), _ = self._network((final_obs, final_mask),
                                                    state)
            values = tf.squeeze(tf.stack(values, axis=0), axis=-1)
            logits = tf.stack(logits_sequence, axis=0)
            bootstrap_value = tf.squeeze(bootstrap_value, axis=-1)
            critic_loss, (advantages, _) = trfl.td_lambda(
                state_values=values,
                rewards=rewards,
                pcontinues=self._discount * discounts,
                bootstrap_value=bootstrap_value,
                lambda_=self._td_lambda)
            actor_loss = trfl.discrete_policy_gradient_loss(
                logits, actions, advantages)
            loss = tf.reduce_mean(actor_loss + critic_loss)

        gradients = tape.gradient(loss, self._network.trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 5.)
        self._optimizer.apply(gradients, self._network.trainable_variables)

        return state
Example #12
    def _compiled_local_step(inputs, labels, training_vars, accum_vars):
      """Replicated training step."""
      with tf.GradientTape() as tape:
        model_outputs, metric_outputs = model(inputs, training=True)
        loss = loss_fn(labels, model_outputs)
      if isinstance(optimizer,
                    tf.keras.mixed_precision.experimental.LossScaleOptimizer):
        with tape:
          scaled_loss = optimizer.get_scaled_loss(loss)
        scaled_grads = tape.gradient(scaled_loss, training_vars)
        grads = optimizer.get_unscaled_gradients(scaled_grads)
      else:
        grads = tape.gradient(loss, training_vars)
      (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

      if accum_vars is None:
        return grads, loss, model_outputs, metric_outputs
      else:
        new_accum_vars = []
        for i, grad in enumerate(grads):
          new_accum_vars.append(
              accum_vars[i] +
              tf.math.scalar_mul(1.0 / num_accumulation_steps, grad))
        return new_accum_vars, loss, model_outputs, metric_outputs
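A toy sketch of the gradient-accumulation idea in _compiled_local_step: per-step gradients are clipped, scaled by 1/num_accumulation_steps, and summed into accumulator variables before a single apply. The variable, loss, and step count here are illustrative assumptions.

import tensorflow as tf

num_accumulation_steps = 4
var = tf.Variable([1.0, 2.0])
accum = tf.Variable(tf.zeros_like(var))

for _ in range(num_accumulation_steps):
  with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(var))
  (grad,), _ = tf.clip_by_global_norm(tape.gradient(loss, [var]), 1.0)
  accum.assign_add(grad / num_accumulation_steps)

tf.keras.optimizers.SGD(learning_rate=0.1).apply_gradients(
    [(accum.read_value(), var)])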
Example #13
 def grad(dy):
     # NOTE: Must return a gradient for all inputs to `clip_gradient`.
     return tf.clip_by_global_norm([dy], clip_norm)[0][0], tf.constant(0.)
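The closure above is the backward half of a tf.custom_gradient identity op; below is a minimal runnable reconstruction under that assumption. The wrapper name clip_gradient comes from the NOTE in the fragment; everything else is illustrative.

import tensorflow as tf

@tf.custom_gradient
def clip_gradient(x, clip_norm):
  def grad(dy):
    # Must return a gradient for every input to `clip_gradient`,
    # including the non-differentiable clip_norm argument.
    return tf.clip_by_global_norm([dy], clip_norm)[0][0], tf.constant(0.)
  return x, grad

x = tf.constant([3.0, 4.0])
with tf.GradientTape() as tape:
  tape.watch(x)
  y = tf.reduce_sum(clip_gradient(x, tf.constant(1.0)))
print(tape.gradient(y, x))  # the all-ones gradient, rescaled to norm <= 1.0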
Example #14
def eager_train_step(detection_model,
                     features,
                     labels,
                     unpad_groundtruth_tensors,
                     optimizer,
                     learning_rate,
                     add_regularization_loss=True,
                     clip_gradients_value=None,
                     global_step=None,
                     num_replicas=1.0):
    """Process a single training batch.
    This method computes the loss for the model on a single training batch,
    while tracking the gradients with a gradient tape. It then updates the
    model variables with the optimizer, clipping the gradients if
    clip_gradients_value is present.
    This method can run eagerly or inside a tf.function.
    Args:
      detection_model: A DetectionModel (based on Keras) to train.
      features: Dictionary of feature tensors from the input dataset.
        Should be in the format output by `inputs.train_input`.
          features[fields.InputDataFields.image] is a [batch_size, H, W, C]
            float32 tensor with preprocessed images.
          features[HASH_KEY] is a [batch_size] int32 tensor representing unique
            identifiers for the images.
          features[fields.InputDataFields.true_image_shape] is a [batch_size, 3]
            int32 tensor representing the true image shapes, as preprocessed
            images could be padded.
          features[fields.InputDataFields.original_image] (optional, not used
            during training) is a
            [batch_size, H, W, C] float32 tensor with original images.
      labels: A dictionary of groundtruth tensors. This method unstacks
        these labels using model_lib.unstack_batch. The stacked labels are of
        the form returned by `inputs.train_input` and `inputs.eval_input`.
          labels[fields.InputDataFields.num_groundtruth_boxes] is a [batch_size]
            int32 tensor indicating the number of valid groundtruth boxes
            per image.
          labels[fields.InputDataFields.groundtruth_boxes] is a
            [batch_size, num_boxes, 4] float32 tensor containing the corners of
            the groundtruth boxes.
          labels[fields.InputDataFields.groundtruth_classes] is a
            [batch_size, num_boxes, num_classes] float32 one-hot tensor of
            classes. num_classes includes the background class.
          labels[fields.InputDataFields.groundtruth_weights] is a
            [batch_size, num_boxes] float32 tensor containing groundtruth weights
            for the boxes.
          -- Optional --
          labels[fields.InputDataFields.groundtruth_instance_masks] is a
            [batch_size, num_boxes, H, W] float32 tensor containing only binary
            values, which represent instance masks for objects.
          labels[fields.InputDataFields.groundtruth_keypoints] is a
            [batch_size, num_boxes, num_keypoints, 2] float32 tensor containing
            keypoints for each box.
          labels[fields.InputDataFields.groundtruth_dp_num_points] is a
            [batch_size, num_boxes] int32 tensor with the number of DensePose
            sampled points per instance.
          labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
            [batch_size, num_boxes, max_sampled_points] int32 tensor with the
            part ids (0-indexed) for each instance.
          labels[fields.InputDataFields.groundtruth_dp_surface_coords] is a
            [batch_size, num_boxes, max_sampled_points, 4] float32 tensor with the
            surface coordinates for each point. Each surface coordinate is of the
            form (y, x, v, u) where (y, x) are normalized image locations and
            (v, u) are part-relative normalized surface coordinates.
          labels[fields.InputDataFields.groundtruth_labeled_classes] is a float32
            k-hot tensor of classes.
          labels[fields.InputDataFields.groundtruth_track_ids] is an int32
            tensor of track IDs.
          labels[fields.InputDataFields.groundtruth_keypoint_depths] is a
            float32 tensor containing keypoint depths information.
          labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
            float32 tensor containing the weights of the keypoint depth feature.
      unpad_groundtruth_tensors: A parameter passed to unstack_batch.
      optimizer: The training optimizer that will update the variables.
      learning_rate: The learning rate tensor for the current training step.
        This is used only for TensorBoard logging purposes; it does not
        affect model training.
      add_regularization_loss: Whether or not to include the model's
        regularization loss in the losses dictionary.
      clip_gradients_value: If this is present, clip the gradients global norm
        at this value using `tf.clip_by_global_norm`.
      global_step: The current training step. Used for TensorBoard logging
        purposes. This step is not updated by this function and must be
        incremented separately.
      num_replicas: The number of replicas in the current distribution strategy.
        This is used to scale the total loss so that training in a distribution
        strategy works correctly.
    Returns:
      The total loss observed at this training step
    """
    # """Execute a single training step in the TF v2 style loop."""
    is_training = True

    detection_model._is_training = is_training  # pylint: disable=protected-access
    tf.keras.backend.set_learning_phase(is_training)

    labels = model_lib.unstack_batch(
        labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    with tf.GradientTape() as tape:
        losses_dict, _ = _compute_losses_and_predictions_dicts(
            detection_model, features, labels, add_regularization_loss)

        total_loss = losses_dict['Loss/total_loss']

        # Normalize loss for num replicas
        total_loss = tf.math.divide(
            total_loss, tf.constant(num_replicas, dtype=tf.float32))
        losses_dict['Loss/normalized_total_loss'] = total_loss

    for loss_type in losses_dict:
        tf.compat.v2.summary.scalar(loss_type,
                                    losses_dict[loss_type],
                                    step=global_step)

    trainable_variables = detection_model.trainable_variables

    gradients = tape.gradient(total_loss, trainable_variables)

    if clip_gradients_value:
        gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients_value)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    tf.compat.v2.summary.scalar('learning_rate',
                                learning_rate,
                                step=global_step)
    tf.compat.v2.summary.image(name='train_input_images',
                               step=global_step,
                               data=features[fields.InputDataFields.image],
                               max_outputs=3)
    return total_loss
Example #15
def _train_op_fn(loss,
                 optimizer_fn,
                 l2_regularization=-1,
                 gradient_max_norm=-1,
                 use_synchronous_optimizer=False):
    """Returns the op to optimize the loss.

  Supports l2 regularization, gradient clipping and synchronous replica
  optimization.

  Args:
    loss: The training loss before regularization.
    optimizer_fn: the optimization function.
    l2_regularization: a float that will multiply the l2 weight norms in the
      loss function.
    gradient_max_norm: a float - maximal gradient update allowed.
    use_synchronous_optimizer: a bool whether to use synchronous optimization.

  Returns:
    A tuple of the grouped train op and a list of training hooks.
  """
    total_loss = loss
    if l2_regularization > 0:
        weight_losses = [
            tf.multiply(tf.nn.l2_loss(weight),
                        l2_regularization,
                        name="l2_weight_loss")
            for weight in tf.compat.v1.trainable_variables()
        ]
        total_loss = tf.add_n(weight_losses + [loss], name="total_loss")

    global_step = tf.compat.v1.train.get_or_create_global_step()

    opt = optimizer_fn()

    train_hooks = []
    if use_synchronous_optimizer:
        config = tf.estimator.RunConfig()
        workers = config.num_worker_replicas + 1
        tolerance = _compute_tolerance(workers)
        to_aggregate = workers - tolerance
        opt = tf.compat.v1.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=to_aggregate,
            total_num_replicas=workers)
        sync_replicas_hook = opt.make_session_run_hook(config.is_chief)
        train_hooks.append(sync_replicas_hook)

    tvars = tf.compat.v1.trainable_variables()
    grads_and_vars = opt.compute_gradients(loss=total_loss, var_list=tvars)
    # TODO(b/172564129): switch to tf.contrib.estimator.clip_gradients_by_norm
    if gradient_max_norm > 0.0:
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        grads, _ = tf.clip_by_global_norm(grads, gradient_max_norm)
        grads_and_vars = list(zip(grads, tvars))

    if use_synchronous_optimizer:
        apply_gradients_op = opt.apply_gradients(grads_and_vars, global_step)
    else:
        apply_gradients_op = opt.apply_gradients(grads_and_vars)

    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        return tf.group(apply_gradients_op), train_hooks
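For contrast with the eager examples above, a compact graph-mode (tf.compat.v1) sketch of the same compute -> clip -> apply path; the variable, loss, and clip norm are illustrative assumptions.

import tensorflow as tf

tf.compat.v1.disable_eager_execution()
x = tf.compat.v1.get_variable('x', initializer=[3.0, 4.0])
loss = tf.nn.l2_loss(x)
opt = tf.compat.v1.train.GradientDescentOptimizer(0.1)
grads_and_vars = opt.compute_gradients(loss)
grads, tvars = zip(*grads_and_vars)
clipped, _ = tf.clip_by_global_norm(list(grads), 1.0)
train_op = opt.apply_gradients(list(zip(clipped, tvars)))

with tf.compat.v1.Session() as sess:
  sess.run(tf.compat.v1.global_variables_initializer())
  sess.run(train_op)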
Example #16
  def _step(self) -> Dict[str, tf.Tensor]:
    # Get data from replay (dropping extras if any). Note there is no
    # extra data here because we do not insert any into Reverb.
    sample = next(self._iterator)
    o_tm1, a_tm1, r_t, d_t, o_t = sample.data[:5]

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=d_t.dtype)

    q_t = self._target_critic_network(o_t,
                                      self._policy_network(o_t))
    if not self._distributional and self._vmin is not None:
      q_t = tf.clip_by_value(q_t, self._vmin, self._vmax)
      logging.info('Clip target critic network output with [%f, %f]',
                   self._vmin, self._vmax)

    with tf.GradientTape() as tape:
      # Critic learning.
      q_tm1 = self._critic_network(o_tm1, a_tm1)

      # Critic loss.
      if self._distributional:
        critic_loss = losses.categorical(q_tm1, r_t, discount * d_t, q_t)
      else:
        # Squeeze into the shape expected by the td_learning implementation.
        q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]
        q_t = tf.squeeze(q_t, axis=-1)  # [B]
        critic_loss = trfl.td_learning(q_tm1, r_t, discount * d_t, q_t).loss

      critic_loss = tf.reduce_mean(critic_loss, axis=[0])

    # Get trainable variables.
    critic_variables = self._critic_network.trainable_variables

    # Compute gradients.
    critic_gradients = tape.gradient(critic_loss, critic_variables)

    # Maybe clip gradients.
    if self._clipping:
      critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

    # Apply gradients.
    self._critic_optimizer.apply(critic_gradients, critic_variables)

    source_variables = self._critic_network.variables
    target_variables = self._target_critic_network.variables

    # Make online -> target network update ops.
    if tf.math.mod(self._num_steps, self._target_update_period) == 0:
      for src, dest in zip(source_variables, target_variables):
        dest.assign(src)

    if self._init_observations is not None:
      if tf.math.mod(self._num_steps, 100) == 0:
        init_obs = tree.map_structure(tf.convert_to_tensor,
                                      self._init_observations)
        init_actions = self._policy_network(init_obs)
        init_critic = tf.reduce_mean(self._critic_mean(init_obs, init_actions))
      else:
        init_critic = tf.constant(0.)
    else:
      init_critic = tf.constant(0.)

    self._num_steps.assign_add(1)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'q_s0': init_critic,
    }
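A toy sketch of the periodic online -> target variable copy used in the step above; the variable lists and update period are illustrative assumptions.

import tensorflow as tf

online = [tf.Variable([1.0]), tf.Variable([2.0])]
target = [tf.Variable([0.0]), tf.Variable([0.0])]
num_steps = tf.Variable(0, dtype=tf.int64)
target_update_period = 4

for _ in range(8):
  # Copy the online variables into the target copies every N steps.
  if tf.math.mod(num_steps, target_update_period) == 0:
    for src, dest in zip(online, target):
      dest.assign(src)
  num_steps.assign_add(1)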
Example #17
  def step_fn_d(self, batch):
    outputs = self.model(batch)
    d_losses, grads = self.model.discriminator_step_fn(outputs)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_clip_norm)
    self.d_optimizer.apply_gradients(
        zip(grads, self.model.discriminator_variables))
    return d_losses
Example #18
  def step_fn_g(self, batch):
    """Per-Replica training step."""
    outputs, losses, grads = self.model.step_fn(batch)
    grads, _ = tf.clip_by_global_norm(grads, self.grad_clip_norm)
    self.optimizer.apply_gradients(zip(grads, self.model.generator_variables))
    return losses