Example #1
    def _train(self, experience, weights):
        (observations, actions,
         rewards) = bandit_utils.process_experience_for_neural_agents(
             experience, self._observation_and_action_constraint_splitter,
             self._accepts_per_arm_features, self.training_data_spec)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
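
All of the examples in this section share one pattern: compute the loss under a `tf.GradientTape`, pair the gradients with their variables, optionally clip them with `eager_utils.clip_gradient_norms`, and hand the result to the optimizer. A minimal, self-contained sketch of that pattern follows; the toy Keras model, data, and `max_norm` value are assumptions for illustration, not taken from any one example above.

import tensorflow as tf
from tf_agents.utils import eager_utils

# Assumed toy model, optimizer, and clipping threshold.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(1e-3)
max_norm = 1.0

x = tf.random.normal([8, 4])
y = tf.random.normal([8, 1])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))

variables_to_train = model.trainable_weights
grads = tape.gradient(loss, variables_to_train)
# zip() is a one-shot iterator in Python 3, so it is materialized into a
# tuple before being passed on (mirroring the comment in the snippets above).
grads_and_vars = tuple(zip(grads, variables_to_train))
grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars, max_norm)
optimizer.apply_gradients(grads_and_vars)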
Example #2
  def _train(self, experience, weights):
    time_steps, actions, next_time_steps = self._experience_to_transitions(
        experience)

    with tf.GradientTape() as tape:
      loss_info = self.loss(time_steps,
                            actions,
                            next_time_steps,
                            td_errors_loss_fn=self._td_errors_loss_fn,
                            gamma=self._gamma,
                            reward_scale_factor=self._reward_scale_factor,
                            weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    grads_and_vars = zip(grads, variables_to_train)
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    self._optimizer.apply_gradients(grads_and_vars,
                                    global_step=self.train_step_counter)

    self._update_target()

    return loss_info
Example #3
    def train_complete(self,
                       tape: tf.GradientTape,
                       training_info: TrainingInfo,
                       weight=1.0):
        """Complete one iteration of training.

        `train_complete` should calculate gradients and update parameters using
        those gradients.

        Args:
            tape (tf.GradientTape): the tape used for calculating gradients.
                All of the previous `train_interval` `train_step()` calls are
                made under the context of this tape.
            training_info (TrainingInfo): information collected for training.
                training_info.info is batched from the policy_step.info
                returned by each train_step() call.
            weight (float): weight for this batch. The loss will be multiplied
                by this weight before calculating gradients.
        Returns:
            a tuple of the following:
            loss_info (LossInfo): loss information
            grads_and_vars (list[tuple]): list of gradient and variable tuples
        """
        valid_masks = tf.cast(
            tf.not_equal(training_info.step_type, StepType.LAST), tf.float32)

        # reward shaping
        if self._reward_shaping_fn is not None:
            # record unshaped extrinsic rewards given by the environment
            self.add_reward_summary("reward/raw", training_info.reward)
            training_info = training_info._replace(
                reward=self._reward_shaping_fn(training_info.reward))

        # record shaped extrinsic rewards actually used for training
        self.add_reward_summary("reward/extrinsic", training_info.reward)

        with tape:
            loss_info = self.calc_loss(training_info)
            loss_info = tf.nest.map_structure(
                lambda l: tf.reduce_mean(l * valid_masks), loss_info)
            loss = weight * loss_info.loss

        var_sets = self._get_cached_var_sets()
        all_grads_and_vars = ()
        for vars, optimizer in zip(var_sets, self._optimizers):
            grads = tape.gradient(loss, vars)
            grads_and_vars = tuple(zip(grads, vars))
            all_grads_and_vars = all_grads_and_vars + grads_and_vars
            if self._gradient_clipping is not None:
                if self._clip_by_global_norm:
                    grads, _ = tf.clip_by_global_norm(grads,
                                                      self._gradient_clipping)
                    grads_and_vars = tuple(zip(grads, vars))
                else:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

            optimizer.apply_gradients(grads_and_vars)

        return loss_info, all_grads_and_vars
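
Example #3 switches between two clipping schemes. As a minimal sketch with assumed toy gradients and variables: `tf.clip_by_global_norm` rescales all gradients jointly so their combined norm stays under the threshold, while `eager_utils.clip_gradient_norms` caps each gradient's norm independently.

import tensorflow as tf
from tf_agents.utils import eager_utils

# Toy gradients and variables, assumed for illustration only.
variables = [tf.Variable(0.0), tf.Variable(0.0)]
grads = [tf.constant(3.0), tf.constant(4.0)]  # global norm = 5.0
clip = 2.5

# Global-norm clipping scales every gradient by the same factor (0.5 here),
# preserving their relative magnitudes: [1.5, 2.0].
globally_clipped, global_norm = tf.clip_by_global_norm(grads, clip)

# Per-gradient clipping caps each gradient's norm independently: [2.5, 2.5].
per_gradient_clipped = eager_utils.clip_gradient_norms(
    tuple(zip(grads, variables)), clip)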
Example #4
  def _train(self, experience, weights):
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights,
          training=True)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    non_trainable_weights = self._q_network.non_trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = list(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      grads_and_vars_with_non_trainable = (
          grads_and_vars + [(None, v) for v in non_trainable_weights])
      eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)
    self._optimizer.apply_gradients(grads_and_vars)
    self.train_step_counter.assign_add(1)

    self._update_target()

    return loss_info
Example #5
    def _apply_loss(self, aggregated_losses, variables_to_train, tape,
                    optimizer):
        total_loss = aggregated_losses.total_loss
        tf.debugging.check_numerics(total_loss, "Loss is inf or nan")
        assert list(variables_to_train), "No variables in the agent's network."

        grads = tape.gradient(total_loss, variables_to_train)
        grads_and_vars = list(zip(grads, variables_to_train))

        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self.summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)

        optimizer.apply_gradients(grads_and_vars)

        if self.summaries_enabled:
            dict_losses = {
                "loss": aggregated_losses.weighted,
                "reg_loss": aggregated_losses.regularization,
                "total_loss": total_loss
            }
            common.summarize_scalar_dict(dict_losses,
                                         step=self.train_step_counter,
                                         name_scope="Losses/")
Example #6
  def _train_v1(self, experience, weights):
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
    variables_to_train = self._q_network.trainable_weights
    assert list(variables_to_train), "No variables in the agent's q_network."
    grads = tape.gradient(loss_info.loss, variables_to_train)
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(grads, variables_to_train))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    train_op = self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)

    update_op = self._update_target()
    train_op = tf.group(train_op, update_op)

    return train_op, loss_info
Example #7
    def _train(self, experience: types.NestedTensor,
               weights: types.Tensor) -> tf_agent.LossInfo:
        experience = self._as_trajectory(experience)

        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, weights=weights, training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #8
    def _train(self, experience, weights=None):
        # TODO(b/126593927): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')

        experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                           experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #9
    def train(self, x0, a0, y0, y1, r0, r1, vars_to_train):
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(vars_to_train)

            feat_x0, _ = self.forward_enc(x0, training=True)
            if self.action_condition:
                e_zx0_param, _ = self.forward_head([feat_x0, a0],
                                                   training=True)
            else:
                e_zx0_param, _ = self.forward_head(feat_x0, training=True)
            e_zx0_loc, e_zx0_scale = e_zx0_param
            e_zx0 = tfd.MultivariateNormalDiag(loc=e_zx0_loc,
                                               scale_diag=e_zx0_scale)
            zx0 = e_zx0.sample()

            feat_y0, _ = self.backward_enc(y0,
                                           training=self.learn_backward_enc)
            if not self.learn_backward_enc:
                feat_y0 = tf.stop_gradient(feat_y0)
            if self.backward_encode_rewards:
                b_zy0_param, _ = self.backward_head([feat_y0, r0],
                                                    training=True)
            else:
                b_zy0_param, _ = self.backward_head(feat_y0, training=True)
            b_zy0_loc, b_zy0_scale = b_zy0_param
            b_zy0 = tfd.MultivariateNormalDiag(loc=b_zy0_loc,
                                               scale_diag=b_zy0_scale)

            b_zy1 = None
            if self.ceb.smooth_mode is not None:
                feat_y1, _ = self.backward_enc(
                    y1, training=self.learn_backward_enc)
                if not self.learn_backward_enc:
                    feat_y1 = tf.stop_gradient(feat_y1)
                if self.backward_encode_rewards:
                    b_zy1_param, _ = self.backward_head([feat_y1, r1],
                                                        training=True)
                else:
                    b_zy1_param, _ = self.backward_head(feat_y1, training=True)
                b_zy1_loc, b_zy1_scale = b_zy1_param
                b_zy1 = tfd.MultivariateNormalDiag(loc=b_zy1_loc,
                                                   scale_diag=b_zy1_scale)

            if self.y_decoders is None:  # pure contrastive CEB
                loss = self.ceb.loss(zx0, e_zx0, b_zy0, b_zy1)
            else:  # CEB with generative objectives
                # y_targets0 = [y0, r0]
                y_targets0 = [tf.cast(y0, tf.float32) / 255.0, r0]
                y_preds0 = self.y_decoders(zx0, training=True)
                loss = self.ceb.loss(zx0, e_zx0, b_zy0, b_zy1, y_preds0,
                                     y_targets0)

        grads = tape.gradient(loss, vars_to_train)
        grads_and_vars = tuple(zip(grads, vars_to_train))
        if self.grad_clip is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self.grad_clip)
        self.optimizer.apply_gradients(grads_and_vars)
        return loss, feat_x0, zx0
Example #10
    def compute_loss_using_reward_layer(
            self,
            observation: types.NestedTensor,
            action: types.Tensor,
            reward: types.Tensor,
            weights: Optional[types.Float] = None,
            training: bool = False) -> tf_agent.LossInfo:
        """Computes loss using the reward layer.

    Args:
      observation: A batch of observations.
      action: A batch of actions.
      reward: A batch of rewards.
      weights: Optional scalar or elementwise (per-batch-entry) importance
        weights.  The output batch loss will be scaled by these weights, and
        the final scalar loss is the mean of these values.
      training: Whether the loss is being used for training.

    Returns:
      loss: A `LossInfo` containing the loss for the training step.
    """
        # Update the neural network params.
        with tf.GradientTape() as tape:
            loss_info = self._loss_using_reward_layer(observation,
                                                      action,
                                                      reward,
                                                      weights,
                                                      training=training)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        tf.compat.v2.summary.scalar(name='using_reward_layer',
                                    data=1,
                                    step=self.train_step_counter)
        if self._summarize_grads_and_vars:
            self.compute_summaries(loss_info.loss)
        variables_to_train = (self._encoding_network.trainable_weights +
                              self._reward_layer.trainable_weights)
        if not variables_to_train:
            raise ValueError('No variable to train in the agent.')

        grads = tape.gradient(loss_info.loss, variables_to_train)
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Reward_network/'):
                eager_utils.add_variables_summaries(grads_and_vars,
                                                    self.train_step_counter)
                eager_utils.add_gradients_summaries(grads_and_vars,
                                                    self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #11
    def _train(self, experience, weights=None):
        # Add a mask to ensure we reset the return calculation at episode
        # boundaries. This is needed in cases where episodes are truncated before
        # reaching a terminal state.
        non_last_mask = tf.cast(
            tf.math.not_equal(experience.next_step_type, ts.StepType.LAST),
            tf.float32)
        discounts = non_last_mask * experience.discount * self._gamma
        returns = value_ops.discounted_return(experience.reward,
                                              discounts,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = list(zip(grads, variables_to_train))
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #12
  def testClipGrads(self):
    xs = tf.Variable(0.0)
    grads = tf.constant(4.0)
    gradients_to_variables = [(grads, xs)]
    clipped_gradients_to_variables = eager_utils.clip_gradient_norms(
        gradients_to_variables, 3.0)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAlmostEqual(4.0, self.evaluate(gradients_to_variables[0][0]))
    self.assertAlmostEqual(3.0,
                           self.evaluate(clipped_gradients_to_variables[0][0]))
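
The test above pins down the per-gradient semantics of `eager_utils.clip_gradient_norms`. A brief sketch of how it should agree with `tf.clip_by_norm` applied to a single dense gradient; the variable and gradient below are illustrative, not part of the test.

import tensorflow as tf
from tf_agents.utils import eager_utils

var = tf.Variable(0.0)   # illustrative variable, not from the test above
grad = tf.constant(4.0)

# clip_gradient_norms caps each gradient's norm independently at max_norm,
# so the result matches tf.clip_by_norm applied to the gradient alone.
clipped_pairs = eager_utils.clip_gradient_norms([(grad, var)], 3.0)
clipped_grad = clipped_pairs[0][0]       # 3.0
reference = tf.clip_by_norm(grad, 3.0)   # 3.0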
Example #13
    def _train(self, experience, weights=None):
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            returns = _standard_normalize(returns, axes=(0, 1))
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #14
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self.training_data_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if self._accepts_per_arm_features:
            # The arm observation we train on needs to be copied from the respective
            # policy info field to the per arm observation field. Pretending there was
            # only one action, we fill the action field with zeros.
            chosen_action, _ = nest_utils.flatten_multi_batched_nested_tensors(
                experience.policy_info.chosen_arm_features,
                self.policy.info_spec.chosen_arm_features)
            observations[
                bandit_spec_utils.PER_ARM_FEATURE_KEY] = tf.expand_dims(
                    chosen_action, axis=1)
            actions = tf.zeros_like(actions)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)

        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
Example #15
    def _apply_gradients(self, gradients, variables, optimizer):
        grads_and_vars = list(zip(gradients, variables))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        optimizer.apply_gradients(grads_and_vars)
Example #16
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        #observations = time_steps.observation
        actions = policy_steps_.action

        rewards = next_time_steps.reward
        print(rewards)
        discounts = next_time_steps.discount
        if self._reward_normalizer:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        value_preds = self.double_batch_pred(self._mod_net,
                                             experience.observation,
                                             is_training=True)
        #print("VPRED",value_preds.shape,value_preds_2.shape)

        returns = self.compute_return(next_time_steps, value_preds)
        value_estimation_losses = []

        loss_info = None
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, returns,
                        weights)  #action_distribution_parameters

                variables_to_train = self._mod_net.trainable_weights
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                self._optimizer.apply_gradients(
                    grads_and_vars)  #, global_step=self.train_step_counter)

                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)

        loss_info = tf.nest.map_structure(tf.identity, loss_info)
        return loss_info
Example #17
    def testClipGradsIndexedSlices(self):
        xs = tf.Variable(0.0)
        grads = tf.IndexedSlices(values=tf.constant(4.0),
                                 indices=tf.constant([0]),
                                 dense_shape=None)
        gradients_to_variables = [(grads, xs)]
        clipped_gradients_to_variables = eager_utils.clip_gradient_norms(
            gradients_to_variables, 3.0)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAlmostEqual(
            4.0, self.evaluate(gradients_to_variables[0][0].values))
        self.assertAlmostEqual(
            3.0, self.evaluate(clipped_gradients_to_variables[0][0].values))
Example #18
  def _apply_gradients(self, gradients, variables, optimizer):
    # Tuple is used for py3, where zip is a generator producing values once.
    grads_and_vars = tuple(zip(gradients, variables))
    if self._gradient_clipping is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                       self._gradient_clipping)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

    return optimizer.apply_gradients(grads_and_vars)
Example #19
    def _train(self, experience, weights=None):
        # TODO(b/132914246): Use .is_last() to mask the end of each episode.
        returns = value_ops.discounted_return(experience.reward,
                                              experience.discount *
                                              self._gamma,
                                              time_major=False)

        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        with tf.GradientTape() as tape:
            loss_info = self.total_loss(time_step,
                                        experience.action,
                                        tf.stop_gradient(returns),
                                        weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        variables_to_train = self._actor_network.trainable_weights
        if self._baseline:
            variables_to_train += self._value_network.trainable_weights
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
Example #20
    def _train(self, experience: types.NestedTensor,
               weights: types.Tensor) -> tf_agent.LossInfo:
        (observations, actions,
         objective_values) = bandit_utils.process_experience_for_neural_agents(
             experience, self._accepts_per_arm_features,
             self.training_data_spec)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)
        if objective_values.shape.rank != 2:
            raise ValueError(
                'The objectives tensor should be rank-2 [batch_size, num_objectives],'
                ' but found to be rank-{}'.format(objective_values.shape.rank))
        if objective_values.shape[1] != self._num_objectives:
            raise ValueError(
                'The number of objectives in the objective_values tensor: {} '
                'is different from the number of objective networks: {}.'.
                format(objective_values.shape[1], self._num_objectives))

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  objective_values,
                                  weights=weights,
                                  training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)

        return loss_info
Example #21
    def _train(self, experience, weights):
        rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        if self._observation_and_action_constraint_splitter is not None:
            observations, _ = self._observation_and_action_constraint_splitter(
                observations)

        with tf.GradientTape() as tape:
            loss_info = self.loss(observations,
                                  actions,
                                  rewards,
                                  weights=weights,
                                  training=True)
        tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
        self.compute_summaries(loss_info.loss)
        variables_to_train = self._reward_network.trainable_weights
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        training_lib.apply_gradients(self._optimizer,
                                     grads_and_vars,
                                     global_step=self.train_step_counter)

        return loss_info
Example #22
    def _update_values(self, time_steps, returns, weights):
        """Update value function estimate by performing gradient descent on value loss"""
        variables_to_train = self._value_net.trainable_weights

        value_loss = 0.0
        for _ in range(self._value_train_iters):
            with tf.GradientTape() as tape:
                value_loss = self.value_estimation_loss(
                    time_steps, returns, weights)

            grads = tape.gradient(value_loss, variables_to_train)

            # Tuple is used for py3, where zip is a generator producing values once.
            grads_and_vars = tuple(zip(grads, variables_to_train))
            if self._gradient_clipping > 0:
                grads_and_vars = eager_utils.clip_gradient_norms(
                    grads_and_vars, self._gradient_clipping)

            self._optimizer.apply_gradients(
                grads_and_vars, global_step=self.train_step_counter)
        return value_loss
Example #23
    def _train(self, experience, weights):
        experience = self._as_trajectory(experience)

        with tf.GradientTape() as tape:
            loss_info = self._loss(experience, weights=weights, training=True)

        variables_to_train = self._variables_to_train()
        if not variables_to_train:
            logging.info('No variable to train in the agent.')
            return loss_info

        grads = tape.gradient(loss_info.loss, variables_to_train)
        # Tuple is used for py3, where zip is a generator producing values once.
        grads_and_vars = tuple(zip(grads, variables_to_train))
        if self._gradient_clipping is not None:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars)
        self.train_step_counter.assign_add(1)
        if not self._accepts_per_arm_features and self._num_samples_list:
            # Compute the number of samples for each action in the current batch.
            actions_flattened = tf.reshape(experience.action, [-1])
            num_samples_per_action_current = [
                tf.reduce_sum(tf.cast(tf.equal(actions_flattened, k),
                                      tf.int64))
                for k in range(self._num_actions)
            ]
            # Update the number of samples for each action.
            for a, b in zip(self._num_samples_list,
                            num_samples_per_action_current):
                tf.compat.v1.assign_add(a, b)

        return loss_info
Example #24
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # If summarize_gradients, create functions for summarizing both
                # gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean kl from previous action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
Example #25
def apply_gradients(gradients, variables, optimizer, gradient_clipping):
  grads_and_vars = zip(gradients, variables)
  if gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars, gradient_clipping)
  optimizer.apply_gradients(grads_and_vars)
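
A hedged usage sketch for the `apply_gradients` helper above; the toy model, loss, and optimizer are assumptions and do not come from the original snippet.

import tensorflow as tf

# Assumed toy model, loss, and optimizer; only apply_gradients above is real.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(0.1)

x = tf.random.normal([4, 3])
with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True)))
gradients = tape.gradient(loss, model.trainable_weights)

# gradient_clipping=1.0 routes through eager_utils.clip_gradient_norms;
# passing None would skip clipping entirely.
apply_gradients(gradients, model.trainable_weights, optimizer,
                gradient_clipping=1.0)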
Example #26
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)

        #observations = time_steps.observation
        actions = policy_steps_.action
        #rewards = next_time_steps.reward
        #discounts = next_time_steps.discount

        old_actions_distribution = policy_steps_.info

        act_log_probs = get_neglopacs(logits=old_actions_distribution,
                                      labels=actions)

        # Compute the value predictions for states using the current value function.

        value_preds = double_batch_pred2(self._value_net,
                                         experience.observation,
                                         self._observation_spec,
                                         is_training=True)
        value_preds = tf.squeeze(value_preds, -1)

        #NeedValue preds at all time_steps +1 final step obs
        #print("Weight",weights)
        #print("REW",rewards)
        #print("Dis",discounts)
        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        #print("RET",returns)
        #print(normalized_advantages)
        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, old_actions_distribution,
                        weights)  #action_distribution_parameters

                variables_to_train = (self._actor_net.trainable_variables +
                                      self._value_net.trainable_variables)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                self._optimizer.apply_gradients(
                    grads_and_vars)  #, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        # Compute the mean kl from previous action distribution.
        temp_ = double_batch_pred2(self._actor_net,
                                   time_steps.observation,
                                   self._observation_spec,
                                   is_training=True)
        kl_divergence = self._kl_divergence(time_steps,
                                            old_actions_distribution, temp_)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)
        return loss_info
Example #27
    def train_complete(self,
                       tape: tf.GradientTape,
                       training_info,
                       valid_masks=None,
                       weight=1.0):
        """Complete one iteration of training.

        `train_complete` should calculate gradients and update parameters using
        those gradients.

        Args:
            tape (tf.GradientTape): the tape used for calculating gradients.
                All of the previous `train_interval` `train_step()` calls are
                made under the context of this tape.
            training_info (nested Tensor): information collected for training.
                It is batched from the `info` returned by each `train_step()`
                call.
            valid_masks (tf.Tensor): masks indicating which samples are valid.
                shape=(T, B), dtype=tf.float32
            weight (float): weight for this batch. The loss will be multiplied
                by this weight before calculating gradients.
        Returns:
            loss_info (LossInfo): loss information
            grads_and_vars (list[tuple]): list of gradient and variable tuples
        """
        with tape:
            loss_info = self.calc_loss(training_info)
            if valid_masks is not None:
                loss_info = tf.nest.map_structure(
                    lambda l: tf.reduce_mean(l * valid_masks)
                    if len(l.shape) == 2 else l, loss_info)
            else:
                loss_info = tf.nest.map_structure(lambda l: tf.reduce_mean(l),
                                                  loss_info)
            if isinstance(loss_info.scalar_loss, tf.Tensor):
                assert len(loss_info.scalar_loss.shape) == 0
                loss_info = loss_info._replace(
                    loss=loss_info.loss + loss_info.scalar_loss)
            loss = weight * loss_info.loss

        opt_and_var_sets = self._get_cached_opt_and_var_sets()
        all_grads_and_vars = ()
        for i, (optimizer, vars) in enumerate(opt_and_var_sets):
            if len(vars) == 0:
                continue
            assert optimizer is not None, "optimizer needs to be provides at __init__()"
            grads = tape.gradient(loss, vars)
            grads_and_vars = tuple(zip(grads, vars))
            all_grads_and_vars = all_grads_and_vars + grads_and_vars
            if self._gradient_clipping is not None:
                if self._clip_by_global_norm:
                    grads, global_norm = tf.clip_by_global_norm(
                        grads, self._gradient_clipping)
                    grads_and_vars = tuple(zip(grads, vars))
                    alf.utils.common.run_if(
                        alf.utils.common.should_record_summaries(), lambda: tf.
                        summary.scalar("global_grad_norm/%s" % i, global_norm))
                else:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

            optimizer.apply_gradients(grads_and_vars)

        self.after_train(training_info)

        return loss_info, all_grads_and_vars
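
Examples #3 and #27 call `tape.gradient` once per (optimizer, variable-set) pair. In eager TF2 a non-persistent `tf.GradientTape` only supports a single `gradient()` call, so with more than one variable set the tape would presumably have to be created with `persistent=True` (or the gradients computed once over all variables and then split). A minimal sketch of that multi-optimizer pattern; the toy variable sets, optimizers, and global-norm clipping threshold are assumptions.

import tensorflow as tf

# Two assumed variable sets, each with its own optimizer.
var_sets = [[tf.Variable(1.0)], [tf.Variable(2.0)]]
optimizers = [tf.keras.optimizers.Adam(1e-3), tf.keras.optimizers.Adam(1e-3)]

# A non-persistent GradientTape allows only one gradient() call, so the
# per-optimizer loop below uses persistent=True.
with tf.GradientTape(persistent=True) as tape:
    loss = tf.add_n([tf.reduce_sum(tf.square(v[0])) for v in var_sets])

for variables, optimizer in zip(var_sets, optimizers):
    grads = tape.gradient(loss, variables)
    grads, _ = tf.clip_by_global_norm(grads, 1.0)
    optimizer.apply_gradients(zip(grads, variables))

del tape  # drop the persistent tape's resources once gradients are applied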