Example #1
    def _train(self, experience, weights=None):
        # TODO(b/126593927): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')

        experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                           experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.compat.v2.summary.histogram(name='rewards',
                                           data=experience.reward,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='discounts',
                                           data=experience.discount,
                                           step=self.train_step_counter)
            tf.compat.v2.summary.histogram(name='returns',
                                           data=returns,
                                           step=self.train_step_counter)

        # TODO(b/126592060): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.compat.v2.summary.histogram(name='normalized_returns',
                                               data=returns,
                                               step=self.train_step_counter)

        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)

        variables_to_train = self._actor_network.variables
        # Differentiate the REINFORCE loss w.r.t. the actor network variables;
        # the returns enter the loss through stop_gradient, so no gradient
        # flows back into the return computation.
        with tf.GradientTape() as tape:
            loss_info = self._loss(time_step,
                                   experience.action,
                                   tf.stop_gradient(returns),
                                   weights=weights)
            tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
        grads = tape.gradient(loss_info.loss, variables_to_train)

        grads_and_vars = zip(grads, variables_to_train)
        if self._gradient_clipping:
            grads_and_vars = eager_utils.clip_gradient_norms(
                grads_and_vars, self._gradient_clipping)

        if self._summarize_grads_and_vars:
            eager_utils.add_variables_summaries(grads_and_vars,
                                                self.train_step_counter)
            eager_utils.add_gradients_summaries(grads_and_vars,
                                                self.train_step_counter)

        self._optimizer.apply_gradients(grads_and_vars,
                                        global_step=self.train_step_counter)

        return tf.nest.map_structure(tf.identity, loss_info)
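
For reference, the return-normalization step above (the "tensor normalizer" TODO) can be exercised in isolation. A minimal sketch in eager TF 2.x with illustrative values; only the epsilon and axis choice mirror the code above:

import tensorflow as tf

returns = tf.constant([5.0, 4.0, 3.0, 2.0, 1.0])
ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
normalized = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
# `normalized` now has approximately zero mean and unit variance.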
Example #2
    def _train(self, experience, weights=None, train_step_counter=None):
        # TODO(sfishman): Support batch dimensions >1.
        if experience.step_type.shape[0] != 1:
            raise NotImplementedError(
                'ReinforceAgent does not yet support batch '
                'dimensions greater than 1.')
        experience = nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
        returns = common.compute_returns(experience.reward,
                                         experience.discount)
        if self._debug_summaries:
            tf.contrib.summary.histogram('rewards', experience.reward)
            tf.contrib.summary.histogram('discounts', experience.discount)
            tf.contrib.summary.histogram('returns', returns)

        # TODO(kbanoop): replace with tensor normalizer.
        if self._normalize_returns:
            ret_mean, ret_var = tf.nn.moments(returns, axes=[0])
            returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
            if self._debug_summaries:
                tf.contrib.summary.histogram('normalized_returns', returns)

        # TODO(kbanoop): remove after changing network interface to accept
        # observations and step_types, instead of time_steps.
        time_step = ts.TimeStep(experience.step_type,
                                tf.zeros_like(experience.reward),
                                tf.zeros_like(experience.discount),
                                experience.observation)
        # TODO(kbanoop): Filter boundary steps.

        loss_info = self._loss(time_step,
                               experience.action,
                               tf.stop_gradient(returns),
                               weights=weights)

        clip_gradients = (tf.contrib.training.clip_gradient_norms_fn(
            self._gradient_clipping) if self._gradient_clipping else None)

        # TODO(sguada): create_train_step should not return a Future.
        loss_info = eager_utils.create_train_step(
            loss_info,
            self._optimizer,
            total_loss_fn=lambda loss_info: loss_info.loss,
            global_step=train_step_counter,
            transform_grads_fn=clip_gradients,
            summarize_gradients=self._summarize_grads_and_vars,
            variables_to_train=lambda: self._actor_network.trainable_weights,
        )

        if isinstance(loss_info, eager_utils.Future):
            loss_info = loss_info()

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                for var in self._actor_network.trainable_weights:
                    tf.contrib.summary.histogram(var.name.replace(':', '_'),
                                                 var)

        return loss_info
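
The `tf.contrib.training.clip_gradient_norms_fn` helper used above is unavailable in TF 2.x. A rough stand-in for a `transform_grads_fn` that clips each gradient to a maximum norm could look like the sketch below; this is not the contrib implementation and it ignores `tf.IndexedSlices` gradients:

import tensorflow as tf

def clip_gradient_norms_fn(max_norm):
    """Returns a transform_grads_fn that clips each gradient to max_norm."""
    def transform(grads_and_vars):
        return [(tf.clip_by_norm(grad, max_norm) if grad is not None else grad, var)
                for grad, var in grads_and_vars]
    return transform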
Example #3
  def testComputeReturns(self):
    rewards = tf.constant(np.ones(9), dtype=tf.float32)
    discounts = tf.constant([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0], dtype=tf.float32)
    returns = common.compute_returns(rewards, discounts)
    expected_returns = [5, 4, 3, 2, 1, 3.439, 2.71, 1.9, 1]

    self.evaluate(tf.compat.v1.global_variables_initializer())
    returns = self.evaluate(returns)
    self.assertAllClose(returns, expected_returns)
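
The expected values follow the backward recursion return[t] = reward[t] + discount[t] * return[t+1]; the zero discounts at t=4 and t=8 mark episode boundaries, so the return resets to the immediate reward there. A standalone NumPy check of the same numbers:

import numpy as np

rewards = np.ones(9)
discounts = np.array([1, 1, 1, 1, 0, 0.9, 0.9, 0.9, 0])
returns = np.zeros(9)
accumulated = 0.0
for t in reversed(range(9)):
  accumulated = rewards[t] + discounts[t] * accumulated
  returns[t] = accumulated
print(returns)  # [5. 4. 3. 2. 1. 3.439 2.71 1.9 1.]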
Example #4
  def testComputeReturnsRandomized(self):
    rewards = tf.constant(np.random.random([20]), dtype=tf.float32)
    discounts = tf.constant(np.random.random([20]), dtype=tf.float32)
    returns = common.compute_returns(rewards, discounts)

    def _compute_returns_fn(rewards, discounts):
      """Python implementation of computing discounted returns."""
      returns = np.zeros(len(rewards))
      next_state_return = 0.0
      for t in range(len(returns) - 1, -1, -1):
        returns[t] = rewards[t] + discounts[t] * next_state_return
        next_state_return = returns[t]
      return returns.astype(np.float32)
    expected_returns = tf.py_func(_compute_returns_fn,
                                  [rewards, discounts],
                                  tf.float32)

    self.evaluate(tf.global_variables_initializer())
    returns = self.evaluate(returns)
    expected_returns = self.evaluate(expected_returns)
    self.assertAllClose(returns, expected_returns)
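
`common.compute_returns` itself is not shown here. As a sketch of the computation these tests exercise, assuming only the reference `_compute_returns_fn` above rather than the actual library code, the same backward recursion can be written as a reverse `tf.scan`:

import tensorflow as tf

def compute_returns(rewards, discounts):
  """Discounted returns R[t] = r[t] + d[t] * R[t+1], computed back to front."""
  def discounted_accumulate(accumulated, reward_and_discount):
    reward, discount = reward_and_discount
    return reward + discount * accumulated
  return tf.scan(discounted_accumulate, (rewards, discounts),
                 initializer=tf.constant(0.0, dtype=rewards.dtype),
                 reverse=True)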