def _check(self, rewards, values, step_types, discounts, expected):
    np.testing.assert_array_almost_equal(
        value_ops.discounted_return(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            time_major=False), expected)
    np.testing.assert_array_almost_equal(
        value_ops.discounted_return(
            rewards=torch.stack([rewards, 2 * rewards], dim=2),
            values=torch.stack([values, 2 * values], dim=2),
            step_types=step_types,
            discounts=discounts,
            time_major=False), torch.stack([expected, 2 * expected], dim=2))
def _calc_returns_and_advantages(self, training_info, value):
    if self._use_vtrace:
        return value_ops.calc_vtrace_returns_and_advantages(
            training_info, value, self._gamma, self._action_spec,
            self._lambda, self._debug_summaries)

    returns = value_ops.discounted_return(
        rewards=training_info.reward,
        values=value,
        step_types=training_info.step_type,
        discounts=training_info.discount * self._gamma)
    returns = common.tensor_extend(returns, value[-1])

    if not self._use_gae:
        advantages = returns - value
    else:
        advantages = value_ops.generalized_advantage_estimation(
            rewards=training_info.reward,
            values=value,
            step_types=training_info.step_type,
            discounts=training_info.discount * self._gamma,
            td_lambda=self._lambda)
        advantages = common.tensor_extend_zero(advantages)
        if self._use_td_lambda_return:
            returns = advantages + value

    return returns, advantages
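# Illustrative sketch (not part of the algorithm above): a plain-Python
# reference of what `_calc_returns_and_advantages` computes in the non-vtrace
# branch, assuming a single trajectory with no episode boundaries (step_types
# ignored).  `discounts` is assumed to already include gamma, mirroring
# `training_info.discount * self._gamma`; the bootstrap from the final value
# mirrors `tensor_extend(returns, value[-1])` and the trailing zero advantage
# mirrors `tensor_extend_zero(advantages)`.  The function name is hypothetical.


def _reference_returns_and_gae(rewards, values, discounts, td_lambda):
    T = len(rewards) - 1  # number of transitions between T + 1 time steps
    returns = [0.0] * (T + 1)
    advantages = [0.0] * (T + 1)
    returns[T] = values[T]  # bootstrap the tail with the last value estimate
    gae = 0.0
    for t in reversed(range(T)):
        g = discounts[t + 1]
        returns[t] = rewards[t + 1] + g * returns[t + 1]
        delta = rewards[t + 1] + g * values[t + 1] - values[t]
        gae = delta + g * td_lambda * gae
        advantages[t] = gae
    # With `use_td_lambda_return`, the training target would be
    # advantages + values, i.e. the TD(lambda) return.
    return returns, advantages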
def test_discounted_return(self):
    values = tf.constant([[1.] * 5], tf.float32)
    step_types = tf.constant([[StepType.MID] * 5], tf.int64)
    rewards = tf.constant([[2.] * 5], tf.float32)
    discounts = tf.constant([[0.9] * 5], tf.float32)
    expected = tf.constant(
        [[(((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
          ((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
          (1 * 0.9 + 2) * 0.9 + 2,
          1 * 0.9 + 2]],
        dtype=tf.float32)
    self.assertAllClose(
        value_ops.discounted_return(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            time_major=False), expected)

    # two episodes, the first ended by exceeding the time limit (discount=1)
    step_types = tf.constant([[
        StepType.MID, StepType.MID, StepType.LAST, StepType.MID, StepType.MID
    ]], tf.int32)
    expected = tf.constant(
        [[(1 * 0.9 + 2) * 0.9 + 2, 1 * 0.9 + 2, 1, 1 * 0.9 + 2]],
        dtype=tf.float32)
    self.assertAllClose(
        value_ops.discounted_return(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            time_major=False), expected)

    # two episodes, the first ended normally (discount=0)
    step_types = tf.constant([[
        StepType.MID, StepType.MID, StepType.LAST, StepType.MID, StepType.MID
    ]], tf.int32)
    discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
    expected = tf.constant(
        [[(0 * 0.9 + 2) * 0.9 + 2, 2, 1, 1 * 0.9 + 2]], dtype=tf.float32)
    self.assertAllClose(
        value_ops.discounted_return(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            time_major=False), expected)
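# A hedged aside on the expected constants above (not test code):
# `discounted_return` works backwards with
#     returns[t] = rewards[t + 1] + discounts[t + 1] * returns[t + 1],
# seeded with the bootstrap returns[-1] = values[-1] = 1, so the nested
# expressions in the first case evaluate to [7.5341, 6.149, 4.61, 2.9].
# In the two-episode cases, the return at the LAST step falls back to the
# value estimate there (the `1` entries), and the step before it bootstraps
# from that value scaled by its discount: 0.9 for the time-limit case and
# 0.0 for the normal-termination case, which is why that return drops to 2.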
def test_vtrace_returns_and_advantages_impl_on_policy_no_last_step(self):
    """Test vtrace_returns_and_advantages_impl on-policy, with no last_step
    in the middle of the trajectory.
    """
    importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
    values = tf.constant([[2.] * 5], tf.float32)
    step_types = tf.constant([[StepType.MID] * 5], tf.int64)
    rewards = tf.constant([[3.] * 5], tf.float32)
    discounts = tf.constant([[0.9] * 5], tf.float32)
    td_lambda = 1.0

    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    sa_returns, sa_adv = vtrace_scalable_agent(
        importance_ratio_clipped, discounts, rewards, values, step_types)
    self.assertAllClose(
        sa_adv, advantages, msg='advantages differ from scalable_agent')
    self.assertAllClose(
        sa_returns, returns, msg='returns differ from scalable_agent')

    expected_advantages = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        td_lambda=td_lambda,
        time_major=False)
    expected_advantages = tf.transpose(a=expected_advantages)
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ from gold')

    expected_returns = value_ops.discounted_return(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        time_major=False)
    expected_returns = tf.transpose(a=expected_returns)
    values = tf.transpose(a=values)
    expected_returns = common.tensor_extend(expected_returns, values[-1])
    expected_returns = tf.transpose(a=expected_returns)
    self.assertAllClose(
        expected_returns, returns, msg='returns differ from gold')
def test_vtrace_returns_and_advantages_impl_on_policy_has_last_step(self):
    """Test vtrace_returns_and_advantages_impl on-policy, with a last_step
    in the middle of the trajectory.
    """
    importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
    values = tf.constant([[2., 2.1, 2.2, 2.3, 2.4]], tf.float32)
    step_types = tf.constant([[
        StepType.MID, StepType.MID, StepType.LAST, StepType.MID, StepType.MID
    ]], tf.int32)
    rewards = tf.constant([[3., 3.1, 3.2, 3.3, 3.4]], tf.float32)
    discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
    td_lambda = 1.0

    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    expected_advantages = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        td_lambda=td_lambda,
        time_major=False)
    expected_advantages = tf.transpose(a=expected_advantages)
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ')

    expected_returns = value_ops.discounted_return(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        time_major=False)
    expected_returns = tf.transpose(a=expected_returns)
    values = tf.transpose(a=values)
    expected_returns = common.tensor_extend(expected_returns, values[-1])
    expected_returns = tf.transpose(a=expected_returns)
    self.assertAllClose(expected_returns, returns, msg='returns differ')
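# Illustrative sketch (not library code): the V-trace recursion from the
# IMPALA paper that the two tests above exercise.  A single clipped ratio is
# used for both rho_t and c_t, matching the tests' `importance_ratio_clipped`
# argument, td_lambda is fixed at 1, and episode boundaries are ignored.
# With all ratios equal to 1 (on-policy), vs_t reduces to the bootstrapped
# discounted return and the advantages reduce to GAE with td_lambda = 1,
# which is exactly what the tests compare against.  The function name is
# hypothetical.


def _reference_vtrace(ratios, rewards, values, discounts):
    T = len(rewards) - 1
    vs = list(values)  # vs[T] bootstraps from the last value estimate
    advantages = [0.0] * (T + 1)
    for t in reversed(range(T)):
        g = discounts[t + 1]
        delta = ratios[t] * (rewards[t + 1] + g * values[t + 1] - values[t])
        vs[t] = values[t] + delta + g * ratios[t] * (vs[t + 1] - values[t + 1])
        advantages[t] = ratios[t] * (rewards[t + 1] + g * vs[t + 1] - values[t])
    return vs, advantages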
def _calc_returns_and_advantages(self, experience, value):
    returns = value_ops.discounted_return(
        rewards=experience.reward,
        values=value,
        step_types=experience.step_type,
        discounts=experience.discount * self._gamma)
    returns = tensor_utils.tensor_extend(returns, value[-1])

    if not self._use_gae:
        advantages = returns - value
    else:
        advantages = value_ops.generalized_advantage_estimation(
            rewards=experience.reward,
            values=value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma,
            td_lambda=self._lambda)
        advantages = tensor_utils.tensor_extend_zero(advantages)
        if self._use_td_lambda_return:
            returns = advantages + value

    return returns, advantages
def forward(self, experience, value, target_value):
    """Calculate the loss.

    The first dimension of all the tensors is the time dimension and the
    second dimension is the batch dimension.

    Args:
        experience (Experience): experience collected from ``unroll()`` or
            a replay buffer. All tensors are time-major.
        value (torch.Tensor): the time-major tensor for the value at each
            time step. The loss is between this and the calculated return.
        target_value (torch.Tensor): the time-major tensor for the value at
            each time step. This is used to calculate the return.
            ``target_value`` can be the same as ``value``.
    Returns:
        LossInfo: with the ``extra`` field same as ``loss``.
    """
    if self._lambda == 1.0:
        returns = value_ops.discounted_return(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma)
    elif self._lambda == 0.0:
        returns = value_ops.one_step_discounted_return(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma)
    else:
        advantages = value_ops.generalized_advantage_estimation(
            rewards=experience.reward,
            values=target_value,
            step_types=experience.step_type,
            discounts=experience.discount * self._gamma,
            td_lambda=self._lambda)
        returns = advantages + target_value[:-1]

    value = value[:-1]
    if self._debug_summaries and alf.summary.should_record_summaries():
        mask = experience.step_type[:-1] != StepType.LAST
        with alf.summary.scope(self._name):

            def _summarize(v, r, td, suffix):
                alf.summary.scalar(
                    "explained_variance_of_return_by_value" + suffix,
                    tensor_utils.explained_variance(v, r, mask))
                safe_mean_hist_summary('values' + suffix, v, mask)
                safe_mean_hist_summary('returns' + suffix, r, mask)
                safe_mean_hist_summary("td_error" + suffix, td, mask)

            if value.ndim == 2:
                _summarize(value, returns, returns - value, '')
            else:
                td = returns - value
                for i in range(value.shape[2]):
                    suffix = '/' + str(i)
                    _summarize(value[..., i], returns[..., i], td[..., i],
                               suffix)

    loss = self._td_error_loss_fn(returns.detach(), value)

    if loss.ndim == 3:
        # Multidimensional reward: average the critic loss over all reward
        # dimensions.
        loss = loss.mean(dim=2)

    # The shape of the loss expected by Algorithm.update_with_gradient is
    # [T, B], so we need to augment it with additional zeros.
    loss = tensor_utils.tensor_extend_zero(loss)
    return LossInfo(loss=loss, extra=loss)
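# Illustrative sketch (not part of TDLoss): the three branches in `forward`
# are the td_lambda = 1, td_lambda = 0 and 0 < td_lambda < 1 cases of one
# TD(lambda) target.  Assuming a single trajectory with no LAST steps and
# `discounts` already including gamma, the recursion below interpolates
# between the one-step TD target (td_lambda = 0) and the full bootstrapped
# discounted return (td_lambda = 1), and in between equals
# "GAE advantages + target_value[:-1]".  The function name is hypothetical.


def _reference_td_lambda_return(rewards, values, discounts, td_lambda):
    T = len(rewards) - 1
    out = [0.0] * T
    next_return = values[T]  # bootstrap from the last value estimate
    for t in reversed(range(T)):
        g = discounts[t + 1]
        one_step = rewards[t + 1] + g * values[t + 1]
        out[t] = one_step + g * td_lambda * (next_return - values[t + 1])
        next_return = out[t]
    return out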