def estimate_advantage_bias_and_variance(
    advantage_fn,
    mean_reward=1.23,
    reward_noise=0.45,
    discount_mask=None,
    discount_true_return=True,
    true_value=False,
    n_samples=10000,
    length=5,
    gamma=0.9,
    margin=1,
    **advantage_kwargs,
):
    """Monte-Carlo estimate of the bias and variance of an advantage estimator.

    Samples ``n_samples`` trajectories of ``length`` i.i.d. normal rewards
    (mean ``mean_reward``, std ``reward_noise``), runs ``advantage_fn`` on
    them and compares the first-step advantages against the analytic
    expected return.

    Args:
        advantage_fn: Factory ``(gamma, margin, **kwargs) -> callable`` whose
            result maps ``(rewards, returns, values, dones, discount_mask)``
            to per-step advantages.
        mean_reward: Mean of the sampled rewards.
        reward_noise: Standard deviation of the sampled rewards.
        discount_mask: Optional discounting mask forwarded to
            ``advantages.mask_discount``; defaults to all-ones.
        discount_true_return: If True, the reference return is the discounted
            analytic return; otherwise the undiscounted ``mean_reward * length``.
        true_value: If True, use the analytic returns as the value baseline;
            otherwise a zero baseline.
        n_samples: Number of sampled trajectories.
        length: Trajectory length.
        gamma: Discount factor.
        margin: Margin parameter forwarded to ``advantage_fn``.
        **advantage_kwargs: Extra keyword arguments for ``advantage_fn``.

    Returns:
        Result of ``calc_bias_and_variance`` over the first-step advantages
        and the true first-step advantage.
    """
    advantage_fn = advantage_fn(gamma, margin, **advantage_kwargs)
    rewards = np.random.normal(
        loc=mean_reward, scale=reward_noise, size=(n_samples, length))
    if discount_mask is None:
        discount_mask = np.ones_like(rewards)
    gammas = advantages.mask_discount(gamma, discount_mask)
    returns = advantages.discounted_returns(rewards, gammas)
    # Analytic (noise-free) returns: every reward replaced by its mean.
    true_returns = advantages.discounted_returns(
        np.full(returns.shape, fill_value=mean_reward), gammas=gammas)
    if true_value:
        values = true_returns
    else:
        values = np.zeros_like(returns)
    # np.bool was removed in NumPy 1.24 — use the builtin bool dtype instead.
    dones = np.zeros_like(returns, dtype=bool)
    adv = advantage_fn(rewards, returns, values, dones, discount_mask)
    if discount_true_return:
        mean_return = true_returns[0, 0]
    else:
        mean_return = mean_reward * length
    return calc_bias_and_variance(adv[:, 0], mean_return - values[:, 0])
def calculate_returns(self, gamma):
    """Compute discounted returns and store them back on each timestep."""
    steps = self._timesteps
    step_rewards = np.array([step.reward for step in steps])
    mask = np.array([step.env_info.discount_mask for step in steps])
    masked_gammas = advantages.mask_discount(gamma, mask)
    discounted = advantages.discounted_returns(step_rewards, masked_gammas)
    # Write the computed return into each timestep in place.
    for index, ret in enumerate(discounted):
        steps[index] = steps[index]._replace(return_=ret)
def test_future_return_is_zero_iff_discount_mask_is_on(self, advantage_fn):
    """With gamma=0, the future return vanishes exactly where the mask is on."""
    rewards = np.array([[1, 2, 3, 4]], dtype=np.float32)
    values = np.array([[5, 6, 7, 8]], dtype=np.float32)
    # np.bool was removed in NumPy 1.24 — use the builtin bool dtype instead.
    dones = np.zeros_like(rewards, dtype=bool)
    discount_mask = np.array([[1, 0, 1, 0]], dtype=bool)
    gammas = advantages.mask_discount(0.0, discount_mask)
    returns = advantages.discounted_returns(rewards, gammas)
    adv = advantage_fn(gamma=0.0, margin=1)(
        rewards, returns, values, dones, discount_mask)
    target_returns = values[:, :-1] + adv
    # Assert that in the states with discount_mask on the future return in the
    # advantage is zero, i.e. the return is equal to the reward.
    rewards = rewards[:, :-1]
    discount_mask = discount_mask[:, :-1]
    np.testing.assert_almost_equal(
        target_returns[discount_mask], rewards[discount_mask])
    # Assert the converse.
    with np.testing.assert_raises(AssertionError):
        np.testing.assert_almost_equal(
            target_returns[~discount_mask], rewards[~discount_mask])