Example #1
import numpy as np

# `advantages` below is the project's advantage-estimation module (it provides
# mask_discount and discounted_returns); calc_bias_and_variance is a helper
# assumed to be defined alongside this function.


def estimate_advantage_bias_and_variance(advantage_fn,
                                         mean_reward=1.23,
                                         reward_noise=0.45,
                                         discount_mask=None,
                                         discount_true_return=True,
                                         true_value=False,
                                         n_samples=10000,
                                         length=5,
                                         gamma=0.9,
                                         margin=1,
                                         **advantage_kwargs):
    advantage_fn = advantage_fn(gamma, margin, **advantage_kwargs)
    # Sample noisy reward trajectories around the known mean reward.
    rewards = np.random.normal(loc=mean_reward,
                               scale=reward_noise,
                               size=(n_samples, length))
    if discount_mask is None:
        discount_mask = np.ones_like(rewards)
    gammas = advantages.mask_discount(gamma, discount_mask)
    returns = advantages.discounted_returns(rewards, gammas)

    # Returns of the noise-free (constant) reward sequence serve as the ground
    # truth against which the advantage estimates are compared.
    true_returns = advantages.discounted_returns(
        np.full(returns.shape, fill_value=mean_reward), gammas=gammas)
    if true_value:
        values = true_returns
    else:
        values = np.zeros_like(returns)

    dones = np.zeros_like(returns, dtype=bool)
    adv = advantage_fn(rewards, returns, values, dones, discount_mask)
    if discount_true_return:
        mean_return = true_returns[0, 0]
    else:
        mean_return = mean_reward * length
    # Compare the estimated advantages at t=0 against the true advantage.
    return calc_bias_and_variance(adv[:, 0], mean_return - values[:, 0])
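The function above delegates the final statistics to a calc_bias_and_variance helper that is not shown in this excerpt. One plausible reading, as a minimal sketch (the helper name comes from the call above; the exact definition of bias and variance here is an assumption):

import numpy as np

def calc_bias_and_variance(estimates, targets):
    # Bias: average deviation of the advantage estimates from their targets.
    # Variance: spread of the estimates around their own mean.
    bias = np.mean(estimates - targets)
    variance = np.var(estimates)
    return bias, variance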
Example #2
def calculate_returns(self, gamma):
    """Calculate discounted returns."""
    rewards = np.array([ts.reward for ts in self._timesteps])
    discount_mask = np.array(
        [ts.env_info.discount_mask for ts in self._timesteps])
    gammas = advantages.mask_discount(gamma, discount_mask)
    returns = advantages.discounted_returns(rewards, gammas)
    # Write the computed return back into each stored timestep.
    for (i, return_) in enumerate(returns):
        self._timesteps[i] = self._timesteps[i]._replace(return_=return_)
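The method reads ts.reward and ts.env_info.discount_mask and writes ts.return_ back via _replace, which suggests the stored timesteps are namedtuple-like records. A minimal sketch of a compatible structure, assuming nothing beyond the fields visible above:

import collections

# Hypothetical containers; the real definitions live elsewhere in the project.
EnvInfo = collections.namedtuple('EnvInfo', ['discount_mask'])
Timestep = collections.namedtuple('Timestep', ['reward', 'env_info', 'return_'])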
Example #3
def test_future_return_is_zero_iff_discount_mask_is_on(self, advantage_fn):
    # (... when gamma=0)
    rewards = np.array([[1, 2, 3, 4]], dtype=np.float32)
    values = np.array([[5, 6, 7, 8]], dtype=np.float32)
    dones = np.zeros_like(rewards, dtype=bool)
    discount_mask = np.array([[1, 0, 1, 0]], dtype=bool)
    gammas = advantages.mask_discount(0.0, discount_mask)
    returns = advantages.discounted_returns(rewards, gammas)
    adv = advantage_fn(gamma=0.0, margin=1)(rewards, returns, values,
                                            dones, discount_mask)
    target_returns = values[:, :-1] + adv
    # Assert that in states with discount_mask on, the future return inside
    # the advantage is zero, i.e. the target return equals the immediate
    # reward.
    rewards = rewards[:, :-1]
    discount_mask = discount_mask[:, :-1]
    np.testing.assert_almost_equal(target_returns[discount_mask],
                                   rewards[discount_mask])
    # Assert the converse: where the mask is off, the target return differs
    # from the immediate reward.
    with np.testing.assert_raises(AssertionError):
        np.testing.assert_almost_equal(target_returns[~discount_mask],
                                       rewards[~discount_mask])
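For context on what the assertions rely on, here is one possible reading of the two advantages helpers used throughout these examples. It is a sketch of the assumed semantics (gamma applies only where the mask is on, and returns follow the usual backward recursion), not the project's actual implementation:

import numpy as np

def mask_discount(gamma, discount_mask):
    # Per-step discount: gamma where the mask is on, no discounting otherwise.
    return np.where(discount_mask, gamma, 1.0)

def discounted_returns(rewards, gammas):
    # Backward recursion over time: return[t] = reward[t] + gamma[t] * return[t + 1].
    returns = np.zeros_like(rewards, dtype=np.float64)
    future = np.zeros(rewards.shape[0])
    for t in reversed(range(rewards.shape[1])):
        returns[:, t] = rewards[:, t] + gammas[:, t] * future
        future = returns[:, t]
    return returns

Under this reading, with gamma=0.0 the discount vanishes exactly at the masked steps, so the return there collapses to the immediate reward, which is what the test checks.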