import gym
import numpy as np
from gym.wrappers import NormalizeReward
from numpy.testing import assert_almost_equal


def test_normalize_return_vector_env():
    env_fns = [make_env(0), make_env(1)]
    envs = gym.vector.SyncVectorEnv(env_fns)
    envs = NormalizeReward(envs)
    obs = envs.reset()
    obs, reward, _, _ = envs.step(envs.action_space.sample())
    assert_almost_equal(
        envs.return_rms.mean,
        np.mean([1.5]),  # the mean of the first returns [[1, 2]]
        decimal=4,
    )
    obs, reward, _, _ = envs.step(envs.action_space.sample())
    assert_almost_equal(
        envs.return_rms.mean,
        # the mean of the first and second returns
        # [[1, 2], [2 + gamma * 1, 3 + gamma * 2]]
        np.mean([[1, 2], [2 + envs.gamma * 1, 3 + envs.gamma * 2]]),
        decimal=4,
    )
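# A minimal reference computation for the means asserted above (a sketch,
# assuming NormalizeReward keeps a per-env discounted return accumulator
# R_t = gamma * R_{t-1} + r_t and pushes every R_t into the running
# mean/std estimator `return_rms`). `_expected_return_rms_mean` is a
# hypothetical helper written for illustration, not part of gym.
def _expected_return_rms_mean(reward_batches, gamma):
    returns = np.zeros(len(reward_batches[0]))
    samples = []
    for rewards in reward_batches:
        # accumulate the discounted return per environment
        returns = gamma * returns + np.asarray(rewards, dtype=np.float64)
        samples.extend(returns.tolist())
    # the running estimator's mean over sequential batches equals the
    # plain mean over all return samples seen so far
    return np.mean(samples)


# e.g. _expected_return_rms_mean([[1, 2], [2, 3]], 0.99) reproduces
# np.mean([[1, 2], [2 + 0.99 * 1, 3 + 0.99 * 2]])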
def test_normalize_return():
    env = DummyRewardEnv(return_reward_idx=0)
    env = NormalizeReward(env)
    env.reset()
    env.step(env.action_space.sample())
    assert_almost_equal(
        env.return_rms.mean,
        np.mean([1]),  # [first return]
        decimal=4,
    )
    env.step(env.action_space.sample())
    assert_almost_equal(
        env.return_rms.mean,
        np.mean([2 + env.gamma * 1, 1]),  # [second return, first return]
        decimal=4,
    )
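# The tests above rely on the DummyRewardEnv and make_env helpers, which the
# original test module defines elsewhere. Below is a plausible sketch that is
# consistent with the asserted values (an assumption, not the canonical
# definitions): the env emits the deterministic reward sequence
# idx + 1, idx + 2, ..., so the expected discounted returns can be written
# out by hand.
class DummyRewardEnv(gym.Env):
    def __init__(self, return_reward_idx=0):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64
        )
        self.return_reward_idx = return_reward_idx
        self.t = return_reward_idx

    def step(self, action):
        self.t += 1
        # reward grows by 1 each step; the episode ends after a few steps
        return np.array([self.t], dtype=np.float64), self.t, self.t >= 5, {}

    def reset(self):
        self.t = self.return_reward_idx
        return np.array([self.t], dtype=np.float64)


def make_env(return_reward_idx):
    # SyncVectorEnv expects a list of zero-argument env constructors
    def thunk():
        return DummyRewardEnv(return_reward_idx)

    return thunk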