def test_sync_vec_normalize():
    """Check that ``sync_envs_normalization`` aligns an eval env with a training env.

    A training ``VecNormalize`` env is stepped with random actions, then its
    normalization of an observation is compared with that of a separate,
    non-training ``VecNormalize`` env before and after syncing.
    """
    # NOTE(review): a second ``test_sync_vec_normalize`` (taking ``make_env``)
    # appears later in this source; if both live in the same module the later
    # definition replaces this one -- confirm which is intended.
    shared_kwargs = dict(norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)
    warmup_steps = 100

    env = DummyVecEnv([make_env])
    # Without a VecNormalize layer there is nothing to unwrap.
    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, **shared_kwargs)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    # The VecNormalize layer must still be found beneath an outer wrapper.
    env = VecFrameStack(env, 1)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    # Evaluation counterpart: same wrapper stack, created with training=False.
    eval_env = VecFrameStack(
        VecNormalize(DummyVecEnv([make_env]), training=False, **shared_kwargs),
        1,
    )

    env.reset()
    # Initialize running mean
    for _ in range(warmup_steps):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)

    # Only the training env has been stepped, so normalization must differ.
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # After syncing, observations and rewards must be normalized identically.
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    train_rewards = env.normalize_reward(dummy_rewards)
    eval_rewards = eval_env.normalize_reward(dummy_rewards)
    assert np.allclose(train_rewards, eval_rewards)
def test_sync_vec_normalize(make_env):
    """Check un-normalization and ``sync_envs_normalization`` on ``VecNormalize``.

    The training env is seeded and stepped with random actions. The test then
    checks that un-normalizing the latest reward and the reset observation
    recovers the original values, and that a separate non-training
    ``VecNormalize`` env normalizes like the training env only after syncing.

    :param make_env: callable passed to ``DummyVecEnv`` to build each
        underlying environment.
    """
    # NOTE(review): an earlier ``test_sync_vec_normalize`` (without parameters)
    # appears in this source; if both live in the same module this definition
    # replaces it -- confirm which is intended.
    shared_kwargs = dict(norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)
    warmup_steps = 100

    env = DummyVecEnv([make_env])
    # Without a VecNormalize layer there is nothing to unwrap.
    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, **shared_kwargs)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    # Frame stacking is only added when the observation space is not a Dict.
    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    # Evaluation counterpart, created with training=False.
    eval_env = VecNormalize(DummyVecEnv([make_env]), training=False, **shared_kwargs)
    # The decision is keyed on the training env's observation space.
    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(warmup_steps):
        _obs, latest_reward, _dones, _infos = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    recovered_reward = env.unnormalize_reward(latest_reward)
    assert np.allclose(original_latest_reward, recovered_reward)

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()

    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))

    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    train_rewards = env.normalize_reward(dummy_rewards)
    eval_rewards = eval_env.normalize_reward(dummy_rewards)
    assert allclose(train_rewards, eval_rewards)