# Imports assumed for these test snippets (Stable-Baselines3 test suite).
import gym
import numpy as np
import pytest
from gym import spaces

from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    VecFrameStack,
    VecNormalize,
    sync_envs_normalization,
    unwrap_vec_normalize,
)


def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(
        eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0
    )
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)

    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
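
# The tests reference a `make_env` factory that is not shown in this excerpt.
# A minimal sketch, assuming a simple Box-observation environment; the environment
# used by the original test suite may differ.
def make_env():
    # Any continuous-observation gym environment works for exercising VecNormalize.
    return gym.make("Pendulum-v1")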

# A later revision of the same test: parametrized over `make_env`, aware of Dict
# observation spaces, and additionally checking that unnormalization inverts normalization.
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(
        eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0
    )

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()

    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))

    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
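
# The `allclose` used above is not numpy's np.allclose: since the test may run with
# Dict observation spaces, a dict-aware comparison helper is assumed. A minimal sketch
# of such a helper (the original test suite's version may differ):
def allclose(obs_1, obs_2):
    # Compare two observations that may be either arrays or dicts of arrays.
    if isinstance(obs_1, dict):
        return all(np.allclose(obs_1[key], obs_2[key]) for key in obs_1.keys())
    return np.allclose(obs_1, obs_2)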

def test_eval_friendly_error():
    # Tests that the eval callback runs when given a properly wrapped VecEnv,
    # and warns/raises when the eval env is not wrapped like the train env.
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    _ = train_env.reset()
    original_obs = train_env.get_original_obs()
    model = A2C("MlpPolicy", train_env, n_steps=50, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(100, callback=eval_callback)

    # Check synchronization
    assert np.allclose(train_env.normalize_obs(original_obs), eval_env.normalize_obs(original_obs))

    wrong_eval_env = gym.make("CartPole-v1")
    eval_callback = EvalCallback(
        wrong_eval_env,
        eval_freq=100,
        warn=False,
    )
    with pytest.warns(Warning):
        with pytest.raises(AssertionError):
            model.learn(100, callback=eval_callback)

# Imports assumed for this visualization script. `EnvFactory` is a project-specific
# helper (not part of Stable-Baselines3), so its import is left as a placeholder.
import matplotlib.pyplot as plt
import numpy as np
import torch
from colorhash import ColorHash
from scipy.interpolate import griddata

from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
# from <project module> import EnvFactory  # project-specific environment factory


def main(args):
    policy_path = args.policy_path
    expert = PPO.load(policy_path)

    # Initialize environment for input standardization
    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    env = VecNormalize.load(args.stats_path, env)
    env.training = False

    # Build a grid of states covering the region of interest
    states = []
    for i in np.arange(-10, 110):
        for j in np.arange(-3, 3, 0.05):
            states.append([i, j])
    states = np.stack(states)
    states_scaled = env.normalize_obs(states)
    states_tensor = torch.as_tensor(states_scaled).float()

    policy: ActorCriticPolicy = expert.policy.cpu()
    true_actions_tensor, _, _ = policy.forward(states_tensor, deterministic=True)

    # Re-run the policy layer by layer to expose the intermediate activations
    features_tensor = policy.features_extractor.forward(states_tensor)
    shared_latents_tensor = policy.mlp_extractor.shared_net.forward(features_tensor)
    policy_latents_tensor_layer1 = policy.mlp_extractor.policy_net[0].forward(shared_latents_tensor)
    policy_latents_tensor_layer1_activated = policy.mlp_extractor.policy_net[1].forward(policy_latents_tensor_layer1)
    policy_latents_tensor_layer2 = policy.mlp_extractor.policy_net[2].forward(policy_latents_tensor_layer1_activated)
    policy_latents_tensor_layer2_activated = policy.mlp_extractor.policy_net[3].forward(policy_latents_tensor_layer2)
    actions_tensor = policy.action_net.forward(policy_latents_tensor_layer2_activated)
    # Sanity check: the layer-by-layer pass must reproduce the policy's own forward pass
    assert actions_tensor.equal(true_actions_tensor)

    # Binarize the activation pattern of both hidden layers
    binary_embeddings_layer1 = policy_latents_tensor_layer1_activated > 0
    binary_embeddings_layer1 = binary_embeddings_layer1.cpu().detach().numpy()
    binary_embeddings_layer2 = policy_latents_tensor_layer2_activated > 0
    binary_embeddings_layer2 = binary_embeddings_layer2.cpu().detach().numpy()
    binary_embeddings = np.concatenate(
        [binary_embeddings_layer1, binary_embeddings_layer2], axis=1
    ).astype(int)

    # Pack each binary activation pattern into a single integer
    integer_embeddings = np.packbits(binary_embeddings, axis=1, bitorder="little")
    # to allow arbitrary number of bits
    integer_embeddings = integer_embeddings @ (256 ** np.arange(integer_embeddings.shape[1]))
    # convert raw integer embeddings to 0, 1, 2, 3...

    # Fast rendering of state cells via grid interpolation
    grid_x, grid_y = np.mgrid[-10:110:1000j, -3:3:1000j]
    z = griddata((states[:, 0], states[:, 1]), integer_embeddings, (grid_x, grid_y), method="nearest")

    # Convert raw integer embeddings to RGB colors
    convert_raw_integer_to_colorhash = np.vectorize(lambda x: ColorHash(x).rgb)
    grid_z = np.array(convert_raw_integer_to_colorhash(z)).swapaxes(0, 1).swapaxes(1, 2)

    plt.figure()
    plt.imshow(grid_z, extent=[-10, 110, -3, 3], aspect="auto")
    plt.title("State Space Visualized")
    plt.xlabel("$x$")
    plt.ylabel("$\\dot x$")
    plt.show()
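
# A minimal command-line entry point, assuming only the three arguments the script
# actually reads (`policy_path`, `stats_path`, `env`); the original project's CLI may differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Visualize policy activation patterns over the state space"
    )
    parser.add_argument("--policy_path", type=str, required=True, help="Path to the trained PPO policy (.zip)")
    parser.add_argument("--stats_path", type=str, required=True, help="Path to the saved VecNormalize statistics")
    parser.add_argument("--env", type=str, required=True, help="Environment identifier passed to EnvFactory")
    main(parser.parse_args())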