Example #1
import numpy as np

from stable_baselines3.common.vec_env import (DummyVecEnv, VecFrameStack, VecNormalize,
                                              sync_envs_normalization, unwrap_vec_normalize)


def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
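The test assumes a make_env helper defined elsewhere in the test module. A minimal stand-in (the environment id is an assumption):

import gym

def make_env():
    # Hypothetical factory; the real test module supplies its own
    return gym.make("Pendulum-v1")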
Example #2
import numpy as np
from gym import spaces

from stable_baselines3.common.vec_env import (DummyVecEnv, VecFrameStack, VecNormalize,
                                              sync_envs_normalization, unwrap_vec_normalize)


def allclose(obs_1, obs_2):
    """np.allclose() generalized to dict observations."""
    if isinstance(obs_1, dict):
        return all(np.allclose(obs_1[key], obs_2[key]) for key in obs_1)
    return np.allclose(obs_1, obs_2)


def test_sync_vec_normalize(make_env):
    # make_env: environment factory supplied by the test harness
    # (may produce either Box- or Dict-observation envs)
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=100.0,
                       clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env,
                            training=False,
                            norm_obs=True,
                            norm_reward=True,
                            clip_obs=100.0,
                            clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalizing the reward recovers the original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward,
                       env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)
    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards),
                    eval_env.normalize_reward(dummy_rewards))
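For reference, sync_envs_normalization copies the running normalization statistics from the training wrappers into the evaluation wrappers, which is what makes the two assertions above pass after the call. A minimal sketch of the idea (not the exact stable_baselines3 implementation):

from copy import deepcopy

from stable_baselines3.common.vec_env import VecEnvWrapper, VecNormalize


def sync_envs_normalization_sketch(env, eval_env):
    # Walk both wrapper stacks in lockstep and copy the running
    # mean/std of observations and returns across
    env_tmp, eval_env_tmp = env, eval_env
    while isinstance(env_tmp, VecEnvWrapper):
        if isinstance(env_tmp, VecNormalize):
            eval_env_tmp.obs_rms = deepcopy(env_tmp.obs_rms)
            eval_env_tmp.ret_rms = deepcopy(env_tmp.ret_rms)
        env_tmp = env_tmp.venv
        eval_env_tmp = eval_env_tmp.venv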
Example #3
import gym
import numpy as np
import pytest

from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize


def test_eval_friendly_error():
    # The eval callback must not crash when the train env is VecNormalize-wrapped,
    # and must warn and raise a helpful error when the eval env is not wrapped the same way
    train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
    eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    eval_env = VecNormalize(eval_env, training=False, norm_reward=False)
    _ = train_env.reset()
    original_obs = train_env.get_original_obs()
    model = A2C("MlpPolicy", train_env, n_steps=50, seed=0)

    eval_callback = EvalCallback(
        eval_env,
        eval_freq=100,
        warn=False,
    )
    model.learn(100, callback=eval_callback)

    # Check synchronization
    assert np.allclose(train_env.normalize_obs(original_obs), eval_env.normalize_obs(original_obs))

    wrong_eval_env = gym.make("CartPole-v1")
    eval_callback = EvalCallback(
        wrong_eval_env,
        eval_freq=100,
        warn=False,
    )

    with pytest.warns(Warning):
        with pytest.raises(AssertionError):
            model.learn(100, callback=eval_callback)
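EvalCallback performs this synchronization automatically before each evaluation when the training env is wrapped in VecNormalize. A sketch of the manual equivalent, reusing train_env, eval_env and model from the test above:

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import sync_envs_normalization

# Copy the training env's normalization stats before evaluating
sync_envs_normalization(train_env, eval_env)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5)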
Example #4
import matplotlib.pyplot as plt
import numpy as np
import torch
from colorhash import ColorHash
from scipy.interpolate import griddata

from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# EnvFactory is a project-local helper (not part of stable_baselines3)
# that builds the gym environment named by args.env.


def main(args):
    policy_path = args.policy_path
    expert = PPO.load(policy_path)

    # Initialize environment for input standardization
    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    env = VecNormalize.load(args.stats_path, env)
    env.training = False

    # Enumerate a grid of 2-D states (x, x_dot) covering the plot range below
    states = []
    for i in np.arange(-10, 110):
        for j in np.arange(-3, 3, 0.05):
            states.append([i, j])
    states = np.stack(states)
    states_scaled = env.normalize_obs(states)
    states_tensor = torch.as_tensor(states_scaled).float()

    policy: ActorCriticPolicy = expert.policy.cpu()
    true_actions_tensor, _, _ = policy.forward(states_tensor, deterministic=True)

    # Re-run the policy network layer by layer to expose intermediate activations.
    # policy_net is an nn.Sequential of [Linear, activation, Linear, activation].
    features_tensor = policy.features_extractor.forward(states_tensor)
    shared_latents_tensor = policy.mlp_extractor.shared_net.forward(features_tensor)
    policy_latents_tensor_layer1 = policy.mlp_extractor.policy_net[0].forward(shared_latents_tensor)
    policy_latents_tensor_layer1_activated = policy.mlp_extractor.policy_net[1].forward(policy_latents_tensor_layer1)
    policy_latents_tensor_layer2 = policy.mlp_extractor.policy_net[2].forward(policy_latents_tensor_layer1_activated)
    policy_latents_tensor_layer2_activated = policy.mlp_extractor.policy_net[3].forward(policy_latents_tensor_layer2)
    actions_tensor = policy.action_net.forward(policy_latents_tensor_layer2_activated)

    # Sanity check: the manual layer-by-layer pass reproduces the policy's actions
    assert actions_tensor.equal(true_actions_tensor)

    # Binarize the post-activation latents: each ReLU unit is either active (> 0)
    # or not, so each state is described by a binary activation pattern
    binary_embeddings_layer1 = policy_latents_tensor_layer1_activated > 0
    binary_embeddings_layer1 = binary_embeddings_layer1.cpu().detach().numpy()
    binary_embeddings_layer2 = policy_latents_tensor_layer2_activated > 0
    binary_embeddings_layer2 = binary_embeddings_layer2.cpu().detach().numpy()

    binary_embeddings = np.concatenate(
        [binary_embeddings_layer1, binary_embeddings_layer2], axis=1).astype(int)
    # Pack each row of bits into bytes, then combine the bytes base-256 into a
    # single integer per state, to allow an arbitrary number of bits.
    # E.g. bits [1,0,1,1,0,0,0,0, 1] -> bytes [13, 1] -> 13 + 1 * 256 = 269.
    integer_embeddings = np.packbits(binary_embeddings, axis=1, bitorder="little")
    integer_embeddings = integer_embeddings @ (256 ** np.arange(integer_embeddings.shape[1]))

    # Render the state cells quickly via nearest-neighbour interpolation onto a fine grid
    grid_x, grid_y = np.mgrid[-10:110:1000j, -3:3:1000j]
    z = griddata((states[:, 0], states[:, 1]),
                 integer_embeddings, (grid_x, grid_y),
                 method='nearest')

    # Map each integer embedding to a deterministic RGB color and
    # reshape the color channels into an (H, W, 3) image
    convert_raw_integer_to_colorhash = np.vectorize(lambda x: ColorHash(x).rgb)
    grid_z = np.array(convert_raw_integer_to_colorhash(z)).swapaxes(0, 1).swapaxes(1, 2)

    plt.figure()
    plt.imshow(grid_z, extent=[-10, 110, -3, 3], aspect='auto')
    plt.title("State Space Visualized")
    plt.xlabel("$x$")
    plt.ylabel("$\\dot x$")
    plt.show()
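A possible entry point for the script; the flag names are assumptions about the surrounding project:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Visualize a policy's activation regions")
    parser.add_argument("--env", type=str, required=True)
    parser.add_argument("--policy-path", dest="policy_path", type=str, required=True)
    parser.add_argument("--stats-path", dest="stats_path", type=str, required=True)
    main(parser.parse_args())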