Example No. 1
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy",
                        env,
                        gamma=0.1,
                        seed=0,
                        action_noise=action_noise,
                        buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
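
In the DDPG/TD3 branch above, exploration comes only from the injected `action_noise`, since those policies are deterministic. As a minimal sketch (assuming the usual behaviour of `NormalActionNoise`, i.e. i.i.d. Gaussian perturbations around a fixed mean), the per-step noise and its use look roughly like this; the clipping bound of 1.0 is an illustrative assumption for a normalised action space:

import numpy as np

class GaussianNoiseSketch:
    """Illustrative stand-in for NormalActionNoise: each call draws an
    independent Gaussian sample around a fixed mean."""
    def __init__(self, mean, sigma):
        self.mean = mean
        self.sigma = sigma

    def __call__(self):
        return np.random.normal(self.mean, self.sigma)

noise = GaussianNoiseSketch(mean=np.zeros(1), sigma=0.1 * np.ones(1))
deterministic_action = np.array([0.3])  # placeholder policy output
noisy_action = np.clip(deterministic_action + noise(), -1.0, 1.0)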
Example No. 2
def test_log_prob_calculation(model_class):
    model = model_class("MlpPolicy", IdentityEnvBox())
    # Fixed mean/std
    model.proba_step = Helper.proba_vals
    # Check that the log probability is the one expected for the given mean/std
    logprob = model.action_probability(observation=np.array([[0.5], [0.5]]), actions=0.2, logp=True)
    assert np.allclose(logprob, np.array([-16.616353440210627])), "Calculation failed for {}".format(model_class)
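
The expected value -16.616... is the diagonal-Gaussian log-density evaluated at the requested action under the fixed mean/std that `Helper.proba_vals` injects (not shown in this excerpt). As a hedged sketch with placeholder mean/std, the log-probability is computed like this; note that squashed policies such as SAC add a tanh correction term on top of this expression, so the exact value depends on the model class:

import numpy as np

def gaussian_logp(action, mean, std):
    # Log-density of a diagonal Gaussian, summed over action dimensions.
    return np.sum(
        -0.5 * ((action - mean) / std) ** 2 - np.log(std) - 0.5 * np.log(2.0 * np.pi),
        axis=-1,
    )

# Placeholder mean/std -- the real values come from Helper.proba_vals.
logp = gaussian_logp(np.array([0.2]), mean=np.array([0.0]), std=np.array([1.0]))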
Example No. 3
def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std),
                                         desired_action_stddev=float(std))

    model = DDPG("MlpPolicy",
                 env,
                 gamma=0.0,
                 param_noise=param_noise,
                 memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
Example No. 4
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy",
                        env,
                        gamma=0.1,
                        seed=0,
                        action_noise=action_noise,
                        buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
Example No. 5
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    kwargs = dict(seed=0, gamma=0.95, buffer_size=int(1e5))
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
Example No. 6
def test_identity_box(model_class):
    """
    test the Box environment vectorisation detection

    :param model_class: (BaseRLModel) the RL model
    """
    check_shape(lambda: IdentityEnvBox(eps=0.5), model_class, (1, ), (1, 1))
Example No. 7
def test_buffer_actions_scaling(model_class, model_kwargs):
    """
    Test that actions are scaled to the tanh co-domain [-1, 1] before being stored
    in the replay buffer, for algorithms that use tanh-squashing, i.e., DDPG, TD3, SAC

    :param model_class: (BaseRLModel) A RL Model
    :param model_kwargs: (dict) Dictionary containing named arguments to the given algorithm
    """

    # check random and inferred actions as they possibly have different flows
    for random_coeff in [0.0, 1.0]:

        env = IdentityEnvBox(-2000, 1000)

        model = model_class("MlpPolicy", env, seed=1, random_exploration=random_coeff, **model_kwargs)
        model.learn(total_timesteps=ROLLOUT_STEPS)

        assert hasattr(model, 'replay_buffer')

        buffer = model.replay_buffer

        assert buffer.can_sample(ROLLOUT_STEPS)

        _, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS)

        assert not np.any(actions > np.ones_like(actions))
        assert not np.any(actions < -np.ones_like(actions))
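
The assertions above only check that stored actions lie in [-1, 1]; the scaling they rely on is the usual linear map between the environment's `Box` bounds and the tanh co-domain. A minimal sketch of that map and its inverse, with the bounds of `IdentityEnvBox(-2000, 1000)` plugged in for illustration:

import numpy as np

def scale_action(action, low, high):
    # Linearly map an action from [low, high] to [-1, 1] before storage.
    return 2.0 * (action - low) / (high - low) - 1.0

def unscale_action(scaled_action, low, high):
    # Inverse map from [-1, 1] back to the environment's bounds.
    return low + 0.5 * (scaled_action + 1.0) * (high - low)

low, high = np.array([-2000.0]), np.array([1000.0])
assert np.allclose(unscale_action(scale_action(np.array([500.0]), low, high), low, high), 500.0)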
Example No. 8
def test_common_failures_reset():
    """
    Test that common failure cases of the `reset` method are caught
    """
    env = IdentityEnvBox()
    # Return an observation that does not match the observation_space
    check_reset_assert_error(env, np.ones((3,)))
    # The observation is not a numpy array
    check_reset_assert_error(env, 1)

    # Return not only the observation
    check_reset_assert_error(env, (env.observation_space.sample(), False))
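
`check_reset_assert_error` is not shown in this excerpt. A plausible sketch, assuming it monkey-patches `reset()` to return the faulty value and expects the environment checker to reject it (the helper body and the `check_env` import are assumptions, not the verified implementation); an analogous `check_step_assert_error` helper is presumably behind the `step` test further below:

import pytest
from stable_baselines.common.env_checker import check_env  # assumed import path

def check_reset_assert_error(env, new_reset_return):
    # Hypothetical helper: force reset() to return `new_reset_return`
    # and verify that the env checker raises an AssertionError.
    def wrong_reset():
        return new_reset_return

    env.reset = wrong_reset
    with pytest.raises(AssertionError):
        check_env(env)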
Example No. 9
def test_identity_box(model_class):
    """
    test the Box environment vectorisation detection

    :param model_class: (BaseRLModel) the RL model
    """
    model = model_class(policy="MlpPolicy", env=DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)]))

    env0 = IdentityEnvBox()
    env1 = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_trials = 100
    for env, expected_shape in [(env0, (1,)), (env1, (1, 1))]:
        obs = env.reset()
        for _ in range(n_trials):
            action, _ = model.predict(obs)
            assert np.array(action).shape == expected_shape
            obs, _, _, _ = env.step(action)

    # Free memory
    del model, env
Example No. 10
def test_identity_continuous(model_name):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    model = LEARN_FUNC_DICT[model_name](env)

    n_trials = 1000
    obs = env.reset()
    action_shape = model.predict(obs, deterministic=False)[0].shape
    action, _ = model.predict(obs, deterministic=True)
    assert action.shape == action_shape
    for _ in range(n_trials):
        new_action = model.predict(obs, deterministic=True)[0]
        # Deterministic predictions on the same observation must not change
        assert np.allclose(action, new_action)
        assert new_action.shape == action_shape
Example No. 11
def test_common_failures_step():
    """
    Test that common failure cases of the `step` method are caught
    """
    env = IdentityEnvBox()

    # Wrong shape for the observation
    check_step_assert_error(env, (np.ones((4,)), 1.0, False, {}))
    # Obs is not a numpy array
    check_step_assert_error(env, (1, 1.0, False, {}))

    # Return a wrong reward
    check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, {}))

    # Info dict is not returned
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, False))

    # Done is not a boolean
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, {}))
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, {}))
Example No. 12
def test_model_manipulation(model_class):
    """
    Test that the algorithm can be saved and loaded without any issues, that environment
    switching works, and that action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(
                model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(
                model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class == DDPG:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            with pytest.warns(UserWarning):
                actions_probas = model.action_probability(observations,
                                                          actions=actions)
            assert actions_probas.shape == (len(actions),
                                            1), actions_probas.shape
            assert np.all(actions_probas == 0.0), actions_probas

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example No. 13
def test_model_manipulation(model_class):
    """
    Test that the algorithm can be saved and loaded without any issues, that environment
    switching works, and that action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example No. 14
def make_env():
    return IdentityEnvBox(ep_length=1e10)
Example No. 15
def test_model_manipulation(request, model_class):
    """
    Test that the algorithm can be saved and loaded without any issues, that environment
    switching works, and that action prediction works

    :param model_class: (BaseRLModel) A model
    """
    model_fname = None
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env, seed=0)
        model.learn(total_timesteps=NUM_TIMESTEPS)

        env.reset()
        observations = np.concatenate(
            [env.step([env.action_space.sample()])[0] for _ in range(10)],
            axis=0)
        selected_actions, _ = model.predict(observations, deterministic=True)

        # saving
        model_fname = './test_model_{}.zip'.format(request.node.name)
        model.save(model_fname)

        del model, env

        # loading
        model = model_class.load(model_fname)

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations,
                                                deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        obs = env.reset()
        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC, TD3]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(
                model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(
                model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class in [DDPG, SAC, TD3]:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            actions_probas = model.action_probability(observations,
                                                      actions=actions)
            assert actions_probas.shape == (len(actions),
                                            1), actions_probas.shape
            assert np.all(actions_probas >= 0), actions_probas
            actions_logprobas = model.action_probability(observations,
                                                         actions=actions,
                                                         logp=True)
            assert np.allclose(actions_probas,
                               np.exp(actions_logprobas)), (actions_probas,
                                                            actions_logprobas)

        # learn post loading
        model.learn(total_timesteps=100)

        # predict new values
        evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES)

        # Free memory
        del model, env

    finally:
        if model_fname is not None and os.path.exists(model_fname):
            os.remove(model_fname)