def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
def test_log_prob_calculation(model_class):
    model = model_class("MlpPolicy", IdentityEnvBox())
    # Fixed mean/std
    model.proba_step = Helper.proba_vals
    # Check that the log probability is the one expected for the given mean/std
    logprob = model.action_probability(observation=np.array([[0.5], [0.5]]), actions=0.2, logp=True)
    assert np.allclose(logprob, np.array([-16.616353440210627])), "Calculation failed for {}".format(model_class)
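# For reference, the constant checked above is just the diagonal-Gaussian
# log-density that action_probability(..., logp=True) computes for the
# mean/std pinned by the Helper stub. A minimal sketch of that calculation
# in plain NumPy (function name is illustrative, not part of the library):
import numpy as np

def gaussian_logp(action, mean, std):
    # log p(a) = -0.5 * ((a - mean) / std)^2 - log(std) - 0.5 * log(2 * pi),
    # summed over the action dimensions
    return np.sum(-0.5 * np.square((action - mean) / std)
                  - np.log(std)
                  - 0.5 * np.log(2.0 * np.pi), axis=-1)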
def test_identity_ddpg():
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    std = 0.2
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(std), desired_action_stddev=float(std))

    model = DDPG("MlpPolicy", env, gamma=0.0, param_noise=param_noise, memory_limit=int(1e6))
    model.learn(total_timesteps=20000, seed=0)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
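# For context: AdaptiveParamNoiseSpec perturbs the policy weights rather than
# the actions, and rescales its stddev so the induced action-space noise stays
# close to desired_action_stddev. A minimal sketch of that adaptation rule
# (attribute and argument names follow the upstream implementation, but treat
# this as an illustration rather than the library source):
class AdaptiveParamNoiseSketch:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adoption_coefficient = adoption_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # distance: measured stddev between perturbed and unperturbed actions
        if distance > self.desired_action_stddev:
            # Noise is too strong: decrease it
            self.current_stddev /= self.adoption_coefficient
        else:
            # Noise is too weak: increase it
            self.current_stddev *= self.adoption_coefficient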
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
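# evaluate_policy (from stable_baselines.common.evaluation) runs the policy
# for n_eval_episodes episodes, returns the mean and std of episode rewards,
# and asserts that the mean exceeds reward_threshold when one is given. An
# equivalent manual check would look roughly like this (helper name and
# threshold are illustrative):
from stable_baselines.common.evaluation import evaluate_policy

def check_mean_reward(model, env, n_eval_episodes=20, threshold=90):
    # Same effect as passing reward_threshold directly to evaluate_policy
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
    assert mean_reward > threshold, "Mean reward {:.2f} below threshold".format(mean_reward)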
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    # buffer_size must be an integer
    kwargs = dict(seed=0, gamma=0.95, buffer_size=int(1e5))
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
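# NormalActionNoise above is a callable that returns one Gaussian sample per
# action dimension; DDPG/TD3 add it to the deterministic policy output while
# collecting rollouts. A minimal stand-in sketch (class name is hypothetical):
import numpy as np

class GaussianActionNoiseSketch:
    def __init__(self, mean, sigma):
        self.mean = mean
        self.sigma = sigma

    def __call__(self):
        # One i.i.d. Gaussian sample per action dimension
        return np.random.normal(self.mean, self.sigma)

# Schematic use during collection: action = policy(obs) + noise(),
# clipped to the action space afterwards.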
def test_identity_box(model_class):
    """
    Test the Box environment vectorisation detection

    :param model_class: (BaseRLModel) the RL model
    """
    check_shape(lambda: IdentityEnvBox(eps=0.5), model_class, (1,), (1, 1))
def test_buffer_actions_scaling(model_class, model_kwargs):
    """
    Test if actions are scaled to tanh co-domain before being put in a buffer
    for algorithms that use tanh-squashing, i.e., DDPG, TD3, SAC

    :param model_class: (BaseRLModel) A RL Model
    :param model_kwargs: (dict) Dictionary containing named arguments to the given algorithm
    """
    # check random and inferred actions as they possibly have different flows
    for random_coeff in [0.0, 1.0]:
        env = IdentityEnvBox(-2000, 1000)

        model = model_class("MlpPolicy", env, seed=1,
                            random_exploration=random_coeff, **model_kwargs)
        model.learn(total_timesteps=ROLLOUT_STEPS)

        assert hasattr(model, 'replay_buffer')
        buffer = model.replay_buffer
        assert buffer.can_sample(ROLLOUT_STEPS)

        _, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS)
        assert not np.any(actions > np.ones_like(actions))
        assert not np.any(actions < -np.ones_like(actions))
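# The assertions above rely on actions being rescaled from the environment's
# [low, high] Box into the tanh co-domain [-1, 1] before storage. A sketch of
# that affine map and its inverse (helper names are illustrative only):
import numpy as np

def scale_action(action, low, high):
    # [low, high] -> [-1, 1]
    return 2.0 * (action - low) / (high - low) - 1.0

def unscale_action(scaled_action, low, high):
    # [-1, 1] -> [low, high]
    return low + 0.5 * (scaled_action + 1.0) * (high - low)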
def test_common_failures_reset():
    """
    Test that common failure cases of the `reset` method are caught
    """
    env = IdentityEnvBox()
    # Return an observation that does not match the observation_space
    check_reset_assert_error(env, np.ones((3,)))
    # The observation is not a numpy array
    check_reset_assert_error(env, 1)
    # Return not only the observation
    check_reset_assert_error(env, (env.observation_space.sample(), False))
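# A plausible shape for the check_reset_assert_error helper used above: it
# monkey-patches reset() to return a malformed value and expects the env
# checker to flag it. This is a sketch under that assumption, not the exact
# test source:
import pytest
from stable_baselines.common.env_checker import check_env

def check_reset_assert_error(env, new_reset_return):
    def wrong_reset():
        return new_reset_return
    # Patch the reset method with a broken one
    env.reset = wrong_reset
    with pytest.raises(AssertionError):
        check_env(env)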
def test_identity_box(model_class):
    """
    Test the Box environment vectorisation detection

    :param model_class: (BaseRLModel) the RL model
    """
    model = model_class(policy="MlpPolicy", env=DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)]))

    env0 = IdentityEnvBox()
    env1 = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_trials = 100
    for env, expected_shape in [(env0, (1,)), (env1, (1, 1))]:
        obs = env.reset()
        for _ in range(n_trials):
            action, _ = model.predict(obs)
            assert np.array(action).shape == expected_shape
            obs, _, _, _ = env.step(action)

    # Free memory
    del model, env
def test_identity_continuous(model_name):
    """
    Test that predictions on the identity env are deterministic when requested:
    with deterministic=True, the same observation must map to the same action

    :param model_name: (str) Name of the RL model
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
    model = LEARN_FUNC_DICT[model_name](env)

    n_trials = 1000
    obs = env.reset()
    action_shape = model.predict(obs, deterministic=False)[0].shape
    action, _ = model.predict(obs, deterministic=True)
    assert action.shape == action_shape
    for _ in range(n_trials):
        new_action = model.predict(obs, deterministic=True)[0]
        assert np.allclose(action, new_action)
        assert new_action.shape == action_shape
def test_common_failures_step():
    """
    Test that common failure cases of the `step` method are caught
    """
    env = IdentityEnvBox()
    # Wrong shape for the observation
    check_step_assert_error(env, (np.ones((4,)), 1.0, False, {}))
    # Obs is not a numpy array
    check_step_assert_error(env, (1, 1.0, False, {}))
    # Return a wrong reward
    check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, {}))
    # Info dict is not returned
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, False))
    # Done is not a boolean
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, {}))
    check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, {}))
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")
        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class == DDPG:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            with pytest.warns(UserWarning):
                actions_probas = model.action_probability(observations, actions=actions)
            assert actions_probas.shape == (len(actions), 1), actions_probas.shape
            assert np.all(actions_probas == 0.0), actions_probas

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")
        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        # assert <10% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
def make_env():
    # Effectively non-terminating episode
    return IdentityEnvBox(ep_length=1e10)
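# Typical usage of the factory above: wrap it in a DummyVecEnv so the model
# sees the vectorised interface used throughout these tests (illustrative):
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([make_env])
obs = env.reset()  # shaped (num_envs, obs_dim), here (1, 1)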
def test_model_manipulation(request, model_class):
    """
    Test if the algorithm can be loaded and saved without any issues,
    the environment switching works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    model_fname = None
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env, seed=0)
        model.learn(total_timesteps=NUM_TIMESTEPS)

        env.reset()
        observations = np.concatenate(
            [env.step([env.action_space.sample()])[0] for _ in range(10)], axis=0)
        selected_actions, _ = model.predict(observations, deterministic=True)

        # saving
        model_fname = './test_model_{}.zip'.format(request.node.name)
        model.save(model_fname)
        del model, env

        # loading
        model = model_class.load(model_fname)

        # check if model still selects the same actions
        new_selected_actions, _ = model.predict(observations, deterministic=True)
        assert np.allclose(selected_actions, new_selected_actions, 1e-4)

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        obs = env.reset()
        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC, TD3]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class in [DDPG, SAC, TD3]:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            actions_probas = model.action_probability(observations, actions=actions)
            assert actions_probas.shape == (len(actions), 1), actions_probas.shape
            assert np.all(actions_probas >= 0), actions_probas

            actions_logprobas = model.action_probability(observations, actions=actions, logp=True)
            assert np.allclose(actions_probas, np.exp(actions_logprobas)), \
                (actions_probas, actions_logprobas)

        # learn post loading
        model.learn(total_timesteps=100)

        # predict new values
        evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES)

        # Free memory
        del model, env
    finally:
        if model_fname is not None and os.path.exists(model_fname):
            os.remove(model_fname)