def test_reward_clipping(self):
    # Clipping: True (clip between -1.0 and 1.0).
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=True,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    # Metrics are reported on the unclipped rewards.
    self.assertEqual(result["episode_reward_mean"], 1000)
    ev.stop()

    from ray.rllib.examples.env.random_env import RandomEnv

    # Clipping to a custom range: [-2.0, 2.0].
    ev2 = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            dict(
                reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                p_done=0.0,
                max_episode_len=10,
            )
        ),
        policy_spec=MockPolicy,
        clip_rewards=2.0,
        batch_mode="complete_episodes",
    )
    sample = ev2.sample()
    self.assertEqual(max(sample["rewards"]), 2.0)
    self.assertEqual(min(sample["rewards"]), -2.0)
    self.assertLess(np.mean(sample["rewards"]), 0.5)
    self.assertGreater(np.mean(sample["rewards"]), -0.5)
    ev2.stop()

    # Clipping: Off.
    ev2 = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=False,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)
    ev2.stop()
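# A minimal sketch of the clipping semantics the assertions above rely on
# (illustrative only, not RLlib's actual postprocessing code; the helper
# name `_clip_rewards_sketch` is hypothetical). A boolean True fixes rewards
# to -1.0/0.0/+1.0, which for MockEnv2's constant +100.0 per-step reward is
# indistinguishable from clipping to [-1.0, 1.0]; a float value clips
# symmetrically; False leaves rewards untouched.
def _clip_rewards_sketch(rewards, clip_setting):
    if clip_setting is True:
        return np.sign(rewards)
    elif isinstance(clip_setting, float):
        return np.clip(rewards, -clip_setting, clip_setting)
    return rewards  # clip_setting is False: no clipping.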
def test_hard_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=4,
        soft_horizon=False,
    )
    samples = ev.sample()
    # Three logical episodes and correct episode resets (always after 4
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 3)
    for i in range(4):
        self.assertEqual(np.argmax(samples["obs"][i]), i)
    self.assertEqual(np.argmax(samples["obs"][4]), 0)
    # 3 done values.
    self.assertEqual(sum(samples["dones"]), 3)
    ev.stop()

    # The gym env's own max_episode_steps (CartPole-v0: 200) is larger than
    # the Trainer's horizon (6) -> the Trainer's horizon must take
    # precedence and terminate episodes early.
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=6,
        soft_horizon=False,
    )
    samples = ev.sample()
    # 12 steps due to `complete_episodes` batch_mode.
    self.assertEqual(len(samples["eps_id"]), 12)
    # Two logical episodes and correct episode resets (always after 6(!)
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 2)
    # 2 done values after 6 and 12 steps.
    check(
        samples["dones"],
        [False, False, False, False, False, True,
         False, False, False, False, False, True],
    )
    ev.stop()
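# A hedged sketch of the hard-horizon behavior these assertions describe
# (illustrative only; RLlib's sampler implements this internally, and the
# helper name `_step_with_horizon_sketch` is hypothetical). With
# soft_horizon=False, reaching the horizon forces done=True plus a real
# env.reset(), starting a new logical episode (new eps_id); with
# soft_horizon=True, the env would keep running without a reset.
def _step_with_horizon_sketch(env, action, t, horizon, soft_horizon=False):
    obs, reward, done, info = env.step(action)
    if t + 1 >= horizon and not done and not soft_horizon:
        done = True  # Artificial episode end at the horizon.
        obs = env.reset()  # Hard horizon: really reset the env.
    return obs, reward, done, info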
def test_multi_env_seed(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(100),
        num_envs=3,
        policy_spec=MockPolicy,
        seed=1,
    )
    # Make sure we can properly sample from the wrapped env.
    ev.sample()
    # Make sure all environments got a different deterministic seed.
    seeds = ev.foreach_env(lambda env: env.rng_seed)
    self.assertEqual(seeds, [1, 2, 3])
    ev.stop()
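# The seeding test above assumes each vectorized sub-env records the seed it
# receives (a worker seed of 1 fanning out as 1, 2, 3 across the three
# sub-envs). A hypothetical mock env showing that contract (the class name
# `_SeedRecordingEnv` is illustrative, not part of RLlib):
class _SeedRecordingEnv(gym.Env):
    observation_space = gym.spaces.Discrete(1)
    action_space = gym.spaces.Discrete(2)

    def __init__(self):
        self.rng_seed = None

    def seed(self, seed=None):
        self.rng_seed = seed  # Queried via ev.foreach_env above.
        return [seed]

    def reset(self):
        return 0

    def step(self, action):
        return 0, 0.0, True, {}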
def __init__(self, num, increment_obs=False):
    if increment_obs:
        # Observations are 0, 1, 2, 3, ... as time advances.
        self.agents = [MockEnv2(5) for _ in range(num)]
    else:
        # Observations are all zeros.
        self.agents = [MockEnv(5) for _ in range(num)]
    self.dones = set()
    self.last_obs = {}
    self.last_rew = {}
    self.last_done = {}
    self.last_info = {}
    self.i = 0
    self.num = num
    self.observation_space = gym.spaces.Discrete(10)
    self.action_space = gym.spaces.Discrete(2)
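# A hedged sketch of the reset()/step() pair this constructor appears to set
# state up for (assumed here, since only __init__ is shown): the `last_*`
# dicts cache each agent's most recent transition, `self.dones` tracks
# finished agents, and the multi-agent "__all__" done flag flips to True
# once every sub-env is done.
def reset(self):
    self.dones = set()
    for i, a in enumerate(self.agents):
        self.last_obs[i] = a.reset()
        self.last_rew[i] = 0.0
        self.last_done[i] = False
        self.last_info[i] = {}
    return dict(self.last_obs)

def step(self, action_dict):
    for aid, action in action_dict.items():
        (
            self.last_obs[aid],
            self.last_rew[aid],
            self.last_done[aid],
            self.last_info[aid],
        ) = self.agents[aid].step(action)
        if self.last_done[aid]:
            self.dones.add(aid)
    done = dict(self.last_done)
    done["__all__"] = len(self.dones) == len(self.agents)
    return dict(self.last_obs), dict(self.last_rew), done, dict(self.last_info)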
def test_wrap_gym_env(self):
    record_env_dir = os.popen("mktemp -d").read()[:-1]
    print(f"tmp dir for videos={record_env_dir}")
    if not os.path.exists(record_env_dir):
        sys.exit(1)
    num_steps_per_episode = 10
    wrapped = record_env_wrapper(
        env=MockEnv2(num_steps_per_episode),
        record_env=record_env_dir,
        log_dir="",
        policy_config={
            "in_evaluation": False,
        },
    )
    # Non-MultiAgentEnv: Wrapper's type is gym.wrappers.Monitor.
    self.assertTrue(isinstance(wrapped, gym.wrappers.Monitor))
    self.assertFalse(isinstance(wrapped, VideoMonitor))
    wrapped.reset()
    # Expect one video file to have been produced in the tmp dir.
    os.chdir(record_env_dir)
    ls = glob.glob("*.mp4")
    self.assertTrue(len(ls) == 1)
    # 10 steps for a complete episode.
    for i in range(num_steps_per_episode):
        wrapped.step(0)
    # Another episode.
    wrapped.reset()
    for i in range(num_steps_per_episode):
        wrapped.step(0)
    # Expect another video file to have been produced (2nd episode).
    ls = glob.glob("*.mp4")
    self.assertTrue(len(ls) == 2)

    # MockEnv2 returns a reward of 100.0 every step.
    # So total reward is 1000.0 per episode (10 steps).
    check(
        np.array([100.0, 100.0]) * num_steps_per_episode,
        wrapped.get_episode_rewards(),
    )
    # Erase all generated files and the temp path just in case,
    # so as not to disturb further CI tests.
    shutil.rmtree(record_env_dir)
def test_wrap_gym_env(self):
    wrapped = record_env_wrapper(
        env=MockEnv2(10),
        record_env=tempfile.gettempdir(),
        log_dir="",
        policy_config={
            "in_evaluation": False,
        },
    )
    # Type is wrappers.Monitor.
    self.assertTrue(isinstance(wrapped, wrappers.Monitor))
    self.assertFalse(isinstance(wrapped, VideoMonitor))
    wrapped.reset()
    # 10 steps for a complete episode.
    for i in range(10):
        wrapped.step(0)
    # MockEnv2 returns a reward of 100.0 every step.
    # So total reward is 1000.0.
    self.assertEqual(wrapped.get_episode_rewards(), [1000.0])
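# For reference, a minimal standalone use of the (pre-0.21) gym Monitor
# wrapper that the two tests above assert record_env_wrapper returns. The
# output directory and env id here are illustrative:
env = gym.wrappers.Monitor(gym.make("CartPole-v0"), "/tmp/videos", force=True)
obs = env.reset()  # Starts recording the first episode's video.
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # Flushes the recorded .mp4 (and stats) to /tmp/videos.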