def test_complete_episodes_packing(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(10),
        policy_spec=MockPolicy,
        rollout_fragment_length=15,
        batch_mode="complete_episodes",
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 20)
    self.assertEqual(
        batch["t"].tolist(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    )
    ev.stop()
def test_metrics(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
    )
    remote_ev = RolloutWorker.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
    )
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)
    ev.stop()
def test_filter_sync(self):
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter",
    )
    time.sleep(2)
    ev.sample()
    filters = ev.get_filters(flush_after=True)
    obs_f = filters[DEFAULT_POLICY_ID]
    self.assertNotEqual(obs_f.rs.n, 0)
    self.assertNotEqual(obs_f.buffer.n, 0)
    ev.stop()
def test_soft_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=4,
        soft_horizon=True,
    )
    samples = ev.sample()
    # Three logical episodes.
    self.assertEqual(len(set(samples["eps_id"])), 3)
    # Only one hard done value.
    self.assertEqual(sum(samples["dones"]), 1)
    ev.stop()
def test_extra_python_envs(self):
    extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
    self.assertFalse("env_key_1" in os.environ)
    self.assertFalse("env_key_2" in os.environ)
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(10),
        policy_spec=MockPolicy,
        extra_python_environs=extra_envs,
    )
    self.assertTrue("env_key_1" in os.environ)
    self.assertTrue("env_key_2" in os.environ)
    ev.stop()
    # Reset to original state.
    del os.environ["env_key_1"]
    del os.environ["env_key_2"]
def test_get_filters(self):
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter",
    )
    self.sample_and_flush(ev)
    filters = ev.get_filters(flush_after=False)
    time.sleep(2)
    filters2 = ev.get_filters(flush_after=False)
    obs_f = filters[DEFAULT_POLICY_ID]
    obs_f2 = filters2[DEFAULT_POLICY_ID]
    self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
    self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
    ev.stop()
def test_batches_larger_when_vectorized(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=4,
        num_envs=4,
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)
    ev.stop()
def test_wrap_multi_agent_env(self):
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(10),
        policy_spec=MockPolicy,
        policy_config={
            "in_evaluation": False,
        },
    )
    # Make sure we can properly sample from the wrapped env.
    ev.sample()
    # Make sure the resulting environment is indeed still a MultiAgentEnv
    # (and that the wrapper itself is a gym.Env).
    self.assertTrue(isinstance(ev.env.unwrapped, MultiAgentEnv))
    self.assertTrue(isinstance(ev.env, gym.Env))
    ev.stop()
def test_action_immutability(self):
    from ray.rllib.examples.env.random_env import RandomEnv

    action_space = gym.spaces.Box(0.0001, 0.0002, (5,))

    class ActionMutationEnv(RandomEnv):
        def __init__(self, config):
            self.test_case = config["test_case"]
            super().__init__(config=config)

        def step(self, action):
            # Ensure that it is called from inside the sampling process.
            import inspect

            curframe = inspect.currentframe()
            called_from_check = any(
                frame[3] == "check_gym_environments"
                for frame in inspect.getouterframes(curframe, 2)
            )
            # Check whether the action is immutable.
            if action.flags.writeable and not called_from_check:
                self.test_case.assertFalse(
                    action.flags.writeable, "Action is mutable"
                )
            return super().step(action)

    ev = RolloutWorker(
        env_creator=lambda _: ActionMutationEnv(
            config=dict(
                test_case=self,
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )
        ),
        policy_spec=RandomPolicy,
        policy_config=dict(
            action_space=action_space,
            ignore_action_bounds=True,
        ),
        clip_actions=False,
        batch_mode="complete_episodes",
    )
    ev.sample()
    ev.stop()
def test_batch_ids(self):
    fragment_len = 100
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        rollout_fragment_length=fragment_len,
    )
    batch1 = ev.sample()
    batch2 = ev.sample()
    unroll_ids_1 = set(batch1["unroll_id"])
    unroll_ids_2 = set(batch2["unroll_id"])
    # Assert no overlap of unroll IDs between sample() calls.
    self.assertTrue(not any(uid in unroll_ids_2 for uid in unroll_ids_1))
    # CartPole episodes should be short initially: Expect more than one
    # unroll ID in each batch.
    self.assertTrue(len(unroll_ids_1) > 1)
    self.assertTrue(len(unroll_ids_2) > 1)
    ev.stop()
def test_vector_env_support(self):
    # Test a vector env that contains 8 actual envs
    # (MockEnv instances).
    ev = RolloutWorker(
        env_creator=(lambda _: VectorizedMockEnv(episode_length=20, num_envs=8)),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=10,
    )
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    ev.stop()

    # Test a vector env that pretends(!) to contain 4 envs, but actually
    # only has 1 (CartPole).
    ev = RolloutWorker(
        env_creator=(lambda _: MockVectorEnv(20, mocked_num_envs=4)),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=10,
    )
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertGreater(result["episodes_this_iter"], 3)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertGreater(result["episodes_this_iter"], 6)
    ev.stop()
def test_sync_filter(self):
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter",
    )
    self.sample_and_flush(ev)

    # Current state.
    filters = ev.get_filters(flush_after=False)
    obs_f = filters[DEFAULT_POLICY_ID]
    self.assertLessEqual(obs_f.buffer.n, 20)

    new_obsf = obs_f.copy()
    new_obsf.rs._n = 100
    ev.sync_filters({DEFAULT_POLICY_ID: new_obsf})
    filters = ev.get_filters(flush_after=False)
    obs_f = filters[DEFAULT_POLICY_ID]
    self.assertGreaterEqual(obs_f.rs.n, 100)
    self.assertLessEqual(obs_f.buffer.n, 20)
    ev.stop()
def test_hard_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=4,
        soft_horizon=False,
    )
    samples = ev.sample()
    # Three logical episodes and correct episode resets (always after 4
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 3)
    for i in range(4):
        self.assertEqual(np.argmax(samples["obs"][i]), i)
    self.assertEqual(np.argmax(samples["obs"][4]), 0)
    # 3 done values.
    self.assertEqual(sum(samples["dones"]), 3)
    ev.stop()

    # The Trainer's horizon (6) is smaller than the gym env's own
    # max_episode_steps (200 for CartPole-v0), so the horizon must take
    # precedence.
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=6,
        soft_horizon=False,
    )
    samples = ev.sample()
    # 12 steps due to `complete_episodes` batch_mode.
    self.assertEqual(len(samples["eps_id"]), 12)
    # Two logical episodes and correct episode resets (always after 6(!)
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 2)
    # 2 done values after 6 and 12 steps.
    check(
        samples["dones"],
        [
            False, False, False, False, False, True,
            False, False, False, False, False, True,
        ],
    )
    ev.stop()
def test_no_training(self):
    class NoTrainingEnv(MockEnv):
        def __init__(self, episode_length, training_enabled):
            super(NoTrainingEnv, self).__init__(episode_length)
            self.training_enabled = training_enabled

        def step(self, action):
            obs, rew, done, info = super(NoTrainingEnv, self).step(action)
            return (
                obs,
                rew,
                done,
                {**info, "training_enabled": self.training_enabled},
            )

    ev = RolloutWorker(
        env_creator=lambda _: NoTrainingEnv(10, True),
        policy_spec=MockPolicy,
        rollout_fragment_length=5,
        batch_mode="complete_episodes",
    )
    batch = ev.sample()
    self.assertEqual(batch.count, 10)
    self.assertEqual(len(batch["obs"]), 10)
    ev.stop()

    ev = RolloutWorker(
        env_creator=lambda _: NoTrainingEnv(10, False),
        policy_spec=MockPolicy,
        rollout_fragment_length=5,
        batch_mode="complete_episodes",
    )
    batch = ev.sample()
    self.assertTrue(isinstance(batch, MultiAgentBatch))
    self.assertEqual(len(batch.policy_batches), 0)
    ev.stop()
def test_auto_vectorization(self):
    ev = RolloutWorker(
        env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=2,
        num_envs=8,
    )
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    indices = []
    for env in ev.async_env.vector_env.envs:
        self.assertEqual(env.unwrapped.config.worker_index, 0)
        indices.append(env.unwrapped.config.vector_index)
    self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
    ev.stop()
def test_basic(self):
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
    )
    batch = ev.sample()
    for key in [
        "obs",
        "actions",
        "rewards",
        "dones",
        "advantages",
        "prev_rewards",
        "prev_actions",
    ]:
        self.assertIn(key, batch)
        self.assertGreater(np.abs(np.mean(batch[key])), 0)

    def to_prev(vec):
        out = np.zeros_like(vec)
        for i, v in enumerate(vec):
            if i + 1 < len(out) and not batch["dones"][i]:
                out[i + 1] = v
        return out.tolist()

    self.assertEqual(batch["prev_rewards"].tolist(), to_prev(batch["rewards"]))
    self.assertEqual(batch["prev_actions"].tolist(), to_prev(batch["actions"]))
    self.assertGreater(batch["advantages"][0], 1)
    ev.stop()
def test_truncate_episodes(self):
    ev_env_steps = RolloutWorker(
        env_creator=lambda _: MockEnv(10),
        policy_spec=MockPolicy,
        rollout_fragment_length=15,
        batch_mode="truncate_episodes",
    )
    batch = ev_env_steps.sample()
    self.assertEqual(batch.count, 15)
    self.assertTrue(isinstance(batch, SampleBatch))
    ev_env_steps.stop()

    action_space = Discrete(2)
    obs_space = Box(float("-inf"), float("inf"), (4,), dtype=np.float32)
    # Count by env steps: The batch is complete after exactly 301 env
    # steps, even though several agents step in each env step.
    ev_agent_steps = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": 4}),
        policy_spec={
            "pol0": (MockPolicy, obs_space, action_space, {}),
            "pol1": (MockPolicy, obs_space, action_space, {}),
        },
        policy_mapping_fn=lambda agent_id, episode, **kwargs: "pol0"
        if agent_id == 0
        else "pol1",
        rollout_fragment_length=301,
        count_steps_by="env_steps",
        batch_mode="truncate_episodes",
    )
    batch = ev_agent_steps.sample()
    self.assertTrue(isinstance(batch, MultiAgentBatch))
    self.assertGreater(batch.agent_steps(), 301)
    self.assertEqual(batch.env_steps(), 301)
    ev_agent_steps.stop()

    ev_agent_steps = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": 4}),
        policy_spec={
            "pol0": (MockPolicy, obs_space, action_space, {}),
            "pol1": (MockPolicy, obs_space, action_space, {}),
        },
        policy_mapping_fn=lambda agent_id, episode, **kwargs: "pol0"
        if agent_id == 0
        else "pol1",
        rollout_fragment_length=301,
        count_steps_by="agent_steps",
        batch_mode="truncate_episodes",
    )
    batch = ev_agent_steps.sample()
    self.assertTrue(isinstance(batch, MultiAgentBatch))
    self.assertLess(batch.env_steps(), 301)
    # When counting agent steps, the count may be slightly larger than
    # rollout_fragment_length, b/c we have up to N agents stepping in each
    # env step and we only check whether we should build the batch after
    # each env step.
    self.assertGreaterEqual(batch.agent_steps(), 301)
    ev_agent_steps.stop()
def test_reward_clipping(self):
    # Clipping: True (clip between -1.0 and 1.0).
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=True,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    # Episode-reward metrics are computed on the raw (unclipped) rewards.
    self.assertEqual(result["episode_reward_mean"], 1000)
    ev.stop()

    from ray.rllib.examples.env.random_env import RandomEnv

    # Clipping into a certain range (-2.0, 2.0).
    ev2 = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            dict(
                reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                p_done=0.0,
                max_episode_len=10,
            )
        ),
        policy_spec=MockPolicy,
        clip_rewards=2.0,
        batch_mode="complete_episodes",
    )
    sample = ev2.sample()
    self.assertEqual(max(sample["rewards"]), 2.0)
    self.assertEqual(min(sample["rewards"]), -2.0)
    self.assertLess(np.mean(sample["rewards"]), 0.5)
    self.assertGreater(np.mean(sample["rewards"]), -0.5)
    ev2.stop()

    # Clipping: Off.
    ev2 = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        clip_rewards=False,
        batch_mode="complete_episodes",
    )
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)
    ev2.stop()
def test_action_clipping(self):
    from ray.rllib.examples.env.random_env import RandomEnv

    action_space = gym.spaces.Box(-2.0, 1.0, (3,))

    # Clipping: True (clip between Policy's action_space.low/high).
    ev = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )
        ),
        policy_spec=RandomPolicy,
        policy_config=dict(
            action_space=action_space,
            ignore_action_bounds=True,
        ),
        clip_actions=True,
        batch_mode="complete_episodes",
    )
    sample = ev.sample()
    # The recorded actions should breach the bounds (expected): the batch
    # stores the unclipped actions; sampling only got this far b/c the
    # actions sent to the env were clipped to its action space.
    self.assertGreater(np.max(sample["actions"]), action_space.high[0])
    self.assertLess(np.min(sample["actions"]), action_space.low[0])
    ev.stop()

    # Clipping: False and RandomPolicy produces invalid actions.
    # Expect the env to complain.
    ev2 = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )
        ),
        policy_spec=RandomPolicy,
        policy_config=dict(
            action_space=action_space,
            ignore_action_bounds=True,
        ),
        clip_actions=False,  # <- should lead to the env complaining
        batch_mode="complete_episodes",
    )
    self.assertRaisesRegex(ValueError, r"Illegal action", ev2.sample)
    ev2.stop()

    # Clipping: False and RandomPolicy produces valid (bounded) actions.
    # Expect "actions" in the SampleBatch to be unclipped.
    ev3 = RolloutWorker(
        env_creator=lambda _: RandomEnv(
            config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )
        ),
        policy_spec=RandomPolicy,
        policy_config=dict(action_space=action_space),
        # Should not be a problem, as RandomPolicy abides by the bounds.
        clip_actions=False,
        batch_mode="complete_episodes",
    )
    sample = ev3.sample()
    self.assertGreater(np.min(sample["actions"]), action_space.low[0])
    self.assertLess(np.max(sample["actions"]), action_space.high[0])
    ev3.stop()
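
# `test_get_filters` and `test_sync_filter` above call `self.sample_and_flush`,
# which is not defined in this section. Below is a minimal sketch of that
# helper, reconstructed from how the tests use it (sample once, flush the
# worker's filters, return the flushed observation filter for the default
# policy); the exact implementation in the original suite may differ.
def sample_and_flush(self, ev):
    # Give the async sampler some time to collect samples.
    time.sleep(2)
    ev.sample()
    # Flush, so that subsequent get_filters() calls start from a fresh buffer.
    filters = ev.get_filters(flush_after=True)
    obs_f = filters[DEFAULT_POLICY_ID]
    # The filter must have seen at least one observation by now.
    self.assertNotEqual(obs_f.rs.n, 0)
    self.assertNotEqual(obs_f.buffer.n, 0)
    return obs_f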