Example #1
 def test_complete_episodes_packing(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(10),
         policy_spec=MockPolicy,
         rollout_fragment_length=15,
         batch_mode="complete_episodes",
     )
     batch = ev.sample()
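     # "complete_episodes" never truncates: to cover the 15-step fragment, the
     # worker collects two full 10-step episodes (20 steps total), so the
     # timestep column restarts at 0 for the second episode.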
     self.assertEqual(batch.count, 20)
     self.assertEqual(
         batch["t"].tolist(),
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
     )
     ev.stop()
Example #2
 def test_metrics(self):
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=10),
                        policy_spec=MockPolicy,
                        batch_mode="complete_episodes")
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_spec=MockPolicy,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
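     # collect_metrics aggregates the episode stats of the local worker and
     # all given remote workers.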
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
     ev.stop()
Example #3
 def test_filter_sync(self):
     ev = RolloutWorker(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_spec=MockPolicy,
         sample_async=True,
         observation_filter="ConcurrentMeanStdFilter",
     )
     time.sleep(2)
     ev.sample()
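     # Sampling ran through the ConcurrentMeanStdFilter, so both its running
     # stats (rs) and its not-yet-flushed buffer must contain data.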
     filters = ev.get_filters(flush_after=True)
     obs_f = filters[DEFAULT_POLICY_ID]
     self.assertNotEqual(obs_f.rs.n, 0)
     self.assertNotEqual(obs_f.buffer.n, 0)
     ev.stop()
Example #4
 def test_soft_horizon(self):
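     # With soft_horizon=True, the episode is cut every 4 steps (new eps_id)
     # without actually resetting the env, so only the real env termination
     # at step 10 yields a done=True.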
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_spec=MockPolicy,
         batch_mode="complete_episodes",
         rollout_fragment_length=10,
         episode_horizon=4,
         soft_horizon=True)
     samples = ev.sample()
     # Three logical episodes.
     self.assertEqual(len(set(samples["eps_id"])), 3)
     # Only 1 hard done value.
     self.assertEqual(sum(samples["dones"]), 1)
     ev.stop()
Example #5
    def test_extra_python_envs(self):
        extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
        self.assertFalse("env_key_1" in os.environ)
        self.assertFalse("env_key_2" in os.environ)
        ev = RolloutWorker(env_creator=lambda _: MockEnv(10),
                           policy_spec=MockPolicy,
                           extra_python_environs=extra_envs)
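        # Constructing the worker exports the extra env vars into os.environ.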
        self.assertTrue("env_key_1" in os.environ)
        self.assertTrue("env_key_2" in os.environ)
        ev.stop()

        # reset to original
        del os.environ["env_key_1"]
        del os.environ["env_key_2"]
Example #6
 def test_get_filters(self):
     ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                        policy_spec=MockPolicy,
                        sample_async=True,
                        observation_filter="ConcurrentMeanStdFilter")
     self.sample_and_flush(ev)
     filters = ev.get_filters(flush_after=False)
     time.sleep(2)
     filters2 = ev.get_filters(flush_after=False)
     obs_f = filters[DEFAULT_POLICY_ID]
     obs_f2 = filters2[DEFAULT_POLICY_ID]
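     # The async sampler keeps collecting in the background, so the second
     # snapshot's stat counts can only have grown (or stayed equal).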
     self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
     self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
     ev.stop()
Example #7
 def test_batches_larger_when_vectorized(self):
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=8),
                        policy_spec=MockPolicy,
                        batch_mode="truncate_episodes",
                        rollout_fragment_length=4,
                        num_envs=4)
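     # 4 sub-envs stepping in lockstep x fragment length 4 = 16 steps per batch.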
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
     ev.stop()
Example #8
 def test_wrap_multi_agent_env(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(10),
         policy_spec=MockPolicy,
         policy_config={
             "in_evaluation": False,
         },
     )
     # Make sure we can properly sample from the wrapped env.
     ev.sample()
     # Make sure the resulting environment is indeed still a MultiAgentEnv.
     self.assertTrue(isinstance(ev.env.unwrapped, MultiAgentEnv))
     self.assertTrue(isinstance(ev.env, gym.Env))
     ev.stop()
Example #9
    def test_action_immutability(self):
        from ray.rllib.examples.env.random_env import RandomEnv

        action_space = gym.spaces.Box(0.0001, 0.0002, (5,))

        class ActionMutationEnv(RandomEnv):
            def __init__(self, config):
                self.test_case = config["test_case"]
                super().__init__(config=config)

            def step(self, action):
                # Only enforce the immutability check when step() is called
                # from the actual sampling process, not from the initial env
                # check (check_gym_environments).
                import inspect

                curframe = inspect.currentframe()
                called_from_check = any(
                    [
                        frame[3] == "check_gym_environments"
                        for frame in inspect.getouterframes(curframe, 2)
                    ]
                )
                # Check whether the action is immutable.
                if action.flags.writeable and not called_from_check:
                    self.test_case.assertFalse(
                        action.flags.writeable, "Action is mutable"
                    )
                return super().step(action)

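        # With action clipping disabled below, the raw policy actions reach the
        # env, whose step() verifies that they arrive as read-only arrays.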
        ev = RolloutWorker(
            env_creator=lambda _: ActionMutationEnv(
                config=dict(
                    test_case=self,
                    action_space=action_space,
                    max_episode_len=10,
                    p_done=0.0,
                    check_action_bounds=True,
                )
            ),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=False,
            batch_mode="complete_episodes",
        )
        ev.sample()
        ev.stop()
Example #10
 def test_batch_ids(self):
     fragment_len = 100
     ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                        policy_spec=MockPolicy,
                        rollout_fragment_length=fragment_len)
     batch1 = ev.sample()
     batch2 = ev.sample()
     unroll_ids_1 = set(batch1["unroll_id"])
     unroll_ids_2 = set(batch2["unroll_id"])
     # Assert no overlap of unroll IDs between sample() calls.
     self.assertTrue(not any(uid in unroll_ids_2 for uid in unroll_ids_1))
     # CartPole episodes should be short initially: Expect more than one
     # unroll ID in each batch.
     self.assertTrue(len(unroll_ids_1) > 1)
     self.assertTrue(len(unroll_ids_2) > 1)
     ev.stop()
Example #11
 def test_vector_env_support(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy_spec=MockPolicy,
         batch_mode="truncate_episodes",
         rollout_fragment_length=10)
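     # batch.count tallies env steps across all 8 sub-envs: 16 fragments x 10
     # steps = 160 steps = 8 x 20-step episodes, so all 8 episodes complete
     # only during the second set of samples.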
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     ev.stop()
Example #12
    def test_reward_clipping(self):
        # Clipping: on.
        ev = RolloutWorker(env_creator=lambda _: MockEnv2(episode_length=10),
                           policy_spec=MockPolicy,
                           clip_rewards=True,
                           batch_mode="complete_episodes")
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)
        ev.stop()

        # Clipping: off.
        ev2 = RolloutWorker(env_creator=lambda _: MockEnv2(episode_length=10),
                            policy_spec=MockPolicy,
                            clip_rewards=False,
                            batch_mode="complete_episodes")
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
        ev2.stop()
Example #13
    def test_vector_env_support(self):
        # Test a vector env that contains 8 actual envs
        # (MockEnv instances).
        ev = RolloutWorker(
            env_creator=(
                lambda _: VectorizedMockEnv(episode_length=20, num_envs=8)),
            policy_spec=MockPolicy,
            batch_mode="truncate_episodes",
            rollout_fragment_length=10,
        )
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episodes_this_iter"], 0)
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episodes_this_iter"], 8)
        ev.stop()

        # Test a vector env that pretends(!) to contain 4 envs, but actually
        # only has 1 (CartPole).
        ev = RolloutWorker(
            env_creator=(lambda _: MockVectorEnv(20, mocked_num_envs=4)),
            policy_spec=MockPolicy,
            batch_mode="truncate_episodes",
            rollout_fragment_length=10,
        )
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertGreater(result["episodes_this_iter"], 3)
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertGreater(result["episodes_this_iter"], 6)
        ev.stop()
Example #14
    def test_sync_filter(self):
        ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                           policy_spec=MockPolicy,
                           sample_async=True,
                           observation_filter="ConcurrentMeanStdFilter")
        obs_f = self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters[DEFAULT_POLICY_ID]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({DEFAULT_POLICY_ID: new_obsf})
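        # After syncing, the worker's filter must reflect the pushed running
        # stats (n >= 100), while its local buffer stays small.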
        filters = ev.get_filters(flush_after=False)
        obs_f = filters[DEFAULT_POLICY_ID]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)
        ev.stop()
Example #15
    def test_hard_horizon(self):
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=4,
            soft_horizon=False)
        samples = ev.sample()
        # Three logical episodes and correct episode resets (always after 4
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 3)
        for i in range(4):
            self.assertEqual(np.argmax(samples["obs"][i]), i)
        self.assertEqual(np.argmax(samples["obs"][4]), 0)
        # 3 done values.
        self.assertEqual(sum(samples["dones"]), 3)
        ev.stop()

        # The Trainer's horizon (6) is smaller than the gym env's built-in
        # max_episode_steps (200 for CartPole-v0).
        ev = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=6,
            soft_horizon=False)
        samples = ev.sample()
        # 12 steps due to `complete_episodes` batch_mode.
        self.assertEqual(len(samples["eps_id"]), 12)
        # Two logical episodes and correct episode resets (always after 6(!)
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 2)
        # 2 done values after 6 and 12 steps.
        check(samples["dones"], [
            False, False, False, False, False, True, False, False, False,
            False, False, True
        ])
        ev.stop()
Example #16
    def test_no_training(self):
        class NoTrainingEnv(MockEnv):
            def __init__(self, episode_length, training_enabled):
                super(NoTrainingEnv, self).__init__(episode_length)
                self.training_enabled = training_enabled

            def step(self, action):
                obs, rew, done, info = super(NoTrainingEnv, self).step(action)
                return (
                    obs,
                    rew,
                    done,
                    {**info, "training_enabled": self.training_enabled},
                )

        ev = RolloutWorker(
            env_creator=lambda _: NoTrainingEnv(10, True),
            policy_spec=MockPolicy,
            rollout_fragment_length=5,
            batch_mode="complete_episodes",
        )
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
        self.assertEqual(len(batch["obs"]), 10)
        ev.stop()

        ev = RolloutWorker(
            env_creator=lambda _: NoTrainingEnv(10, False),
            policy_spec=MockPolicy,
            rollout_fragment_length=5,
            batch_mode="complete_episodes",
        )
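        # "training_enabled": False in the info dict causes all sampled data
        # to be dropped, leaving an empty MultiAgentBatch.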
        batch = ev.sample()
        self.assertTrue(isinstance(batch, MultiAgentBatch))
        self.assertEqual(len(batch.policy_batches), 0)
        ev.stop()
Example #17
 def test_auto_vectorization(self):
     ev = RolloutWorker(
         env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
         policy_spec=MockPolicy,
         batch_mode="truncate_episodes",
         rollout_fragment_length=2,
         num_envs=8)
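     # 8 auto-vectorized sub-envs x fragment length 2 = 16 steps per batch.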
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     indices = []
     for env in ev.async_env.vector_env.envs:
         self.assertEqual(env.unwrapped.config.worker_index, 0)
         indices.append(env.unwrapped.config.vector_index)
     self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
     ev.stop()
Example #18
    def test_basic(self):
        ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                           policy_spec=MockPolicy)
        batch = ev.sample()
        for key in [
                "obs", "actions", "rewards", "dones", "advantages",
                "prev_rewards", "prev_actions"
        ]:
            self.assertIn(key, batch)
            self.assertGreater(np.abs(np.mean(batch[key])), 0)

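        # Shift each value one step forward; the first step of an episode gets
        # 0, since there is no "previous" value across a done boundary.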
        def to_prev(vec):
            out = np.zeros_like(vec)
            for i, v in enumerate(vec):
                if i + 1 < len(out) and not batch["dones"][i]:
                    out[i + 1] = v
            return out.tolist()

        self.assertEqual(batch["prev_rewards"].tolist(),
                         to_prev(batch["rewards"]))
        self.assertEqual(batch["prev_actions"].tolist(),
                         to_prev(batch["actions"]))
        self.assertGreater(batch["advantages"][0], 1)
        ev.stop()
Example #19
    def test_truncate_episodes(self):
        ev_env_steps = RolloutWorker(
            env_creator=lambda _: MockEnv(10),
            policy_spec=MockPolicy,
            rollout_fragment_length=15,
            batch_mode="truncate_episodes",
        )
        batch = ev_env_steps.sample()
        self.assertEqual(batch.count, 15)
        self.assertTrue(isinstance(batch, SampleBatch))
        ev_env_steps.stop()

        action_space = Discrete(2)
        obs_space = Box(float("-inf"), float("inf"), (4,), dtype=np.float32)
        ev_agent_steps = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 4}),
            policy_spec={
                "pol0": (MockPolicy, obs_space, action_space, {}),
                "pol1": (MockPolicy, obs_space, action_space, {}),
            },
            policy_mapping_fn=lambda agent_id, episode, **kwargs: "pol0"
            if agent_id == 0
            else "pol1",
            rollout_fragment_length=301,
            count_steps_by="env_steps",
            batch_mode="truncate_episodes",
        )
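        # Counting by env_steps: the batch is cut after exactly 301 env steps;
        # with 4 agents stepping in each env step, it holds more agent steps.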
        batch = ev_agent_steps.sample()
        self.assertTrue(isinstance(batch, MultiAgentBatch))
        self.assertGreater(batch.agent_steps(), 301)
        self.assertEqual(batch.env_steps(), 301)
        ev_agent_steps.stop()

        ev_agent_steps = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 4}),
            policy_spec={
                "pol0": (MockPolicy, obs_space, action_space, {}),
                "pol1": (MockPolicy, obs_space, action_space, {}),
            },
            policy_mapping_fn=lambda agent_id, episode, **kwargs: "pol0"
            if agent_id == 0
            else "pol1",
            rollout_fragment_length=301,
            count_steps_by="agent_steps",
            batch_mode="truncate_episodes",
        )
        batch = ev_agent_steps.sample()
        self.assertTrue(isinstance(batch, MultiAgentBatch))
        self.assertLess(batch.env_steps(), 301)
        # When counting agent steps, the count may be slightly larger than
        # rollout_fragment_length, because up to N agents step in each env
        # step and we only check whether to build the batch after each env
        # step.
        self.assertGreaterEqual(batch.agent_steps(), 301)
        ev_agent_steps.stop()
Example #20
    def test_reward_clipping(self):
        # Clipping: True (clip between -1.0 and 1.0).
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=True,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)
        ev.stop()

        from ray.rllib.examples.env.random_env import RandomEnv

        # Clipping in certain range (-2.0, 2.0).
        ev2 = RolloutWorker(
            env_creator=lambda _: RandomEnv(
                dict(
                    reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                    p_done=0.0,
                    max_episode_len=10,
                )
            ),
            policy_spec=MockPolicy,
            clip_rewards=2.0,
            batch_mode="complete_episodes",
        )
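        # Rewards come from the symmetric Box(-10, 10) reward_space and are
        # clipped to [-2.0, 2.0], so the extremes hit the clip bounds while
        # the mean stays near zero.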
        sample = ev2.sample()
        self.assertEqual(max(sample["rewards"]), 2.0)
        self.assertEqual(min(sample["rewards"]), -2.0)
        self.assertLess(np.mean(sample["rewards"]), 0.5)
        self.assertGreater(np.mean(sample["rewards"]), -0.5)
        ev2.stop()

        # Clipping: Off.
        ev2 = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=False,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
        ev2.stop()
Example #21
    def test_action_clipping(self):
        from ray.rllib.examples.env.random_env import RandomEnv
        action_space = gym.spaces.Box(-2.0, 1.0, (3, ))

        # Clipping: True (clip between the Policy's action_space.low/high).
        ev = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=True,
            batch_mode="complete_episodes")
        sample = ev.sample()
        # Check that the action bounds have been breached (expected): sampling
        # still succeeded because the actions were clipped to the env's action
        # space before being stepped.
        self.assertGreater(np.max(sample["actions"]), action_space.high[0])
        self.assertLess(np.min(sample["actions"]), action_space.low[0])
        ev.stop()

        # Clipping: False and RandomPolicy produces invalid actions.
        # Expect Env to complain.
        ev2 = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=False,  # <- should lead to Env complaining
            batch_mode="complete_episodes")
        self.assertRaisesRegex(ValueError, r"Illegal action", ev2.sample)
        ev2.stop()

        # Clipping: False and RandomPolicy produces valid (bounded) actions.
        # Expect "actions" in SampleBatch to be unclipped.
        ev3 = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(action_space=action_space),
            # Should not be a problem, as RandomPolicy abides by the bounds.
            clip_actions=False,
            batch_mode="complete_episodes")
        sample = ev3.sample()
        self.assertGreater(np.min(sample["actions"]), action_space.low[0])
        self.assertLess(np.max(sample["actions"]), action_space.high[0])
        ev3.stop()