Example #1
    def test_reward_clipping(self):
        # Clipping: True (clip between -1.0 and 1.0).
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=True,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        # Metrics are computed on the raw (unclipped) env rewards:
        # 10 steps x 100.0 = 1000.0 per episode.
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)
        ev.stop()

        from ray.rllib.examples.env.random_env import RandomEnv

        # Clipping to a custom range (-2.0, 2.0).
        ev2 = RolloutWorker(
            env_creator=lambda _: RandomEnv(
                dict(
                    reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                    p_done=0.0,
                    max_episode_len=10,
                )
            ),
            policy_spec=MockPolicy,
            clip_rewards=2.0,
            batch_mode="complete_episodes",
        )
        sample = ev2.sample()
        self.assertEqual(max(sample["rewards"]), 2.0)
        self.assertEqual(min(sample["rewards"]), -2.0)
        self.assertLess(np.mean(sample["rewards"]), 0.5)
        self.assertGreater(np.mean(sample["rewards"]), -0.5)
        ev2.stop()

        # Clipping: Off.
        ev2 = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=False,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
        ev2.stop()
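
All of these snippets exercise MockEnv2. Below is a minimal sketch of its apparent behavior, reconstructed from the assertions in these examples (100.0 reward per step, the step counter as observation, termination after episode_length steps, a readable rng_seed attribute); the real class ships in ray.rllib.examples.env.mock_env and may differ in detail:

import gym


class MockEnv2(gym.Env):
    # Hypothetical reconstruction, not RLlib's verbatim source.
    def __init__(self, episode_length):
        self.episode_length = episode_length
        self.i = 0
        self.rng_seed = None
        self.observation_space = gym.spaces.Discrete(100)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        self.i = 0
        return self.i

    def step(self, action):
        self.i += 1
        # 100.0 reward every step; done after `episode_length` steps.
        return self.i, 100.0, self.i >= self.episode_length, {}

    def seed(self, rng_seed):
        # Example #3 reads this attribute back via foreach_env().
        self.rng_seed = rng_seed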
Example #2
    def test_hard_horizon(self):
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=4,
            soft_horizon=False,
        )
        samples = ev.sample()
        # Three logical episodes and correct episode resets (always after 4
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 3)
        for i in range(4):
            self.assertEqual(np.argmax(samples["obs"][i]), i)
        self.assertEqual(np.argmax(samples["obs"][4]), 0)
        # 3 done values.
        self.assertEqual(sum(samples["dones"]), 3)
        ev.stop()

        # The gym env's own max_episode_steps (200 for CartPole-v0) is larger
        # than the Trainer's horizon (6), so the horizon governs the resets.
        ev = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=6,
            soft_horizon=False,
        )
        samples = ev.sample()
        # 12 steps: `complete_episodes` rounds the 10-step fragment up to the
        # next episode boundary (2 episodes x 6 steps).
        self.assertEqual(len(samples["eps_id"]), 12)
        # Two logical episodes and correct episode resets (always after 6(!)
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 2)
        # 2 done flags, after 6 and after 12 steps.
        check(samples["dones"], ([False] * 5 + [True]) * 2)
        ev.stop()
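
The same hard-horizon behavior can also be requested through a Trainer config dict instead of direct RolloutWorker kwargs; a sketch, assuming the RLlib 1.x common config keys `horizon` and `soft_horizon`:

config = {
    "batch_mode": "complete_episodes",
    "rollout_fragment_length": 10,
    # Force done=True every 6 steps ...
    "horizon": 6,
    # ... and actually call the underlying env's reset() (hard horizon).
    "soft_horizon": False,
}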
Example #3
    def test_multi_env_seed(self):
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(100),
            num_envs=3,
            policy_spec=MockPolicy,
            seed=1,
        )
        # Make sure we can properly sample from the wrapped env.
        ev.sample()
        # Make sure all environments got a different deterministic seed.
        seeds = ev.foreach_env(lambda env: env.rng_seed)
        self.assertEqual(seeds, [1, 2, 3])
        ev.stop()
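
The expected seeds [1, 2, 3] imply that each vectorized sub-env receives the base seed plus its vector index. A simplified sketch of that scheme (hypothetical helper; the real RolloutWorker logic also accounts for the worker index):

def seed_sub_envs(envs, base_seed):
    # Deterministic and distinct per sub-env: base_seed + 0, + 1, + 2, ...
    for vector_index, env in enumerate(envs):
        env.seed(base_seed + vector_index)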
Example #4
    def __init__(self, num, increment_obs=False):
        if increment_obs:
            # Observations are 0, 1, 2, 3, ... as time advances.
            self.agents = [MockEnv2(5) for _ in range(num)]
        else:
            # Observations are all zeros.
            self.agents = [MockEnv(5) for _ in range(num)]
        self.dones = set()
        self.last_obs = {}
        self.last_rew = {}
        self.last_done = {}
        self.last_info = {}
        self.i = 0
        self.num = num
        self.observation_space = gym.spaces.Discrete(10)
        self.action_space = gym.spaces.Discrete(2)
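
This constructor belongs to a multi-agent wrapper around the per-agent mock envs. A plausible companion reset()/step() pair, modeled on RLlib's BasicMultiAgent-style examples (the bodies are a sketch, not verbatim source):

    def reset(self):
        self.dones = set()
        self.i = 0
        return {i: agent.reset() for i, agent in enumerate(self.agents)}

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for agent_id, action in action_dict.items():
            obs[agent_id], rew[agent_id], done[agent_id], info[agent_id] = \
                self.agents[agent_id].step(action)
            if done[agent_id]:
                self.dones.add(agent_id)
        # The episode ends once every agent has finished.
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info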
Example #5
    def test_wrap_gym_env(self):
        record_env_dir = os.popen("mktemp -d").read()[:-1]
        print(f"tmp dir for videos={record_env_dir}")

        if not os.path.exists(record_env_dir):
            sys.exit(1)

        num_steps_per_episode = 10
        wrapped = record_env_wrapper(
            env=MockEnv2(num_steps_per_episode),
            record_env=record_env_dir,
            log_dir="",
            policy_config={
                "in_evaluation": False,
            },
        )
        # Non-MultiAgentEnv: the wrapper's type is gym.wrappers.Monitor.
        self.assertTrue(isinstance(wrapped, gym.wrappers.Monitor))
        self.assertFalse(isinstance(wrapped, VideoMonitor))

        wrapped.reset()
        # Expect one video file to have been produced in the tmp dir.
        os.chdir(record_env_dir)
        ls = glob.glob("*.mp4")
        self.assertTrue(len(ls) == 1)
        # 10 steps for a complete episode.
        for i in range(num_steps_per_episode):
            wrapped.step(0)
        # Another episode.
        wrapped.reset()
        for i in range(num_steps_per_episode):
            wrapped.step(0)
        # Expect another video file to have been produced (2nd episode).
        ls = glob.glob("*.mp4")
        self.assertTrue(len(ls) == 2)

        # MockEnv2 returns a reward of 100.0 every step.
        # So total reward is 1000.0 per episode (10 steps).
        check(
            np.array([100.0, 100.0]) * num_steps_per_episode,
            wrapped.get_episode_rewards(),
        )
        # Erase all generated files and the temp dir itself so as not to
        # disturb subsequent CI tests.
        shutil.rmtree(record_env_dir)
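
Shelling out to mktemp (as above) only works on Unix-like systems; the standard library offers a portable drop-in (a sketch of the substitution):

import tempfile

record_env_dir = tempfile.mkdtemp()  # same effect as `mktemp -d`, cross-platform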
Example #6
    def test_wrap_gym_env(self):
        wrapped = record_env_wrapper(
            env=MockEnv2(10),
            record_env=tempfile.gettempdir(),
            log_dir="",
            policy_config={
                "in_evaluation": False,
            },
        )
        # Type is wrappers.Monitor.
        self.assertTrue(isinstance(wrapped, wrappers.Monitor))
        self.assertFalse(isinstance(wrapped, VideoMonitor))

        wrapped.reset()
        # 10 steps for a complete episode.
        for i in range(10):
            wrapped.step(0)

        # MockEnv2 returns a reward of 100.0 every step.
        # So total reward is 1000.0.
        self.assertEqual(wrapped.get_episode_rewards(), [1000.0])
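
Both recording examples assume an older Gym: gym.wrappers.Monitor was deprecated and later removed in favor of RecordVideo. A rough modern equivalent, assuming gym>=0.26 and an env that can render rgb_array frames (which MockEnv2 cannot):

import tempfile

import gym

env = gym.make("CartPole-v1", render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, video_folder=tempfile.gettempdir())
env = gym.wrappers.RecordEpisodeStatistics(env)  # tracks per-episode returns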