Example #1
def make_env(batch_size=8):
    """Creates the env."""

    # No resizing needed, so let's use the plain EnvProblem.
    if not FLAGS.resize:  # None or False
        return env_problem.EnvProblem(base_env_name=FLAGS.env_problem_name,
                                      batch_size=batch_size,
                                      reward_range=(-1, 1))

    max_timestep = None
    try:
        max_timestep = int(FLAGS.max_timestep)
    except Exception:  # pylint: disable=broad-except
        pass

    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper, **{
            "rl_env_max_episode_steps": max_timestep,
            "maxskip_env": True,
            "rendered_env": True,
            "rendered_env_resize_to":
            (FLAGS.resized_height, FLAGS.resized_width),
            "sticky_actions": False,
            "output_dtype": onp.int32 if FLAGS.use_tpu else None,
        })

    return rendered_env_problem.RenderedEnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        env_wrapper_fn=wrapper_fn,
        reward_range=(-1, 1))
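
The factory above reads several absl flags that are defined elsewhere in the binary. A minimal sketch of how such flags could be declared, assuming absl.flags; the flag names are taken from the code above, while the types, defaults, and help strings are illustrative guesses:

from absl import flags

FLAGS = flags.FLAGS

# Illustrative declarations only; the real binary defines these flags elsewhere.
flags.DEFINE_string("env_problem_name", None, "Name of the EnvProblem to create.")
flags.DEFINE_bool("resize", False, "Whether to render and resize frames.")
flags.DEFINE_integer("resized_height", 105, "Height to resize rendered frames to.")
flags.DEFINE_integer("resized_width", 80, "Width to resize rendered frames to.")
flags.DEFINE_string("max_timestep", None, "Optional cap on episode length, parsed with int().")
flags.DEFINE_bool("use_tpu", False, "Whether to run on TPU; selects int32 frames above.")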
Example #2
  def test_play_env_problem_with_policy(self):
    env = env_problem.EnvProblem(
        base_env_name="CartPole-v0",
        batch_size=2,
        reward_range=(-1, 1))

    def policy_fun(observations, rng=None):
      b, t = observations.shape[:2]
      a = env.action_space.n
      p = np.random.uniform(size=(b, t, a))
      p = np.exp(p)
      p = p / np.sum(p, axis=-1, keepdims=True)
      return np.log(p), (), rng

    max_timestep = 15
    num_trajectories = 2
    trajectories, _ = env_problem_utils.play_env_problem_with_policy(
        env, policy_fun, num_trajectories=num_trajectories,
        max_timestep=max_timestep, boundary=20)

    self.assertEqual(num_trajectories, len(trajectories))

    # Check shapes within trajectories.
    traj = trajectories[0]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T+1, 4), traj[0].shape)  # (4,) is OBS
    self.assertEqual((T,), traj[2].shape)
    self.assertLessEqual(T, max_timestep)

    traj = trajectories[1]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T+1, 4), traj[0].shape)
    self.assertEqual((T,), traj[2].shape)
    self.assertLessEqual(T, max_timestep)
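
The random policy above returns log-probabilities of shape (batch, time, num_actions), obtained by softmaxing uniform noise. A standalone numpy check of that shape and normalization convention (the sizes below are illustrative):

import numpy as np

b, t, a = 2, 3, 2                            # batch, time, num_actions
logits = np.random.uniform(size=(b, t, a))
probs = np.exp(logits)
probs /= np.sum(probs, axis=-1, keepdims=True)
log_probs = np.log(probs)

assert log_probs.shape == (b, t, a)
# Probabilities over actions sum to 1 at every (batch, time) position.
assert np.allclose(np.exp(log_probs).sum(axis=-1), 1.0)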
Example #3
def make_env():
    """Creates the env."""
    if FLAGS.env_name:
        return gym.make(FLAGS.env_name)

    assert FLAGS.env_problem_name

    # No resizing needed, so let's use the plain EnvProblem.
    if not FLAGS.resize:  # None or False
        return env_problem.EnvProblem(base_env_name=FLAGS.env_problem_name,
                                      batch_size=FLAGS.batch_size,
                                      reward_range=(-1, 1))

    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper, **{
            "rl_env_max_episode_steps": FLAGS.max_timestep,
            "maxskip_env": True,
            "rendered_env": True,
            "rendered_env_resize_to":
            (FLAGS.resized_height, FLAGS.resized_width),
            "sticky_actions": False
        })

    return rendered_env_problem.RenderedEnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=FLAGS.batch_size,
        env_wrapper_fn=wrapper_fn,
        reward_range=(-1, 1))
Example #4
    def test_default_processed_rewards_discrete(self):
        # This differs from the test above because this env has a Tuple observation space.
        ep = env_problem.EnvProblem(base_env_name="KellyCoinflip-v0",
                                    batch_size=5,
                                    reward_range=None)
        ep.assert_common_preconditions()

        # Assert reward range is finite here.
        self.assertTrue(ep.is_reward_range_finite)

        # Assert that it is as expected of the underlying environment.
        reward_range = ep.reward_range
        self.assertEqual(0, reward_range[0])

        # Google's internal version of Gym uses maxWealth, whereas the external one uses max_wealth.
        max_wealth = getattr(ep._envs[0], "maxWealth",
                             getattr(ep._envs[0], "max_wealth", None))
        self.assertIsNotNone(max_wealth)
        self.assertEqual(max_wealth, reward_range[1])

        # Check that the processed rewards are discrete.
        self.assertTrue(ep.is_processed_rewards_discrete)

        # Assert on the number of rewards.
        self.assertEqual(ep.num_rewards, reward_range[1] - reward_range[0] + 1)
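
The final assertion relies on the convention that a finite integer reward range (low, high) yields high - low + 1 distinct processed rewards. For example, with an assumed range of (0, 250):

low, high = 0, 250            # illustrative bounds, not the env's actual values
num_rewards = high - low + 1  # 251 distinct integer rewards, both endpoints included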
Example #5
    def test_resets_properly(self):
        base_env_name = "CartPole-v0"
        batch_size = 5
        reward_range = (-1, 1)
        nsteps = 100

        env = env_problem.EnvProblem(base_env_name=base_env_name,
                                     batch_size=batch_size,
                                     reward_range=reward_range)
        env.name = base_env_name

        num_dones = 0
        while num_dones == 0:
            env, num_dones, _ = self.play_env(env=env,
                                              nsteps=nsteps,
                                              batch_size=batch_size,
                                              reward_range=reward_range)

        # Some completed trajectories have been generated.
        self.assertGreater(env.trajectories.num_completed_trajectories, 0)

        # This should clear the env completely of any state.
        env.reset()

        # Assert that there aren't any completed trajectories in the env now.
        self.assertEqual(env.trajectories.num_completed_trajectories, 0)
Example #6
  def test_interaction_with_env(self):
    batch_size = 5
    reward_range = (-1, 1)
    ep = env_problem.EnvProblem(
        base_env_name="KellyCoinflip-v0",
        batch_size=batch_size,
        reward_range=reward_range)

    # Resets all environments.
    ep.reset()

    # Let's play a few steps.
    nsteps = 100
    num_trajectories_completed = 0
    num_timesteps_completed = 0
    # If batch_done_at_step[i] = j, then the i-th env last got done at step j.
    batch_done_at_step = np.full(batch_size, -1)
    for i in range(nsteps):
      # Sample batch_size actions from the action space and stack them (since
      # that is the expected type).
      actions = np.stack([ep.action_space.sample() for _ in range(batch_size)])

      _, _, dones, _ = ep.step(actions)

      # Do the book-keeping on the number of trajectories completed and expect
      # that it matches the count reported by ep.

      num_done = sum(dones)
      num_trajectories_completed += num_done

      self.assertEqual(num_trajectories_completed,
                       len(ep.trajectories.completed_trajectories))

      # Get the indices where we are done ...
      done_indices = env_problem.EnvProblem.done_indices(dones)

      # ... and reset those.
      ep.reset(indices=done_indices)

      # If nothing got done, go on to the next step.
      if done_indices.size == 0:
        # i.e. this is an empty array.
        continue

      # See when these indices were last done and calculate how many time-steps
      # each one took to get done.
      num_timesteps_completed += sum(i + 1 - batch_done_at_step[done_indices])
      batch_done_at_step[done_indices] = i

      # This should also match the number of time-steps completed given by ep.
      num_timesteps_completed_ep = sum(
          ct.num_time_steps() for ct in ep.trajectories.completed_trajectories)
      self.assertEqual(num_timesteps_completed, num_timesteps_completed_ep)

    # Reset the trajectories.
    ep.trajectories.reset_batch_trajectories()
    self.assertEqual(0, len(ep.trajectories.completed_trajectories))
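
The batch_done_at_step bookkeeping above works because each entry records the step index at which that environment last finished (-1 if it never has), so i + 1 - batch_done_at_step[done_indices] is the number of time-steps attributed to the episodes that just ended. A standalone numpy sketch with illustrative values:

import numpy as np

batch_size = 5
batch_done_at_step = np.full(batch_size, -1)   # -1 means "never finished yet"

# Suppose environments 1 and 3 report done at step i = 6.
i = 6
done_indices = np.array([1, 3])
steps_taken = i + 1 - batch_done_at_step[done_indices]   # array([8, 8])
batch_done_at_step[done_indices] = i                     # remember when they finished

# If environment 1 finishes again at step i = 9, only the steps since step 6 count.
i = 9
steps_again = i + 1 - batch_done_at_step[np.array([1])]  # array([4])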
Example #7
  def test_reward_range(self):
    # Passing reward_range=None means take the reward range of the underlying
    # environment as the reward range.
    ep = env_problem.EnvProblem(
        base_env_name="FrozenLake-v0", batch_size=5, reward_range=None)
    ep.assert_common_preconditions()

    # Assert reward range is finite here.
    self.assertTrue(ep.is_reward_range_finite)

    # Assert that it matches the underlying environment's reward range, since
    # reward_range=None was passed.
    self.assertEqual(0, ep.reward_range[0])
    self.assertEqual(1, ep.reward_range[1])
Example #8
    def play_env(self,
                 env=None,
                 nsteps=100,
                 base_env_name=None,
                 batch_size=5,
                 reward_range=None):
        """Creates `EnvProblem` with the given arguments and plays it randomly.

        Args:
          env: optional env.
          nsteps: plays the env randomly for nsteps.
          base_env_name: passed to EnvProblem's init.
          batch_size: passed to EnvProblem's init.
          reward_range: passed to EnvProblem's init.

        Returns:
          tuple of env_problem, number of trajectories done, number of
          trajectories done in the last step.
        """

        if env is None:
            env = env_problem.EnvProblem(base_env_name=base_env_name,
                                         batch_size=batch_size,
                                         reward_range=reward_range)
            # This is usually done by a registered subclass; we do it manually
            # in the test.
            env.name = base_env_name

        # Reset all environments.
        env.reset()

        # Play for some steps to generate data.
        num_dones = 0
        num_dones_in_last_step = 0
        for _ in range(nsteps):
            # Sample actions.
            actions = np.stack(
                [env.action_space.sample() for _ in range(batch_size)])
            # Step through it.
            _, _, dones, _ = env.step(actions)
            # Get the indices where we are done ...
            done_indices = env_problem_utils.done_indices(dones)
            # ... and reset those.
            env.reset(indices=done_indices)
            # Count the number of dones we got, in this step and overall.
            num_dones_in_last_step = sum(dones)
            num_dones += num_dones_in_last_step

        return env, num_dones, num_dones_in_last_step
Example #9
  def get_wrapped_env(self, name="CartPole-v0", max_episode_steps=2):
    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper,
        **{
            "rl_env_max_episode_steps": max_episode_steps,
            "maxskip_env": False,
            "rendered_env": False,
            "rendered_env_resize_to": None,  # Do not resize frames
            "sticky_actions": False,
            "output_dtype": None,
        })

    return env_problem.EnvProblem(base_env_name=name,
                                  batch_size=1,
                                  env_wrapper_fn=wrapper_fn,
                                  reward_range=(-1, 1))
Example #10
    def test_play_env_problem_with_policy(self):
        env = env_problem.EnvProblem(base_env_name="CartPole-v0",
                                     batch_size=2,
                                     reward_range=(-1, 1))

        # Let's make sure that at most 4 observations come to the policy function.
        len_history_for_policy = 4

        def policy_fun(observations, rng=None):
            b, t = observations.shape[:2]
            # Assert that observations from time-step len_history_for_policy onwards
            # are zeros.
            self.assertTrue(
                np.all(observations[:, len_history_for_policy:, ...] == 0))
            self.assertFalse(
                np.all(observations[:, :len_history_for_policy, ...] == 0))
            a = env.action_space.n
            p = np.random.uniform(size=(b, t, a))
            p = np.exp(p)
            p = p / np.sum(p, axis=-1, keepdims=True)
            return np.log(p), (), rng

        max_timestep = 15
        num_trajectories = 2
        trajectories, _, _ = env_problem_utils.play_env_problem_with_policy(
            env,
            policy_fun,
            num_trajectories=num_trajectories,
            max_timestep=max_timestep,
            len_history_for_policy=len_history_for_policy)

        self.assertEqual(num_trajectories, len(trajectories))

        # Check shapes within trajectories.
        traj = trajectories[0]
        T = traj[1].shape[0]  # pylint: disable=invalid-name
        self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
        self.assertEqual((T, ), traj[2].shape)
        self.assertLessEqual(T, max_timestep)

        traj = trajectories[1]
        T = traj[1].shape[0]  # pylint: disable=invalid-name
        self.assertEqual((T + 1, 4), traj[0].shape)
        self.assertEqual((T, ), traj[2].shape)
        self.assertLessEqual(T, max_timestep)
Example #11
    def test_setup(self):
        ep = env_problem.EnvProblem(base_env_name="CartPole-v0", batch_size=5)
        # Checks that environments were created and they are `batch_size` in number.
        ep.assert_common_preconditions()

        # Expectations on the observation space.
        observation_space = ep.observation_space
        self.assertTrue(isinstance(observation_space, Box))
        self.assertEqual(observation_space.shape, (4, ))
        self.assertEqual(observation_space.dtype, np.float32)

        # Expectations on the action space.
        action_space = ep.action_space
        self.assertTrue(isinstance(action_space, Discrete))
        self.assertEqual(action_space.shape, ())
        self.assertEqual(action_space.dtype, np.int64)
        self.assertEqual(ep.num_actions, 2)

        # Reward range is infinite here.
        self.assertFalse(ep.is_reward_range_finite)
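
The expected shapes and dtypes mirror those of the underlying Gym environment; the same properties can be inspected on a raw env directly (assuming a gym version in which CartPole-v0 is still registered):

import gym

env = gym.make("CartPole-v0")
print(env.observation_space)   # e.g. Box(4,) with dtype float32
print(env.action_space)        # Discrete(2)
print(env.action_space.n)      # 2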
Example #12
def make_env(batch_size=8, **env_kwargs):
    """Creates the env."""

    if FLAGS.clip_rewards:
        env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
    else:
        env_kwargs.update({"discrete_rewards": False})

    # TODO(afrozm): Should we leave out some cores?
    parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

    # No resizing needed, so let's use the plain EnvProblem.
    if not FLAGS.resize:  # None or False
        return env_problem.EnvProblem(base_env_name=FLAGS.env_problem_name,
                                      batch_size=batch_size,
                                      parallelism=parallelism,
                                      **env_kwargs)

    max_timestep = None
    try:
        max_timestep = int(FLAGS.max_timestep)
    except Exception:  # pylint: disable=broad-except
        pass

    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper, **{
            "rl_env_max_episode_steps": max_timestep,
            "maxskip_env": True,
            "rendered_env": True,
            "rendered_env_resize_to":
            (FLAGS.resized_height, FLAGS.resized_width),
            "sticky_actions": False,
            "output_dtype": onp.int32 if FLAGS.use_tpu else None,
        })

    return rendered_env_problem.RenderedEnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        env_wrapper_fn=wrapper_fn,
        **env_kwargs)
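
One conservative answer to the TODO above is to leave a core free for the main training process. This is only a sketch of an alternative, not what the example does:

import multiprocessing

# Leave one core for the main process, but never drop below one worker.
parallelism = max(1, multiprocessing.cpu_count() - 1)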