Example #1
    def test_step(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        indices = np.arange(self.BATCH_SIZE)
        observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
        )

        # Have to call reset first.
        bt.reset(indices, observations)

        # Create some fake data for calling step.
        new_observations, raw_rewards, actions, dones = (
            self.get_random_observations_rewards_actions_dones())
        processed_rewards = raw_rewards.astype(np.int64)

        # Force-mark the first one as done anyway, so that there is something
        # to test.
        dones[0] = True

        num_done = sum(dones)
        self.assertLessEqual(1, num_done)  # i.e. num_done is at least 1.

        num_not_done = len(dones) - num_done

        # Finally call step.
        bt.step(new_observations, raw_rewards, processed_rewards, dones,
                actions)

        # Expect to see `num_done` number of completed trajectories.
        self.assertEqual(num_done, bt.num_completed_trajectories)

        # Expect to see that the rest are marked as active.
        num_active = sum(t.is_active for t in bt.trajectories)
        self.assertEqual(num_not_done, num_active)
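The snippets on this page are methods excerpted from larger test and environment classes, so they are not runnable on their own. As a rough sketch, they assume imports along the following lines; the module paths for `trajectory` and `gym_utils` are assumptions (a tensor2tensor-style layout), since the snippets only show that those modules provide `BatchTrajectory` and `make_gym_env`.

import copy
import multiprocessing.pool

import gym
import numpy as np

# Assumed module paths -- adjust to wherever BatchTrajectory and make_gym_env
# live in your codebase.
from tensor2tensor.envs import trajectory
from tensor2tensor.rl import gym_utils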
Example #2
    def initialize_environments(self, batch_size=1, parallelism=1, **kwargs):
        """Initializes the environments.

        Args:
          batch_size: (int) Number of `self.base_env_name` envs to initialize.
          parallelism: (int) If this is greater than one then we run the envs
            in parallel using multi-threading.
          **kwargs: (dict) Kwargs to pass to gym.make.
        """
        assert batch_size >= 1

        self._envs = [
            gym.make(self.base_env_name, **kwargs) for _ in range(batch_size)
        ]
        self._parallelism = parallelism
        self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
        if self._env_wrapper_fn is not None:
            self._envs = list(map(self._env_wrapper_fn, self._envs))

        self._verify_same_spaces()

        # If self.reward_range is None, it means that we should take the
        # reward range of the env.
        if self.reward_range is None:
            self._reward_range = self._envs[0].reward_range

        # This data structure stores the history of each env.
        #
        # NOTE: Even if the env is a NN and can step in all batches concurrently, it
        # is still valuable to store the trajectories separately.
        self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)
Example #3
    def initialize_environments(self,
                                batch_size=1,
                                max_episode_steps=-1,
                                max_and_skip_env=False):
        """Initializes the environments and trajectories.

        Subclasses can override this if they don't want a default
        implementation which initializes `batch_size` environments, but must
        take care to initialize self._trajectories (this is checked in
        __init__ anyway).

        Args:
          batch_size: (int) Number of `self.base_env_name` envs to initialize.
          max_episode_steps: (int) Passed on to `gym_utils.make_gym_env`.
          max_and_skip_env: (boolean) Passed on to `gym_utils.make_gym_env`.
        """

        assert batch_size >= 1
        self._batch_size = batch_size

        # pylint: disable=g-complex-comprehension
        self._envs = [
            gym_utils.make_gym_env(self.base_env_name,
                                   rl_env_max_episode_steps=max_episode_steps,
                                   maxskip_env=max_and_skip_env)
            for _ in range(batch_size)
        ]

        # If self.observation_space and self.action_space aren't None, then it
        # means that this is a re-initialization of this class; in that case,
        # make sure that it matches our previous behaviour.
        if self._observation_space:
            assert str(self._observation_space) == str(
                self._envs[0].observation_space)
        else:
            # This means that we are initializing this class for the first time.
            #
            # We set this equal to the first env's observation space, later on we'll
            # verify that all envs have the same observation space.
            self._observation_space = self._envs[0].observation_space

        # Similarly for action_space
        if self._action_space:
            assert str(self._action_space) == str(self._envs[0].action_space)
        else:
            self._action_space = self._envs[0].action_space

        self._verify_same_spaces()

        # If self.reward_range is None, it means that we should take the
        # reward range of the env.
        if self.reward_range is None:
            self._reward_range = self._envs[0].reward_range

        # This data structure stores the history of each env.
        #
        # NOTE: Even if the env is a NN and can step in all batches concurrently, it
        # is still valuable to store the trajectories separately.
        self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)
Example #4
  def initialize_environments(self, batch_size=1, parallelism=1, **env_kwargs):
    """Initializes the environments and trajectories.

    Subclasses can override this if they don't want a default implementation
    which initializes `batch_size` environments, but must take care to
      initialize self._trajectories (this is checked in __init__ anyway).

    Args:
      batch_size: (int) Number of `self.base_env_name` envs to initialize.
      parallelism: (int) If this is greater than one then we run the envs in
        parallel using multi-threading.
      **env_kwargs: (dict) Kwargs to pass to gym.make.
    """
    assert batch_size >= 1
    self._batch_size = batch_size

    self._envs = [
        gym.make(self.base_env_name, **env_kwargs) for _ in range(batch_size)
    ]
    self._parallelism = parallelism
    self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
    if self._env_wrapper_fn is not None:
      self._envs = list(map(self._env_wrapper_fn, self._envs))

    # If self.observation_space and self.action_space aren't None, then it
    # means that this is a re-initialization of this class; in that case,
    # make sure that it matches our previous behaviour.
    if self._observation_space:
      assert str(self._observation_space) == str(
          self._envs[0].observation_space)
    else:
      # This means that we are initializing this class for the first time.
      #
      # We set this equal to the first env's observation space, later on we'll
      # verify that all envs have the same observation space.
      self._observation_space = self._envs[0].observation_space

    # Similarly for action_space
    if self._action_space:
      assert str(self._action_space) == str(self._envs[0].action_space)
    else:
      self._action_space = self._envs[0].action_space

    self._verify_same_spaces()

    # If self.reward_range is None, it means that we should take the reward
    # range of the env.
    if self.reward_range is None:
      self._reward_range = self._envs[0].reward_range

    # This data structure stores the history of each env.
    #
    # NOTE: Even if the env is a NN and can step in all batches concurrently, it
    # is still valuable to store the trajectories separately.
    self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)
Example #5
    def test_desired_placement_of_rewards_and_actions(self):
        batch_size = 1
        bt = trajectory.BatchTrajectory(batch_size=batch_size)

        indices = np.arange(batch_size)
        observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
            batch_size=batch_size)

        # Have to call reset first.
        bt.reset(indices, observations)

        # Create some fake data for calling step.
        new_observations, raw_rewards, actions, _ = (
            self.get_random_observations_rewards_actions_dones(
                batch_size=batch_size))
        processed_rewards = raw_rewards.astype(np.int64)
        dones = np.full(batch_size, False)

        # Call step.
        bt.step(new_observations, raw_rewards, processed_rewards, dones,
                actions)

        # Assert that nothing is done, since all dones are False.
        self.assertEqual(0, bt.num_completed_trajectories)

        # The only trajectory is active.
        self.assertEqual(batch_size, len(bt.trajectories))
        t = bt.trajectories[0]
        self.assertTrue(t.is_active)
        self.assertEqual(2, t.num_time_steps)

        ts = t.time_steps

        # Now assert on placements

        # i.e. the old observation/done is first and the new one comes later.
        self.assertAllEqual(observations[0], ts[0].observation)
        self.assertAllEqual(new_observations[0], ts[1].observation)

        self.assertEqual(False, ts[0].done)
        self.assertEqual(False, ts[1].done)

        # Similarly actions went to the first time-step.
        self.assertEqual(actions[0], ts[0].action)
        self.assertIsNone(ts[1].action)

        # However, make sure the reward went into the second time-step and not
        # the first.
        self.assertNear(raw_rewards[0], ts[1].raw_reward, 1e-6)
        self.assertIsNone(ts[0].raw_reward)

        # Similarly with processed_rewards.
        self.assertEqual(processed_rewards[0], ts[1].processed_reward)
        self.assertIsNone(ts[0].processed_reward)
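For reference, here is a minimal sketch (not taken from the source; the shapes and the reset-after-done handling are illustrative assumptions) of the reset/step cycle that these tests exercise:

batch_size = 4
obs_shape = (3, 4)
bt = trajectory.BatchTrajectory(batch_size=batch_size)

# Every index has to be reset before it can be stepped.
bt.reset(np.arange(batch_size), np.random.rand(batch_size, *obs_shape))

for _ in range(10):
  observations = np.random.rand(batch_size, *obs_shape)
  raw_rewards = np.random.randn(batch_size)
  processed_rewards = raw_rewards.astype(np.int64)
  actions = np.random.randint(0, 2, size=batch_size)
  dones = np.random.rand(batch_size) > 0.9

  # step() records the actions on the current time-step and the rewards and
  # new observations on the next one, completing any trajectory whose done
  # flag is set.
  bt.step(observations, raw_rewards, processed_rewards, dones, actions)

  # Assumption: slots whose trajectories just completed are reset before the
  # next step, so that every trajectory in the batch is active again.
  done_indices = np.where(dones)[0]
  if done_indices.size:
    bt.reset(done_indices, np.random.rand(done_indices.size, *obs_shape))

print(bt.num_completed_trajectories)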
Example #6
  def initialize_environments(self,
                              batch_size=1,
                              parallelism=1,
                              per_env_kwargs=None,
                              **kwargs):
    """Initializes the environments.

    Args:
      batch_size: (int) Number of `self.base_env_name` envs to initialize.
      parallelism: (int) If this is greater than one then we run the envs in
        parallel using multi-threading.
      per_env_kwargs: (list or None) An optional list of dictionaries to pass to
        gym.make. If not None, length should match `batch_size`.
      **kwargs: (dict) Kwargs to pass to gym.make.
    """
    assert batch_size >= 1
    if per_env_kwargs is not None:
      assert batch_size == len(per_env_kwargs)
    else:
      per_env_kwargs = [{} for _ in range(batch_size)]

    # By now `per_env_kwargs` is a list of dictionaries of size batch_size.
    # The individual dictionaries may be empty.

    def union_dicts(dict1, dict2):
      """Union `dict1` and `dict2`."""
      copy_dict1 = copy.copy(dict1)
      copy_dict1.update(dict2)
      return copy_dict1

    self._envs = [
        gym.make(self.base_env_name,
                 **union_dicts(kwargs, env_kwarg))
        for env_kwarg in per_env_kwargs
    ]
    self._parallelism = parallelism
    self._pool = multiprocessing.pool.ThreadPool(self._parallelism)
    if self._env_wrapper_fn is not None:
      self._envs = list(map(self._env_wrapper_fn, self._envs))

    self._verify_same_spaces()

    # If self.reward_range is None, it means that we should take the reward
    # range of the env.
    if self.reward_range is None:
      self._reward_range = self._envs[0].reward_range

    # This data structure stores the history of each env.
    #
    # NOTE: Even if the env is a NN and can step in all batches concurrently, it
    # is still valuable to store the trajectories separately.
    self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)
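To make the merge order concrete: `union_dicts` applies the per-env dictionary second, so its keys win over the shared `**kwargs` on conflicts. A tiny illustration, calling the helper standalone with made-up keys:

shared_kwargs = {"frameskip": 4, "full_action_space": False}
env_kwarg = {"frameskip": 1}
merged = union_dicts(shared_kwargs, env_kwarg)
assert merged == {"frameskip": 1, "full_action_space": False}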
Example #7
  def test_reset_all(self):
    bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

    indices = np.arange(self.BATCH_SIZE)
    observations, _, _, _ = self.get_random_observations_rewards_actions_dones()

    # Call reset.
    bt.reset(indices, observations)

    # Assert that all trajectories are active and not done (reset never marks
    # anything as done).
    self.assertTrue(all(t.is_active for t in bt.trajectories))
    self.assertEqual(0, bt.num_completed_trajectories)
Example #8
    def test_reset_all(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        indices = np.arange(self.BATCH_SIZE)
        observations = np.random.rand(*((self.BATCH_SIZE, ) +
                                        self.OBSERVATION_SHAPE))

        # Call reset.
        bt.reset(indices, observations)

        # Assert that all trajectories are active and not done (reset never marks
        # anything as done).
        self.assertTrue(all(t.is_active() for t in bt.trajectories))
        self.assertEqual(0, len(bt.completed_trajectories))
Example #9
    def initialize(self, batch_size=1, **kwargs):
        self.initialize_environments(batch_size=batch_size, **kwargs)

        self._batch_size = batch_size

        # This data structure stores the history of each env.
        #
        # NOTE: Even if the env is a NN and can step in all batches concurrently, it
        # is still valuable to store the trajectories separately.
        self._trajectories = trajectory.BatchTrajectory(batch_size=batch_size)

        # Assert that *all* of the above are now set; we should do this since
        # subclasses can override `initialize_environments`.
        self.assert_common_preconditions()
        assert self.observation_space is not None
        assert self.action_space is not None
        assert self.reward_range is not None
Example #10
  def test_reset_some(self):
    bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

    indices = np.arange(self.BATCH_SIZE // 2)
    observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
        batch_size=self.BATCH_SIZE // 2)

    # Just reset the first half.
    bt.reset(indices, observations)

    # So first half are active, rest aren't.
    self.assertTrue(
        all(t.is_active for t in bt.trajectories[:self.BATCH_SIZE // 2]))
    self.assertTrue(
        all(not t.is_active for t in bt.trajectories[self.BATCH_SIZE // 2:]))

    # Nothing is done anyway.
    self.assertEqual(0, bt.num_completed_trajectories)
Example #11
    def test_reset_some(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        indices = np.arange(self.BATCH_SIZE // 2)
        observations = np.random.rand(*((self.BATCH_SIZE // 2, ) +
                                        self.OBSERVATION_SHAPE))

        # Just reset the first half.
        bt.reset(indices, observations)

        # So first half are active, rest aren't.
        self.assertTrue(
            all(t.is_active() for t in bt.trajectories[:self.BATCH_SIZE // 2]))
        self.assertTrue(
            all(not t.is_active()
                for t in bt.trajectories[self.BATCH_SIZE // 2:]))

        # Nothing is done anyway.
        self.assertEqual(0, len(bt.completed_trajectories))
Example #12
    def test_truncate(self):
        batch_size = 1
        bt = trajectory.BatchTrajectory(batch_size=batch_size)

        indices = np.arange(batch_size)
        observations, _, _, _ = (
            self.get_random_observations_rewards_actions_dones(
                batch_size=batch_size))

        # Have to call reset first.
        bt.reset(indices, observations)

        # Take a few steps.
        ts = 5
        for _ in range(ts):
            (observations, rewards, actions,
             dones) = self.get_random_observations_rewards_actions_dones(
                 batch_size=batch_size)
            dones[...] = False
            bt.step(observations, rewards, rewards, dones, actions)

        self.assertEqual(0, bt.num_completed_trajectories)

        num_to_keep = 2
        bt.truncate_trajectories(indices, num_to_keep=num_to_keep)

        self.assertEqual(batch_size, bt.num_completed_trajectories)

        # Assert they are all still active: the last `num_to_keep` observations
        # were duplicated into the new, truncated trajectories.
        self.assertTrue(all(t.is_active for t in bt.trajectories))

        orig_obs = bt.completed_trajectories[0].observations_np
        # + 1 because of the initial reset
        self.assertEqual(ts + 1, orig_obs.shape[0])

        trunc_obs = bt.trajectories[0].observations_np
        self.assertEqual(num_to_keep, trunc_obs.shape[0])
        self.assertEqual(num_to_keep, bt.trajectories[0].num_time_steps)

        # Test that the observations are the same.
        self.assertAllEqual(orig_obs[-num_to_keep:, ...], trunc_obs)
Example #13
    def test_step(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        indices = np.arange(self.BATCH_SIZE)
        observations = np.random.rand(*((self.BATCH_SIZE, ) +
                                        self.OBSERVATION_SHAPE))

        # Have to call reset first.
        bt.reset(indices, observations)

        # Create some fake data for calling step.
        new_observations = np.random.rand(*((self.BATCH_SIZE, ) +
                                            self.OBSERVATION_SHAPE))
        raw_rewards = processed_rewards = actions = np.random.randn(
            self.BATCH_SIZE)
        processed_rewards = np.int64(processed_rewards)
        dones = raw_rewards > 0.5

        # Force-mark the first one as done anyway, so that there is something
        # to test.
        dones[0] = True

        num_done = sum(dones)
        self.assertLessEqual(1, num_done)  # i.e. num_done is at least 1.

        num_not_done = len(dones) - num_done

        # Finally call step.
        bt.step(new_observations, raw_rewards, processed_rewards, dones,
                actions)

        # Expect to see `num_done` number of completed trajectories.
        self.assertEqual(num_done, len(bt.completed_trajectories))

        # Expect to see that the rest are marked as active.
        num_active = sum(t.is_active() for t in bt.trajectories)
        self.assertEqual(num_not_done, num_active)
Example #14
    def test_observations_np(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)
        indices = np.arange(self.BATCH_SIZE)
        observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
        )

        # Have to call reset first.
        bt.reset(indices, observations)

        # Number of time-steps now looks like the following:
        # (1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
        lengths = np.full((self.BATCH_SIZE, ), 1)

        ts = 5
        for _ in range(ts):
            (observations, rewards, actions,
             dones) = self.get_random_observations_rewards_actions_dones()
            dones[...] = False
            bt.step(observations, rewards, rewards, dones, actions)

        # Number of time-steps now looks like the following:
        # (6, 6, 6, 6, 6, 6, 6, 6, 6, 6)
        lengths = lengths + ts

        # Now let's mark the first two as done.
        observations, _, _, _ = self.get_random_observations_rewards_actions_dones(
            batch_size=2)
        bt.reset(np.array([0, 1]), observations)

        # Number of time-steps now looks like the following:
        # (1, 1, 6, 6, 6, 6, 6, 6, 6, 6)
        lengths[0] = lengths[1] = 1

        for _ in range(ts):
            (observations, rewards, actions,
             dones) = self.get_random_observations_rewards_actions_dones()
            dones[...] = False
            bt.step(observations, rewards, rewards, dones, actions)

        # Number of time-steps now looks like the following:
        # (6, 6, 11, 11, 11, 11, 11, 11, 11, 11)
        lengths = lengths + ts

        boundary = 20
        len_history_for_policy = 40

        padded_obs_np, padded_lengths = bt.observations_np(
            boundary=boundary, len_history_for_policy=len_history_for_policy)

        # The lengths are what we expect them to be.
        self.assertAllEqual(lengths, padded_lengths)

        # The padded_observations are the shape we expect them to be.
        self.assertEqual(
            (self.BATCH_SIZE, boundary + 1) + self.OBSERVATION_SHAPE,
            padded_obs_np.shape)

        # Now request the last n steps for the history, for n in [1, 2 * boundary).
        for len_history_for_policy in range(1, 2 * boundary):
            # The expected lengths will now be:
            truncated_lengths = [
                min(l, len_history_for_policy) for l in lengths
            ]

            padded_obs_np, padded_lengths = bt.observations_np(
                boundary=boundary,
                len_history_for_policy=len_history_for_policy)

            self.assertAllEqual(truncated_lengths, padded_lengths)

            # This shouldn't change, since even if we request lengths > boundary + 1
            # there are no trajectories that long.
            self.assertEqual(
                (self.BATCH_SIZE, boundary + 1) + self.OBSERVATION_SHAPE,
                padded_obs_np.shape)

        # Let's do 10 more steps (to go to the other side of the boundary).
        ts = 10
        for _ in range(ts):
            (observations, rewards, actions,
             dones) = self.get_random_observations_rewards_actions_dones()
            dones[...] = False
            bt.step(observations, rewards, rewards, dones, actions)

        # Number of time-steps now looks like the following:
        # (16, 16, 21, 21, 21, 21, 21, 21, 21, 21)
        lengths = lengths + ts

        len_history_for_policy = 40
        padded_obs_np, padded_lengths = bt.observations_np(
            boundary=boundary, len_history_for_policy=len_history_for_policy)

        # The lengths are what we expect them to be.
        self.assertAllEqual(lengths, padded_lengths)

        # The padded_observations are the shape we expect them to be.
        self.assertEqual(
            (self.BATCH_SIZE, (2 * boundary) + 1) + self.OBSERVATION_SHAPE,
            padded_obs_np.shape)

        # Test that the padding is the only part that is all 0s.
        # NOTE: There is almost 0 probability that the random observation is all 0s.
        zero_obs = np.full(self.OBSERVATION_SHAPE, 0.)
        for b in range(self.BATCH_SIZE):
            # The first lengths[b] will be actual data, rest is 0s.
            for ts in range(lengths[b]):
                self.assertFalse(np.all(zero_obs == padded_obs_np[b][ts]))

            for ts in range(lengths[b], len(padded_obs_np[b])):
                self.assertAllEqual(zero_obs, padded_obs_np[b][ts])
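The two shape assertions in this test pin the padded length down at two points (max length 11 gives boundary + 1, and max length 21 gives 2 * boundary + 1). Below is a small sketch of a padding rule consistent with both, inferred from the test's assertions rather than taken from the library source:

def padded_length(max_len, boundary):
  # Number of post-reset steps in the longest trajectory.
  steps = max_len - 1
  # Smallest number of `boundary`-sized chunks whose total capacity strictly
  # exceeds `steps`, plus one slot for the initial reset observation.
  num_chunks = steps // boundary + 1
  return num_chunks * boundary + 1

assert padded_length(11, 20) == 21  # boundary + 1
assert padded_length(21, 20) == 41  # (2 * boundary) + 1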
Example #15
    def test_num_time_steps(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        self.assertEqual(0, bt.num_completed_time_steps)
        self.assertEqual(0, bt.num_time_steps)
Example #16
    def test_creation(self):
        bt = trajectory.BatchTrajectory(batch_size=self.BATCH_SIZE)

        self.assertEqual(self.BATCH_SIZE, len(bt.trajectories))
        self.assertEqual(0, bt.num_completed_trajectories)