Example #1
  def _preprocess_trajectories(self, trajectories):
    (_, reward_mask, observations, actions, rewards,
     infos) = (ppo.pad_trajectories(trajectories,
                                    boundary=self._max_timestep))
    assert self.train_env.observation_space.shape == observations.shape[2:]
    if not self._serialized_sequence_policy:
      # Add one timestep at the end, so it's compatible with
      # self._rewards_to_actions.
      pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
      actions = np.pad(actions, pad_width)
      actions = np.reshape(actions, (actions.shape[0], -1))
    else:
      (observations,
       actions) = self._serialize_trajectories(observations, actions,
                                               reward_mask)
    return (observations, actions, rewards, reward_mask, infos)
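
The first branch above pads the actions with one dummy timestep and then flattens the time dimension. A minimal standalone sketch of that step, with toy shapes and values that are assumptions rather than anything taken from the trainer:

import numpy as np

# Toy batch: 2 trajectories, 3 timesteps, scalar discrete actions.
actions = np.arange(6).reshape((2, 3))

# Pad one extra timestep at the end of the time axis only; the extra
# ((0, 0),) entries leave any trailing action axes untouched.
pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
actions = np.pad(actions, pad_width)                   # shape (2, 4), zero-padded
actions = np.reshape(actions, (actions.shape[0], -1))  # [[0, 1, 2, 0], [3, 4, 5, 0]]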
Example #2
  def _preprocess_trajectories(self, trajectories):
    (_, reward_mask, observations, actions, rewards,
     infos) = (ppo.pad_trajectories(trajectories,
                                    boundary=self._max_timestep))
    # Zero out rewards that fall outside the environment's declared
    # reward range.
    (low, high) = self.train_env.reward_range
    outside = np.logical_or(rewards < low, rewards > high)
    rewards = jax.ops.index_update(rewards, jax.ops.index[outside], 0)
    assert self.train_env.observation_space.shape == observations.shape[2:]
    if self._policy_and_value_vocab_size is None:
      # Add one timestep at the end, so it's compatible with
      # self._rewards_to_actions.
      pad_width = ((0, 0), (0, 1)) + ((0, 0),) * (actions.ndim - 2)
      actions = np.pad(actions, pad_width)
      actions = np.reshape(actions, (actions.shape[0], -1))
    else:
      (observations,
       actions) = self._serialize_trajectories(observations, actions,
                                               reward_mask)
    return (observations, actions, rewards, reward_mask, infos)
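
The reward handling above uses jax.ops.index_update, which belongs to an older JAX API; newer JAX versions expose the same operation through the .at[...] operators. A small sketch of the equivalent zeroing step; the values and reward range below are assumptions:

import jax.numpy as jnp

rewards = jnp.array([0.5, -3.0, 2.0, 7.5])
(low, high) = (-1.0, 5.0)

# Zero out rewards that fall outside [low, high].
outside = jnp.logical_or(rewards < low, rewards > high)
rewards = rewards.at[outside].set(0.0)  # newer-JAX equivalent of jax.ops.index_update
# rewards is now [0.5, 0.0, 2.0, 0.0]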
Example #3
  def test_pad_trajectories(self):
    observation_shape = (2, 3, 4)
    trajectories = []
    n_trajectories = 7
    n_actions = 10

    # Time-steps are between [min_allowable_time_step, max_allowable_time_step]
    max_allowable_time_step = 19
    min_allowable_time_step = 5

    # The actual max we see in the data.
    max_time_step = -1

    # Bucket length.
    bucket_length = 15

    # Make `n_trajectories` random trajectories.
    for i in range(n_trajectories):
      time_steps = np.random.randint(min_allowable_time_step,
                                     max_allowable_time_step + 1)
      if time_steps > max_time_step:
        max_time_step = time_steps
      observations = np.random.randint(
          0, 255, size=(time_steps + 1,) + observation_shape).astype(np.uint8)
      rewards = np.random.uniform(size=(time_steps,)).astype(np.float32)
      actions = np.random.randint(
          0, n_actions, size=(time_steps,)).astype(np.int32)
      infos = {
          'a': np.random.uniform(size=(time_steps,)).astype(np.float32),
          'b': np.random.uniform(size=(time_steps,)).astype(np.float32)
      }
      trajectories.append((observations, rewards, actions, infos))

    # Now pad these trajectories.
    padded_trajectories = ppo.pad_trajectories(
        trajectories, boundary=bucket_length)

    # Expected padding.
    i = 1
    while i * bucket_length < max_time_step:
      i += 1
    expected_padding = i * bucket_length

    # Get the padded objects.
    (pad_lengths, reward_mask, padded_observations, padded_actions,
     padded_rewards, padded_infos) = padded_trajectories

    # Expectations on the padded shapes.
    self.assertEqual(padded_observations.shape, (
        n_trajectories,
        expected_padding + 1,
    ) + observation_shape)
    self.assertEqual(padded_actions.shape, (n_trajectories, expected_padding))
    self.assertEqual(padded_rewards.shape, (n_trajectories, expected_padding))
    self.assertEqual(reward_mask.shape, (n_trajectories, expected_padding))

    self.assertEqual(padded_infos['a'].shape,
                     (n_trajectories, expected_padding))
    self.assertEqual(padded_infos['b'].shape,
                     (n_trajectories, expected_padding))

    # Assert that the padding lengths and reward mask are consistent.
    self.assertAllEqual(
        np.full((n_trajectories,), expected_padding),
        np.array(np.sum(reward_mask, axis=1)) + pad_lengths)
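
The while loop computing expected_padding rounds max_time_step up to the nearest positive multiple of bucket_length. A compact equivalent of that calculation; the helper name is hypothetical and only mirrors the test's loop:

import math

def expected_padding(max_time_step, bucket_length):
  # Smallest positive multiple of bucket_length that is >= max_time_step.
  return max(1, math.ceil(max_time_step / bucket_length)) * bucket_length

assert expected_padding(19, 15) == 30  # pads up to two buckets
assert expected_padding(15, 15) == 15  # exact multiples are kept as-is
assert expected_padding(5, 15) == 15   # short trajectories pad to one bucket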