def test_change_last_time_step(self):
  t = trajectory.Trajectory()
  t.add_time_step(observation=1, done=False)
  t.add_time_step(observation=1, done=True)
  self.assertTrue(t.is_active)
  num_ts_old = t.num_time_steps
  self.assertEqual(2, num_ts_old)

  # Assert on what the last time-step is currently.
  ts = t.last_time_step
  self.assertEqual(1, ts.observation)
  self.assertTrue(ts.done)
  self.assertEqual(None, ts.action)

  # Change the last time-step.
  t.change_last_time_step(done=False, action=5)

  # Assert that it changed.
  ts = t.last_time_step
  self.assertEqual(1, ts.observation)  # unchanged, since we didn't change it.
  self.assertFalse(ts.done)  # was True earlier
  self.assertEqual(5, ts.action)  # was None earlier

  # Assert on the number of steps remaining the same as before.
  self.assertEqual(num_ts_old, t.num_time_steps)

def _make_trajectory(self, observations, actions):
  assert len(observations) == len(actions) + 1
  t = trajectory.Trajectory()
  for (obs, act) in zip(observations, actions):
    t.add_time_step(observation=obs, action=act, done=False)
  t.add_time_step(observation=observations[-1], done=True)
  return t

def test_truncate_and_last_n_observations_np(self):
  t = trajectory.Trajectory()
  ts = 5
  shape = (3, 4)
  for _ in range(ts):
    t.add_time_step(observation=np.random.uniform(size=shape), done=False)

  original_obs = np.copy(t.observations_np)
  self.assertEqual((ts,) + shape, original_obs.shape)

  # Now let's just get the observations from the last 2 steps.
  num_to_keep = 2
  truncated_original_obs = original_obs[-num_to_keep:, ...]

  # Let's get the last `num_to_keep` observations.
  last_n_observations_np = np.copy(t.last_n_observations_np(n=num_to_keep))

  # Now truncate the trajectory and get the same.
  _ = t.truncate(num_to_keep=num_to_keep)
  truncated_np = np.copy(t.observations_np)

  # These should have the expected shape.
  self.assertEqual((num_to_keep,) + shape, last_n_observations_np.shape)
  self.assertEqual((num_to_keep,) + shape, truncated_np.shape)

  # Test the last `num_to_keep` are the same.
  self.assertAllEqual(truncated_np, truncated_original_obs)
  self.assertAllEqual(last_n_observations_np, truncated_original_obs)

def test_as_numpy(self):
  t = trajectory.Trajectory()
  shape = (3, 4)
  # We'll have `ts` observations and `ts - 1` actions and rewards.
  ts = 5
  num_actions = 6
  observations = np.random.uniform(size=(ts,) + shape)
  actions = np.random.choice(range(num_actions), size=(ts - 1,))
  rewards = np.random.choice([-1, 0, 1], size=(ts - 1,))

  # First time-step has no reward.
  t.add_time_step(observation=observations[0], done=False, action=actions[0])
  for i in range(1, ts - 1):
    t.add_time_step(
        observation=observations[i],
        done=False,
        raw_reward=rewards[i - 1],
        processed_reward=rewards[i - 1],
        action=actions[i])
  # Last time-step has no action.
  t.add_time_step(
      observation=observations[-1],
      done=False,
      raw_reward=rewards[-1],
      processed_reward=rewards[-1])

  traj_np = t.as_numpy
  self.assertAllEqual(observations, traj_np[0])
  self.assertAllEqual(actions, traj_np[1])
  self.assertAllEqual(rewards, traj_np[2])

def test_observation_np(self):
  t = trajectory.Trajectory()
  ts = 5
  shape = (3, 4)
  for _ in range(ts):
    t.add_time_step(observation=np.random.uniform(size=shape), done=False)
  self.assertEqual((ts,) + shape, t.observations_np.shape)

def _make_trajectory(self, observations=None, actions=None):
  t = trajectory.Trajectory()
  if observations is None:
    observations = itertools.repeat(None)
  if actions is None:
    actions = itertools.repeat(None)
  for (observation, action) in zip(observations, actions):
    t.add_time_step(observation=observation, action=action)
  return t

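# A minimal usage sketch for `_make_trajectory`, assuming it lives on the
# same `tf.test.TestCase` subclass as the tests above; this example test is
# illustrative and not part of the original suite. Note that at least one of
# `observations`/`actions` must be a finite sequence, since `zip` over two
# infinite `itertools.repeat` iterators would never terminate.
def test_make_trajectory_example(self):
  t = self._make_trajectory(observations=[1, 2, 3])
  self.assertEqual(3, t.num_time_steps)
  # Actions default to None when only observations are given.
  self.assertIsNone(t.last_time_step.action)
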
def test_reward(self):
  t = trajectory.Trajectory()
  # The first time-step doesn't have rewards, since rewards are given on
  # entering a state.
  t.add_time_step(
      observation=1, raw_reward=None, processed_reward=None, done=False)
  t.add_time_step(
      observation=2, raw_reward=2, processed_reward=200, done=False)
  t.add_time_step(
      observation=3, raw_reward=3, processed_reward=300, done=True)
  raw_reward, processed_reward = t.reward
  self.assertEqual(5, raw_reward)
  self.assertEqual(500, processed_reward)

def test_add_time_step(self):
  t = trajectory.Trajectory()
  t.add_time_step(observation=1, done=True)

  # Test that the trajectory is now active.
  self.assertTrue(t.is_active)

  added_t = t.last_time_step
  self.assertEqual(1, added_t.observation)
  self.assertTrue(added_t.done)
  self.assertIsNone(added_t.raw_reward)
  self.assertIsNone(added_t.processed_reward)
  self.assertIsNone(added_t.action)
  self.assertEqual(1, t.num_time_steps)

def test_as_numpy(self):
  t = trajectory.Trajectory()
  shape = (3, 4)
  # We'll have `ts` observations and `ts - 1` actions and rewards.
  ts = 5
  num_actions = 6
  observations = np.random.uniform(size=(ts,) + shape)
  actions = np.random.choice(range(num_actions), size=(ts - 1,))
  rewards = np.random.choice([-1, 0, 1], size=(ts - 1,))
  squares = np.arange(ts - 1)**2
  cubes = np.arange(ts - 1)**3

  def get_info(i):
    return {"sq": squares[i], "cu": cubes[i]}

  # First time-step has no reward.
  t.add_time_step(
      observation=observations[0],
      done=False,
      action=actions[0],
      info=get_info(0))
  for i in range(1, ts - 1):
    t.add_time_step(
        observation=observations[i],
        done=False,
        raw_reward=rewards[i - 1],
        processed_reward=rewards[i - 1],
        action=actions[i],
        info=get_info(i))
  # Last time-step has no action.
  t.add_time_step(
      observation=observations[-1],
      done=False,
      raw_reward=rewards[-1],
      processed_reward=rewards[-1])

  traj_np = t.as_numpy
  self.assertAllEqual(observations, traj_np[0])
  self.assertAllEqual(actions, traj_np[1])
  self.assertAllEqual(rewards, traj_np[2])
  self.assertAllEqual(squares, traj_np[4]["sq"])
  self.assertAllEqual(cubes, traj_np[4]["cu"])

def get_random_trajectory(self,
                          max_time_step=None,
                          obs_shape=(2, 2)) -> trajectory.Trajectory:
  t = trajectory.Trajectory()
  max_time_step = max_time_step or np.random.randint(2, 10)
  for _ in range(max_time_step):
    r = float(np.random.uniform(size=()))
    t.add_time_step(
        observation=np.random.uniform(size=obs_shape),
        done=False,
        raw_reward=r,
        processed_reward=r,
        action=int(np.random.choice(10, ())),
        info={
            replay_buffer.ReplayBuffer.LOGPS_KEY_TRAJ:
                float(np.random.uniform(low=-10, high=0))
        })
  t.change_last_time_step(done=True)
  return t

def play_env_problem(env, policy_fn):
  """Plays an EnvProblem using a given policy function."""
  trajectories = [trajectory.Trajectory() for _ in range(env.batch_size)]

  # Seed each trajectory with the initial observation.
  observations = env.reset()
  for (traj, observation) in zip(trajectories, observations):
    traj.add_time_step(observation=observation)

  done_so_far = np.array([False] * env.batch_size)
  while not np.all(done_so_far):
    padded_observations, _ = env.trajectories.observations_np(
        len_history_for_policy=None)
    actions = policy_fn(padded_observations)
    (observations, rewards, dones, _) = env.step(actions)
    for (traj, observation, action, reward, done) in zip(
        trajectories, observations, actions, rewards, dones):
      if not traj.done:
        # Record the action on the current last step, then append the new
        # observation and reward as a fresh time-step.
        traj.change_last_time_step(action=action)
        traj.add_time_step(observation=observation, raw_reward=reward,
                           done=done)
    env.reset(indices=env_problem_utils.done_indices(dones))
    done_so_far = np.logical_or(done_so_far, dones)
  return trajectories

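# A minimal usage sketch for `play_env_problem`, assuming `env` is an
# EnvProblem with a discrete gym action space exposing `env.action_space.n`;
# the `random_policy` helper below is hypothetical and not part of the
# original code.
def play_env_problem_with_random_policy(env):
  def random_policy(padded_observations):
    # One action per environment in the batch; observations are ignored.
    batch_size = padded_observations.shape[0]
    return np.random.choice(env.action_space.n, size=(batch_size,))
  return play_env_problem(env, random_policy)
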
def _make_singleton_trajectory(self, observation):
  t = trajectory.Trajectory()
  t.add_time_step(observation=observation)
  return t

def test_empty_trajectory(self):
  t = trajectory.Trajectory()
  self.assertFalse(t.is_active)
  self.assertEqual(0, t.num_time_steps)
  self.assertFalse(t.done)

def test_load_from_directory(self):
  output_dir = self.get_temp_dir()

  epochs = [0, 1, 2]
  env_ids = [0, 1, 2]
  temperatures = [0.5, 1.0]
  random_strings = ["a", "b"]

  # Write some trajectories.
  # There are 3x3x2x2 (36) trajectories, and of them 3x2x2 (12) are done.
  for epoch in epochs:
    for env_id in env_ids:
      for temperature in temperatures:
        for random_string in random_strings:
          traj = trajectory.Trajectory(time_steps=[
              time_step.TimeStep(
                  observation=epoch,
                  done=(epoch == 0),
                  raw_reward=1.0,
                  processed_reward=1.0,
                  action=env_id,
                  info={})
          ])
          trajectory_file_name = trajectory.TRAJECTORY_FILE_FORMAT.format(
              epoch=epoch,
              env_id=env_id,
              temperature=temperature,
              r=random_string)
          with gfile.GFile(
              os.path.join(output_dir, trajectory_file_name), "w") as f:
            trajectory.get_pickle_module().dump(traj, f)

  # Load everything and check.
  bt = trajectory.BatchTrajectory.load_from_directory(output_dir)
  self.assertIsInstance(bt, trajectory.BatchTrajectory)
  self.assertEqual(36, bt.num_completed_trajectories)
  self.assertEqual(36, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=0)
  self.assertEqual(12, bt.num_completed_trajectories)
  self.assertEqual(12, bt.batch_size)

  # Ask for 100 trajectories; there aren't enough and retries are disabled.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=0, n_trajectories=100, max_tries=0)
  self.assertIsNone(bt)

  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=0, temperature=0.5)
  self.assertEqual(6, bt.num_completed_trajectories)
  self.assertEqual(6, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=1)
  self.assertEqual(12, bt.num_completed_trajectories)
  self.assertEqual(12, bt.batch_size)

  # Constraints cannot be satisfied without up-sampling.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=100, up_sample=False, max_tries=0)
  self.assertIsNone(bt)

  # Constraints can be satisfied with up-sampling.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=100, up_sample=True, max_tries=0)
  self.assertEqual(100, bt.num_completed_trajectories)
  self.assertEqual(100, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=10)
  self.assertEqual(10, bt.num_completed_trajectories)
  self.assertEqual(10, bt.batch_size)

  gfile.rmtree(output_dir)