def test_play_env_problem_with_policy(self):
  """Plays a batched env with a random softmax policy; checks trajectory shapes.

  Also verifies, inside the policy itself, that the observation history
  handed to the policy is truncated to `len_history_for_policy` steps.
  """
  env = gym_env_problem.GymEnvProblem(
      base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

  # Let's make sure that at-most 4 observations come to the policy function.
  len_history_for_policy = 4

  def policy_fun(observations, lengths, state=None, rng=None):
    del lengths
    b = observations.shape[0]
    # Assert that observations from time-step len_history_for_policy onwards
    # are zeros.
    self.assertTrue(
        np.all(observations[:, len_history_for_policy:, ...] == 0))
    self.assertFalse(
        np.all(observations[:, :len_history_for_policy, ...] == 0))
    a = env.action_space.n
    # Random (batch, 1, num_actions) logits normalized via softmax.
    p = np.random.uniform(size=(b, 1, a))
    p = np.exp(p)
    p = p / np.sum(p, axis=-1, keepdims=True)
    return np.log(p), np.mean(p, axis=-1), state, rng

  max_timestep = 15
  num_trajectories = 2
  trajectories, _, _, _ = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      len_history_for_policy=len_history_for_policy)

  self.assertEqual(num_trajectories, len(trajectories))

  # Check shapes within every trajectory.  (Previously the same assertions
  # were copy-pasted once per trajectory index; the loop covers them all.)
  for traj in trajectories:
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
    self.assertEqual((T,), traj[2].shape)
    self.assertEqual(T, len(traj[4]["log_prob_actions"]))
    self.assertEqual(T, len(traj[4]["value_predictions"]))
    self.assertLessEqual(T, max_timestep)
def play_env(self, env=None, nsteps=100, base_env_name=None, batch_size=5,
             reward_range=None):
  """Creates `GymEnvProblem` with the given arguments and plays it randomly.

  Args:
    env: optional env.
    nsteps: plays the env randomly for nsteps.
    base_env_name: passed to GymEnvProblem's init.
    batch_size: passed to GymEnvProblem's init.
    reward_range: passed to GymEnvProblem's init.

  Returns:
    tuple of gym_env_problem, number of trajectories done, number of
    trajectories done in the last step.
  """
  if env is None:
    env = gym_env_problem.GymEnvProblem(
        base_env_name=base_env_name,
        batch_size=batch_size,
        reward_range=reward_range)
    # Usually done by a registered subclass, we do this manually in the test.
    env.name = base_env_name

  # Start all environments from a fresh state.
  env.reset()

  # Play randomly for `nsteps` to generate data.
  total_dones = 0
  dones_in_final_step = 0
  for _ in range(nsteps):
    # One sampled action per environment, stacked into the batch the env
    # expects.
    sampled_actions = np.stack(
        [env.action_space.sample() for _ in range(batch_size)])
    _, _, dones, _ = env.step(sampled_actions)
    # Reset exactly the environments that just finished ...
    env.reset(indices=env_problem_utils.done_indices(dones))
    # ... and keep count of completions, this step and overall.
    dones_in_final_step = sum(dones)
    total_dones += dones_in_final_step

  return env, total_dones, dones_in_final_step
def get_wrapped_env(self, name="CartPole-v0", max_episode_steps=2):
  """Builds a batch-1 `GymEnvProblem` whose base env is episode-length capped.

  Args:
    name: base gym env name to construct.
    max_episode_steps: episode length limit applied by the wrapper.

  Returns:
    A `GymEnvProblem` with continuous (non-discretized) rewards.
  """
  wrapper_kwargs = {
      "rl_env_max_episode_steps": max_episode_steps,
      "maxskip_env": False,
      "rendered_env": False,
      "rendered_env_resize_to": None,  # Do not resize frames
      "sticky_actions": False,
      "output_dtype": None,
  }
  wrapper_fn = functools.partial(gym_utils.gym_env_wrapper, **wrapper_kwargs)
  return gym_env_problem.GymEnvProblem(
      base_env_name=name,
      batch_size=1,
      env_wrapper_fn=wrapper_fn,
      discrete_rewards=False)
def _make_wrapped_env(self, name, max_episode_steps=2):
  """Builds a batch-2 `GymEnvProblem` whose base env is episode-length capped.

  Args:
    name: base gym env name to construct.
    max_episode_steps: episode length limit applied by the wrapper.

  Returns:
    A `GymEnvProblem` with continuous (non-discretized) rewards.
  """
  # Double-quoted strings for consistency with the sibling helpers in this
  # file (this block alone used single quotes).
  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper, **{
          "rl_env_max_episode_steps": max_episode_steps,
          "maxskip_env": False,
          "rendered_env": False,
          "rendered_env_resize_to": None,  # Do not resize frames
          "sticky_actions": False,
          "output_dtype": None,
      })
  return gym_env_problem.GymEnvProblem(
      base_env_name=name,
      batch_size=2,
      env_wrapper_fn=wrapper_fn,
      discrete_rewards=False)
def make_env(batch_size=1,
             env_problem_name="",
             resize=True,
             resized_height=105,
             resized_width=80,
             max_timestep="None",
             clip_rewards=True,
             parallelism=1,
             use_tpu=False,
             **env_kwargs):
  """Creates the env.

  Args:
    batch_size: number of parallel environments in the batch.
    env_problem_name: base gym env name to construct.
    resize: if falsy, return a plain `GymEnvProblem`; otherwise a rendered,
      resized one.
    resized_height: target frame height when resizing.
    resized_width: target frame width when resizing.
    max_timestep: episode length cap; arrives as a string (e.g. from a flag),
      anything non-numeric (like the default "None") means no cap.
    clip_rewards: if True, clip rewards to (-1, 1) and discretize them.
    parallelism: passed through to the env problem.
    use_tpu: if True, frames are emitted as int32 (TPU-friendly dtype).
    **env_kwargs: forwarded to the env problem's init.

  Returns:
    A `GymEnvProblem` or `RenderedEnvProblem`.
  """
  if clip_rewards:
    env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
  else:
    env_kwargs.update({"discrete_rewards": False})

  # No resizing needed, so let's be on the normal EnvProblem.
  if not resize:  # None or False
    return gym_env_problem.GymEnvProblem(
        base_env_name=env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  # Only conversion failures are expected here (the default is the literal
  # string "None"), so catch just what int() raises instead of a blanket
  # `Exception` that would hide real bugs.
  try:
    max_timestep = int(max_timestep)
  except (TypeError, ValueError):
    max_timestep = None

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper, **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (resized_height, resized_width),
          "sticky_actions": False,
          "output_dtype": np.int32 if use_tpu else None,
      })
  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=env_problem_name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=wrapper_fn,
      **env_kwargs)
def make_env(batch_size=8, **env_kwargs):
  """Creates the env.

  Args:
    batch_size: number of parallel environments in the batch.
    **env_kwargs: forwarded to the env problem's init.

  Returns:
    A `GymEnvProblem` or (when FLAGS.resize is set) a `RenderedEnvProblem`.
  """
  if FLAGS.clip_rewards:
    env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
  else:
    env_kwargs.update({"discrete_rewards": False})

  # TODO(afrozm): Should we leave out some cores?
  parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

  # No resizing needed, so let's be on the normal EnvProblem.
  if not FLAGS.resize:  # None or False
    return gym_env_problem.GymEnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  # The flag is a string; a non-numeric value (e.g. "None") means no episode
  # cap. Catch only conversion errors — a broad `except Exception` would also
  # hide genuine bugs.
  max_timestep = None
  try:
    max_timestep = int(FLAGS.max_timestep)
  except (TypeError, ValueError):
    pass

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper, **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (FLAGS.resized_height, FLAGS.resized_width),
          "sticky_actions": False,
          "output_dtype": onp.int32 if FLAGS.use_tpu else None,
      })
  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=FLAGS.env_problem_name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=wrapper_fn,
      **env_kwargs)
def test_setup(self):
  """Sanity-checks spaces and basic properties of a batched CartPole env."""
  ep = gym_env_problem.GymEnvProblem(
      base_env_name="CartPole-v0", batch_size=5)
  # Checks that environments were created and they are `batch_size` in number.
  ep.assert_common_preconditions()

  # Expectations on the observation space.
  observation_space = ep.observation_space
  self.assertIsInstance(observation_space, Box)
  self.assertEqual(observation_space.shape, (4,))
  self.assertEqual(observation_space.dtype, np.float32)

  # Expectations on the action space.
  action_space = ep.action_space
  # assertIsInstance (not assertTrue(isinstance(...))) for a better failure
  # message and consistency with the observation-space check above.
  self.assertIsInstance(action_space, Discrete)
  self.assertEqual(action_space.shape, ())
  self.assertEqual(action_space.dtype, np.int64)
  self.assertEqual(ep.num_actions, 2)

  # Reward range is infinite here.
  self.assertFalse(ep.is_reward_range_finite)
def make_env(name, batch_size, max_timestep, clip_rewards, rendered_env,
             resize_dims, **env_kwargs):
  """Creates the env.

  Args:
    name: base gym env name to construct.
    batch_size: number of parallel environments in the batch.
    max_timestep: episode length cap passed to the wrapper.
    clip_rewards: if True, clip rewards to (-1, 1) and discretize them.
    rendered_env: if False, return a plain `GymEnvProblem`; otherwise a
      rendered, resized one.
    resize_dims: (height, width) to resize rendered frames to.
    **env_kwargs: forwarded to the env problem's init.

  Returns:
    A `GymEnvProblem` or `RenderedEnvProblem`.
  """
  reward_kwargs = ({"reward_range": (-1, 1), "discrete_rewards": True}
                   if clip_rewards else {"discrete_rewards": False})
  env_kwargs.update(reward_kwargs)

  # TODO(afrozm): Should we leave out some cores?
  parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

  if not rendered_env:
    # No resizing needed, so let's be on the normal EnvProblem.
    return gym_env_problem.GymEnvProblem(
        base_env_name=name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  wrapper_kwargs = {
      "rl_env_max_episode_steps": max_timestep,
      "maxskip_env": True,
      "rendered_env": True,
      "rendered_env_resize_to": resize_dims,
      "sticky_actions": False,
      "output_dtype": onp.int32 if FLAGS.use_tpu else None,
  }
  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=functools.partial(gym_utils.gym_env_wrapper,
                                       **wrapper_kwargs),
      **env_kwargs)
def test_interaction_with_env(self):
  """Steps a batched env randomly; cross-checks the trajectory bookkeeping."""
  batch_size = 5
  reward_range = (-1, 1)
  ep = gym_env_problem.GymEnvProblem(
      base_env_name="KellyCoinflip-v0",
      batch_size=batch_size,
      reward_range=reward_range)

  # Resets all environments.
  ep.reset()

  # Play a few steps, tallying completions as we go.
  nsteps = 100
  completed_trajectories = 0
  completed_timesteps = 0

  # last_done_step[i] == j means the i^th env most recently got done at
  # step j (-1 means never).
  last_done_step = np.full(batch_size, -1)

  for step in range(nsteps):
    # One sampled action per environment, stacked into the batch the env
    # expects.
    actions = np.stack(
        [ep.action_space.sample() for _ in range(batch_size)])
    _, _, dones, _ = ep.step(actions)

    # Our running count of completed trajectories must agree with the env's.
    completed_trajectories += sum(dones)
    self.assertEqual(completed_trajectories,
                     len(ep.trajectories.completed_trajectories))

    # Reset exactly the environments that just finished.
    finished = env_problem_utils.done_indices(dones)
    ep.reset(indices=finished)

    # Nothing finished this step, move on.
    if finished.size == 0:
      continue

    # Time-steps each finished env took since its previous completion.
    completed_timesteps += sum(step + 1 - last_done_step[finished])
    last_done_step[finished] = step

  # This should also match the number of time-steps completed given by ep.
  completed_timesteps_ep = sum(
      ct.num_time_steps for ct in ep.trajectories.completed_trajectories)
  self.assertEqual(completed_timesteps, completed_timesteps_ep)

  # Reset the trajectories.
  ep.trajectories.reset_batch_trajectories()
  self.assertEqual(0, len(ep.trajectories.completed_trajectories))