def test_sanity_check_cartpole_done_vector():
    """Sanity check: make sure that CartPole has done=True at some point
    when using a BatchedEnv.
    """
    batch_size = 5
    starting_values = [i for i in range(batch_size)]
    targets = [10 for i in range(batch_size)]

    env = make_batched_env(
        "CartPole-v0", batch_size=batch_size, wrappers=[PixelObservationWrapper]
    )
    env = AddDoneToObservation(env)
    # env = AddInfoToObservation(env)
    # env = BatchedVectorEnv([
    #     partial(gym.make, "CartPole-v0") for i in range(batch_size)
    # ])
    obs = env.reset()
    for i in range(100):
        obs, rewards, done, info = env.step(env.action_space.sample())
        # The 'done' entry of the observation should match the 'done' returned by step().
        assert all(obs[1] == done), i
        if any(done):
            break
    else:
        assert False, "Should have had at least one done=True over the 100 steps!"
def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """Test that when *iterating* through the env, 'done' is sometimes True."""
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)
    for i, obs in zip(range(100), env):
        print(i, obs)
        _ = env.send(env.action_space.sample())
        if any(obs["done"]):
            break
    else:
        assert False, "Never encountered done=True!"
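# The `for i, obs in zip(range(100), env): ... env.send(...)` pattern above follows the
# "active" iteration protocol used by EnvDataset: the iterator yields an observation and
# expects an action to be sent back before producing the next one. Below is a
# stripped-down, standalone sketch of that protocol using a plain Python generator
# (illustrative only, not Sequoia code):
def toy_active_iterator(n_steps: int = 5):
    action = None
    for step in range(n_steps):
        # Yield the current "observation" and receive the next "action" via send().
        observation = {"step": step, "last_action": action}
        action = yield observation


iterator = toy_active_iterator()
obs = next(iterator)  # First observation.
while True:
    try:
        obs = iterator.send("some-action")  # Send an action, get the next observation.
    except StopIteration:
        break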
def env_fn():
    # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
    # env = gym.make(env_name)
    env = DummyEnvironment()
    env = AddDoneToObservation(env)
    env = TimeLimit(env, max_episode_steps=max_steps_per_episode)
    return env
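# A minimal usage sketch for a factory like `env_fn` above, using CartPole instead of
# the DummyEnvironment. This is purely illustrative (not code from the repo): it only
# shows how the factory's free variable (`max_steps_per_episode`) is typically bound
# and how the resulting env is stepped.
def make_cartpole_env(max_episode_steps: int = 10) -> gym.Env:
    env = gym.make("CartPole-v0")
    env = AddDoneToObservation(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env


env = make_cartpole_env()
obs = env.reset()
for _ in range(20):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()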
def get_multi_task_env(
    batch_size: int = 1,
) -> Environment[RLSetting.Observations, RLSetting.Actions, RLSetting.Rewards]:
    def single_env_fn() -> gym.Env:
        env = gym.make("CartPole-v0")
        env = TimeLimit(env, max_episode_steps=10)
        env = MultiTaskEnvironment(
            env,
            task_schedule={
                0: {"length": 0.1},
                100: {"length": 0.2},
                200: {"length": 0.3},
                300: {"length": 0.4},
                400: {"length": 0.5},
            },
            add_task_id_to_obs=True,
            new_random_task_on_reset=True,
        )
        return env

    # NOTE: The batch size is currently forced to 1, regardless of the argument.
    batch_size = 1
    env = SyncVectorEnv([single_env_fn for _ in range(batch_size)])

    from sequoia.common.gym_wrappers import AddDoneToObservation
    from sequoia.settings.active import TypedObjectsWrapper

    env = AddDoneToObservation(env)
    # Wrap the observations so they appear as though they come from the given setting.
    env = TypedObjectsWrapper(
        env,
        observations_type=RLSetting.Observations,
        actions_type=RLSetting.Actions,
        rewards_type=RLSetting.Rewards,
    )
    env.seed(123)
    return env
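# Rough usage sketch for `get_multi_task_env` (illustrative only). With
# `add_task_id_to_obs=True` and `new_random_task_on_reset=True`, the observation
# returned on reset is expected to carry the id of the (randomly chosen) current task
# alongside the usual fields; the exact fields depend on RLSetting.Observations.
env = get_multi_task_env(batch_size=1)
obs = env.reset()
print(obs)  # Expected to contain 'x', 'done', and a task id entry.
env.close()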
def _make_env_dataloader(
    self,
    env_factory: Callable[[], gym.Env],
    batch_size: Optional[int],
    num_workers: Optional[int] = None,
    seed: Optional[int] = None,
    max_steps: Optional[int] = None,
    max_episodes: Optional[int] = None,
) -> GymDataLoader:
    """Helper function for creating a (possibly vectorized) environment."""
    logger.debug(f"batch_size: {batch_size}, num_workers: {num_workers}, seed: {seed}")

    env: Union[gym.Env, gym.vector.VectorEnv]
    if batch_size is None:
        env = env_factory()
    else:
        env = make_batched_env(
            env_factory,
            batch_size=batch_size,
            num_workers=num_workers,
            # TODO: Still debugging shared memory + custom spaces (e.g. Sparse).
            shared_memory=False,
        )

    ## Apply the "post-batch" wrappers:
    # from sequoia.common.gym_wrappers import ConvertToFromTensors
    # TODO: Only the BaselineMethod requires this; we should enable it only
    # from the BaselineMethod, and leave it 'off' by default.
    if self.add_done_to_observations:
        env = AddDoneToObservation(env)
    # # Convert the samples to tensors and move them to the right device.
    # env = ConvertToFromTensors(env)
    # env = ConvertToFromTensors(env, device=self.config.device)

    # Add a wrapper that converts numpy arrays / etc. to Observations/Rewards
    # and from Actions objects to numpy arrays.
    env = TypedObjectsWrapper(
        env,
        observations_type=self.Observations,
        rewards_type=self.Rewards,
        actions_type=self.Actions,
    )
    # Create an IterableDataset from the env using the EnvDataset wrapper.
    dataset = EnvDataset(env, max_steps=max_steps, max_episodes=max_episodes)
    # Create a GymDataLoader for the EnvDataset.
    env_dataloader = GymDataLoader(dataset)

    if batch_size and seed:
        # Seed each environment with its own seed (based on the base seed).
        env.seed([seed + i for i in range(env_dataloader.num_envs)])
    else:
        env.seed(seed)

    return env_dataloader
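# Standalone illustration of the per-environment seeding done at the end of
# `_make_env_dataloader`: each sub-environment of a vectorized env gets its own seed
# derived from the base seed. (Plain gym.vector is used here; the batch size and seed
# values are arbitrary, not taken from the repo.)
import gym

base_seed = 123
vec_env = gym.vector.make("CartPole-v0", num_envs=4, asynchronous=False)
vec_env.seed([base_seed + i for i in range(vec_env.num_envs)])
obs = vec_env.reset()
vec_env.close()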
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """Test the PolicyHead in a controlled environment, where we know exactly
    the length of each episode.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.
    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back
        # into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The 'done' from the obs should always be the same as the 'done'
        # returned by the step() function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5:
            # Env 0: first episode, from steps 0 -> 5.
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0

        elif step == 10:
            # Envs [1:batch_size]: first episode, from steps 0 -> 10.
            # NOTE: At this point, both envs have reached the required number
            # of episodes. This means that the gradient usage on the next time
            # any env reaches an end-of-episode will be one less than the total
            # number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0

        elif step == 15:
            # Env 0: second episode, from steps 5 -> 15.
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step == 20:
            # Envs [1:batch_size]: second episode, from steps 10 -> 20.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step == 25:
            # Env 0: third episode, from steps 15 -> 25.
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above.
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above.
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        else:
            assert loss.loss == 0.0, step
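# Standalone sanity check (not part of the test above) of the trick used in
# `mock_policy_gradient`: `(log_probs - log_probs.clone()) + 1` always evaluates to
# ones, so the mocked loss is exactly `len(rewards)`, while `log_probs` stays
# connected to the autograd graph (backward() populates `log_probs.grad`, albeit
# with zeros).
import torch

log_probs = torch.tensor([-0.3, -1.2, -0.7], requires_grad=True)
episode_length = 3
loss = episode_length * ((log_probs - log_probs.clone()) + 1).mean()
assert loss.item() == episode_length
loss.backward()
assert log_probs.grad is not None  # The graph connection exists; the values are 0.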
def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """Test that when *iterating* through the env (active-dataloader style),
    a non-zero loss is returned by the output head when an episode ends.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        # observation_space=obs_space,
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )

    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x = obs["x"]
        done = obs["done"]
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)
        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()
        rewards = env.send(actions)

        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        rewards = ContinualRLSetting.Rewards(rewards)
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.total_loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0

    assert non_zero_losses > 0
def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """Test that when stepping through the env, a non-zero loss is returned by
    the output head when an episode ends.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
    head.train()

    env.seed(123)
    obs = env.reset()
    # obs = torch.as_tensor(obs, dtype=torch.float32)
    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    encoder = nn.Linear(4, 4)
    encoder.train()

    for i in range(100):
        representations = encoder(obs["x"])
        observations = ContinualRLSetting.Observations(
            x=obs["x"],
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0

    assert non_zero_losses > 0