def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """Iterating through the (vectorized) env must eventually yield done=True.

    Builds the same wrapper stack used by the other tests in this file and
    walks up to 100 iterations, sampling random actions; at least one yielded
    observation must have `done` set for some env in the batch.
    """
    wrapped = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    wrapped = AddDoneToObservation(wrapped)
    wrapped = ConvertToFromTensors(wrapped)
    wrapped = EnvDataset(wrapped)

    encountered_done = False
    for step, observation in zip(range(100), wrapped):
        print(step, observation)
        # Reply with a random action so iteration can continue.
        _ = wrapped.send(wrapped.action_space.sample())
        if any(observation["done"]):
            encountered_done = True
            break
    assert encountered_done, "Never encountered done=True!"
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """Test the PolicyHead in a fully controlled environment.

    `FakeEnvironment` is configured so episode lengths are known exactly:
    env 0 has a first episode of length 5, every other env (and every later
    episode of env 0) has episodes of length 10. The policy-gradient function
    is monkeypatched to return `len(rewards)` (with a gradient hook into
    `log_probs`), so the loss at each end-of-episode step is predictable and
    the gradient-usage metrics can be asserted exactly.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.
    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back
        # into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    # FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the supported dtype spelling.
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The 'done' from the obs should always match the 'done' returned by
        # the previous call to `env.step`.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done,)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        # FIX: compare the int loop counter against an int literal (was
        # `step == 5.0`, which worked only via implicit int/float comparison).
        if step == 5:
            # Env 0 first episode, from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0

        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number
            # of episodes. This means that the gradient usage on the next time
            # any env reaches an end-of-episode will be one less than the
            # total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0

        elif step == 15:
            # Env 0 second episode, from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 10 -> 20
            # NOTE: At this point, both envs have reached the required number
            # of episodes. This means that the gradient usage on the next time
            # any env reaches an end-of-episode will be one less than the
            # total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step == 25:
            # Env 0 third episode, from steps 15 -> 25
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        else:
            # No episode ended on this step: no loss expected.
            assert loss.loss == 0.0, step
def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """ Test that when *iterating* through the env (active-dataloader style),
    when the episode ends, a non-zero loss is returned by the output head.
    """
    # Use a throwaway single env just to grab the (wrapped) obs/action/reward
    # spaces to construct the PolicyHead with.
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )
    # The actual (vectorized) env that will be iterated.
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        # observation_space=obs_space,
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # Seed so episode lengths (and hence which steps end episodes) are stable.
    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x = obs["x"]
        done = obs["done"]
        # No encoder in this test: the raw obs doubles as the representation.
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)
        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)
        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        # `send` replies to the iterator with actions and returns the rewards
        # for this step. NOTE: the order matters — the loss below is computed
        # for the observations *before* this step.
        rewards = env.send(actions)

        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        rewards = ContinualRLSetting.Rewards(rewards)
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        # A non-zero loss is expected iff at least one env finished an episode.
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.total_loss != 0.0
                non_zero_losses += 1
                break
        else:
            # `for`/`else`: runs only when no env was done this step.
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0

    # Over 100 steps of CartPole, at least one episode must have ended.
    assert non_zero_losses > 0
def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """Test that when *stepping* through the env, when the episode ends, a
    non-zero loss is returned by the output head.

    Same idea as `test_loss_is_nonzero_at_episode_end_iterate`, but drives the
    env with explicit `reset()`/`step()` calls and passes the observations
    through a small trainable encoder.
    """
    # Use a throwaway single env just to grab the (wrapped) obs/action/reward
    # spaces to construct the PolicyHead with.
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        # FIX: index the dict space with `obs_space["x"]` (attribute-style
        # `obs_space.x` is not supported by dict spaces), consistent with the
        # sibling test that builds the identical wrapper stack.
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
    head.train()

    # Seed so episode lengths (and hence which steps end episodes) are stable.
    env.seed(123)
    obs = env.reset()
    # obs = torch.as_tensor(obs, dtype=torch.float32)
    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    # Dummy encoder: CartPole observations have 4 features.
    encoder = nn.Linear(4, 4)
    encoder.train()

    for i in range(100):
        representations = encoder(obs["x"])
        observations = ContinualRLSetting.Observations(
            x=obs["x"],
            done=done,
            # info=info,
        )

        head_output = head.forward(observations, representations=representations)
        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        # NOTE: the loss is for the observations from *before* this step.
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        # A non-zero loss is expected iff at least one env finished an episode.
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.loss != 0.0
                non_zero_losses += 1
                break
        else:
            # `for`/`else`: runs only when no env was done this step.
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0

    # Over 100 steps of CartPole, at least one episode must have ended.
    assert non_zero_losses > 0