Example #1
def test_sanity_check_cartpole_done_vector():
    """TODO: Sanity check, make sure that cartpole has done=True at some point
    when using a BatchedEnv.
    """
    batch_size = 5

    env = make_batched_env("CartPole-v0",
                           batch_size=batch_size,
                           wrappers=[PixelObservationWrapper])
    env = AddDoneToObservation(env)
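    # AddDoneToObservation adds the env's `done` flag to each observation
    # (the assertion below checks it against the `done` returned by `step`).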
    # env = AddInfoToObservation(env)

    # env = BatchedVectorEnv([
    #     partial(gym.make, "CartPole-v0") for i in range(batch_size)
    # ])
    obs = env.reset()

    for i in range(100):
        obs, rewards, done, info = env.step(env.action_space.sample())
        assert all(obs[1] == done), i
        if any(done):
            break
    else:
        assert False, "Should have had at least one done=True over the 100 steps!"
Example #2
def test_done_is_sometimes_True_when_iterating_through_env(batch_size: int):
    """ Test that when *iterating* through the env, done is sometimes 'True'.
    """
    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=True)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)
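    # EnvDataset turns the env into an iterable: observations are yielded by the
    # loop below, and actions are fed back in through env.send().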
    for i, obs in zip(range(100), env):
        print(i, obs)
        _ = env.send(env.action_space.sample())
        if any(obs["done"]):
            break
    else:
        assert False, "Never encountered done=True!"
Example #3
def env_fn():
    # FIXME: Using the DummyEnvironment for now since it's easier to debug with.
    # env = gym.make(env_name)
    env = DummyEnvironment()
    env = AddDoneToObservation(env)
    env = TimeLimit(env, max_episode_steps=max_steps_per_episode)
    return env
Example #4
def get_multi_task_env(
    batch_size: int = 1,
) -> Environment[RLSetting.Observations, RLSetting.Actions, RLSetting.Rewards]:
    def single_env_fn() -> gym.Env:
        env = gym.make("CartPole-v0")
        env = TimeLimit(env, max_episode_steps=10)
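        # Short 10-step episodes mean frequent resets; with new_random_task_on_reset=True
        # below, a new task is sampled on every reset.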
        env = MultiTaskEnvironment(
            env,
            task_schedule={
                0: {"length": 0.1},
                100: {"length": 0.2},
                200: {"length": 0.3},
                300: {"length": 0.4},
                400: {"length": 0.5},
            },
            add_task_id_to_obs=True,
            new_random_task_on_reset=True,
        )
        return env

    env = SyncVectorEnv([single_env_fn for _ in range(batch_size)])
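    # SyncVectorEnv steps its environment copies sequentially in the current process.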
    from sequoia.common.gym_wrappers import AddDoneToObservation
    from sequoia.settings.active import TypedObjectsWrapper

    env = AddDoneToObservation(env)
    # Wrap the observations so they appear as though they are from the given setting.
    env = TypedObjectsWrapper(
        env,
        observations_type=RLSetting.Observations,
        actions_type=RLSetting.Actions,
        rewards_type=RLSetting.Rewards,
    )
    env.seed(123)
    return env
Example #5
    def _make_env_dataloader(
        self,
        env_factory: Callable[[], gym.Env],
        batch_size: Optional[int],
        num_workers: Optional[int] = None,
        seed: Optional[int] = None,
        max_steps: Optional[int] = None,
        max_episodes: Optional[int] = None,
    ) -> GymDataLoader:
        """ Helper function for creating a (possibly vectorized) environment.
        
        """
        logger.debug(
            f"batch_size: {batch_size}, num_workers: {num_workers}, seed: {seed}"
        )

        env: Union[gym.Env, gym.vector.VectorEnv]
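        # With no batch_size, create a single (non-vectorized) environment;
        # otherwise build a batched env with the requested number of workers.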
        if batch_size is None:
            env = env_factory()
        else:
            env = make_batched_env(
                env_factory,
                batch_size=batch_size,
                num_workers=num_workers,
                # TODO: Still debugging shared memory + custom spaces (e.g. Sparse).
                shared_memory=False,
            )

        ## Apply the "post-batch" wrappers:
        # from sequoia.common.gym_wrappers import ConvertToFromTensors
        # TODO: Only the BaselineMethod requires this; we should enable it only
        # from the BaselineMethod and leave it 'off' by default.
        if self.add_done_to_observations:
            env = AddDoneToObservation(env)
        # # Convert the samples to tensors and move them to the right device.
        # env = ConvertToFromTensors(env)
        # env = ConvertToFromTensors(env, device=self.config.device)
        # Add a wrapper that converts numpy arrays / etc to Observations/Rewards
        # and from Actions objects to numpy arrays.
        env = TypedObjectsWrapper(
            env,
            observations_type=self.Observations,
            rewards_type=self.Rewards,
            actions_type=self.Actions,
        )
        # Create an IterableDataset from the env using the EnvDataset wrapper.
        dataset = EnvDataset(env, max_steps=max_steps, max_episodes=max_episodes)

        # Create a GymDataLoader for the EnvDataset.
        env_dataloader = GymDataLoader(dataset)

        if batch_size and seed:
            # Seed each environment with its own seed (based on the base seed).
            env.seed([seed + i for i in range(env_dataloader.num_envs)])
        else:
            env.seed(seed)

        return env_dataloader
Example #6
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """ TODO: Test out the PolicyHead in a very controlled environment, where we
    know exactly the lengths of each episode.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
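    # Env 0 ends its first episode after 5 steps, every other env after 10 steps;
    # all subsequent episodes last 10 steps (see new_episode_length above).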
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
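    # With min_episodes_before_update=1, the head should produce a non-zero loss
    # as soon as any single environment finishes an episode.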
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.

    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)
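    # With the mocked policy gradient, the loss reported when an episode ends is
    # exactly that episode's length, which the numeric assertions below rely on.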

    batch_size = env.batch_size

    obs = env.reset()
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The done from the obs should always be the same as the 'done' from the 'step' function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations, representations=representations, actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5:
            # Env 0 first episode from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, all envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 15:
            # Env 0 second episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 10 -> 20
            # NOTE: At this point, all envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step == 25:
            # Env 0 third episode from steps 15 -> 25
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        else:
            assert loss.loss == 0.0, step
Example #7
def test_loss_is_nonzero_at_episode_end_iterate(batch_size: int):
    """ Test that when *iterating* through the env (active-dataloader style),
    when the episode ends, a non-zero loss is returned by the output head.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)

        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        # observation_space=obs_space,
        input_space=obs_space["x"],
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )

    env.seed(123)
    non_zero_losses = 0

    for i, obs in zip(range(100), env):
        print(i, obs)
        x = obs["x"]
        done = obs["done"]
        representations = x
        assert isinstance(x, Tensor)
        assert isinstance(done, Tensor)
        observations = ContinualRLSetting.Observations(
            x=x,
            done=done,
            # info=info,
        )
        head_output = head.forward(observations, representations=representations)

        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        rewards = env.send(actions)

        # print(f"Step {i}, obs: {obs}, done: {done}")
        assert isinstance(representations, Tensor)
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        rewards = ContinualRLSetting.Rewards(rewards)
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.total_loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss.total_loss == 0.0

    assert non_zero_losses > 0
Example #8
def test_loss_is_nonzero_at_episode_end(batch_size: int):
    """ Test that when stepping through the env, when the episode ends, a
    non-zero loss is returned by the output head.
    """
    with gym.make("CartPole-v0") as temp_env:
        temp_env = AddDoneToObservation(temp_env)
        obs_space = temp_env.observation_space
        action_space = temp_env.action_space
        reward_space = getattr(
            temp_env, "reward_space", spaces.Box(*temp_env.reward_range, shape=())
        )

    env = gym.vector.make("CartPole-v0", num_envs=batch_size, asynchronous=False)
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    head = PolicyHead(
        input_space=obs_space.x,
        action_space=action_space,
        reward_space=reward_space,
        hparams=PolicyHead.HParams(accumulate_losses_before_backward=False),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(head.parameters(), lr=1e-3)
    head.train()

    env.seed(123)
    obs = env.reset()

    # obs = torch.as_tensor(obs, dtype=torch.float32)

    done = torch.zeros(batch_size, dtype=bool)
    info = np.array([{} for _ in range(batch_size)])
    loss = None

    non_zero_losses = 0

    encoder = nn.Linear(4, 4)
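    # CartPole observations have 4 features, so this tiny 4 -> 4 linear layer
    # stands in for a real encoder.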
    encoder.train()

    for i in range(100):
        representations = encoder(obs["x"])

        observations = ContinualRLSetting.Observations(
            x=obs["x"],
            done=done,
            # info=info,
        )
        head_output = head.forward(observations, representations=representations)
        actions = head_output.actions.numpy().tolist()
        # actions = np.zeros(batch_size, dtype=int).tolist()

        obs, rewards, done, info = env.step(actions)
        done = torch.as_tensor(done, dtype=bool)
        rewards = ContinualRLSetting.Rewards(rewards)
        assert len(info) == batch_size

        print(f"Step {i}, obs: {obs}, done: {done}, info: {info}")

        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=head_output,
        )
        loss = head.get_loss(forward_pass, actions=head_output, rewards=rewards)
        print("loss:", loss)

        assert observations.done is not None
        for env_index, env_is_done in enumerate(observations.done):
            if env_is_done:
                print(f"Episode ended for env {env_index} at step {i}")
                assert loss.loss != 0.0
                non_zero_losses += 1
                break
        else:
            print(f"No episode ended on step {i}, expecting no loss.")
            assert loss is None or loss.loss == 0.0

    assert non_zero_losses > 0