def test_step_limit_with_vectorized_env(batch_size):
    """Check that ObservationLimit closes a vectorized env once its budget is used.

    The wrapper is given ``max_steps=3 * batch_size``. After one reset, one step
    and a second reset the env is expected to be closed (presumably each of
    these calls counts for ``batch_size`` observations -- confirm against
    ObservationLimit), and any further ``reset`` / ``step`` must raise
    ``gym.error.ClosedEnvironmentError``.
    """
    initial_value = 0
    goal = 10

    # One identical DummyEnvironment factory per slot of the batch.
    env_fns = [
        partial(DummyEnvironment,
                start=initial_value,
                target=goal,
                max_value=goal * 2)
        for _ in range(batch_size)
    ]
    env = ObservationLimit(SyncVectorEnv(env_fns), max_steps=3 * batch_size)

    env.reset()
    env.step(env.action_space.sample())
    env.reset()
    assert env.is_closed

    # Once closed, both entry points must refuse to run.
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())
예제 #2
0
    def test_not_setting_max_steps_per_episode_with_vector_env_raises_warning(
            self, env_name: str, batch_size: int):
        """Wrapping a vectorized env in ``self.EnvDataset`` must emit a UserWarning.

        A ``SyncVectorEnv`` made of ``batch_size`` copies of ``env_name`` is passed
        to ``self.EnvDataset`` without any extra argument, and the construction is
        expected to warn.

        Args:
            env_name: id forwarded to ``gym.make`` for every sub-environment.
            batch_size: number of sub-environments in the vectorized env.
        """
        from functools import partial
        from gym.vector import SyncVectorEnv

        env = SyncVectorEnv(
            [partial(gym.make, env_name) for _ in range(batch_size)])
        try:
            with pytest.warns(UserWarning):
                self.EnvDataset(env)
        finally:
            # Release the sub-environments even when the warning check fails.
            env.close()
예제 #3
0
def test_episode_limit_with_vectorized_env_dataset(batch_size):
    """Check EpisodeLimit + EnvDataset on top of a vectorized environment.

    The episode limit is given as ``max_episodes * batch_size`` so that it
    applies to the individual environments rather than to the batched env.
    Iterating over the dataset is then expected to stop after
    ``max_episodes * target`` batches, after which the env rejects both
    ``reset`` and further iteration.
    """
    goal = 10
    max_episodes = 2

    env_fns = [
        partial(DummyEnvironment, start=0, target=goal, max_value=10 * 2)
        for _ in range(batch_size)
    ]
    # TODO: For some reason the reverse wrapping order doesn't work!
    env = EnvDataset(
        EpisodeLimit(SyncVectorEnv(env_fns),
                     max_episodes=max_episodes * batch_size))

    for batch_index, observations in enumerate(env):
        print(batch_index, observations)
        env.send(np.ones(batch_size))

    assert batch_index == max_episodes * goal - 1

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()

    # Iterating again over the exhausted env must fail as well.
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for batch_index, observations in enumerate(env):
            print(batch_index, observations)
            env.send(np.ones(batch_size))
예제 #4
0
    def test_shapes_are_correct_env_with_continuous_obs_and_discrete_action_spaces_vector(
            self, _, n_envs, n_steps):
        """Check batch shapes/dtypes when stepping a vectorized CartPole-v0 loop.

        The loop is stepped at least once (``n_steps`` times when ``n_steps > 1``)
        and only the last returned batch is inspected: observations must have the
        vector env's observation shape, every other field the shape ``(n_envs,)``;
        actions and episode ids must be int64, everything else float32.
        """
        make_cartpole = lambda: gym.make("CartPole-v0")
        env = SyncVectorEnv([make_cartpole for _ in range(n_envs)])
        obs_shape = env.observation_space.shape

        loop = EnvironmentLoop(env, self._create_discrite_policy(env))
        batch = loop.step()
        for _extra_step in range(n_steps - 1):
            batch = loop.step()

        expected_shapes = {
            SampleBatch.OBSERVATIONS: obs_shape,
            SampleBatch.OBSERVATION_NEXTS: obs_shape,
        }
        self._assert_has_shapes(batch,
                                expected=expected_shapes,
                                default=(n_envs, ))

        expected_dtypes = {
            SampleBatch.ACTIONS: torch.int64,
            SampleBatch.EPS_ID: torch.int64,
        }
        self._assert_has_dtype(batch,
                               expected=expected_dtypes,
                               default=torch.float32)
예제 #5
0
    def test_shapes_are_correct_env_with_continuous_action_spaces_vector_sample(
            self, _, n_envs, n_steps):
        """Check batch shapes/dtypes when sampling a vectorized MountainCarContinuous loop.

        ``loop.sample()`` is called at least once (``n_steps`` times when
        ``n_steps > 1``) and only the last batch is inspected: observations and
        actions must carry a leading ``n_envs`` dimension in front of the single-env
        space shape, every other field has shape ``(n_envs,)``; episode ids must be
        int64 and everything else float32.
        """
        env_name = "MountainCarContinuous-v0"
        env = SyncVectorEnv(
            [lambda: gym.make(env_name) for _ in range(n_envs)])

        # Expected batched shapes are derived from the first sub-environment.
        single_env = env.envs[0]
        obs_shape = (n_envs, ) + single_env.observation_space.shape
        act_shape = (n_envs, ) + single_env.action_space.shape

        loop = EnvironmentLoop(env, self._create_continuouse_policy(env))
        batch = loop.sample()
        for _extra_sample in range(n_steps - 1):
            batch = loop.sample()

        expected_shapes = {
            SampleBatch.OBSERVATIONS: obs_shape,
            SampleBatch.OBSERVATION_NEXTS: obs_shape,
            SampleBatch.ACTIONS: act_shape,
        }
        self._assert_has_shapes(batch,
                                expected=expected_shapes,
                                default=(n_envs, ))
        self._assert_has_dtype(batch,
                               expected={SampleBatch.EPS_ID: torch.int64},
                               default=torch.float32)
예제 #6
0
    def create_env(self, env_kwargs):
        """Build the training and validation environments.

        Stores a single-env ``SyncVectorEnv`` in ``self.validation_env`` and
        returns an ``AsyncVectorEnv`` with ``self.num_processes`` workers. Every
        sub-environment is ``gym.make(**env_kwargs)`` wrapped in
        ``RewardCollector``.
        """
        def make_single_env():
            # Imported inside the factory so the import also runs wherever the
            # factory is invoked (presumably needed in the worker processes for
            # its side effects -- TODO confirm).
            import experiments.test_lstm_a2c
            return RewardCollector(gym.make(**env_kwargs))

        training_env = AsyncVectorEnv(
            [make_single_env for _ in range(self.num_processes)])
        self.validation_env = SyncVectorEnv([make_single_env])
        return training_env
예제 #7
0
def create_unreal_env(num_processes, kwargs):
    """Create the (training, validation) vectorized environments.

    Each sub-environment is ``gym.make(**kwargs)`` wrapped, innermost first, in
    RewardCollector, TransposeImage, ScaledFloatFrame and UnrealEnvBaseWrapper.

    Returns:
        A tuple ``(AsyncVectorEnv with num_processes envs, SyncVectorEnv with one env)``.
    """
    def build_env():
        wrapped = gym.make(**kwargs)
        # Applied in order, so the last entry ends up as the outermost wrapper.
        for wrapper in (RewardCollector, TransposeImage, ScaledFloatFrame,
                        UnrealEnvBaseWrapper):
            wrapped = wrapper(wrapped)
        return wrapped

    training_envs = AsyncVectorEnv([build_env for _ in range(num_processes)])
    validation_env = SyncVectorEnv([build_env])
    return training_envs, validation_env
예제 #8
0
def get_multi_task_env(
    batch_size: int = 1,
) -> Environment[RLSetting.Observations, RLSetting.Actions, RLSetting.Rewards]:
    """Build a seeded, vectorized multi-task CartPole environment.

    Every sub-environment is CartPole-v0 limited to 10 steps per episode and
    wrapped in a ``MultiTaskEnvironment`` whose task schedule changes the
    ``length`` attribute, adds the task id to the observations and draws a new
    random task on each reset. The batched env is then wrapped so that
    observations include the `done` signal and are exposed as ``RLSetting``
    objects.

    Args:
        batch_size: number of sub-environments in the ``SyncVectorEnv``. This
            argument used to be overwritten with 1 inside the function; it is
            now honoured (the default still yields a single environment).

    Returns:
        The wrapped vectorized environment, seeded with 123.
    """
    def single_env_fn() -> gym.Env:
        env = gym.make("CartPole-v0")
        env = TimeLimit(env, max_episode_steps=10)
        env = MultiTaskEnvironment(
            env,
            task_schedule={
                0: {"length": 0.1},
                100: {"length": 0.2},
                200: {"length": 0.3},
                300: {"length": 0.4},
                400: {"length": 0.5},
            },
            add_task_id_to_obs=True,
            new_random_task_on_reset=True,
        )
        return env

    env = SyncVectorEnv([single_env_fn for _ in range(batch_size)])
    from sequoia.common.gym_wrappers import AddDoneToObservation
    from sequoia.settings.active import TypedObjectsWrapper

    env = AddDoneToObservation(env)
    # Wrap the observations so they appear as though they are from the given setting.
    env = TypedObjectsWrapper(
        env,
        observations_type=RLSetting.Observations,
        actions_type=RLSetting.Actions,
        rewards_type=RLSetting.Rewards,
    )
    env.seed(123)
    return env
예제 #9
0
def test_task_sequence_is_reproducible(env: str):
    """Test that the multi-task setup is seeded correctly, i.e. that the task sequence
    is reproducible given the same seed.

    Several runs are performed with a freshly created vectorized env seeded with
    123. For every run the task id seen at the start of each episode and the
    episode length are recorded; all runs must produce the same sequence, and
    each run must visit more than one task.

    Args:
        env: which environment factory to use, "cartpole" or "monsterkong".

    Raises:
        AssertionError: if ``env`` is not one of the supported names, or if any
            of the checks fails.
    """
    if env == "cartpole":
        env_fn = env_fn_cartpole
    elif env == "monsterkong":
        env_fn = env_fn_monsterkong
    else:
        # Raised explicitly: a bare `assert False` would be stripped under `-O`.
        raise AssertionError(
            f"just testing on cartpole and monsterkong for now, but got env {env}")

    batch_size = 1

    first_results: List[Tuple[int, int]] = []
    n_runs = 5
    n_episodes_per_run = 10

    for run_number in range(n_runs):
        print(f"starting run {run_number} / {n_runs}")
        # For each 'run', we record the task sequence and how long each task lasted for.
        # Then, we want to check that each run was identical, for a given seed.
        # A separate name is used so the `env` string argument is not shadowed.
        vector_env = SyncVectorEnv([env_fn for _ in range(batch_size)])
        try:
            vector_env.seed(123)

            task_ids: List[int] = []
            task_lengths: List[int] = []
            for episode in range(n_episodes_per_run):
                print(f"Episode {episode} / {n_episodes_per_run}")
                obs = vector_env.reset()
                # The task id is read from the second entry of the observation.
                task_id: int = obs[1][0]
                task_length = 0
                done = False
                while not done:
                    obs, _, done_array, _ = vector_env.step(
                        vector_env.action_space.sample())
                    assert len(done_array) == 1
                    done = done_array[0]
                    task_length += 1
                task_ids.append(task_id)
                task_lengths.append(task_length)
        finally:
            # Each run creates its own env; release it before the next run.
            vector_env.close()

        task_ids_and_lengths = list(zip(task_ids, task_lengths))
        print(f"Task ids and length of each one: {task_ids_and_lengths}")

        assert len(
            set(task_ids)) > 1, "should have been more than just one task!"

        if not first_results:
            first_results = task_ids_and_lengths
        else:
            # Make sure that the results from this run are equivalent to the others with
            # the same seed:
            assert task_ids_and_lengths == first_results
예제 #10
0
def test_measure_RL_performance_batched_env():
    """Check the online metrics recorded by MeasureRLPerformanceWrapper on a batched env.

    Three DummyEnvironments (starting at 0, 1 and 2, all with target 5) are
    batched, wrapped in EnvDataset and then in MeasureRLPerformanceWrapper.
    After 100 iterations with the constant action 1, the recorded online
    performance must match one EpisodeMetrics entry per sub-environment at every
    multiple of ``target`` from ``target`` up to 100.
    """
    batch_size = 3
    target = 5
    start_values = list(range(batch_size))

    env_fns = [
        partial(DummyEnvironment,
                start=start_value,
                target=target,
                max_value=target * 2)
        for start_value in start_values
    ]
    env = MeasureRLPerformanceWrapper(EnvDataset(SyncVectorEnv(env_fns)))
    env.seed(123)

    for step, obs in enumerate(itertools.islice(env, 100)):
        print(f"step {step} obs: {obs}")
        # Always increment the counter.
        env.send(np.ones(batch_size))
        print(env.done_)
    assert step == 99

    from collections import defaultdict
    from sequoia.common.metrics import Metrics

    # FIXME: This test is a bit too complicated and hard to follow. The
    # expected metrics keep the batch entries synced-up for now, i.e. every
    # sub-environment contributes at the same steps regardless of its start.
    expected_metrics = defaultdict(Metrics)
    for metrics_step in range(target, 101, target):
        for _env_index in range(batch_size):
            expected_metrics[metrics_step] += EpisodeMetrics(
                n_samples=1,
                # ? FIXME: Actually understand why the reward is 10 here.
                mean_episode_reward=10.,
                mean_episode_length=target,
            )

    assert env.get_online_performance() == expected_metrics
예제 #11
0
    def create_env(self, env):
        """Build the training and validation environments from ``gym.make`` kwargs.

        Every sub-environment is ``gym.make(**env)`` wrapped in RewardCollector,
        a reward transform that scales rewards by 0.01, and an observation
        wrapper casting observations to float32. A single-env ``SyncVectorEnv``
        is stored in ``self.validation_environment``; an ``AsyncVectorEnv`` with
        ``self.num_processes`` sub-environments is returned.
        """
        class W(gym.ObservationWrapper):
            """Cast every observation to float32."""

            def observation(self, o):
                return o.astype(np.float32)

        # Keep the kwargs under their own name: `env` is reused for the
        # environment instance inside the factory below.
        env_kwargs = env

        def _thunk():
            base_env = RewardCollector(gym.make(**env_kwargs))
            scaled_env = gym.wrappers.TransformReward(base_env,
                                                      lambda r: 0.01 * r)
            return W(scaled_env)

        self.validation_environment = SyncVectorEnv([_thunk])
        return AsyncVectorEnv([_thunk] * self.num_processes)
def test_step_limit_with_vectorized_env_partial_final_batch(batch_size):
    """ In the case where the max number of observations isn't a multiple of
    the batch size, the env returns ceil(max_obs / batch_size) * batch_size
    observations in total.

    Here the limit is ``3 * batch_size + 1``: the env must still be open after
    one reset and two steps, and closed after one more reset.

    TODO: If we ever get to few-shot learning or something like that, we might
    have to care about this.
    """
    initial_value = 0
    goal = 10

    env_fns = [
        partial(DummyEnvironment,
                start=initial_value,
                target=goal,
                max_value=goal * 2)
        for _ in range(batch_size)
    ]
    env = ObservationLimit(SyncVectorEnv(env_fns),
                           max_steps=3 * batch_size + 1)

    env.reset()
    assert not env.is_closed

    for _ in range(2):
        env.step(env.action_space.sample())
    assert not env.is_closed

    # This last (partial) batch goes past the limit and closes the env.
    env.reset()
    assert env.is_closed

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())
예제 #13
0
    def _create_env_loop(self, env_name, n_envs=None, fetch_agent_info=None):
        """Create an EnvironmentLoop for ``env_name`` with a matching policy.

        Args:
            env_name: gym environment id. "MountainCarContinuous-v0" gets the
                continuous policy; "Taxi-v3" and any name containing "CartPole"
                get the discrete policy.
            n_envs: when None a plain ``gym.make`` env is used, otherwise a
                ``SyncVectorEnv`` with ``n_envs`` copies.
            fetch_agent_info: forwarded unchanged to ``EnvironmentLoop``.

        Raises:
            RuntimeError: if no policy is known for ``env_name`` (raised after
                the environment has been created).
        """
        if n_envs is not None:
            env = SyncVectorEnv(
                [lambda: gym.make(env_name) for _ in range(n_envs)])
        else:
            env = gym.make(env_name)

        # Pick the policy factory from the environment name.
        if env_name == "MountainCarContinuous-v0":
            policy_factory = self._create_continuouse_policy
        elif env_name == "Taxi-v3" or "CartPole" in env_name:
            policy_factory = self._create_discrite_policy
        else:
            raise RuntimeError("Unknown env", env_name)

        return EnvironmentLoop(env,
                               policy_factory(env),
                               fetch_agent_info=fetch_agent_info)
예제 #14
0
    def test_shapes_are_correct_env_with_discrete_obs_and_action_spaces_vector_env(
            self, _, n_envs, n_steps):
        """Check batch shapes/dtypes when stepping a vectorized Taxi-v3 loop.

        The loop is stepped at least once (``n_steps`` times when ``n_steps > 1``)
        and only the last batch is inspected: every field must have shape
        ``(n_envs,)``; rewards and dones must be float32, everything else int64.
        """
        make_taxi = lambda: gym.make("Taxi-v3")
        env = SyncVectorEnv([make_taxi for _ in range(n_envs)])
        loop = EnvironmentLoop(env, self._create_discrite_policy(env))

        batch = loop.step()
        for _extra_step in range(n_steps - 1):
            batch = loop.step()

        self._assert_has_shapes(batch, default=(n_envs, ))

        float_fields = {
            SampleBatch.REWARDS: torch.float32,
            SampleBatch.DONES: torch.float32,
        }
        self._assert_has_dtype(batch,
                               expected=float_fields,
                               default=torch.int64)
예제 #15
0
def test_episode_limit_with_vectorized_env(batch_size):
    """ Test that when adding the EpisodeLimit wrapper on top of a vectorized
    environment, the episode limit is with respect to each individual env rather
    than the batched env.

    With ``max_episodes=2 * batch_size``, two full rounds of 10 steps (one
    episode per sub-environment each) must be possible; afterwards the env must
    be closed and refuse further steps.
    """
    starting_values = [0] * batch_size
    targets = [10] * batch_size

    env_fns = [
        partial(DummyEnvironment, start=first, target=goal, max_value=10 * 2)
        for first, goal in zip(starting_values, targets)
    ]
    env = EpisodeLimit(SyncVectorEnv(env_fns), max_episodes=2 * batch_size)

    obs = env.reset()
    for _round in range(2):
        assert obs.tolist() == starting_values
        print("reset obs: ", obs)
        for i in range(10):
            print(i, obs)
            obs, reward, done, info = env.step(np.ones(batch_size))

        # All episodes end at step 10.
        assert all(done)
        # Because of how VectorEnvs work, the obs are the new 'reset' obs,
        # rather than the final obs in the episode.
        assert obs.tolist() == starting_values

    assert env.is_closed
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(np.ones(batch_size))
예제 #16
0
def test_reset_vectorenv_with_unfinished_episodes_raises_warning(batch_size):
    """ Test that resetting an EpisodeLimit-wrapped vectorized environment after
    only two steps (i.e. with episodes presumably still in progress, since each
    DummyEnvironment has a target of 10) emits a UserWarning.
    """
    initial_value = 0
    goal = 10

    env_fns = [
        partial(DummyEnvironment,
                start=initial_value,
                target=goal,
                max_value=10 * 2)
        for _ in range(batch_size)
    ]
    env = EpisodeLimit(SyncVectorEnv(env_fns), max_episodes=3 * batch_size)

    env.reset()
    for _ in range(2):
        env.step(env.action_space.sample())

    with pytest.warns(UserWarning):
        env.reset()
예제 #17
0
def test_space_with_tuple_observations(batch_size: int, n_workers: Optional[int]):
    """Check the batched spaces and step outputs of a vectorized multi-task Breakout.

    Each sub-environment is Breakout-v0 wrapped in a ``MultiTaskEnvironment``
    that adds the task id to the observations. The vectorized env must expose
    the expected batched and single ``Dict`` observation spaces, produce
    observations that belong to the batched space on reset and step, and return
    per-environment rewards, boolean dones and infos.

    ``n_workers`` is currently unused: a ``SyncVectorEnv`` is used while debugging.
    """
    def make_env():
        return MultiTaskEnvironment(
            gym.make("Breakout-v0"),
            add_task_id_to_obs=True,
            add_task_dict_to_info=True,
        )

    env_fns = [make_env] * batch_size

    # FIXME: debugging with SyncVectorEnv instead of BatchedVectorEnv /
    # AsyncVectorEnv.
    from gym.vector import SyncVectorEnv
    env = SyncVectorEnv(env_fns)
    env.seed(123)

    expected_batched_space = spaces.Dict(
        x=spaces.Box(0, 255, (batch_size, 210, 160, 3), np.uint8),
        task_labels=spaces.MultiDiscrete(np.ones(batch_size)),
    )
    assert env.observation_space == expected_batched_space

    expected_single_space = spaces.Dict(
        x=spaces.Box(0, 255, (210, 160, 3), np.uint8),
        task_labels=spaces.Discrete(1)
    )
    assert env.single_observation_space == expected_single_space

    reset_obs = env.reset()
    for key in ("x", "task_labels"):
        assert reset_obs[key].shape == env.observation_space[key].shape
    assert reset_obs in env.observation_space

    step_obs, rewards, done, info = env.step(env.action_space.sample())
    assert step_obs in env.observation_space

    assert len(rewards) == batch_size
    assert len(done) == batch_size
    assert all(isinstance(v, bool) for v in done.tolist()), [type(v) for v in done]
    assert len(info) == batch_size
예제 #18
0
def _build_env(env_builder: EnvBuilder, n_envs: int) -> gym.Env:
    if n_envs > 1:
        return SyncVectorEnv([env_builder for _ in range(n_envs)])
    else:
        return env_builder()
예제 #19
0
def main(cfg):
    """Train a PPO actor/critic pair on CartPole-v1 and optionally save the weights.

    Transitions are collected one step at a time from a single-env
    ``SyncVectorEnv`` and stored in ``Memory``. Every ``cfg.params.batch_size``
    steps the stored batch is used for ``cfg.params.epochs`` epochs of
    mini-batch updates with the clipped surrogate policy loss, a clipped value
    loss (weight 0.25) and an entropy bonus (weight 0.01). Both learning rates
    are decayed linearly with the number of updates performed. Metrics are sent
    to wandb.

    Args:
        cfg: configuration object; the function reads ``cfg.exp`` (seed,
            torch_deterministic, save_weights, model_dir) and ``cfg.params``
            (learning rates, batch sizes, total_timesteps, epochs, GAE and
            clipping coefficients, max_grad_norm).
    """
    random.seed(cfg.exp.seed)
    np.random.seed(cfg.exp.seed)
    torch.manual_seed(cfg.exp.seed)
    torch.backends.cudnn.deterministic = cfg.exp.torch_deterministic

    # so that the environment automatically resets
    env = SyncVectorEnv([
        lambda: RecordEpisodeStatistics(gym.make('CartPole-v1'))
    ])

    actor, critic = Actor(), Critic()
    actor_optim = Adam(actor.parameters(), eps=1e-5, lr=cfg.params.actor_lr)
    critic_optim = Adam(critic.parameters(), eps=1e-5, lr=cfg.params.critic_lr)
    memory = Memory(mini_batch_size=cfg.params.mini_batch_size, batch_size=cfg.params.batch_size)
    obs = env.reset()
    global_rewards = []

    NUM_UPDATES = (cfg.params.total_timesteps // cfg.params.batch_size) * cfg.params.epochs
    cur_timestep = 0

    def calc_factor(cur_timestep: int) -> float:
        """Calculates the factor to be multiplied with the learning rate to update it."""
        update_number = cur_timestep // cfg.params.batch_size
        total_updates = cfg.params.total_timesteps // cfg.params.batch_size
        fraction = 1.0 - update_number / total_updates
        return fraction

    actor_scheduler = LambdaLR(actor_optim, lr_lambda=calc_factor, verbose=True)
    critic_scheduler = LambdaLR(critic_optim, lr_lambda=calc_factor, verbose=True)

    while cur_timestep < cfg.params.total_timesteps:
        # keep playing the game
        obs = torch.as_tensor(obs, dtype=torch.float32)
        with torch.no_grad():
            dist = actor(obs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            value = critic(obs)
        action = action.cpu().numpy()
        value = value.cpu().numpy()
        log_prob = log_prob.cpu().numpy()
        obs_, reward, done, info = env.step(action)

        if done[0]:
            episode_reward = info[0]['episode']['r']
            # Record the episode before averaging, so the running mean is never
            # taken over an empty list and the printed and logged values agree.
            global_rewards.append(episode_reward)
            avg_reward = np.mean(global_rewards[-10:])
            tqdm.write(f'Reward: {episode_reward}, Avg Reward: {avg_reward:.3f}')
            wandb.log({'Avg_Reward': avg_reward, 'Reward': episode_reward})

        memory.remember(obs.squeeze(0).cpu().numpy(), action.item(), log_prob.item(), reward.item(), done.item(), value.item())
        obs = obs_
        cur_timestep += 1

        # if the current timestep is a multiple of the batch size, then we need to update the model
        if cur_timestep % cfg.params.batch_size == 0:
            for _ in tqdm(range(cfg.params.epochs), desc=f'Num updates: {cfg.params.epochs * (cur_timestep // cfg.params.batch_size)} / {NUM_UPDATES}'):
                # sample a batch from memory of experiences
                old_states, old_actions, old_log_probs, old_rewards, old_dones, old_values, batch_indices = memory.sample()
                old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32)
                old_actions = torch.tensor(old_actions, dtype=torch.float32)
                # advantages for the whole batch, computed using GAE
                advantage = calculate_advantage(old_rewards, old_values, old_dones, gae_gamma=cfg.params.gae_gamma, gae_lambda=cfg.params.gae_lambda)

                advantage = torch.tensor(advantage, dtype=torch.float32)
                old_values = torch.tensor(old_values, dtype=torch.float32)
                # Value targets must come from the raw advantages: computing
                # them after normalisation would give the critic wrong targets.
                batch_returns = old_values + advantage

                for mini_batch_index in batch_indices:
                    # remember: Normalization of advantage is done on mini batch, not the entire batch.
                    # A separate tensor is used so the raw advantages stay untouched.
                    mb_advantage = advantage[mini_batch_index]
                    mb_advantage = (mb_advantage - mb_advantage.mean()) / (mb_advantage.std() + 1e-8)

                    dist = actor(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0))
                    log_probs = dist.log_prob(old_actions[mini_batch_index]).squeeze(0)
                    entropy = dist.entropy().squeeze(0)

                    log_ratio = log_probs - old_log_probs[mini_batch_index]
                    ratio = torch.exp(log_ratio)

                    with torch.no_grad():
                        # approx_kl = ((ratio-1)-log_ratio).mean()
                        approx_kl = ((old_log_probs[mini_batch_index] - log_probs)**2).mean()
                        wandb.log({'Approx_KL': approx_kl})

                    # clipped surrogate objective
                    clipped_ratio = torch.clamp(ratio, 1 - cfg.params.actor_loss_clip, 1 + cfg.params.actor_loss_clip)
                    actor_loss = -torch.min(
                        ratio * mb_advantage,
                        clipped_ratio * mb_advantage
                    ).mean()

                    values = critic(torch.tensor(old_states[mini_batch_index], dtype=torch.float32).unsqueeze(0)).squeeze(-1)
                    mb_old_values = old_values[mini_batch_index]
                    returns = batch_returns[mini_batch_index]

                    # clipped value loss: the new value may only move
                    # `critic_loss_clip` away from the value stored at collection time
                    clipped_values = mb_old_values + torch.clamp(
                        values - mb_old_values, -cfg.params.critic_loss_clip, cfg.params.critic_loss_clip
                    )
                    critic_loss = torch.max(
                        (values - returns)**2,
                        (clipped_values - returns)**2
                    ).mean()

                    wandb.log({'Actor_Loss': actor_loss.item(), 'Critic_Loss': critic_loss.item(), 'Entropy': entropy.mean().item()})
                    loss = actor_loss + 0.25 * critic_loss - 0.01 * entropy.mean()
                    actor_optim.zero_grad()
                    critic_optim.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(actor.parameters(), cfg.params.max_grad_norm)
                    nn.utils.clip_grad_norm_(critic.parameters(), cfg.params.max_grad_norm)

                    actor_optim.step()
                    critic_optim.step()

            memory.reset()
            actor_scheduler.step(cur_timestep)
            critic_scheduler.step(cur_timestep)

            # Explained variance of the value predictions against the
            # (unnormalised) returns of the last sampled batch.
            y_pred, y_true = old_values.cpu().numpy(), batch_returns.cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
            wandb.log({'Explained_Var': explained_var})

    if cfg.exp.save_weights:
        torch.save(actor.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/actor.pth'))
        torch.save(critic.state_dict(), Path(f'{hydra.utils.get_original_cwd()}/{cfg.exp.model_dir}/critic.pth'))

    # Release the environment once training is over.
    env.close()