def test_step_limit_with_single_env_dataset():
    start = 0
    target = 10
    env = DummyEnvironment(start=start, target=target, max_value=10 * 2)
    env = EnvDataset(env)

    max_steps = 5

    env = ObservationLimit(env, max_steps=max_steps)
    env.seed(123)
    values = []
    for i, obs in zip(range(100), env):
        values.append(obs)
        _ = env.send(1)
    assert values == list(range(start, max_steps))

    assert env.is_closed

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()

    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())

    with pytest.raises(gym.error.ClosedEnvironmentError):
        for i, _ in zip(range(5), env):
            assert False
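
# A minimal sketch (not Sequoia's ObservationLimit) of the budget-and-close
# behaviour the test above checks: once `max_steps` observations have been
# produced, the wrapper closes the env and any further reset() or step()
# raises gym.error.ClosedEnvironmentError.  The class name and internals are
# illustrative assumptions; the real wrapper also supports the iterator/send
# API used in the test.
import gym


class ObservationBudget(gym.Wrapper):
    def __init__(self, env: gym.Env, max_steps: int):
        super().__init__(env)
        self.max_steps = max_steps
        self._observations = 0
        self.is_closed = False

    def _check_open(self):
        if self.is_closed:
            raise gym.error.ClosedEnvironmentError(
                f"Budget of {self.max_steps} observations is spent."
            )

    def _register(self, observation):
        # Count every observation handed out; close once the budget is spent.
        self._observations += 1
        if self._observations >= self.max_steps:
            self.close()
            self.is_closed = True
        return observation

    def reset(self, **kwargs):
        self._check_open()
        return self._register(self.env.reset(**kwargs))

    def step(self, action):
        self._check_open()
        observation, reward, done, info = self.env.step(action)
        return self._register(observation), reward, done, info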
Example #2
def test_iterating_with_send():
    env = DummyEnvironment(target=5)
    env = EnvDataset(env)
    env.seed(123)

    actions = [0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    expected_dones = [False, False, False, False, False, False, False, True]

    reset_obs = 0
    # obs = env.reset()
    # assert obs == reset_obs
    n_calls = 0

    for i, observation in enumerate(env):
        print(f"Step {i}: batch: {observation}")
        assert observation == expected_obs[i]

        action = actions[i]
        reward = env.send(action)
        assert reward == expected_rewards[i]
    # TODO: The episode ends as soon as 'done' is encountered, which means we
    # are never given the 'final' observation. Here, the DummyEnvironment sets
    # done=True once the state reaches target = 5. (The iteration protocol this
    # relies on is sketched right after this test.)
    assert observation == 4
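
# The TODO above describes the iteration protocol that EnvDataset-style
# wrappers follow: each iteration yields an observation, the caller answers
# with send(action) and gets the reward back, and iteration stops as soon as
# the underlying env reports done, so the terminal observation is never
# yielded.  A minimal generator-based sketch of that protocol, assuming a
# plain gym-style reset()/step() env; this is an illustration, not
# EnvDataset's actual implementation.
def iterate_with_send(env):
    observation = env.reset()
    done = False
    while not done:
        action = yield observation          # delivered to the caller by next()/for
        observation, reward, done, _ = env.step(action)
        yield reward                        # returned to the caller by send(action)

# Usage mirrors the loop in the test above (pick_action is a placeholder):
#     gen = iterate_with_send(env)
#     for obs in gen:
#         reward = gen.send(pick_action(obs))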
Example #3
def test_step_normally_works_fine():
    env = DummyEnvironment()
    env = EnvDataset(env)
    env.seed(123)

    obs = env.reset()
    assert obs == 0

    obs, reward, done, info = env.step(0)
    assert (obs, reward, done, info) == (0, 5, False, {})
    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (1, 4, False, {})
    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (2, 3, False, {})
    obs, reward, done, info = env.step(2)
    assert (obs, reward, done, info) == (1, 4, False, {})
    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (2, 3, False, {})
    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (3, 2, False, {})
    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (4, 1, False, {})

    obs, reward, done, info = env.step(1)
    assert (obs, reward, done, info) == (5, 0, True, {})

    env.reset()
    obs, reward, done, info = env.step(0)
    assert (obs, reward, done, info) == (0, 5, False, {})
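
# The step-by-step assertions above pin down the dynamics these tests rely
# on: action 0 leaves the counter unchanged, action 1 increments it, action 2
# decrements it, the reward is target - state, and the episode ends when the
# counter reaches the target.  A minimal sketch of an env with those
# semantics, assuming the old 4-tuple gym step API; this is an illustrative
# reconstruction, not the actual DummyEnvironment (in particular, how
# max_value ends or truncates an episode is an assumption here).
import gym
from gym import spaces


class CountingEnv(gym.Env):
    def __init__(self, start: int = 0, target: int = 5, max_value: int = 10):
        self.start = start
        self.target = target
        self.max_value = max_value
        self.state = start
        self.action_space = spaces.Discrete(3)            # 0: stay, 1: +1, 2: -1
        self.observation_space = spaces.Discrete(max_value + 1)

    def reset(self):
        self.state = self.start
        return self.state

    def step(self, action):
        self.state += {0: 0, 1: 1, 2: -1}[int(action)]
        self.state = max(0, min(self.state, self.max_value))
        reward = self.target - self.state
        done = self.state == self.target
        return self.state, reward, done, {}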
Example #4
def test_doesnt_raise_error_when_action_sent():
    env = DummyEnvironment()
    with EnvDataset(env) as env:
        env.reset()
        env.seed(123)

        for i, obs in zip(range(5), env):
            assert obs in env.observation_space
            reward = env.send(env.action_space.sample())
Example #5
def test_raise_error_when_missing_action():
    env = DummyEnvironment()
    with EnvDataset(env) as env:
        env.reset()
        env.seed(123)

        with pytest.raises(RuntimeError):
            for i, observation in zip(range(5), env):
                pass
Example #6
def test_iterating_with_policy():
    env = DummyEnvironment()
    env = PolicyEnv(env)
    env.seed(123)

    actions = [0, 1, 1, 2, 1, 1, 1, 1]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    expected_dones = [False, False, False, False, False, False, False, True]
    
    # Expect the transitions to have this form.
    expected_transitions = list(zip(expected_obs[:-1], actions, expected_obs[1:]))

    reset_obs = 0
    # obs = env.reset()
    # assert obs == reset_obs

    n_calls = 0
    def custom_policy(observations, action_space):
        # Deterministic policy used for testing purposes.
        nonlocal n_calls
        action = actions[n_calls]
        n_calls += 1
        return action

    n_expected_transitions = len(actions)
    env.set_policy(custom_policy)
    actual_transitions: List[StateTransition] = []

    i = 0
    for i, batch in enumerate(env):
        print(f"Step {i}: batch: {batch}")
        state_transition, reward = batch
        actual_transitions.append(state_transition)

        observation, action, next_observation = state_transition.as_tuple()

        assert observation == expected_obs[i]
        assert next_observation == expected_obs[i+1]
        assert action == actions[i]
        assert reward == expected_rewards[i]

    assert i == n_expected_transitions - 1
    assert len(actual_transitions) == n_expected_transitions
    assert [v.as_tuple() for v in actual_transitions] == expected_transitions
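
# The loop above consumes batches of the form (StateTransition, reward), with
# StateTransition.as_tuple() unpacking to (observation, action,
# next_observation), and the policy being called as policy(observation,
# action_space).  A minimal sketch of that data structure and of a
# policy-driven iteration, assuming a gym-style env; the dataclass and
# generator below are illustrative, not Sequoia's StateTransition / PolicyEnv.
from dataclasses import astuple, dataclass
from typing import Any


@dataclass
class SimpleStateTransition:
    observation: Any
    action: Any
    next_observation: Any

    def as_tuple(self):
        return astuple(self)


def iterate_with_policy(env, policy):
    """Yield (transition, reward) pairs, letting `policy` pick each action."""
    observation = env.reset()
    done = False
    while not done:
        action = policy(observation, env.action_space)
        next_observation, reward, done, _ = env.step(action)
        yield SimpleStateTransition(observation, action, next_observation), reward
        observation = next_observation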
Example #7
def test_measure_RL_performance_basics():
    env = DummyEnvironment(start=0, target=5, max_value=10)

    from sequoia.settings.active.continual.continual_rl_setting import \
        ContinualRLSetting

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        obs = env.reset()
        print(f"Episode {episode}, obs: {obs}")
        done = False
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)
    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics
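
# The expected metrics above are keyed by the cumulative step count at which
# each episode ended, with one entry per finished episode.  A minimal sketch
# of that bookkeeping as a gym.Wrapper; the class below only illustrates what
# gets recorded and is not the actual MeasureRLPerformanceWrapper (which
# returns EpisodeMetrics objects and also supports the iterator API).
import gym


class OnlinePerformanceRecorder(gym.Wrapper):
    def __init__(self, env: gym.Env):
        super().__init__(env)
        self._total_steps = 0
        self._episode_steps = 0
        self._episode_reward = 0.0
        self._performance = {}  # cumulative step -> stats of the episode ending there

    def reset(self, **kwargs):
        self._episode_steps = 0
        self._episode_reward = 0.0
        return self.env.reset(**kwargs)

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        self._total_steps += 1
        self._episode_steps += 1
        self._episode_reward += reward
        if done:
            self._performance[self._total_steps] = {
                "mean_episode_reward": self._episode_reward,
                "mean_episode_length": self._episode_steps,
            }
        return observation, reward, done, info

    def get_online_performance(self):
        return dict(self._performance)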
Example #8
def test_measure_RL_performance_iteration():
    env = DummyEnvironment(start=0, target=5, max_value=10)
    from gym.wrappers import TimeLimit
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)

    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)
    all_episode_rewards = []
    all_episode_steps = []

    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."

        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    from itertools import accumulate

    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
            all_episode_steps, accumulate(all_episode_steps),
            all_episode_rewards):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )

    assert env.get_online_performance() == expected_metrics