def test_step_limit_with_single_env_dataset(env_name: str):
    """`ObservationLimit` must close the env after `max_steps` observations.

    Once closed, `reset()`, `step()` and further iteration must all raise
    `gym.error.ClosedEnvironmentError`.

    NOTE(review): the original body called `gym.make(env_name)` and then
    immediately discarded the result (replacing it with a DummyEnvironment),
    leaking an un-closed env. The dead call has been removed; the `env_name`
    parameter is kept so the (presumably parametrized) fixture still works.
    """
    start = 0
    target = 10
    env = DummyEnvironment(start=start, target=target, max_value=10 * 2)
    env = EnvDataset(env)
    max_steps = 5
    env = ObservationLimit(env, max_steps=max_steps)
    env.seed(123)

    # The zip(range(100), env) bound is generous; ObservationLimit should cut
    # the iteration off after `max_steps` observations.
    values = []
    for _, obs in zip(range(100), env):
        values.append(obs)
        _ = env.send(1)

    assert values == list(range(start, max_steps))
    assert env.is_closed

    # Every further interaction with the closed env must raise.
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for _ in zip(range(5), env):
            assert False, "iterating a closed env should raise, not yield"
def test_iterating_with_send():
    """Iterating an `EnvDataset` while `send()`-ing actions yields the
    expected observation/reward sequence for the DummyEnvironment.

    NOTE(review): removed dead locals from the original (`n_calls`,
    `reset_obs`, and the unused `expected_dones` list — kept below as a
    comment since it documents the expected episode boundary).
    """
    env = DummyEnvironment(target=5)
    env = EnvDataset(env)
    env.seed(123)

    actions = [0, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0]
    expected_obs = [0, 0, 1, 2, 1, 2, 3, 4, 5]
    expected_rewards = [5, 4, 3, 4, 3, 2, 1, 0]
    # expected_dones = [False] * 7 + [True]: the episode ends on the 8th step.

    for i, observation in enumerate(env):
        print(f"Step {i}: batch: {observation}")
        assert observation == expected_obs[i]
        action = actions[i]
        reward = env.send(action)
        assert reward == expected_rewards[i]

    # TODO: The episode will end as soon as 'done' is encountered, which means
    # that we will never be given the 'final' observation. In this case, the
    # DummyEnvironment will set done=True when the state is state = target = 5
    # in this case.
    assert observation == 4
def test_doesnt_raise_error_when_action_sent():
    """Sending an action for every yielded observation must not raise."""
    base_env = DummyEnvironment()
    with EnvDataset(base_env) as env:
        env.reset()
        env.seed(123)
        # Take a handful of steps; each observation should be valid and each
        # send() should complete without error.
        for _, observation in zip(range(5), env):
            assert observation in env.observation_space
            env.send(env.action_space.sample())
def test_measure_RL_performance_iteration():
    """`MeasureRLPerformanceWrapper` records one `EpisodeMetrics` entry per
    episode, keyed by the cumulative step count, with a `TimeLimit` wrapper
    capping episode length.

    NOTE(review): the original used `accumulate` without a visible import in
    this chunk; the function-local import below makes the test self-contained
    (harmless if `accumulate` is also imported at file level).
    """
    from itertools import accumulate

    from gym.wrappers import TimeLimit

    env = DummyEnvironment(start=0, target=5, max_value=10)
    max_episode_steps = 50
    env = EnvDataset(env)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []
    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
            assert step <= max_episode_steps, "shouldn't be able to iterate longer than that."
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    # Metrics are keyed by the cumulative step index at which each episode
    # ended, hence the accumulate() over per-episode step counts.
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics
def test_measure_RL_performance_iteration_without_time_limit():
    """`MeasureRLPerformanceWrapper` records one `EpisodeMetrics` entry per
    episode (keyed by cumulative step count) when no `TimeLimit` is applied.

    BUG (fixed): this function was defined with the exact same name as
    `test_measure_RL_performance_iteration` above, so the second definition
    shadowed the first and pytest never ran it. Renamed to make both run.
    Also removed the unused `ContinualRLSetting` import — its only use was
    the commented-out `TypedObjectsWrapper` line.
    """
    from itertools import accumulate

    env = DummyEnvironment(start=0, target=5, max_value=10)
    env = EnvDataset(env)
    # env = TypedObjectsWrapper(env, observations_type=ContinualRLSetting.Observations, actions_type=ContinualRLSetting.Actions, rewards_type=ContinualRLSetting.Rewards)
    env = MeasureRLPerformanceWrapper(env)
    env.seed(123)

    all_episode_rewards = []
    all_episode_steps = []
    for episode in range(5):
        episode_steps = 0
        episode_reward = 0
        for step, obs in enumerate(env):
            print(f"Episode {episode}, obs: {obs}")
            action = env.action_space.sample()
            reward = env.send(action)
            episode_reward += reward
            episode_steps += 1
            # print(obs, reward, done, info)
        all_episode_steps.append(episode_steps)
        all_episode_rewards.append(episode_reward)

    # Metrics are keyed by the cumulative step index at which each episode
    # ended, hence the accumulate() over per-episode step counts.
    expected_metrics = {}
    for episode_steps, cumul_step, episode_reward in zip(
        all_episode_steps, accumulate(all_episode_steps), all_episode_rewards
    ):
        expected_metrics[cumul_step] = EpisodeMetrics(
            n_samples=1,
            mean_episode_reward=episode_reward,
            mean_episode_length=episode_steps,
        )
    assert env.get_online_performance() == expected_metrics