def check_env(env):
    """
    Check that the environment is (almost) gym-compatible and reproducible.

    Reproducible means: after reseeding with the same seed and resetting,
    taking the same action yields the same first observation.

    Parameters
    ----------
    env: gym.env or rlberry env
        Environment that we want to check.
    """
    # Sample one action up front, then replay it after two identical reseeds.
    action = env.action_space.sample()

    def _first_obs():
        # Reseed with a fixed seed, reset, and return the first observation.
        safe_reseed(env, Seeder(42))
        env.reset()
        return env.step(action)[0]

    obs_a = _first_obs()
    obs_b = _first_obs()

    failure_msg = "The environment does not seem to be reproducible"
    if hasattr(obs_a, "__len__"):
        # Array-like observations: compare element-wise.
        assert np.all(np.array(obs_a) == np.array(obs_b)), failure_msg
    else:
        assert obs_a == obs_b, failure_msg

    # Modified check suite from gym
    check_gym_env(env)
def test_seeder_initialized_from_seeder():
    """
    Check that Seeder(seed_seq) respawns seed_seq in the constructor.
    """
    base = Seeder(43)
    # Building a Seeder from another Seeder must respawn it, so the
    # derived stream differs from the base one despite the equal seed.
    derived = Seeder(Seeder(43))
    draws_base = base.rng.integers(100, size=1000)
    draws_derived = derived.rng.integers(100, size=1000)
    assert (draws_base != draws_derived).sum() > 5
def test_seeder_reseeding():
    """
    Check that reseeding with a Seeder instance works properly.
    """
    # Start from two identical seeders...
    source = Seeder(43)
    target = Seeder(43)
    # ...then reseed `target` from `source`: their streams must diverge.
    target.reseed(source)
    draws_source = source.rng.integers(100, size=1000)
    draws_target = target.rng.integers(100, size=1000)
    assert (draws_source != draws_target).sum() > 5
def test_rescale_reward():
    """Rewards seen through RescaleRewardWrapper stay inside the target range."""
    tol = 1e-14  # numerical tolerance on the interval bounds
    rng = Seeder(123).rng
    n_states, n_actions = 5, 2
    for _ in range(10):
        # Generate a random MDP with a normalized transition kernel.
        rewards = rng.uniform(0.0, 1.0, (n_states, n_actions))
        transitions = rng.uniform(0.0, 1.0, (n_states, n_actions, n_states))
        for ss in range(n_states):
            for aa in range(n_actions):
                transitions[ss, aa, :] /= transitions[ss, aa, :].sum()
        env = FiniteMDP(rewards, transitions)

        wrapped = RescaleRewardWrapper(env, (-10, 10))

        # Check rewards obtained through sample().
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(), wrapped.action_space.sample()
            )
            assert -10 - tol <= reward <= 10 + tol

        # Check rewards obtained through step().
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert -10 - tol <= reward <= 10 + tol
def test_gym_safe_reseed(env_name):
    """Two envs reseeded from the same Seeder instance diverge, while a
    fresh seeder with the same seed reproduces the first trajectory."""
    seeder = Seeder(123)
    seeder_aux = Seeder(123)
    envs = [gym.make(env_name) for _ in range(3)]
    safe_reseed(envs[0], seeder)
    safe_reseed(envs[1], seeder)  # consumes `seeder` again -> new seed
    safe_reseed(envs[2], seeder_aux)  # same seed as envs[0]
    trajectories = [get_env_trajectory(env, 500) for env in envs]
    assert not compare_trajectories(trajectories[0], trajectories[1])
    assert compare_trajectories(trajectories[0], trajectories[2])
def test_seeder_basic():
    """Different seeds give different streams; equal seeds reproduce both
    the stream and the states of spawned children."""
    seeder1 = Seeder(43)
    seeder2 = Seeder(44)
    seeder3 = Seeder(44)
    draws1 = seeder1.rng.integers(100, size=1000)
    draws2 = seeder2.rng.integers(100, size=1000)
    draws3 = seeder3.rng.integers(100, size=1000)
    assert (draws1 != draws2).sum() > 5  # different seeds -> different draws
    assert (draws2 != draws3).sum() == 0  # same seed -> identical draws
    state2 = seeder2.spawn(1).generate_state(1)[0]
    state3 = seeder3.spawn(1).generate_state(1)[0]
    state1 = seeder1.spawn(1).generate_state(1)[0]
    assert state2 == state3  # spawning is reproducible for equal seeds
    assert state1 != state3
def __init__(self, n):
    """
    Parameters
    ----------
    n : int
        Number of elements in the space.
    """
    # Reject invalid sizes before delegating to gym.
    assert n >= 0, "The number of elements in Discrete must be >= 0"
    # Attach the rlberry seeder; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.Discrete.__init__(self, n)
def test_env_seeding(env_name):
    """Envs reseeded with equal seeds replay the same trajectory; different
    seeds give different trajectories."""

    def _make_reseeded(seed):
        # Build the env and reseed it from a fresh Seeder.
        env = gym_make(env_name)
        env.reseed(Seeder(seed))
        return env

    env1 = _make_reseeded(123)
    env2 = _make_reseeded(456)
    env3 = _make_reseeded(123)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)
        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_rescale_wrapper_seeding(ModelClass):
    """Reseeding works through RescaleRewardWrapper: equal seeds match,
    different seeds diverge."""

    def _make_wrapped(seed):
        # Wrap a fresh model and reseed it.
        wrapped = RescaleRewardWrapper(ModelClass(), (0, 1))
        wrapped.reseed(Seeder(seed))
        return wrapped

    env1 = _make_wrapped(123)
    env2 = _make_wrapped(456)
    env3 = _make_wrapped(123)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)
        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
def test_adversarial():
    """AdversarialBandit replays a fixed reward table deterministically.

    Arm 1 (second column) pays 1 for the first 500 steps and 2 for the
    next 500, so the empirical mean over 1000 pulls is exactly 1.5.
    """
    r1 = np.concatenate((2 * np.ones((500, 1)), np.ones((500, 1))), axis=1)
    r2 = np.concatenate((np.ones((500, 1)), 2 * np.ones((500, 1))), axis=1)
    rewards = np.concatenate((r1, r2))
    env = AdversarialBandit(rewards=rewards)
    safe_reseed(env, Seeder(TEST_SEED))
    # `_` instead of an unused loop variable.
    sample = [env.step(1)[1] for _ in range(1000)]
    assert np.abs(np.mean(sample) - 1.5) < 1e-10
def test_copy_reseeding(env_name):
    """A deep copy reseeded afresh must not replay the original trajectory."""
    env = gym_make(env_name)
    env.reseed(Seeder(123))
    env_copy = deepcopy(env)
    env_copy.reseed()  # no argument: spawn a new seeder -> new stream
    if deepcopy(env).is_online():
        original_traj = get_env_trajectory(env, 500)
        copy_traj = get_env_trajectory(env_copy, 500)
        assert not compare_trajectories(original_traj, copy_traj)
def test_double_wrapper_copy_reseeding(ModelClass):
    """Reseeding a deep copy decouples it even through two nested wrappers."""
    env = Wrapper(Wrapper(ModelClass()))
    env.reseed(Seeder(123))
    env_copy = deepcopy(env)
    env_copy.reseed()  # spawn a fresh seeder for the copy
    if deepcopy(env).is_online():
        original_traj = get_env_trajectory(env, 500)
        copy_traj = get_env_trajectory(env_copy, 500)
        assert not compare_trajectories(original_traj, copy_traj)
def test_gym_copy_reseeding():
    """A reseeded deep copy of a wrapped gym env follows a new trajectory."""
    seeder = Seeder(123)
    if not _GYM_INSTALLED:
        return  # guard clause: nothing to test without gym
    env = Wrapper(gym.make("Acrobot-v1"))
    env.reseed(seeder)
    env_copy = deepcopy(env)
    env_copy.reseed()
    if deepcopy(env).is_online():
        original_traj = get_env_trajectory(env, 500)
        copy_traj = get_env_trajectory(env_copy, 500)
        assert not compare_trajectories(original_traj, copy_traj)
def test_gym_copy_reseeding_2():
    """Reseeding a deep copy also works through nested wrappers."""
    seeder = Seeder(123)
    if not _GYM_INSTALLED:
        return  # guard clause: nothing to test without gym
    gym_env = gym.make("Acrobot-v1")
    # nested wrapping
    env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1))
    env.reseed(seeder)
    env_copy = deepcopy(env)
    env_copy.reseed()
    if deepcopy(env).is_online():
        original_traj = get_env_trajectory(env, 500)
        copy_traj = get_env_trajectory(env_copy, 500)
        assert not compare_trajectories(original_traj, copy_traj)
def test_seeder_spawning():
    """
    Check that seeders spawned from a parent produce streams that differ
    from the parent's stream.

    Note: the previous docstring was copy-pasted from
    test_seeder_initialized_from_seeder and did not describe this test.
    """
    seeder1 = Seeder(43)
    seeder2 = seeder1.spawn()
    seeder3 = seeder2.spawn()
    print(seeder1)
    print(seeder2)
    print(seeder3)
    data1 = seeder1.rng.integers(100, size=1000)
    data2 = seeder2.rng.integers(100, size=1000)
    # Parent and child streams must differ in many positions.
    assert (data1 != data2).sum() > 5
def reseed(self, seed_seq=None):
    """
    Get a new random number generator for the wrapper and propagate it to
    the wrapped environment and to both spaces.

    Parameters
    ----------
    seed_seq : np.random.SeedSequence, rlberry.seeding.Seeder or int, default : None
        Seed material for the new Seeder. If None, the new seeder is
        spawned from the current one.
    """
    # self.seeder
    if seed_seq is None:
        self.seeder = self.seeder.spawn()
    else:
        self.seeder = Seeder(seed_seq)
    # seed gym.Env that is not a rlberry Model
    if not isinstance(self.env, Model):
        # get a seed for gym environment; spaces are reseeded below.
        safe_reseed(self.env, self.seeder, reseed_spaces=False)
    # seed rlberry Model
    else:
        self.env.reseed(self.seeder)
    # Spaces are reseeded here in both branches, from the same seeder.
    safe_reseed(self.observation_space, self.seeder)
    safe_reseed(self.action_space, self.seeder)
def test_mbqvi(S, A):
    """MBQVI estimators are exact on MDPs with deterministic transitions."""
    rng = Seeder(123).rng
    for _ in range(5):
        # Random rewards; each (state, action) jumps to one random next state.
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for state in range(S):
            for action in range(A):
                next_state = rng.integers(0, S)
                P[state, action, next_state] = 1
        env = FiniteMDP(R, P)
        # With deterministic transitions, one sample per pair is enough
        # for the estimators to be exact.
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
def reseed(self, seed_seq=None):
    """
    Get new random number generator for the model.

    Parameters
    ----------
    seed_seq : np.random.SeedSequence, rlberry.seeding.Seeder or int, default : None
        Seed sequence from which to spawn the random number generator.
        If None, generate random seed.
        If int, use as entropy for SeedSequence.
        If seeder, use seeder.seed_seq
    """
    # Spawn from the current seeder when no seed material is given,
    # otherwise build a brand-new Seeder from it.
    self.seeder = self.seeder.spawn() if seed_seq is None else Seeder(seed_seq)
    # Propagate the new seed sequence to both spaces.
    for space in (self.observation_space, self.action_space):
        space.reseed(self.seeder.seed_seq)
def __init__(self, spaces):
    """Tuple space carrying an rlberry Seeder for reseedable sampling."""
    # Attach the seeder first; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.Tuple.__init__(self, spaces)
def __init__(self, n):
    """MultiBinary space carrying an rlberry Seeder for reseedable sampling."""
    # Attach the seeder first; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.MultiBinary.__init__(self, n)
def __init__(self, spaces=None, **spaces_kwargs):
    """Dict space carrying an rlberry Seeder for reseedable sampling."""
    # Attach the seeder first; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.Dict.__init__(self, spaces, **spaces_kwargs)
def __init__(self, low, high, shape=None, dtype=np.float64):
    """Box space carrying an rlberry Seeder for reseedable sampling."""
    # Attach the seeder first; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.Box.__init__(self, low, high, shape=shape, dtype=dtype)
def __init__(self, nvec, dtype=np.int64):
    """MultiDiscrete space carrying an rlberry Seeder for reseedable sampling."""
    # Attach the seeder first; independent of the gym initialization.
    self.seeder = Seeder()
    gym.spaces.MultiDiscrete.__init__(self, nvec, dtype=dtype)
from rlberry.seeding import safe_reseed from rlberry.seeding import Seeder import numpy as np from rlberry.utils.check_gym_env import check_gym_env seeder = Seeder(42) def check_env(env): """ Check that the environment is (almost) gym-compatible and that it is reproducible in the sense that it returns the same states when given the same seed. Parameters ---------- env: gym.env or rlberry env Environment that we want to check. """ # Small reproducibility test action = env.action_space.sample() safe_reseed(env, Seeder(42)) env.reset() a = env.step(action)[0] safe_reseed(env, Seeder(42)) env.reset() b = env.step(action)[0] if hasattr(a, "__len__"): assert np.all(np.array(a) == np.array( b)), "The environment does not seem to be reproducible" else:
A demo of twinrooms environment
===============================
Illustration of TwinRooms environment

.. video:: ../../video_plot_twinrooms.mp4
   :width: 600
"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_twinrooms.jpg'
from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms
from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.wrappers.discretize_state import DiscretizeStateWrapper
from rlberry.seeding import Seeder

seeder = Seeder(123)

env = TwinRooms()
# Discretize the continuous state space so tabular MBQVI can be applied.
env = DiscretizeStateWrapper(env, n_bins=20)
env.reseed(seeder)
horizon = 20
agent = MBQVIAgent(env, n_samples=10, gamma=1.0, horizon=horizon)
# Seed agent and env from the same seeder for a reproducible demo.
agent.reseed(seeder)
agent.fit()

state = env.reset()
env.enable_rendering()
# Roll out the learned policy for a few steps.
for ii in range(10):
    action = agent.policy(state)
    ns, rr, _, _ = env.step(action)
    state = ns
def __init__( self, agent_class, train_env, fit_budget=None, eval_env=None, init_kwargs=None, fit_kwargs=None, eval_kwargs=None, agent_name=None, n_fit=4, output_dir=None, parallelization="thread", max_workers=None, mp_context="spawn", worker_logging_level="INFO", seed=None, enable_tensorboard=False, outdir_id_style="timestamp", default_writer_kwargs=None, init_kwargs_per_instance=None, ): # agent_class should only be None when the constructor is called # by the class method AgentManager.load(), since the agent class # will be loaded. if agent_class is None: return None # Must only happen when load() method is called. self.seeder = Seeder(seed) self.eval_seeder = self.seeder.spawn(1) self.agent_name = agent_name if agent_name is None: self.agent_name = agent_class.name # Check train_env and eval_env assert isinstance( train_env, Tuple ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" if eval_env is not None: assert isinstance( eval_env, Tuple ), "[AgentManager]train_env must be Tuple (constructor, kwargs)" # check options assert outdir_id_style in [None, "unique", "timestamp"] # create oject identifier self.unique_id = metadata_utils.get_unique_id(self) self.timestamp_id = metadata_utils.get_readable_id(self) # Agent class self.agent_class = agent_class # Train env self.train_env = train_env # Check eval_env if eval_env is None: eval_env = deepcopy(train_env) self._eval_env = eval_env # check kwargs fit_kwargs = fit_kwargs or {} eval_kwargs = eval_kwargs or {} # params base_init_kwargs = init_kwargs or {} self._base_init_kwargs = deepcopy(base_init_kwargs) self.fit_kwargs = deepcopy(fit_kwargs) self.eval_kwargs = deepcopy(eval_kwargs) self.n_fit = n_fit self.parallelization = parallelization self.max_workers = max_workers self.mp_context = mp_context self.worker_logging_level = worker_logging_level self.output_dir = output_dir if fit_budget is not None: self.fit_budget = fit_budget else: try: self.fit_budget = self.fit_kwargs.pop("fit_budget") except 
KeyError: raise ValueError("[AgentManager] fit_budget missing in __init__().") # extra params per instance if init_kwargs_per_instance is not None: assert len(init_kwargs_per_instance) == n_fit init_kwargs_per_instance = deepcopy(init_kwargs_per_instance) self.init_kwargs_per_instance = init_kwargs_per_instance or [ dict() for _ in range(n_fit) ] # output dir if output_dir is None: output_dir_ = metadata_utils.RLBERRY_TEMP_DATA_DIR else: output_dir_ = output_dir self.output_dir_ = Path(output_dir_) / "manager_data" if outdir_id_style == "unique": self.output_dir_ = self.output_dir_ / ( self.agent_name + "_" + self.unique_id ) elif outdir_id_style == "timestamp": self.output_dir_ = self.output_dir_ / ( self.agent_name + "_" + self.timestamp_id ) # Create list of writers for each agent that will be trained # 'default' will keep Agent's use of DefaultWriter. self.writers = [("default", None) for _ in range(n_fit)] # Parameters to setup Agent's DefaultWriter self.agent_default_writer_kwargs = [ dict( name=self.agent_name, log_interval=3, tensorboard_kwargs=None, execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx), ) for idx in range(n_fit) ] self.tensorboard_dir = None if enable_tensorboard: self.tensorboard_dir = self.output_dir_ / "tensorboard" for idx, params in enumerate(self.agent_default_writer_kwargs): params["tensorboard_kwargs"] = dict( log_dir=self.tensorboard_dir / str(idx) ) # Update DefaultWriter according to user's settings. 
default_writer_kwargs = default_writer_kwargs or {} if default_writer_kwargs: logger.warning( "(Re)defining the following DefaultWriter" f" parameters in AgentManager: {list(default_writer_kwargs.keys())}" ) for ii in range(n_fit): self.agent_default_writer_kwargs[ii].update(default_writer_kwargs) # agent handlers and init kwargs self._set_init_kwargs() # init_kwargs for each agent self.agent_handlers = None self._reset_agent_handlers() self.default_writer_data = None self.best_hyperparams = None # optuna study and database self.optuna_study = None self.db_filename = None self.optuna_storage_url = None # rlberry version for reproducibility purpose self.rlberry_version = rlberry.__version__
def __init__(self):
    """Initialize an empty model: spaces unset, unbounded rewards, fresh seeder."""
    # Spaces are expected to be filled in by subclasses.
    self.observation_space = None
    self.action_space = None
    # Default reward range: unbounded in both directions.
    self.reward_range: tuple = (-np.inf, np.inf)
    # random number generator
    self.seeder = Seeder()
def test_cor_normal():
    """Median of corrupted-normal samples from arm 1 stays near its mean.

    The median is used (not the mean) because a 10% corruption proportion
    can move the mean arbitrarily but leaves the median robust.
    """
    env = CorruptedNormalBandit(means=[0, 1], cor_prop=0.1)
    safe_reseed(env, Seeder(TEST_SEED))
    # `_` instead of an unused loop variable.
    sample = [env.step(1)[1] for _ in range(1000)]
    assert np.abs(np.median(sample) - 1) < 0.5
def test_normal():
    """Empirical mean of 1000 samples from arm 1 concentrates near 1."""
    env = NormalBandit(means=[0, 1])
    safe_reseed(env, Seeder(TEST_SEED))
    # `_` instead of an unused loop variable.
    sample = [env.step(1)[1] for _ in range(1000)]
    assert np.abs(np.mean(sample) - 1) < 0.1
def test_bernoulli():
    """Empirical mean of 1000 Bernoulli(0.95) pulls concentrates near 0.95."""
    env = BernoulliBandit(p=[0.05, 0.95])
    safe_reseed(env, Seeder(TEST_SEED))
    # `_` instead of an unused loop variable.
    sample = [env.step(1)[1] for _ in range(1000)]
    assert np.abs(np.mean(sample) - 0.95) < 0.1