Example #1
def check_env(env):
    """
    Check that the environment is (almost) gym-compatible and that it is reproducible
    in the sense that it returns the same states when given the same seed.

    Parameters
    ----------
    env: gym.env or rlberry env
        Environment that we want to check.
    """
    # Small reproducibility test
    action = env.action_space.sample()
    safe_reseed(env, Seeder(42))
    env.reset()
    a = env.step(action)[0]

    safe_reseed(env, Seeder(42))
    env.reset()
    b = env.step(action)[0]
    if hasattr(a, "__len__"):
        assert np.all(
            np.array(a) == np.array(b)
        ), "The environment does not seem to be reproducible"
    else:
        assert a == b, "The environment does not seem to be reproducible"

    # Modified check suite from gym
    check_gym_env(env)
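
A minimal usage sketch for check_env follows, reusing the random FiniteMDP construction from Example #4; the FiniteMDP import path is an assumption and may differ across rlberry versions.

# Hedged usage sketch: the FiniteMDP import path is assumed, not taken from this snippet.
import numpy as np
from rlberry.seeding import Seeder
from rlberry.envs.finite import FiniteMDP  # assumed import path

rng = Seeder(123).rng
S, A = 5, 2
R = rng.uniform(0.0, 1.0, (S, A))
P = rng.uniform(0.0, 1.0, (S, A, S))
P /= P.sum(axis=-1, keepdims=True)  # normalize transition probabilities per (state, action)
check_env(FiniteMDP(R, P))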
Example #2
def test_seeder_initialized_from_seeder():
    """
    Check that Seeder(seed_seq) respawns seed_seq in the constructor.
    """
    seeder1 = Seeder(43)
    seeder_temp = Seeder(43)
    seeder2 = Seeder(seeder_temp)

    data1 = seeder1.rng.integers(100, size=1000)
    data2 = seeder2.rng.integers(100, size=1000)
    assert (data1 != data2).sum() > 5
Example #3
def test_seeder_reseeding():
    """
    Check that reseeding with a Seeder instance works properly.
    """
    # seeders 1 and 2 are identical
    seeder1 = Seeder(43)
    seeder2 = Seeder(43)

    # reseed seeder 2 using seeder 1
    seeder2.reseed(seeder1)

    data1 = seeder1.rng.integers(100, size=1000)
    data2 = seeder2.rng.integers(100, size=1000)
    assert (data1 != data2).sum() > 5
Example #4
def test_rescale_reward():
    # tolerance
    tol = 1e-14

    rng = Seeder(123).rng

    for _ in range(10):
        # generate random MDP
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test
        wrapped = RescaleRewardWrapper(env, (-10, 10))
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol
Example #5
def test_gym_safe_reseed(env_name):
    seeder = Seeder(123)
    seeder_aux = Seeder(123)

    env1 = gym.make(env_name)
    env2 = gym.make(env_name)
    env3 = gym.make(env_name)

    safe_reseed(env1, seeder)
    safe_reseed(env2, seeder)
    safe_reseed(env3, seeder_aux)

    traj1 = get_env_trajectory(env1, 500)
    traj2 = get_env_trajectory(env2, 500)
    traj3 = get_env_trajectory(env3, 500)
    assert not compare_trajectories(traj1, traj2)
    assert compare_trajectories(traj1, traj3)
Example #6
def test_seeder_basic():
    seeder1 = Seeder(43)
    data1 = seeder1.rng.integers(100, size=1000)

    seeder2 = Seeder(44)
    data2 = seeder2.rng.integers(100, size=1000)

    seeder3 = Seeder(44)
    data3 = seeder3.rng.integers(100, size=1000)

    assert (data1 != data2).sum() > 5
    assert (data2 != data3).sum() == 0
    assert (
        seeder2.spawn(1).generate_state(1)[0] == seeder3.spawn(1).generate_state(1)[0]
    )
    assert (
        seeder1.spawn(1).generate_state(1)[0] != seeder3.spawn(1).generate_state(1)[0]
    )
Example #7
    def __init__(self, n):
        """
        Parameters
        ----------
        n : int
            number of elements in the space
        """
        assert n >= 0, "The number of elements in Discrete must be >= 0"
        gym.spaces.Discrete.__init__(self, n)
        self.seeder = Seeder()
Example #8
def test_env_seeding(env_name):
    seeder1 = Seeder(123)
    env1 = gym_make(env_name)
    env1.reseed(seeder1)

    seeder2 = Seeder(456)
    env2 = gym_make(env_name)
    env2.reseed(seeder2)

    seeder3 = Seeder(123)
    env3 = gym_make(env_name)
    env3.reseed(seeder3)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
Example #9
def test_rescale_wrapper_seeding(ModelClass):
    env1 = RescaleRewardWrapper(ModelClass(), (0, 1))
    seeder = Seeder(123)
    env1.reseed(seeder)

    env2 = RescaleRewardWrapper(ModelClass(), (0, 1))
    seeder = Seeder(456)
    env2.reseed(seeder)

    env3 = RescaleRewardWrapper(ModelClass(), (0, 1))
    seeder = Seeder(123)
    env3.reseed(seeder)

    if deepcopy(env1).is_online():
        traj1 = get_env_trajectory(env1, 500)
        traj2 = get_env_trajectory(env2, 500)
        traj3 = get_env_trajectory(env3, 500)

        assert not compare_trajectories(traj1, traj2)
        assert compare_trajectories(traj1, traj3)
Example #10
def test_adversarial():
    r1 = np.concatenate((2 * np.ones((500, 1)), np.ones((500, 1))), axis=1)

    r2 = np.concatenate((np.ones((500, 1)), 2 * np.ones((500, 1))), axis=1)

    rewards = np.concatenate((r1, r2))

    env = AdversarialBandit(rewards=rewards)
    safe_reseed(env, Seeder(TEST_SEED))

    sample = [env.step(1)[1] for f in range(1000)]
    assert np.abs(np.mean(sample) - 1.5) < 1e-10
Example #11
def test_copy_reseeding(env_name):
    seeder = Seeder(123)
    env = gym_make(env_name)
    env.reseed(seeder)

    c_env = deepcopy(env)
    c_env.reseed()

    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
Example #12
def test_double_wrapper_copy_reseeding(ModelClass):
    env = Wrapper(Wrapper(ModelClass()))
    seeder = Seeder(123)
    env.reseed(seeder)

    c_env = deepcopy(env)
    c_env.reseed()

    if deepcopy(env).is_online():
        traj1 = get_env_trajectory(env, 500)
        traj2 = get_env_trajectory(c_env, 500)
        assert not compare_trajectories(traj1, traj2)
Example #13
def test_gym_copy_reseeding():
    seeder = Seeder(123)
    if _GYM_INSTALLED:
        gym_env = gym.make("Acrobot-v1")
        env = Wrapper(gym_env)
        env.reseed(seeder)

        c_env = deepcopy(env)
        c_env.reseed()

        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
Example #14
def test_gym_copy_reseeding_2():
    seeder = Seeder(123)
    if _GYM_INSTALLED:
        gym_env = gym.make("Acrobot-v1")
        # nested wrapping
        env = RescaleRewardWrapper(Wrapper(Wrapper(gym_env)), (0, 1))
        env.reseed(seeder)

        c_env = deepcopy(env)
        c_env.reseed()

        if deepcopy(env).is_online():
            traj1 = get_env_trajectory(env, 500)
            traj2 = get_env_trajectory(c_env, 500)
            assert not compare_trajectories(traj1, traj2)
Example #15
def test_seeder_spawning():
    """
    Check that Seeder(seed_seq) respawns seed_seq in the constructor.
    """
    seeder1 = Seeder(43)
    seeder2 = seeder1.spawn()
    seeder3 = seeder2.spawn()

    print(seeder1)
    print(seeder2)
    print(seeder3)

    data1 = seeder1.rng.integers(100, size=1000)
    data2 = seeder2.rng.integers(100, size=1000)
    assert (data1 != data2).sum() > 5
Example #16
    def reseed(self, seed_seq=None):
        # self.seeder
        if seed_seq is None:
            self.seeder = self.seeder.spawn()
        else:
            self.seeder = Seeder(seed_seq)
        # seed gym.Env that is not a rlberry Model
        if not isinstance(self.env, Model):
            # get a seed for gym environment; spaces are reseeded below.
            safe_reseed(self.env, self.seeder, reseed_spaces=False)
        # seed rlberry Model
        else:
            self.env.reseed(self.seeder)
        safe_reseed(self.observation_space, self.seeder)
        safe_reseed(self.action_space, self.seeder)
Example #17
def test_mbqvi(S, A):
    rng = Seeder(123).rng

    for sim in range(5):
        # generate random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1

        # run MBQVI and check exactness of estimators
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
Example #18
    def reseed(self, seed_seq=None):
        """
        Get new random number generator for the model.

        Parameters
        ----------
        seed_seq : np.random.SeedSequence, rlberry.seeding.Seeder or int, default : None
            Seed sequence from which to spawn the random number generator.
            If None, generate a random seed.
            If an int, use it as entropy for a new SeedSequence.
            If a Seeder, use its seed_seq.
        """
        # self.seeder
        if seed_seq is None:
            self.seeder = self.seeder.spawn()
        else:
            self.seeder = Seeder(seed_seq)
        # spaces
        self.observation_space.reseed(self.seeder.seed_seq)
        self.action_space.reseed(self.seeder.seed_seq)
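
The docstring above lists three accepted argument types for reseed. A short sketch enumerating them, assuming TwinRooms (used in Example #25) exposes the Model.reseed interface shown here:

from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms
from rlberry.seeding import Seeder

env = TwinRooms()
env.reseed()             # None: spawn a new Seeder from the current one
env.reseed(123)          # int: used as entropy for a new SeedSequence
env.reseed(Seeder(123))  # Seeder: its seed_seq is reused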
Example #19
    def __init__(self, spaces):
        gym.spaces.Tuple.__init__(self, spaces)
        self.seeder = Seeder()
Example #20
    def __init__(self, n):
        gym.spaces.MultiBinary.__init__(self, n)
        self.seeder = Seeder()
Example #21
    def __init__(self, spaces=None, **spaces_kwargs):
        gym.spaces.Dict.__init__(self, spaces, **spaces_kwargs)
        self.seeder = Seeder()
Example #22
    def __init__(self, low, high, shape=None, dtype=np.float64):
        gym.spaces.Box.__init__(self, low, high, shape=shape, dtype=dtype)
        self.seeder = Seeder()
Example #23
    def __init__(self, nvec, dtype=np.int64):
        gym.spaces.MultiDiscrete.__init__(self, nvec, dtype=dtype)
        self.seeder = Seeder()
Example #24
from rlberry.seeding import safe_reseed
from rlberry.seeding import Seeder
import numpy as np
from rlberry.utils.check_gym_env import check_gym_env

seeder = Seeder(42)


def check_env(env):
    """
    Check that the environment is (almost) gym-compatible and that it is reproducible
    in the sense that it returns the same states when given the same seed.

    Parameters
    ----------
    env: gym.env or rlberry env
        Environment that we want to check.
    """
    # Small reproducibility test
    action = env.action_space.sample()
    safe_reseed(env, Seeder(42))
    env.reset()
    a = env.step(action)[0]

    safe_reseed(env, Seeder(42))
    env.reset()
    b = env.step(action)[0]
    if hasattr(a, "__len__"):
        assert np.all(
            np.array(a) == np.array(b)
        ), "The environment does not seem to be reproducible"
    else:
        assert a == b, "The environment does not seem to be reproducible"

    # Modified check suite from gym
    check_gym_env(env)
Example #25
"""
A demo of twinrooms environment
===============================
Illustration of TwinRooms environment

.. video:: ../../video_plot_twinrooms.mp4
   :width: 600

"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_twinrooms.jpg'

from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms
from rlberry.agents.mbqvi import MBQVIAgent
from rlberry.wrappers.discretize_state import DiscretizeStateWrapper
from rlberry.seeding import Seeder

seeder = Seeder(123)

env = TwinRooms()
env = DiscretizeStateWrapper(env, n_bins=20)
env.reseed(seeder)
horizon = 20
agent = MBQVIAgent(env, n_samples=10, gamma=1.0, horizon=horizon)
agent.reseed(seeder)
agent.fit()

state = env.reset()
env.enable_rendering()
for ii in range(10):
    action = agent.policy(state)
    ns, rr, _, _ = env.step(action)
    state = ns
Example #26
    def __init__(
        self,
        agent_class,
        train_env,
        fit_budget=None,
        eval_env=None,
        init_kwargs=None,
        fit_kwargs=None,
        eval_kwargs=None,
        agent_name=None,
        n_fit=4,
        output_dir=None,
        parallelization="thread",
        max_workers=None,
        mp_context="spawn",
        worker_logging_level="INFO",
        seed=None,
        enable_tensorboard=False,
        outdir_id_style="timestamp",
        default_writer_kwargs=None,
        init_kwargs_per_instance=None,
    ):
        # agent_class should only be None when the constructor is called
        # by the class method AgentManager.load(), since the agent class
        # will be loaded.

        if agent_class is None:
            return None  # Must only happen when load() method is called.

        self.seeder = Seeder(seed)
        self.eval_seeder = self.seeder.spawn(1)

        self.agent_name = agent_name
        if agent_name is None:
            self.agent_name = agent_class.name

        # Check train_env and eval_env
        assert isinstance(
            train_env, Tuple
        ), "[AgentManager]train_env must be Tuple (constructor, kwargs)"
        if eval_env is not None:
            assert isinstance(
                eval_env, Tuple
            ), "[AgentManager]eval_env must be Tuple (constructor, kwargs)"

        # check options
        assert outdir_id_style in [None, "unique", "timestamp"]

        # create object identifier
        self.unique_id = metadata_utils.get_unique_id(self)
        self.timestamp_id = metadata_utils.get_readable_id(self)

        # Agent class
        self.agent_class = agent_class

        # Train env
        self.train_env = train_env

        # Check eval_env
        if eval_env is None:
            eval_env = deepcopy(train_env)

        self._eval_env = eval_env

        # check kwargs
        fit_kwargs = fit_kwargs or {}
        eval_kwargs = eval_kwargs or {}

        # params
        base_init_kwargs = init_kwargs or {}
        self._base_init_kwargs = deepcopy(base_init_kwargs)
        self.fit_kwargs = deepcopy(fit_kwargs)
        self.eval_kwargs = deepcopy(eval_kwargs)
        self.n_fit = n_fit
        self.parallelization = parallelization
        self.max_workers = max_workers
        self.mp_context = mp_context
        self.worker_logging_level = worker_logging_level
        self.output_dir = output_dir
        if fit_budget is not None:
            self.fit_budget = fit_budget
        else:
            try:
                self.fit_budget = self.fit_kwargs.pop("fit_budget")
            except KeyError:
                raise ValueError("[AgentManager] fit_budget missing in __init__().")
        # extra params per instance
        if init_kwargs_per_instance is not None:
            assert len(init_kwargs_per_instance) == n_fit
            init_kwargs_per_instance = deepcopy(init_kwargs_per_instance)
        self.init_kwargs_per_instance = init_kwargs_per_instance or [
            dict() for _ in range(n_fit)
        ]

        # output dir
        if output_dir is None:
            output_dir_ = metadata_utils.RLBERRY_TEMP_DATA_DIR
        else:
            output_dir_ = output_dir
        self.output_dir_ = Path(output_dir_) / "manager_data"
        if outdir_id_style == "unique":
            self.output_dir_ = self.output_dir_ / (
                self.agent_name + "_" + self.unique_id
            )
        elif outdir_id_style == "timestamp":
            self.output_dir_ = self.output_dir_ / (
                self.agent_name + "_" + self.timestamp_id
            )

        # Create list of writers for each agent that will be trained
        # 'default' will keep Agent's use of DefaultWriter.
        self.writers = [("default", None) for _ in range(n_fit)]

        # Parameters to setup Agent's DefaultWriter
        self.agent_default_writer_kwargs = [
            dict(
                name=self.agent_name,
                log_interval=3,
                tensorboard_kwargs=None,
                execution_metadata=metadata_utils.ExecutionMetadata(obj_worker_id=idx),
            )
            for idx in range(n_fit)
        ]
        self.tensorboard_dir = None
        if enable_tensorboard:
            self.tensorboard_dir = self.output_dir_ / "tensorboard"
            for idx, params in enumerate(self.agent_default_writer_kwargs):
                params["tensorboard_kwargs"] = dict(
                    log_dir=self.tensorboard_dir / str(idx)
                )
        # Update DefaultWriter according to user's settings.
        default_writer_kwargs = default_writer_kwargs or {}
        if default_writer_kwargs:
            logger.warning(
                "(Re)defining the following DefaultWriter"
                f" parameters in AgentManager: {list(default_writer_kwargs.keys())}"
            )
        for ii in range(n_fit):
            self.agent_default_writer_kwargs[ii].update(default_writer_kwargs)

        # agent handlers and init kwargs
        self._set_init_kwargs()  # init_kwargs for each agent
        self.agent_handlers = None
        self._reset_agent_handlers()
        self.default_writer_data = None
        self.best_hyperparams = None

        # optuna study and database
        self.optuna_study = None
        self.db_filename = None
        self.optuna_storage_url = None

        # rlberry version for reproducibility purpose
        self.rlberry_version = rlberry.__version__
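
Putting the checks above together: train_env must be a (constructor, kwargs) tuple, and fit_budget must be given either directly or inside fit_kwargs. A hedged construction sketch, assuming AgentManager is importable from rlberry.manager and using a hypothetical MyAgent class:

# Hedged sketch: the import path and MyAgent are assumptions, not from this snippet.
from rlberry.manager import AgentManager
from rlberry.envs.benchmarks.generalization.twinrooms import TwinRooms

manager = AgentManager(
    agent_class=MyAgent,        # hypothetical Agent subclass, for illustration only
    train_env=(TwinRooms, {}),  # (constructor, kwargs) tuple, as asserted above
    fit_budget=1000,            # required here or inside fit_kwargs
    n_fit=4,
    seed=123,
)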
Example #27
    def __init__(self):
        self.observation_space = None
        self.action_space = None
        self.reward_range: tuple = (-np.inf, np.inf)
        # random number generator
        self.seeder = Seeder()
Example #28
def test_cor_normal():
    env = CorruptedNormalBandit(means=[0, 1], cor_prop=0.1)
    safe_reseed(env, Seeder(TEST_SEED))

    sample = [env.step(1)[1] for f in range(1000)]
    assert np.abs(np.median(sample) - 1) < 0.5
Example #29
def test_normal():
    env = NormalBandit(means=[0, 1])
    safe_reseed(env, Seeder(TEST_SEED))

    sample = [env.step(1)[1] for f in range(1000)]
    assert np.abs(np.mean(sample) - 1) < 0.1
Example #30
def test_bernoulli():
    env = BernoulliBandit(p=[0.05, 0.95])
    safe_reseed(env, Seeder(TEST_SEED))

    sample = [env.step(1)[1] for f in range(1000)]
    assert np.abs(np.mean(sample) - 0.95) < 0.1