Example #1
def test_rescale_reward():
    # tolerance
    tol = 1e-14

    rng = seeding.get_rng()

    for _ in range(10):
        # generate random MDP
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test
        wrapped = RescaleRewardWrapper(env, (-10, 10))
        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert reward <= 10 + tol and reward >= -10 - tol
Example #2
def test_bellman_operator_monotonicity_and_contraction(gamma, S, A):
    rng = seeding.get_rng()
    vmax = 1.0 / (1.0 - gamma)
    for _ in range(10):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # generate random Q functions
        Q0 = rng.uniform(-vmax, vmax, (S, A))
        Q1 = rng.uniform(-vmax, vmax, (S, A))
        # apply Bellman operator
        TQ0 = bellman_operator(Q0, R, P, gamma)
        TQ1 = bellman_operator(Q1, R, P, gamma)

        # test contraction
        norm_tq = np.abs(TQ1 - TQ0).max()
        norm_q = np.abs(Q1 - Q0).max()
        assert norm_tq <= gamma * norm_q

        # test monotonicity
        Q2 = rng.uniform(-vmax / 2, vmax / 2, (S, A))
        Q3 = Q2 + rng.uniform(0.0, vmax / 2, (S, A))
        TQ2 = bellman_operator(Q2, R, P, gamma)
        TQ3 = bellman_operator(Q3, R, P, gamma)
        assert np.greater(TQ2, TQ3).sum() == 0
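For reference, the two properties checked above are the standard sup-norm contraction and the monotonicity of the Bellman operator T (the notation below is mine, not part of the rlberry source):

\| T Q_1 - T Q_0 \|_\infty \le \gamma \, \| Q_1 - Q_0 \|_\infty

Q_2 \le Q_3 \ \text{(elementwise)} \;\Longrightarrow\; T Q_2 \le T Q_3

The test verifies the first inequality directly via the max of |TQ1 - TQ0|, and the second by asserting that no entry of TQ2 exceeds the corresponding entry of TQ3.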
Example #3
def test_random_numbers():
    seed = 43
    seeding.set_global_seed(seed)
    rng1 = seeding.get_rng()
    data1 = rng1.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng2 = seeding.get_rng()
    data2 = rng2.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng3 = seeding.get_rng()
    data3 = rng3.integers(100, size=1000)

    assert (data1 != data2).sum() > 5
    assert (data2 != data3).sum() == 0
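A minimal reproducibility sketch built from the same calls used in this test (the seed value 2021 is an arbitrary placeholder): resetting the global seed to the same value makes the first generator returned by get_rng reproduce an identical stream.

import rlberry.seeding as seeding

seeding.set_global_seed(2021)
draws_a = seeding.get_rng().integers(100, size=5)

seeding.set_global_seed(2021)
draws_b = seeding.get_rng().integers(100, size=5)

# same seed -> same first spawned generator -> identical draws
assert (draws_a == draws_b).all()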
Example #4
    def reseed(self):
        self.rng = seeding.get_rng()
        # seed gym.Env that is not a rlberry Model
        if not isinstance(self.env, Model):
            # get a seed for gym environment
            seeding.safe_reseed(self.env)
            seeding.safe_reseed(self.observation_space)
            seeding.safe_reseed(self.action_space)
        # seed rlberry Model
        else:
            self.env.reseed()
            self.observation_space.rng = self.env.rng
            self.action_space.rng = self.env.rng
Example #5
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 1

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2
Example #6
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #

    assert seeding._GLOBAL_SEED_SEQ.entropy == seed

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2  # counting the global rng generated automatically

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #

    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 3
Example #7
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=100,
                 gamma=0.99,
                 batch_size=16,
                 percentile=70,
                 learning_rate=0.01,
                 optimizer_type='ADAM',
                 policy_net_fn=None,
                 **kwargs):
        Agent.__init__(self, env, **kwargs)

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # parameters
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.percentile = percentile
        self.learning_rate = learning_rate
        self.horizon = horizon

        # random number generator
        self.rng = seeding.get_rng()

        #
        self.policy_net_fn = policy_net_fn \
            or (lambda: default_policy_net_fn(self.env))

        self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                                 'lr': learning_rate}

        # policy net
        self.policy_net = self.policy_net_fn().to(device)

        # loss function and optimizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = optimizer_factory(
                                    self.policy_net.parameters(),
                                    **self.optimizer_kwargs)

        # memory
        self.memory = CEMMemory(self.batch_size)

        # default writer
        self.writer = PeriodicWriter(self.name,
                                     log_every=5*logger.getEffectiveLevel())
Example #8
    def reseed(self):
        self.rng = seeding.get_rng()
        # seed gym.Env that is not a rlberry Model
        if isinstance(self.env, gym.Env) \
                and not isinstance(self.env, Model):
            # get a seed for gym environment
            seed = self.rng.integers(2**16).item()
            self.env.seed(seed)
            self.observation_space.seed(seed)
            self.action_space.seed(seed)
        # seed rlberry Model
        else:
            self.env.reseed()
            self.observation_space.rng = self.env.rng
            self.action_space.rng = self.env.rng
Example #9
def test_mbqvi(S, A):
    rng = seeding.get_rng()

    for sim in range(5):
        # generate random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1

        # run MBQVI and check exactness of estimators
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
Example #10
    def __init__(self,
                 env,
                 policy,
                 learning_rate=7e-4,
                 n_steps: int = 5,
                 gamma: float = 0.99,
                 gae_lambda: float = 1.0,
                 ent_coef: float = 0.0,
                 vf_coef: float = 0.5,
                 max_grad_norm: float = 0.5,
                 rms_prop_eps: float = 1e-5,
                 use_rms_prop: bool = True,
                 use_sde: bool = False,
                 sde_sample_freq: int = -1,
                 normalize_advantage: bool = False,
                 tensorboard_log=None,
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose: int = 0,
                 seed=None,
                 device="auto",
                 _init_setup_model: bool = True,
                 **kwargs):

        # Generate seed for A2CStableBaselines using rlberry seeding
        self.rng = seeding.get_rng()
        seed = self.rng.integers(2**32).item()

        # init stable baselines class
        self.wrapped = A2CStableBaselines(
            policy, env, learning_rate, n_steps, gamma, gae_lambda, ent_coef,
            vf_coef, max_grad_norm, rms_prop_eps, use_rms_prop, use_sde,
            sde_sample_freq, normalize_advantage, tensorboard_log,
            create_eval_env, policy_kwargs, verbose, seed, device,
            _init_setup_model)

        # init rlberry base class
        Agent.__init__(self, env, **kwargs)
Example #11
def test_mock_args(monkeypatch):
    monkeypatch.setattr(
        "sys.argv", ['', 'rlberry/experiment/tests/params_experiment.yaml'])
    random_numbers = []

    for agent_stats in experiment_generator():
        rng = sd.get_rng()
        random_numbers.append(rng.uniform(size=10))

        assert agent_stats.agent_class is RSUCBVIAgent
        assert agent_stats.init_kwargs['n_episodes'] == 100
        assert agent_stats.init_kwargs['horizon'] == 50

        assert agent_stats.init_kwargs['lp_metric'] == 2
        assert agent_stats.init_kwargs['min_dist'] == 0.0
        assert agent_stats.init_kwargs['max_repr'] == 800
        assert agent_stats.init_kwargs['bonus_scale_factor'] == 1.0
        assert agent_stats.init_kwargs['reward_free'] is True

        assert agent_stats.eval_horizon == 51

        train_env = agent_stats.train_env[0](**agent_stats.train_env[1])
        assert train_env.reward_free is False
        assert train_env.array_observation is True

        if agent_stats.agent_name == 'rsucbvi':
            assert agent_stats.init_kwargs['gamma'] == 1.0

        elif agent_stats.agent_name == 'rsucbvi_alternative':
            assert agent_stats.init_kwargs['gamma'] == 0.9

        else:
            raise ValueError()

    #  check that seeding is the same for each AgentStats instance
    for ii in range(1, len(random_numbers)):
        assert np.array_equal(random_numbers[ii - 1], random_numbers[ii])
Example #12
    def __init__(self,
                 agent_class,
                 train_env,
                 eval_env=None,
                 eval_horizon=None,
                 init_kwargs=None,
                 fit_kwargs=None,
                 policy_kwargs=None,
                 agent_name=None,
                 n_fit=4,
                 n_jobs=4,
                 output_dir='stats_data'):
        # agent_class should only be None when the constructor is called
        # by the class method AgentStats.load(), since the agent class
        # will be loaded.
        if agent_class is not None:

            self.agent_name = agent_name
            if agent_name is None:
                self.agent_name = agent_class.name

            # create object identifier
            timestamp = datetime.timestamp(datetime.now())
            self.identifier = 'stats_{}_{}'.format(self.agent_name,
                                                   str(int(timestamp)))

            self.fit_info = agent_class.fit_info
            self.agent_class = agent_class
            self.train_env = train_env
            if eval_env is None:
                self.eval_env = deepcopy(train_env)
                self.eval_env.reseed()
            else:
                self.eval_env = deepcopy(eval_env)
                self.eval_env.reseed()

            self.eval_horizon = eval_horizon
            # init and fit kwargs are deep copied in fit()
            self.init_kwargs = deepcopy(init_kwargs)
            self.fit_kwargs = fit_kwargs
            self.policy_kwargs = deepcopy(policy_kwargs)
            self.n_fit = n_fit
            self.n_jobs = n_jobs
            self.output_dir = output_dir

            if init_kwargs is None:
                self.init_kwargs = {}
            if fit_kwargs is None:
                self.fit_kwargs = {}
            if policy_kwargs is None:
                self.policy_kwargs = {}

            # Create environment copies for training
            self.train_env_set = []
            for _ in range(n_fit):
                _env = deepcopy(train_env)
                _env.reseed()
                self.train_env_set.append(_env)

            #
            self.fitted_agents = None
            self.fit_kwargs_list = None  # keep in memory for partial_fit()
            self.fit_statistics = {}

            #
            self.rng = seeding.get_rng()

            # optuna study
            self.study = None

            # default filename to save data
            self.default_filename = os.path.join(self.output_dir,
                                                 self.identifier)
Example #13
def compare_policies(agent_stats_list,
                     eval_env=None,
                     eval_horizon=None,
                     stationary_policy=True,
                     n_sim=10,
                     fignum=None,
                     show=True,
                     plot=True,
                     **kwargs):
    """
    Compare the policies of each of the agents in agent_stats_list.
    Each element of the agent_stats_list contains a list of fitted agents.
    To evaluate each policy, the following is repeated n_sim times:
        * choose one of the fitted agents uniformly at random
        * run its policy in eval_env for eval_horizon time steps

    Parameters
    ----------
    agent_stats_list : list of AgentStats objects.
    eval_env : Model
        Environment where to evaluate the policies.
        If None, it is taken from AgentStats.
    eval_horizon : int
        Number of time steps for policy evaluation.
        If None, it is taken from AgentStats.
    stationary_policy : bool
        If False, the time step h (0 <= h <= eval_horizon) is sent as argument
        to agent.policy() for policy evaluation.
    n_sim : int
        Number of simulations to evaluate each policy.
    fignum: string or int
        Identifier of plot figure.
    show: bool
        If true, calls plt.show().
    plot: bool
        If false, do not plot.
    kwargs:
        Extra parameters for sns.boxplot
    """
    #
    # evaluation
    #
    use_eval_from_agent_stats = (eval_env is None)
    use_horizon_from_agent_stats = (eval_horizon is None)

    rng = seeding.get_rng()
    agents_rewards = []
    for agent_stats in agent_stats_list:
        # train agents if they are not already trained
        if agent_stats.fitted_agents is None:
            agent_stats.fit()

        # eval env and horizon
        if use_eval_from_agent_stats:
            eval_env = agent_stats.eval_env
            assert eval_env is not None, \
                "eval_env not in AgentStats %s" % agent_stats.agent_name
        if use_horizon_from_agent_stats:
            eval_horizon = agent_stats.eval_horizon
            assert eval_horizon is not None, \
                "eval_horizon not in AgentStats %s" % agent_stats.agent_name

        # evaluate agent
        episode_rewards = np.zeros(n_sim)
        for sim in range(n_sim):
            # choose one of the fitted agents randomly
            agent_idx = rng.integers(len(agent_stats.fitted_agents))
            agent = agent_stats.fitted_agents[agent_idx]
            # evaluate agent
            observation = eval_env.reset()
            for hh in range(eval_horizon):
                if stationary_policy:
                    action = agent.policy(observation,
                                          **agent_stats.policy_kwargs)
                else:
                    action = agent.policy(observation, hh,
                                          **agent_stats.policy_kwargs)
                observation, reward, done, _ = eval_env.step(action)
                episode_rewards[sim] += reward
                if done:
                    break
        # store rewards
        agents_rewards.append(episode_rewards)

    #
    # plot
    #

    # build unique agent IDs (in case there are two agents with the same ID)
    unique_ids = []
    id_count = {}
    for agent_stats in agent_stats_list:
        name = agent_stats.agent_name
        if name not in id_count:
            id_count[name] = 1
        else:
            id_count[name] += 1

        unique_ids.append(name + "*" * (id_count[name] - 1))

    # convert output to DataFrame
    data = {}
    for agent_id, agent_rewards in zip(unique_ids, agents_rewards):
        data[agent_id] = agent_rewards
    output = pd.DataFrame(data)

    # plot
    if plot:
        plt.figure(fignum)

        with sns.axes_style("whitegrid"):
            ax = sns.boxplot(data=output, **kwargs)
            ax.set_xlabel("agent")
            ax.set_ylabel("rewards in one episode")
            plt.title("Environment = %s" %
                      getattr(eval_env.unwrapped, "name",
                              eval_env.unwrapped.__class__.__name__))
            if show:
                plt.show()

    return output
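A minimal usage sketch based only on the signature and docstring above; stats_a and stats_b are placeholder names for AgentStats instances built elsewhere (for example with the constructor shown in Example #12):

# stats_a and stats_b are hypothetical, already-constructed AgentStats objects
output_df = compare_policies(
    [stats_a, stats_b],
    eval_horizon=100,    # override the horizon stored in each AgentStats
    n_sim=20,            # 20 evaluation rollouts per AgentStats entry
    show=False,          # build the boxplot without calling plt.show()
)
print(output_df.mean())  # mean episode reward per (uniquely named) agent column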
Example #14
def mc_policy_evaluation(agent,
                         eval_env,
                         eval_horizon=10**5,
                         n_sim=10,
                         gamma=1.0,
                         policy_kwargs=None,
                         stationary_policy=True):
    """
    Monte-Carlo policy evaluation [1]_ of an agent, to estimate the value at the initial state.

    If a list of agents is provided as input, for each evaluation, one of the agents is sampled
    uniformly at random.

    Parameters
    ----------
    agent : Agent or list of agents.
        Trained agent(s).
    eval_env : Env
        Evaluation environment.
    eval_horizon : int, default: 10**5
        Horizon, maximum episode length.
    n_sim : int, default: 10
        Number of Monte Carlo simulations.
    gamma : double, default: 1.0
        Discount factor.
    policy_kwargs : dict or None
        Optional kwargs for agent.policy() method.
    stationary_policy : bool, default: True
        If False, the time step h (0 <= h <= eval_horizon) is sent as argument
        to agent.policy() for policy evaluation.

    Returns
    -------
    Numpy array of shape (n_sim, ) containing the sum of rewards in each simulation.

    References
    ----------
    .. [1] http://incompleteideas.net/book/first/ebook/node50.html
    """
    rng = seeding.get_rng()
    if not isinstance(agent, list):
        agents = [agent]
    else:
        agents = agent

    policy_kwargs = policy_kwargs or {}

    episode_rewards = np.zeros(n_sim)
    for sim in range(n_sim):
        idx = rng.integers(len(agents))

        observation = eval_env.reset()
        for hh in range(eval_horizon):
            if stationary_policy:
                action = agents[idx].policy(observation, **policy_kwargs)
            else:
                action = agents[idx].policy(observation, hh, **policy_kwargs)
            observation, reward, done, _ = eval_env.step(action)
            episode_rewards[sim] += reward * np.power(gamma, hh)
            if done:
                break

    return episode_rewards
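A usage sketch based only on the signature above; agent and env are placeholders for a fitted rlberry agent and an evaluation environment:

# `agent` and `env` are hypothetical: a fitted agent and an Env instance
rewards = mc_policy_evaluation(
    agent,
    env,
    eval_horizon=200,   # cap each rollout at 200 steps
    n_sim=50,           # 50 Monte Carlo episodes
    gamma=0.99,         # discounted return instead of the undiscounted sum
)
print(rewards.mean(), rewards.std())  # estimated initial-state value and its spread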
Example #15
import numpy as np
import pytest

import rlberry.seeding as seeding
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.agents.dynprog.utils import backward_induction
from rlberry.agents.dynprog.utils import backward_induction_in_place
from rlberry.agents.dynprog.utils import backward_induction_sd
from rlberry.agents.dynprog.utils import bellman_operator
from rlberry.agents.dynprog.utils import value_iteration
from rlberry.envs.finite import FiniteMDP

_rng = seeding.get_rng()


def get_random_mdp(S, A):
    R = _rng.uniform(0.0, 1.0, (S, A))
    P = _rng.uniform(0.0, 1.0, (S, A, S))
    for ss in range(S):
        for aa in range(A):
            P[ss, aa, :] /= P[ss, aa, :].sum()
    return R, P


@pytest.mark.parametrize("gamma, S, A", [(0.001, 2, 1), (0.25, 2, 1),
                                         (0.5, 2, 1), (0.75, 2, 1),
                                         (0.999, 2, 1), (0.001, 4, 2),
                                         (0.25, 4, 2), (0.5, 4, 2),
                                         (0.75, 4, 2), (0.999, 4, 2),
                                         (0.001, 20, 4), (0.25, 20, 4),
                                         (0.5, 20, 4), (0.75, 20, 4),