Example #1
def _fit_agent_manager(agent, env="continuous_state", init_kwargs=None):
    """
    Check that the agent is compatible with :class:`~rlberry.manager.AgentManager`.

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If a tuple, the constructor and keyword arguments of the environment on which to test.
        If a string in {"continuous_state", "discrete_state"}, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}

    train_env = _make_env(env)
    try:
        agent = AgentManager(
            agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
        )
        agent.fit()
    except Exception as exc:
        raise RuntimeError("Agent not compatible with AgentManager") from exc

    return agent
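A usage sketch of the helper above (a hypothetical call: the agent class, import paths, and keyword arguments are illustrative assumptions, not part of the snippet):

from rlberry.agents.dynprog import ValueIterationAgent  # import path assumed
from rlberry.envs import GridWorld  # import path assumed

# Fit on the default discrete-state benchmark environment...
manager = _fit_agent_manager(ValueIterationAgent, env="discrete_state")

# ...or on an explicit (constructor, kwargs) tuple.
manager = _fit_agent_manager(
    ValueIterationAgent,
    env=(GridWorld, dict(nrows=3, ncols=3)),
    init_kwargs=dict(gamma=0.95),  # illustrative constructor argument
)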
Example #2
def check_bandit_agent(Agent, environment=BernoulliBandit, seed=42):
    """
    Function used to check a bandit agent in rlberry on a given bandit problem (Bernoulli by default).

    Parameters
    ----------
    Agent: rlberry agent module
        Agent class that we want to test.

    environment: rlberry env module
        Environment (i.e., bandit instance) on which to test the agent.

    seed : int, default=42
        Seed from which to spawn the random number generator.

    Returns
    -------
    result : bool
        True if two identically seeded fits of the agent select the same actions,
        used here as a proxy for being a valid/compatible bandit agent.

    Examples
    --------
    >>> from rlberry.agents.bandits import IndexAgent
    >>> from rlberry.utils import check_bandit_agent
    >>> import numpy as np
    >>> class UCBAgent(IndexAgent):
    ...     name = "UCB"
    ...     def __init__(self, env, **kwargs):
    ...         def index(r, t):
    ...             return np.mean(r) + np.sqrt(np.log(t**2) / (2 * len(r)))
    ...         IndexAgent.__init__(self, env, index, **kwargs)
    >>> check_bandit_agent(UCBAgent)
    True

    """
    env_ctor = environment
    env_kwargs = {}

    agent1 = AgentManager(Agent, (env_ctor, env_kwargs),
                          fit_budget=10,
                          n_fit=1,
                          seed=seed)
    agent2 = AgentManager(Agent, (env_ctor, env_kwargs),
                          fit_budget=10,
                          n_fit=1,
                          seed=seed)

    agent1.fit()
    agent2.fit()
    env = env_ctor(**env_kwargs)
    state = env.reset()
    result = True
    for _ in range(5):
        # test reproducibility on 5 actions
        action1 = agent1.agent_handlers[0].policy(state)
        action2 = agent2.agent_handlers[0].policy(state)
        if action1 != action2:
            result = False

    return result
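The `environment` argument lets the same check run on a different bandit instance; a short sketch reusing the UCBAgent from the doctest above (the NormalBandit import path is an assumption):

from rlberry.envs.bandits import NormalBandit  # import path assumed

# Same reproducibility check, but on a Gaussian bandit.
assert check_bandit_agent(UCBAgent, environment=NormalBandit, seed=0)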
Example #3
def test_recursive_vs_not_recursive():
    env_ctor = NormalBandit
    env_kwargs = {}

    agent1 = AgentManager(UCBAgent, (env_ctor, env_kwargs),
                          fit_budget=10,
                          n_fit=1,
                          seed=TEST_SEED)

    agent2 = AgentManager(
        RecursiveUCBAgent,
        (env_ctor, env_kwargs),
        fit_budget=10,
        n_fit=1,
        seed=TEST_SEED,
    )

    agent1.fit()
    agent2.fit()
    env = env_ctor(**env_kwargs)
    state = env.reset()
    for _ in range(5):
        # test reproducibility on 5 actions
        action1 = agent1.agent_handlers[0].policy(state)
        action2 = agent2.agent_handlers[0].policy(state)
        assert action1 == action2
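For context, the recursive agent can compute the same index from running statistics instead of a stored reward history; a minimal sketch of that idea (a hypothetical helper, not the project's RecursiveUCBAgent):

import numpy as np


def recursive_ucb_index(prev_mean, new_reward, n_pulls, t):
    """Incremental form of the UCB index used above.

    prev_mean is the empirical mean before new_reward, and n_pulls counts the
    pulls including the new one. Returns (updated_mean, index), matching
    np.mean(r) + np.sqrt(np.log(t**2) / (2 * len(r))) without keeping the
    full reward list r.
    """
    mean = prev_mean + (new_reward - prev_mean) / n_pulls
    return mean, mean + np.sqrt(np.log(t**2) / (2 * n_pulls))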
Example #4
def test_agent_manager_partial_fit_and_tuple_env():
    # Define train and evaluation envs
    train_env = (
        GridWorld,
        None,
    )  # tuple (constructor, kwargs) must also work in AgentManager

    # Parameters
    params = {}
    eval_kwargs = dict(eval_horizon=10)

    # Run AgentManager
    stats = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )
    stats2 = AgentManager(
        DummyAgent,
        train_env,
        init_kwargs=params,
        n_fit=4,
        fit_budget=5,
        eval_kwargs=eval_kwargs,
        seed=123,
    )

    # Run partial fit
    stats.fit(10)
    stats.fit(20)
    for agent in stats.agent_handlers:
        assert agent.total_budget == 30

    # Run fit
    stats2.fit()

    # learning curves
    plot_writer_data([stats],
                     tag="episode_rewards",
                     show=False,
                     preprocess_func=np.cumsum)

    # compare final policies
    evaluate_agents([stats], show=False)

    # delete some writers
    stats.set_writer(0, None)
    stats.set_writer(3, None)

    stats.clear_output_dir()
    stats2.clear_output_dir()
Example #5
def _create_and_fit_agent_manager(output_dir, outdir_id_style):
    env_ctor = GridWorld
    env_kwargs = dict(nrows=2, ncols=2, reward_at={(1, 1): 0.1, (2, 2): 1.0})

    manager = AgentManager(
        VIAgent,
        (env_ctor, env_kwargs),
        fit_budget=10,
        n_fit=3,
        output_dir=output_dir,
        outdir_id_style=outdir_id_style,
    )
    manager.fit()
    manager.save()
    return manager
Example #6
def test_jax_dqn(lambda_):
    if not _IMPORT_SUCCESSFUL:
        return

    env = (gym_make, dict(id="CartPole-v0"))
    params = dict(
        chunk_size=4, batch_size=128, target_update_interval=5, lambda_=lambda_
    )

    stats = AgentManager(
        DQNAgent,
        env,
        fit_budget=20,
        eval_env=env,
        init_kwargs=params,
        n_fit=1,
        parallelization="thread",
    )
    stats.fit()
    stats.clear_output_dir()
Example #7
def check_save_load(agent, env="continuous_state", init_kwargs=None):
    """
    Check that the agent saves a non-empty file and can be loaded.

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If a tuple, the constructor and keyword arguments of the environment on which to test.
        If a string in {"continuous_state", "discrete_state"}, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}

    train_env = _make_env(env)
    env = train_env[0](**train_env[1])
    with tempfile.TemporaryDirectory() as tmpdirname:
        agent = AgentManager(
            agent,
            train_env,
            fit_budget=5,
            n_fit=1,
            seed=SEED,
            init_kwargs=init_kwargs,
            output_dir=tmpdirname,
        )
        agent.fit(3)
        assert (
            os.path.getsize(str(agent.output_dir_) + "/agent_handlers/idx_0.pickle") > 1
        ), "The saved file is empty."
        try:
            agent.load(str(agent.output_dir_) + "/agent_handlers/idx_0.pickle")
        except Exception as exc:
            raise RuntimeError("Failed to load the agent file.") from exc
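A usage sketch of the check above (the agent class and import path are assumptions used for illustration):

from rlberry.agents.dynprog import ValueIterationAgent  # import path assumed

# Fits briefly in a temporary directory, checks that the pickled handler
# file is non-empty, then reloads it; raises if any step fails.
check_save_load(ValueIterationAgent, env="discrete_state")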
Example #8
def check_fit_additive(agent, env="continuous_state", init_kwargs=None):
    """
    Check that fitting the agent twice, with budget b each time, is equivalent
    to fitting it once with budget 2*b (here, 3 + 3 steps versus 6 steps).

    Parameters
    ----------
    agent: rlberry agent module
        Agent class to test.
    env: tuple (env_ctor, env_kwargs) or str in {"continuous_state", "discrete_state"}, default="continuous_state"
        If a tuple, the constructor and keyword arguments of the environment on which to test.
        If a string in {"continuous_state", "discrete_state"}, a default benchmark environment is used.
    init_kwargs : dict
        Arguments required by the agent's constructor.
    """
    if init_kwargs is None:
        init_kwargs = {}
    train_env = _make_env(env)

    agent1 = AgentManager(
        agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
    )
    agent1.fit(3)
    agent1.fit(3)

    agent2 = AgentManager(
        agent, train_env, fit_budget=5, n_fit=1, seed=SEED, init_kwargs=init_kwargs
    )
    agent2.fit(6)

    result = check_agents_almost_equal(
        agent1.agent_handlers[0], agent2.agent_handlers[0]
    )

    assert (
        result
    ), "Error: fitting the agent twice for 3 steps each is not equivalent to fitting it once for 6 steps."
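Spelled out without the helper, the asserted property follows the pattern below (a sketch: MyAgent is a placeholder, and the single policy comparison is a simplification of check_agents_almost_equal):

# Same seed, same total budget: fitting 3 + 3 steps should match one 6-step fit.
train_env = _make_env("continuous_state")

m1 = AgentManager(MyAgent, train_env, fit_budget=5, n_fit=1, seed=SEED)
m1.fit(3)
m1.fit(3)

m2 = AgentManager(MyAgent, train_env, fit_budget=5, n_fit=1, seed=SEED)
m2.fit(6)

state = train_env[0](**train_env[1]).reset()
assert m1.agent_handlers[0].policy(state) == m2.agent_handlers[0].policy(state)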
Example #9
    env = VecFrameStack(env, n_stack=4)
    env = ScalarizeEnvWrapper(env)
    return env


#
# Testing single agent
#

if __name__ == "__main__":
    #
    # Training several agents and comparing different hyperparams
    #

    stats = AgentManager(
        A2CAgent,
        train_env=(env_constructor, None),
        eval_env=(eval_env_constructor, None),
        eval_kwargs=dict(eval_horizon=200),
        agent_name="A2C baseline",
        fit_budget=5000,
        init_kwargs=dict(policy="CnnPolicy", verbose=10),
        n_fit=4,
        parallelization="process",
        output_dir="dev/stable_baselines_atari",
        seed=123,
    )

    stats.fit()
    stats.optimize_hyperparams(timeout=60, n_fit=2)
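optimize_hyperparams needs the agent to describe its search space; a sketch of the usual pattern (assuming rlberry's sample_parameters classmethod convention and the optuna trial API; the subclass and parameter ranges are illustrative):

class TunableA2CAgent(A2CAgent):  # hypothetical subclass for illustration
    @classmethod
    def sample_parameters(cls, trial):
        # Called once per optuna trial; returns init_kwargs for that trial.
        return dict(
            learning_rate=trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True),
            gamma=trial.suggest_categorical("gamma", [0.95, 0.99]),
        )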
Example #10
def execute_message(message: interface.Message,
                    resources: interface.Resources) -> interface.Message:
    response = interface.Message.create(command=interface.Command.ECHO)
    # LIST_RESOURCES
    if message.command == interface.Command.LIST_RESOURCES:
        info = {}
        for rr in resources:
            info[rr] = resources[rr]["description"]
        response = interface.Message.create(info=info)
    # AGENT_MANAGER_CREATE_INSTANCE
    elif message.command == interface.Command.AGENT_MANAGER_CREATE_INSTANCE:
        params = message.params
        base_dir = pathlib.Path(metadata_utils.RLBERRY_DEFAULT_DATA_DIR)
        if "output_dir" in params:
            params[
                "output_dir"] = base_dir / "server_data" / params["output_dir"]
        else:
            params["output_dir"] = base_dir / "server_data/"
        agent_manager = AgentManager(**params)
        filename = str(agent_manager.save())
        response = interface.Message.create(info=dict(
            filename=filename,
            agent_name=agent_manager.agent_name,
            output_dir=str(agent_manager.output_dir).replace(
                "server_data/", "client_data/"),
        ))
        del agent_manager
    # AGENT_MANAGER_FIT
    elif message.command == interface.Command.AGENT_MANAGER_FIT:
        filename = message.params["filename"]
        budget = message.params["budget"]
        extra_params = message.params["extra_params"]
        agent_manager = AgentManager.load(filename)
        agent_manager.fit(budget, **extra_params)
        agent_manager.save()
        response = interface.Message.create(command=interface.Command.ECHO)
        del agent_manager
    # AGENT_MANAGER_EVAL
    elif message.command == interface.Command.AGENT_MANAGER_EVAL:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        eval_output = agent_manager.eval_agents(
            message.params["n_simulations"])
        response = interface.Message.create(data=dict(output=eval_output))
        del agent_manager
    # AGENT_MANAGER_CLEAR_OUTPUT_DIR
    elif message.command == interface.Command.AGENT_MANAGER_CLEAR_OUTPUT_DIR:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.clear_output_dir()
        response = interface.Message.create(
            message=f"Cleared output dir: {agent_manager.output_dir}")
        del agent_manager
    # AGENT_MANAGER_CLEAR_HANDLERS
    elif message.command == interface.Command.AGENT_MANAGER_CLEAR_HANDLERS:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.clear_handlers()
        agent_manager.save()
        response = interface.Message.create(
            message=f"Cleared handlers: {filename}")
        del agent_manager
    # AGENT_MANAGER_SET_WRITER
    elif message.command == interface.Command.AGENT_MANAGER_SET_WRITER:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        agent_manager.set_writer(**message.params["kwargs"])
        agent_manager.save()
        del agent_manager
    # AGENT_MANAGER_OPTIMIZE_HYPERPARAMS
    elif message.command == interface.Command.AGENT_MANAGER_OPTIMIZE_HYPERPARAMS:
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        best_params_dict = agent_manager.optimize_hyperparams(
            **message.params["kwargs"])
        agent_manager.save()
        del agent_manager
        response = interface.Message.create(data=best_params_dict)
    # AGENT_MANAGER_GET_WRITER_DATA
    elif message.command == interface.Command.AGENT_MANAGER_GET_WRITER_DATA:
        # writer scalar data
        filename = message.params["filename"]
        agent_manager = AgentManager.load(filename)
        writer_data = agent_manager.get_writer_data()
        writer_data = writer_data or dict()
        for idx in writer_data:
            writer_data[idx] = writer_data[idx].to_csv(index=False)
        # tensorboard data
        tensorboard_bin_data = None
        if agent_manager.tensorboard_dir is not None:
            tensorboard_zip_file = rlberry.utils.io.zipdir(
                agent_manager.tensorboard_dir,
                agent_manager.output_dir / "tensorboard_data.zip",
            )
            if tensorboard_zip_file is not None:
                with open(tensorboard_zip_file, "rb") as zip_file:
                    tensorboard_bin_data = zip_file.read()
                tensorboard_bin_data = base64.b64encode(
                    tensorboard_bin_data).decode("ascii")
        response = interface.Message.create(
            data=dict(writer_data=writer_data,
                      tensorboard_bin_data=tensorboard_bin_data))
        del agent_manager
    # end
    return response
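For reference, a client-side request for one of these branches could be built and dispatched as sketched below (assuming Message.create also accepts a params field, which the handler reads; the filename and budget are illustrative):

# Hypothetical round trip through the AGENT_MANAGER_FIT branch above.
request = interface.Message.create(
    command=interface.Command.AGENT_MANAGER_FIT,
    params=dict(
        filename="server_data/manager_obj.pickle",  # illustrative path
        budget=100,
        extra_params={},
    ),
)
response = execute_message(request, resources={})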
Example #11
ppo_params["gamma"] = 0.99
ppo_params["learning_rate"] = 0.001
ppo_params["eps_clip"] = 0.2
ppo_params["k_epochs"] = 4

eval_kwargs = dict(eval_horizon=horizon, n_simulations=20)

ppo_stats = AgentManager(
    PPOAgent,
    env,
    fit_budget=n_episodes,
    eval_kwargs=eval_kwargs,
    init_kwargs=ppo_params,
    n_fit=2,
)
ppo_stats.fit(n_episodes // 2)
plot_writer_data(
    ppo_stats,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)
evaluate_agents([ppo_stats], show=False)
ppo_stats.fit(n_episodes // 4)
plot_writer_data(
    ppo_stats,
    tag="episode_rewards",
    preprocess_func=np.cumsum,
    title="Cumulative Rewards",
    show=False,
)
Example #12
env_kwargs = {"means": means, "stds": 2 * np.ones(len(means))}

agent = AgentManager(
    UCBAgent,
    (env_ctor, env_kwargs),
    fit_budget=T,
    init_kwargs={"B": 2},
    n_fit=M,
    parallelization="process",
    mp_context="fork",
)
# these parameters should give parallel computing even in notebooks

# Agent training

agent.fit()


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])


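# Sanity check of compute_pseudo_regret with illustrative numbers (not from
# this experiment): for means = [0.3, 0.5] and actions = [1, 0, 1], the
# per-step gaps are [0.0, 0.2, 0.0], so the cumulative pseudo-regret is
# [0.0, 0.2, 0.2].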
fig = plt.figure(1, figsize=(5, 3))
ax = plt.gca()
output = plot_writer_data(
    [agent],
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
    ax=ax,
Example #13
        #                          write_scalar = "action")


env_ctor = GridWorld
env_kwargs = dict(
    nrows=3,
    ncols=10,
    reward_at={(1, 1): 0.1, (2, 9): 1.0},
    walls=((1, 4), (2, 4), (1, 5)),
    success_probability=0.7,
)

env = env_ctor(**env_kwargs)
agent = AgentManager(VIAgent, (env_ctor, env_kwargs), fit_budget=10, n_fit=3)

agent.fit(budget=10)
# comment the line above if you only want to load data from rlberry_data.


# We use the following preprocessing function to plot the cumulative reward.
def compute_reward(rewards):
    return np.cumsum(rewards)


# Plot of the cumulative reward.
output = plot_writer_data(
    agent, tag="reward", preprocess_func=compute_reward, title="Cumulative Reward"
)
# The output spans 500 global steps because it uses fit_budget (10) * horizon.

# Log-Log plot :
Example #14
        n_fit=2,
        sampler_method="optuna_default",
        optuna_parallelization="thread",
    )

    initial_n_trials = len(manager.optuna_study.trials)

    # save
    manager_fname = manager.save()
    del manager

    # load
    manager = AgentManager.load(manager_fname)

    # continue previous optimization, now with 120s of timeout and multiprocessing
    manager.optimize_hyperparams(
        n_trials=512,
        timeout=120,
        n_fit=8,
        continue_previous=True,
        optuna_parallelization="process",
        n_optuna_workers=4,
    )

    print("number of initial trials = ", initial_n_trials)
    print("number of trials after continuing = ", len(manager.optuna_study.trials))

    print("----")
    print("fitting agents after choosing hyperparams...")
    manager.fit()  # fit the 4 agents
Example #15
        fit_budget=N_EPISODES,
        init_kwargs=params_a2c,
        eval_kwargs=eval_kwargs,
        n_fit=4,
        seed=123,
        parallelization="process",
        max_workers=2,
    )

    agent_manager_list = [rsucbvi_stats, rskernel_stats, a2c_stats]

    for st in agent_manager_list:
        st.fit()

    # Fit RSUCBVI for 50 more episodes
    rsucbvi_stats.fit(budget=50)

    # learning curves
    plot_writer_data(
        agent_manager_list,
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="cumulative rewards",
        show=False,
    )

    plot_writer_data(
        agent_manager_list, tag="episode_rewards", title="episode rewards", show=False
    )

    # compare final policies
    eval_kwargs = dict(eval_horizon=HORIZON, n_simulations=20)

    # -------------------------------
    # Run AgentManager and save results
    # --------------------------------
    ppo_stats = AgentManager(
        PPOAgent,
        train_env,
        fit_budget=N_EPISODES,
        init_kwargs=params_ppo,
        eval_kwargs=eval_kwargs,
        n_fit=4,
        output_dir="dev/",
        parallelization="process",
    )
    ppo_stats.fit()  # fit the 4 agents
    ppo_stats_fname = ppo_stats.save()
    del ppo_stats

    # -------------------------------
    # Load and plot results
    # --------------------------------
    ppo_stats = AgentManager.load(ppo_stats_fname)

    # learning curves
    plot_writer_data(
        ppo_stats,
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="Cumulative Rewards",
        show=False,
Example #17
        agent_name="LSVI (random exploration)",
        parallelization=parallelization,
    )

    # Oracle (optimal policy)
    oracle_stats = AgentManager(
        ValueIterationAgent,
        env,
        init_kwargs=params_oracle,
        fit_budget=n_episodes,
        eval_kwargs=eval_kwargs,
        n_fit=1,
    )

    # fit
    stats.fit()
    stats_ucbvi.fit()
    stats_random.fit()
    oracle_stats.fit()

    # visualize results
    plot_writer_data(
        [stats, stats_ucbvi, stats_random],
        tag="episode_rewards",
        preprocess_func=np.cumsum,
        title="Cumulative Rewards",
        show=False,
    )
    plot_writer_data([stats, stats_ucbvi, stats_random],
                     tag="dw_time_elapsed",
                     show=False)