Example #1
def test_ppo_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30

    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space,
                                  action_space,
                                  n_bins_obs=20)
        return counter

    agent = PPOAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     eps_clip=0.2,
                     k_epochs=4,
                     use_bonus=True,
                     uncertainty_estimator_kwargs=dict(
                         uncertainty_estimator_fn=uncertainty_estimator_fn,
                         bonus_scale_factor=1))
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #2
def test_adaptive_ql():
    env = get_benchmark_env(level=2)
    agent = AdaptiveQLAgent(env, n_episodes=50, horizon=30)
    agent.fit()
    agent.policy(env.observation_space.sample())
    agent.Qtree.plot(0, 20)
    plt.clf()
Example #3
def run_experiment(params, optimize_hyperparams):
    """
    Main experiment function
    """
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for stats in agent_stats_list:
            # timeout after 10 seconds
            stats.optimize_hyperparams(n_trials=50, timeout=10)

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
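
A minimal way to call run_experiment could look like the sketch below; the hyperparameter values are hypothetical and only illustrate the expected structure of params (one init-kwargs dict per agent, each containing a 'horizon' entry):

params = {
    'ppo': dict(n_episodes=100, horizon=30, gamma=0.99, learning_rate=0.001),
    'a2c': dict(n_episodes=100, horizon=30, gamma=0.99, learning_rate=0.001),
}
run_experiment(params, optimize_hyperparams=False)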
Example #4
def test_reinforce_agent():
    _env = get_benchmark_env(level=1)
    n_episodes = 50
    horizon = 30

    # uncertainty estimator used to compute the exploration bonus
    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space,
                                  action_space,
                                  n_bins_obs=20)
        return counter

    env = UncertaintyEstimatorWrapper(_env,
                                      uncertainty_estimator_fn,
                                      bonus_scale_factor=1.0)
    # wrap the environment so the exploration bonus is available to the agent
    agent = REINFORCEAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        use_bonus_if_available=True,
    )
    agent.fit(budget=n_episodes)
    agent.policy(env.observation_space.sample())
Example #5
def test_rs_ucbvi_reward_free():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(
        env, gamma=0.99, horizon=30, bonus_scale_factor=0.1, reward_free=True
    )
    agent.fit(budget=5)
    agent.policy(env.observation_space.sample())
    assert agent.R_hat.sum() == 0.0
Example #6
def test_rs_ucbvi():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(env,
                         n_episodes=5,
                         gamma=0.99,
                         horizon=30,
                         bonus_scale_factor=0.1)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #7
def run_experiment(params,
                   optimize_hyperparams,
                   rlberry_seed):
    """
    Main experiment function
    """
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)

    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)

    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)

    agent_stats_list = stats.values()

    # Optimize hyperparams
    if optimize_hyperparams:
        for stats in agent_stats_list:
            # timeout after 10 seconds
            stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparams and save results
    for stats in agent_stats_list:
        stats.fit()
        stats.save_results()

    # learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
Example #8
def test_reinforce_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30

    agent = REINFORCEAgent(env,
                           n_episodes=n_episodes,
                           horizon=horizon,
                           gamma=0.99,
                           learning_rate=0.001)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #9
def test_a2c_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30

    agent = A2CAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     k_epochs=4)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #10
def test_rs_kernel_ucbvi(kernel_type):
    for horizon in [None, 30]:
        env = get_benchmark_env(level=1)
        agent = RSKernelUCBVIAgent(env,
                                   n_episodes=5,
                                   gamma=0.95,
                                   horizon=horizon,
                                   bonus_scale_factor=0.01,
                                   min_dist=0.2,
                                   bandwidth=0.05,
                                   beta=1.0,
                                   kernel_type=kernel_type)
        agent._log_interval = 0
        agent.fit()
        agent.policy(env.observation_space.sample())
Example #11
def test_avec_ppo_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30

    agent = AVECPPOAgent(env,
                         n_episodes=n_episodes,
                         horizon=horizon,
                         gamma=0.99,
                         learning_rate=0.001,
                         eps_clip=0.2,
                         k_epochs=4,
                         batch_size=1)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #12
def test_cem_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    batch_size = 100
    horizon = 30
    gamma = 0.99

    agent = CEMAgent(env,
                     n_episodes,
                     horizon=horizon,
                     gamma=gamma,
                     batch_size=batch_size,
                     percentile=70,
                     learning_rate=0.01)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
Example #13
def test_reinforce_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30

    agent = REINFORCEAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        use_bonus_if_available=False,
    )
    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #14
def test_sac_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10

    agent = SACAgent(
        env,
        gamma=0.99,
        learning_rate=0.001,
        k_epochs=4,
        use_bonus=False,
        device="cpu",
    )

    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #15
def test_a2c_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30

    agent = A2CAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     k_epochs=4,
                     use_bonus=False)
    agent._log_interval = 0

    agent.partial_fit(0.5)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.partial_fit(0.5)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #16
def test_ppo_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30

    agent = PPOAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        eps_clip=0.2,
        k_epochs=4,
        use_bonus=False,
    )

    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #17
def test_ppo_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30

    agent = PPOAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     eps_clip=0.2,
                     k_epochs=4,
                     batch_size=1)
    agent._log_interval = 0

    agent.partial_fit(0.5)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.partial_fit(0.5)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
Example #18
def test_rnd():
    # Environment
    env = get_benchmark_env(level=1)

    # RND
    rnd = RandomNetworkDistillation(
        env.observation_space,
        env.action_space,
        learning_rate=0.1,
        update_period=100,
        embedding_dim=2)

    # Test
    state = env.reset()
    for ii in range(1000):
        action = env.action_space.sample()
        next_s, reward, _, _ = env.step(action)
        rnd.update(state, action, next_s, reward)
        state = next_s
        # measure uncertainty
        _ = rnd.measure(state, action)
Example #19
def test_sac_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5

    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space,
                                  action_space,
                                  n_bins_obs=20)
        return counter

    agent = SACAgent(
        env,
        gamma=0.99,
        learning_rate=0.001,
        k_epochs=4,
        use_bonus=True,
        uncertainty_estimator_kwargs=dict(
            uncertainty_estimator_fn=uncertainty_estimator_fn,
            bonus_scale_factor=1.0),
        device="cpu",
    )
    agent.fit(budget=n_episodes)
    agent.policy(env.observation_space.sample())
Example #20
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
from rlberry.agents import MBQVIAgent
from rlberry.agents.ppo import PPOAgent
from rlberry.utils.logging import configure_logging
from rlberry.wrappers import DiscretizeStateWrapper
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies
import rlberry.seeding as seeding

configure_logging("DEBUG")

# global seed
seeding.set_global_seed(1234)

# --------------------------------
# Define train and evaluation envs
# --------------------------------
train_env = get_benchmark_env(level=5)
d_train_env = DiscretizeStateWrapper(train_env, 20)

# -----------------------------
# Parameters
# -----------------------------
N_EPISODES = 500
GAMMA = 0.99
HORIZON = 50

params_oracle = {
    "n_samples": 20,  # samples per state-action
    "gamma": GAMMA,
    "horizon": HORIZON
}
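
A sketch of how params_oracle might be used, assuming the same AgentStats pattern as in Examples #3 and #7; the n_fit value below is a hypothetical choice:

oracle_stats = AgentStats(MBQVIAgent,
                          d_train_env,
                          init_kwargs=params_oracle,
                          eval_horizon=HORIZON,
                          n_fit=1)
oracle_stats.fit()
plot_episode_rewards([oracle_stats], cumulative=True, show=True)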
Example #21
def test_ball2d_benchmark_instantiation():
    for level in [1, 2, 3, 4, 5]:
        env = get_benchmark_env(level)
        for aa in range(env.action_space.n):
            env.step(aa)
            env.sample(env.observation_space.sample(), aa)
Example #22
def test_rs_ucbvi():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(env, gamma=0.99, horizon=30, bonus_scale_factor=0.1)
    agent.fit(budget=5)
    agent.policy(env.observation_space.sample())
Example #23
from rlberry.agents.cem import CEMAgent
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
import rlberry.seeding as seeding

seeding.set_global_seed(1234)

env = get_benchmark_env(level=1)

n_episodes = 1000
horizon = 30
gamma = 0.99

params = {
    'n_episodes': n_episodes,
    'horizon': horizon,
    'gamma': gamma,
    'batch_size': 20,
    'percentile': 70,
    'learning_rate': 0.01
}

agent = CEMAgent(env, **params)
agent.fit()

env.enable_rendering()
state = env.reset()
for tt in range(4 * horizon):
    action = agent.policy(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
Example #24
from copy import deepcopy
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
from rlberry.agents.ppo import PPOAgent
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies
from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter
from rlberry.exploration_tools.discrete_counter import DiscreteCounter
from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper
import rlberry.seeding as seeding


# global seed
seeding.set_global_seed(12345)

# --------------------------------
# Define train env
# --------------------------------
env = get_benchmark_env(level=4)
eval_env = get_benchmark_env(level=4)


def uncertainty_estimator_fn(obs_space, act_space):
    counter = DiscreteCounter(obs_space,
                              act_space,
                              n_bins_obs=20)
    return counter


# -----------------------------
# Parameters
# -----------------------------
N_EPISODES = 2000
GAMMA = 0.99
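
A possible way to combine these pieces, mirroring the UncertaintyEstimatorWrapper pattern of Examples #4 and #7; the PPO hyperparameter values below are hypothetical:

env = UncertaintyEstimatorWrapper(env,
                                  uncertainty_estimator_fn,
                                  bonus_scale_factor=1.0)
ppo_params = dict(n_episodes=N_EPISODES,
                  horizon=30,
                  gamma=GAMMA,
                  learning_rate=0.0003)
ppo_stats = AgentStats(PPOAgent,
                       env,
                       init_kwargs=ppo_params,
                       eval_horizon=30,
                       n_fit=2)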