Example No. 1
def test_lsvi_optimism():
    env = GridWorld(nrows=2, ncols=2, walls=())

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=250, gamma=0.99,
                         feature_map_fn=feature_map_fn,
                         horizon=3,
                         bonus_scale_factor=3,
                         reg_factor=1e-6)
    agent.fit()

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    # optimistic Q
    S = env.observation_space.n
    A = env.action_space.n
    Q_optimistic = np.zeros((S, A))
    for ss in range(S):
        Q_optimistic[ss, :] = agent._compute_q_vec(
                                    agent.w_vec[0, :],
                                    ss,
                                    agent.bonus_scale_factor)

    print(Q)
    print(Q_optimistic)
    assert (Q_optimistic - Q).min() >= -1e-5
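These test snippets omit their imports (numpy, pytest, GridWorld, LSVIUCBAgent, ValueIterationAgent, rlberry's seeding utilities) as well as the definition of OneHotFeatureMap. Below is a minimal sketch of such a one-hot feature map, assuming rlberry's FeatureMap base class (a map(observation, action) method returning a flat feature vector, plus a shape attribute); the import path and the exact class used by the original tests may differ.

import numpy as np

from rlberry.agents.features import FeatureMap  # assumed import path


class OneHotFeatureMap(FeatureMap):
    """One-hot encoding of (state, action) pairs for a tabular environment."""

    def __init__(self, S, A):
        self.S = S
        self.A = A
        self.shape = (S * A,)

    def map(self, observation, action):
        # feature vector with a single 1 at the (observation, action) entry
        feat = np.zeros((self.S, self.A))
        feat[observation, action] = 1.0
        return feat.flatten()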
Example No. 2
def test_lsvi_random_exploration():

    seeding.set_global_seed(123)

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=250,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5,
                         bonus_scale_factor=0.0)
    agent.fit()

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q-Q_est))
    # Check error
    assert np.abs(Q-Q_est).mean() < 0.1
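    # Possible follow-up diagnostic (an assumption, not part of the original
    # test): compare the greedy policies induced by the estimated and the
    # near-optimal Q tables.
    print("greedy policy (estimated):", Q_est.argmax(axis=1))
    print("greedy policy (optimal):  ", Q.argmax(axis=1))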
Example No. 3
def test_value_iteration_agent(horizon, gamma, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)
        # create env and agent
        env = FiniteMDP(R, P)
        agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon)
        # run
        agent.fit()
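        # Possible extra sanity check (an assumption, not part of the original
        # test): the fitted agent's greedy policy should return actions that
        # belong to the environment's action space.
        state = env.reset()
        assert env.action_space.contains(agent.policy(state))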
Example No. 4
"""
=======================================================
A demo of ValueIteration algorithm in Chain environment
=======================================================
 Illustration of how to set up a ValueIteration algorithm in rlberry.
 The environment chosen here is the Chain environment.

.. video:: ../../video_plot_vi.mp4
   :width: 600

"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_vi.jpg'

from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.envs.finite import Chain

env = Chain()
agent = ValueIterationAgent(env, gamma=0.95)
info = agent.fit()
print(info)

env.enable_rendering()
state = env.reset()
for tt in range(50):
    action = agent.policy(state)
    next_s, _, done, _ = env.step(action)
    if done:
        break
    state = next_s
video = env.save_video("_video/video_plot_vi.mp4")
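# Optional follow-up (an assumption, not part of the original demo): roll out
# the greedy policy once more and accumulate the reward as a quick check that
# the learned policy is sensible.
state = env.reset()
total_reward = 0.0
for tt in range(50):
    action = agent.policy(state)
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
print("cumulative reward:", total_reward)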
Example No. 5
from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom, get_nroom_state_coord
from rlberry.envs.classic_control import MountainCar
from rlberry.wrappers.vis2d import Vis2dWrapper
from rlberry.agents import RSUCBVIAgent
from rlberry.agents.dynprog import ValueIterationAgent

CHOICE = 1

if CHOICE == 0:
    env = NRoom(nrooms=5, array_observation=False, reward_free=True)
    env = Vis2dWrapper(env,
                       n_bins_obs=20,
                       memory_size=100,
                       state_preprocess_fn=get_nroom_state_coord)
    agent = ValueIterationAgent(env.unwrapped,
                                gamma=0.99,
                                horizon=200,
                                copy_env=False)

else:
    env = MountainCar()
    env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200)

    agent = RSUCBVIAgent(
        env,
        gamma=0.99,
        horizon=200,
        bonus_scale_factor=0.1,
        copy_env=False,
        min_dist=0.1,
    )
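# Continuation sketch (an assumption, not in the original snippet): train the
# agent and roll out its policy so that the Vis2dWrapper records the visited
# states for later visualization.
agent.fit()
state = env.reset()
for tt in range(200):
    action = agent.policy(state)
    state, _, done, _ = env.step(action)
    if done:
        break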
Example No. 6
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """
        Function to gather data sampling uniformly
        states and actions
        """
        N = agent.n_episodes * agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)
            # features of the sampled (state, action) pair
            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            # rank-one update of the Gram matrix
            agent.lambda_mat += outer_prod
            # Sherman-Morrison update of its inverse
            agent.lambda_mat_inv -= \
                (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            # store features of (state, action) and of (next_state, a) for all a
            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = \
                    agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env, n_episodes=100,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5)

    lsvi_debug_gather_data(agent)
    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)

    print("-------")
    print(np.abs(Q-Q_est))
    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
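The incremental inverse update in lsvi_debug_gather_data above is the Sherman-Morrison formula for a rank-one update. The short standalone check below (illustrative only, plain NumPy, not part of the original test) verifies that formula against a direct matrix inverse.

# Standalone check of the Sherman-Morrison rank-one update used above:
# (A + f f^T)^{-1} = A^{-1} - (A^{-1} f f^T A^{-1}) / (1 + f^T A^{-1} f)
rng = np.random.default_rng(0)
B = rng.normal(size=(4, 4))
A = np.eye(4) + B @ B.T           # symmetric positive definite, hence invertible
f = rng.normal(size=4)
A_inv = np.linalg.inv(A)
updated_inv = A_inv - (A_inv @ np.outer(f, f) @ A_inv) / (1 + f @ A_inv @ f)
assert np.allclose(updated_inv, np.linalg.inv(A + np.outer(f, f)))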
Example No. 7
"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_rooms.jpg'

from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom
from rlberry.agents.dynprog import ValueIterationAgent

env = NRoom(
    nrooms=9,
    remove_walls=False,
    room_size=9,
    initial_state_distribution="center",
    include_traps=True,
)
horizon = env.observation_space.n

agent = ValueIterationAgent(env, gamma=0.999, horizon=horizon)
print("fitting...")
info = agent.fit()
print(info)

env.enable_rendering()

for _ in range(10):
    state = env.reset()
    for tt in range(horizon):
        # random actions for the rendered rollout; uncomment the next line to
        # follow the fitted agent's policy instead
        # action = agent.policy(state)
        action = env.action_space.sample()
        next_s, _, done, _ = env.step(action)
        if done:
            break
        state = next_s
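# The other rendering demos in this collection end by saving the recorded
# episodes; an analogous closing line (assumed here, mirroring Example No. 4)
# would be:
video = env.save_video("_video/video_plot_rooms.mp4")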