def test_lsvi_optimism():
    env = GridWorld(nrows=2, ncols=2, walls=())

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env,
                         n_episodes=250,
                         gamma=0.99,
                         feature_map_fn=feature_map_fn,
                         horizon=3,
                         bonus_scale_factor=3,
                         reg_factor=0.000001)
    agent.fit()

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=3)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    # optimistic Q
    S = env.observation_space.n
    A = env.action_space.n
    Q_optimistic = np.zeros((S, A))
    for ss in range(S):
        Q_optimistic[ss, :] = agent._compute_q_vec(
            agent.w_vec[0, :], ss, agent.bonus_scale_factor)

    print(Q)
    print(Q_optimistic)
    assert (Q_optimistic - Q).min() >= -1e-5
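The assertion checks the optimism property that LSVI-UCB is designed to satisfy: with a large enough bonus, the estimated Q-function should upper-bound the optimal one at every state-action pair, up to the 1e-5 numerical tolerance. As a reference point, in the textbook construction of the algorithm (a standard form, not necessarily the exact expression implemented by `_compute_q_vec`) the optimistic estimate at stage h is

\[
\widetilde{Q}_h(s, a) = w_h^{\top} \varphi(s, a)
    + \beta \sqrt{\varphi(s, a)^{\top} \Lambda_h^{-1} \varphi(s, a)},
\qquad
\widetilde{Q}_h(s, a) \ge Q^{*}_h(s, a) \quad \text{with high probability.}
\]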
def test_lsvi_random_exploration():
    seeding.set_global_seed(123)
    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env,
                         n_episodes=250,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5,
                         bonus_scale_factor=0.0)
    agent.fit()

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert np.abs(Q - Q_est).mean() < 0.1
def test_value_iteration_agent(horizon, gamma, S, A):
    for sim in range(5):
        # generate random MDP
        R, P = get_random_mdp(S, A)
        # create env and agent
        env = FiniteMDP(R, P)
        agent = ValueIterationAgent(env, gamma=gamma, horizon=horizon)
        # run
        agent.fit()
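This test takes `horizon`, `gamma`, `S`, and `A` from a pytest parametrization that falls outside this excerpt. A minimal sketch of the expected driver, with hypothetical parameter values, would look like:

import pytest

# Hypothetical parameter grid; the values actually used by the test
# suite are not shown in this excerpt.
@pytest.mark.parametrize(
    "horizon, gamma, S, A",
    [(10, 0.95, 5, 2), (20, 0.99, 8, 3)],
)
def test_value_iteration_agent(horizon, gamma, S, A):
    ...  # body as defined above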
""" ======================================================= A demo of ValueIteration algorithm in Chain environment ======================================================= Illustration of how to set up an ValueIteration algorithm in rlberry. The environment chosen here is Chain environment. .. video:: ../../video_plot_vi.mp4 :width: 600 """ # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_vi.jpg' from rlberry.agents.dynprog import ValueIterationAgent from rlberry.envs.finite import Chain env = Chain() agent = ValueIterationAgent(env, gamma=0.95) info = agent.fit() print(info) env.enable_rendering() state = env.reset() for tt in range(50): action = agent.policy(state) next_s, _, done, _ = env.step(action) if done: break state = next_s video = env.save_video("_video/video_plot_vi.mp4")
from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom, get_nroom_state_coord
from rlberry.envs.classic_control import MountainCar
from rlberry.wrappers.vis2d import Vis2dWrapper
from rlberry.agents import RSUCBVIAgent
from rlberry.agents.dynprog import ValueIterationAgent

CHOICE = 1

if CHOICE == 0:
    env = NRoom(nrooms=5, array_observation=False, reward_free=True)
    env = Vis2dWrapper(env, n_bins_obs=20, memory_size=100,
                       state_preprocess_fn=get_nroom_state_coord)
    agent = ValueIterationAgent(env.unwrapped, gamma=0.99, horizon=200,
                                copy_env=False)
else:
    env = MountainCar()
    env = Vis2dWrapper(env, n_bins_obs=20, memory_size=200)
    agent = RSUCBVIAgent(
        env,
        gamma=0.99,
        horizon=200,
        bonus_scale_factor=0.1,
        copy_env=False,
        min_dist=0.1,
    )
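The snippet above only builds the environment, wrapper, and agent. A minimal continuation, assuming the same old-style `reset`/`step` API used by the other demos here and that `fit()` accepts no arguments in this rlberry version, would train the agent and roll out its policy so that the `Vis2dWrapper` memory gets filled:

agent.fit()  # some rlberry versions expect a budget here, e.g. agent.fit(budget=500)

# Roll out the learned policy; the Vis2dWrapper records the visited
# states as the wrapped environment is stepped.
for _ in range(5):
    state = env.reset()
    for tt in range(200):
        action = agent.policy(state)
        next_s, _, done, _ = env.step(action)
        if done:
            break
        state = next_s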
def test_lsvi_without_bonus():
    seeding.set_global_seed(123)

    def lsvi_debug_gather_data(agent):
        """
        Gather data by sampling states and actions uniformly.
        """
        N = agent.n_episodes * agent.horizon
        count = 0
        while count < N:
            state = agent.env.observation_space.sample()
            action = agent.env.action_space.sample()
            next_state, reward, done, info = agent.env.sample(state, action)

            # features of the sampled state-action pair
            feat = agent.feature_map.map(state, action)
            outer_prod = np.outer(feat, feat)
            inv = agent.lambda_mat_inv

            # update Gram matrix
            agent.lambda_mat += np.outer(feat, feat)
            # update inverse (rank-one correction)
            agent.lambda_mat_inv -= \
                (inv @ outer_prod @ inv) / (1 + feat @ inv.T @ feat)

            # update history
            agent.reward_hist[count] = reward
            agent.state_hist.append(state)
            agent.action_hist.append(action)
            agent.nstate_hist.append(next_state)

            # store the features needed for the regression step
            tt = agent.total_time_steps
            agent.feat_hist[tt, :] = agent.feature_map.map(state, action)
            for aa in range(agent.env.action_space.n):
                agent.feat_ns_all_actions[tt, aa, :] = \
                    agent.feature_map.map(next_state, aa)

            # increments
            agent.total_time_steps += 1
            count += 1

    env = GridWorld(nrows=2, ncols=2, walls=(), success_probability=0.95)

    def feature_map_fn(_env):
        return OneHotFeatureMap(_env.observation_space.n, _env.action_space.n)

    agent = LSVIUCBAgent(env,
                         n_episodes=100,
                         feature_map_fn=feature_map_fn,
                         horizon=20,
                         gamma=0.99,
                         reg_factor=1e-5)

    lsvi_debug_gather_data(agent)

    # estimated Q
    S = env.observation_space.n
    Q_est = agent._run_lsvi(bonus_factor=0.0)[0, :].reshape((S, -1))

    # near optimal Q
    agent_opt = ValueIterationAgent(env, gamma=0.99, horizon=20)
    agent_opt.fit()
    Q = agent_opt.Q[0, :, :]

    print(Q)
    print("---")
    print(Q_est)
    print("-------")
    print(np.abs(Q - Q_est))

    # Check error
    assert Q_est == pytest.approx(Q, rel=0.01)
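The inverse update inside `lsvi_debug_gather_data` is a rank-one Sherman-Morrison correction: writing \(\varphi\) for the feature vector `feat` and \(\Lambda^{-1}\) for `agent.lambda_mat_inv`, it applies

\[
\left(\Lambda + \varphi \varphi^{\top}\right)^{-1}
  = \Lambda^{-1}
  - \frac{\Lambda^{-1} \varphi \varphi^{\top} \Lambda^{-1}}
         {1 + \varphi^{\top} \Lambda^{-1} \varphi},
\]

which avoids re-inverting the regularized Gram matrix after every sampled transition.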
""" # sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_rooms.jpg' from rlberry.envs.benchmarks.grid_exploration.nroom import NRoom from rlberry.agents.dynprog import ValueIterationAgent env = NRoom( nrooms=9, remove_walls=False, room_size=9, initial_state_distribution="center", include_traps=True, ) horizon = env.observation_space.n agent = ValueIterationAgent(env, gamma=0.999, horizon=horizon) print("fitting...") info = agent.fit() print(info) env.enable_rendering() for _ in range(10): state = env.reset() for tt in range(horizon): # action = agent.policy(state) action = env.action_space.sample() next_s, _, done, _ = env.step(action) if done: break state = next_s