def test_bellman_operator_monotonicity_and_contraction(gamma, seed):
    env = ToyEnv1(seed)

    V0 = np.array([1.0, 100.0, 1000.0])
    V1 = np.array([2.0, 120.0, 1200.0])

    # each row of the policy must be a probability distribution over actions
    policy_array = np.array([[0.2, 0.8], [0.5, 0.5], [0.9, 0.1]])
    policy = FinitePolicy(policy_array, seed)

    dp_agent = DynProgAgent(env, gamma=gamma)

    TV0, _ = dp_agent.bellman_opt_operator(V0)
    TV1, _ = dp_agent.bellman_opt_operator(V1)

    TpiV0 = dp_agent.bellman_operator(V0, policy)
    TpiV1 = dp_agent.bellman_operator(V1, policy)

    # Test monotonicity: V0 <= V1 componentwise implies TV0 <= TV1
    assert np.greater(TV0, TV1).sum() == 0
    assert np.greater(TpiV0, TpiV1).sum() == 0

    # Test contraction: ||TV1 - TV0||_inf <= gamma * ||V1 - V0||_inf
    norm_tv = np.abs(TV1 - TV0).max()
    norm_v = np.abs(V1 - V0).max()
    assert norm_tv <= gamma * norm_v
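# For reference, the two properties exercised above can be checked directly on a
# hand-written Bellman optimality operator. This is a minimal, self-contained
# sketch: the tiny MDP (P, R) below is made up for illustration and nothing here
# relies on the rlplan API.
import numpy as np

gamma = 0.9
# P[a, s, s'] = transition probabilities, R[s, a] = rewards (3 states, 2 actions)
P = np.array([[[0.5, 0.5, 0.0],
               [0.1, 0.8, 0.1],
               [0.0, 0.2, 0.8]],
              [[0.9, 0.1, 0.0],
               [0.3, 0.3, 0.4],
               [0.2, 0.0, 0.8]]])
R = np.array([[0.0, 1.0],
              [0.5, 0.2],
              [1.0, 0.0]])

def bellman_opt(V):
    # (T V)(s) = max_a [ R(s, a) + gamma * sum_s' P(s'|s, a) V(s') ]
    Q = R + gamma * np.einsum('ast,t->sa', P, V)
    return Q.max(axis=1)

V0 = np.array([1.0, 100.0, 1000.0])
V1 = np.array([2.0, 120.0, 1200.0])
TV0, TV1 = bellman_opt(V0), bellman_opt(V1)

assert np.all(TV0 <= TV1)                                        # monotonicity
assert np.abs(TV1 - TV0).max() <= gamma * np.abs(V1 - V0).max()  # gamma-contraction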
def test_value_and_policy_iteration_gridworld(sx, sy, gamma):
    # Tolerance
    tol = 1e-8

    # Environment
    env = GridWorld(nrows=sx, ncols=sy)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')

    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e2 * tol)
def test_value_and_policy_iteration(gamma, seed, Ns, Na):
    # Tolerance
    tol = 1e-8

    # Environment
    env = ToyEnv2(Ns, Na, seed)

    dp_agent_val = DynProgAgent(env, gamma=gamma, method='value-iteration')
    dp_agent_pol = DynProgAgent(env, gamma=gamma, method='policy-iteration')

    V_value_it, _ = dp_agent_val.train(val_it_tol=tol)
    V_pol_it, _ = dp_agent_pol.train()

    assert dp_agent_val.policy == dp_agent_pol.policy
    assert np.allclose(V_value_it, V_pol_it, atol=tol, rtol=1e1 * tol)
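# The agreement asserted in the two tests above can also be seen on a tiny
# hand-made MDP with plain NumPy: value iteration and exact policy iteration
# converge to the same optimal value function. Everything below is an
# illustrative sketch, independent of the rlplan API, using the same layout
# P[a, s, s'], R[s, a] as the sketch further up.
import numpy as np

gamma, tol = 0.95, 1e-10
P = np.array([[[0.7, 0.3], [0.2, 0.8]],
              [[0.99, 0.01], [0.99, 0.01]]])
R = np.array([[0.0, 0.0],
              [1.0, 1.0]])

def greedy_q(V):
    # Q(s, a) = R(s, a) + gamma * sum_s' P(s'|s, a) V(s')
    return R + gamma * np.einsum('ast,t->sa', P, V)

# Value iteration
V_vi = np.zeros(2)
while True:
    V_new = greedy_q(V_vi).max(axis=1)
    if np.abs(V_new - V_vi).max() < tol:
        break
    V_vi = V_new

# Policy iteration (policy evaluation solved exactly as a linear system)
pi = np.zeros(2, dtype=int)
while True:
    P_pi = P[pi, np.arange(2), :]
    r_pi = R[np.arange(2), pi]
    V_pi = np.linalg.solve(np.eye(2) - gamma * P_pi, r_pi)
    pi_new = greedy_q(V_pi).argmax(axis=1)
    if np.all(pi_new == pi):
        break
    pi = pi_new

assert np.allclose(V_vi, V_pi, atol=1e-6)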
        assert nrows >= 3
        assert ncols >= 3

        # defining walls
        middle_col = ncols // 2
        middle_row = nrows // 2
        walls = ()
        for row in range(nrows):
            if row != middle_row:
                walls += ((row, middle_col),)

        super().__init__(seed_val, nrows, ncols, start_coord, terminal_states,
                         success_probability, reward_at, walls, default_reward,
                         enable_render)


if __name__ == '__main__':
    gw = TwoRoomDense(9, 9, success_probability=1.0)

    from rlplan.agents.planning import DynProgAgent
    dynprog = DynProgAgent(gw, method='policy-iteration', gamma=0.9)
    V, _ = dynprog.train()
    gw.display_values(V)

    # run
    gw.render(mode='auto', policy=dynprog.policy)

    # reset
    gw.reset()
        return self.gamma * mu + self.r / len(self.sampled_nodes)


if __name__ == '__main__':
    from rlplan.agents.planning import DynProgAgent
    from rlplan.envs.toy import ToyEnv1
    import numpy as np

    # Define parameters
    gamma = 0.1  # discount factor
    seed = 55    # random seed

    # Initialize environment
    env = ToyEnv1(seed_val=seed)

    # ----------------------------------------------------------
    # Finding the exact value function
    # ----------------------------------------------------------
    dynprog = DynProgAgent(env, method='policy-iteration', gamma=gamma)
    V, _ = dynprog.train()

    # ----------------------------------------------------------
    # TrailBlazer
    # ----------------------------------------------------------
    state = env.reset()
    tb = TrailBlazer(state, env, gamma=gamma, delta=0.1, epsilon=1.0)
    val = tb.run()

    print("Value function = ", V)
    print("TrailBlazer estimate of V[%d] = %f" % (state, val))
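    # As an extra sanity check, V[state] can also be estimated by Monte Carlo
    # rollouts of the optimal policy found by dynamic programming. This is only a
    # sketch: it assumes a gym-style env.step(action) returning
    # (next_state, reward, done, info) and that env.reset() restarts from the same
    # initial state as above, which may differ from the actual rlplan interface.
    n_rollouts, horizon = 200, 50
    returns = np.zeros(n_rollouts)
    for sim in range(n_rollouts):
        s = env.reset()  # assumed to restart from the same initial state
        for t in range(horizon):
            s, reward, done, _ = env.step(dynprog.policy.sample(s))  # signature assumed
            returns[sim] += (gamma ** t) * reward
            if done:
                break
    print("Monte Carlo estimate of V[%d] = %f" % (state, returns.mean()))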
from rlplan.agents.planning import DynProgAgent
from rlplan.agents import QLearningAgent  # assumed import path for the agent used below
from rlplan.envs.toy import ToyEnv1
from rlplan.envs.gridworld import GridWorld
from rlplan.envs import Chain
from rlplan.prediction import TabularTD

# Discount factor
gamma = 0.9

# Create environment
# env = Chain(10)
# env = ToyEnv1()
env = GridWorld(success_probability=0.9, nrows=4, ncols=4, walls=((1, 1),))

# Initialize and train dynamic programming agent
dp_agent = DynProgAgent(env, method='policy-iteration', gamma=gamma)
V_dp, _ = dp_agent.train()

# Initialize and train Q-learning agent
ql_agent = QLearningAgent(env, gamma=gamma, learning_rate=None,
                          min_learning_rate=0.1, epsilon=0.2)
training_info = ql_agent.train(n_steps=1e5)
V_ql = training_info['V']

# Use tabular TD
tab_td = TabularTD(env, dp_agent.policy, gamma,
ql_agent = QLearningUcbAgent(env, gamma=gamma, learning_rate=None,
                             min_learning_rate=0.1, c_expl=4.0)
training_info = ql_agent.train(n_steps=1000, eval_params={'n_sim': 10})

# # Visualize policy
# env.reset()
# env.render(mode='auto', policy=ql_agent.policy)
#
# # Visualize training curve
# ql_agent.plot_rewards(training_info['rewards_list'], training_info['x_data'], show=True)

dp_agent = DynProgAgent(env, gamma=gamma, method='policy-iteration')

# Draw history
# draw_grid_world_state_distribution(ql_agent.env)
action_freq = get_action_frequency(ql_agent.env)
# policy built from the empirical action frequencies observed during training
policy_freq = FinitePolicy(action_freq)

# visualize_exploration(ql_agent.env, show=False)
# env.render('manual')
# plt.show()

# Compare the learned policy with the empirical exploration policy
# env_eval.reset()
env_eval.render(policy=ql_agent.policy)
env_eval.reset()
env_eval.render(policy=policy_freq)
""" action_freq = np.zeros((env.Ns, env.Na)) H = len(env.history) for ii in range(H): state, action, reward, next_state, done = env.history[ii] action_freq[state, action] += 1.0 for state in range(env.Ns): action_freq[ state, :] = action_freq[state, :] / action_freq[state, :].sum() return action_freq if __name__ == '__main__': from rlplan.envs import GridWorld from rlplan.agents.planning import DynProgAgent env = GridWorld() dp_agent = DynProgAgent(env, method='policy-iteration', gamma=0.9) dp_agent.train() env.track = True for step in range(15): env.step(dp_agent.policy.sample(env.state)) draw_gridworld_history(env) env.render('manual')