@author: momos_000
"""
# using tabular Q-Learning to learn a policy over options in RoomWorld
# referring to U.C. Berkeley DeepRL Bootcamp materials
# Nov 15: HRL without planning, with interruption

import time
import numpy as np
from room_world import RoomWorld, SmdpAgent_Q
import learning_test_utilities as util

# setup
env = RoomWorld()
state_space = env.state_space
num_actions = env.action_space.size
q_func = util.QTable(state_space, num_actions)
# as "goto hallway" options
options = util.create_hallway_options(env)
agent_smdp = SmdpAgent_Q(env, q_func, options)

# training
max_options = 200
iterations, epsilon, gamma, alpha = util.learning_parameters()
iterations = 100
# alpha = 1./16.  # overwrite to match Sutton
report_freq = iterations // 100  # integer division: report progress every 1% of training
hist = np.zeros((iterations, 7))  # columns: training step, avg_td, avg_ret, avg_greedy_ret,
                                  # avg_greedy_successrate, avg_greedy_steps, avg_greedy_choices
start_time = time.time()
for itr in range(iterations):
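
# ---------------------------------------------------------------------------
# A minimal sketch of the SMDP Q-learning backup that an agent like
# SmdpAgent_Q is presumed to perform each time an option terminates. The
# helper names below (execute_option, smdp_q_update) are illustrative and
# not part of room_world; the update itself is the standard one for options:
#     Q(s, o) <- Q(s, o) + alpha * (R + gamma**k * max_o' Q(s', o') - Q(s, o))
# where R is the reward discounted within the option over its k steps.
# ---------------------------------------------------------------------------
def execute_option(env, state, option, gamma):
    """Run one option until it terminates; return (R, next_state, k, done).

    R accumulates the within-option discounted reward sum_t gamma**t * r_t.
    Assumes (hypothetically) that env.step(action) returns
    (next_state, reward, done) and that options expose policy/terminates.
    """
    R, k, done = 0.0, 0, False
    while not done and not option.terminates(state):
        action = option.policy(state)
        state, reward, done = env.step(action)
        R += (gamma ** k) * reward
        k += 1
    return R, state, k, done


def smdp_q_update(q_table, state, option_idx, R, next_state, k, gamma, alpha):
    """One tabular SMDP backup on a plain 2-D array; returns the TD error."""
    target = R + (gamma ** k) * np.max(q_table[next_state])
    td_error = target - q_table[state, option_idx]
    q_table[state, option_idx] += alpha * td_error
    return td_error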
# November 2, 2017. Modified to replan if the goal is not reached
# using tabular Q-Learning to learn a policy over option plans in RoomWorld
# referring to U.C. Berkeley DeepRL Bootcamp materials

import time
import numpy as np
from room_world import RoomWorld, SmdpPlanningAgent_Q
import learning_test_utilities as util

# setup
env = RoomWorld()
state_space = env.state_space
num_actions = env.action_space.size
plan_length = 2
q_func = util.QTable(state_space, num_actions**plan_length)
# as "goto hallway" options
options = util.create_hallway_options(env)
agent_plan = SmdpPlanningAgent_Q(env, q_func, options, plan_length=plan_length)

# training
iterations, epsilon, gamma, alpha = util.learning_parameters()
max_plans = 100
# alpha = 1./16.  # overwrite to match Sutton
report_freq = iterations // 50  # integer division: report progress every 2% of training
hist = np.zeros((iterations, 7))  # columns: training step, avg_td, avg_ret, avg_greedy_ret,
                                  # avg_greedy_successrate, avg_greedy_steps, avg_greedy_choices
start_time = time.time()
for itr in range(iterations):
    cur_state = env.reset(random_placement=True)
    done = [False]
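
# ---------------------------------------------------------------------------
# Why the Q-table above has num_actions**plan_length columns: the planning
# agent scores whole length-plan_length sequences of choices, so each plan
# can be flattened to a single column index by reading it as a
# base-num_actions number. A minimal sketch of that assumed encoding;
# encode_plan/decode_plan are illustrative helpers, not part of room_world:
# ---------------------------------------------------------------------------
def encode_plan(choices, num_actions):
    """Map a sequence of per-step choice indices, e.g. [2, 0], to one column."""
    idx = 0
    for c in choices:
        idx = idx * num_actions + c
    return idx


def decode_plan(idx, num_actions, plan_length):
    """Inverse of encode_plan: recover the choice sequence from a column."""
    plan = []
    for _ in range(plan_length):
        plan.append(idx % num_actions)
        idx //= num_actions
    return plan[::-1]

# e.g. with num_actions = 4 and plan_length = 2:
# encode_plan([2, 0], 4) == 8 and decode_plan(8, 4, 2) == [2, 0]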