예제 #1
0
@author: momos_000
"""
# using tabular Q-Learning to learn a policy over options in RoomWorld
# referring to U.C. Berkeley DeepRL Bootcamp materials
# Nov 15: HRL without planning, with interruption

import time
import numpy as np
from room_world import RoomWorld, SmdpAgent_Q
import learning_test_utilities as util

# Setup: build the environment, the tabular Q-function, the option set and
# the SMDP Q-learning agent that ties them together.
env = RoomWorld()
state_space = env.state_space
num_actions = env.action_space.size
# The table is sized by the primitive action count, but per the original note
# its columns are meant to stand for the "goto hallway" options.
# NOTE(review): presumably the number of options equals num_actions -- confirm
# against util.create_hallway_options.
q_func = util.QTable(state_space, num_actions)
options = util.create_hallway_options(env)
agent_smdp = SmdpAgent_Q(env, q_func, options)

# Training configuration.
# NOTE(review): max_options is not used in the visible part of the script;
# presumably a cap on option choices per episode -- confirm in the loop body.
max_options = 200
iterations, epsilon, gamma, alpha = util.learning_parameters()
# Deliberately overrides the iteration count returned by learning_parameters().
iterations = 100
# Disabled override kept for reference:
#alpha       = 1./16. # overwrite to match Sutton
# NOTE(review): "/" is true division under Python 3, so report_freq is the
# float 1.0 here -- confirm the loop body does not need an int.
report_freq = iterations / 100
# Training history: one row per iteration, seven columns, in order:
#   training step, avg_td, avg_ret, avg_greedy_ret,
#   avg_greedy_successrate, avg_greedy_steps, avg_greedy_choices
hist = np.zeros(
    (iterations, 7)
)  # shape: (iterations, 7)
# Wall-clock reference point taken just before the training loop starts.
start_time = time.time()

for itr in range(iterations):
예제 #2
0
# November 2, 2017. Modified to replan if the goal is not reached
# using tabular Q-Learning to learn a flat policy in RoomWorld
# referring to U.C. Berkeley DeepRL Bootcamp materials

import time
import numpy as np
from room_world import RoomWorld, SmdpPlanningAgent_Q
import learning_test_utilities as util

# Setup: build the environment, the tabular Q-function, the option set and
# the SMDP planning agent that ties them together.
env = RoomWorld()
state_space = env.state_space
num_actions = env.action_space.size
# Number of choices chained into one plan; also passed to the agent below.
plan_length = 2
# The table has num_actions**plan_length columns.
# NOTE(review): presumably one column per ordered sequence of plan_length
# "goto hallway" options -- confirm against SmdpPlanningAgent_Q.
q_func = util.QTable(state_space,
                     num_actions**plan_length)
options = util.create_hallway_options(env)
agent_plan = SmdpPlanningAgent_Q(env, q_func, options, plan_length=plan_length)
# Training configuration: all four hyperparameters come from the shared
# utility defaults and are used unmodified here.
iterations, epsilon, gamma, alpha = util.learning_parameters()
# NOTE(review): max_plans is not used in the visible part of the script;
# presumably a cap on plans per episode -- confirm in the loop body.
max_plans = 100
# Disabled override kept for reference:
#alpha       = 1./16. # overwrite to match Sutton
# NOTE(review): "/" is true division under Python 3, so report_freq is a
# float -- confirm the loop body does not need an int.
report_freq = iterations / 50
# Training history: one row per iteration, seven columns, in order:
#   training step, avg_td, avg_ret, avg_greedy_ret,
#   avg_greedy_successrate, avg_greedy_steps, avg_greedy_choices
hist = np.zeros(
    (iterations, 7)
)  # shape: (iterations, 7)
# Wall-clock reference point taken just before the training loop starts.
start_time = time.time()

for itr in range(iterations):
    cur_state = env.reset(random_placement=True)
    done = [False]