def execute(): """Execute the learning algorithm""" s = lp.s a = lp.a alpha = lp.alpha q = lp.q v = lp.v policy = lp.policy agent.execute_action(a) time.sleep(lp.step_time) sp = agent.observe_state() r = agent.obtain_reward(s, a, sp) ap = agent.select_action(sp) # update Q delta = r + exp.GAMMA * q[sp, ap] - q[s, a] # TD error (SARSA) q[s, a] = q[s, a] + alpha * delta # update rule # Update V and Policy v[s] = np.max(q[s]) policy[s] = np.argmax(q[s]) lp.s = s lp.a = a lp.sp = sp lp.ap = ap lp.r = r lp.alpha = alpha lp.q = q lp.v = v lp.policy = policy lp.delta = delta
def execute(): """ Execute Learning Algorithm """ global eligibility, q_old assert initiated, "TOSL not initiated! setup() must be previously called" s = lp.s a = lp.a alpha = lp.alpha q = lp.q v = lp.v policy = lp.policy # Specific Learning Algorithm agent.execute_action(a) time.sleep(lp.step_time) # robot.stop() # time.sleep(TASK.STEP_TIME/2) sp = agent.observe_state() r = agent.obtain_reward(s, a, sp) ap = agent.select_action(sp) # Exploration strategy diff_q = q[s, a] - q_old q_old = q[sp, ap] delta = r + exp.GAMMA * q[sp, ap] - q[s, a] # TD error eligibility[s, a] = (1.0 - alpha) * eligibility[s, a] + 1 if eli_queue.count(s) > 0: eli_queue.remove(s) assert eli_queue.count(s) == 0, ("duplicated states found in ET ", str(s)) eli_queue.appendleft(s) for i in eli_queue: # no all states updated, just those in eli_queue # replace eli_queue by range(task.n_states) for non-reduced ET for j in range(task.n_actions): if eligibility[i, j] > 0.01: q[i, j] = q[i, j] + alpha * (delta + diff_q) * eligibility[i, j] eligibility[i, j] *= exp.GAMMA * exp.LAMBDA else: eligibility[i, j] = 0 if i == s and j == a: q[i, j] = q[i, j] - alpha * diff_q # update v and policy v[i] = np.max(q[i]) policy[i] = np.argmax(q[i]) lp.s, lp.a = s, a lp.sp, lp.ap = sp, ap lp.r = r lp.alpha = alpha lp.q = q lp.v = v lp.policy = policy lp.delta = delta
def setup(): """ Create module variables """ global step_time, step, s, sp, a, ap, r, alpha, delta, q, v, policy, q_count global t_sas, r_sas, elapsed_time, actual_step_time global final_average_reward, ave_v_step, ave_r_step, sasr_step, q_limit, s0 global initiated, initial_step_time agent.setup() step_time = task.STEP_TIME / exp.SPEED_RATE initial_step_time = step_time # only used in case we want to modify step_time from a task module # to speed up the experiment when the robot reaches the goal step = 0 # Get initial state: if exp.LEARN_FROM_MODEL: import model s = int(model.s0) else: s = agent.observe_state() s0 = s sp = -1 a = task.INITIAL_POLICY ap = -1 r = 0 alpha = exp.ALPHA delta = 0 q = np.zeros((task.n_states, task.n_actions), dtype=np.float32) v = np.zeros(task.n_states, dtype=np.float64) policy = np.full(task.n_states, task.INITIAL_POLICY, dtype=np.uint32) q_count = np.zeros((task.n_states, task.n_actions), dtype=np.uint64) if exp.USE_T_SAS_AND_R_SAS: t_sas = np.zeros((task.n_states, task.n_actions, task.n_states)) r_sas = np.zeros((task.n_states, task.n_actions, task.n_states)) elapsed_time = 0 actual_step_time = 0 final_average_reward = 0 ave_v_step = np.zeros(exp.N_STEPS) ave_r_step = np.zeros(exp.N_STEPS) sasr_step = np.zeros((exp.N_STEPS, 4)) learning_algorithm.setup() q_limit = round(max(task.REWARDS) / (1 - exp.GAMMA)) # q_limit = max(TASK.REWARDS)/(1-EXP.GAMMA) if q_limit != 100: print("q_limit = ", str(q_limit), ". Softmax regression will be normalized as q_limit = 100") time.sleep(2) initiated = True return
def execute(): """ Execute the learning algorithm """ global eligibility assert initiated, " SL not initiated! setup() must be previously called" s = lp.s a = lp.a alpha = lp.alpha q = lp.q v = lp.v policy = lp.policy # Specific Learning Algorithm agent.execute_action(a) time.sleep(lp.step_time) sp = agent.observe_state() r = agent.obtain_reward(s, a, sp) ap = agent.select_action(sp) # Exploration strategy delta = r + exp.GAMMA * q[sp, ap] - q[s, a] # TD error eligibility[s, a] = 1.0 # replace trace if eli_queue.count(s) > 0: eli_queue.remove(s) assert eli_queue.count(s) == 0, ("duplicated states found in ET: ", str(s)) eli_queue.appendleft(s) # only the states in eli_queue are updated: for i in eli_queue: # no all states updated, just those in eli_queue # replace eli_queue by range(task.n_states) for non-reduced ET for j in range(task.n_actions): if eligibility[i, j] > 0.01: q[i, j] = q[i, j] + alpha * delta * eligibility[i, j] eligibility[i, j] *= exp.GAMMA * exp.LAMBDA else: eligibility[i, j] = 0 # update v and policy v[i] = np.max(q[i]) policy[i] = np.argmax(q[i]) lp.s = s lp.a = a lp.sp = sp lp.ap = ap lp.r = r lp.alpha = alpha lp.q = q lp.v = v lp.policy = policy lp.delta = delta