def sarsa(env, learn_st, test_st, learning=0.5, discount=0.99, back=False):
    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1, size=(11, 11, env.action_space.n))
    learning_b = 0.05
    discount_b = 0.99

    # Initialize variables to track results
    learn_len = 0
    learn_cumr = 0
    tot_episodes = 0

    # Decrease temp every episode
    temp = TEMP
    decay = (TEMP - TEMP_MIN) / 5000
    reached = 0

    # Run SARSA algorithm: Learning phase
    while reached < 300:
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        x = np.random.choice(300)
        pos, vel = learn_st[x]
        state = env.reset(pos, vel)
        M = list()
        steps = 0

        # Discretize state
        state_adj = discretize(state)
        state_tup = (state_adj[0], state_adj[1])

        # Determine action - Boltzmann policy
        action = boltz_policy(state_adj[0], state_adj[1], Q, temp)

        while done != True:
            # Get next state and reward
            state2, reward, done, info = env.step(action)
            steps += 1

            # Discretize state2
            state2_adj = discretize(state2)
            state2_tup = (state2_adj[0], state2_adj[1])

            # Determine action in state2 - Boltzmann policy
            action2 = boltz_policy(state2_adj[0], state2_adj[1], Q, temp)

            # Store s, a, r, s' in M
            if back:
                M.append((state_tup, action, reward, state2_tup))

            if done and reward == 200:
                reached += 1
            # Adjust Q value for current state
            else:
                delta = learning * (reward + discount * Q[state2_adj[0], state2_adj[1], action2]
                                    - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj
            state_tup = state2_tup
            action = action2

        # Backward replay of the stored transitions (skipped for very poor episodes)
        while back and tot_reward > -2500 and len(M) > 0:
            state1, action, reward, state2 = M.pop()
            state11, state12 = state1
            state21, state22 = state2
            Q[state11, state12, action] += learning_b * (reward + discount_b * max(Q[state21, state22])
                                                         - Q[state11, state12, action])
        if back:
            M.clear()

        if temp > TEMP_MIN:
            temp -= decay
        tot_episodes += 1

        # Track mean length of episodes and mean cumulative reward
        learn_len = (learn_len * (tot_episodes - 1) + steps) / tot_episodes
        learn_cumr = (learn_cumr * (tot_episodes - 1) + tot_reward) / tot_episodes

    # Initialize variables to track results of the testing phase
    test_len = 0
    test_cumr = 0
    tot_episodes = 0
    reached = 0

    # Run SARSA algorithm: Testing phase
    while reached < 40:
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        pos, vel = test_st[reached]
        state = env.reset(pos, vel)
        M = list()
        steps = 0

        # Discretize state
        state_adj = discretize(state)
        state_tup = (state_adj[0], state_adj[1])

        # Determine action - Boltzmann policy
        action = boltz_policy(state_adj[0], state_adj[1], Q, temp)

        while done != True:
            # Get next state and reward
            state2, reward, done, info = env.step(action)
            steps += 1

            # Discretize state2
            state2_adj = discretize(state2)
            state2_tup = (state2_adj[0], state2_adj[1])

            # Determine action in state2 - Boltzmann policy
            action2 = boltz_policy(state2_adj[0], state2_adj[1], Q, temp)

            # Store s, a, r, s' in M
            if back:
                M.append((state_tup, action, reward, state2_tup))

            if done:
                reached += 1
            # Adjust Q value for current state
            else:
                delta = learning * (reward + discount * Q[state2_adj[0], state2_adj[1], action2]
                                    - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj
            state_tup = state2_tup
            action = action2

        # Backward replay of the stored transitions (skipped for very poor episodes)
        while back and tot_reward > -2500 and len(M) > 0:
            state1, action, reward, state2 = M.pop()
            state11, state12 = state1
            state21, state22 = state2
            Q[state11, state12, action] += learning_b * (reward + discount_b * max(Q[state21, state22])
                                                         - Q[state11, state12, action])
        if back:
            M.clear()

        if temp > TEMP_MIN:
            temp -= decay
        tot_episodes += 1

        # Track mean length of episodes and mean cumulative reward
        test_len = (test_len * (tot_episodes - 1) + steps) / tot_episodes
        test_cumr = (test_cumr * (tot_episodes - 1) + tot_reward) / tot_episodes

    env.close()
    return learn_len, learn_cumr, test_len, test_cumr
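

# Illustrative sketch: sarsa(), esl() and adaptive() rely on two helpers,
# discretize() and boltz_policy(), that are defined elsewhere in this code base.
# The functions below only illustrate what they are assumed to do: map a continuous
# MountainCar (position, velocity) state onto an 11x11 grid, and sample an action
# from a Boltzmann (softmax) distribution over the Q-values of that cell. The bin
# edges and the *_sketch names are assumptions, not the original implementation.

import numpy as np

POS_BINS = np.linspace(-1.2, 0.6, 11)    # assumed MountainCar position range
VEL_BINS = np.linspace(-0.07, 0.07, 11)  # assumed MountainCar velocity range


def discretize_sketch(state):
    """Map a continuous (position, velocity) state to indices of an 11x11 grid."""
    pos_idx = int(np.clip(np.digitize(state[0], POS_BINS) - 1, 0, 10))
    vel_idx = int(np.clip(np.digitize(state[1], VEL_BINS) - 1, 0, 10))
    return np.array([pos_idx, vel_idx])


def boltz_policy_sketch(pos_idx, vel_idx, Q, temp):
    """Sample an action with probability proportional to exp(Q / temp)."""
    q = Q[pos_idx, vel_idx]
    prefs = (q - np.max(q)) / temp                # subtract the max for numerical stability
    probs = np.exp(prefs) / np.sum(np.exp(prefs))
    return int(np.random.choice(len(q), p=probs))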
def esl(env, learn_st, test_st, alpha=0.9, discount=0.99, back=False):
    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1, size=(11, 11, env.action_space.n))
    learning_b = 0.05
    discount_b = 0.99
    visited = dict()
    exp_weight = 0.7
    a_c = 10
    a_max = alpha

    # Thresholds for q~ and exp
    q_thresh = 0.1
    exp_thresh = -4.5

    # Initialize variables to track rewards
    learn_len = 0
    learn_cumr = 0
    tot_episodes = 0
    reached = 0

    # Run Enhanced SARSA algorithm: Learning phase
    while reached < 300:
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        steps = 0
        x = np.random.choice(300)
        pos, vel = learn_st[x]
        state = env.reset(pos, vel)
        exp = 0.0
        M = list()
        temp = pow(10, HIGH)

        # Discretize state
        state_adj = discretize(state)
        state_tup = (state_adj[0], state_adj[1])

        # Determine the action - Boltzmann policy
        action = boltz_policy(state_adj[0], state_adj[1], Q, temp)

        while done != True:
            # Keep track of visiting number
            if state_tup in visited.keys():
                visited[state_tup] += 1
            else:
                visited[state_tup] = 1
            if steps == 0:
                state_prev = state_adj

            # Compute q~, deltaV and exploration degree
            q_th = max(Q[state_adj[0], state_adj[1]]) - min(Q[state_adj[0], state_adj[1]])
            delta_v = max(Q[state_adj[0], state_adj[1]]) - max(Q[state_prev[0], state_prev[1]])
            exp = exp_weight * exp + (1 - exp_weight) * math.log(temp, 10)

            # Compute temp using fuzzy balancer
            if q_th < q_thresh and delta_v < 0:
                temp = pow(10, LOW)
            if q_th < q_thresh and delta_v > 0 and exp < exp_thresh:
                temp = pow(10, VLOW)
            if q_th < q_thresh and delta_v > 0 and exp > exp_thresh:
                temp = pow(10, LOW)
            if q_th > q_thresh and delta_v < 0:
                temp = pow(10, LOW)
            if q_th > q_thresh and delta_v > 0:
                temp = pow(10, LOW)

            # Get next state and reward
            state2, reward, done, info = env.step(action)
            steps += 1

            # Discretize state2
            state2_adj = discretize(state2)
            state2_tup = (state2_adj[0], state2_adj[1])

            # Determine next action - Boltzmann policy
            action2 = boltz_policy(state2_adj[0], state2_adj[1], Q, temp)

            # Store s, a, r, s' in M
            if back:
                M.append((state_tup, action, reward, state2_tup))

            if done and reward == 200:
                reached += 1
            # Adjust Q value for current state
            else:
                # Compute adaptive learning rate
                alpha = min(a_c / visited[state_tup], a_max)
                delta = alpha * (reward + discount * Q[state2_adj[0], state2_adj[1], action2]
                                 - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj
            state_tup = state2_tup
            action = action2

        # Backward replay of the stored transitions (skipped for very poor episodes)
        while back and tot_reward > -2500 and len(M) > 0:
            state1, action, reward, state2 = M.pop()
            state11, state12 = state1
            state21, state22 = state2
            Q[state11, state12, action] += learning_b * (reward + discount_b * max(Q[state21, state22])
                                                         - Q[state11, state12, action])
        if back:
            M.clear()
        tot_episodes += 1

        # Track mean length of episodes and mean cumulative reward
        learn_len = (learn_len * (tot_episodes - 1) + steps) / tot_episodes
        learn_cumr = (learn_cumr * (tot_episodes - 1) + tot_reward) / tot_episodes

    # Initialize variables to track results of the testing phase
    test_len = 0
    test_cumr = 0
    tot_episodes = 0
    reached = 0

    # Run Enhanced SARSA algorithm: Testing phase
    while reached < 40:
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        steps = 0
        pos, vel = test_st[reached]
        state = env.reset(pos, vel)
        exp = 0.0
        M = list()
        temp = pow(10, HIGH)

        # Discretize state
        state_adj = discretize(state)
        state_tup = (state_adj[0], state_adj[1])

        # Determine the action - Boltzmann policy
        action = boltz_policy(state_adj[0], state_adj[1], Q, temp)

        while done != True:
            # Keep track of visiting number
            if state_tup in visited.keys():
                visited[state_tup] += 1
            else:
                visited[state_tup] = 1
            if steps == 0:
                state_prev = state_adj

            # Compute q~, deltaV and exploration degree
            q_th = max(Q[state_adj[0], state_adj[1]]) - min(Q[state_adj[0], state_adj[1]])
            delta_v = max(Q[state_adj[0], state_adj[1]]) - max(Q[state_prev[0], state_prev[1]])
            exp = exp_weight * exp + (1 - exp_weight) * math.log(temp, 10)

            # Compute temp using fuzzy balancer
            if q_th < q_thresh and delta_v < 0:
                temp = pow(10, LOW)
            if q_th < q_thresh and delta_v > 0 and exp < exp_thresh:
                temp = pow(10, VLOW)
            if q_th < q_thresh and delta_v > 0 and exp > exp_thresh:
                temp = pow(10, LOW)
            if q_th > q_thresh and delta_v < 0:
                temp = pow(10, LOW)
            if q_th > q_thresh and delta_v > 0:
                temp = pow(10, LOW)

            # Get next state and reward
            state2, reward, done, info = env.step(action)
            steps += 1

            # Discretize state2
            state2_adj = discretize(state2)
            state2_tup = (state2_adj[0], state2_adj[1])

            # Determine next action - Boltzmann policy
            action2 = boltz_policy(state2_adj[0], state2_adj[1], Q, temp)

            # Store s, a, r, s' in M
            if back:
                M.append((state_tup, action, reward, state2_tup))

            if done:
                reached += 1
            # Adjust Q value for current state
            else:
                # Compute adaptive learning rate
                alpha = min(a_c / visited[state_tup], a_max)
                delta = alpha * (reward + discount * Q[state2_adj[0], state2_adj[1], action2]
                                 - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj
            state_tup = state2_tup
            action = action2

        # Backward replay of the stored transitions (skipped for very poor episodes)
        while back and tot_reward > -2500 and len(M) > 0:
            state1, action, reward, state2 = M.pop()
            state11, state12 = state1
            state21, state22 = state2
            Q[state11, state12, action] += learning_b * (reward + discount_b * max(Q[state21, state22])
                                                         - Q[state11, state12, action])
        if back:
            M.clear()
        tot_episodes += 1

        # Track mean length of episodes and mean cumulative reward
        test_len = (test_len * (tot_episodes - 1) + steps) / tot_episodes
        test_cumr = (test_cumr * (tot_episodes - 1) + tot_reward) / tot_episodes

    env.close()
    return learn_len, learn_cumr, test_len, test_cumr
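

# The fuzzy balancer inside esl() reduces to a small rule table: the temperature
# drops to 10**VLOW only when the Q-values of the current cell are nearly flat
# (q_th below its threshold), the state value has improved (delta_v > 0) and the
# recent exploration degree is already low (exp below its threshold); every other
# firing rule selects 10**LOW, and borderline cases keep the current temperature.
# The standalone restatement below is a sketch for illustration only; the numeric
# values assumed for the LOW and VLOW levels are not taken from the original code.

LOW_SKETCH = -1.0    # assumed log10 temperature level for LOW
VLOW_SKETCH = -2.0   # assumed log10 temperature level for VLOW


def fuzzy_temperature_sketch(q_th, delta_v, exp, temp, q_thresh=0.1, exp_thresh=-4.5):
    """Return the next Boltzmann temperature from the Q-value spread (q_th),
    the change in state value (delta_v) and the exploration degree (exp)."""
    if q_th < q_thresh and delta_v > 0:
        if exp < exp_thresh:
            return pow(10, VLOW_SKETCH)    # exploit: values settled and exploration already low
        if exp > exp_thresh:
            return pow(10, LOW_SKETCH)
        return temp                        # exp exactly at the threshold: no rule fires
    if q_th != q_thresh and delta_v != 0:
        return pow(10, LOW_SKETCH)         # all remaining rules map to the LOW level
    return temp                            # borderline cases leave the temperature unchanged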
def adaptive(env, learn_st, test_st, learning=0.5, discount=0.99, temp=TEMP, back=False):
    # Initialize Q table
    Q = np.random.uniform(low=-1, high=1, size=(11, 11, env.action_space.n))
    learning_b = 0.05
    discount_b = 0.99
    th_p = 100
    th_n = -100
    c = 50
    beta = 2
    k = 1.15
    discount_init = discount
    learning_init = learning

    # Initialize variables to track results
    learn_len = 0
    learn_cumr = 0
    tot_episodes = 0
    reached = 0

    # Run adaptive Q-learning algorithm: Learning phase
    while reached < 300:
        # Initialize parameters
        done = False
        tot_reward, reward = 0, 0
        x = np.random.choice(300)
        pos, vel = learn_st[x]
        state = env.reset(pos, vel)
        steps = 0
        M = list()

        # Discretize state
        state_adj = discretize(state)
        state_tup = (state_adj[0], state_adj[1])

        while done != True:
            # Determine next action - Boltzmann policy
            action = boltz_policy(state_adj[0], state_adj[1], Q, temp)

            # Get next state and reward
            state2, reward, done, info = env.step(action)
            steps += 1

            # Discretize state2
            state2_adj = discretize(state2)
            state2_tup = (state2_adj[0], state2_adj[1])

            # Store s, a, r, s' in M
            if back:
                M.append((state_tup, action, reward, state2_tup))

            if done and reward == 200:
                reached += 1
                # Adapt discount, learning rate and temperature at the end of a successful episode
                max_next = max(Q[state2_adj[0], state2_adj[1]])
                if ((th_n - k * steps) < max_next) and ((th_p + k * steps) > max_next):
                    discount = discount_init
                    learning = learning_init
                else:
                    discount = np.tanh(steps * discount)
                    # delta = reward + discount*max_next - Q[state_adj[0], state_adj[1], action]
                    learning = np.tanh(beta * abs(delta) / steps)
                    temp = c / steps
            # Adjust Q value for current state
            else:
                delta = learning * (reward + discount * np.max(Q[state2_adj[0], state2_adj[1]])
                                    - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj
            state_tup = state2_tup

        # Backward replay of the stored transitions (skipped for very poor episodes)
        while back and tot_reward > -2500 and len(M) > 0:
            state1, action, reward, state2 = M.pop()
            state11, state12 = state1
            state21, state22 = state2
            Q[state11, state12, action] += learning_b * (reward + discount_b * max(Q[state21, state22])
                                                         - Q[state11, state12, action])
        if back:
            M.clear()
        tot_episodes += 1

        # Track mean length of episodes and mean cumulative reward
        learn_len = (learn_len * (tot_episodes - 1) + steps) / tot_episodes
        learn_cumr = (learn_cumr * (tot_episodes - 1) + tot_reward) / tot_episodes
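

# A hypothetical driver showing how these routines could be exercised end to end.
# The MountainCarModified wrapper (whose reset(pos, vel) accepts a start state, as
# the functions above require) and sample_start_states_sketch() are assumptions
# introduced only for illustration; they do not come from the original code.

import numpy as np


def sample_start_states_sketch(n, seed=0):
    """Draw n (position, velocity) start states from the default MountainCar band."""
    rng = np.random.default_rng(seed)
    positions = rng.uniform(-0.6, -0.4, size=n)   # assumed start-position band
    velocities = np.zeros(n)
    return list(zip(positions, velocities))


# Example usage (assuming MountainCarModified implements reset(pos, vel)):
#
#     env = MountainCarModified()
#     learn_st = sample_start_states_sketch(300)
#     test_st = sample_start_states_sketch(40, seed=1)
#     learn_len, learn_cumr, test_len, test_cumr = sarsa(env, learn_st, test_st, back=True)
#     print(learn_len, learn_cumr, test_len, test_cumr)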