alpha = ALPHA / t2
# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# get Q(s) so we can choose the first action
Qs = getQs(model, s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Qs)[0]
a = random_action(a, eps=0.5/t)  # epsilon-greedy
biggest_change = 0
while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    # if s2 not in policy then it's a terminal state, all Q are 0
    old_theta = model.theta.copy()
    if grid.is_terminal(s2):
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
    else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
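# The getQs helper used above is not shown in this excerpt. Based on how it is
# called, a minimal sketch (an assumption, not the original helper) would
# evaluate the function approximator at state s for every action and return a
# dict we can argmax over:
def getQs(model, s):
    Qs = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Qs[a] = model.predict(s, a)
    return Qs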
if it % 100 == 0:
    t += 1e-2
if it % 2000 == 0:
    print("it:", it)

# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Q[s])[0]
biggest_change = 0
while not grid.is_game_over():
    a = random_action(a, eps=0.5 / t)  # epsilon-greedy
    r = grid.move(a)
    s2 = grid.current_state()

    # we will update Q(s,a) AS we experience the episode
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005

    old_qsa = Q[s][a]
    a2, max_q_s2a2 = max_dict(Q[s2])
    Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
    biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))
if it % 100 == 0:
    t += 1e-2
if it % 2000 == 0:
    print("it:", it)

# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a, _ = max_dict(Q[s])
biggest_change = 0
while not grid.game_over():
    a = random_action(a, eps=0.5/t)  # epsilon-greedy
    # random action also works, but slower since you can bump into walls
    # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
    r = grid.move(a)
    s2 = grid.current_state()

    # adaptive learning rate
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005

    # we will update Q(s,a) AS we experience the episode
    old_qsa = Q[s][a]
    # the difference between SARSA and Q-Learning is with Q-Learning
    # we will use max[a']{ Q(s',a') } in our update,
    # even if we do not end up taking this action in the next step
    states_actions_returns = play_game(grid, policy)
    seen_state_action_pairs = set()
    for state, action, G in states_actions_returns:
        # first-visit MC: only the first occurrence of (s, a) in the episode counts
        sa = (state, action)
        if sa not in seen_state_action_pairs:
            old_q = Q[state][action]
            returns[sa].append(G)
            Q[state][action] = np.mean(returns[sa])
            biggest_change = max(biggest_change, np.abs(old_q - Q[state][action]))
            seen_state_action_pairs.add(sa)
    deltas.append(biggest_change)

    # update policy: pi(s) = argmax[a]{ Q(s,a) }
    for state in policy:
        action, _ = max_dict(Q[state])
        policy[state] = action

plt.plot(deltas)
plt.show()

print('Final Policy:')
print_policy(policy, grid)
print('')

V = {}
for state, Qs in Q.items():
    V[state] = max_dict(Qs)[1]

print('Final Values:')
print_values(V, grid)
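# play_game is not shown in this excerpt. Based on how its return value is
# consumed above, a plausible sketch (an assumption, not the author's exact
# helper): roll out one episode under the policy, then walk the rewards
# backwards to produce (state, action, return) triples via G = r + GAMMA*G.
def play_game(grid, policy):
    # start from a fixed start state; an exploring-starts variant would
    # instead randomize the initial state and action
    s = (2, 0)
    grid.set_state(s)
    a = random_action(policy[s])
    states_actions_rewards = [(s, a, 0)]  # no reward for merely starting
    while True:
        r = grid.move(a)
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        a = random_action(policy[s])
        states_actions_rewards.append((s, a, r))

    # compute returns backwards; the terminal state's value is 0 by definition
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        if first:
            first = False  # skip the terminal state itself
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()
    return states_actions_returns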
alpha = ALPHA / t2
# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# get Q(s) so we can choose the first action
Qs = getQs(model, s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Qs)[0]
a = random_action(a, eps=0.5 / t)  # epsilon-greedy
biggest_change = 0
while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    # if s2 not in policy then it's a terminal state, all Q are 0
    old_theta = model.theta.copy()
    if grid.is_terminal(s2):
        model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
    else:
        # not terminal
        Qs2 = getQs(model, s2)
alpha = ALPHA / t2
# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# get Q(s) so we can choose the first action
Qs = getQs(model, s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Qs)[0]
a = random_action(a, eps=0.5/t)  # epsilon-greedy
biggest_change = 0
while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    # if s2 not in policy then it's a terminal state, all Q are 0
    old_theta = model.theta.copy()
    if grid.is_terminal(s2):
        model.theta += alpha*(r - model.predict(s, a))*model.grad(s, a)
    else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2, maxQs2a2 = max_dict(Qs2)
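# The snippet above cuts off inside the non-terminal branch. A hedged
# completion, inferred from the terminal-state update and the comment that
# Q(s,a) depends on Q(s',a') (an assumption, not the original code):
        a2 = random_action(a2, eps=0.5/t)  # epsilon-greedy next action
        # semi-gradient SARSA step on the model parameters
        model.theta += alpha * (r + GAMMA * model.predict(s2, a2)
                                - model.predict(s, a)) * model.grad(s, a)
    biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    s = s2
    a = a2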
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
        # check if we have already seen (s, a)
        # this is called "first-visit" MC policy evaluation
        sa = (s, a)
        if sa not in seen_state_action_pairs:
            old_Q = Q[s][a]
            returns[sa].append(G)
            Q[s][a] = np.mean(returns[sa])
            biggest_change = max(biggest_change, np.abs(old_Q - Q[s][a]))
            seen_state_action_pairs.add(sa)
    deltas.append(biggest_change)

    # update policy
    for s in list(policy):
        policy[s] = max_dict(Q[s])[0]

plt.plot(deltas)
plt.show()

print("final policy:")
print_policy(policy, grid)

# find V
V = {}
for s, Qs in Q.items():
    V[s] = max_dict(Qs)[1]

print("values:")
print_values(V, grid)
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
        # check if we have already seen (s, a)
        # this is called "first-visit" MC policy evaluation
        sa = (s, a)
        if sa not in seen_state_action_pairs:
            old_q = Q[s][a]
            returns[sa].append(G)
            Q[s][a] = np.mean(returns[sa])
            biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
            seen_state_action_pairs.add(sa)
    deltas.append(biggest_change)

    # calculate new policy pi(s) = argmax[a]{ Q(s,a) }
    for s in policy.keys():
        a, _ = max_dict(Q[s])
        policy[s] = a

plt.plot(deltas)
plt.show()

# find the optimal state-value function
# V(s) = max[a]{ Q(s,a) }
V = {}
for s in policy.keys():
    V[s] = max_dict(Q[s])[1]

print("final values:")
print_values(V, grid)
print("final policy:")
print_policy(policy, grid)
deltas = []
for it in range(100):
    print("\n\n\nIteration: {}".format(it))
    print("Q: {}".format(Q))
    print("State-Action Count: {}".format(update_counts_sa))

    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0)  # start state
    grid.set_state(s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it.
    a, _ = max_dict(Q[s])
    print("Max Q Action for State {} is: {}".format(s, a))
    biggest_change = 0
    while not grid.game_over():
        a = random_action(a, eps=0.5 / t)  # epsilon-greedy
        # random action also works, but slower since you can bump into walls
        # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
        r = grid.move(a)
        s2 = grid.current_state()
        print("\nCurrent State: {}".format(s))
        print("Action: {}".format(a))
        print("Reward: {}".format(r))
        print("Next State: {}".format(s2))

        # adaptive learning rate
        alpha = ALPHA / update_counts_sa[s][a]
if it % 100 == 0:
    t += 1e-2
if it % 2000 == 0:
    print("it:", it)

# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a = max_dict(Q[s])[0]
a = random_action(a, eps=0.5/t)
biggest_change = 0
while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    # if s2 not in policy then it's a terminal state, all Q are 0
    a2 = max_dict(Q[s2])[0]
    a2 = random_action(a2, eps=0.5/t)  # epsilon-greedy

    # we will update Q(s,a) AS we experience the episode
    alpha = ALPHA / update_counts_sa[s][a]
    update_counts_sa[s][a] += 0.005
    old_qsa = Q[s][a]
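# random_action is another helper these snippets assume; from its usage
# (epsilon-greedy over a suggested action), a minimal sketch might be:
def random_action(a, eps=0.1):
    # return the suggested action with probability 1 - eps,
    # otherwise explore with a uniformly random action
    p = np.random.random()
    if p < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)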
def epsilon_greedy(Q, s, eps=0.1):
    # with probability eps explore uniformly at random,
    # otherwise exploit the current greedy action
    if np.random.random() < eps:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
    else:
        a_opt = max_dict(Q[s])[0]
        return a_opt
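# max_dict itself is assumed from the shared utilities; every call site above
# treats it as returning an (argmax_key, max_value) pair, so a minimal sketch
# (tie-breaking may differ in the real helper) is:
def max_dict(d):
    # find the dictionary key with the largest value
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val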
print("it:", it) # begin a new episode s = grid.reset() episode_reward = 0 while not grid.game_over(): # perform action and get next state + reward a = epsilon_greedy(Q, s, eps=0.1) r = grid.move(a) s2 = grid.current_state() # update reward episode_reward += r # update Q(s,a) maxQ = max_dict(Q[s2])[1] Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * maxQ - Q[s][a]) # we would like to know how often Q(s) has been updated too update_counts[s] = update_counts.get(s, 0) + 1 # next state becomes current state s = s2 # log the reward for this episode reward_per_episode.append(episode_reward) plt.plot(reward_per_episode) plt.title("reward_per_episode") plt.show()
update_counts_sa[s] = {}
for a in ALL_POSSIBLE_ACTIONS:
    update_counts_sa[s][a] = 1.0

t = 1.0
deltas = []
for it in range(10000):
    if it % 100 == 0:
        t += 10e-3
    if it % 2000 == 0:
        print('it:', it)

    s = (2, 0)
    grid.set_state(s)
    a = max_dict(Q[s])[0]
    a = random_action(a, eps=0.5/t)
    biggest_change = 0
    while not grid.game_over():
        r = grid.move(a)
        s1 = grid.current_state()
        a1 = max_dict(Q[s1])[0]
        a1 = random_action(a1, eps=0.5/t)

        # adaptive, count-based learning rate
        alpha = ALPHA / update_counts_sa[s][a]
        update_counts_sa[s][a] += 0.005

        old_qsa = Q[s][a]
        # SARSA update using the adaptive learning rate
        Q[s][a] = Q[s][a] + alpha*(r + GAMMA*Q[s1][a1] - Q[s][a])
rewards = []
for it in range(10000):
    # we start playing the game
    if it % 100 == 0:
        t += 1e-2  # how often and by how much we increase t is a HYPERPARAMETER
    if it % 1000 == 0:
        print("it:", it, "/10000")

    # instead of generating an episode, we will play an episode within the loop
    s = (2, 0)  # starting position
    grid.set_state(s)

    # the first (s, r) tuple is the start state with reward 0 (no reward for starting the game)
    # the final (s, r) tuple is the terminal state with value 0 (by definition) -> no need to update it
    a = max_dict(Q[s])[0]
    # epsilon-greedy: the action is chosen between the best available action
    # and a random exploration action
    a = random_action(a, eps=0.5 / t)
    delta = 0
    reward = 0
    while not grid.game_over():
        r = grid.move(a)
        s2 = grid.current_state()
        a2 = max_dict(Q[s2])[0]  # we need the next action for Q(s',a')
        a2 = random_action(a2, eps=0.5 / t)
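# Illustration of the exploration schedule implied above (not from the
# original code): with eps = 0.5/t and t bumped by 0.01 every 100 episodes,
# epsilon decays slowly from about 0.5 toward 0.25 over the 10,000 episodes.
t = 1.0
for it in range(10000):
    if it % 100 == 0:
        t += 1e-2
    if it % 2000 == 0:
        print("episode", it, "eps =", round(0.5 / t, 3))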
    if i % 1000 == 0:
        print(i)
    delta = 0

    # Policy Evaluation Step
    states_actions_returns = play_episode(grid, pi)
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
        sa = (s, a)
        if sa not in seen_state_action_pairs:
            old_q = Q[s][a]
            returns[sa].append(G)
            Q[s][a] = np.mean(returns[sa])
            delta = max(delta, np.abs(old_q - Q[s][a]))
            seen_state_action_pairs.add(sa)
    deltas.append(delta)

    # Policy Improvement Step
    for s in pi.keys():
        pi[s] = max_dict(Q[s])[0]

plt.plot(deltas)
plt.show()

print('Done')
print_policy(pi, grid)

V = {}
for s, Qs in Q.items():
    V[s] = max_dict(Qs)[1]
print_values(V, grid)
for a in ALL_POSSIBLE_ACTIONS:
    update_counts_sa[s][a] = 1.0

# repeat until convergence
t = 1.0
deltas = []
for it in range(10000):
    if it % 100 == 0:
        t += 10e-3
    if it % 2000 == 0:
        print('it:', it)

    s = (2, 0)
    grid.set_state(s)
    a = max_dict(Q[s])[0]
    a = random_action(a, eps=0.5 / t)
    biggest_change = 0
    while not grid.game_over():
        r = grid.move(a)
        s2 = grid.current_state()
        a2 = max_dict(Q[s2])[0]
        a2 = random_action(a2, eps=0.5 / t)

        # update Q(s,a) as we experience the episode
        alpha = ALPHA / update_counts_sa[s][a]
        update_counts_sa[s][a] += 0.005
        old_qsa = Q[s][a]
        Q[s][a] = Q[s][a] + alpha * (r + GAMMA * Q[s2][a2] - Q[s][a])
Q[s] = {}
update_counts_sa[s] = {}
for a in ALL_POSSIBLE_ACTIONS:
    Q[s][a] = 0
    update_counts_sa[s][a] = 1.0

t = 1.0
deltas = []
for it in range(10000):
    if it % 100 == 0:
        t += 10e-3

    s = (2, 0)
    grid.set_state(s)
    a = max_dict(Q[s])[0]
    biggest_change = 0
    while not grid.game_over():
        a = random_action(a, eps=0.5 / t)
        r = grid.move(a)
        s2 = grid.current_state()
        a2, max_q_s2a2 = max_dict(Q[s2])

        # adaptive, count-based learning rate
        alpha = ALPHA / update_counts_sa[s][a]
        update_counts_sa[s][a] += 0.005

        old_qsa = Q[s][a]
        Q[s][a] = Q[s][a] + alpha * (r + GAMMA * max_q_s2a2 - Q[s][a])
        biggest_change = max(biggest_change, np.abs(old_qsa - Q[s][a]))
        update_counts[s] = update_counts.get(s, 0) + 1
if it % 100 == 0:
    t += 1e-2
if it % 2000 == 0:
    print("it:", it)

# instead of 'generating' an episode, we will PLAY
# an episode within this loop
s = (2, 0)  # start state
grid.set_state(s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward) for simply starting the game
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it.
a, _ = max_dict(Q[s])
biggest_change = 0
while not grid.game_over():
    a = random_action(a, eps=0.5 / t)  # epsilon-greedy
    # random action also works, but slower since you can bump into walls
    # a = np.random.choice(ALL_POSSIBLE_ACTIONS)
    r = grid.move(a)
    s2 = grid.current_state()

    # we will update Q(s,a) AS we experience the episode
    old_qsa = Q[s][a]
    # the difference between SARSA and Q-Learning is that Q-Learning
    # uses max[a']{ Q(s',a') } in its update,
    # even if we do not end up taking this action in the next step
    a2, max_q_s2a2 = max_dict(Q[s2])
    Q[s][a] = Q[s][a] + ALPHA * (r + GAMMA * max_q_s2a2 - Q[s][a])
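# To make the comment above concrete, a minimal self-contained comparison of
# the two update rules (names mirror the snippet; this is an illustration,
# not code from the original):
def sarsa_update(Q, s, a, r, s2, a2, alpha, gamma):
    # on-policy: bootstrap from the action a2 we will actually take in s2
    return Q[s][a] + alpha * (r + gamma * Q[s2][a2] - Q[s][a])

def q_learning_update(Q, s, a, r, s2, alpha, gamma):
    # off-policy: bootstrap from the greedy action in s2,
    # even if epsilon-greedy ends up taking a different action next step
    max_q_s2 = max(Q[s2].values())
    return Q[s][a] + alpha * (r + gamma * max_q_s2 - Q[s][a])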
        s = s2
        a = a2

    # log the reward for this episode
    reward_per_episode.append(episode_reward)

plt.plot(reward_per_episode)
plt.title("reward_per_episode")
plt.show()

# determine the policy from Q*
# find V* from Q*
policy = {}
V = {}
for s in grid.actions.keys():
    a, max_q = max_dict(Q[s])
    policy[s] = a
    V[s] = max_q

# what's the proportion of time we spend updating each part of Q?
print("update counts:")
total = np.sum(list(update_counts.values()))
for k, v in update_counts.items():
    update_counts[k] = float(v) / total
print_values(update_counts, grid)

print("values:")
print_values(V, grid)

print("policy:")
print_policy(policy, grid)