def monte_carlo(grid): """ Functions runs games and updates the value function monte carlo style Args: grid: The grid world Return: Value gunction, Policy fuction, Delta list """ #Create a random policy policy = {} for s in grid.actions.keys(): policy[s] = np.random.choice(CONST_ACTION_LST) #Init Q(s,a) function and return Q = {} returns = {} for s in grid.non_terminal_states(): Q[s] = {} for a in CONST_ACTION_LST: Q[s][a] = 0 returns[(s, a)] = [] # protocol changes of the Q values in each episode deltas = [] # run the monte carlo approimation for a specified amount of times for t in range(CONST_N_EPISODES): if t % 1000 == 0: print(t) biggest_change = 0 states_actions_returns = play_a_game(grid, policy) # calculate Q(s,a) seen_state_action_pairs = set() for s, a, G in states_actions_returns: # check if we have already seen, otherwise skip (first-visit insead of every visit) sa = (s, a) if sa not in seen_state_action_pairs: returns[sa].append(G) old_q = Q[s][a] Q[s][a] = np.mean(returns[sa]) biggest_change = max(biggest_change, np.abs(old_q - Q[s][a])) seen_state_action_pairs.add(sa) deltas.append(biggest_change) # calculate new policy p[s] = argmax[a]{Q(s,a)} for s in policy.keys(): a, _ = max_dict(Q[s]) policy[s] = a # calculate values for each state (just to print and compare) # V(s) = max[a]{ Q(s,a) } V = {} for s in policy.keys(): V[s] = max_dict( Q[s] )[1] #this function was givin by utils and is basically argmax for a python dict return V, policy, deltas
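play_a_game(grid, policy) is also assumed rather than shown here: it has to generate one episode and hand back a list of (state, action, return) triples so that the loop above can average the returns G. A common way to do this for Monte Carlo control is exploring starts, i.e. a random start state and a random first action, followed by the current policy until the episode ends, with the discounted return computed backwards through the episode. The sketch below is only an assumption about that helper; it presumes a grid API with set_state, move, current_state, and game_over methods and a discount constant CONST_GAMMA, none of which are defined in this section.

def play_a_game(grid, policy):
    """Sketch: generate one episode with exploring starts and return (state, action, G) triples."""
    # exploring starts: random start state and random first action (assumed grid API)
    start_states = list(grid.actions.keys())
    grid.set_state(start_states[np.random.choice(len(start_states))])
    s = grid.current_state()
    a = np.random.choice(CONST_ACTION_LST)

    # play until the episode ends, recording the reward received after each step
    states_actions_rewards = [(s, a, 0)]
    while True:
        r = grid.move(a)
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        a = policy[s]
        states_actions_rewards.append((s, a, r))

    # walk backwards to turn the rewards into discounted returns G
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the terminal state has no return to estimate, so skip the last entry
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + CONST_GAMMA * G  # CONST_GAMMA is an assumed discount factor constant
    states_actions_returns.reverse()
    return states_actions_returns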
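To check that the estimates actually converge, the returned deltas can be plotted and the learned policy inspected. The snippet below is illustrative only: grid stands for whatever grid world object the rest of the project builds, and it assumes monte_carlo and its constants are already defined in the same module.

import matplotlib.pyplot as plt

V, policy, deltas = monte_carlo(grid)

# the per-episode change in Q should shrink toward zero as the estimates converge
plt.plot(deltas)
plt.xlabel("episode")
plt.ylabel("biggest Q change")
plt.show()

print("final values:", V)
print("final policy:", policy)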