def play_game(grid, policy):
    """Play one episode from a random start state and compute its returns.

    The start state is drawn uniformly from the keys of ``grid.actions``.
    With a deterministic policy some states would otherwise never be
    visited, yet values are still wanted for them.

    :param grid: the grid class object
    :param policy: dictionary mapping each state to the action taken there
    :return: list of ``(state, return)`` tuples in the order the states were
        visited; the last state reached in the episode is left out
    """
    candidates = list(grid.actions.keys())
    grid.set_state(candidates[np.random.choice(len(candidates))])

    # Roll the episode forward, recording (state, reward received on arrival).
    state = grid.current_state()
    trajectory = [(state, 0)]
    while not grid.game_over():
        reward = grid.move(policy[state])
        state = grid.current_state()
        trajectory.append((state, reward))

    # Walk the episode backwards accumulating the discounted return.  The
    # last state contributes only its reward: its own value is 0 by
    # definition, so no entry is emitted for it.
    ret = trajectory[-1][1] + GAMMA * 0
    backwards = []
    for state, reward in reversed(trajectory[:-1]):
        backwards.append((state, ret))
        ret = reward + GAMMA * ret

    # Callers expect the states in the order they were visited.
    return backwards[::-1]
def play_game(grid, policy):
    """Play one episode from the fixed start state (2, 0) and compute returns.

    Every action is obtained by passing the policy's action for the current
    state through ``random_action`` (defined elsewhere in this file;
    presumably it occasionally substitutes a random action -- confirm there).

    :param grid: the grid class object
    :param policy: dictionary mapping each state to an action
    :return: list of ``(state, action, return)`` tuples in the order visited;
        the final state of the episode is not included
    """
    state = (2, 0)
    grid.set_state(state)
    action = random_action(policy[state])

    # Timing: each triple is s(t), a(t), r(t), where r(t) is the reward for
    # taking a(t-1) in s(t-1) and landing in s(t).
    episode = [(state, action, 0)]
    finished = False
    while not finished:
        reward = grid.move(action)
        state = grid.current_state()
        finished = grid.game_over()
        # No action is chosen in the state that ends the episode.
        action = None if finished else random_action(policy[state])
        episode.append((state, action, reward))

    # Work backwards from the end of the episode.  The last triple only
    # seeds the running return; no entry is emitted for it.
    ret = episode[-1][2] + GAMMA * 0
    states_actions_returns = []
    for state, action, reward in reversed(episode[:-1]):
        states_actions_returns.append((state, action, ret))
        ret = reward + GAMMA * ret

    states_actions_returns.reverse()
    return states_actions_returns
def play_game(grid, policy):
    """Play one episode from the fixed start state (2, 0).

    Each step takes the policy's action for the current state, passed
    through ``random_action`` (defined elsewhere in this file).

    :param grid: the grid class object
    :param policy: dictionary mapping each state to an action
    :return: list of ``(state, reward)`` tuples in visiting order, starting
        with ``((2, 0), 0)``; unlike a returns list, the rewards are raw and
        the final state is included
    """
    state = (2, 0)
    grid.set_state(state)
    states_and_rewards = [(state, 0)]

    while True:
        if grid.game_over():
            return states_and_rewards
        action = random_action(policy[state])
        reward = grid.move(action)
        state = grid.current_state()
        states_and_rewards.append((state, reward))
grid.set_state(s) #get Q(s) to choose first action Qs = getQs(model, s) # the first (s, r) tuple is the state we start in and 0 # (since we don't get a reward) for simply starting the game # the last (s, r) tuple is the terminal state and the final reward # the value for the terminal state is by definition 0, so we don't # care about updating it. a = max_dict(Qs)[0] a = random_action(a, eps=0.5 / t) biggest_change = 0 while not grid.game_over(): r = grid.move(a) s2 = grid.current_state() #need next action since Q(s,a) depends on Q(s',a') old_theta = model.theta.copy() if grid.is_terminal(s2): model.theta += alpha * (r - model.predict(s, a)) * model.grad( s, a) else: #not terminal Qs2 = getQs(model, s2) a2 = max_dict(Qs2)[0] a2 = random_action(a2, eps=0.5 / t) #epsilon greedy model.theta += alpha * (r + GAMMA * model.predict(s2, a2) - model.predict(s, a)) * model.grad( s, a)
# Repeat sweeps over all states until the value function converges.
# V[s] = max[a]{sum[s',r] {p(s',r|s,a)[r + GAMMA * V[s']]}}
# Each action is simulated once with grid.move, and the single observed
# (reward, next state) pair is used for the backup.
while True:
    max_change = 0
    for s in states:
        old_vs = V[s]

        # Only states with a policy entry (i.e. not terminal states) are
        # updated; V of every other state is left untouched.
        if s in policy:
            new_v = float('-inf')
            # find max[a]
            for a in ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state()]
                if v > new_v:
                    new_v = v
            V[s] = new_v
            # Track the largest update of this sweep in the same variable
            # the convergence test reads.  Previously the result went to a
            # separate `biggest_change`, so `max_change` stayed 0 and the
            # loop stopped after the very first sweep.
            max_change = max(max_change, np.abs(old_vs - V[s]))

    # when the value function converges break out of the loop
    if max_change < thresh:
        break

# find a policy that leads to optimal value function
for s in policy.keys():
    best_act = None
    best_value = float('-inf')
    for a in ACTIONS:
        grid.set_state(s)
        r = grid.move(a)
def play_game(grid, policy):
    """Play one episode with a random start state and a random first action.

    The start state is drawn uniformly from the keys of ``grid.actions`` and
    the first action uniformly from ``ALL_POSSIBLE_ACTIONS``; a deterministic
    policy alone would never try some state/action pairs, yet they still
    need to be measured.  Every later action comes from ``policy``.

    :param grid: the grid class object
    :param policy: dictionary mapping each state to an action
    :return: list of ``(state, action, return)`` tuples in the order visited;
        the final state of the episode is not included
    """
    start_candidates = list(grid.actions.keys())
    pick = np.random.choice(len(start_candidates))
    grid.set_state(start_candidates[pick])

    state = grid.current_state()
    action = np.random.choice(ALL_POSSIBLE_ACTIONS)  # uniformly random first action

    # Timing: each triple is s(t), a(t), r(t), where r(t) is the reward for
    # taking a(t-1) in s(t-1) and landing in s(t).
    episode = [(state, action, 0)]
    visited = {grid.current_state()}
    step_count = 0
    running = True
    while running:
        reward = grid.move(action)
        step_count += 1
        state = grid.current_state()

        if state in visited:
            # Hack to avoid an endless episode (e.g. bumping into a wall
            # over and over): landing on an already-seen state ends the
            # episode with a penalty of -10 / steps taken, which is -10 when
            # the very first move went nowhere and shrinks after that.
            episode.append((state, None, -10. / step_count))
            running = False
        elif grid.game_over():
            episode.append((state, None, reward))
            running = False
        else:
            action = policy[state]
            episode.append((state, action, reward))
            visited.add(state)

    # Accumulate returns from the end of the episode backwards.  The last
    # triple (index 0 of the reversed walk) only seeds the running return:
    # its own value is 0, so it gets no entry.
    ret = 0
    states_actions_returns = []
    for idx, (state, action, reward) in enumerate(reversed(episode)):
        if idx > 0:
            states_actions_returns.append((state, action, ret))
        ret = reward + GAMMA * ret

    # Callers expect the states in the order they were visited.
    return states_actions_returns[::-1]
#repeat until convergence while True: biggest_change = 0 for s in states: old_v = V[s] #V(s) only has value if its not a terminal state if s in grid.actions: new_v = 0 # we will accumulate the answer p_a = 1.0/len(grid.actions[s]) #each actions has equal prob since uniform for a in grid.actions[s]: grid.set_state(s) r = grid.move(a) new_v += p_a * (r + gamma*V[grid.current_state()]) V[s] = new_v biggest_change = max(biggest_change, np.abs(old_v - V[s])) if biggest_change < SMALL_ENOUGH: break print("Values for uniformly random actions:") print_values(V,grid) print('\n\n') ### fixed policy ### policy = { (2,0): 'U', (1,0): 'U', (0,0): 'R', (0,1): 'R',