def play_game(grid, policy):
    # returns a list of states and corresponding returns
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)

    # reset the game to the start state
    # (with a deterministic policy we would never reach certain states,
    # but we still want to measure their reward)
    s = (2, 0)
    grid.set_state(s)

    # play the game
    print("\nStarting State for the Game is: {}".format(s))

    # each tuple is s(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_and_rewards = [(s, 0)]
    num_steps = 0
    while not grid.game_over():
        # play until the game finishes
        print("\nState at move {} : {}".format(num_steps + 1, s))
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        print("Action at move {} : {}".format(num_steps + 1, a))
        num_steps += 1
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
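# Both play_game variants in this section call a random_action helper that is
# not shown here. A minimal sketch, assuming an epsilon-soft choice over
# ALL_POSSIBLE_ACTIONS (the exact probabilities in the original code may differ):
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

def random_action(a, eps=0.1):
    # with probability 1 - eps return the intended action,
    # otherwise return a uniformly random action
    p = np.random.random()
    if p < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)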
def play_game(grid, policy):
    # returns a list of (state, action, return) triples
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)

    # reset the game to the start state
    # (with a deterministic policy we would never reach certain states,
    # but we still want to measure their reward)
    s = (2, 0)
    grid.set_state(s)

    # play the game
    print("\nStarting State for the Game is: {}".format(s))
    a = random_action(policy[s])
    print("Starting Action for the Game is {}".format(a))

    # each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    num_steps = 0
    while True:
        # play until the game finishes
        print("\nState at move {} : {}".format(num_steps + 1, s))
        print("Action at move {} : {}".format(num_steps + 1, a))
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_actions_rewards.append((s, a, r))

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0 by definition,
        # so we ignore the first (s, a) we encounter here
        # and ignore the last G, which is meaningless since it doesn't correspond to any move
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it to be in order of state visited
    print("\nState Action Return (G): {}".format(states_actions_returns))
    return states_actions_returns
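# A quick standalone check of the backward return computation used above,
# assuming GAMMA = 0.9 and a hypothetical 4-step episode whose final reward is +1:
GAMMA = 0.9
states_actions_rewards = [('s0', 'R', 0), ('s1', 'R', 0),
                          ('s2', 'R', 0), ('s3', None, 1)]
G = 0
states_actions_returns = []
first = True
for s, a, r in reversed(states_actions_rewards):
    if first:
        first = False  # skip the terminal entry; its value is 0 by definition
    else:
        states_actions_returns.append((s, a, G))
    G = r + GAMMA * G
states_actions_returns.reverse()
# -> [('s0', 'R', 0.81), ('s1', 'R', 0.9), ('s2', 'R', 1.0)] (up to float rounding)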
    # ... then initialise a return list for the state
    if s in grid.actions:
        returns[s] = []
    else:
        # terminal state or state we can't otherwise get to
        V[s] = 0

# repeat
for t in range(100):
    print('\n')
    print(t)
    # generate an episode using pi
    states_and_returns = play_game(grid, policy)
    # get a list of states and associated G values (expected future reward from each state)
    seen_states = set()  # track unique states
    for s, G in states_and_returns:
        # for all states and expected future rewards,
        # check if we have already seen s -
        # this is called "first-visit" MC policy evaluation
        if s not in seen_states:
            returns[s].append(G)  # add the G to the returns for this state
            print("returns:{}".format(returns))
            V[s] = np.mean(returns[s])  # update the mean value for the state
            print("v(s):{}".format(V))
            seen_states.add(s)

print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)
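# The print_values and print_policy helpers used throughout this section are
# not shown. A plausible sketch, assuming the grid exposes .rows and .cols and
# that V and the policy are dicts keyed by (row, col) tuples:
def print_values(V, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            v = V.get((i, j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="")  # negative values use the extra space
        print("")

def print_policy(P, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            a = P.get((i, j), ' ')
            print("  %s  |" % a, end="")
        print("")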
        # we will update Q(s,a) AS we experience the episode
        model.theta += alpha * (r + GAMMA * model.predict(s2, a2) - model.predict(s, a)) * model.grad(s, a)

        # next state becomes current state
        s = s2
        a = a2

        biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)

plt.plot(deltas)
plt.show()

# determine the policy from Q*
# find V* from Q*
policy = {}
V = {}
Q = {}
for s in grid.actions.keys():
    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q

print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)
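# The snippet above relies on two helpers that are not shown: max_dict, which
# returns the (argmax, max) pair of a dictionary, and getQs, which queries the
# approximator for every action. A sketch, assuming ALL_POSSIBLE_ACTIONS and a
# model with the predict(s, a) method used above:
def max_dict(d):
    # return the key with the largest value, together with that value
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

def getQs(model, s):
    # build a dict of Q(s, a) estimates for every action in state s
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}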
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    # (print_values prints any dict with a tuple of numbers as the key
    # and a number as the value)
    print('\nrewards:')
    print_values(grid.rewards, grid)

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions:  # if not a terminal state
        #     V[s] = np.random.random()
        # else:
        #     # terminal
        #     V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement:
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            # are considered deterministic
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state
                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)
                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)
                    # the "look-ahead" - get the value of the next state, s_prime:
                    s_prime = grid.current_state
                    # s_prime is needed in order to calculate
                    # the value of the current state - the Bellman equation:
                    V[s] = r + GAMMA * V[s_prime]
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))
            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values;
        # we choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # nothing can be worse than this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)
                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)
                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]
                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done! and since the policy hasn't changed, the values remain the same.
        if not policy_improved:
            break
        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)
    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
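# Hypothetical module-level constants assumed by main() above; the actual
# values used in the original script may differ:
GAMMA = 0.9       # discount factor
THRESHOLD = 1e-3  # convergence threshold for iterative policy evaluation
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')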
states = g.all_states()
V = {}
policy = {}
for s in states:
    if s in g.actions:
        V[s] = np.random.random()
    else:
        V[s] = 0

# print(g.actions.keys())

# initialize a random policy
for s in g.actions.keys():
    policy[s] = np.random.choice(all_actions)
print_policy(policy, g)

Iter = 0
while True:
    # iterative policy evaluation
    while Iter < 1000:
        Iter += 1
        print("Iteration %d" % Iter)
        biggest_change = 0
        # back up the old value of each state
        for s in states:
            old_v = V[s]
            # V[s] has a value only if it's not a terminal state
            if s in policy:
                a = policy[s]
                g.set_state(s)
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is now random, i.e. lies in [0,1],
    # but the policy is deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    # (print_values prints any dict with a tuple of numbers as the key
    # and a number as the value)
    print('\nrewards:')
    print_values(grid.rewards, grid)

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    # with random state-transitions:
    # repeat until convergence:
    i = 0
    while True:
        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0
                # check if not a terminal state:
                if s in grid.actions:
                    for a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)
                        if a == policy[s]:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            # same as: p(s',r|s,!policy[s])
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                        # move in the chosen direction:
                        r = grid.move(a)
                        # the "look-ahead" - get the value of the next state, s_prime:
                        s_prime = grid.current_state
                        # s_prime is needed in order to calculate
                        # the value of the current state - the Bellman equation:
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
                    V[s] = new_v
                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))
            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values, but now we also
        # take into account that our state-transitions are random!
        # we then choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:
            # check if not a terminal state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!
                # save the old policy:
                old_a = policy[s]
                max_v = float('-inf')  # nothing can be worse than this
                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # we're to accumulate the value
                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)
                        # since the state-transitions are random,
                        # we check if the action is the desired one:
                        if another_a == a:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A
                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)
                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                # change the policy:
                policy[s] = better_a
                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if the policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done! and since the policy hasn't changed, the values remain the same.
        if not policy_improved:
            break
        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)
    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
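# The windy policy iteration above also assumes a constant P_A: the probability
# that the environment actually executes the chosen action, with the remaining
# mass split evenly over the other actions. The original comments suggest a
# value such as:
P_A = 0.5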
# display rewards:
print('\nrewards:')
print_values(grid.rewards, grid)

states = grid.all_states

# initialize the value function and the number of visits per state:
V = {}
N = {}
for s in states:
    V[s] = 0
    N[s] = 0

############################# First-Visit Monte Carlo: #############################
for i in range(10000):
    states_and_returns = play_game(grid, POLICY)
    visited_s = set()
    for s, G in states_and_returns:
        if s not in visited_s:
            N[s] += 1
            # incremental (running) mean of the returns for this state:
            V[s] = (1 - 1 / N[s]) * V[s] + (1 / N[s]) * G
            visited_s.add(s)  # mark as visited so only the first visit counts

# print values:
print('\nvalues:')
print_values(V, grid)
# print policy:
print('\npolicy:')
print_policy(POLICY, grid)
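# The fixed policy being evaluated above (POLICY) is not shown. A hypothetical
# example for the standard 3x4 gridworld, with terminal states at (0, 3) and
# (1, 3) and a wall at (1, 1); the actual POLICY in the original may differ:
POLICY = {
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 0): 'U', (1, 2): 'U',
    (2, 0): 'U', (2, 1): 'L', (2, 2): 'U', (2, 3): 'L',
}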
v = V[s] max_val = float("-inf") for action in grid.actions[s]: grid.set_state(s) r = grid.move(action) val = r + gamma * V[grid.current_state()] if val > max_val: max_val = val V[s] = max_val delta = max(delta, abs(v - V[s])) if delta < theta: print_values(V, grid) break # Output a deterministic policy (which is optimal) pi = {} for s in grid.actions: max_val = float("-inf") for action in grid.actions[s]: grid.set_state(s) r = grid.move(action) val = r + V[grid.current_state()] if val > max_val: max_val = val max_action = action pi[s] = max_action print_policy(pi, grid)
plt.plot(reward_per_episode) plt.title("Reward per episode") plt.show() # obtain V* and pi* V = {} greedy_policy = {} states = grid.all_states() for s in states: if s in grid.actions: values = model.predict_all_actions(s) V[s] = np.max(values) greedy_policy[s] = ALL_POSSIBLE_ACTIONS[np.argmax(values)] else: # terminal state or state we can't otherwise get to V[s] = 0 print("values:") print_values(V, grid) print("policy:") print_policy(greedy_policy, grid) print("state_visit_count:") state_sample_count_arr = np.zeros((grid.rows, grid.cols)) for i in range(grid.rows): for j in range(grid.cols): if (i, j) in state_visit_count: state_sample_count_arr[i, j] = state_visit_count[(i, j)] df = pd.DataFrame(state_sample_count_arr) print(df)
def play_game(grid, policy):
    # returns a list of (state, action, return) triples
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)

    # reset game to start at a random position
    # we need to do this, because our current deterministic policy would
    # ... never end up at certain states, but we want to measure their reward
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    # play the game
    s = grid.current_state()
    print("\nStarting State for the Game is: {}".format(s))
    a = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print("Starting Action for the Game is {}".format(a))

    # each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        # play until the game finishes
        print("\nState at move {} : {}".format(num_steps + 1, s))
        print("Action at move {} : {}".format(num_steps + 1, a))
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()
        if s in seen_states:
            # hack so that we don't end up in an infinitely long episode
            # bumping into the wall repeatedly:
            # if num_steps == 1 we bumped into a wall and haven't moved anywhere,
            # so the penalty is -10; otherwise it falls off as 1 / num_steps
            reward = -10. / num_steps
            states_actions_rewards.append((s, None, reward))
            break
        elif grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            # THE FIRST MOVE IS RANDOM, BUT PAST THIS IT'S ACCORDING TO THE POLICY.
            # THIS NEEDS TO BE THE CASE, OTHERWISE WE WOULD NEVER REACH CERTAIN STATES
            # USING OUR DETERMINISTIC POLICY.
            a = policy[s]
            states_actions_rewards.append((s, a, r))
        seen_states.add(s)
    print("\nState Action Reward: {}".format(states_actions_rewards))

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0 by definition,
        # so we ignore the first (s, a) we encounter here
        # and ignore the last G, which is meaningless since it doesn't correspond to any move
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it to be in order of state visited
    print("\nState Action Return (G): {}".format(states_actions_returns))
    return states_actions_returns
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)
    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0
        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
    while True:
        max_change = 0
        for s in states:
            old_v = V[s]
            # if we're not in a terminal state:
            if s in grid.actions:
                # choose the action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)
                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)
                    # take the action and receive the reward:
                    r = grid.move(a)
                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]
                    if v > best_v:
                        best_v = v
                        # p[s] = a  # we'll do it in another loop later
                # update the value of this state:
                V[s] = best_v
                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))
        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    # and find our optimal policy
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')
        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]
                if v > best_v:
                    best_v = v
                    best_a = a
            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)
    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)