states = grid.all_states

# V for value
V = {}
for s in states:
    V[s] = 0

gamma = 1.0

while True:
    biggest_change = 0
    for s in states:
        old_v = V[s]
        if s in grid.location_to_action:
            # calculate the new value as the average over each possible move from this state/location
            new_v = 0
            p_a = 1.0 / len(grid.location_to_action[s])  # uniform random policy
            for a in grid.location_to_action[s]:
                grid.set_state(s)
                r = grid.move(a)
                new_v += p_a * (r + gamma * V[grid.current_state])
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))
    if biggest_change < SMALL_ENOUGH:
        break

print_values(V, grid)
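# print_values and print_policy are helper functions that are not defined in
# this file. A minimal sketch of what they are assumed to do, namely pretty-print
# a dict keyed by (row, col) locations as a grid; the `rows` and `cols`
# attributes on the grid object are assumptions for this sketch, not part of
# the code above:
def print_values(V, grid):
    for i in range(grid.rows):
        print("-" * (7 * grid.cols))
        row = ""
        for j in range(grid.cols):
            v = V.get((i, j), 0)
            # keep columns aligned whether or not the value has a minus sign
            row += " %.2f|" % v if v >= 0 else "%.2f|" % v
        print(row)


def print_policy(P, grid):
    for i in range(grid.rows):
        print("-" * (5 * grid.cols))
        row = ""
        for j in range(grid.cols):
            a = P.get((i, j), " ")
            row += "  %s  |" % a
        print(row)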
            if s in policy:
                # pick the action given by the fixed policy
                r = grid.get_reward(s, policy[s])
                V[s] = r + GAMMA * V[grid.current_state]
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))

        if biggest_change < SMALL_ENOUGH:
            break


if __name__ == '__main__':
    grid = negative_grid(-0.3)
    print("rewards:")
    print_values(grid.location_to_rewards, grid)

    # initialize a random policy, then update it
    policy = {}
    for s in grid.location_to_action.keys():
        policy[s] = np.random.choice(grid.location_to_action[s])
    print("initial policy:")
    print_policy(policy, grid)

    V = initalize_V(grid)
    while True:
        # evaluate the policy to find V
        evalulate_v_for_policy(policy, grid, V)

        """
        Summary: change policy for biggest V
        # for logging only: count the update to the state we just changed
        logging_update_counts[s] = logging_update_counts.get(s, 0) + 1
        log_biggest_change = max(log_biggest_change, np.abs(log_old_qsa - Q[s][a]))

        # the next state-action pair becomes the current one
        s = s2
        a = a2

    logging_deltas.append(log_biggest_change)

plt.plot(logging_deltas)
plt.show()

# extract the final deterministic policy and value function from Q
policy = {}
V = {}
for s in grid.location_to_action.keys():
    policy[s] = get_best_action_from_q(Q, s, grid)
    V[s] = Q[s][policy[s]]

# what's the proportion of time we spend updating each part of Q?
print("update counts:")
total = np.sum(list(logging_update_counts.values()))
for k, v in logging_update_counts.items():
    logging_update_counts[k] = float(v) / total
print_values(logging_update_counts, grid)

print("values:")
print_values(V, grid)
print("policy:")
print_policy(policy, grid)
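# get_best_action_from_q is not defined in this file. A minimal sketch, assuming
# Q[s] is a dict that maps each legal action at s to its current estimate, so
# the greedy action is simply the arg-max over that dict (the grid argument is
# kept only to match the call signature used above):
def get_best_action_from_q(Q, s, grid):
    best_a, best_q = None, float("-inf")
    for a, q in Q[s].items():
        if q > best_q:
            best_a, best_q = a, q
    return best_a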
        else:
            Qs2 = getQs(model, s2)
            a2, MaxQs2a2 = max_dict(Qs2)
            a2 = random_action(a2, eps=0.5 / t)
            # semi-gradient update of the linear model's weights toward the TD target
            model.theta += alpha * (r + gamma * MaxQs2a2 - model.predict(s, a)) * model.grad(s, a)
            s = s2
            a = a2

        delta = max(delta, np.abs(old_theta - model.theta).sum())

    deltas.append(delta)

plt.plot(deltas)
plt.show()

# find the policy and V function implied by the learned model
Policy = {}
V = {}
Q = {}
for s in g.actions.keys():
    Q[s] = getQs(model, s)
    a, max_q = max_dict(Q[s])
    Policy[s] = a
    V[s] = max_q

print("Values")
print_values(V, g)
print("Policy")
print_policy(Policy, g)
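# getQs, max_dict, random_action, and the linear model are defined elsewhere.
# A minimal sketch of the interface assumed above: theta is a weight vector,
# predict(s, a) is the dot product of theta with a feature vector for (s, a),
# and grad(s, a) returns that feature vector (the gradient of a linear model
# with respect to theta). The one-hot sa2x encoding, the 3x4 grid size, and
# ALL_POSSIBLE_ACTIONS are illustrative assumptions, not the feature map used
# by the script above.
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')


def sa2x(s, a):
    # one-hot feature vector over (state, action) pairs for a 3x4 grid
    x = np.zeros(3 * 4 * len(ALL_POSSIBLE_ACTIONS))
    i, j = s
    x[(i * 4 + j) * len(ALL_POSSIBLE_ACTIONS) + ALL_POSSIBLE_ACTIONS.index(a)] = 1.0
    return x


class Model:
    def __init__(self):
        self.theta = np.zeros(3 * 4 * len(ALL_POSSIBLE_ACTIONS))

    def predict(self, s, a):
        return self.theta.dot(sa2x(s, a))

    def grad(self, s, a):
        # for a linear model the gradient with respect to theta is just the features
        return sa2x(s, a)


def getQs(model, s):
    # approximate Q(s, a) for every action at state s
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}


def max_dict(d):
    # return the (key, value) pair with the largest value
    max_key = max(d, key=d.get)
    return max_key, d[max_key]


def random_action(a, eps=0.1):
    # epsilon-greedy: keep a with probability 1 - eps, otherwise explore
    if np.random.random() < 1 - eps:
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)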
    first = True
    for s, r in reversed(states_and_rewards):
        # the value of the terminal state is 0 by definition, so skip the last
        # (state, reward) pair; every earlier state is paired with the return G
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + gamma * G
    states_and_returns.reverse()  # restore the original chronological order
    return states_and_returns


if __name__ == "__main__":
    g = standard_grid()
    print('rewards:')
    print_values(g.rewards, g)

    # the fixed policy to evaluate
    Policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'L',
        (2, 2): 'U',
        (2, 3): 'L',
    }

    # initialize V(s) and returns
    V = {}