def main():
    env = standard_grid()
    vs = monte_carlo_evaluation(standard_grid, win_policy)
    render_vs(env, vs)
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print_values(grid.rewards, grid)
    V, policy, deltas = monte_carlo(grid)
    print_values(V, grid)
    print_policy(policy, grid)
    plt.plot(deltas)
    plt.show()
def main():
    env = standard_grid()
    qs = monte_carlo_control(standard_grid, epsilon_soft_greedy)
    render_qs_policy(env, qs)
    print()

    env = negative_grid()
    qs = monte_carlo_control(negative_grid, epsilon_soft_greedy)
    render_qs_policy(env, qs)
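# Note: epsilon_soft_greedy is not shown in this snippet. A minimal sketch is below,
# assuming it receives the action-value dict for one state and returns an action;
# the eps=0.1 default and the exact signature expected by monte_carlo_control are
# assumptions for illustration, not the repo's confirmed API.
import random

def epsilon_soft_greedy(action_values, eps=0.1):
    # with probability eps explore uniformly, otherwise act greedily w.r.t. Q(s, .)
    if random.random() < eps:
        return random.choice(list(action_values.keys()))
    return max(action_values, key=action_values.get)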
def main():
    print('Standard Grid')
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)

    print('Negative Grid:')
    env = negative_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
def main():
    print('Standard Grid')
    env = standard_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))

    print('Negative Grid:')
    env = negative_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))
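# policy_from_v is used above but not defined here. A minimal sketch, assuming the
# grid exposes actions (dict: state -> allowed moves), set_state, move (returns the
# reward), current_state, and undo_move; these names and gamma=0.9 are assumptions.
def policy_from_v(env, V, gamma=0.9):
    # act greedily with respect to the converged value function
    policy = {}
    for s, allowed in env.actions.items():
        best_a, best_value = None, float('-inf')
        for a in allowed:
            env.set_state(s)
            r = env.move(a)                       # take the action, observe the reward
            value = r + gamma * V[env.current_state()]
            env.undo_move(a)                      # restore the grid to state s
            if value > best_value:
                best_a, best_value = a, value
        policy[s] = best_a
    return policy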
def first_visit_monte_carlo_prediction(pi, N):
    V = {}
    all_returns = {}  # default = []
    for i in range(N):
        visited_states = set()
        states_and_returns = play_episode(standard_grid(), pi)
        for s, g in states_and_returns:
            if s not in visited_states:
                visited_states.add(s)
                if s not in all_returns:
                    all_returns[s] = []
                all_returns[s].append(g)
                V[s] = np.mean(all_returns[s])
    return V
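# play_episode is assumed above. One possible implementation: roll out a single
# episode under pi and convert the (state, reward) trace into (state, return)
# pairs, mirroring the return computation used elsewhere in this repo. The grid
# methods, the (2, 0) start state, and gamma=0.9 are assumptions.
def play_episode(env, pi, start=(2, 0), gamma=0.9):
    env.set_state(start)
    s = env.current_state()
    states_and_rewards = [(s, 0)]    # no reward for simply starting in s
    while not env.game_over():
        r = env.move(pi[s])
        s = env.current_state()
        states_and_rewards.append((s, r))
    # work backwards so each state is paired with its return G
    G = 0
    first = True
    states_and_returns = []
    for s, r in reversed(states_and_rewards):
        if first:
            first = False            # the terminal G is meaningless
        else:
            states_and_returns.append((s, G))
        G = r + gamma * G
    states_and_returns.reverse()
    return states_and_returns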
def negative_grid(step_cost=-0.1):
    # in this game we want to try to minimize the number of moves,
    # so we penalize every move
    g = standard_grid()
    g.rewards.update({
        (0, 0): step_cost,
        (0, 1): step_cost,
        (0, 2): step_cost,
        (1, 0): step_cost,
        (1, 2): step_cost,
        (2, 0): step_cost,
        (2, 1): step_cost,
        (2, 2): step_cost,
        (2, 3): step_cost,
    })
    return g
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
def temporal_difference(alpha=0.1, gamma=0.9):
    env = standard_grid()
    V = {}
    policy = initial_policy(env)
    states = env.all_states()
    for s in states:
        V[s] = 0
    for i in range(2000):
        s_r = play_game(env, policy, (2, 0))
        for t in range(len(s_r) - 1):
            s, r = s_r[t]
            s1, r1 = s_r[t + 1]
            V[s] = V[s] + alpha * (r1 + gamma * V[s1] - V[s])
    return (env, V, policy)
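# play_game is assumed by temporal_difference above. A minimal sketch: follow the
# policy from start_state and record (state, reward) pairs; the env methods used
# here are assumptions based on the other grid scripts in this repo.
def play_game(env, policy, start_state):
    env.set_state(start_state)
    s = env.current_state()
    states_and_rewards = [(s, 0)]
    while not env.game_over():
        r = env.move(policy[s])      # returns the reward for the move
        s = env.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards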
def q_learning(episodes=2000, initial_state=(2, 0), alpha=0.1, gamma=0.9):
    env = standard_grid()
    Q = initial_Q(env, initial_value=0)
    s = initial_state
    for episode in range(episodes):
        s = initial_state
        env.set_state(s)
        while not env.game_over():
            a = choose_action(env.all_actions, max_dict(Q[s])[0])  # action (epsilon-greedy around the greedy choice)
            s2 = env.move(a)                                       # next state
            r = env.get_state_reward()                             # reward
            a2 = max_dict(Q[s2])[0]                                # greedy action in the next state (Q-learning target)
            Q[s][a] = Q[s][a] + alpha * (r + gamma * Q[s2][a2] - Q[s][a])
            s = s2
            a = a2  # not strictly needed: a is re-chosen at the top of the loop
    return (Q, env)
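# Possible helpers assumed by q_learning above. max_dict returns the (key, value)
# pair with the largest value, matching how max_dict(Q[s])[0] and [1] are used in
# these scripts; choose_action is an epsilon-greedy pick, with eps=0.1 as an
# assumed exploration rate.
import random

def max_dict(d):
    # argmax over a dict: return (best_key, best_value)
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

def choose_action(all_actions, greedy_action, eps=0.1):
    # explore with probability eps, otherwise take the greedy action
    if random.random() < eps:
        return random.choice(list(all_actions))
    return greedy_action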
def main():
    # If you want to run this with the Python interpreter directly,
    # replace "def main():" with "if __name__ == '__main__':"

    # Create environment
    env = standard_grid(obey_prob=0.9, step_cost=None)

    # Create agent
    agent = QLearningAgent(env.all_states(), CONST_ACTION_LST)

    # Learn a policy by playing many episodes; Q-learning adapts the policy
    for episode in range(10000):
        env.set_state(CONST_START_STATE)
        state = env.current_state()
        while True:
            action = agent.get_action(state)
            reward = env.move(action)
            next_state = env.current_state()
            agent.learn(state, action, reward, next_state)
            if env.game_over():
                break
            state = next_state

    print(agent.Q)
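# A minimal sketch of the QLearningAgent interface called in main() above:
# get_action is epsilon-greedy over Q, learn applies the one-step Q-learning
# update. The hyperparameters (alpha, gamma, eps) are assumptions.
import random

class QLearningAgent:
    def __init__(self, states, actions, alpha=0.1, gamma=0.9, eps=0.1):
        self.actions = list(actions)
        self.alpha, self.gamma, self.eps = alpha, gamma, eps
        self.Q = {s: {a: 0.0 for a in self.actions} for s in states}

    def get_action(self, state):
        # epsilon-greedy action selection
        if random.random() < self.eps:
            return random.choice(self.actions)
        return max(self.Q[state], key=self.Q[state].get)

    def learn(self, state, action, reward, next_state):
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = max(self.Q[next_state].values())
        td_error = reward + self.gamma * best_next - self.Q[state][action]
        self.Q[state][action] += self.alpha * td_error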
        if env.is_terminal(env.current_state()):
            target = reward
        else:
            target = reward + discount * next_est
        # semi-gradient update of the weights toward the TD target
        theta = theta + alpha * (target - c_est) * x
    return theta


def semi_gradient_td(create_env, policy, episodes=100000):
    theta = np.random.randn(4) / 2
    for _ in range(episodes):
        theta = td_episode(create_env, policy, theta)
    return theta


if __name__ == '__main__':
    env = standard_grid()
    theta = semi_gradient_td(standard_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
    print()

    env = negative_grid()
    theta = semi_gradient_td(negative_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
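# preprocess_features and get_value are assumed by the semi-gradient TD script
# above. Since theta has 4 components, each state (i, j) must map to a 4-dim
# feature vector; the specific features below (centered coordinates, a cross
# term, and a bias) are an assumption for illustration only.
import numpy as np

def preprocess_features(s):
    i, j = s
    return np.array([i - 1, j - 1.5, i * j - 3, 1])

def get_value(env, theta, featurize):
    # read the approximate value of every state from the linear model
    return {s: featurize(s).dot(theta) for s in env.all_states()}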
    elif len(sys.argv) > 1:
        try:
            obey_prob = float(sys.argv[1])
            step_cost = 0
        except ValueError:
            print("Bad arguments. Usage: python " + sys.argv[0] + " obey_prob(float) step_cost(float)")
            sys.exit()
    else:
        step_cost = 0
        obey_prob = 1.0

    # a negative step_cost penalizes every non-terminal move;
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=obey_prob, step_cost=step_cost)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    values = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, values)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(values, grid)
    print("optimal policy:")
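# calculate_values is not shown here. A sketch of it as value iteration, reusing
# the best_action_value helper sketched after the next snippet; the convergence
# threshold is an assumption.
def calculate_values(grid, threshold=1e-4):
    V = {s: 0 for s in grid.all_states()}
    while True:
        biggest_change = 0
        for s in grid.actions.keys():          # terminal states keep V = 0
            old_v = V[s]
            _, V[s] = best_action_value(grid, V, s)
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < threshold:
            break
    return V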
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to the optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid charges step_cost = -0.4 for every non-terminal state;
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=1.0, step_cost=-0.4)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
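# best_action_value is called by calculate_greedy_policy but not defined in this
# snippet. A sketch, assuming the grid exposes get_transition_probs(action)
# returning (prob, reward, next_state) triples for the current state; that method
# name and gamma=0.9 are assumptions based on the obey_prob variants of this grid.
def best_action_value(grid, V, s, gamma=0.9):
    best_a, best_value = None, float('-inf')
    grid.set_state(s)
    for a in grid.actions[s]:
        expected = 0
        for prob, r, s2 in grid.get_transition_probs(a):
            expected += prob * (r + gamma * V[s2])
        if expected > best_value:
            best_a, best_value = a, expected
    return best_a, best_value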
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to the optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid charges step_cost = -0.05 for every non-terminal state,
    # and the agent obeys the chosen action only 80% of the time;
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=0.8, step_cost=-0.05)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
        # the value of the terminal state is 0 by definition
        # we should ignore the first state we encounter
        # and ignore the last G, which is meaningless since it doesn't correspond to any move
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()  # we want it to be in order of states visited
    return states_and_returns


if __name__ == '__main__':
    # use the standard grid again (0 for every step) so that we can compare
    # to iterative policy evaluation
    grid = standard_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # found by policy_iteration_random on standard_grid
    # MC method won't get exactly this, but should be close
    # values:
    # ---------------------------
    #  0.43|  0.56|  0.72|  0.00|
    # ---------------------------
    #  0.33|  0.00|  0.21|  0.00|
    # ---------------------------
    #  0.25|  0.18|  0.11| -0.17|
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to the optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # here there is no extra step cost, but the agent obeys the chosen action
    # only 80% of the time; we want to see how this affects the optimal policy
    grid = standard_grid(obey_prob=0.8, step_cost=None)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
                reversed(states[:-1]), reversed(states[1:]), reversed(rewards)):
            new_value = self.state_values[s] + 1 / (np.log(t + 2)) * \
                (r + self.discount_factor * self.state_values[s_prime] - self.state_values[s])
            deltas[i] = np.abs(new_value - self.state_values[s])
            self.state_values[s] = new_value

    def solve_prediction_problem(self, max_iter=10000):
        state_values = {}
        for t in tqdm(range(max_iter)):
            states, actions, rewards = self.play_game()
            self.update_state_value_function(states, rewards, t)
            if t % 1000 == 0:
                state_values[t] = copy.deepcopy(self.state_values)
        return state_values


if __name__ == "__main__":
    a = Agent(grid_world.standard_grid(), policy='random', discount_factor=1.0)
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)

    a = Agent(grid_world.standard_grid(), policy='win-from-start')
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

    for s in policy.keys():
        a, _ = max_dict(Q[s])
        policy[s] = a

    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas


if __name__ == '__main__':
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
def main():
    env = standard_grid()
    theta = monte_carlo_evaluation(standard_grid, eps_win_policy)
    print(theta)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
def main():
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
    for s, r in reversed(states_and_rewards):
        print("State, Immediate Reward: {},{}".format(s, r))
        if first:
            first = False
        else:
            print("State, Future Reward: {},{}".format(s, G))
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()
    print(states_and_returns)
    return states_and_returns


if __name__ == '__main__':
    # create the standard grid
    grid = standard_grid()

    # print rewards
    print("Rewards:")
    print_values(grid.rewards, grid)

    # state -> action is the policy
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
def calculate_greedy_policy(grid, V):
    policy = initialize_random_policy()
    # find a policy that leads to the optimal value function
    for s in policy.keys():
        grid.set_state(s)
        # loop through all possible actions to find the best current action
        best_a, _ = best_action_value(grid, V, s)
        policy[s] = best_a
    return policy


if __name__ == '__main__':
    # this grid charges step_cost = -2 for every non-terminal state,
    # and the agent obeys the chosen action only 50% of the time;
    # we want to see if this will encourage finding a shorter path to the goal
    grid = standard_grid(obey_prob=0.5, step_cost=-2)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    policy = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")