def policy_iteration(env, policy, epsilon):
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in xrange(20000):
        # alternate Monte Carlo evaluation and epsilon-greedy improvement
        episode = generate_episode(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon)
    return q

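# A minimal sketch of what an on_policy_evaluation helper could look like here,
# assuming episodes are lists of (state, action, reward) tuples, q and visits_map
# map state -> {action: value}, and every-visit Monte Carlo averaging is used.
# These assumptions are illustrative; the repo's actual helper may differ.
def on_policy_evaluation(episode, q, visits_map, gamma=1.0):
    g = 0.0
    # walk the episode backwards, accumulating the discounted return
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        # incremental every-visit Monte Carlo average of the observed returns
        visits_map[state][action] += 1
        q[state][action] += (g - q[state][action]) / visits_map[state][action]
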
def policy_iteration(env, policy):
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in xrange(20000):
        # exploring starts: each episode begins from a random state-action pair
        episode = generate_episode_es(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        greedy_deterministic_policy_improvement(env, episode, q, policy)
    return q

def policy_iteration(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    # c accumulates importance-sampling weights for each state-action pair
    c = init_state_action_map(env)
    for _ in xrange(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
        greedy_stochastic_policy_improvement(env, episode, q, target_policy)
    return q

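# A minimal sketch of what off_policy_evaluation could look like here, using
# weighted importance sampling as in off-policy Monte Carlo control. It assumes
# episodes are lists of (state, action, reward) tuples, policies map
# state -> {action: probability}, and c holds cumulative importance weights;
# these assumptions are illustrative, not the repo's actual implementation.
def off_policy_evaluation(episode, q, c, target_policy, behavior_policy, gamma=1.0):
    g = 0.0  # return accumulated from the end of the episode backwards
    w = 1.0  # importance-sampling ratio
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        # accumulate the total weight seen for this state-action pair
        c[state][action] += w
        # weighted-importance-sampling incremental update toward the return
        q[state][action] += (w / c[state][action]) * (g - q[state][action])
        # rescale by the ratio of action probabilities under the two policies
        w *= target_policy[state][action] / behavior_policy[state][action]
        # once the target policy would never take this action, earlier steps
        # contribute nothing, so evaluation of this episode can stop
        if w == 0:
            break
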
def main():
    env = Blackjack()
    target_policy = init_policy(env)
    behavior_policy = init_equiprobable_random_policy(env)
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    # evaluate the target policy from episodes generated by the behavior policy
    for _ in xrange(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
    env.visualize_action_value(q)

def policy_iteration2(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in xrange(20000):
        episode = generate_episode(env, behavior_policy)
        # evaluation and improvement are interleaved within each episode
        fine_grained_off_policy_iteration(episode, q, c, target_policy,
                                          behavior_policy, gamma=1)
    return q

def double_q_learning(env, epsilon=0.1, alpha=0.5, gamma=1, num_episodes=1000):
    q1 = init_state_action_map(env)
    q2 = init_state_action_map(env)
    for i in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            # act epsilon-greedily with respect to the combined q1 + q2 estimate
            action = choose_doubled_epsilon_greedy_action(q1, q2, state, epsilon)
            (next_state, reward, done, _) = env.step(action)
            # update one estimate, using the other to evaluate the greedy action
            if random.random() < 0.5:
                double_q_update(q1, q2, state, action, reward, next_state, alpha, gamma)
            else:
                double_q_update(q2, q1, state, action, reward, next_state, alpha, gamma)
            state = next_state
    return q1, q2

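# A minimal sketch of what double_q_update could look like, assuming each q
# table maps state -> {action: value} and that terminal states have
# (zero-valued) entries; illustrative only, not the repo's actual helper.
def double_q_update(q_update, q_eval, state, action, reward, next_state, alpha, gamma):
    # choose the greedy next action according to the table being updated...
    greedy_action = max(q_update[next_state], key=q_update[next_state].get)
    # ...but evaluate it with the other table, which decouples action selection
    # from action evaluation and reduces Q-learning's maximization bias
    target = reward + gamma * q_eval[next_state][greedy_action]
    q_update[state][action] += alpha * (target - q_update[state][action])
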
def main():
    # define hyperparameters
    num_episodes = 1000
    epsilon = 1
    gamma = 0.9
    alpha = 0.1

    # create an env
    env = GridworldChase(12, 12, p_goal_move=1, agent_random_start=True, goal_random_start=True)

    # init q and get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running q-learning...")
    q = q_learning(env, q, epsilon=epsilon, alpha=alpha, gamma=gamma, num_episodes=num_episodes)
    print("q-learning complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q, delay=0.15)

def n_step_sarsa(env, n=5, alpha=0.5, epsilon=0.1, gamma=0.9, num_episodes=10):
    q = init_state_action_map(env)
    for _ in range(num_episodes):
        # reset states, actions, and rewards lists
        states = []
        actions = []
        rewards = [None]

        # reset state, action
        state = env.reset()
        states.append(state)
        action = choose_epsilon_greedy_action(q, state, epsilon)
        actions.append(action)

        T = float("inf")
        t = 0
        while True:
            # while more actions remain to be taken
            if t < T:
                action = actions[t]
                next_state, reward, done, _ = env.step(action)
                states.append(next_state)
                rewards.append(reward)
                if done:
                    T = t + 1
                else:
                    action = choose_epsilon_greedy_action(q, next_state, epsilon)
                    actions.append(action)

            # tau is the index on state/action updates
            tau = t - n + 1

            # if we are deep enough into an episode to perform an update
            if tau >= 0:
                # compute the target of the update (n-step return)
                G = sum([
                    gamma**(i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])
                if tau + n < T:
                    G = G + gamma**n * q[states[tau + n]][actions[tau + n]]
                q_value = q[states[tau]][actions[tau]]
                # update the q function
                q[states[tau]][actions[tau]] = q_value + alpha * (G - q_value)

            # don't update the terminal state
            if tau == T - 1:
                break
            t = t + 1
    return q

def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running double q-learning...")
    q1, q2 = double_q_learning(env)
    print("double q-learning complete")

    # determine post-training performance
    estimate_performance(env, q2, 0.01)
    visualize_performance(env, q2)

def main():
    goals = [(7, 0)]
    anti_goals = [(1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)]
    env = Gridworld(8, 4, goals, anti_goals)

    # init q and get baseline random performance
    q = init_state_action_map(env)
    estimate_performance(env, q, 1)

    # learn q
    print("running sarsa...")
    q = sarsa(env, q)
    print("sarsa complete")

    # determine post-training performance
    estimate_performance(env, q, 0.01)
    visualize_performance(env, q)

def main():
    x_limit = 8
    y_limit = 5
    goals = [(0, 4)]
    walls = [(0, 2), (1, 2), (2, 2), (3, 2)]
    env = Maze(x_limit, y_limit, goals, walls)
    num_episodes = 10

    # determine the baseline performance that results from taking random moves
    avg = sum([len(generate_random_episode(env))
               for _ in range(num_episodes)]) / float(num_episodes)
    print "baseline random performance: " + str(avg)

    # learn q
    print "running tabular dyna-q..."
    q = init_state_action_map(env)
    q = tabular_dyna_q(env, q)
    print "tabular dyna-q complete"

    # evaluate performance
    avg = sum([len(generate_epsilon_greedy_episode(env, q))
               for _ in range(num_episodes)]) / float(num_episodes)
    print "post learning performance: " + str(avg)

    # visualize post-training episode
    state = env.reset()
    while True:
        env.render()
        time.sleep(0.25)
        action = choose_epsilon_greedy_action(q, state, 0.1)
        state, _, done, _ = env.step(action)  # take an epsilon-greedy step
        if done:
            env.render(close=True)
            break

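# A minimal sketch of what tabular_dyna_q could look like, assuming q maps
# state -> {action: value} (including zero-valued entries for terminal states),
# the env exposes a gym-style reset/step interface, transitions are
# deterministic, and the repo's choose_epsilon_greedy_action helper is reused.
# The parameters and structure below are illustrative assumptions, not the
# repo's actual implementation.
import random

def tabular_dyna_q(env, q, n_planning=50, alpha=0.1, epsilon=0.1, gamma=0.95,
                   num_episodes=50):
    model = {}  # (state, action) -> (reward, next_state), learned from experience
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = choose_epsilon_greedy_action(q, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            # direct RL: one-step Q-learning update from real experience
            target = reward + gamma * max(q[next_state].values())
            q[state][action] += alpha * (target - q[state][action])
            # model learning: remember the observed transition
            model[(state, action)] = (reward, next_state)
            # planning: replay simulated transitions sampled from the model
            for _ in range(n_planning):
                (s, a), (r, s_next) = random.choice(list(model.items()))
                q[s][a] += alpha * (r + gamma * max(q[s_next].values()) - q[s][a])
            state = next_state
    return q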