import itertools
import sys
from collections import defaultdict

import numpy as np

import utility


def sarsa(env, estimator, num_episodes, statistics, discount_factor=1.0,
          epsilon=0.1, epsilon_decay=1.0):
    """
    SARSA algorithm for on-policy TD control using function approximation.

    Args:
        env: OpenAI environment.
        estimator: Action-value function estimator.
        num_episodes: Number of episodes to run for.
        statistics: An EpisodeStats object with two numpy arrays for
            episode_lengths and episode_rewards.
        discount_factor: Lambda time discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Each episode, epsilon is decayed by this factor.

    Returns:
        Nothing. The estimator is updated in place and per-episode lengths
        and rewards are recorded in statistics.
    """
    for i_episode in range(num_episodes):
        # The policy we're following
        e_greedy_policy = utility.make_epsilon_greedy_policy_with_fa(
            estimator, epsilon * epsilon_decay**i_episode, env.action_space.n)

        # Print out which episode we're on, useful for debugging.
        # Also print the reward for the last episode.
        last_reward = statistics.episode_rewards[i_episode - 1]
        sys.stdout.flush()

        # Reset the environment and pick the first action
        observation = env.reset()
        action = utility.make_decision(e_greedy_policy, observation)

        for t in itertools.count():
            next_observation, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            # SARSA target: bootstrap from the action actually selected next.
            next_action = utility.make_decision(e_greedy_policy, next_observation)
            q_values_next = estimator.predict(next_observation, next_action)
            td_target = reward + discount_factor * q_values_next

            # Update the function approximator using our target
            estimator.update(observation, action, td_target)

            print("\rStep {} @ Episode {}/{} ({})".format(
                t, i_episode + 1, num_episodes, last_reward), end="")

            if done:
                break

            action = next_action
            observation = next_observation
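# sarsa() above assumes an estimator exposing predict(state, action) -> float and
# update(state, action, target), plus a utility.make_epsilon_greedy_policy_with_fa
# helper that turns the estimator into a stochastic policy. The utility module is
# not shown in this file; the sketch below is only a plausible stand-in inferred
# from how the helper is called, not the project's actual implementation.
def _make_epsilon_greedy_policy_with_fa_sketch(estimator, epsilon, num_actions):
    def policy_fn(observation):
        # Spread epsilon uniformly, then put the remaining mass on the greedy action.
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        q_values = [estimator.predict(observation, a) for a in range(num_actions)]
        action_probs[np.argmax(q_values)] += 1.0 - epsilon
        return action_probs
    return policy_fn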
def q_learning(env, num_episodes, statistics, discount_factor=1.0, alpha=0.5,
               epsilon=0.1):
    """
    Q-learning algorithm: off-policy TD control.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        statistics: An EpisodeStats object with two numpy arrays for
            episode_lengths and episode_rewards.
        discount_factor: Lambda time discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        Q, the optimal action-value function: a dictionary mapping
        state -> action values.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    e_greedy_policy = utility.make_epsilon_greedy_policy(Q, epsilon,
                                                         env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action
        state = env.reset()

        # One step in the environment
        for t in itertools.count():
            # Take a step
            action = utility.make_decision(e_greedy_policy, state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            # TD update: bootstrap from the best next action (off-policy).
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break

            state = next_state

    return Q
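# q_learning(), mc_control_epsilon_greedy() and expected_sarsa() all lean on two
# helpers from the project's utility module: make_epsilon_greedy_policy(Q, epsilon,
# num_actions), which returns a function mapping a state to action probabilities,
# and make_decision(policy, state), which samples an action from those
# probabilities. Their real implementations are not shown here; the sketches below
# are plausible stand-ins inferred from the call sites.
def _make_epsilon_greedy_policy_sketch(Q, epsilon, num_actions):
    def policy_fn(state):
        # Epsilon mass spread uniformly, remaining mass on the greedy action.
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        action_probs[np.argmax(Q[state])] += 1.0 - epsilon
        return action_probs
    return policy_fn


def _make_decision_sketch(policy, state):
    # Sample an action index from the policy's probability vector.
    action_probs = policy(state)
    return np.random.choice(len(action_probs), p=action_probs)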
def run_episode(env, greedy_policy):
    """Render a single episode while following the given greedy policy."""
    observation = env.reset()
    for t in itertools.count():
        env.render()
        action = utility.make_decision(greedy_policy, observation)
        ob, reward, done, info = env.step(action)
        if done:
            break
        observation = ob
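# run_episode() expects a policy in the same probability-vector form as the
# epsilon-greedy policies above, just without exploration. The helper below is an
# illustrative sketch of how a learned Q table could be wrapped for it; the
# constructor name is an assumption, not part of the original module.
def _make_greedy_policy_sketch(Q, num_actions):
    def policy_fn(state):
        # All probability mass on the current best action.
        action_probs = np.zeros(num_actions, dtype=float)
        action_probs[np.argmax(Q[state])] = 1.0
        return action_probs
    return policy_fn

# Example (assuming a Q table learned by q_learning on some gym environment):
#     run_episode(env, _make_greedy_policy_sketch(Q, env.action_space.n))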
def mc_control_epsilon_greedy(env, num_episodes, statistics, discount_factor=1.0,
                              epsilon=0.1):
    """
    Monte Carlo control using epsilon-greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        statistics: An EpisodeStats object with two numpy arrays for
            episode_lengths and episode_rewards.
        discount_factor: Lambda discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        Q, a dictionary mapping state -> action values. The epsilon-greedy
        policy built on top of Q is improved implicitly as Q changes.
    """
    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    policy = utility.make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples.
        episode = []
        state = env.reset()
        for t in itertools.count():
            action = utility.make_decision(policy, state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Find all (state, action) pairs we've visited in this episode.
        # Each pair is stored as a (state, action) tuple so it can be used
        # as a dictionary key.
        state_action_in_episode = set([(x[0], x[1]) for x in episode])
        for state, action in state_action_in_episode:
            state_action = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0] == state and x[1] == action)
            # Sum up all discounted rewards from the first occurrence onwards
            G = sum([x[2] * (discount_factor**i)
                     for i, x in enumerate(episode[first_occurrence_idx:])])
            # Calculate the average return for this pair over all sampled episodes
            returns_sum[state_action] += G
            returns_count[state_action] += 1.0
            Q[state][action] = returns_sum[state_action] / returns_count[state_action]

        # The policy is improved implicitly by changing the Q dictionary.

    return Q
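# Every control routine in this module records per-episode data into a
# pre-allocated statistics object with episode_lengths and episode_rewards arrays,
# as described in the docstrings. The container below is a minimal sketch of such
# an object; the project may define its own EpisodeStats elsewhere (e.g. in the
# utility module), so treat this as illustrative.
from collections import namedtuple

EpisodeStats = namedtuple("EpisodeStats", ["episode_lengths", "episode_rewards"])


def _make_episode_stats_sketch(num_episodes):
    # One slot per episode for the episode length and the accumulated reward.
    return EpisodeStats(episode_lengths=np.zeros(num_episodes),
                        episode_rewards=np.zeros(num_episodes))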
def expected_sarsa(env, num_episodes, statistics, discount_factor=1.0, alpha=0.5,
                   epsilon=0.1):
    """
    Expected SARSA algorithm: on-policy TD control.
    Finds the optimal epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for.
        statistics: An EpisodeStats object with two numpy arrays for
            episode_lengths and episode_rewards.
        discount_factor: Lambda time discount factor.
        alpha: TD learning rate.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        q, the optimal action-value function: a dictionary mapping
        state -> action values.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    e_greedy_policy = utility.make_epsilon_greedy_policy(q, epsilon,
                                                         env.action_space.n)

    for i_episode in range(num_episodes):
        # Print out which episode we're on, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        # Reset the environment and pick the first action
        observation = env.reset()

        # One step in the environment
        for t in itertools.count():
            # Take a step
            action = utility.make_decision(e_greedy_policy, observation)
            next_observation, reward, done, _ = env.step(action)

            # Update statistics
            statistics.episode_rewards[i_episode] += reward
            statistics.episode_lengths[i_episode] = t

            # Expected SARSA target: average the next-state action values under
            # the current epsilon-greedy policy, using a separate loop variable
            # so the action actually taken is preserved for the update below.
            expected_next_q = 0
            next_action_probs = e_greedy_policy(next_observation)
            for next_action, action_prob in enumerate(next_action_probs):
                expected_next_q += action_prob * q[next_observation][next_action]

            td_target = reward + discount_factor * expected_next_q
            td_delta = td_target - q[observation][action]
            q[observation][action] += alpha * td_delta

            if done:
                break

            observation = next_observation

    return q
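# A hedged end-to-end usage sketch. The environment id, episode count and the
# EpisodeStats construction above are illustrative assumptions, not part of the
# original module.
if __name__ == "__main__":
    import gym

    demo_env = gym.make("CliffWalking-v0")  # any discrete gym environment works
    demo_episodes = 500
    demo_stats = _make_episode_stats_sketch(demo_episodes)
    learned_q = expected_sarsa(demo_env, demo_episodes, demo_stats,
                               alpha=0.5, epsilon=0.1)
    print("\nMean reward over the last 100 episodes:",
          np.mean(demo_stats.episode_rewards[-100:]))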