def semi_gradient_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes,
                               tile_coder, action_len, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    # Circular buffers of length n+1 so that S_tau, A_tau, and R_tau are still
    # available when the update for time tau is made (indices taken mod n+1).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1, dtype=int)
    rewards = np.zeros(n + 1)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder,
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t + 1) % (n + 1)] = reward
                states[(t + 1) % (n + 1)] = obs_prime
                if done:
                    T = t + 1
                else:
                    a = eps_greedy_func_policy(q, obs_prime, epsilon,
                                               tile_coder, env.action_space_size)
                    actions[(t + 1) % (n + 1)] = a
            tau = t - n + 1
            if tau > -1:
                # Calculate the n-step return G_{tau:tau+n}.
                G = np.sum([gamma**(i - tau - 1) * rewards[i % (n + 1)]
                            for i in range(tau + 1, min(tau + n, T) + 1)])
                if tau + n < T:
                    s_n = states[(tau + n) % (n + 1)]
                    a_n = actions[(tau + n) % (n + 1)]
                    x_n = tile_coder.get_feature_vector(s_n, a_n)
                    G += gamma**n * q.evaluate(x_n)
                s_tau = states[tau % (n + 1)]
                a_tau = actions[tau % (n + 1)]
                x = tile_coder.get_feature_vector(s_tau, a_tau)
                # Update weights.
                q.weights += alpha * (G - q.evaluate(x)) * x
            t += 1
        print_episode(episode, n_episodes)
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:',
                     'Number of steps:',
                     'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
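
# NOTE: LinearPolicy and eps_greedy_func_policy are defined elsewhere in the
# project. The sketch below is only an assumed, minimal version of that
# interface (a linear action-value function over tile-coded features and an
# epsilon-greedy selector); it is illustrative, not the project's own code.
import numpy as np


class LinearPolicySketch:
    """Hypothetical stand-in for LinearPolicy: q(s, a) = w . x(s, a)."""

    def __init__(self, n_features):
        # The real class is built as LinearPolicy(total_n_tiles, action_len,
        # action_space_size); here the weight vector is simply sized to match
        # the feature vectors produced by the tile coder (an assumption).
        self.weights = np.zeros(n_features)

    def evaluate(self, x):
        # Linear state-action value estimate.
        return np.dot(self.weights, x)


def eps_greedy_func_policy_sketch(q, obs, epsilon, tile_coder, n_actions):
    """Hypothetical epsilon-greedy action selection over approximate values."""
    if np.random.random() < epsilon:
        return np.random.randint(n_actions)
    values = [q.evaluate(tile_coder.get_feature_vector(obs, a))
              for a in range(n_actions)]
    return int(np.argmax(values))
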
def differential_semi_gradient_n_step_sarsa(env, n, alpha, beta, epsilon,
                                            n_episodes, tile_coder,
                                            action_vec_dim, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    # Circular buffers of length n+1 so that S_tau and A_tau are still
    # available when the update for time tau is made (indices taken mod n+1).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1, dtype=int)
    rewards = np.zeros(n + 1)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder,
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        while not done:
            obs, reward, done = env.step(a)
            states[(t + 1) % (n + 1)] = obs
            rewards[(t + 1) % (n + 1)] = reward
            a = eps_greedy_func_policy(q, obs, epsilon, tile_coder,
                                       env.action_space_size)
            actions[(t + 1) % (n + 1)] = a
            tau = t - n + 1
            if tau > -1:
                x = tile_coder.get_feature_vector(states[tau % (n + 1)],
                                                  actions[tau % (n + 1)])
                x_n = tile_coder.get_feature_vector(states[(tau + n) % (n + 1)],
                                                    actions[(tau + n) % (n + 1)])
                # Differential n-step TD error.
                summ = np.sum([rewards[i % (n + 1)] - r_bar
                               for i in range(tau + 1, tau + n + 1)])
                delta = summ + q.evaluate(x_n) - q.evaluate(x)
                r_bar += beta * delta
                q.weights += alpha * delta * x
            t += 1
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:',
                     'Number of steps:',
                     'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
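
# NOTE: the tile_coder passed to the functions above is assumed to expose a
# total_n_tiles attribute and get_feature_vector(state, action). The project
# defines its own tile coder elsewhere; the sketch below (uniform grid tilings
# over a bounded state space, with one block of state tiles per action) is only
# one plausible implementation of that interface.
import numpy as np


class TileCoderSketch:
    """Hypothetical tile coder producing binary state-action features."""

    def __init__(self, lows, highs, n_tiles_per_dim, n_tilings, n_actions):
        self.lows = np.asarray(lows, dtype=float)
        self.highs = np.asarray(highs, dtype=float)
        self.n_tiles_per_dim = n_tiles_per_dim
        self.n_tilings = n_tilings
        self.n_actions = n_actions
        self.tiles_per_tiling = n_tiles_per_dim ** len(lows)
        # Total number of state tiles across all tilings.
        self.total_n_tiles = n_tilings * self.tiles_per_tiling

    def _active_tile(self, state, tiling):
        # Scale the state to [0, 1], shift this tiling by a fraction of a tile
        # width, and return the index of its single active tile.
        scaled = (np.asarray(state, dtype=float) - self.lows) / (self.highs - self.lows)
        offset = tiling / float(self.n_tilings * self.n_tiles_per_dim)
        coords = np.clip(((scaled + offset) * self.n_tiles_per_dim).astype(int),
                         0, self.n_tiles_per_dim - 1)
        flat = 0
        for c in coords:
            flat = flat * self.n_tiles_per_dim + int(c)
        return tiling * self.tiles_per_tiling + flat

    def get_feature_vector(self, state, action):
        # Binary state-action features: one block of state tiles per action.
        x = np.zeros(self.total_n_tiles * self.n_actions)
        for tiling in range(self.n_tilings):
            x[int(action) * self.total_n_tiles + self._active_tile(state, tiling)] = 1.0
        return x
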
def Q_learning(env, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    Q = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            sa_pairs = product(states, range(9))
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in sa_pairs:
                Q[pair] = 0
        curr_row += row
    # Store episode returns for plotting.
    rewards = []
    # Linearly decay epsilon; fall back to 0.1 once it would drop to zero.
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0 else 0.1
    for episode in range(n_episodes):
        done = False
        val = 0
        obs = env.reset()
        while not done:
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            val += reward
            action_values = [Q[obs_prime, i] for i in range(9)]
            opt_a = np.argmax(action_values)
            # Update state-action value estimate.
            Q[obs, action] += alpha * (reward + gamma * Q[obs_prime, opt_a]
                                       - Q[obs, action])
            obs = obs_prime
        epsilon = decay(epsilon)
        rewards.append(val)
        if episode % 10 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    # Plot returns over the training process.
    create_line_plot(range(len(rewards)), rewards, 'Episode number:',
                     'Return:', 'Agent returns over training:')
    return Q
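
# NOTE: eps_greedy_policy above is the tabular counterpart of
# eps_greedy_func_policy and is defined elsewhere in the project. The sketch
# below shows the assumed behaviour: epsilon-greedy selection over a Q
# dictionary keyed by (state, action) pairs. It is illustrative only.
import numpy as np


def eps_greedy_policy_sketch(Q, obs, epsilon, n_actions):
    """Hypothetical epsilon-greedy selection from a tabular Q."""
    if np.random.random() < epsilon:
        return np.random.randint(n_actions)
    action_values = [Q[obs, a] for a in range(n_actions)]
    return int(np.argmax(action_values))
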
def semi_gradient_sarsa(env, alpha, gamma, epsilon, n_episodes, tile_coder,
                        action_len):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder,
                                   env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(a)
            x = tile_coder.get_feature_vector(obs, a)
            if done:
                # Terminal update: no bootstrap from the next state.
                q.weights += alpha * (reward - q.evaluate(x)) * x
            else:
                a_prime = eps_greedy_func_policy(q, obs_prime, epsilon,
                                                 tile_coder, env.action_space_size)
                x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
                # Update weights.
                q.weights += alpha * (reward + gamma * q.evaluate(x_prime)
                                      - q.evaluate(x)) * x
                obs = obs_prime
                a = a_prime
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance over training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:',
                     'Number of steps:',
                     'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
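
# A hypothetical driver showing how the episodic variants above might be
# invoked. The environment and tile coder must match the interfaces assumed in
# these listings, and every hyperparameter value below is an illustrative
# placeholder rather than a tuned setting.
def run_training_sketch(env, tile_coder, action_len):
    """Train the one-step and n-step semi-gradient Sarsa agents (sketch)."""
    q_one_step = semi_gradient_sarsa(env, alpha=0.1, gamma=1.0, epsilon=0.1,
                                     n_episodes=500, tile_coder=tile_coder,
                                     action_len=action_len)
    q_n_step = semi_gradient_n_step_sarsa(env, n=4, alpha=0.1, gamma=1.0,
                                          epsilon=0.1, n_episodes=500,
                                          tile_coder=tile_coder,
                                          action_len=action_len,
                                          stop_threshold=1000)
    return q_one_step, q_n_step
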
def differential_semi_gradient_sarsa(env, alpha, beta, epsilon, n_episodes,
                                     tile_coder, action_vec_dim, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder,
                                   env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, tile_coder,
                                             env.action_space_size)
            x = tile_coder.get_feature_vector(obs, a)
            x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
            # Differential TD error and average-reward update.
            delta = reward - r_bar + q.evaluate(x_prime) - q.evaluate(x)
            r_bar += beta * delta
            # Update weights.
            q.weights += alpha * delta * x
            obs = obs_prime
            a = a_prime
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:',
                     'Number of steps:',
                     'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
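
# NOTE: print_episode and create_line_plot are small utilities defined
# elsewhere in the project. The sketch below captures the assumed behaviour (a
# progress message and a simple matplotlib line plot) so the listings above can
# be read on their own; it is illustrative, not the project's own code.
import matplotlib.pyplot as plt


def print_episode_sketch(episode, n_episodes):
    """Hypothetical progress printer with the same signature as print_episode."""
    print('Episode {}/{}'.format(episode, n_episodes), end='\r')


def create_line_plot_sketch(x, y, x_label, y_label, title):
    """Hypothetical plotting helper with the same signature as create_line_plot."""
    plt.plot(list(x), list(y))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
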
        # Per-step value-function and policy updates (REINFORCE with baseline).
        for t in range(len(states)):
            # Monte Carlo return from time t (the experiment below uses gamma = 1).
            G_t = sum(rewards[t + 1:])
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            policy.weights += alpha_th * (gamma**t) * delta * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))


if __name__ == '__main__':
    gamma = 1
    alpha_w = 0.001
    alpha_th = 0.000001
    n_episodes = 1000
    env = ShortCorridor()
    all_returns = np.array([REINFORCE_baseline(env, alpha_th, alpha_w, gamma,
                                               n_episodes)[1]
                            for i in range(150)])
    # Average the per-episode returns over the independent runs.
    all_returns = np.mean(all_returns, axis=0)
    create_line_plot(range(all_returns.shape[0]), all_returns, 'Episode number:',
                     'Average return:',
                     'Returns averaged over 150 independent runs:')
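
# NOTE: the REINFORCE-with-baseline fragment above relies on encode_state,
# encode_sa_pair, a linear value function v, and a softmax policy exposing
# eligibility_vector(action, all_sa_pairs), all defined elsewhere in the
# project. The sketch below shows one self-consistent way those pieces could
# fit together (one-hot features and a softmax policy over linear action
# preferences, whose eligibility vector is x(s, a) - sum_b pi(b|s) x(s, b));
# it is an assumption, not the project's actual implementation.
import numpy as np


def encode_state_sketch(state, n_states):
    """Hypothetical one-hot state encoding."""
    x = np.zeros(n_states)
    x[state] = 1.0
    return x


def encode_sa_pair_sketch(state, action, n_states, n_actions):
    """Hypothetical one-hot state-action encoding."""
    x = np.zeros(n_states * n_actions)
    x[state * n_actions + action] = 1.0
    return x


class SoftmaxPolicySketch:
    """Hypothetical softmax policy with linear action preferences."""

    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def action_probabilities(self, all_sa_features):
        prefs = np.array([np.dot(self.weights, x) for x in all_sa_features])
        prefs -= prefs.max()  # for numerical stability
        exp_prefs = np.exp(prefs)
        return exp_prefs / exp_prefs.sum()

    def eligibility_vector(self, action, all_sa_features):
        # Gradient of ln pi(action | state) for a softmax-linear policy.
        probs = self.action_probabilities(all_sa_features)
        expected_feature = sum(p * x for p, x in zip(probs, all_sa_features))
        return all_sa_features[action] - expected_feature
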