def REINFORCE(env, alpha, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]
        # Generate one episode by following the current policy.
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        # Monte Carlo policy-gradient update for every step of the episode.
        for t in range(len(states)):
            # Discounted return from time step t.
            G_t = sum(gamma ** (k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha * (gamma ** t) * G_t * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

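# The functions in this file rely on an `ExponentialSoftmax` policy class that
# is defined elsewhere in the repo. As a reference, the sketch below shows one
# plausible minimal implementation of the interface the training code uses
# (`weights`, `sample_action`, `greedy_action`, `eligibility_vector`), i.e. a
# linear softmax policy with pi(a|s) proportional to exp(theta . x(s, a)).
# It is an assumption for illustration, not the repo's actual class.
import numpy as np


class ExponentialSoftmaxSketch:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def _probs(self, sa_pairs):
        # Action preferences h(s, a) = theta . x(s, a), converted to
        # probabilities with a numerically stable softmax.
        prefs = np.array([np.dot(self.weights, x) for x in sa_pairs])
        prefs -= prefs.max()
        exp_prefs = np.exp(prefs)
        return exp_prefs / exp_prefs.sum()

    def sample_action(self, sa_pairs):
        # Sample an action index according to the softmax probabilities.
        return int(np.random.choice(len(sa_pairs), p=self._probs(sa_pairs)))

    def greedy_action(self, sa_pairs):
        # Most probable action under the current policy.
        return int(np.argmax(self._probs(sa_pairs)))

    def eligibility_vector(self, action, sa_pairs):
        # For a linear softmax, grad log pi(a|s) = x(s, a) - sum_b pi(b|s) x(s, b).
        probs = self._probs(sa_pairs)
        expected_x = np.sum([p * np.asarray(x) for p, x in zip(probs, sa_pairs)],
                            axis=0)
        return np.asarray(sa_pairs[action]) - expected_x
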
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            # One-step TD error; the value of a terminal state is taken as zero.
            v_prime = 0.0 if done else v.evaluate(obs_prime_vec)
            delta = reward + gamma * v_prime - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * policy.eligibility_vector(a, sa_pairs)
            # Decay the discounting factor applied to later updates.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

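# `LinearValueFunction` is the critic used by the actor-critic and baseline
# code in this file. The sketch below is a minimal stand-in, assuming a plain
# linear state-value function v(s) = w . x(s) over the one-hot state features;
# only the `weights` attribute and `evaluate` method are assumed by the code
# above, and the repo's actual class may differ.
class LinearValueFunctionSketch:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def evaluate(self, x):
        # Linear value estimate for the state feature vector x.
        return np.dot(self.weights, x)
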
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th, lambda_w, \
        gamma, n_episodes):
    # Actor-critic with eligibility traces in the average-reward (continuing)
    # formulation, so `gamma` is not used inside the updates.
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            # TD error relative to the average-reward estimate R_bar.
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            R_bar += eta * delta
            # Decay and update the critic and actor eligibility traces.
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

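# `encode_state` and `encode_sa_pair` are assumed to be one-hot encoders over
# the discrete state and (state, action) spaces, since the code above treats
# their outputs as binary feature vectors of length observation_space_size and
# observation_space_size * action_space_size respectively. The sketches below
# illustrate one plausible layout (state-major for the pair encoder); the
# repo's actual index ordering may differ.
def encode_state_sketch(obs, n_states):
    # One-hot vector with a single 1 at the index of the observed state.
    x = np.zeros(n_states)
    x[obs] = 1.0
    return x


def encode_sa_pair_sketch(obs, action, n_states, n_actions):
    # One-hot vector over n_states * n_actions features; the active index uses
    # an assumed state-major layout.
    x = np.zeros(n_states * n_actions)
    x[obs * n_actions + action] = 1.0
    return x
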
def sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0, \
        env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy_bin_features(q, obs, epsilon, \
            env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(action)
            delta = reward
            sa_vec = encode_sa_pair(obs, action, env.observation_space_size, \
                env.action_space_size)
            idx_active = np.argwhere(sa_vec == 1)
            delta -= np.sum(q.weights[idx_active])
            # Accumulating traces.
            z[idx_active] += 1
            if done:
                # Update weights.
                q.weights += alpha * delta * z
            else:
                action_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                    env.observation_space_size, env.action_space_size)
                sa_prime_vec = encode_sa_pair(obs_prime, action_prime, \
                    env.observation_space_size, env.action_space_size)
                idx_active = np.argwhere(sa_prime_vec == 1)
                delta += gamma * np.sum(q.weights[idx_active])
                # Update weights.
                q.weights += alpha * delta * z
                # Update accumulating traces.
                z = gamma * lamda * z
                obs = obs_prime
                action = action_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q

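# `eps_greedy_policy_bin_features` is defined elsewhere in the repo. As a
# reference, here is a minimal sketch of an epsilon-greedy action selector for
# a linear Q-function with binary state-action features; it assumes only that
# `q.weights` is a flat weight vector, and it breaks ties via np.argmax.
def eps_greedy_policy_bin_features_sketch(q, obs, epsilon, n_states, n_actions):
    if np.random.rand() < epsilon:
        # Explore: uniformly random action.
        return np.random.randint(n_actions)
    # Exploit: evaluate Q(s, a) = w . x(s, a) for each action and take the best.
    q_values = [np.dot(q.weights, encode_sa_pair(obs, a, n_states, n_actions))
                for a in range(n_actions)]
    return int(np.argmax(q_values))
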
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]
        # Generate one episode by following the current policy.
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        for t in range(len(states)):
            # Discounted return from time step t.
            G_t = sum(gamma ** (k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            # Baseline error: return minus the state-value estimate.
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma ** t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha_th * (gamma ** t) * delta * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))

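# Hypothetical convenience wrapper (not part of the repo) showing one way to
# inspect the per-episode returns that REINFORCE_baseline collects; `env` is
# any environment exposing the interface used above, and matplotlib is only
# needed for this helper.
def plot_baseline_learning_curve(env, alpha_th=0.01, alpha_w=0.05, gamma=1.0,
                                 n_episodes=2000):
    import matplotlib.pyplot as plt
    policy, returns = REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes)
    plt.plot(returns)
    plt.xlabel('Episode')
    plt.ylabel('Undiscounted return')
    plt.title('REINFORCE with baseline')
    plt.show()
    return policy
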
def online_sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0,
                     env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_policy_bin_features(q, obs, epsilon, env.observation_space_size, \
            env.action_space_size)
        x = encode_sa_pair(obs, a, env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        Q_old = 0
        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                env.observation_space_size, env.action_space_size)
            x_prime = encode_sa_pair(obs_prime, a_prime, env.observation_space_size, \
                env.action_space_size)
            Q = q.evaluate(x)
            # The value of a terminal state-action pair is taken as zero.
            Q_prime = 0 if done else q.evaluate(x_prime)
            delta = reward + gamma * Q_prime - Q
            # Update (dutch-style) eligibility traces.
            z = gamma * lamda * z + (1 - alpha * gamma * lamda * np.dot(z, x)) * x
            # Update weights.
            q.weights += alpha * (delta + Q - Q_old) * z - alpha * (Q - Q_old) * x
            Q_old = Q_prime
            x = x_prime
            a = a_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q

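# Hypothetical usage sketch (not part of the repo): the accumulating-trace and
# true online Sarsa(lambda) implementations above share the same signature, so
# they can be swapped directly when comparing the two trace styles.
def train_sarsa_variants(env, lamda=0.9, alpha=0.1, gamma=0.99, epsilon=0.1,
                         n_episodes=5000):
    q_accumulating = sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes)
    q_true_online = online_sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes)
    return q_accumulating, q_true_online
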
def test_policy(env, policy, n_tests):
    # TODO: Move this function.
    import time
    input('Press Enter to begin tests.')
    for i in range(n_tests):
        done = False
        obs = env.reset()
        env.render()
        time.sleep(0.3)
        # Act greedily with respect to the learned policy and render each step.
        while not done:
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.greedy_action(all_sa_pairs)
            obs, _, done = env.step(a)
            env.render()
            time.sleep(0.3)

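# Hypothetical end-to-end usage (not part of the repo): train a softmax policy
# with REINFORCE and then watch it act greedily via test_policy. `make_env`
# stands in for however the repo constructs an environment with the interface
# used above (reset/step/render plus observation_space_size and
# action_space_size); the hyperparameters are placeholders.
def train_and_watch(make_env, alpha=0.01, gamma=1.0, n_episodes=5000, n_tests=5):
    env = make_env()
    policy = REINFORCE(env, alpha, gamma, n_episodes)
    test_policy(env, policy, n_tests)
    return policy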