def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th,
                                    lambda_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
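# The actor-critic and REINFORCE routines in this file assume a linear softmax
# policy object with a `weights` vector, a `sample_action(sa_pairs)` method and
# an `eligibility_vector(a, sa_pairs)` method. The class below is only a
# minimal sketch of that interface, written against how it is called here
# (numpy is assumed to be imported as np, as in the rest of this file); the
# repository's actual ExponentialSoftmax class may differ in its details.
class ExponentialSoftmax:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def action_probabilities(self, sa_pairs):
        # Action preferences h(s, a) = w . x(s, a), turned into a softmax
        # distribution over the actions.
        prefs = np.array([np.dot(self.weights, x) for x in sa_pairs])
        prefs -= prefs.max()  # for numerical stability
        exp_prefs = np.exp(prefs)
        return exp_prefs / exp_prefs.sum()

    def sample_action(self, sa_pairs):
        return np.random.choice(len(sa_pairs), p=self.action_probabilities(sa_pairs))

    def eligibility_vector(self, a, sa_pairs):
        # grad ln pi(a|s, theta) = x(s, a) - sum_b pi(b|s, theta) x(s, b)
        # for a linear softmax policy.
        probs = self.action_probabilities(sa_pairs)
        expected = np.sum([p * np.array(x) for p, x in zip(probs, sa_pairs)], axis=0)
        return np.array(sa_pairs[a]) - expected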
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * policy.eligibility_vector(a, sa_pairs)
            # I accumulates the discounting: I = gamma^t.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
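# These routines also rely on one-hot encoders and a linear value function that
# are defined elsewhere in the repository. The versions below are a plausible
# minimal sketch, included only so the surrounding code can be read (and run)
# in isolation; the actual encode_state, encode_sa_pair and LinearValueFunction
# may be implemented differently.
def encode_state(obs, n_states):
    # One-hot feature vector for a discrete state index.
    x = np.zeros(n_states)
    x[obs] = 1.0
    return x


def encode_sa_pair(obs, a, n_states, n_actions):
    # One-hot feature vector over all (state, action) pairs.
    x = np.zeros(n_states * n_actions)
    x[a * n_states + obs] = 1.0
    return x


class LinearValueFunction:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def evaluate(self, feature_vector):
        # v_hat(s, w) = w . x(s).
        return np.dot(self.weights, np.array(feature_vector))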
def least_squares_td(env, policy, epsilon, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    n = tile_coder.total_n_tiles
    A = (1 / epsilon) * np.eye(n)
    b = np.zeros((n, 1))
    d = np.zeros((n, 1))
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs,
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            x = np.array(tile_coder.get_tile_code(obs)).reshape(-1, 1)
            x_prime = np.array(tile_coder.get_tile_code(obs_prime)).reshape(-1, 1)
            b = b + reward * x
            d = x - gamma * x_prime
            A = A + x @ d.T
            if env.steps == 2:
                # First transition: compute the inverse of A directly.
                inv_A = np.linalg.inv(A)
            else:
                # Afterwards, maintain the inverse incrementally via the
                # Sherman-Morrison formula.
                t = np.eye(n) - (((x @ d.T) / (1 + ((d.T @ inv_A) @ x))) @ inv_A)
                inv_A = inv_A @ t
            theta = inv_A @ b
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    v = LinearValueFunction(tile_coder.total_n_tiles)
    v.weights = theta.flatten()
    return v
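# A quick, self-contained check of the Sherman-Morrison step used in
# least_squares_td above: after each rank-one update A <- A + x d^T, the
# incrementally maintained inverse should match a direct np.linalg.inv. The
# data here is synthetic; nothing depends on the environment or tile coder.
def _check_sherman_morrison(n=4, epsilon=0.01, n_updates=10, seed=0):
    rng = np.random.default_rng(seed)
    A = (1 / epsilon) * np.eye(n)
    inv_A = np.linalg.inv(A)
    for _ in range(n_updates):
        x = rng.normal(size=(n, 1))
        d = rng.normal(size=(n, 1))
        A = A + x @ d.T
        t = np.eye(n) - (((x @ d.T) / (1 + ((d.T @ inv_A) @ x))) @ inv_A)
        inv_A = inv_A @ t
        assert np.allclose(inv_A, np.linalg.inv(A))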
def online_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # Initialize value function.
    v = LinearValueFunction(env.n_states)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        V_old = 0
        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            V = v.evaluate(obs_vec)
            V_prime = v.evaluate(obs_prime_vec)
            delta = reward + gamma * V_prime - V
            # Update eligibility traces.
            z = gamma * lamda * z + \
                (1 - alpha * gamma * lamda * np.dot(z, obs_vec)) * obs_vec
            # Update weights.
            v.weights += alpha * (delta + V - V_old) * z - \
                alpha * (V - V_old) * obs_vec
            V_old = V_prime
            obs_vec = obs_prime_vec
    return v
def gradient_mc_prediction(env, policy, alpha, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    for episode in range(n_episodes):
        # Reset the trajectory buffers at the start of each episode.
        states = []
        rewards = [None]
        done = False
        obs = env.reset()
        # Store the feature vector representation of the state.
        states.append(tile_coder.get_tile_code(obs))
        feature_vectors = tile_coder.get_feature_vectors_for_actions(obs,
            env.action_space_size)
        a = policy.greedy_action(feature_vectors)
        while not done:
            obs, reward, done = env.step(a)
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs,
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            rewards.append(reward)
            states.append(tile_coder.get_tile_code(obs))
        for i in range(len(states)):
            G = np.sum(rewards[i + 1:])
            # Update weights.
            v.weights += alpha * np.dot((G - v.evaluate(states[i])), states[i])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v
def semi_gradient_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # Initialize value function.
    v = LinearValueFunction(env.n_states)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            # Update eligibility traces.
            z = gamma * lamda * z + obs_vec
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            # Update weights.
            v.weights += alpha * delta * z
            obs_vec = obs_prime_vec
    return v
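# semi_gradient_td_lambda and online_td_lambda above are prediction-only
# routines: they call env.step() with no action and expect env.n_states,
# env.reset() and (obs, reward, done) tuples. The environment below is a
# hypothetical random walk matching that interface, included purely as an
# illustration; the repository's own environments may differ. States 0 and
# n_states - 1 are absorbing terminals, and since they never appear as the
# current state in an update, their estimated values stay at zero, so
# bootstrapping from them is harmless.
class RandomWalkEnv:
    def __init__(self, n_states=7):
        self.n_states = n_states

    def reset(self):
        self.state = self.n_states // 2
        return self.state

    def step(self):
        self.state += np.random.choice([-1, 1])
        done = self.state == 0 or self.state == self.n_states - 1
        reward = 1.0 if self.state == self.n_states - 1 else 0.0
        return self.state, reward, done


# Example: v = online_td_lambda(RandomWalkEnv(), lamda=0.9, alpha=0.05,
#                               gamma=1.0, n_episodes=500)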
def semi_gradient_td_zero(env, policy, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs,
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            s = tile_coder.get_tile_code(obs)
            s_prime = tile_coder.get_tile_code(obs_prime)
            # Update weights.
            v.weights += alpha * np.dot((reward + gamma * v.evaluate(s_prime) -
                                         v.evaluate(s)), s)
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v
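# The tile-coded routines in this file expect a tile_coder object exposing
# total_n_tiles, get_tile_code(obs) and get_feature_vectors_for_actions(obs,
# n_actions). The class below sketches one way that interface could look for a
# single continuous state variable, using several offset tilings and the usual
# construction x(s, a) = state tile code copied into action a's block. It is an
# assumption for illustration only; the repository's actual tile coder (state
# dimensionality, hashing, tile widths) is likely more general.
class SimpleTileCoder1D:
    def __init__(self, low, high, n_tilings=8, n_tiles=8):
        self.low = low
        self.n_tilings = n_tilings
        self.n_tiles = n_tiles
        self.total_n_tiles = n_tilings * n_tiles
        self.tile_width = (high - low) / (n_tiles - 1)

    def get_tile_code(self, obs):
        # Binary feature vector with exactly one active tile per tiling.
        x = np.zeros(self.total_n_tiles)
        for tiling in range(self.n_tilings):
            offset = tiling * self.tile_width / self.n_tilings
            idx = int((obs - self.low + offset) / self.tile_width)
            idx = min(max(idx, 0), self.n_tiles - 1)
            x[tiling * self.n_tiles + idx] = 1.0
        return x

    def get_feature_vectors_for_actions(self, obs, n_actions):
        # One feature vector per action: the state's tile code placed in that
        # action's block of an otherwise-zero vector.
        state_code = self.get_tile_code(obs)
        vectors = []
        for a in range(n_actions):
            x = np.zeros(self.total_n_tiles * n_actions)
            x[a * self.total_n_tiles:(a + 1) * self.total_n_tiles] = state_code
            vectors.append(x)
        return vectors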
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                        env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                            env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        for t in range(len(states)):
            G_t = sum(rewards[t + 1:])
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                            env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha_th * (gamma**t) * delta * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))
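# A minimal smoke test for REINFORCE_baseline. TwoArmedBanditEnv is a
# hypothetical single-state, single-decision episodic task that matches the env
# interface used above (observation_space_size, action_space_size, reset,
# step); it exists only to show how the function is wired up, and it still
# relies on the helpers (encode_state, encode_sa_pair, print_episode) defined
# in this repository.
class TwoArmedBanditEnv:
    observation_space_size = 1
    action_space_size = 2

    def reset(self):
        return 0

    def step(self, a):
        # Action 1 pays +1, action 0 pays 0; the episode ends immediately.
        return 0, float(a), True


def _smoke_test_reinforce_baseline():
    policy, returns = REINFORCE_baseline(TwoArmedBanditEnv(), alpha_th=0.1,
                                         alpha_w=0.1, gamma=1.0, n_episodes=200)
    # The policy should come to prefer action 1, so the average return over the
    # last episodes should approach 1.
    print("average return over the last 50 episodes:", returns[-50:].mean())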
def semi_gradient_n_step_td(env, policy, n, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    # Circular buffers of length n + 1 so that S_tau and S_{tau+n} never share
    # a slot.
    states = [None] * (n + 1)
    rewards = np.zeros(n + 1)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = tile_coder.get_tile_code(obs)
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                feature_vectors = tile_coder.get_feature_vectors_for_actions(obs,
                    env.action_space_size)
                a = policy.greedy_action(feature_vectors)
                obs, reward, done = env.step(a)
                states[(t + 1) % (n + 1)] = tile_coder.get_tile_code(obs)
                rewards[(t + 1) % (n + 1)] = reward
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                # Calculate the n-step return for time tau.
                G = np.sum([gamma**(i - tau - 1) * rewards[i % (n + 1)]
                            for i in range(tau + 1, min(tau + n, T) + 1)])
                if tau + n < T:
                    G += gamma**n * v.evaluate(states[(tau + n) % (n + 1)])
                # Update weights.
                v.weights += alpha * np.dot((G - v.evaluate(states[tau % (n + 1)])),
                                            states[tau % (n + 1)])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v