            stored_actions[(t + 1) % (n + 1)] = action

            tau = t - n + 1
            if tau >= 0:
                # calculate G(tau:tau+n)
                G = np.sum([
                    discount_factor**(i - tau - 1) * stored_rewards[i % (n + 1)]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])
                if tau + n < T:
                    G += discount_factor**n * Q[stored_states[(tau + n) % (n + 1)]][
                        stored_actions[(tau + n) % (n + 1)]]
                tau_s, tau_a = stored_states[tau % (n + 1)], stored_actions[tau % (n + 1)]
                # update Q value with the n-step return
                Q[tau_s][tau_a] += alpha * (G - Q[tau_s][tau_a])

    return Q, stats


if __name__ == '__main__':
    Q, stats = n_step_sarsa(env, num_episodes=300, n=10)
    plots.plot_episode_stats(stats, file='results/n_step_sarsa/')
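# --------------------------------------------------------------------------
# A minimal sketch, assuming the usual tabular setup: the on-policy snippets
# in this section need a behavior policy built from Q. The name and signature
# below are illustrative assumptions (the function-approximation files later
# reference a make_epsilon_greedy_policy of their own), not the original code.
# --------------------------------------------------------------------------
import numpy as np

def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Map a state to epsilon-greedy action probabilities over nA actions."""
    def policy_fn(state):
        probs = np.ones(nA, dtype=float) * epsilon / nA
        probs[np.argmax(Q[state])] += 1.0 - epsilon
        return probs
    return policy_fn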
            action_probs = behavior_policy(state)
            action = np.random.choice(np.arange(nA), p=action_probs)
            stored_actions[(t + 1) % (n + 1)] = action

            tau = t - n + 1
            if tau >= 0:
                # calculate the importance sampling ratio rho
                rho = np.prod([
                    target_policy(stored_states[i % (n + 1)])[stored_actions[i % (n + 1)]] /
                    behavior_policy(stored_states[i % (n + 1)])[stored_actions[i % (n + 1)]]
                    for i in range(tau + 1, min(tau + n - 1, T - 1) + 1)
                ])
                # calculate the n-step return
                G = np.sum([
                    gamma**(i - tau - 1) * stored_rewards[i % (n + 1)]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])
                if tau + n < T:
                    # expected value of Q at the bootstrap state under the target policy
                    expected_sarsa_update = np.sum([
                        target_policy(stored_states[(tau + n) % (n + 1)])[a] *
                        Q[stored_states[(tau + n) % (n + 1)]][a]
                        for a in range(nA)
                    ])
                    G += gamma**n * expected_sarsa_update
                s_tau, a_tau = stored_states[tau % (n + 1)], stored_actions[tau % (n + 1)]
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * rho * td_error

    return Q, stats


if __name__ == '__main__':
    Q, stats = n_step_expected_sarsa(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_off_policy_expected_sarsa/')
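# --------------------------------------------------------------------------
# Hedged sketch: the ratio rho above divides target-policy by behavior-policy
# probabilities, so both policies must return full probability vectors. A
# greedy target policy consistent with that interface could look roughly like
# this (illustrative, not the original code):
# --------------------------------------------------------------------------
import numpy as np

def make_greedy_policy(Q, nA):
    """Map a state to a one-hot probability vector on the greedy action."""
    def policy_fn(state):
        probs = np.zeros(nA, dtype=float)
        probs[np.argmax(Q[state])] = 1.0
        return probs
    return policy_fn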
            next_state, reward, done, _ = env.step(action)
            next_action_probs = policy(next_state)
            next_action = np.random.choice(env.action_space.n, p=next_action_probs)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            td_error = reward + discount_factor * Q[next_state][next_action] - Q[state][action]
            E[state][action] += 1

            # update every state-action pair in proportion to its eligibility
            # trace, then decay the traces
            for s in Q.keys():
                for a_ in range(nA):
                    Q[s][a_] += alpha * td_error * E[s][a_]
                    E[s][a_] *= discount_factor * lambd

            if done:
                break

            state = next_state
            action = next_action

    return Q, stats


if __name__ == '__main__':
    Q, stats = sarsa_lambd(env, 300)
    plots.plot_episode_stats(stats, file='results/sarsa_lambda/')
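# --------------------------------------------------------------------------
# A minimal sketch, assuming the indexing used above: Q and the eligibility
# traces E both map a state to a vector of per-action values, and E is reset
# to zero at the start of every episode. The helper name is illustrative.
# --------------------------------------------------------------------------
from collections import defaultdict
import numpy as np

def make_state_action_table(nA):
    """defaultdict mapping state -> np.zeros(nA), matching Q[s][a] / E[s][a] above."""
    return defaultdict(lambda: np.zeros(nA))

# e.g. at the start of each episode:
# E = make_state_action_table(env.action_space.n)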
                state = next_state

            for t, transition in enumerate(trajectory):
                # total discounted return from time step t onward
                total_return = sum(self.gamma**i * tr.reward
                                   for i, tr in enumerate(trajectory[t:]))
                # baseline value estimate (detached so the advantage is a
                # constant with respect to the policy gradient)
                value_estimate = self.value_estimator.predict(transition.state).detach()
                advantage = torch.FloatTensor([total_return]) - value_estimate

                # update value estimator towards the observed return
                self.value_estimator.update(transition.state,
                                            torch.FloatTensor([total_return]),
                                            self.value_optimizer)

                # update policy estimator with the advantage-weighted action
                action = torch.LongTensor([transition.action])
                self.policy_estimator.update(transition.state, advantage, action,
                                             self.policy_optimizer)

        return stats


if __name__ == "__main__":
    agent = ReinforceBaselineAgent(env.observation_space.n, action_size, 2000)
    stats = agent.train()
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/pytorch_reinforce/')
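# --------------------------------------------------------------------------
# Hedged sketch: the loop above reads transition.state, transition.action and
# tr.reward from a collected trajectory. A namedtuple with exactly those
# fields fits that access pattern; the original definition may carry more
# (e.g. next_state, done).
# --------------------------------------------------------------------------
import collections

Transition = collections.namedtuple('Transition', ['state', 'action', 'reward'])

# e.g. while rolling out an episode:
# trajectory.append(Transition(state=state, action=action, reward=reward))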
            # sample action from the behavior policy
            action_probs = policy(state)
            action = np.random.choice(env.action_space.n, p=action_probs)

            # take action and observe the environment's effects
            next_state, reward, done, _ = env.step(action)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # bootstrap from the greedy target policy's action
            next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][next_action]

            # update Q value
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break

            state = next_state

    return Q, stats


if __name__ == '__main__':
    Q, stats = q_learning(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/Q_learning/')
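# --------------------------------------------------------------------------
# Self-contained toy check (not from the original file) of the update above:
# the TD target bootstraps from max_a Q(s', a), independent of the action the
# behavior policy goes on to take. All numbers here are made up.
# --------------------------------------------------------------------------
import numpy as np

Q = {0: np.array([0.0, 1.0]), 1: np.array([2.0, 0.5])}
alpha, discount_factor = 0.5, 0.9
state, action, reward, next_state = 0, 1, 1.0, 1

td_target = reward + discount_factor * np.max(Q[next_state])  # 1.0 + 0.9 * 2.0
Q[state][action] += alpha * (td_target - Q[state][action])
assert np.isclose(Q[state][action], 1.9)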
                # seed the recursion with Q(S_{t+1}, A_{t+1}); replaced by R_T
                # in the first loop iteration when the episode has ended
                G = Q[stored_states[(t + 1) % (n + 1)]][stored_actions[(t + 1) % (n + 1)]]
                for k in range(min(t + 1, T), tau, -1):
                    if k == T:
                        G = stored_rewards[T % (n + 1)]
                    else:
                        s_k = stored_states[k % (n + 1)]
                        a_k = stored_actions[k % (n + 1)]
                        r_k = stored_rewards[k % (n + 1)]
                        sigma_k = stored_sigma[k % (n + 1)]
                        rho_k = stored_rho[k % (n + 1)]
                        # expected value of Q at s_k under the target policy
                        v_ = np.sum([target_policy(s_k)[a] * Q[s_k][a] for a in range(nA)])
                        G = r_k + gamma * (sigma_k * rho_k +
                                           (1 - sigma_k) * target_policy(s_k)[a_k]) * (G - Q[s_k][a_k]) + gamma * v_

                s_tau, a_tau = stored_states[tau % (n + 1)], stored_actions[tau % (n + 1)]
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * td_error

    return Q, stats


if __name__ == '__main__':
    Q, stats = q_sigma(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_q_sigma/')
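# --------------------------------------------------------------------------
# Hedged sketch (an assumption, not from the original file): the loop above
# reads per-step sigma and rho values from ring buffers. A common choice is to
# sample sigma_t in {0, 1} at random, so Q(sigma) interpolates between tree
# backup (sigma = 0, pure expectation) and importance-sampled SARSA (sigma = 1).
# --------------------------------------------------------------------------
import numpy as np

def sample_sigma():
    """Degree of sampling for the current step: 0 = expectation, 1 = full sampling."""
    return np.random.randint(2)

def importance_ratio(target_probs, behavior_probs, action):
    """rho_t = pi(A_t | S_t) / b(A_t | S_t) for the action actually taken."""
    return target_probs[action] / behavior_probs[action]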
            state = next_state

    return stats


if __name__ == "__main__":
    estimator = Estimator()

    # Note: for Mountain Car we don't actually need an epsilon > 0.0 because
    # our initial estimate for all states is too "optimistic", which leads to
    # the exploration of all states.
    stats = sarsa(env, estimator, 200, epsilon=0.0)

    plots.plot_cost_to_go_mountain_car(env, estimator, file='results/sarsa/')
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/sarsa/')

    # uncomment to render
    # for i_episode in range(20):
    #     print(i_episode)
    #     policy = make_epsilon_greedy_policy(
    #         estimator, 0.0, env.action_space.n)
    #     observation = env.reset()
    #     for t in itertools.count():
    #         env.render()
    #         action = np.argmax(policy(observation))
    #         observation, reward, done, info = env.step(action)
    #         if done:
    #             print("Episode finished after {} timesteps".format(t + 1))
    #             break
    # env.close()
            state = next_state

    return stats


if __name__ == "__main__":
    estimator = Estimator()

    # Note: for Mountain Car we don't actually need an epsilon > 0.0 because
    # our initial estimate for all states is too "optimistic", which leads to
    # the exploration of all states.
    stats = q_learning(env, estimator, 200, epsilon=0.0)

    plots.plot_cost_to_go_mountain_car(env, estimator, file='results/q_learning/')
    plots.plot_episode_stats(stats, smoothing_window=25, file='results/q_learning/')

    # uncomment to render
    # for i_episode in range(20):
    #     print(i_episode)
    #     policy = make_epsilon_greedy_policy(
    #         estimator, 0.0, env.action_space.n)
    #     observation = env.reset()
    #     for t in itertools.count():
    #         env.render()
    #         action = np.argmax(policy(observation))
    #         observation, reward, done, info = env.step(action)
    #         if done:
    #             print("Episode finished after {} timesteps".format(t + 1))
    #             break
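# --------------------------------------------------------------------------
# Hedged sketch of the interface the two Mountain Car snippets above assume
# from Estimator: one model per action with predict/update. The feature map
# below is a crude stand-in (the original presumably uses richer features,
# e.g. RBF or tile coding); everything here is illustrative, not the original.
# --------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import SGDRegressor

class Estimator:
    def __init__(self, n_actions=3):
        self.models = []
        for _ in range(n_actions):
            model = SGDRegressor(learning_rate="constant")
            # prime the model so predict() works before the first real update
            model.partial_fit([self.featurize(np.zeros(2))], [0.0])
            self.models.append(model)

    def featurize(self, state):
        # quadratic features of the 2-d (position, velocity) state, plus bias
        s = np.asarray(state, dtype=float)
        return np.concatenate([s, s**2, [s[0] * s[1], 1.0]])

    def predict(self, state, action=None):
        feats = self.featurize(state)
        if action is None:
            return np.array([m.predict([feats])[0] for m in self.models])
        return self.models[action].predict([feats])[0]

    def update(self, state, action, target):
        self.models[action].partial_fit([self.featurize(state)], [target])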
                s_t1 = stored_states[(t + 1) % (n + 1)]
                # calculate the expected value of the leaf actions under the target policy
                leaf_sum = np.sum([target_policy(s_t1)[a] * Q[s_t1][a] for a in range(nA)])
                G = stored_rewards[(t + 1) % (n + 1)] + gamma * leaf_sum

                for k in range(min(t, T - 1), tau, -1):
                    # get the k-th state and action
                    s_k, a_k = stored_states[k % (n + 1)], stored_actions[k % (n + 1)]
                    # expected value over the actions not taken at s_k
                    a_probs = np.sum([target_policy(s_k)[a] * Q[s_k][a]
                                      for a in range(nA) if a != a_k])
                    G = stored_rewards[k % (n + 1)] + gamma * (
                        a_probs + target_policy(s_k)[a_k] * G)

                s_tau, a_tau = stored_states[tau % (n + 1)], stored_actions[tau % (n + 1)]
                td_error = G - Q[s_tau][a_tau]
                Q[s_tau][a_tau] += alpha * td_error

    return Q, stats


if __name__ == '__main__':
    Q, stats = n_step_tree_backup(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/n_step_tree_backup/')
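# --------------------------------------------------------------------------
# For reference (Sutton & Barto, 2nd ed., Section 7.5), the backward loop
# above computes the tree-backup return
#   G = R_k + gamma * sum_{a != A_k} pi(a | S_k) * Q(S_k, a)
#           + gamma * pi(A_k | S_k) * G
# so only actions the target policy could have taken contribute, and no
# importance-sampling ratios are needed.
# --------------------------------------------------------------------------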
            # take action and observe the environment's effects
            next_state, reward, done, _ = env.step(action)

            # update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # expectation of Q at the next state under the epsilon-greedy target policy
            next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * (
                (1 - epsilon) * Q[next_state][next_action] +
                (epsilon / nA) * np.sum([Q[next_state][a] for a in range(nA)]))

            # update Q value
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break

            state = next_state

    return Q, stats


if __name__ == '__main__':
    Q, stats = expected_sarsa_off_policy(env, num_episodes=300)
    plots.plot_episode_stats(stats, file='results/expected_sarsa_off_policy/')
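# --------------------------------------------------------------------------
# Self-contained check (illustrative, made-up numbers): the closed-form
# expectation used in td_target above equals a direct dot product of the
# epsilon-greedy probabilities with Q[next_state].
# --------------------------------------------------------------------------
import numpy as np

nA, epsilon = 4, 0.1
q_next = np.array([1.0, 3.0, 2.0, 0.0])

probs = np.ones(nA) * epsilon / nA
probs[np.argmax(q_next)] += 1.0 - epsilon

direct = probs @ q_next
closed_form = (1 - epsilon) * q_next.max() + (epsilon / nA) * q_next.sum()
assert np.isclose(direct, closed_form)  # both 2.85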