import algorithms as alg

# --- Environment setup -----------------------------------------------------
# MountainCar observations are continuous; every observation is routed
# through ``bd.encode`` (10 buckets per dimension, spanning the full
# low/high range of each dimension) before the agent sees it.
env = gym.make("MountainCar-v0")
obs_space = env.observation_space
bd = alg.encoding.BoxDiscretization(
    obs_space,
    N_buckets=[10, 10],
    limits=[[lo, hi] for lo, hi in zip(obs_space.low, obs_space.high)],
)
env = gym.wrappers.TransformObservation(env, bd.encode)

# --- Training --------------------------------------------------------------
print("Q-Learning")
alg.utils.random_seed(env, 1)  # fixed seed of 1 for the run
Q, history_qlearning = alg.qlearning(
    env,
    alpha=0.1,
    gamma=1,
    epsilon=0.1,
    N_episodes=2000,
)
alg.utils.plot_learning_curves(
    [history_qlearning],
    ["Q-Learning"],
    "mountaincar_td_learning.pdf",
)

# --- Visual roll-outs ------------------------------------------------------
# Play 10 episodes, always taking the greedy action for the current state
# according to the learned Q, and render after every step.
for i in range(10):
    state = env.reset()
    done = False
    # Per-episode counters; nothing in the visible part of the loop updates
    # or reads them -- NOTE(review): confirm whether later code uses them.
    steps = 1
    ret = 0
    while not done:
        action = alg.utils.select_action_greedy(Q[state])
        state, reward, done, info = env.step(action)
        env.render()
alpha=0.1, gamma=1, epsilon=1, N_episodes=10000, epsilon_decay=alg.utils.decay_sigmoid) pi = alg.utils.create_greedy_policy(Q) print( np.array([np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol)) evaluate_policy(env, pi, 10000, env.nS - 1) print("\nQ-Learning") alg.utils.random_seed(env, 1) Q, history_qlearning = alg.qlearning(env, alpha=0.1, gamma=1, epsilon=1, N_episodes=10000, epsilon_decay=alg.utils.decay_sigmoid) pi = alg.utils.create_greedy_policy(Q) print( np.array([np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol)) evaluate_policy(env, pi, 10000, env.nS - 1) print("\nExpected SARSA") alg.utils.random_seed(env, 1) Q, history_expected_sarsa = alg.expected_sarsa( env, alpha=0.1, gamma=1, epsilon=1,
Q, history_sarsa = alg.sarsa(env, alpha=0.5, gamma=0.99, epsilon=0.5, N_episodes=10000, epsilon_decay=alg.utils.decay_linear, alpha_decay=alg.utils.decay_linear) pi = alg.utils.create_greedy_policy(Q) evaluate_policy(env, pi, 1000) print("\nQ-Learning") alg.utils.random_seed(env, 1) Q, history_qlearning = alg.qlearning(env, alpha=0.5, gamma=0.99, epsilon=0.5, N_episodes=10000, epsilon_decay=alg.utils.decay_linear, alpha_decay=alg.utils.decay_linear) pi = alg.utils.create_greedy_policy(Q) evaluate_policy(env, pi, 1000) print("\nExpected SARSA") alg.utils.random_seed(env, 1) Q, history_expected_sarsa = alg.expected_sarsa( env, alpha=0.5, gamma=0.99, epsilon=0.5, N_episodes=10000, epsilon_decay=alg.utils.decay_linear,