from mdps.learners import q_learning, make_epsilon_greedy_policy, linear_decrease, exponential_decrease
from experiments import get_env
import matplotlib.pyplot as plt
from mdps.evaluate_policy import evaluate_solutions
import numpy as np

# Effect of exploration on the Taxi problem: train with a fixed epsilon (no decay)
# and compare the full learning curve against the tail of training.
name_no_slip, env_no_slip = get_env("taxi")
policy, stats_no_slip = q_learning(env_no_slip, 10000, .65, .6, .1)

f, (ax1, ax2) = plt.subplots(1, 2)
f.suptitle("Effect of Exploration on Taxi Problem\n"
           "10000 iterations, .65 Discount, .6 Alpha, .1 Epsilon - No Decay")
ax1.plot(stats_no_slip.episode_scores, label="All Episodes")
ax2.plot(stats_no_slip.episode_scores[4000:], label="Episodes after episode 4000")
ax1.legend()
ax2.legend()
plt.show()
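# All of the experiments in this section go through q_learning's epsilon-greedy
# training loop. A minimal sketch of that loop, assuming a standard tabular
# Q-learning update on a discrete environment with the classic gym reset/step
# API; the mdps.learners implementation may differ (e.g. in how it records
# stats and applies decay schedules), so treat this only as an illustration.
import numpy as np

def q_learning_sketch(env, episodes, discount, alpha, epsilon):
    # Q-table indexed by (state, action); gym's Discrete spaces expose .n
    q = np.zeros((env.observation_space.n, env.action_space.n))
    scores = []
    for _ in range(episodes):
        state = env.reset()
        done, total = False, 0.0
        while not done:
            # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(q[state]))
            next_state, reward, done, _ = env.step(action)
            # Temporal-difference update toward the bootstrapped target
            target = reward + discount * np.max(q[next_state]) * (not done)
            q[state, action] += alpha * (target - q[state, action])
            state = next_state
            total += reward
        scores.append(total)
    return q, scores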
# No-decay baseline on the 8x8 frozen-lake map: compare the non-slippery and
# slippery variants under the same fixed hyperparameters. Assumes the imports
# above and a shared EPISODE_LENGTH constant (number of training episodes).
DISCOUNT = .99
ALPHA = .9
EPSILON = .9
TAIL = int(EPISODE_LENGTH * .8)

_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
f.suptitle("8x8 problem - {} iterations, {} Discount, {} LR, "
           "{} Epsilon - No Decay".format(EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON))
ax1.set_title("No Slip")
ax2.set_title("Slip")

q_no_slip, stats_no_slip = q_learning(env_no_slip, EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON)
q_slip, stats_slip = q_learning(env_slip, EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON)

ax1.plot(stats_no_slip.episode_scores, label="No Slipping")
ax2.axis('off')
# Render the learned slippery-environment policy on the disabled axis
# (call arguments assumed, mirroring the linear-decay experiment below).
pol_slip = convert_q_to_policy(q_slip, env_slip)
visualize_ice_policy(env_slip, pol_slip, ax=ax2)
ax3.plot(stats_slip.episode_scores, label="Slipping")
ax4.axis('off')  # unused panel
ax1.legend()
ax3.legend()
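# convert_q_to_policy is used above (and in the linear-decay run below) to turn
# the learned Q-table into the policy that visualize_ice_policy draws. A minimal
# sketch of that conversion, assuming a (num_states, num_actions) Q-table and a
# greedy argmax per state; the repo's helper may break ties or handle terminal
# states differently.
import numpy as np

def greedy_policy_from_q(q_table):
    # One action index per state: the action with the highest estimated value
    return np.argmax(q_table, axis=1)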
# Linear-decay run on the slippery 8x8 map: epsilon and alpha start fixed and
# are decreased after the division points (DECREASE_EPSILON_POINT and
# ALPHA_DECREASE_POINT, assumed to be defined with the other constants).
TAIL = int(EPISODE_LENGTH * .8)

_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

f, (ax1, ax2) = plt.subplots(1, 2)
f.suptitle("8x8 problem Linear Decrease - {} iterations, {} Discount, {} LR, "
           "{} Epsilon\n Epsilon Decay after {} Alpha Decay After {}".format(
               EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON,
               DECREASE_EPSILON_POINT, ALPHA_DECREASE_POINT))
ax1.set_title("Success per hundred Episodes")
ax2.set_title("Final Policy - Slip")

q_slip, stats_slip = q_learning(env_slip, EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON,
                                epsilon_division_point=DECREASE_EPSILON_POINT,
                                alpha_division_point=ALPHA_DECREASE_POINT)

ax1.plot(stats_slip.success_per_hundred, label="Slipping")
pol_slip = convert_q_to_policy(q_slip, env_slip)
ax2.axis('off')
visualize_ice_policy(env_no_slip, pol_slip, ax=ax2)
ax1.legend()
plt.show()
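# The run above hands q_learning division points after which epsilon and alpha
# are reduced. A minimal sketch of one such schedule, assuming the value is held
# constant until the division point and then fades linearly toward zero; the
# linear_decrease helper in mdps.learners may use a different floor or slope.
def linearly_decayed(start_value, episode, total_episodes, division_point):
    # Keep the starting value until the division point, then fade linearly to 0
    if episode < division_point:
        return start_value
    remaining = max(1, total_episodes - division_point)
    fraction_left = max(0.0, 1.0 - (episode - division_point) / remaining)
    return start_value * fraction_left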
from mdps.learners import q_learning, make_epsilon_greedy_policy, step_decay
from experiments import get_env
import matplotlib.pyplot as plt
from mdps.evaluate_policy import evaluate_solutions
import numpy as np
import pickle
import random

EPISODE_LENGTH = 5
TAIL = int(EPISODE_LENGTH * .8)

_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

s = step_decay([5000, 2500, 1000])

_, stats_no_slip = q_learning(env_no_slip, EPISODE_LENGTH, .95, .8, .9)
_, stats_slip = q_learning(env_slip, EPISODE_LENGTH, .95, .8, .9)

# Convert the recorded stats to plain lists so the pickles can be read back
# without importing the mdps package.
copy_dict_slip = {
    "ep_length": stats_slip.episode_lengths.tolist(),
    "ep_score": stats_slip.episode_scores.tolist()
}
copy_dict_no_slip = {
    "ep_length": stats_no_slip.episode_lengths.tolist(),
    "ep_score": stats_no_slip.episode_scores.tolist()
}

with open("stats_no_slip{}.p".format(random.randint(1, 200)), "wb") as f:
    pickle.dump(copy_dict_no_slip, f)
# Save the slippery-environment stats as well (filename pattern assumed to
# mirror the no-slip dump above).
with open("stats_slip{}.p".format(random.randint(1, 200)), "wb") as f:
    pickle.dump(copy_dict_slip, f)
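# The pickled dicts above store plain lists under "ep_length" and "ep_score",
# so they can be reloaded and plotted without the mdps package. A minimal
# sketch, assuming one of the "stats_no_slip<N>.p" files written above exists
# in the working directory (the numeric suffix below is hypothetical).
import pickle
import matplotlib.pyplot as plt

with open("stats_no_slip42.p", "rb") as f:  # hypothetical suffix from random.randint
    saved = pickle.load(f)

fig, (ax_score, ax_len) = plt.subplots(1, 2)
ax_score.plot(saved["ep_score"], label="Episode score")
ax_len.plot(saved["ep_length"], label="Episode length")
ax_score.legend()
ax_len.legend()
plt.show()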