Example #1
from mdps.learners import q_learning, make_epsilon_greedy_policy, linear_decrease, exponential_decrease
from experiments import get_env
import matplotlib.pyplot as plt
from mdps.evaluate_policy import evaluate_solutions
import numpy as np

name_no_slip, env_no_slip = get_env("taxi")

policy, stats_no_slip = q_learning(env_no_slip, 10000, .65, .6, .1)

f, (ax1, ax2) = plt.subplots(1, 2)

f.suptitle("Effect of Exploration on Taxi Problem\n"
           "10000 iterations, .65 Discount, .6 Alpha, .1 Epsilon - No Decay")
ax1.plot(stats_no_slip.episode_scores, label="All Episodes")
ax2.plot(stats_no_slip.episode_scores[4000:], label="Episodes after episode 4000")
ax1.legend()
ax2.legend()
plt.show()
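
# numpy is imported above but never used in this example; the lines below are
# an assumed follow-up (not part of the original snippet) that summarizes the
# tail of training numerically.
tail_scores = np.asarray(stats_no_slip.episode_scores[-1000:])
print("Mean score over the last 1000 episodes: {:.2f}".format(tail_scores.mean()))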
Example #2
from mdps.learners import q_learning
from experiments import get_env
import matplotlib.pyplot as plt
# The import path for the policy helpers is an assumption; the original
# snippet omits its imports.
from mdps.evaluate_policy import convert_q_to_policy, visualize_ice_policy

# EPISODE_LENGTH is referenced below but never defined in this snippet; the
# value here is an assumption.
EPISODE_LENGTH = 10000
DISCOUNT = .99
ALPHA = .9
EPSILON = .9

TAIL = int(EPISODE_LENGTH * .8)
_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
f.suptitle("8x8 problem - {} iterations, {} Discount, {} LR, "
           "{} Epsilon - No Decay".format(EPISODE_LENGTH, DISCOUNT, ALPHA,
                                          EPSILON))
ax1.set_title("No Slip")
ax2.set_title("Slip")

q_no_slip, stats_no_slip = q_learning(env_no_slip, EPISODE_LENGTH, DISCOUNT,
                                      ALPHA, EPSILON)

q_slip, stats_slip = q_learning(env_slip, EPISODE_LENGTH, DISCOUNT, ALPHA,
                                EPSILON)

ax1.plot(stats_no_slip.episode_scores, label="No Slipping")
ax2.axis('off')

# Completing the bare `visualize_ice_policy` reference: the arguments below
# follow the usage in Example #3 and are an assumption.
pol_slip = convert_q_to_policy(q_slip, env_slip)
visualize_ice_policy(env_no_slip, pol_slip, ax=ax2)

ax3.plot(stats_slip.episode_scores, label="Slipping")

ax1.legend()
ax3.legend()
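
# TAIL is computed above but never used, the fourth panel stays empty, and the
# figure is never shown. The lines below are an assumed completion (not in the
# original snippet) that mirrors the tail plot from Example #1.
ax4.set_title("Slip - last 20% of episodes")
ax4.plot(stats_slip.episode_scores[TAIL:], label="Slipping (tail)")
ax4.legend()

plt.show()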
Example #3
from mdps.learners import q_learning
from experiments import get_env
import matplotlib.pyplot as plt
# The import path for the policy helpers is an assumption; the original
# snippet omits its imports.
from mdps.evaluate_policy import convert_q_to_policy, visualize_ice_policy

# The constants below are referenced but never defined in this snippet; the
# values are assumptions chosen only so the sketch runs.
EPISODE_LENGTH = 10000
DISCOUNT = .99
ALPHA = .9
EPSILON = .9
DECREASE_EPSILON_POINT = 5000
ALPHA_DECREASE_POINT = 5000

TAIL = int(EPISODE_LENGTH * .8)
_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

f, (ax1, ax2) = plt.subplots(1, 2)
f.suptitle("8x8 problem Linear Decrease - {} iterations, {} Discount, {} LR, "
           "{} Epsilon\n Epsilon Decay after {} Alpha Decay After {}".format(
               EPISODE_LENGTH, DISCOUNT, ALPHA, EPSILON,
               DECREASE_EPSILON_POINT, ALPHA_DECREASE_POINT))
ax1.set_title("Success per hundred Episodes")
ax2.set_title("Final Policy - Slip")

q_slip, stats_slip = q_learning(env_slip,
                                EPISODE_LENGTH,
                                DISCOUNT,
                                ALPHA,
                                EPSILON,
                                epsilon_division_point=DECREASE_EPSILON_POINT,
                                alpha_division_point=ALPHA_DECREASE_POINT)

ax1.plot(stats_slip.success_per_hundred, label="Slipping")
pol_slip = convert_q_to_policy(q_slip, env_slip)
ax2.axis('off')
visualize_ice_policy(env_no_slip, pol_slip, ax=ax2)

ax1.legend()

plt.show()
Example #4
from mdps.learners import q_learning, make_epsilon_greedy_policy, step_decay
from experiments import get_env
import matplotlib.pyplot as plt
from mdps.evaluate_policy import evaluate_solutions
import numpy as np
import pickle
import random

EPISODE_LENGTH = 5
TAIL = int(EPISODE_LENGTH * .8)
_, env_no_slip = get_env("8x8")
_, env_slip = get_env("8x8", slippery=True)

# A step-decay schedule is created here but never passed to q_learning in this
# snippet.
s = step_decay([5000, 2500, 1000])

_, stats_no_slip = q_learning(env_no_slip, EPISODE_LENGTH, .95, .8, .9)
_, stats_slip = q_learning(env_slip, EPISODE_LENGTH, .95, .8, .9)

copy_dict_slip = {
    "ep_length": stats_slip.episode_lengths.tolist(),
    "ep_score": stats_slip.episode_scores.tolist()
}

copy_dict_no_slip = {
    "ep_length": stats_no_slip.episode_lengths.tolist(),
    "ep_score": stats_no_slip.episode_scores.tolist()
}

with open("stats_no_slip{}.p".format(random.randint(1, 200)), "wb") as f:
    pickle.dump(copy_dict_no_slip, f)
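
# The slip statistics are built above but never written out, and nothing reads
# the pickles back. The lines below are an assumed completion and reload
# sketch, not part of the original example.
import glob

with open("stats_slip{}.p".format(random.randint(1, 200)), "wb") as f:
    pickle.dump(copy_dict_slip, f)

# Reload whichever no-slip stats files exist and plot the recovered scores.
for path in glob.glob("stats_no_slip*.p"):
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    plt.plot(loaded["ep_score"], label=path)
plt.legend()
plt.title("Reloaded episode scores - no slip")
plt.show()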