# Example #1
import gym  # fix: gym.make / gym.wrappers were used below but gym was never imported

import algorithms as alg

# Build the MountainCar environment and discretize its continuous 2-D
# observation (position, velocity) into a 10 x 10 bucket grid so that the
# tabular TD methods in `algorithms` can be applied.
env = gym.make("MountainCar-v0")

bd = alg.encoding.BoxDiscretization(
    env.observation_space,
    N_buckets=[10, 10],
    limits=[[env.observation_space.low[0], env.observation_space.high[0]],
            [env.observation_space.low[1], env.observation_space.high[1]]])
env = gym.wrappers.TransformObservation(env, bd.encode)

print("Q-Learning")
alg.utils.random_seed(env, 1)
Q, history_qlearning = alg.qlearning(env,
                                     alpha=0.1,
                                     gamma=1,
                                     epsilon=0.1,
                                     N_episodes=2000)

alg.utils.plot_learning_curves([history_qlearning], ["Q-Learning"],
                               "mountaincar_td_learning.pdf")

# Roll out the learned greedy policy for 10 episodes and render them.
for i in range(10):
    done = False
    state = env.reset()
    steps = 0
    ret = 0
    while not done:
        action = alg.utils.select_action_greedy(Q[state])
        state, reward, done, info = env.step(action)
        ret += reward  # fix: episode return was initialized but never accumulated
        steps += 1     # fix: step counter was initialized but never incremented
        env.render()
    print(f"Episode {i + 1}: steps={steps}, return={ret}")
# Example #2
                             alpha=0.1,
                             gamma=1,
                             epsilon=1,
                             N_episodes=10000,
                             epsilon_decay=alg.utils.decay_sigmoid)
def _show_greedy_policy(policy):
    # Print the greedy action for every state as an nrow x ncol grid.
    best_actions = [np.argmax(policy[s]) for s in range(env.nS)]
    print(np.array(best_actions).reshape(env.nrow, env.ncol))


# Greedy policy extracted from the Q-table trained above.
pi = alg.utils.create_greedy_policy(Q)
_show_greedy_policy(pi)
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nQ-Learning")
alg.utils.random_seed(env, 1)
Q, history_qlearning = alg.qlearning(env,
                                     alpha=0.1,
                                     gamma=1,
                                     epsilon=1,
                                     N_episodes=10000,
                                     epsilon_decay=alg.utils.decay_sigmoid)
pi = alg.utils.create_greedy_policy(Q)
_show_greedy_policy(pi)
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nExpected SARSA")
alg.utils.random_seed(env, 1)
Q, history_expected_sarsa = alg.expected_sarsa(
    env,
    alpha=0.1,
    gamma=1,
    epsilon=1,
# Example #3
# The three TD-control runs below share the exact same hyper-parameters,
# so collect them once and pass them via keyword expansion.
td_kwargs = dict(alpha=0.5,
                 gamma=0.99,
                 epsilon=0.5,
                 N_episodes=10000,
                 epsilon_decay=alg.utils.decay_linear,
                 alpha_decay=alg.utils.decay_linear)

Q, history_sarsa = alg.sarsa(env, **td_kwargs)
pi = alg.utils.create_greedy_policy(Q)
evaluate_policy(env, pi, 1000)

print("\nQ-Learning")
alg.utils.random_seed(env, 1)
Q, history_qlearning = alg.qlearning(env, **td_kwargs)
pi = alg.utils.create_greedy_policy(Q)
evaluate_policy(env, pi, 1000)

print("\nExpected SARSA")
alg.utils.random_seed(env, 1)
Q, history_expected_sarsa = alg.expected_sarsa(
    env,
    alpha=0.5,
    gamma=0.99,
    epsilon=0.5,
    N_episodes=10000,
    epsilon_decay=alg.utils.decay_linear,