Exemplo n.º 1
0
def example_6_6():
    """Plot Q-learning against Sarsa on ``TheCliff``, averaged over seeds.

    Both algorithms are built on the same ``TheCliff`` instance with the
    same step size, discount and epsilon. For every seed in
    ``range(EX_6_6_N_SEEDS)`` each algorithm is re-seeded and run for
    ``EX_6_6_N_EPS`` episodes; the per-episode results are summed across
    seeds, divided by the number of seeds, passed through
    ``smooth_rewards`` and plotted. The figure is written to
    ``example6.6.png`` and then displayed.
    """
    # Figure scaffolding: title, axis labels and the fixed y ticks.
    fig, ax = plt.subplots()
    fig.suptitle(f'Example 6.6 (Averaged over {EX_6_6_N_SEEDS} seeds)')
    y_label = (
        f'(Average of last {EX_6_6_N_AVG}) sum of rewards during episodes')
    ax.set_xlabel('Episodes')
    ax.set_ylabel(y_label)
    ax.set_yticks(EX_6_6_YTICKS)
    ax.set_ylim(bottom=min(EX_6_6_YTICKS))

    n_ep = EX_6_6_N_EPS
    env = TheCliff()
    # The two learners share one environment object and identical settings.
    hyperparams = dict(step_size=EX_6_5_STEP_SIZE,
                       gamma=UNDISCOUNTED,
                       eps=EX_6_5_EPS)
    qlearning_alg = QLearning(env, **hyperparams)
    sarsa_alg = Sarsa(env, **hyperparams)

    # Running totals over seeds, one entry per episode.
    qlearning_total = np.zeros(n_ep)
    sarsa_total = np.zeros(n_ep)
    for seed in range(EX_6_6_N_SEEDS):
        print(f"seed={seed}")
        # Order matters: the learners share `env`, so each one is seeded
        # immediately before its own run.
        qlearning_alg.seed(seed)
        qlearning_total += qlearning_alg.q_learning(n_ep)
        sarsa_alg.seed(seed)
        sarsa_total += sarsa_alg.on_policy_td_control(n_ep, rews=True)

    curves = (
        (qlearning_total, 'r', 'Q learning'),
        (sarsa_total, 'b', 'Sarsa'),
    )
    for total, color, label in curves:
        mean_per_episode = total / EX_6_6_N_SEEDS
        plt.plot(smooth_rewards(mean_per_episode, EX_6_6_N_AVG),
                 color=color,
                 label=label)

    plt.legend()
    plt.savefig('example6.6.png')
    plt.show()
Exemplo n.º 2
0
def plot_sarsa(ax,
               n_ep,
               label=None,
               diags=False,
               stay=False,
               stoch=False,
               seed=0):
    """Run Sarsa on a ``WindyGridworld`` and plot the result on ``ax``.

    Args:
        ax: Axes-like object whose ``plot`` method receives the curve. When
            ``None``, the curve is drawn through ``plt.plot`` instead, i.e.
            on pyplot's current axes.
        n_ep: Forwarded to ``Sarsa.on_policy_td_control``.
        label: Optional legend label; no ``label`` keyword is passed to the
            plot call when this is falsy.
        diags: Forwarded positionally to ``WindyGridworld``.
        stay: Forwarded positionally to ``WindyGridworld``.
        stoch: Forwarded positionally to ``WindyGridworld``.
        seed: Value passed to the algorithm's ``seed`` method before the run.
    """
    env = WindyGridworld(diags, stay, stoch)
    alg = Sarsa(env,
                step_size=EX_6_5_STEP_SIZE,
                gamma=UNDISCOUNTED,
                eps=EX_6_5_EPS)
    alg.seed(seed)
    kwargs = {"label": label} if label else {}
    # Draw on the axes the caller handed in; previously `ax` was ignored and
    # the curve always went to pyplot's current axes.
    target = plt if ax is None else ax
    target.plot(alg.on_policy_td_control(n_ep), **kwargs)
Exemplo n.º 3
0
        # Randomly place the food in the barn.
        amount_food = randint(max_size / 2, 2 * max_size)
        food = []

        while len(food) < amount_food:

            # Add a new piece of food.
            food.append((randint(0, max_size-1), randint(0, max_size-1)))

            # Ensure uniqueness.
            food = list(set(food))

        # Start the algorithm.
        sarsa = Sarsa(BarnState((0,0), food, max_size), epsilon=epsilon, alpha=alpha, gamma=gamma)
        sarsa.seed(int(100 * time.time()))

        # Keep track of how much we move the q at each iteration.
        track = []

        for it in range(1, max_iters + 1):

            if it % 10 == 0:
                print "Scenario %d: %d/%d\r" % (n, it, max_iters) ,
                sys.stdout.flush()

            history, corrections = sarsa.iterate()
            track.append(numpy.sqrt(sum(map(lambda x: x*x, corrections))))
            
            # We're just selecting nice places to evaluate the current policy and create a picture.
            if (it % 10 ** int(log10(it)) == 0) and (it / 10 ** int(log10(it)) in [1, 2, 4, 8]):