def example_6_6():
    fig, ax = plt.subplots()
    fig.suptitle(f'Example 6.6 (Averaged over {EX_6_6_N_SEEDS} seeds)')
    ax.set_xlabel('Episodes')
    ax.set_ylabel(f'(Average of last {EX_6_6_N_AVG}) sum of rewards during episodes')
    ax.set_yticks(EX_6_6_YTICKS)
    ax.set_ylim(bottom=min(EX_6_6_YTICKS))
    n_ep = EX_6_6_N_EPS
    env = TheCliff()
    qlearning_alg = QLearning(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    sarsa_alg = Sarsa(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    qlearning_rew = np.zeros(n_ep)
    sarsa_rew = np.zeros(n_ep)
    for seed in range(EX_6_6_N_SEEDS):
        print(f"seed={seed}")
        qlearning_alg.seed(seed)
        qlearning_rew += qlearning_alg.q_learning(n_ep)
        sarsa_alg.seed(seed)
        sarsa_rew += sarsa_alg.on_policy_td_control(n_ep, rews=True)
    plt.plot(smooth_rewards(qlearning_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG), color='r', label='Q learning')
    plt.plot(smooth_rewards(sarsa_rew / EX_6_6_N_SEEDS, EX_6_6_N_AVG), color='b', label='Sarsa')
    plt.legend()
    plt.savefig('example6.6.png')
    plt.show()
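
# smooth_rewards is used above but not shown in this excerpt. Below is a minimal
# sketch of what it might look like, assuming (as the y-label suggests) it averages
# each episode's return over the previous EX_6_6_N_AVG episodes; the name and exact
# behaviour are assumptions, not necessarily this project's actual implementation.
def smooth_rewards(rewards, n_avg):
    # Running mean over the last `n_avg` entries, with shorter windows at the start.
    smoothed = np.empty_like(rewards, dtype=float)
    for i in range(len(rewards)):
        lo = max(0, i - n_avg + 1)
        smoothed[i] = rewards[lo:i + 1].mean()
    return smoothed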
def plot_sarsa(ax, n_ep, label=None, diags=False, stay=False, stoch=False, seed=0):
    env = WindyGridworld(diags, stay, stoch)
    alg = Sarsa(env, step_size=EX_6_5_STEP_SIZE, gamma=UNDISCOUNTED, eps=EX_6_5_EPS)
    alg.seed(seed)
    kwargs = {"label": label} if label else {}
    # Plot on the axes that were passed in rather than the implicit current axes.
    ax.plot(alg.on_policy_td_control(n_ep), **kwargs)
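
# One way plot_sarsa might be driven to compare the windy gridworld variants from
# the book (standard four moves, King's moves, King's moves plus "stay", stochastic
# wind). The function name, figure setup, episode count, and labels below are
# illustrative assumptions, not the repository's actual driver code.
def compare_windy_variants(n_ep=200):
    fig, ax = plt.subplots()
    fig.suptitle('Sarsa on variants of the windy gridworld')
    plot_sarsa(ax, n_ep, label='4 moves')
    plot_sarsa(ax, n_ep, label="King's moves", diags=True)
    plot_sarsa(ax, n_ep, label="King's moves + stay", diags=True, stay=True)
    plot_sarsa(ax, n_ep, label='stochastic wind', diags=True, stay=True, stoch=True)
    ax.legend()
    plt.show()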
# Randomly locate the food on the barn.
amount_food = randint(max_size // 2, 2 * max_size)
food = []
while len(food) < amount_food:
    # Add a new piece of food.
    food.append((randint(0, max_size - 1), randint(0, max_size - 1)))
    # Ensure uniqueness.
    food = list(set(food))
# Start the algorithm.
sarsa = Sarsa(BarnState((0, 0), food, max_size), epsilon=epsilon, alpha=alpha, gamma=gamma)
sarsa.seed(int(100 * time.time()))
# Keep track of how much the Q-values move on each iteration.
track = []
for it in range(1, max_iters + 1):
    if it % 10 == 0:
        print("Scenario %d: %d/%d\r" % (n, it, max_iters), end="")
        sys.stdout.flush()
    history, corrections = sarsa.iterate()
    track.append(numpy.sqrt(sum(c * c for c in corrections)))
    # Select "nice" iteration counts (1, 2, 4, 8, 10, 20, 40, 80, 100, ...) at which
    # to evaluate the current policy and create a picture.
    if (it % 10 ** int(log10(it)) == 0) and (it // 10 ** int(log10(it)) in [1, 2, 4, 8]):
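
# sarsa.iterate() is not shown in this excerpt. A minimal sketch of the update it
# presumably applies on each transition, assuming the standard tabular Sarsa rule
# Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a)) and a Q-table stored as a dict;
# the function name and signature here are assumptions, not this project's API.
def sarsa_update(Q, s, a, r, s_next, a_next, alpha, gamma):
    # TD correction for the (s, a, r, s', a') transition; its magnitude is what the
    # `corrections` list above would accumulate.
    correction = alpha * (r + gamma * Q[(s_next, a_next)] - Q[(s, a)])
    Q[(s, a)] += correction
    return correction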