def plotter(ax, v, vmax=0, vmin=-20, env=None):
    """Render the value function ``v`` as a heat map on the current figure.

    Parameters
    ----------
    ax : matplotlib Axes; only used to enable y-axis autoscaling.
    v : 2-D array of state values, shape (N, N).
    vmax, vmin : color-scale limits forwarded to ``plt.matshow``.
    env : optional environment exposing ``N`` and
        ``is_the_new_state_allowed``; when given, disallowed states are
        painted at ``vmin`` so they stand out as "walls".
    """
    plt.cla()
    # ax.axis('off')
    ax.set_autoscaley_on(True)
    if env is not None:
        # BUG FIX: the original wrote ``vmin`` into the caller's array in
        # place, permanently clobbering the computed values of disallowed
        # states.  Mask a private copy instead; the caller's ``v`` is
        # left untouched.
        v = np.array(v, copy=True)
        N = env.N
        all_states = dstack_product(np.arange(N), np.arange(N))
        for state in all_states:
            i, j = state
            if not env.is_the_new_state_allowed(state):
                v[i, j] = vmin
    plt.matshow(v, fignum=0, vmax=vmax, vmin=vmin)
    plt.draw()
    plt.show()
    # brief pause so the GUI event loop refreshes during interactive runs
    plt.pause(0.1)
def plot_the_policy(plt, pi, env):
    """Overlay policy arrows on the current plot.

    For every grid state an arrow is drawn per action, with length
    proportional to that action's weight under ``pi``.  Action mapping:
    0 -> +row, 1 -> -row, 2 -> +column, 3 -> -column.  Note that
    ``plt.arrow`` takes (column, row) order, hence the swapped
    ``(y, x, vy, vx)`` arguments.
    """
    N = env.N
    all_states = dstack_product(np.arange(N), np.arange(N))
    scale = 0.5
    for state in all_states:
        x, y = state
        weights = pi[x, y]
        for action in range(env.action_space.n):
            if action == 0:
                vx, vy = 0, scale * weights[0]
            elif action == 1:
                vx, vy = 0, -1. * scale * weights[1]
            elif action == 2:
                vx, vy = scale * weights[2], 0
            elif action == 3:
                vx, vy = -1. * scale * weights[3], 0
            plt.arrow(y, x, vy, vx, head_width=0.1, color='black', alpha=0.5)
# initializing V to zero pi = return_a_random_policy(N, env.action_space.n, epsilon=1000000) V_accumulate = np.zeros((N, N)) # 1.6 setting up the plot ax = create_plot(N) plt.ion() interactive(True) plt.cla() ax.axis('off') nr_episodes = 1_000 gamma = 0.98 all_states = dstack_product(np.arange(N), np.arange(N)) for episode_id in tqdm(range(nr_episodes)): # a sweep over all the states in the system. for counter, init_state in enumerate(all_states): terminated = False env.reset(init_state) tmp_V = 0.0 step_counter = 0 while not terminated: action_id = choose_an_action_based_on_pi(env.state, pi) new_state, reward, terminated, info = env.step(action_id) tmp_V += np.power(gamma, step_counter) * reward step_counter += 1 i, j = init_state