            # RL take action and get next state and reward
            _, next_state_index, reward, done = env.step(action)

            # RL choose action based on next state
            next_action = RL.choose_action(str(next_state_index))

            # RL learn from this transition (s, a, r, s', a') ==> Sarsa
            RL.learn(str(state), action, reward, str(next_state_index), next_action)

            # swap state and action
            state = next_state_index
            action = next_action

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = GridWorld()
    RL = Sarsa(actions=list(range(env.n_actions)))

    env.after(10000, update)
    env.mainloop()
    print(RL.q_table)
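# For reference, a minimal sketch of what the RL.learn() call above is
# assumed to do: the tabular on-policy Sarsa update
#     Q(s, a) <- Q(s, a) + alpha * [r + gamma * Q(s', a') - Q(s, a)].
# This class is an illustration only; its attribute names (q_table, lr,
# gamma, epsilon) are assumptions, not this tutorial's actual code.
import numpy as np
import pandas as pd


class SarsaSketch:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # add an all-zero row for a state seen for the first time
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self.check_state_exist(state)
        if np.random.uniform() < self.epsilon:
            # exploit: pick a greedy action, breaking ties at random
            row = self.q_table.loc[state]
            return np.random.choice(row[row == row.max()].index)
        # explore: pick a random action
        return np.random.choice(self.actions)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            # Sarsa bootstraps from the action a_ actually chosen in s_
            q_target = r + self.gamma * self.q_table.loc[s_, a_]
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)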
                    # take action in state (i, j) and observe next state and reward
                    _, next_state_index, reward, done = env.step(action)
                    env.render()
                    if next_state_index != 'terminal':
                        next_i, next_j = next_state_index
                    else:
                        next_i, next_j = 0, 0
                    # record the action value q(s, a) = r + γ * v(s')
                    values.append(reward + reward_decay * value[next_i, next_j])

                # according to the Bellman optimality equation, back up the
                # maximum action value: v(s) = max_a q(s, a)
                new_value[i, j] = np.max(values)

        # iteration termination condition: total change less than 1e-4
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.title('$v_{*}$')
            plt.show()
            plt.close()
            break
        value = new_value
        print(value)

    # end of game
    print('game over')
    env.destroy()


env = GridWorld(grid_world_h, grid_world_w)
value = np.zeros((grid_world_h, grid_world_w))
env.after(10000, dp)
env.mainloop()
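# For reference, the backup implemented above is the Bellman optimality
# update for a deterministic grid world,
#     v_{k+1}(s) = max_a [ r(s, a) + γ * v_k(s') ],
# swept synchronously over all states until the total change falls below a
# tolerance. The toy chain below is an illustration only (states 0..3,
# "right" moves toward a terminal reward of 1); it is not part of this
# tutorial's GridWorld.
import numpy as np

gamma = 0.9
n_states = 4                       # state 3 is terminal
v = np.zeros(n_states)

while True:
    new_v = np.zeros_like(v)
    for s in range(n_states - 1):
        # two deterministic actions: left (step back or stay), right (step ahead)
        left_s = max(s - 1, 0)
        right_s = s + 1
        r_right = 1.0 if right_s == n_states - 1 else 0.0
        q_left = 0.0 + gamma * v[left_s]
        q_right = r_right + gamma * (0.0 if right_s == n_states - 1 else v[right_s])
        new_v[s] = max(q_left, q_right)    # v(s) = max_a q(s, a)
    if np.sum(np.abs(new_v - v)) < 1e-4:
        break
    v = new_v

print(np.round(v, 2))   # converges to [0.81, 0.9, 1.0, 0.0]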