from typing import Tuple

import numpy as np

# GridWorld and AgentFactory are project-local classes; their import paths are
# not shown in this snippet.


def main():
    grid_size = 10
    grid_world = GridWorld(grid_size, num_obstacles=20, stochastic_cell_ratio=0.1)

    # assemble the agent configuration for the factory
    params = {}
    params['type'] = 'value_iteration'
    params['grid_size'] = grid_size
    params['rewards'] = grid_world.rewards
    params['transition_matrix'] = grid_world.transition_matrix
    params['step_func'] = GridWorld.deterministic_step
    params['discount'] = 0.9

    agent = AgentFactory.create_agent(params)

    episode_ended = False
    while True:
        grid_world.get_user_input()
        grid_world.draw_with_state_values(
            agent.v, policy=agent.pi if grid_world.render_policy else None)
        if not grid_world.pause:
            if episode_ended:
                grid_world.restart_episode()
                grid_world.draw_black_screen()
                episode_ended = False
            else:
                agent.do_job()
                if agent.ready_to_play():
                    action = agent.get_action(grid_world.pos)
                    # step() returns (obs, reward, done), as in Q_learning
                    # below; only the done flag is needed here
                    _, _, episode_ended = grid_world.step(action)
        grid_world.tick_tock()
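
# For reference, a minimal one-sweep value-iteration sketch (not part of the
# original code) matching the tabular layout suggested by the params dict
# above. It assumes transitions has shape (num_actions, num_states, num_states)
# with transitions[a, s, s'] = P(s' | s, a), and that rewards is indexed by the
# successor state s'; the agent created by AgentFactory is assumed to perform
# an equivalent backup inside do_job().
def value_iteration_sweep(v: np.ndarray, rewards: np.ndarray,
                          transitions: np.ndarray, discount: float) -> np.ndarray:
    # q[a, s] = sum_{s'} P(s'|s,a) * (rewards[s'] + discount * v[s'])
    q = transitions @ (rewards + discount * v)
    # Bellman optimality backup: V(s) = max_a q[a, s]
    return q.max(axis=0)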
def Q_learning(env: GridWorld, epsilon: float, lr: float, initQ: np.ndarray,
               converge=False) -> Tuple[np.ndarray, float]:
    """
    Performs Q-learning for a single episode in the environment and returns the
    updated Q table.

    :param env: GridWorld subclass
    :param epsilon: Exploitation rate; the greedy action is taken with
        probability epsilon, and a random action with probability 1 - epsilon
    :param lr: Learning rate
    :param initQ: Q table to update
    :param converge: Flag to determine whether the maximum change in Q-values
        needs to be tracked for convergence within a set bound
    :return: Updated Q table after a single episode of training and the maximum
        change for any Q-value
    """
    # keep track of the maximum state-action value change
    delta = 0.0
    state = env.reset()
    done = False
    while not done:
        if np.random.uniform(0, 1) > epsilon:
            # explore: sample a random action from the action space
            action = env.sample()
        else:
            # exploit: take the greedy action under the current Q table
            action = np.argmax(initQ[state])
        # take a step in the environment
        obs, r, done = env.step(action)
        # temporal-difference update of the Q table
        prev_value = initQ[state, action]
        new_value = prev_value + lr * (r + env.gamma * np.max(initQ[obs]) - prev_value)
        initQ[state, action] = new_value
        # update state
        state = obs
        if converge:
            delta = max(delta, np.abs(new_value - prev_value))
    return initQ, delta
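
# A minimal driver sketch (not from the original code) showing how Q_learning
# could be run over many episodes until the per-episode Q-value change falls
# within a tolerance. The names train_q_learning, tol, and max_episodes are
# illustrative, and env.num_states / env.num_actions are assumed attributes.
# Note that epsilon is the exploitation rate here, so a high value such as 0.9
# means mostly greedy actions with 10% exploration.
def train_q_learning(env: GridWorld, epsilon: float = 0.9, lr: float = 0.1,
                     tol: float = 1e-4, max_episodes: int = 5000) -> np.ndarray:
    Q = np.zeros((env.num_states, env.num_actions))
    for _ in range(max_episodes):
        # run one episode; converge=True makes Q_learning report the largest
        # Q-value change seen during the episode
        Q, delta = Q_learning(env, epsilon, lr, Q, converge=True)
        if delta < tol:
            break
    return Q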