def test_move(self):
    """Table-driven check of GridWorld.move: target cell, reward, terminal flag.

    Grid layout (rows separated by newlines):
        ' #P'   -> free, wall, penalty
        'G #'   -> goal, free, wall
    Each test row is (start, to, expected_end, expected_reward, expected_terminal).
    """
    grid = ' #P\nG #'
    gw = GridWorld(grid, move_value=-1, die_value=-20, win_value=10)
    step_tests = [
        # move into wall: agent stays put, pays the move cost
        ((0,0), (1,0), (0,0), -1, False),
        # move to free field
        ((0,0), (1,1), (1,1), -1, False),
        # move to goal
        ((0,0), (0,1), (0,1), 10, True),
        # die penalty
        ((0,0), (2,0), (2,0), -20, True),
        # out of bounds #1 (negative coordinate): treated like a wall
        ((0,0), (-1,0), (0,0), -1, False),
        # out of bounds #2 (coordinate past grid edge): treated like a wall
        ((0,0), (10,0), (0,0), -1, False),
    ]
    for start, to, end, reward, is_terminal in step_tests:
        e, r, t = gw.move(start, to)
        self.assertEqual(e, end)
        self.assertEqual(r, reward)
        self.assertEqual(t, is_terminal)
if __name__ == '__main__':
    # Train a tabular Q-learning agent on a randomly generated 8x8 grid,
    # then display the learned Q-table. The same seed is reused so the
    # grid shown at the end matches the one trained on.
    max_steps = 100
    max_iters = 1000
    seed = random.randint(0, 100)
    agent = qAgent()
    grid = GridWorld(size=8, force_fast=True, seed=seed)
    grid.show()
    print()
    # 'episode' instead of 'iter': avoid shadowing the builtin iter().
    for episode in range(max_iters):
        agent.set_grid(grid)
        i, j = 0, 0  # initial state
        cum_reward = 0
        for step in range(max_steps):
            action = agent.get_action(i, j)
            new_i, new_j = grid.move(i, j, action)
            # NOTE(review): reward is queried at the pre-move state (i, j),
            # not (new_i, new_j) — confirm this matches GridWorld's contract.
            reward, is_final = grid.get_reward(i, j)
            cum_reward += reward
            agent.update_q(i, j, new_i, new_j, action, reward)
            if is_final:
                break
            i = new_i
            j = new_j
        if episode % 100 == 0:
            print(
                'Episode {} finished after {} steps with cumulative reward of {}'
                .format(episode, step, cum_reward))
    # Rebuild the identical grid (same seed) for display purposes.
    grid = GridWorld(size=8, force_fast=True, seed=seed)
    print()
    show_qtable(agent, grid.size)
# Converted from Python 2 print statements to Python 3 print() calls for
# consistency with the rest of the file (output text unchanged).
print("currentstate:", currentstate_index, currentstate)
# Current position (considering the whole state vector).
# The actual position is columns 1 and 2.
print("currentposition:", currentposition)
# Keep sampling actions until one produces a valid move.
validMove = False
while not validMove:
    action_index = agent.getAction('qlearning')
    action = agent.getActions()[action_index]
    print("newaction:", action_index, action)
    # Obtain the new position by executing the action in the environment.
    newposition = gridworld.move(
        agent.getCurrentPosition()[-2:], action)
    print("newposition:", newposition)
    if newposition:
        validMove = True
        print("+ valid move")
# Prepend the budget to the position vector.
newposition = np.append([agent.getBudgetState()], newposition)
print("newposition with budget:", newposition)
# Compute the reward returned by the environment.
current_reward = reward.reward(currentposition, action, newposition)
print("reward:", current_reward)