Example #1
    def test_reward_single_agent_makespan(self):
        grid = MapfGrid(['....', '....', '....', '....', '....'])

        start_locations = ((0, 0), )
        goal_locations = ((4, 0), )

        determinstic_env = MapfEnv(grid, 1, start_locations, goal_locations, 0,
                                   REWARD_OF_CLASH, REWARD_OF_GOAL,
                                   REWARD_OF_LIVING,
                                   OptimizationCriteria.Makespan)
        total_reward = 0
        down_action = vector_action_to_integer((DOWN, ))
        _, r, _, _ = determinstic_env.step(down_action)
        total_reward += r
        _, r, _, _ = determinstic_env.step(down_action)
        total_reward += r
        _, r, _, _ = determinstic_env.step(down_action)
        total_reward += r
        s, r, done, _ = determinstic_env.step(down_action)
        total_reward += r

        self.assertEqual(s,
                         determinstic_env.locations_to_state(goal_locations))
        self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_GOAL)

        self.assertEqual(total_reward, REWARD_OF_GOAL + 4 * REWARD_OF_LIVING)
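
The tests above pass joint actions through vector_action_to_integer, which flattens a tuple of per-agent actions into a single index in the joint action space. A minimal sketch of such an encoding, assuming a fixed per-agent action order (the actual implementation in the library may differ):

# Hedged sketch of a mixed-radix joint-action encoding; the real
# vector_action_to_integer may use a different action order or scheme.
ACTIONS = ('STAY', 'UP', 'RIGHT', 'DOWN', 'LEFT')  # assumed order

def vector_action_to_integer_sketch(action_vector):
    """Encode a tuple of per-agent actions as one integer, base len(ACTIONS)."""
    index = 0
    for action in action_vector:
        index = index * len(ACTIONS) + ACTIONS.index(action)
    return index

def integer_action_to_vector_sketch(index, n_agents):
    """Inverse of the encoding above."""
    actions = []
    for _ in range(n_agents):
        index, digit = divmod(index, len(ACTIONS))
        actions.append(ACTIONS[digit])
    return tuple(reversed(actions))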
Example #2
    def test_copy_mapf_env(self):
        grid = MapfGrid(['....', '....', '....', '....', '....'])
        env = MapfEnv(grid, 1, ((0, 0), ), ((4, 0), ), 0, REWARD_OF_CLASH,
                      REWARD_OF_GOAL, REWARD_OF_LIVING,
                      OptimizationCriteria.Makespan)

        env.step(vector_action_to_integer((RIGHT, )))

        env_copy = copy(env)
        env_copy.step(vector_action_to_integer((RIGHT, )))
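        # Hedged follow-up (assumes `env.s` exposes the current state, as in the
        # lrtdp example below): the copy should advance independently of the original.
        self.assertNotEqual(env.s, env_copy.s)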
Example #3
    def test_reward_multiagent_soc_stay_actions(self):
        grid = MapfGrid(['....', '....', '....', '....'])

        start_locations = ((0, 0), (3, 3), (1, 1))
        goal_locations = ((0, 1), (1, 3), (1, 2))

        determinstic_env = MapfEnv(grid, 3, start_locations, goal_locations, 0,
                                   REWARD_OF_CLASH, REWARD_OF_GOAL,
                                   REWARD_OF_LIVING, OptimizationCriteria.SoC)

        right_stay_stay = vector_action_to_integer((RIGHT, STAY, STAY))
        s, r, done, _ = determinstic_env.step(right_stay_stay)
        # Under SoC, each of the 3 agents is charged the living reward for this step.
        self.assertEqual(r, -3)
Example #4
    def test_reward_multiagent_makespan(self):
        grid = MapfGrid(['....', '....', '....', '....'])

        start_locations = ((0, 0), (3, 3), (1, 1))
        goal_locations = ((0, 1), (1, 3), (1, 2))

        determinstic_env = MapfEnv(grid, 3, start_locations, goal_locations, 0,
                                   REWARD_OF_CLASH, REWARD_OF_GOAL,
                                   REWARD_OF_LIVING,
                                   OptimizationCriteria.Makespan)

        total_reward = 0
        right_up_right = vector_action_to_integer((RIGHT, UP, RIGHT))
        s, r, done, _ = determinstic_env.step(right_up_right)
        total_reward += r
        self.assertFalse(done)

        stay_up_stay = vector_action_to_integer((STAY, UP, STAY))
        s, r, done, _ = determinstic_env.step(stay_up_stay)
        total_reward += r
        self.assertEqual(s,
                         determinstic_env.locations_to_state(goal_locations))
        self.assertTrue(done)
        self.assertEqual(total_reward, 2 * REWARD_OF_LIVING + REWARD_OF_GOAL)
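
Together with the SoC test above, this shows how the living reward is aggregated: under SoC every agent is charged each step, while under Makespan the charge is applied once per joint step, so two steps here cost 2 * REWARD_OF_LIVING regardless of the number of agents. A hedged sketch of that per-step accounting (the string criteria names stand in for the library's OptimizationCriteria enum; the logic is an assumption inferred from the tests):

# Hedged sketch: per-step living cost under the two optimization criteria.
def step_living_reward(criterion, n_agents, reward_of_living):
    if criterion == 'SoC':       # sum of costs: every agent pays each step
        return n_agents * reward_of_living
    if criterion == 'Makespan':  # makespan: one charge per joint step
        return reward_of_living
    raise ValueError(f'unknown criterion: {criterion}')

assert step_living_reward('SoC', 3, -1) == -3       # matches the SoC test above
assert step_living_reward('Makespan', 3, -1) == -1  # one charge per joint step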
Example #5
    def test_action_from_terminal_state_has_no_effect(self):
        grid = MapfGrid(['..', '..'])
        env = MapfEnv(grid, 1, ((0, 0), ), ((1, 1), ), 0, REWARD_OF_CLASH,
                      REWARD_OF_GOAL, REWARD_OF_LIVING,
                      OptimizationCriteria.Makespan)

        state, reward, done, _ = env.step(vector_action_to_integer((RIGHT, )))
        self.assertEqual(reward, REWARD_OF_LIVING)
        self.assertEqual(done, False)
        state, reward, done, _ = env.step(vector_action_to_integer((DOWN, )))
        self.assertEqual(reward, REWARD_OF_LIVING + REWARD_OF_GOAL)
        self.assertEqual(done, True)
        # Now that the episode is finished, take another step and make sure it has no effect.
        state_after_done, reward_after_done, done_after_done, _ = env.step(
            vector_action_to_integer((UP, )))
        self.assertEqual(state_after_done, state)
        self.assertEqual(done_after_done, True)
        self.assertEqual(reward_after_done, 0)
        # And once more, this time with an action that points back toward the goal.
        state_after_done, reward_after_done, done_after_done, _ = env.step(
            vector_action_to_integer((DOWN, )))
        self.assertEqual(state_after_done, state)
        self.assertEqual(done_after_done, True)
        self.assertEqual(reward_after_done, 0)
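
This test depends on the goal configuration being absorbing: once done, further steps leave the state unchanged and return zero reward. A minimal sketch of such a guard in a gym-style environment (the attribute names and structure below are assumptions, not the actual MapfEnv internals):

# Hedged sketch of an absorbing terminal state in a gym-style step().
class AbsorbingEnvSketch:
    def __init__(self, transition_fn, initial_state):
        self._transition = transition_fn  # (state, action) -> (state, reward, done)
        self.s = initial_state
        self.is_done = False

    def step(self, action):
        if self.is_done:
            # Episode already over: ignore the action, stay put, give no reward.
            return self.s, 0, True, {}
        self.s, reward, self.is_done = self._transition(self.s, action)
        return self.s, reward, self.is_done, {}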
Example #6
    def test_switch_spots_is_a_collision(self):
        grid = MapfGrid(['..'])

        agents_starts = (
            (0, 0),
            (0, 1),
        )
        agents_goals = ((0, 1), (0, 0))

        determinstic_env = MapfEnv(grid, 2, agents_starts, agents_goals, 0,
                                   REWARD_OF_CLASH, REWARD_OF_GOAL,
                                   REWARD_OF_LIVING,
                                   OptimizationCriteria.Makespan)

        s, r, done, _ = determinstic_env.step(
            vector_action_to_integer((RIGHT, LEFT)))

        # Assert the game terminated in a collision
        self.assertEqual(done, True)
        self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_CLASH)
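
Note that the collision here is an edge conflict in MAPF terms: the two agents swap cells in the same step, even though they never occupy the same cell simultaneously. A hedged sketch of detecting both conflict types, independent of the library's internals:

# Hedged sketch of MAPF conflict detection between consecutive joint locations.
def has_collision(prev_locations, next_locations):
    n = len(prev_locations)
    for i in range(n):
        for j in range(i + 1, n):
            # Vertex conflict: two agents end up on the same cell.
            if next_locations[i] == next_locations[j]:
                return True
            # Edge conflict: two agents swap cells in the same step.
            if (next_locations[i] == prev_locations[j]
                    and next_locations[j] == prev_locations[i]):
                return True
    return False

# The scenario in the test above: the agents at (0, 0) and (0, 1) trade places.
assert has_collision(((0, 0), (0, 1)), ((0, 1), (0, 0)))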
Example #7
def lrtdp(
    heuristic_function: Callable[[MapfEnv], Callable[[int], float]],
    max_iterations: int,
    gamma: float,
    epsilon: float,
    env: MapfEnv,
    info: Dict,
) -> Policy:
    info['iterations'] = []

    # initialize V to an upper bound
    env.reset()
    initial_state = env.s
    policy = LrtdpPolicy(env, gamma, heuristic_function(env))

    # Follow the greedy policy; for each transition, do a Bellman update on V
    n_iterations = 0
    while initial_state not in policy.solved and n_iterations < max_iterations:
        n_iterations += 1
        s = env.s
        start = time.time()
        path = []

        # LRTDP Trial
        while s not in policy.solved:
            # env.render()
            a = greedy_action(env, s, policy.v, gamma)
            path.append((s, a))
            # print(f'action {integer_action_to_vector(a, env.n_agents)} chosen')
            # time.sleep(1)
            # Bellman backup of V(s) under the greedy action a
            new_v_s = sum([
                prob * (reward + gamma * policy.v[next_state])
                for prob, next_state, reward, done in env.P[s][a]
            ])
            policy.v_partial_table[s] = new_v_s

            # simulate the step and sample a new state
            s, r, done, _ = env.step(a)
            if done:
                # append the terminal state to the path; the action is just a placeholder
                path.append((s, 0))
                break

        # Trial finished: check and label the visited states as solved, from the end
        # of the trial backwards, stopping at the first state that is not yet solved
        while path:
            state, action = path.pop()
            if not check_solved(policy, state, epsilon):
                break

        info['iterations'].append({
            'n_moves': len(path),
            'time': round(time.time() - start, 2),
            'n_states_solved': len(policy.solved),
            'final_reward': r,
        })

        env.reset()

    env.reset()

    return policy
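
greedy_action and check_solved are used above but not shown. In LRTDP (Bonet & Geffner, 2003), the greedy action is the one maximizing the one-step Bellman backup, and check_solved labels a state solved once the residuals across its greedy envelope fall below epsilon. A hedged sketch of the greedy choice over the env.P[s][a] transition lists used above (the helpers in the actual repository may differ):

# Hedged sketch of a greedy one-step lookahead over gym-style transition lists,
# matching the (prob, next_state, reward, done) tuples in env.P[s][a] above.
def greedy_action_sketch(env, s, v, gamma):
    best_action, best_value = None, float('-inf')
    for a in range(env.nA):  # assumption: the env exposes a discrete action count nA
        q_sa = sum(prob * (reward + gamma * v[next_state])
                   for prob, next_state, reward, done in env.P[s][a])
        if q_sa > best_value:
            best_action, best_value = a, q_sa
    return best_action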