def test_env_3(env3_robots):
    """End-to-end check of the 3-robot environment.

    Performs a no-op step and verifies reward/done/state invariants, then
    performs three more no-op steps followed by a transfer action and
    checks the resulting time and robot data.
    """
    env_data = env3_robots.get_env_metadata()
    env3_robots.reset()
    state = env3_robots.get_current_state()
    # Encode the all-zero (no-op) action for the three robot pairs.
    action = n_from_prod(env_data['sets'], [0, 0, 0])
    new_state, reward, done, _ = env3_robots.step(action)
    new_state_from_obs = get_state_from_observation(new_state)
    assert done == False
    assert reward == -0.666666
    # A no-op must not change the robots' data, only advance time.
    assert state.robots_data == new_state_from_obs.robots_data
    assert state.time + 1 == new_state_from_obs.time
    assert new_state_from_obs.positions == [2, 2, 3]
    # BUG FIX: the original asserted `a.all() == np.array(b).all()`, which
    # reduces each array to a single truthiness flag and compares those —
    # it does NOT compare the arrays element-wise.
    assert np.array_equal(env3_robots.state, np.array(new_state))
    for _ in range(3):
        action = n_from_prod(env_data['sets'], [0, 0, 0])
        new_state, reward, done, _ = env3_robots.step(action)
    action = n_from_prod(env_data['sets'], [0, 0, 70])
    new_state, reward, done, _ = env3_robots.step(action)
    assert done == False
    assert reward == 0.001
    assert get_state_from_observation(new_state).time == 6
    assert get_state_from_observation(new_state).robots_data == [15, 30, 0]
# Example #2
def test_one_minus_one_reward_good_env_4(env4_robots):
    """Reward giver on the 4-robot env: a feasible transfer pair earns 0,
    while the same pair applied to full receivers is penalised."""
    env_data = env4_robots.get_env_metadata()
    meetings = env_data['meetings']
    cycle_lengths = env_data['cycle_lengths']
    max_memory = env_data['max_memory']
    reward_giver = OneMinusOneRewardGiverAllowIllegal()

    # {(r1 -> r0: 2), ,
    # (r2 -> r3: 13)}
    encoded = n_from_prod(env_data['sets'], [17, 0, 0, 0, 0, 13])
    interpreted_action = env4_robots.get_action_from_space(encoded)

    reward = reward_giver.give_reward(State([10, 10, 15, 0], 10, None),
                                      interpreted_action, meetings,
                                      cycle_lengths, max_memory)
    assert reward == 0

    reward = reward_giver.give_reward(State([10, 10, 10, 10], 10, None),
                                      interpreted_action, meetings,
                                      cycle_lengths, max_memory)
    assert reward == settings.REWARD_FOR_INVALID_TRANSFER + 1

    # {(r1 -> r0: 10), ,
    # (r2 -> r3: 13)}
    encoded = n_from_prod(env_data['sets'], [25, 0, 0, 0, 0, 13])
    interpreted_action = env4_robots.get_action_from_space(encoded)
    reward = reward_giver.give_reward(State([10, 10, 15, 0], 10, None),
                                      interpreted_action, meetings,
                                      cycle_lengths, max_memory)
    assert reward == 0
# Example #3
def test_one_minus_one_reward_bad_env_4(env4_robots):
    """Penalties from the reward giver on the 4-robot env for an action
    that violates meetings/transfers, and reward 0 for the all-zero
    action."""
    env_data = env4_robots.get_env_metadata()
    giver = OneMinusOneRewardGiverAllowIllegal()
    meetings = env_data['meetings']
    cycle_lengths = env_data['cycle_lengths']
    max_memory = env_data['max_memory']

    # {(r0 -> r1: 1), (r0 -> r2: 2), (r3 -> r0: 5),
    # (r1 -> r2: 15), (r1 -> r3: 13), (r3 -> r2: 13)}
    encoded = n_from_prod(env_data['sets'], [1, 2, 20, 15, 13, 28])
    interpreted = env4_robots.get_action_from_space(encoded)

    expected = (5 * settings.REWARD_FOR_INVALID_MEETING
                + settings.REWARD_FOR_INVALID_TRANSFER)
    assert giver.give_reward(State([10, 10, 10, 10], 8, None), interpreted,
                             meetings, cycle_lengths, max_memory) == expected
    assert giver.give_reward(State([0, 0, 0, 0], 8, None), interpreted,
                             meetings, cycle_lengths, max_memory) == expected

    # The all-zero action over every robot pair is expected to earn 0.
    num_pairs = env_data['num_robots'] * (env_data['num_robots'] - 1) // 2
    encoded = n_from_prod(env_data['sets'], [0] * num_pairs)
    interpreted = env4_robots.get_action_from_space(encoded)
    assert giver.give_reward(State([0, 0, 0, 0], 8, None), interpreted,
                             meetings, cycle_lengths, max_memory) == 0
def test_apply_action_test_4_env(env4_robots):
    """apply_action_allow_illegal on the 4-robot env: two different encoded
    actions applied to the same starting state produce the expected
    states."""
    env_data = env4_robots.get_env_metadata()
    env4_robots.reset()
    state = env4_robots.get_current_state()
    state.time = 8

    def outcome(raw_action):
        # Encode a per-pair action list and apply it to `state`.
        encoded = n_from_prod(env_data['sets'], raw_action)
        interpreted = env4_robots.get_action_from_space(encoded)
        return apply_action_allow_illegal(state, interpreted,
                                          env_data['max_memory'],
                                          env_data['cycles'])

    assert outcome([19, 16, 0, 0, 0, 18]) == State(
        robots_data=[15, 6, 12, 7], time=9, positions=[2, 2, 5, 5])
    assert outcome([26, 16, 0, 0, 0, 22]) == State(
        robots_data=[11, 10, 9, 10], time=9, positions=[2, 2, 5, 5])
    def step(self, action: int) -> Tuple[np.array, float, bool, Dict]:
        """Advance the environment by one time step (gym-style ``step``).

        Args:
            action: either an integer index into the action space, or a
                per-pair action list that is validated against the action
                space and then encoded to an integer via ``n_from_prod``.

        Returns:
            Tuple of (observation array, reward scaled down by 1e6,
            done flag, empty info dict).
        """
        # check if the action is in the action space
        if not isinstance(action, list):
            action = int(action)
        else:
            assert self.action_space.contains(
                action), f'{action}, {type(action)} invalid'
            action = n_from_prod(self.__sets, action)
        interpreted_state = get_state_from_observation(
            self.state)  # convert observation to interpreted state
        # Decode the integer action and remember it as the last action taken.
        interpreted_action = self.get_action_from_space(action)
        self.__state_action = interpreted_action
        # Reward is computed on the state BEFORE the action is applied.
        reward = self.__reward_class.give_reward(interpreted_state,
                                                 interpreted_action,
                                                 self.__meetings,
                                                 self.__cycles_lengths,
                                                 self.__max_memory)

        if reward == settings.REWARD_FOR_INVALID_ACTION:
            # Invalid action: do not apply the transfers — only advance time
            # and move the robots along their cycles.
            # new_state = interpreted_state
            new_state = apply_action_only_increase_time_move_robots(
                interpreted_state, interpreted_action, self.__max_memory,
                self.__cycles)
        else:
            new_state = self.__action_apply(interpreted_state,
                                            interpreted_action,
                                            self.__max_memory, self.__cycles)

        self.state = np.array(get_observation_from_state(new_state))
        # Reward is divided by 1e6 before being returned to the agent.
        return self.state, reward / 1000000, check_if_done(
            new_state, settings.MAXIMUM_NUM_ITER), {}
# Example #6
def test_one_minus_one_reward_bad_env_3(env3_robots):
    """A 3-robot action expected to incur two invalid-meeting penalties
    minus one."""
    env_data = env3_robots.get_env_metadata()
    encoded = n_from_prod(env_data['sets'], [15, 20, 56])
    interpreted = env3_robots.get_action_from_space(encoded)
    giver = OneMinusOneRewardGiverAllowIllegal()
    expected = 2 * settings.REWARD_FOR_INVALID_MEETING - 1
    assert giver.give_reward(State([15, 20, 1], 5, None), interpreted,
                             env_data['meetings'], env_data['cycle_lengths'],
                             env_data['max_memory']) == expected
def test_apply_action_env_3_robots(env3_robots):
    """apply_action_allow_illegal produces the expected state for action
    [61, 0, 0] on a freshly reset 3-robot env, and is deterministic when
    re-applied."""
    env_data = env3_robots.get_env_metadata()
    env3_robots.reset()
    state = env3_robots.get_current_state()

    encoded = n_from_prod(env_data['sets'], [61, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    new_state = apply_action_allow_illegal(state, interpreted,
                                           env_data['max_memory'],
                                           env_data['cycles'])
    assert new_state == State(robots_data=[21, 9, 15],
                              time=2,
                              positions=[2, 2, 3])

    # Re-encoding and re-applying the same action must yield the same state.
    encoded = n_from_prod(env_data['sets'], [61, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    again = apply_action_allow_illegal(state, interpreted,
                                       env_data['max_memory'],
                                       env_data['cycles'])
    assert again == new_state
def test_env_termination(env4_robots):
    """The 4-robot environment reports done=True after eight scripted steps.

    FIX: the original duplicated the encode/step sequence in both branches
    of the `if`, differing only in which raw action was selected. The branch
    now only picks the action; a single encode/step path follows.
    """
    env_data = env4_robots.get_env_metadata()
    env4_robots.reset()
    # Raw per-pair actions keyed by step number; '-1' is the no-op default.
    action_map_terminate = {
        '-1': [0] * 6,
        '2': [25] + [0] * 5,
        '4': [0, 25] + [0] * 4,
        '6': [0] * 5 + [25],
        '8': [0, 25] + [0] * 4
    }
    done = False
    for step_number in range(1, 9):
        # `or` falls back to the no-op on a missing (or falsy) entry, which
        # matches the original truthiness-based branch.
        raw_action = (action_map_terminate.get(str(step_number))
                      or action_map_terminate['-1'])
        action = n_from_prod(env_data['sets'], raw_action)
        _, _, done, _ = env4_robots.step(action)
    assert done
# Example #9
def test_one_minus_one_reward_good_env_3(env3_robots):
    """Reward giver on the 3-robot env: one action earns reward 1 with data
    available, and the invalid-transfer penalty when all robots are empty."""
    env_data = env3_robots.get_env_metadata()
    giver = OneMinusOneRewardGiverAllowIllegal()
    meetings = env_data['meetings']
    cycle_lengths = env_data['cycle_lengths']
    max_memory = env_data['max_memory']

    encoded = n_from_prod(env_data['sets'], [57, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    assert giver.give_reward(State([2, 2, 0], 10, None), interpreted,
                             meetings, cycle_lengths, max_memory) == 1

    encoded = n_from_prod(env_data['sets'], [58, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    expected = settings.REWARD_FOR_INVALID_TRANSFER
    assert giver.give_reward(State([0, 0, 0], 10, None), interpreted,
                             meetings, cycle_lengths, max_memory) == expected