Example #1

# Note: this snippet assumes that the helper functions get_neighbour_list,
# add_group_id_to_neighbours and create_duplicates_dict are defined elsewhere
# in the same module.
import logging
import threading
from queue import Queue

logger = logging.getLogger(__name__)

def remove_duplicates(duplicated_groups: dict):
    """
    Remove duplicate group ids for tasks that have more than one.
    This ensures that every task belongs to a single group only.
    This distinct group id will be the basis for further geometric processing.
    """

    for duplicated_group_id in sorted(duplicated_groups.keys(), reverse=True):
        logger.debug(
            f"{duplicated_group_id}: {list(duplicated_groups[duplicated_group_id])}"
        )
        # merge into the smallest group id among the duplicates
        my_duplicated_group_id = min(
            [duplicated_group_id] + list(duplicated_groups[duplicated_group_id])
        )

        for task_id in yes_results_dict.keys():
            if yes_results_dict[task_id]["my_group_id"] == duplicated_group_id:
                yes_results_dict[task_id]["my_group_id"] = my_duplicated_group_id
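
# Hypothetical illustration (values invented for this sketch): if
# duplicated_groups were {7: {3, 5}}, the smallest id in the set is 3, so
# every task whose "my_group_id" is currently 7 gets reassigned to group 3.
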
def create_hot_tm_tasks(
    project_id: str,
    project_data: list,
    group_size: int = 15,
    neighbourhood_shape: str = "rectangle",
    neighbourhood_size: int = 5,
) -> dict:
    """
    This function creates a dictionary of tile groups, each of which will form a task in the HOT Tasking Manager.
    It creates a neighbourhood list, which functions as a mask to filter tiles that are close to each other.
    The function assigns a group id to each tile.
    Tiles that received several group ids are resolved in the next step.
    Once each task has a unique group id, the function checks the size (number of tiles) of each group.
    Groups that hold too many tiles (too big to map in the Tasking Manager) are split into smaller groups.
    Finally, a dictionary is returned which holds each group as an item.
    Each group consists of a limited number of tiles.
    """

    # final groups dict will store the groups that are exported
    final_groups_dict = {}
    highest_group_id = 0

    # create a dictionary with all results
    global yes_results_dict
    yes_results_dict = {}
    for result in project_data:
        yes_results_dict[result["id"]] = result
    logger.info("created results dictionary. there are %s results." %
                len(yes_results_dict))
    if len(yes_results_dict) < 1:
        return final_groups_dict

    global neighbour_list
    global my_neighbourhood_size
    my_neighbourhood_size = neighbourhood_size

    neighbour_list = get_neighbour_list(neighbourhood_shape,
                                        neighbourhood_size)
    logger.info(
        "got neighbour list. neighbourhood_shape: %s, neighbourhood_size: %s" %
        (neighbourhood_shape, neighbourhood_size))

    global split_groups_list
    split_groups_list = []

    # test for neighbours and set group ids
    for task_id in sorted(yes_results_dict.keys()):
        try:
            # this task already has a group id, great.
            group_id = yes_results_dict[task_id]["my_group_id"]
        except KeyError:
            group_id = highest_group_id + 1
            highest_group_id += 1
            yes_results_dict[task_id]["my_group_id"] = group_id
            logger.debug("created new group id")
        logger.debug("group id: %s" % group_id)

        # check for other results in the neighbourhood and add the group id to them
        add_group_id_to_neighbours(
            yes_results_dict[task_id]["task_x"],
            yes_results_dict[task_id]["task_y"],
            yes_results_dict[task_id]["task_z"],
            group_id,
        )

    logger.info("added group ids to yes maybe results dict")

    # check if some tasks have different groups from their neighbours
    duplicates_dict = create_duplicates_dict()
    while len(duplicates_dict) > 0:
        remove_duplicates(duplicates_dict)
        duplicates_dict = create_duplicates_dict()
        logger.debug("there are %s duplicated groups" % len(duplicates_dict))

    logger.info("removed all duplicated group ids in yes maybe results dict")

    grouped_results_dict = {}
    for task_id in yes_results_dict.keys():
        group_id = yes_results_dict[task_id]["my_group_id"]
        # collect all tasks that share the same group id
        grouped_results_dict.setdefault(group_id, {})[task_id] = yes_results_dict[task_id]

    logger.info("created dict item for each group")

    # reset highest group id since we merged several groups
    highest_group_id = max(grouped_results_dict)
    logger.debug("new highest group id: %s" % highest_group_id)

    # queue of groups that are too big and still need to be split;
    # the worker threads below process it until it is empty
    q = Queue(maxsize=0)
    num_threads = 1

    for group_id in grouped_results_dict.keys():

        if len(grouped_results_dict[group_id]) < group_size:
            final_groups_dict[group_id] = grouped_results_dict[group_id]
        else:
            group_data = grouped_results_dict[group_id]
            # add this group to the queue
            q.put([group_id, group_data, group_size])

    logger.info("added groups to queue.")

    for _ in range(num_threads):
        worker = threading.Thread(target=split_groups, args=(q, ))
        worker.start()

    q.join()
    logger.info("split all groups.")

    logger.debug("there are %s split groups" % len(split_groups_list))

    # add the split groups to the final groups dict
    for group_data in split_groups_list:
        new_group_id = highest_group_id + 1
        highest_group_id += 1
        final_groups_dict[new_group_id] = group_data

    logger.info("created %s groups." % len(final_groups_dict))
    return final_groups_dict
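
# A minimal usage sketch (not part of the original source): the project id
# and result rows below are invented, and running it assumes the helper
# functions get_neighbour_list, add_group_id_to_neighbours and
# create_duplicates_dict are available in this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_data = [
        {"id": 1, "task_x": "1000", "task_y": "2000", "task_z": "18"},
        {"id": 2, "task_x": "1001", "task_y": "2000", "task_z": "18"},
    ]
    groups = create_hot_tm_tasks("demo_project", sample_data, group_size=15)
    print(f"created {len(groups)} groups")
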
def split_groups(q):
    """
    This function will be executed using threading.
    First, it checks whether there are still groups pending in the queue.
    We use a clustering algorithm to put tasks together in groups.
    Since it is computationally expensive to check which tiles are neighbours,
    we split our results into chunks (called groups here).
    When a group falls below the defined group size, we stop.
    Otherwise, the group is split into two parts and
    both parts are added as new groups to the queue.
    """

    while not q.empty():
        group_id, group_data, group_size = q.get()
        logger.debug(f"the group ({group_id}) has {len(group_data)} members")

        # find min x, and min y
        x_list = []
        y_list = []

        for result, data in group_data.items():
            x_list.append(int(data["task_x"]))
            y_list.append(int(data["task_y"]))

        min_x = min(x_list)
        max_x = max(x_list)
        x_width = max_x - min_x

        min_y = min(y_list)
        max_y = max(y_list)
        y_width = max_y - min_y

        new_grouped_data = {"a": {}, "b": {}}

        if x_width >= y_width:
            # first split vertically
            for result, data in group_data.items():
                # result is in first segment
                if int(data["task_x"]) < (min_x + (x_width / 2)):
                    new_grouped_data["a"][result] = data
                else:
                    new_grouped_data["b"][result] = data
        else:
            # first split horizontally
            for result, data in group_data.items():
                # result is in first segment
                if int(data["task_y"]) < (min_y + (y_width / 2)):
                    new_grouped_data["a"][result] = data
                else:
                    new_grouped_data["b"][result] = data

        for k in ["a", "b"]:
            logger.debug("there are %s results in %s" %
                         (len(new_grouped_data[k]), k))

            # reset the coordinate lists, otherwise the extent below would
            # still be computed over the whole parent group
            x_list = []
            y_list = []
            for result, data in new_grouped_data[k].items():
                x_list.append(int(data["task_x"]))
                y_list.append(int(data["task_y"]))

            min_x = min(x_list)
            max_x = max(x_list)
            x_width = max_x - min_x

            min_y = min(y_list)
            max_y = max(y_list)
            y_width = max_y - min_y

            if len(new_grouped_data[k]) < group_size:

                # add this check to avoid spatially large groups with few items
                if x_width * y_width > 2 * (my_neighbourhood_size *
                                            my_neighbourhood_size):
                    q.put([group_id, new_grouped_data[k], group_size])
                else:
                    split_groups_list.append(new_grouped_data[k])
                    logger.debug(f'added "{k}" to split_groups_list')
            else:
                # add this group to a queue
                q.put([group_id, new_grouped_data[k], group_size])

        q.task_done()
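
# Worked illustration of one split (invented numbers): a group spanning
# x = 100..139 and y = 50..59 has x_width = 39 >= y_width = 9, so it is
# split vertically at x = 100 + 39 / 2 = 119.5; tiles with task_x < 119.5
# go into part "a", the rest into part "b". Each part is appended to
# split_groups_list once it is small and compact enough, otherwise it is
# put back on the queue.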