Example #1
import numpy as np
import ray
from ray.util.queue import Queue


class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the underlying queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple, inference_time_sec: float,
              page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use like: labels_all, probs_all, filename, original_shape, \
            inference_time_sec, page_number = ungroup(d)
        :param dictionary: a dictionary created with the group() method.
        :return: the six grouped values, in the order listed above.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. Blocks while the queue
        is full (for flow control) and proceeds once there is space.
        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # Put the payload in the Ray object store...
        ref = ray.put(dictionary)
        # ...and enqueue only the (small) object reference.
        self._queue.put(ref)

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup()
            to unpack it, or look fields up individually.
        """
        return self._queue.get()
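
WorkQueue is a thin wrapper around a shared ray.util.queue.Queue: the producer pushes grouped results (blocking when the queue is full, which throttles the producer), and consumers read from the queue actor returned by get_queue(). A minimal producer/consumer sketch follows; the consume task and the dummy payload values are illustrative assumptions, not part of the source:

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def consume(queue):
    # Blocks until the producer pushes a grouped payload.
    payload = queue.get()
    return payload["filename"], payload["page_number"]

wq = WorkQueue(max_depth=8)
result = consume.remote(wq.get_queue())

# Producer side: group the (dummy) inference outputs and push them.
wq.push(wq.group(labels_all=np.zeros(3), probs_all=np.ones(3),
                 filename="doc.pdf", original_shape=(100, 100),
                 inference_time_sec=0.05, page_number=1))

print(ray.get(result))  # ('doc.pdf', 1)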
Example #2
import logging
from typing import Any, Callable, Dict, Optional, Tuple

import ray
import xgboost as xgb
from ray.util.queue import Queue

# RayDMatrix and the _-prefixed helpers below are assumed to come from
# elsewhere in the same package.

logger = logging.getLogger(__name__)


def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    """Train an XGBoost model across `num_actors` Ray actors; returns the
    booster, the evaluation results, and any additional results."""
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    # Default GPU allocation: one GPU per actor iff a GPU tree method is set.
    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        # Divide the cluster's CPUs evenly across actors, capped at the
        # largest single node's CPU count.
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(int(_get_max_node_cpus() or 1),
                             int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Queue for communication from the workers back to the caller
    # (always created).
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for actor in actors:
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [[] for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                # Drain the queue: callables are executed on the driver,
                # all other items are collected per actor rank.
                while not queue.empty():
                    actor_rank, item = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready
        ray.get(fut)
    except Exception:
        # Shut the actors down on any failure, then re-raise.
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]
    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
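
A sketch of how _train might be called, assuming the surrounding package provides RayDMatrix accepting (data, label) arrays; the dataset and parameter values are illustrative, and num_boost_round is assumed to be forwarded to the per-actor xgb.train call via **kwargs:

import numpy as np

# Illustrative toy dataset, not from the source.
x = np.random.rand(1000, 8)
y = (x.sum(axis=1) > 4.0).astype(int)
dtrain = RayDMatrix(x, y)

bst, evals_result, extra = _train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    num_actors=2,
    cpus_per_actor=1,
    num_boost_round=10)

bst.save_model("model.xgb")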