Example #1
    def reconcile_placement_groups(self, trials: List["Trial"]):
        """Reconcile placement groups to match requirements.

        This loops through all trials and counts their statuses per
        placement group factory, making sure that only as many placement
        groups are kept as there are trials left to run.

        E.g. if PGF_A has 2 terminated, 1 errored, 2 paused, 1 running,
        and 3 pending trials, a total of 6 placement groups
        (paused+running+pending) should be in staging, use, or the cache.

        Args:
            trials: List of trials.

        """
        # Keep track of the currently tracked placement groups
        current_counts: Dict[PlacementGroupFactory, int] = defaultdict(int)

        # Count number of expected placement groups
        pgf_expected: Dict[PlacementGroupFactory, int] = defaultdict(int)
        for trial in trials:
            # Count in-use placement groups
            if trial in self._in_use_trials:
                current_counts[trial.placement_group_factory] += 1

            pgf_expected[trial.placement_group_factory] += (
                1 if trial.status in ["PAUSED", "PENDING", "RUNNING"] else 0)

        # Ensure that unexpected placement groups are accounted for
        for pgf in self._staging:
            if pgf not in pgf_expected:
                pgf_expected[pgf] = 0

        for pgf in self._ready:
            if pgf not in pgf_expected:
                pgf_expected[pgf] = 0

        # Count cached placement groups
        for pgf in self._cached_pgs.values():
            current_counts[pgf] += 1

        # Compare current with expected
        for pgf, expected in pgf_expected.items():
            # Add staging and ready pgs
            current_counts[pgf] += len(self._staging[pgf])
            current_counts[pgf] += len(self._ready[pgf])

            while current_counts[pgf] > expected:
                pg = self._unstage_unused_pg(pgf)
                if not pg:
                    break
                logger.debug(f"Removing unneeded placement group {pg.id}")
                self.remove_pg(pg)
                current_counts[pgf] -= 1

            while expected > current_counts[pgf]:
                self._stage_pgf_pg(pgf)
                current_counts[pgf] += 1
                logger.debug(f"Adding an expected but previously unstaged "
                             f"placement group for factory {pgf}")
Example #2
    def get_full_actor_cls(
        self, trial: "Trial", actor_cls: ActorClass
    ) -> Optional[ActorClass]:
        """Get a fully configured actor class.

        Returns a configured actor class if the placement group is ready.
        In this case, the placement group is moved to `self._in_use_pgs`
        and removed from `self._ready`.

        Args:
            trial: "Trial" object to start
            actor_cls: Ray actor class.

        Returns:
            Configured ActorClass or None

        """
        pgf = trial.placement_group_factory

        if not self._ready[pgf]:
            return None

        pg = self._ready[pgf].pop()
        self._in_use_pgs[pg] = trial
        self._in_use_trials[trial] = pg

        logger.debug(f"For trial {trial} use pg {pg.id}")

        # We still have to pass resource specs
        if not pgf.head_bundle_is_empty:
            # Pass the full resource specs of the first bundle per default
            head_bundle = pg.bundle_specs[0].copy()
            num_cpus = head_bundle.pop("CPU", 0)
            num_gpus = head_bundle.pop("GPU", 0)
            memory = head_bundle.pop("memory", None)
            object_store_memory = head_bundle.pop("object_store_memory", None)

            # Only custom resources remain in `head_bundle`
            resources = head_bundle
            return actor_cls.options(
                placement_group=pg,
                placement_group_bundle_index=0,
                placement_group_capture_child_tasks=True,
                num_cpus=num_cpus,
                num_gpus=num_gpus,
                memory=memory,
                object_store_memory=object_store_memory,
                resources=resources,
            )
        else:
            return actor_cls.options(
                placement_group=pg,
                placement_group_capture_child_tasks=True,
                num_cpus=0,
                num_gpus=0,
                resources={},
            )
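
A hedged usage sketch for the method above; `pg_manager`, `MyTrainable` and `trial` are illustrative names for an instance of this manager, a user-defined class, and a pending trial, none of which are defined in the snippet itself.

# Hypothetical caller: request a fully configured actor class and only
# start the actor if a placement group for this trial's factory is ready.
actor_cls = ray.remote(MyTrainable)
full_actor_cls = pg_manager.get_full_actor_cls(trial, actor_cls)
if full_actor_cls is not None:
    # The actor is scheduled onto the reserved placement group bundle.
    actor = full_actor_cls.remote()
else:
    # No ready placement group yet; retry on the next scheduling step.
    pass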
Example #3
    def assign_cached_pg(self, pg: PlacementGroup, trial: "Trial") -> bool:
        """Assign a cached pg to a trial."""
        pgf = self._cached_pgs.pop(pg)
        trial_pgf = trial.placement_group_factory

        assert pgf == trial_pgf, (f"Cannot assign placement group with a "
                                  f"non-matching factory to trial {trial}")

        logger.debug(f"For trial {trial} RE-use pg {pg.id}")

        self._in_use_pgs[pg] = trial
        self._in_use_trials[trial] = pg

        return True
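
A hypothetical reuse path that pairs with the method above; `pg_manager` and `new_trial` are illustrative names, and the cached entry is assumed to have been parked in `self._cached_pgs` by the manager earlier.

# Look for a cached placement group whose factory matches the new trial
# and hand it over instead of staging a fresh one.
for pg, pgf in list(pg_manager._cached_pgs.items()):
    if pgf == new_trial.placement_group_factory:
        pg_manager.assign_cached_pg(pg, new_trial)
        break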
Example #4
    def _init_shape(self, obs_space: gym.Space, options: dict):
        assert isinstance(self._obs_space, gym.spaces.Tuple)
        size = None
        self.preprocessors = []
        for i in range(len(self._obs_space.spaces)):
            space = self._obs_space.spaces[i]
            logger.debug("Creating sub-preprocessor for {}".format(space))
            preprocessor = _get_preprocessor(space)(space, self._options)
            self.preprocessors.append(preprocessor)
            if size is not None:
                # All sub-spaces must flatten to the same size.
                assert size == preprocessor.size
            else:
                size = preprocessor.size
        return len(self._obs_space.spaces), size
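
A small sketch of an observation space that satisfies the equal-size assertion above: every sub-space of the Tuple flattens to the same number of elements; the concrete shapes are only an example.

import gym
import numpy as np

# Hypothetical Tuple space: both sub-spaces flatten to 4 values, so each
# sub-preprocessor reports the same `size` and the assertion holds.
obs_space = gym.spaces.Tuple([
    gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32),
    gym.spaces.Box(low=0.0, high=1.0, shape=(2, 2), dtype=np.float32),
])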
Example #5
    def get_full_actor_cls(self, trial: "Trial",
                           actor_cls: ActorClass) -> Optional[ActorClass]:
        """Get a fully configured actor class.

        Returns a configured actor class if the placement group is ready.
        In this case, the placement group is moved to `self._in_use_pgs`
        and removed from `self._ready`.

        Args:
            trial ("Trial"): "Trial" object to start
            actor_cls: Ray actor class.

        Returns:
            Configured ActorClass or None

        """
        pgf = trial.placement_group_factory

        if not self._ready[pgf]:
            return None

        pg = self._ready[pgf].pop()
        self._in_use_pgs[pg] = trial
        self._in_use_trials[trial] = pg

        # We still have to pass resource specs
        # Pass the full resource specs of the first bundle per default
        first_bundle = pg.bundle_specs[0].copy()
        num_cpus = first_bundle.pop("CPU", None)
        num_gpus = first_bundle.pop("GPU", None)

        # Only custom resources remain in `first_bundle`
        resources = first_bundle or None

        if num_cpus is None:
            # "CPU" is missing from the bundle spec, e.g. because the
            # placement group explicitly requested 0 CPUs. Fall back to
            # the factory's head bundle CPU count.
            num_cpus = pgf.head_cpus

        logger.debug(f"For trial {trial} use pg {pg.id}")

        return actor_cls.options(
            placement_group=pg,
            placement_group_bundle_index=0,
            placement_group_capture_child_tasks=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            resources=resources)
Example #6
def _shutdown(remote_workers: List[ActorHandle],
              queue: Optional[Queue] = None,
              force: bool = False):
    if force:
        logger.debug(f"Killing {len(remote_workers)} workers.")
        for worker in remote_workers:
            ray.kill(worker)
        if queue is not None:
            logger.debug("Killing Queue.")
            ray.kill(queue.actor)
    else:
        try:
            [worker.__ray_terminate__.remote() for worker in remote_workers]
            if queue is not None:
                queue.actor.__ray_terminate__.remote()
        except RayActorError:
            logger.warning("Failed to shutdown gracefully, forcing a "
                           "shutdown.")
            # Also pass the queue so its actor is force-killed, too.
            _shutdown(remote_workers, queue, force=True)
Example #7
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(int(_get_max_node_cpus() or 1),
                             int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Always create a queue for communication from the workers back to
    # the caller.
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for actor in actors:
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if callable(item):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready
        ray.get(fut)
    # The inner loop should catch all exceptions
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]
    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
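
An illustrative call of the helper above; `df` is a hypothetical pandas DataFrame with a "label" column, and the parameter values are only an example.

# Hypothetical usage: wrap the DataFrame in a RayDMatrix and train with
# four actors, two CPUs each.
dtrain = RayDMatrix(df, label="label")
bst, evals_result, additional_results = _train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    num_actors=4,
    cpus_per_actor=2,
)
print(evals_result)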