def reconcile_placement_groups(self, trials: List["Trial"]):
    """Reconcile placement groups to match requirements.

    This will loop through all trials and count their statuses by
    placement group factory. This will make sure that only as many
    placement groups are needed as there are trials left to run.

    E.g. if PGF_A has 2 terminated, 1 errored, 2 paused, 1 running,
    and 3 pending trials, a total of 6 placement groups
    (paused+running+pending) should be in staging, use, or the cache.

    Args:
        trials: List of trials.

    """
    # Keep track of the currently tracked placement groups
    current_counts: Dict[PlacementGroupFactory, int] = defaultdict(int)

    # Count number of expected placement groups
    pgf_expected: Dict[PlacementGroupFactory, int] = defaultdict(int)
    for trial in trials:
        # Count in-use placement groups
        if trial in self._in_use_trials:
            current_counts[trial.placement_group_factory] += 1

        pgf_expected[trial.placement_group_factory] += (
            1 if trial.status in ["PAUSED", "PENDING", "RUNNING"] else 0)

    # Ensure that unexpected placement groups are accounted for
    for pgf in self._staging:
        if pgf not in pgf_expected:
            pgf_expected[pgf] = 0

    for pgf in self._ready:
        if pgf not in pgf_expected:
            pgf_expected[pgf] = 0

    # Count cached placement groups
    for pg, pgf in self._cached_pgs.items():
        current_counts[pgf] += 1

    # Compare current with expected
    for pgf, expected in pgf_expected.items():
        # Add staging and ready pgs
        current_counts[pgf] += len(self._staging[pgf])
        current_counts[pgf] += len(self._ready[pgf])

        while current_counts[pgf] > expected:
            pg = self._unstage_unused_pg(pgf)
            if not pg:
                break
            logger.debug(f"Removing unneeded placement group {pg.id}")
            self.remove_pg(pg)
            current_counts[pgf] -= 1

        while expected > current_counts[pgf]:
            self._stage_pgf_pg(pgf)
            current_counts[pgf] += 1
            logger.debug(f"Adding an expected but previously unstaged "
                         f"placement group for factory {pgf}")
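# Standalone sketch (illustrative only, not part of the class above) of the
# expected-count arithmetic described in the docstring: terminated and errored
# trials need no placement group, while paused, running, and pending trials
# each need one. "PGF_A" and the status strings are assumed example values.
from collections import defaultdict

trial_statuses = [
    "TERMINATED", "TERMINATED", "ERROR",  # need no placement group
    "PAUSED", "PAUSED", "RUNNING",
    "PENDING", "PENDING", "PENDING",
]

pgf_expected = defaultdict(int)
for status in trial_statuses:
    pgf_expected["PGF_A"] += (
        1 if status in ["PAUSED", "PENDING", "RUNNING"] else 0)

assert pgf_expected["PGF_A"] == 6  # 2 paused + 1 running + 3 pending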
def get_full_actor_cls(
    self, trial: "Trial", actor_cls: ActorClass
) -> Optional[ActorClass]:
    """Get a fully configured actor class.

    Returns the actor handle if the placement group is ready. In this
    case, the placement group is moved to `self._in_use_pgs` and removed
    from `self._ready`.

    Args:
        trial: "Trial" object to start
        actor_cls: Ray actor class.

    Returns:
        Configured ActorClass or None

    """
    pgf = trial.placement_group_factory

    if not self._ready[pgf]:
        return None

    pg = self._ready[pgf].pop()
    self._in_use_pgs[pg] = trial
    self._in_use_trials[trial] = pg

    logger.debug(f"For trial {trial} use pg {pg.id}")

    # We still have to pass resource specs
    if not pgf.head_bundle_is_empty:
        # Pass the full resource specs of the first bundle per default
        head_bundle = pg.bundle_specs[0].copy()
        num_cpus = head_bundle.pop("CPU", 0)
        num_gpus = head_bundle.pop("GPU", 0)
        memory = head_bundle.pop("memory", None)
        object_store_memory = head_bundle.pop("object_store_memory", None)

        # Only custom resources remain in `head_bundle`
        resources = head_bundle

        return actor_cls.options(
            placement_group=pg,
            placement_group_bundle_index=0,
            placement_group_capture_child_tasks=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            memory=memory,
            object_store_memory=object_store_memory,
            resources=resources,
        )
    else:
        return actor_cls.options(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
            num_cpus=0,
            num_gpus=0,
            resources={},
        )
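# Standalone sketch (hypothetical bundle data, not from the source) of how the
# first bundle's resource spec is split above: known keys are popped off and
# passed explicitly to `.options()`, and whatever remains is treated as custom
# resources.
head_bundle = {"CPU": 4, "GPU": 1, "memory": 2 * 1024**3, "custom_res": 2}

num_cpus = head_bundle.pop("CPU", 0)
num_gpus = head_bundle.pop("GPU", 0)
memory = head_bundle.pop("memory", None)
object_store_memory = head_bundle.pop("object_store_memory", None)
resources = head_bundle  # only custom resources remain

assert num_cpus == 4 and num_gpus == 1
assert resources == {"custom_res": 2}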
def assign_cached_pg(self, pg: PlacementGroup, trial: "Trial") -> bool:
    """Assign a cached pg to a trial."""
    pgf = self._cached_pgs.pop(pg)
    trial_pgf = trial.placement_group_factory

    assert pgf == trial_pgf, (f"Cannot assign placement group with a "
                              f"non-matching factory to trial {trial}")

    logger.debug(f"For trial {trial} RE-use pg {pg.id}")

    self._in_use_pgs[pg] = trial
    self._in_use_trials[trial] = pg

    return True
def _init_shape(self, obs_space: gym.Space, options: dict):
    """Create one sub-preprocessor per sub-space of the Tuple space.

    All sub-preprocessors must produce outputs of the same size; the
    resulting shape is (number of sub-spaces, common sub-preprocessor size).
    """
    assert isinstance(self._obs_space, gym.spaces.Tuple)
    size = None
    self.preprocessors = []
    for i in range(len(self._obs_space.spaces)):
        space = self._obs_space.spaces[i]
        logger.debug("Creating sub-preprocessor for {}".format(space))
        preprocessor = _get_preprocessor(space)(space, self._options)
        self.preprocessors.append(preprocessor)
        if size is not None:
            # Enforce that every sub-space flattens to the same size.
            assert size == preprocessor.size
        else:
            size = preprocessor.size
    return len(self._obs_space.spaces), size
def get_full_actor_cls(self, trial: "Trial",
                       actor_cls: ActorClass) -> Optional[ActorClass]:
    """Get a fully configured actor class.

    Returns the actor handle if the placement group is ready. In this
    case, the placement group is moved to `self._in_use_pgs` and removed
    from `self._ready`.

    Args:
        trial ("Trial"): "Trial" object to start
        actor_cls: Ray actor class.

    Returns:
        Configured ActorClass or None

    """
    pgf = trial.placement_group_factory

    if not self._ready[pgf]:
        return None

    pg = self._ready[pgf].pop()
    self._in_use_pgs[pg] = trial
    self._in_use_trials[trial] = pg

    # We still have to pass resource specs
    # Pass the full resource specs of the first bundle per default
    first_bundle = pg.bundle_specs[0].copy()
    num_cpus = first_bundle.pop("CPU", None)
    num_gpus = first_bundle.pop("GPU", None)

    # Only custom resources remain in `first_bundle`
    resources = first_bundle or None

    if num_cpus is None:
        # If the placement group specifically set the number
        # of CPUs to 0, use this.
        num_cpus = pgf.head_cpus

    logger.debug(f"For trial {trial} use pg {pg.id}")

    return actor_cls.options(
        placement_group=pg,
        placement_group_bundle_index=0,
        placement_group_capture_child_tasks=True,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources)
def _shutdown(remote_workers: List[ActorHandle],
              queue: Optional[Queue] = None,
              force: bool = False):
    if force:
        logger.debug(f"Killing {len(remote_workers)} workers.")
        for worker in remote_workers:
            ray.kill(worker)
        if queue is not None:
            logger.debug("Killing Queue.")
            ray.kill(queue.actor)
    else:
        try:
            [worker.__ray_terminate__.remote() for worker in remote_workers]
            if queue is not None:
                queue.actor.__ray_terminate__.remote()
        except RayActorError:
            logger.warning("Failed to shutdown gracefully, forcing a "
                           "shutdown.")
            # Pass the queue along so the forced shutdown also kills its actor.
            _shutdown(remote_workers, queue=queue, force=True)
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith(
                "gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(
            int(_get_max_node_cpus() or 1),
            int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Create queue for communication from worker to caller.
    # Always create queue.
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)

        # Once everything is ready
        ray.get(fut)
    # The inner loop should catch all exceptions
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]

    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
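# Hypothetical call sketch for `_train` above (synthetic data and parameter
# values are assumptions, not from the source): two actors with two CPUs each
# train a binary classifier. The helper below exists only for illustration.
def _example_train_call():
    import numpy as np

    X = np.random.rand(1000, 8)
    y = np.random.randint(0, 2, size=1000)
    dtrain = RayDMatrix(X, y)  # wraps data for sharding across actors

    bst, evals_result, additional_results = _train(
        {"objective": "binary:logistic", "eval_metric": "logloss"},
        dtrain,
        num_actors=2,
        cpus_per_actor=2)
    return bst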