def pg_launcher(num_pgs_to_create):
    print("Creating pgs")
    pgs = []
    for i in range(num_pgs_to_create):
        pgs.append(placement_group(bundles, strategy="STRICT_SPREAD"))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    if pg_removal:
        print("removing pgs")
    for pg in pgs:
        if random() < 0.5 and pg_removal:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)
    print(len(pgs_unremoved))

    tasks = []
    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        for i in range(num_nodes):
            tasks.append(
                mock_task.options(
                    placement_group=pg,
                    placement_group_bundle_index=i).remote())

    # Remove the rest of placement groups.
    if pg_removal:
        for pg in pgs_removed:
            remove_placement_group(pg)

    ray.get(tasks)

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
def cleanup_existing_pg(self, block: bool = False):
    """Clean up (remove) all existing placement groups.

    This scans through the placement_group_table to discover existing
    placement groups and calls remove_placement_group on all that match
    the ``_tune__`` prefix.

    This method is called at the beginning of the tuning run to clean up
    existing placement groups should the experiment be interrupted by a
    driver failure and resumed in the same driver script.

    Args:
        block (bool): If True, will wait until all placement groups are
            shut down.
    """
    should_cleanup = not int(
        os.getenv("TUNE_PLACEMENT_GROUP_CLEANUP_DISABLED", "0"))
    if should_cleanup:
        has_non_removed_pg_left = True
        while has_non_removed_pg_left:
            has_non_removed_pg_left = False
            for pid, info in placement_group_table().items():
                if not info["name"].startswith(self._prefix):
                    continue
                if info["state"] == "REMOVED":
                    continue
                # If block=False, only run once
                has_non_removed_pg_left = block
                pg = get_placement_group(info["name"])
                remove_placement_group(pg)
            time.sleep(0.1)
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(
            placement_group(bundles, strategy="STRICT_SPREAD", name=str(i)))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        # if random() < .5:
        mock_task.options(placement_group=pg).remote()
        # else:
        #     MockActor.options(placement_group=pg).remote()

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved], timeout=10)

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
def shutdown(self, force=False):
    force_kill = force
    if not force_kill:
        cleanup = self._shutdown_remote_workers()
        force_kill = not self._terminate_remote_workers(cleanup)
    if force_kill:
        for worker in self.remote_workers:
            logger.debug(f"Killing worker {worker}.")
            ray.kill(worker)
    self.remote_workers = []

    # Remove worker placement group.
    if self._worker_placement_group:
        remove_placement_group(self._worker_placement_group)
        removed_placement_group = self._worker_placement_group
        self._worker_placement_group = None

        def is_placement_group_removed():
            table = ray.util.placement_group_table(removed_placement_group)
            if "state" not in table:
                return False
            return table["state"] == "REMOVED"

        # Wait until the placement group has been removed.
        success = wait_for_condition(is_placement_group_removed,
                                     SGD_PLACEMENT_GROUP_TIMEOUT_S)
        if not success:
            logger.warning(
                f"Placement Group removal is not successful after "
                f"{SGD_PLACEMENT_GROUP_TIMEOUT_S} seconds.")
def spread_to_all_nodes(f: RemoteFunction):
    nodes = ray.state.nodes()
    resources = [{"CPU": f._num_cpus} for _ in range(len(nodes))]
    pg = placement_group(resources, strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    yield len(nodes), pg
    remove_placement_group(pg)
def cleanup(self, force: bool = False):
    """Remove placement groups that are scheduled for removal.

    Currently, this will remove placement groups after they've been
    marked for removal for ``self._removal_delay`` seconds.
    If ``force=True``, this condition is disregarded and all placement
    groups are removed instead.

    Args:
        force: If True, all placement groups scheduled for removal
            will be removed, disregarding any removal conditions.
    """
    # Wrap in list so we can modify the dict
    for pg in list(self._pgs_for_removal):
        if (force or (time.time() - self._removal_delay) >=
                self._pgs_for_removal[pg]):
            self._pgs_for_removal.pop(pg)

            remove_placement_group(pg)

            # Remove from unstaged cache
            if pg in self._unstaged_pg_pgf:
                pgf = self._unstaged_pg_pgf.pop(pg)
                self._unstaged_pgf_pg[pgf].discard(pg)
def _restart(self):
    self.worker_group.shutdown()
    if self._initialization_hook is not None:
        initialization_hook = self._initialization_hook
    else:
        initialization_hook = None
    if self._placement_group:
        remove_placement_group(self._placement_group)
        self._placement_group = None
    self.start(initialization_hook=initialization_hook)
def is_all_placement_group_removed():
    ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
    if ready:
        ready_pg = wait_pgs[ready[0]]
        remove_placement_group(ready_pg)
        del wait_pgs[ready[0]]
    if len(wait_pgs) == 0:
        return True
    return False
def shutdown(self, force=False):
    force_kill = force
    if not force_kill:
        cleanup = self._shutdown_remote_workers()
        force_kill = not self._terminate_remote_workers(cleanup)
    if force_kill:
        for worker in self.remote_workers:
            logger.debug(f"Killing worker {worker}.")
            ray.kill(worker)
    self.remote_workers = []

    # Remove worker placement group.
    if self._worker_placement_group:
        remove_placement_group(self._worker_placement_group)
        self._worker_placement_group = None
def shutdown(self):
    """Shuts down the workers in the worker group."""
    try:
        self._backend.on_shutdown(self.worker_group, self._backend_config)
    except RayActorError:
        logger.warning("Graceful shutdown of backend failed. This is "
                       "expected if one of the workers has crashed.")
    self.worker_group.shutdown()
    self.worker_group = InactiveWorkerGroup()

    if self._placement_group:
        remove_placement_group(self._placement_group)
        self._placement_group = None

    self.dataset_shards = None
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    s = time.time()

    X_row_parts, _ = data

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_predict"
    )

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    # Create remote actors
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    booster = ray.put(booster)
    predictions = [
        tuple(actor.predict._remote(args=(booster,), kwargs=kwargs, num_returns=2))
        for actor in actors
    ]

    ray.wait([part for _, part in predictions], num_returns=len(predictions))

    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - s} s")

    return result
def test_many_placement_groups():
    @ray.remote(num_cpus=1, resources={"node": 0.02})
    def f1():
        sleep(10)
        pass

    @ray.remote(num_cpus=1)
    def f2():
        sleep(10)
        pass

    @ray.remote(resources={"node": 0.02})
    def f3():
        sleep(10)
        pass

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    refs = []
    for pg in tqdm(pgs, desc="Scheduling tasks"):
        ref1 = f1.options(placement_group=pg).remote()
        ref2 = f2.options(placement_group=pg).remote()
        ref3 = f3.options(placement_group=pg).remote()
        refs.extend([ref1, ref2, ref3])

    for _ in trange(10, desc="Waiting"):
        sleep(1)

    with tqdm() as p_bar:
        while refs:
            done, refs = ray.wait(refs)
            p_bar.update()

    for pg in tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
def test_many_placement_groups():
    # @ray.remote(num_cpus=1, resources={"node": 0.02})
    @ray.remote
    class C1:
        def ping(self):
            return "pong"

    # @ray.remote(num_cpus=1)
    @ray.remote
    class C2:
        def ping(self):
            return "pong"

    # @ray.remote(resources={"node": 0.02})
    @ray.remote
    class C3:
        def ping(self):
            return "pong"

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in tqdm.trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm.tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    actors = []
    for pg in tqdm.tqdm(pgs, desc="Scheduling tasks"):
        actors.append(C1.options(placement_group=pg).remote())
        actors.append(C2.options(placement_group=pg).remote())
        actors.append(C3.options(placement_group=pg).remote())

    not_ready = [actor.ping.remote() for actor in actors]
    for _ in tqdm.trange(len(actors)):
        ready, not_ready = ray.wait(not_ready)
        assert ray.get(*ready) == "pong"

    for pg in tqdm.tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
def post_stop_cleanup(future, pg):
    """Things to be done after a trial is stopped."""
    assert isinstance(pg, PlacementGroup)
    try:
        # This should not be blocking as
        # we are only here when triggered.
        ray.get(future, timeout=0)
    except GetTimeoutError:
        if log_once("tune_trial_cleanup_timeout"):
            logger.error(
                "Timed out when trying to stop the Ray actor gracefully. "
                "Consider making `stop` a faster operation.")
    except Exception:
        if log_once("tune_trial_cleanup_exception"):
            logger.error(
                f"An exception occurred when trying to stop the Ray actor:"
                f"{traceback.format_exc()}")
    finally:
        remove_placement_group(pg)
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(placement_group(BUNDLES, strategy="STRICT_SPREAD"))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    tasks = []
    max_actor_cnt = 5
    actor_cnt = 0
    actors = []
    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        if random() < .5:
            tasks.append(mock_task.options(placement_group=pg).remote())
        else:
            if actor_cnt < max_actor_cnt:
                actors.append(MockActor.options(placement_group=pg).remote())
                actor_cnt += 1

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved])
    ray.get(tasks)
    ray.get([actor.ping.remote() for actor in actors])

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
async def _create_workers(self, allocation):
    if self._pg:
        remove_placement_group(self._pg)
    logging.info(f"Creating {len(allocation)} worker tasks")
    self._placement_group_factory = allocation_to_pgf(
        allocation, self._worker_resources)
    self._pg = self._placement_group_factory()
    self._worker_tasks = {
        worker_index: run_adaptdl.options(
            num_cpus=self._worker_resources.get("CPU", 1),
            num_gpus=self._worker_resources.get("GPU", 0),
            placement_group=self._pg).remote(
                f"{NAMESPACE}/{NAME}",
                job_uid,
                worker_index,
                len(allocation),
                self._iteration,
                self._checkpoint_ref,
                self._worker_port_offset,
                **self._job_params)
        for worker_index, node in enumerate(allocation)
    }
    self._checkpoint_received.clear()
    self._running = True
    self._iteration += 1
def add(self, trial: Trial, actor: ActorHandle,
        placement_group: Optional[PlacementGroup] = None):
    """Adds a trial actor to be stopped.

    If the number of futures exceeds the threshold, the cleanup mechanism
    will kick in.

    Args:
        trial (Trial): The trial corresponding to the future.
        actor (ActorHandle): Handle to the trainable to be stopped.
        placement_group (PlacementGroup): Placement group to stop.
    """
    future = actor.stop.remote()

    if placement_group:
        remove_placement_group(placement_group)
    else:
        actor.__ray_terminate__.remote()

    self._cleanup_map[future] = trial
    if len(self._cleanup_map) > self.threshold:
        self.cleanup(partial=True)
def stop(self):
    ray.get([worker.stop.remote() for worker in self.workers])
    if self.should_colocate:
        remove_placement_group(self._placement_group)
@ray.remote
def run_wrk():
    logger.info("Warming up for ~3 seconds")
    for _ in range(5):
        resp = requests.get("http://127.0.0.1:8000/hey").text
        logger.info("Received response '" + resp + "'")
        time.sleep(0.5)

    result = subprocess.run(
        [
            "wrk", "-c",
            str(num_connections), "-t",
            str(num_threads), "-d", time_to_run,
            "http://127.0.0.1:8000/hey"
        ],
        stdout=subprocess.PIPE)
    return result.stdout.decode()


results = ray.get([
    run_wrk.options(
        placement_group=pg,
        placement_group_bundle_index=i).remote()
    for i in range(expected_num_nodes)
])

for i in range(expected_num_nodes):
    logger.info("Results for node %i of %i:", i + 1, expected_num_nodes)
    logger.info(results[i])

remove_placement_group(pg)
def run_trial(total_stage, num_pg_per_stage):
    creating_e2e_s = []
    removing_e2e_s = []

    # Create and remove placement groups.
    for i in range(total_stage):
        # Create pgs.
        pgs = []
        start = perf_counter()
        for _ in range(num_pg_per_stage):
            pgs.append(
                placement_group(
                    bundles=[{
                        "custom": 0.025
                    } for _ in range(4)],
                    strategy="PACK"))
        logger.info(f"Created {num_pg_per_stage} pgs.")
        ray.get([pg.ready() for pg in pgs])
        end = perf_counter()
        total_creating_time = end - start
        logger.info(f"Creating {num_pg_per_stage} pgs took "
                    f"{total_creating_time} seconds at stage {i}")
        creating_e2e_s.append(total_creating_time * 1000.0)

        # Remove pgs.
        start = perf_counter()
        for _, pg in enumerate(pgs):
            remove_placement_group(pg)
        end = perf_counter()
        total_removal_time = end - start
        logger.info(f"Removing {num_pg_per_stage} pgs took "
                    f"{total_removal_time} seconds at stage {i}")
        removing_e2e_s.append(total_removal_time * 1000.0)
        # time.sleep(1)

    # Calculate the scheduling latency (excluding queueing time).
    latencies = []
    for entry in ray.util.placement_group_table().values():
        latency = entry["stats"]["scheduling_latency_ms"]
        latencies.append(latency)
    latencies = sorted(latencies)
    removing_e2e_s = sorted(removing_e2e_s)
    creating_e2e_s = sorted(creating_e2e_s)

    def get_scheduling_perf(latencies):
        """Return P10, P50, P95, and P99 latencies."""
        p10 = latencies[int(len(latencies) * 0.1)]
        p50 = latencies[int(len(latencies) * 0.5)]
        p95 = latencies[int(len(latencies) * 0.95)]
        p99 = latencies[int(len(latencies) * 0.99)]
        return {"p10_ms": p10, "p50_ms": p50, "p95_ms": p95, "p99_ms": p99}

    scheduling_perf = get_scheduling_perf(latencies)
    removing_perf = get_scheduling_perf(removing_e2e_s)
    creation_perf = get_scheduling_perf(creating_e2e_s)

    wait_for_condition(
        lambda: (ray.cluster_resources()["custom"] ==
                 ray.available_resources()["custom"]),
        timeout=30,
    )
    wait_for_condition(
        lambda: (ray.cluster_resources()["pending"] ==
                 ray.available_resources()["pending"]),
        timeout=30,
    )
    return scheduling_perf, removing_perf, creation_perf
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster on Ray backend.

    During execution, it evenly distributes `data` between workers, runs
    xgb.predict on each worker for its subset of `data`, and creates a
    Modin DataFrame with the prediction results.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    num_actors : int, optional
        Number of actors for prediction. If unspecified, this value will
        be computed automatically.
    **kwargs : dict
        Other parameters are the same as `xgboost.Booster.predict`.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    s = time.time()

    X_row_parts, _ = data

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_predict")

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    # Create remote actors
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    booster = ray.put(booster)
    predictions = [
        tuple(actor.predict.options(num_returns=2).remote(booster, **kwargs))
        for actor in actors
    ]

    ray.wait([part for _, part in predictions], num_returns=len(predictions))

    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - s} s")

    return result
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    request_resources(num_cpus=42)

    # Add placement groups.
    pg_demands = [{"GPU": 2}, {"extra_resource": 2}]
    strategy = "STRICT_PACK"
    pg = placement_group(pg_demands, strategy=strategy)
    pg.ready()
    time.sleep(2)  # Wait for placement groups to propagate.

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    visited_atleast_once = [set(), set()]
    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        pg_response_data = monitor.load_metrics.pending_placement_groups
        assert_correct_pg(pg_response_data, pg_demands, strategy)

        if "memory" in resource_usage[0]:
            del resource_usage[0]["memory"]
            visited_atleast_once[0].add("memory")
        if "object_store_memory" in resource_usage[0]:
            del resource_usage[0]["object_store_memory"]
            visited_atleast_once[0].add("object_store_memory")
        if "memory" in resource_usage[1]:
            del resource_usage[1]["memory"]
            visited_atleast_once[1].add("memory")
        if "object_store_memory" in resource_usage[1]:
            del resource_usage[1]["object_store_memory"]
            visited_atleast_once[1].add("object_store_memory")
        for key in list(resource_usage[0].keys()):
            if key.startswith("node:"):
                del resource_usage[0][key]
                visited_atleast_once[0].add("node:")
        for key in list(resource_usage[1].keys()):
            if key.startswith("node:"):
                del resource_usage[1][key]
                visited_atleast_once[1].add("node:")
        if expected_resource_usage is None:
            if all(x for x in resource_usage[0:]):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)
            if timeout <= 0:
                raise ValueError("Timeout. {} != {}".format(
                    resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    assert visited_atleast_once[0] == {
        "memory", "object_store_memory", "node:"
    }
    assert visited_atleast_once[0] == visited_atleast_once[1]

    remove_placement_group(pg)

    return resource_usage
def _train(
    dtrain,
    num_actors,
    params: Dict,
    *args,
    evals=(),
    **kwargs,
):
    s = time.time()

    X_row_parts, y_row_parts = dtrain

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_train"
    )

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    actors, pg = create_actors(num_actors)

    add_as_eval_method = None
    if evals:
        for (eval_data, method) in evals[:]:
            if eval_data is dtrain:
                add_as_eval_method = method
                evals.remove((eval_data, method))

        for ((eval_X, eval_y), eval_method) in evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method
                ),
                eval_X,
                y_parts=eval_y,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method
        ),
        X_row_parts,
        y_parts=y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we
        # just return the first one.
        result = ray.get(fut[0])
        remove_placement_group(pg)
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
def _train(
    dtrain,
    num_actors,
    params: Dict,
    *args,
    evals=(),
    **kwargs,
):
    """
    Run distributed training of XGBoost model on Ray backend.

    During execution, it evenly distributes `dtrain` between workers
    according to the IP addresses of the partitions (if `dtrain` is not
    evenly distributed across nodes, some partitions will be
    re-distributed between nodes), runs xgb.train on each worker for its
    subset of `dtrain`, and reduces the training results of each worker
    using the Rabit Context.

    Parameters
    ----------
    dtrain : modin.experimental.xgboost.DMatrix
        Data to be trained against.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during
        training. Validation metrics will help us track the performance of
        the model.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with trained booster and dict of evaluation results
        as {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_train")

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    actors, pg = create_actors(num_actors)

    add_as_eval_method = None
    if evals:
        for (eval_data, method) in evals[:]:
            if eval_data is dtrain:
                add_as_eval_method = method
                evals.remove((eval_data, method))

        for ((eval_X, eval_y), eval_method) in evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method),
                eval_X,
                y_parts=eval_y,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_parts=y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we
        # just return the first one.
        result = ray.get(fut[0])
        remove_placement_group(pg)
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
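All of the examples above follow the same basic lifecycle: create a placement group, wait for it to be scheduled, run tasks or actors against it, and finally call remove_placement_group to return the reserved resources to the cluster. As a point of reference, a minimal, self-contained sketch of that pattern is shown below; it assumes a local Ray cluster with at least one free CPU and uses the same pre-2.0 `placement_group=` task option style as the snippets above.

import ray
from ray.util.placement_group import placement_group, remove_placement_group

ray.init()


@ray.remote(num_cpus=1)
def hello():
    return "hello"


# Reserve a single one-CPU bundle and block until it is actually placed.
pg = placement_group([{"CPU": 1}], strategy="PACK")
ray.get(pg.ready())

# Schedule a task into the reserved bundle.
print(ray.get(hello.options(placement_group=pg).remote()))

# Release the reserved resources once the work is done.
remove_placement_group(pg)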