def on_no_available_trials(self, trial_runner):
    if self._queue_trials:
        return
    for trial in trial_runner.get_trials():
        if trial.uses_placement_groups:
            return
        if trial.status == Trial.PENDING:
            if not self.has_resources_for_trial(trial):
                resource_string = trial.resources.summary_string()
                trial_resource_help_msg = trial.get_trainable_cls(
                ).resource_help(trial.config)
                autoscaling_msg = ""
                if is_ray_cluster():
                    autoscaling_msg = (
                        "Pass `queue_trials=True` in ray.tune.run() or "
                        "on the command line to queue trials until the "
                        "cluster scales up or resources become available. ")
                raise TuneError(
                    "Insufficient cluster resources to launch trial: "
                    f"trial requested {resource_string}, but the cluster "
                    f"has only {self.resource_string()}. "
                    f"{autoscaling_msg}"
                    f"{trial_resource_help_msg} ")
        elif trial.status == Trial.PAUSED:
            raise TuneError("There are paused trials, but no more pending "
                            "trials with sufficient resources.")
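# The error raised above points users at `queue_trials=True`. A minimal,
# hypothetical sketch of that call from user code (the trainable and the
# resource request below are illustrative assumptions, not taken from this
# file):
from ray import tune


def _example_queue_trials():
    def my_trainable(config):
        # Trivial stand-in trainable for illustration only.
        tune.report(score=1)

    tune.run(
        my_trainable,
        resources_per_trial={"cpu": 4},  # assumed per-trial request
        queue_trials=True)  # queue trials until the cluster can satisfy them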
def _may_warn_insufficient_resources(self, all_trials):
    # This is approximately saying we are not making progress.
    if len(all_trials) == self._all_trials_size:
        if self._no_running_trials_since == -1:
            self._no_running_trials_since = time.monotonic()
        elif (time.monotonic() - self._no_running_trials_since >
              _get_insufficient_resources_warning_threshold()):
            if not is_ray_cluster():  # autoscaler not enabled
                # If any pending trial cannot be fulfilled, that is a good
                # enough hint that the requested trial resources are not
                # available.
                for trial in all_trials:
                    if (trial.status is Trial.PENDING
                            and not _can_fulfill_no_autoscaler(trial)):
                        # TODO(xwjiang):
                        #   Raise an Error once #18608 is resolved.
                        logger.warning(
                            _get_insufficient_resources_error_msg(trial))
                        break
            else:
                # TODO(xwjiang): #17799.
                #   Output a more helpful msg for autoscaler.
                logger.warning(_get_insufficient_resources_warning_msg())
            self._no_running_trials_since = time.monotonic()
    else:
        self._no_running_trials_since = -1
    self._all_trials_size = len(all_trials)
def on_no_available_trials(self, all_trials):
    """Tracks information across the life of the Tune loop and guesses
    whether the loop is stuck due to infeasible resources.

    If so, outputs warning messages. The logic should be conservative,
    non-intrusive and informative. For example, rate limiting is applied
    so that the messages are not spammy.
    """
    # This is approximately saying we are not making progress.
    if len(all_trials) == self._last_trial_num:
        if self._no_running_trials_since == -1:
            self._no_running_trials_since = time.monotonic()
        elif (time.monotonic() - self._no_running_trials_since >
              _get_insufficient_resources_warning_threshold()):
            if not is_ray_cluster():  # autoscaler not enabled
                # If any pending trial cannot be fulfilled, that is a good
                # enough hint that the requested trial resources are not
                # available.
                for trial in all_trials:
                    if (trial.status is Trial.PENDING
                            and not _can_fulfill_no_autoscaler(trial)):
                        # TODO(xwjiang):
                        #   Raise an Error once #18608 is resolved.
                        logger.warning(
                            _get_insufficient_resources_error_msg(trial))
                        break
            else:
                # TODO(xwjiang): #17799.
                #   Output a more helpful msg for autoscaler.
                logger.warning(_get_insufficient_resources_warning_msg())
            self._no_running_trials_since = time.monotonic()
    else:
        self._no_running_trials_since = -1
    self._last_trial_num = len(all_trials)
def _get_warning_threshold() -> float:
    if is_ray_cluster():
        return float(
            os.environ.get(
                "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER",
                "60"))
    else:
        return float(
            os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "1"))
def _get_insufficient_resources_warning_threshold() -> float:
    if is_ray_cluster():
        return float(
            os.environ.get(
                "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER",
                "60"))
    else:
        # Set the default to 10s so that we don't prematurely determine that
        # a cluster cannot fulfill the resources requirements.
        return float(
            os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S",
                           "10"))
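# The thresholds above are read from environment variables, so they can be
# raised from user code before the Tune loop starts. A minimal sketch; the
# 120-second value below is an arbitrary example, not a recommended setting:
import os


def _example_raise_warning_threshold():
    # Must be set before Tune starts checking for progress.
    os.environ["TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S"] = "120"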
def _get_insufficient_resources_warning_msg() -> str:
    msg = (
        f"No trial is running and no new trial has been started within"
        f" at least the last "
        f"{_get_insufficient_resources_warning_threshold()} seconds. "
        f"This could be due to the cluster not having enough "
        f"resources available to start the next trial. "
        f"Stop the tuning job and adjust the resources requested per trial "
        f"(possibly via `resources_per_trial` or via `num_workers` for rllib) "
        f"and/or add more resources to your Ray runtime.")
    if is_ray_cluster():
        return "Ignore this message if the cluster is autoscaling. " + msg
    else:
        return msg
def __init__(self,
             queue_trials=None,
             reuse_actors=False,
             ray_auto_init=None,
             refresh_period=RESOURCE_REFRESH_PERIOD):
    if queue_trials is None:
        if os.environ.get("TUNE_DISABLE_QUEUE_TRIALS") == "1":
            logger.info("'TUNE_DISABLE_QUEUE_TRIALS=1' detected.")
            queue_trials = False
        elif is_ray_cluster():
            queue_trials = True
    if ray_auto_init is None:
        if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1":
            logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.")
            ray_auto_init = False
        else:
            ray_auto_init = True
    super(RayTrialExecutor, self).__init__(queue_trials)
    # Whether a trial has been queued without resources in order to kick
    # off the autoscaler.
    self._trial_queued = False
    self._running = {}
    # A trial resumed from PAUSED should not call trial.train.remote()
    # again (no new remote object ref should be generated), so paused
    # trials are stored in self._paused.
    self._paused = {}
    self._trial_cleanup = _TrialCleanup()
    self._reuse_actors = reuse_actors
    self._cached_actor = None
    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._resources_initialized = False
    self._refresh_period = refresh_period
    self._last_resource_refresh = float("-inf")
    self._last_nontrivial_wait = time.time()
    if not ray.is_initialized() and ray_auto_init:
        logger.info("Initializing Ray automatically. "
                    "For cluster usage or custom Ray initialization, "
                    "call `ray.init(...)` before `tune.run`.")
        ray.init()
    if ray.is_initialized():
        self._update_avail_resources()
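# For illustration, how the environment toggles read in __init__ above are
# expected to interact with construction. This is a hypothetical usage sketch
# (the import path is assumed from the Ray 1.x layout), not a test from this
# file:
import os

from ray.tune.ray_trial_executor import RayTrialExecutor


def _example_executor_defaults():
    # Opt out of automatic ray.init() before constructing the executor.
    os.environ["TUNE_DISABLE_AUTO_INIT"] = "1"
    # Opt out of queueing trials, even when running on a multi-node cluster.
    os.environ["TUNE_DISABLE_QUEUE_TRIALS"] = "1"

    # Remaining arguments keep their defaults.
    return RayTrialExecutor(reuse_actors=True)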
def _get_warning_msg() -> str:
    if is_ray_cluster():
        return (
            f"If autoscaler is still scaling up, ignore this message. No "
            f"trial is running and no new trial has been started within at "
            f"least the last {_get_warning_threshold()} seconds. "
            f"This could be due to the cluster not having enough "
            f"resources available to start the next trial. Please stop the "
            f"tuning job and readjust the resources_per_trial argument "
            f"passed into tune.run() as well as the max_workers and "
            f"worker node InstanceType specified in cluster.yaml.")
    else:
        return (f"No trial is running and no new trial has been started "
                f"within at least the last {_get_warning_threshold()} "
                f"seconds. This could be due to the cluster not having "
                f"enough resources available to start the next trial. "
                f"Please stop the tuning job and readjust the "
                f"resources_per_trial argument passed into tune.run() "
                f"and/or start a cluster with more resources.")
def main():
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    num_samples = 1000
    results_per_second = 0.5
    trial_length_s = 100

    max_runtime = 120

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        max_runtime = 120

    timed_tune_run(
        name="result throughput cluster",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        sync_config=tune.SyncConfig(sync_to_driver=False))  # Tweak!
def main():
    ray.init(address="auto")

    num_samples = 1000

    sleep_time = 0.1
    num_iters = 300

    expected_run_time = num_iters * sleep_time

    # Allow a minimum of 20% overhead (or 10 seconds for short runs).
    expected_run_time += max(expected_run_time * 0.2, 10.)

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        expected_run_time += 0.3 * num_samples

    start_time = time.time()
    tune.run(
        my_naive_trainable,
        config={
            "score": tune.uniform(0., 1.),
            "num_iters": num_iters,
            "sleep_time": sleep_time
        },
        reuse_actors=True,
        verbose=2,
        num_samples=num_samples)
    time_taken = time.time() - start_time

    assert time_taken < expected_run_time, \
        f"The buffering test took {time_taken:.2f} seconds, but should not " \
        f"have exceeded {expected_run_time:.2f} seconds. Test failed."

    print(f"The buffering test took {time_taken:.2f} seconds, which "
          f"is below the budget of {expected_run_time:.2f} seconds. "
          f"Test successful.")