def default_resource_request(cls, config): cf = dict(cls._default_config, **config) return Resources( cpu=1 + cf["optimizer_config"]["num_replay_buffer_shards"], gpu=cf["gpu"] and 1 or 0, extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def __init__(self,
             scheduler=None,
             launch_web_server=False,
             server_port=TuneServer.DEFAULT_PORT):
    """Initializes a new TrialRunner.

    Args:
        scheduler (TrialScheduler): Defaults to FIFOScheduler.
        launch_web_server (bool): Flag for starting TuneServer
        server_port (int): Port number for launching TuneServer
    """
    self._scheduler_alg = scheduler or FIFOScheduler()
    self._trials = []
    self._running = {}
    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._resources_initialized = False
    # For debugging, it may be useful to halt trials after some time has
    # elapsed. TODO(ekl) consider exposing this in the API.
    self._global_time_limit = float(
        os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float('inf')))
    self._total_time = 0
    self._server = None
    if launch_web_server:
        self._server = TuneServer(self, server_port)
    self._stop_queue = []
def default_resource_request(cls, config): cf = merge_dicts(cls._default_config, config) return Resources( cpu=1, gpu=cf["num_gpus"], extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def testCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    path = trials[0].checkpoint()
    kwargs["restore_path"] = path

    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[1].runner.get_info.remote()), 1)
    self.addCleanup(os.remove, path)
def testPauseThenResume(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 2
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    trials[0].pause()
    self.assertEqual(trials[0].status, Trial.PAUSED)

    trials[0].resume()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.get_info.remote()), 1)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
def testMultiStepRun(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 5
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testFailureRecoveryMaxFailures(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(BasicVariantGenerator())
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 2,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[0].num_failures, 1)
    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[0].num_failures, 2)
    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 3)
def testResourceScheduler(self):
    ray.init(num_cpus=4, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {
            "training_iteration": 1
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.TERMINATED)
def default_resource_request(cls, config):
    cf = merge_dicts(cls._default_config, config)
    return Resources(
        cpu=1,
        gpu=0,
        extra_cpu=cf["num_workers"],
        extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)
def testStepHook(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner(BasicVariantGenerator())

    def on_step_begin(self):
        self._update_avail_resources()
        cnt = self.pre_step if hasattr(self, 'pre_step') else 0
        setattr(self, 'pre_step', cnt + 1)

    def on_step_end(self):
        # Count on_step_end invocations (reads post_step, not pre_step).
        cnt = self.post_step if hasattr(self, 'post_step') else 0
        setattr(self, 'post_step', 1 + cnt)

    import types
    runner.trial_executor.on_step_begin = types.MethodType(
        on_step_begin, runner.trial_executor)
    runner.trial_executor.on_step_end = types.MethodType(
        on_step_end, runner.trial_executor)

    kwargs = {
        "stopping_criterion": {
            "training_iteration": 5
        },
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.step()
    self.assertEqual(runner.trial_executor.pre_step, 1)
    self.assertEqual(runner.trial_executor.post_step, 1)
def default_resource_request(cls, config): cf = dict(cls._default_config, **config) return Resources( cpu=1, gpu=cf["gpu"] and 1 or 0, extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def default_resource_request(cls, config): cf = merge_dicts(cls._default_config, config) return Resources( cpu=1 + cf["optimizer"]["num_replay_buffer_shards"], gpu=cf["gpu"] and cf["gpu_fraction"] or 0, extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def _update_avail_resources(self):
    clients = ray.global_state.client_table()
    if ray.worker.global_worker.use_raylet:
        # TODO(rliaw): Remove once raylet flag is swapped
        num_cpus = sum(cl['Resources']['CPU'] for cl in clients)
        num_gpus = sum(cl['Resources'].get('GPU', 0) for cl in clients)
    else:
        local_schedulers = [
            entry for client in clients.values() for entry in client
            if (entry['ClientType'] == 'local_scheduler'
                and not entry['Deleted'])
        ]
        num_cpus = sum(ls['CPU'] for ls in local_schedulers)
        num_gpus = sum(ls.get('GPU', 0) for ls in local_schedulers)
    self._avail_resources = Resources(int(num_cpus), int(num_gpus))
    self._resources_initialized = True
def has_resources(self, resources): """Returns whether this runner has at least the specified resources.""" self._update_avail_resources() currently_available = Resources.subtract(self._avail_resources, self._committed_resources) have_space = ( resources.cpu_total() <= currently_available.cpu and resources.gpu_total() <= currently_available.gpu and all( resources.get_res_total(res) <= currently_available.get(res) for res in resources.custom_resources)) if have_space: return True can_overcommit = self._queue_trials if (resources.cpu_total() > 0 and currently_available.cpu <= 0) or \ (resources.gpu_total() > 0 and currently_available.gpu <= 0) or \ any((resources.get_res_total(res_name) > 0 and currently_available.get(res_name) <= 0) for res_name in resources.custom_resources): can_overcommit = False # requested resource is already saturated if can_overcommit: logger.warning( "Allowing trial to start even though the " "cluster does not have enough free resources. Trial actors " "may appear to hang until enough resources are added to the " "cluster (e.g., via autoscaling). You can disable this " "behavior by specifying `queue_trials=False` in " "ray.tune.run_experiments().") return True return False
def testTrialErrorOnStart(self):
    ray.init()
    _default_registry.register(TRAINABLE_CLASS, "asdf", None)
    trial = Trial("asdf", resources=Resources(1, 0))
    try:
        trial.start()
    except Exception as e:
        self.assertIn("a class", str(e))
def default_resource_request(cls, config): """Returns the resource requirement for the given configuration. This can be overriden by sub-classes to set the correct trial resource allocation, so the user does not need to. """ return Resources(cpu=1, gpu=0)
def _return_resources(self, resources):
    committed = self._committed_resources
    all_keys = set(resources.custom_resources).union(
        set(committed.custom_resources))

    custom_resources = {
        k: committed.get(k) - resources.get_res_total(k)
        for k in all_keys
    }
    self._committed_resources = Resources(
        committed.cpu - resources.cpu_total(),
        committed.gpu - resources.gpu_total(),
        custom_resources=custom_resources)

    assert self._committed_resources.is_nonnegative(), (
        "Resource invalid: {}".format(resources))
def __init__(self, i, config):
    self.trainable_name = "trial_{}".format(i)
    self.config = config
    self.experiment_tag = "{}tag".format(i)
    self.trial_name_creator = None
    self.logger_running = False
    self.restored_checkpoint = None
    self.resources = Resources(1, 0)
    self.custom_trial_name = None
def testTrialErrorOnStart(self):
    ray.init()
    trial_executor = RayTrialExecutor()
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trial = Trial("asdf", resources=Resources(1, 0))
    try:
        trial_executor.start_trial(trial)
    except Exception as e:
        self.assertIn("a class", str(e))
def default_resource_request(cls, config): cf = dict(cls._default_config, **config) Agent._validate_config(cf) return Resources( cpu=cf["num_cpus_for_driver"], gpu=cf["num_gpus"], extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] + cf["num_aggregation_workers"], extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def _update_avail_resources(self):
    clients = ray.global_state.client_table()
    local_schedulers = [
        entry for client in clients.values() for entry in client
        if (entry['ClientType'] == 'local_scheduler'
            and not entry['Deleted'])
    ]
    num_cpus = sum(ls['NumCPUs'] for ls in local_schedulers)
    num_gpus = sum(ls['NumGPUs'] for ls in local_schedulers)
    self._avail_resources = Resources(int(num_cpus), int(num_gpus))
def resources_to_json(resources):
    if resources is None:
        resources = Resources(cpu=1, gpu=0)
    return {
        "cpu": resources.cpu,
        "gpu": resources.gpu,
        "extra_cpu": resources.extra_cpu,
        "extra_gpu": resources.extra_gpu,
    }
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)
    Agent._validate_config(cf)
    # TODO(ekl): add custom resources here once tune supports them
    return Resources(
        cpu=cf["num_cpus_for_driver"],
        gpu=cf["num_gpus"],
        extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
        extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def default_resource_request(cls, config): cf = merge_dicts(cls._default_config, config) if cf["use_gpu_for_workers"]: num_gpus_per_worker = 1 else: num_gpus_per_worker = 0 return Resources(cpu=1, gpu=cf["gpu"] and 1 or 0, extra_cpu=cf["num_workers"], extra_gpu=num_gpus_per_worker * cf["num_workers"])
def _update_avail_resources(self, num_retries=5): for i in range(num_retries): resources = ray.global_state.cluster_resources() if not resources: logger.warning("Cluster resources not detected. Retrying...") time.sleep(0.5) if not resources or "CPU" not in resources: raise TuneError("Cluster resources cannot be detected. " "You can resume this experiment by passing in " "`resume=True` to `run_experiments`.") resources = resources.copy() num_cpus = resources.pop("CPU") num_gpus = resources.pop("GPU") custom_resources = resources self._avail_resources = Resources( int(num_cpus), int(num_gpus), custom_resources=custom_resources) self._resources_initialized = True
def json_to_resources(data):
    if type(data) is str:
        data = json.loads(data)
    for k in data:
        if k not in Resources._fields:
            raise TuneError(
                "Unknown resource type {}, must be one of {}".format(
                    k, Resources._fields))
    return Resources(
        data.get("cpu", 1), data.get("gpu", 0),
        data.get("driver_cpu_limit"), data.get("driver_gpu_limit"))
def _update_avail_resources(self):
    clients = ray.global_state.client_table()
    local_schedulers = [
        entry for client in clients.values() for entry in client
        if (entry['ClientType'] == 'local_scheduler'
            and not entry['Deleted'])
    ]
    num_cpus = sum(ls['CPU'] for ls in local_schedulers)
    num_gpus = sum(ls.get('GPU', 0) for ls in local_schedulers)
    self._avail_resources = Resources(int(num_cpus), int(num_gpus))
    self._resources_initialized = True
def __init__(self,
             search_alg,
             scheduler=None,
             launch_web_server=False,
             server_port=TuneServer.DEFAULT_PORT,
             verbose=True,
             queue_trials=False):
    """Initializes a new TrialRunner.

    Args:
        search_alg (SearchAlgorithm): SearchAlgorithm for generating
            Trial objects.
        scheduler (TrialScheduler): Defaults to FIFOScheduler.
        launch_web_server (bool): Flag for starting TuneServer
        server_port (int): Port number for launching TuneServer
        verbose (bool): Flag for verbosity. If False, trial results
            will not be output.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
    """
    self._search_alg = search_alg
    self._scheduler_alg = scheduler or FIFOScheduler()
    self._trials = []
    self._running = {}
    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._resources_initialized = False
    # For debugging, it may be useful to halt trials after some time has
    # elapsed. TODO(ekl) consider exposing this in the API.
    self._global_time_limit = float(
        os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float('inf')))
    self._total_time = 0
    self._server = None
    if launch_web_server:
        self._server = TuneServer(self, server_port)
    self._stop_queue = []
    self._verbose = verbose
    self._queue_trials = queue_trials
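# A hedged usage sketch, not taken from the source: driving a TrialRunner
# constructed with queue_trials=True, as the docstring above suggests for
# autoscaling clusters. BasicVariantGenerator and FIFOScheduler are the
# components used elsewhere in these snippets; the import paths assume the
# module layout of that era of Ray and TrialRunner is the class shown here.
from ray.tune.schedulers import FIFOScheduler
from ray.tune.suggest import BasicVariantGenerator

runner = TrialRunner(
    BasicVariantGenerator(),
    scheduler=FIFOScheduler(),
    queue_trials=True)
# Trials would be added via runner.add_trial(...) before stepping.
while not runner.is_finished():
    runner.step()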
def __init__(self,
             queue_trials=False,
             reuse_actors=False,
             refresh_period=RESOURCE_REFRESH_PERIOD):
    super(RayTrialExecutor, self).__init__(queue_trials)
    self._running = {}
    # Since trial resume after paused should not run
    # trial.train.remote(), thus no more new remote object id generated.
    # We use self._paused to store paused trials here.
    self._paused = {}
    self._reuse_actors = reuse_actors
    self._cached_actor = None

    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._resources_initialized = False
    self._refresh_period = refresh_period
    self._last_resource_refresh = float("-inf")
    self._last_nontrivial_wait = time.time()
    if ray.is_initialized():
        self._update_avail_resources()
def default_resource_request(cls, config): cf = dict(cls._default_config, **config) Agent._validate_config(cf) if cf["optimizer_class"] == "AsyncReplayOptimizer": extra = cf["optimizer"]["num_replay_buffer_shards"] else: extra = 0 return Resources( cpu=cf["num_cpus_for_driver"], gpu=cf["num_gpus"], extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] + extra, extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def _update_avail_resources(self, num_retries=5): for i in range(num_retries): resources = ray.global_state.cluster_resources() if not resources: logger.warning("Cluster resources not detected. Retrying...") time.sleep(0.5) num_cpus = resources["CPU"] num_gpus = resources["GPU"] self._avail_resources = Resources(int(num_cpus), int(num_gpus)) self._resources_initialized = True
def json_to_resources(data): if type(data) is str: data = json.loads(data) for k in data: if k in ["driver_cpu_limit", "driver_gpu_limit"]: raise TuneError( "The field `{}` is no longer supported. Use `extra_cpu` " "or `extra_gpu` instead.".format(k)) if k not in Resources._fields: raise TuneError( "Unknown resource type {}, must be one of {}".format( k, Resources._fields)) return Resources(data.get("cpu", 1), data.get("gpu", 0), data.get("extra_cpu", 0), data.get("extra_gpu", 0))
class TrialRunner(object):
    """A TrialRunner implements the event loop for scheduling trials on Ray.

    Example:
        runner = TrialRunner()
        runner.add_trial(Trial(...))
        runner.add_trial(Trial(...))
        while not runner.is_finished():
            runner.step()
            print(runner.debug_string())

    The main job of TrialRunner is scheduling trials to efficiently use
    cluster resources, without overloading the cluster.

    While Ray itself provides resource management for tasks and actors, this
    is not sufficient when scheduling trials that may instantiate multiple
    actors. This is because if insufficient resources are available,
    concurrent trials could deadlock waiting for new resources to become
    available. Furthermore, oversubscribing the cluster could degrade
    training performance, leading to misleading benchmark results.
    """

    def __init__(self,
                 scheduler=None,
                 launch_web_server=False,
                 server_port=TuneServer.DEFAULT_PORT):
        """Initializes a new TrialRunner.

        Args:
            scheduler (TrialScheduler): Defaults to FIFOScheduler.
            launch_web_server (bool): Flag for starting TuneServer
            server_port (int): Port number for launching TuneServer
        """
        self._scheduler_alg = scheduler or FIFOScheduler()
        self._trials = []
        self._running = {}
        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False
        # For debugging, it may be useful to halt trials after some time has
        # elapsed. TODO(ekl) consider exposing this in the API.
        self._global_time_limit = float(
            os.environ.get("TRIALRUNNER_WALLTIME_LIMIT", float('inf')))
        self._total_time = 0
        self._server = None
        if launch_web_server:
            self._server = TuneServer(self, server_port)
        self._stop_queue = []

    def is_finished(self):
        """Returns whether all trials have finished running."""
        if self._total_time > self._global_time_limit:
            print("Exceeded global time limit {} / {}".format(
                self._total_time, self._global_time_limit))
            return True

        for t in self._trials:
            if t.status in [Trial.PENDING, Trial.RUNNING, Trial.PAUSED]:
                return False
        return True

    def step(self):
        """Runs one step of the trial event loop.

        Callers should typically run this method repeatedly in a loop. They
        may inspect or modify the runner's state in between calls to step().
        """
        if self._can_launch_more():
            self._launch_trial()
        elif self._running:
            self._process_events()
        else:
            for trial in self._trials:
                if trial.status == Trial.PENDING:
                    if not self.has_resources(trial.resources):
                        raise TuneError(
                            ("Insufficient cluster resources to launch "
                             "trial: trial requested {} but the cluster "
                             "only has {} available.").format(
                                 trial.resources.summary_string(),
                                 self._avail_resources.summary_string()))
                elif trial.status == Trial.PAUSED:
                    raise TuneError(
                        "There are paused trials, but no more pending "
                        "trials with sufficient resources.")
            raise TuneError("Called step when all trials finished?")

        if self._server:
            self._process_requests()

            if self.is_finished():
                self._server.shutdown()

    def get_trial(self, tid):
        trial = [t for t in self._trials if t.trial_id == tid]
        return trial[0] if trial else None

    def get_trials(self):
        """Returns the list of trials managed by this TrialRunner.

        Note that the caller usually should not mutate trial state directly.
        """
        return self._trials

    def add_trial(self, trial):
        """Adds a new trial to this TrialRunner.

        Trials may be added at any time.
""" self._scheduler_alg.on_trial_add(self, trial) self._trials.append(trial) def debug_string(self, max_debug=MAX_DEBUG_TRIALS): """Returns a human readable message for printing to the console.""" messages = self._debug_messages() states = collections.defaultdict(set) limit_per_state = collections.Counter() for t in self._trials: states[t.status].add(t) # Show at most max_debug total, but divide the limit fairly while max_debug > 0: start_num = max_debug for s in states: if limit_per_state[s] >= len(states[s]): continue max_debug -= 1 limit_per_state[s] += 1 if max_debug == start_num: break for local_dir in sorted(set([t.local_dir for t in self._trials])): messages.append("Result logdir: {}".format(local_dir)) for state, trials in sorted(states.items()): limit = limit_per_state[state] messages.append("{} trials:".format(state)) for t in sorted( trials, key=lambda t: t.experiment_tag)[:limit]: messages.append(" - {}:\t{}".format(t, t.progress_string())) if len(trials) > limit: messages.append(" ... {} more not shown".format( len(trials) - limit)) return "\n".join(messages) + "\n" def _debug_messages(self): messages = ["== Status =="] messages.append(self._scheduler_alg.debug_string()) if self._resources_initialized: messages.append( "Resources used: {}/{} CPUs, {}/{} GPUs".format( self._committed_resources.cpu, self._avail_resources.cpu, self._committed_resources.gpu, self._avail_resources.gpu)) return messages def has_resources(self, resources): """Returns whether this runner has at least the specified resources.""" cpu_avail = self._avail_resources.cpu - self._committed_resources.cpu gpu_avail = self._avail_resources.gpu - self._committed_resources.gpu return resources.cpu <= cpu_avail and resources.gpu <= gpu_avail def _can_launch_more(self): self._update_avail_resources() trial = self._get_runnable() return trial is not None def _launch_trial(self, custom_trial=None): trial = custom_trial or self._get_runnable() self._commit_resources(trial.resources) try: trial.start() self._running[trial.train_remote()] = trial except Exception: error_msg = traceback.format_exc() print("Error starting runner, retrying:", error_msg) time.sleep(2) trial.stop(error=True, error_msg=error_msg) try: trial.start() self._running[trial.train_remote()] = trial except Exception: error_msg = traceback.format_exc() print("Error starting runner, abort:", error_msg) trial.stop(error=True, error_msg=error_msg) # note that we don't return the resources, since they may # have been lost def _process_events(self): [result_id], _ = ray.wait(list(self._running)) trial = self._running.pop(result_id) try: result = ray.get(result_id) self._total_time += result.time_this_iter_s if trial.should_stop(result): self._scheduler_alg.on_trial_complete(self, trial, result) decision = TrialScheduler.STOP else: decision = self._scheduler_alg.on_trial_result( self, trial, result) trial.update_last_result( result, terminate=(decision == TrialScheduler.STOP)) if decision == TrialScheduler.CONTINUE: if trial.should_checkpoint(): # TODO(rliaw): This is a blocking call trial.checkpoint() self._running[trial.train_remote()] = trial elif decision == TrialScheduler.PAUSE: self._pause_trial(trial) elif decision == TrialScheduler.STOP: self._stop_trial(trial) else: assert False, "Invalid scheduling decision: {}".format( decision) except Exception: error_msg = traceback.format_exc() print("Error processing event:", error_msg) if trial.status == Trial.RUNNING: if trial.has_checkpoint() and \ trial.num_failures < trial.max_failures: 
                    self._try_recover(trial, error_msg)
                else:
                    self._scheduler_alg.on_trial_error(self, trial)
                    self._stop_trial(trial, error=True, error_msg=error_msg)

    def _try_recover(self, trial, error_msg):
        try:
            print("Attempting to recover trial state from last checkpoint")
            trial.stop(error=True, error_msg=error_msg, stop_logger=False)
            trial.result_logger.flush()  # make sure checkpoint is synced
            trial.start()
            self._running[trial.train_remote()] = trial
        except Exception:
            error_msg = traceback.format_exc()
            print("Error recovering trial from checkpoint, abort:", error_msg)
            self._stop_trial(trial, error=True, error_msg=error_msg)

    def _get_runnable(self):
        return self._scheduler_alg.choose_trial_to_run(self)

    def _commit_resources(self, resources):
        self._committed_resources = Resources(
            self._committed_resources.cpu + resources.cpu,
            self._committed_resources.gpu + resources.gpu)

    def _return_resources(self, resources):
        self._committed_resources = Resources(
            self._committed_resources.cpu - resources.cpu,
            self._committed_resources.gpu - resources.gpu)
        assert self._committed_resources.cpu >= 0
        assert self._committed_resources.gpu >= 0

    def request_stop_trial(self, trial):
        self._stop_queue.append(trial)

    def _process_requests(self):
        while self._stop_queue:
            t = self._stop_queue.pop()
            self.stop_trial(t)

    def stop_trial(self, trial):
        """Stops trial.

        Trials may be stopped at any time. If trial is in state PENDING or
        PAUSED, calls `scheduler.on_trial_remove`. Otherwise waits for result
        for the trial and calls `scheduler.on_trial_complete` if RUNNING."""
        error = False
        error_msg = None

        if trial.status in [Trial.ERROR, Trial.TERMINATED]:
            return
        elif trial.status in [Trial.PENDING, Trial.PAUSED]:
            self._scheduler_alg.on_trial_remove(self, trial)
        elif trial.status is Trial.RUNNING:
            # NOTE: There should only be one...
            result_id = [
                rid for rid, t in self._running.items() if t is trial
            ][0]
            self._running.pop(result_id)
            try:
                result = ray.get(result_id)
                trial.update_last_result(result, terminate=True)
                self._scheduler_alg.on_trial_complete(self, trial, result)
            except Exception:
                error_msg = traceback.format_exc()
                print("Error processing event:", error_msg)
                self._scheduler_alg.on_trial_error(self, trial)
                error = True

        self._stop_trial(trial, error=error, error_msg=error_msg)

    def _stop_trial(self, trial, error=False, error_msg=None):
        """Only returns resources if resources allocated."""
        prior_status = trial.status
        trial.stop(error=error, error_msg=error_msg)
        if prior_status == Trial.RUNNING:
            self._return_resources(trial.resources)

    def _pause_trial(self, trial):
        """Only returns resources if resources allocated."""
        prior_status = trial.status
        trial.pause()
        if prior_status == Trial.RUNNING:
            self._return_resources(trial.resources)

    def _update_avail_resources(self):
        clients = ray.global_state.client_table()
        local_schedulers = [
            entry for client in clients.values() for entry in client
            if (entry['ClientType'] == 'local_scheduler'
                and not entry['Deleted'])
        ]
        num_cpus = sum(ls['CPU'] for ls in local_schedulers)
        num_gpus = sum(ls.get('GPU', 0) for ls in local_schedulers)
        self._avail_resources = Resources(int(num_cpus), int(num_gpus))
        self._resources_initialized = True