def _on_insufficient_samples(
    self, trial_runner: "trial_runner.TrialRunner", trial: Trial, time: float
) -> str:
    """Decide whether to pause ``trial`` when it has too few samples.

    The trial is paused only when both hold:
    (a) at least ``self._min_time_slice`` has elapsed since this trial
        was last paused, and
    (b) some live trial is PENDING or PAUSED, i.e. could use the slot
        that pausing would free up.

    Returns:
        ``TrialScheduler.PAUSE`` or ``TrialScheduler.CONTINUE``.
    """
    # Guard clause: time slice not used up yet - keep training. Checking
    # this first preserves the original short-circuit, so live trials are
    # only enumerated when a pause is actually possible.
    if time - self._last_pause[trial] <= self._min_time_slice:
        return TrialScheduler.CONTINUE
    # Pause only if another trial is actually waiting to run.
    trial_waiting = any(
        t.status in (Trial.PENDING, Trial.PAUSED)
        for t in trial_runner.get_live_trials()
    )
    return TrialScheduler.PAUSE if trial_waiting else TrialScheduler.CONTINUE
def example_resources_allocation_function(
    trial_runner: "trial_runner.TrialRunner",
    trial: Trial,
    result: Dict[str, Any],
    scheduler: "ResourceChangingScheduler",
) -> Optional[Union[PlacementGroupFactory, Resources]]:
    """This is a basic example of a resource allocating function.

    The function naively balances available CPUs over live trials.

    This function returns a new ``PlacementGroupFactory`` with updated
    resource requirements, or None. If the returned
    ``PlacementGroupFactory`` is equal by value to the one the
    trial has currently, the scheduler will skip the update process
    internally (same with None).

    See :class:`DistributeResources` for a more complex,
    robust approach.

    Args:
        trial_runner: Trial runner for this Tune run.
            Can be used to obtain information about other trials.
        trial: The trial to allocate new resources to.
        result: The latest results of trial.
        scheduler: The scheduler calling the function.
    """
    # Get base trial resources as defined in
    # ``tune.run(resources_per_trial)``.
    # Use the public accessor here for consistency with
    # ``DistributeResources.__call__``, instead of reaching into the
    # scheduler's private ``_base_trial_resources`` attribute.
    base_trial_resource = scheduler.base_trial_resources

    # Don't bother if this is just the first iteration
    if result["training_iteration"] < 1:
        return None

    # default values if resources_per_trial is unspecified
    if base_trial_resource is None:
        base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Assume that the number of CPUs cannot go below what was
    # specified in tune.run
    min_cpu = base_trial_resource.required_resources.get("CPU", 0)

    # Get the number of CPUs available in total (not just free)
    total_available_cpus = (
        trial_runner.trial_executor._resource_updater.get_num_cpus()
    )

    # Divide the total CPUs among all live trials. Guard against a
    # ZeroDivisionError in the (unexpected) case of no live trials.
    num_live_trials = max(len(trial_runner.get_live_trials()), 1)
    cpu_to_use = max(min_cpu, total_available_cpus // num_live_trials)

    # Assign new CPUs to the trial in a PlacementGroupFactory
    return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])
def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
                    trial: Trial, result: Dict) -> str:
    """Handle a new result for ``trial`` and return a scheduling decision.

    First verifies that the configured time attribute and metric are
    present in ``result`` (raising ``RuntimeError`` or warning once,
    depending on ``self._require_attrs``). Then decides whether to
    continue, pause, or checkpoint/exploit the trial based on the
    burn-in period and perturbation interval, with separate paths for
    asynchronous and synchronous (``self._synch``) operation.

    Returns:
        One of ``TrialScheduler.CONTINUE``, ``TrialScheduler.PAUSE``
        or ``TrialScheduler.NOOP``.
    """
    # Missing time attribute: hard-fail or warn once, depending on
    # whether require_attrs was set.
    if self._time_attr not in result:
        time_missing_msg = ("Cannot find time_attr {} "
                            "in trial result {}. Make sure that this "
                            "attribute is returned in the "
                            "results of your Trainable.".format(
                                self._time_attr, result))
        if self._require_attrs:
            raise RuntimeError(
                time_missing_msg +
                "If this error is expected, you can change this to "
                "a warning message by "
                "setting PBT(require_attrs=False)")
        else:
            if log_once("pbt-time_attr-error"):
                logger.warning(time_missing_msg)
    # Same check for the optimization metric.
    if self._metric not in result:
        metric_missing_msg = ("Cannot find metric {} in trial result {}. "
                              "Make sure that this attribute is returned "
                              "in the "
                              "results of your Trainable.".format(
                                  self._metric, result))
        if self._require_attrs:
            raise RuntimeError(
                metric_missing_msg + "If this error is expected, "
                "you can change this to a warning message by "
                "setting PBT(require_attrs=False)")
        else:
            if log_once("pbt-metric-error"):
                logger.warning(metric_missing_msg)

    # If we only warned above (require_attrs=False), there is nothing
    # to schedule on - just keep training.
    if self._metric not in result or self._time_attr not in result:
        return TrialScheduler.CONTINUE

    time = result[self._time_attr]
    state = self._trial_state[trial]

    # Continue training if burn-in period has not been reached, yet.
    if time < self._burn_in_period:
        return TrialScheduler.CONTINUE

    # Continue training if perturbation interval has not been reached, yet.
    if time - state.last_perturbation_time < self._perturbation_interval:
        return TrialScheduler.CONTINUE  # avoid checkpoint overhead

    self._save_trial_state(state, time, result, trial)

    if not self._synch:
        # Asynchronous mode: this trial perturbs on its own schedule.
        state.last_perturbation_time = time
        lower_quantile, upper_quantile = self._quantiles()
        decision = TrialScheduler.CONTINUE
        # If any trial is waiting to run, yield the slot by pausing.
        for other_trial in trial_runner.get_trials():
            if other_trial.status in [Trial.PENDING, Trial.PAUSED]:
                decision = TrialScheduler.PAUSE
                break
        self._checkpoint_or_exploit(trial, trial_runner.trial_executor,
                                    upper_quantile, lower_quantile)
        # NOOP if the exploit step paused the trial itself.
        return TrialScheduler.NOOP if trial.status == Trial.PAUSED else decision
    else:
        # Synchronous mode.
        if any(self._trial_state[t].last_train_time <
               self._next_perturbation_sync and t != trial
               for t in trial_runner.get_live_trials()):
            # Some other live trial has not reached the sync point yet.
            logger.debug("Pausing trial {}".format(trial))
        else:
            # All trials are synced at the same timestep.
            lower_quantile, upper_quantile = self._quantiles()
            all_trials = trial_runner.get_trials()
            not_in_quantile = []
            for t in all_trials:
                if t not in lower_quantile and t not in upper_quantile:
                    not_in_quantile.append(t)
            # Move upper quantile trials to beginning and lower quantile
            # to end. This ensures that checkpointing of strong trials
            # occurs before exploiting of weaker ones.
            all_trials = upper_quantile + not_in_quantile + lower_quantile
            for t in all_trials:
                logger.debug("Perturbing Trial {}".format(t))
                self._trial_state[t].last_perturbation_time = time
                self._checkpoint_or_exploit(t, trial_runner.trial_executor,
                                            upper_quantile, lower_quantile)

            # Advance the sync point by at least one interval, or to the
            # furthest-trained trial if it is already past that.
            all_train_times = [
                self._trial_state[t].last_train_time
                for t in trial_runner.get_trials()
            ]
            max_last_train_time = max(all_train_times)
            self._next_perturbation_sync = max(
                self._next_perturbation_sync + self._perturbation_interval,
                max_last_train_time,
            )
        # In sync mode we should pause all trials once result comes in.
        # Once a perturbation step happens for all trials, they should
        # still all be paused.
        # choose_trial_to_run will then pick the next trial to run out of
        # the paused trials.
        return (TrialScheduler.NOOP
                if trial.status == Trial.PAUSED else TrialScheduler.PAUSE)
def __call__(
    self,
    trial_runner: "trial_runner.TrialRunner",
    trial: Trial,
    result: Dict[str, Any],
    scheduler: "ResourceChangingScheduler",
) -> Optional[PlacementGroupFactory]:
    """Run resource allocation logic.

    Returns a new ``PlacementGroupFactory`` with updated
    resource requirements, or None. If the returned
    ``PlacementGroupFactory`` is equal by value to the one the
    trial has currently, the scheduler will skip the update process
    internally (same with None).

    Args:
        trial_runner: Trial runner for this Tune run.
            Can be used to obtain information about other trials.
        trial: The trial to allocate new resources to.
        result: The latest results of trial.
        scheduler: The scheduler calling the function.
    """
    # Base trial resources as defined in ``tune.run(resources_per_trial)``.
    base_resources = scheduler.base_trial_resources

    # Nothing to do if validation fails (e.g. too early in training).
    if not self._validate(base_trial_resource=base_resources, result=result):
        return None

    # Fall back to a single 1-CPU bundle when resources_per_trial
    # was left unspecified.
    if base_resources is None:
        base_resources = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Work out the per-step resource increment ("growth"):
    # explicit setting > last base bundle > 1 GPU > 1 CPU.
    if self.increase_by:
        growth = self.increase_by
        assert not self._is_bundle_empty(growth)
        assert growth.get("CPU", 0) >= 0 and growth.get("GPU", 0) >= 0
    elif self.add_bundles:
        growth = base_resources.bundles[-1]
    elif base_resources.bundles[0].get("GPU", 0):
        growth = {"GPU": 1}
    else:
        growth = {"CPU": 1}

    # Copy so downstream helpers cannot mutate the base definition.
    original_bundles = deepcopy(base_resources.bundles)

    cluster_cpus, cluster_gpus = self._get_total_available_resources(
        trial_runner=trial_runner
    )

    # Total CPUs/GPUs currently consumed across all live trials.
    live_trials = trial_runner.get_live_trials()
    usage_pairs = [self._get_used_cpus_and_gpus(t) for t in live_trials]
    used_cpus, used_gpus = map(sum, zip(*usage_pairs))

    extra_bundles = self._get_new_added_bundles(
        trial,
        live_trials,
        original_bundles,
        growth,
        cluster_cpus,
        cluster_gpus,
        used_cpus,
        used_gpus,
    )

    combined_bundles = self._add_two_bundles(
        original_bundles, extra_bundles, growth, False
    )

    # Preserve the head-bundle flag from the base resource request.
    new_pgf = PlacementGroupFactory(combined_bundles)
    new_pgf._head_bundle_is_empty = base_resources._head_bundle_is_empty
    return new_pgf