def choose_trial_to_run(
    self, trial_runner: "trial_runner.TrialRunner"
) -> Optional[Trial]:
    # Prefer trials that have never started (PENDING) over paused ones.
    for trial in trial_runner.get_trials():
        if (
            trial.status == Trial.PENDING
            and trial_runner.has_resources_for_trial(trial)
        ):
            return trial
    for trial in trial_runner.get_trials():
        if (
            trial.status == Trial.PAUSED
            and trial_runner.has_resources_for_trial(trial)
        ):
            return trial
    return None
def choose_trial_to_run(
    self, trial_runner: "trial_runner.TrialRunner"
) -> Optional[Trial]:
    """Ensures all trials get fair share of time (as defined by time_attr).

    This enables the PBT scheduler to support a greater number of
    concurrent trials than can fit in the cluster at any given time.
    """
    candidates = []
    for trial in trial_runner.get_trials():
        if (
            trial.status in [Trial.PENDING, Trial.PAUSED]
            and trial_runner.trial_executor.has_resources_for_trial(trial)
        ):
            if not self._synch:
                candidates.append(trial)
            elif (
                self._trial_state[trial].last_train_time
                < self._next_perturbation_sync
            ):
                candidates.append(trial)
    candidates.sort(key=lambda trial: self._trial_state[trial].last_train_time)
    return candidates[0] if candidates else None
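# A minimal standalone sketch (illustrative names only, not part of the
# scheduler) of the fair-share ordering used above: candidates are sorted by
# last_train_time, so the trial that has trained the least is scheduled next.
from dataclasses import dataclass

@dataclass
class _State:
    last_train_time: float

candidates = {"trial_a": _State(30.0), "trial_b": _State(10.0), "trial_c": _State(20.0)}
ordered = sorted(candidates, key=lambda name: candidates[name].last_train_time)
print(ordered[0])  # -> "trial_b", the least-trained trial runs next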
def on_trial_result(
    self, trial_runner: "trial_runner.TrialRunner", trial: Trial, result: Dict
) -> str:
    trials = [
        trial
        for trial in trial_runner.get_trials()
        if trial.status in (Trial.RUNNING, Trial.PENDING)
    ]
    if self._should_realloc and len(self._allocs) == 0:
        in_use_pgs = [
            pg.to_dict()
            for pg in trial_runner.trial_executor._pg_manager._in_use_pgs
        ]
        consumed_resources = pgs_to_resources(in_use_pgs)
        nodes = config.nodes(consumed_resources)
        self._allocs, _ = self._allocator.allocate(trials, nodes)

    alloc = self._allocs.pop(trial.trial_id, None)
    if alloc is None:
        # No change in allocation for this Trial
        return TrialScheduler.CONTINUE
    if alloc == [] and trial.status == Trial.RUNNING:
        # Pause only if the trial is running
        trial.pause(trial_runner)
        return TrialScheduler.PAUSE
    elif alloc != trial.allocation:
        trial = AdaptDLTrial.create_from(trial, trial_runner, alloc, copy_state=True)
        # Stop the old trial that's being replaced
        return TrialScheduler.STOP
    return TrialScheduler.CONTINUE
def _on_insufficient_samples(
    self, trial_runner: "trial_runner.TrialRunner", trial: Trial, time: float
) -> str:
    # Pause only if this trial has run for at least self._min_time_slice
    # since its last pause and other trials are waiting for resources
    # (PENDING or PAUSED).
    pause = time - self._last_pause[trial] > self._min_time_slice
    pause = pause and [
        t
        for t in trial_runner.get_trials()
        if t.status in (Trial.PENDING, Trial.PAUSED)
    ]
    return TrialScheduler.PAUSE if pause else TrialScheduler.CONTINUE
def choose_trial_to_run(
    self, trial_runner: "trial_runner.TrialRunner"
) -> Optional[Trial]:
    for trial in trial_runner.get_trials():
        if (
            trial.status == Trial.PENDING
            and trial_runner.trial_executor.has_resources_for_trial(trial)
        ):
            return trial
    for trial in trial_runner.get_trials():
        if (
            trial.status == Trial.PAUSED
            and trial_runner.trial_executor.has_resources_for_trial(trial)
            and len(self._allocs) == 0
        ):
            # Note: this puts the trial back to RUNNING, we allow Trials to
            # resume when the allocation cache is empty and we reach a sync
            # point.
            return AdaptDLTrial.create_from(
                trial,
                trial_runner,
                self._allocator.default_allocation(),
                copy_state=True,
            )
    return None
def choose_trial_to_run(
    self, trial_runner: "trial_runner.TrialRunner", allow_recurse: bool = True
) -> Optional[Trial]:
    """Fair scheduling within iteration by completion percentage.

    List of trials not used since all trials are tracked as state
    of scheduler. If iteration is occupied (ie, no trials to run),
    then look into next iteration.
    """
    for hyperband in self._hyperbands:
        # band will have None entries if no resources
        # are to be allocated to that bracket.
        scrubbed = [b for b in hyperband if b is not None]
        for bracket in scrubbed:
            for trial in bracket.current_trials():
                if (
                    trial.status == Trial.PENDING
                    and trial_runner.trial_executor.has_resources_for_trial(trial)
                ):
                    return trial
    # MAIN CHANGE HERE!
    if not any(t.status == Trial.RUNNING for t in trial_runner.get_trials()):
        for hyperband in self._hyperbands:
            for bracket in hyperband:
                if bracket and any(
                    trial.status == Trial.PAUSED
                    for trial in bracket.current_trials()
                ):
                    # This will change the trial state
                    self._process_bracket(trial_runner, bracket)

                    # If there are pending trials now, suggest one.
                    # This is because there might be both PENDING and
                    # PAUSED trials now, and PAUSED trials will raise
                    # an error before the trial runner tries again.
                    if allow_recurse and any(
                        trial.status == Trial.PENDING
                        for trial in bracket.current_trials()
                    ):
                        return self.choose_trial_to_run(
                            trial_runner, allow_recurse=False
                        )
    # MAIN CHANGE HERE!
    return None
def on_trial_result(
    self, trial_runner: "trial_runner.TrialRunner", trial: Trial, result: Dict
) -> str:
    if self._time_attr not in result:
        time_missing_msg = (
            "Cannot find time_attr {} in trial result {}. "
            "Make sure that this attribute is returned in the "
            "results of your Trainable.".format(self._time_attr, result)
        )
        if self._require_attrs:
            raise RuntimeError(
                time_missing_msg
                + " If this error is expected, you can change this to "
                "a warning message by setting PBT(require_attrs=False)"
            )
        else:
            if log_once("pbt-time_attr-error"):
                logger.warning(time_missing_msg)

    if self._metric not in result:
        metric_missing_msg = (
            "Cannot find metric {} in trial result {}. "
            "Make sure that this attribute is returned in the "
            "results of your Trainable.".format(self._metric, result)
        )
        if self._require_attrs:
            raise RuntimeError(
                metric_missing_msg
                + " If this error is expected, you can change this to "
                "a warning message by setting PBT(require_attrs=False)"
            )
        else:
            if log_once("pbt-metric-error"):
                logger.warning(metric_missing_msg)

    if self._metric not in result or self._time_attr not in result:
        return TrialScheduler.CONTINUE

    time = result[self._time_attr]
    state = self._trial_state[trial]

    # Continue training if perturbation interval has not been reached yet.
    if time - state.last_perturbation_time < self._perturbation_interval:
        return TrialScheduler.CONTINUE  # avoid checkpoint overhead

    self._save_trial_state(state, time, result, trial)

    if not self._synch:
        state.last_perturbation_time = time
        lower_quantile, upper_quantile = self._quantiles()
        self._perturb_trial(trial, trial_runner, upper_quantile, lower_quantile)
        for trial in trial_runner.get_trials():
            if trial.status in [Trial.PENDING, Trial.PAUSED]:
                return TrialScheduler.PAUSE  # yield time to other trials
        return TrialScheduler.CONTINUE
    else:
        # Synchronous mode.
        if any(
            self._trial_state[t].last_train_time < self._next_perturbation_sync
            and t != trial
            for t in trial_runner.get_trials()
        ):
            logger.debug("Pausing trial {}".format(trial))
        else:
            # All trials are synced at the same timestep.
            lower_quantile, upper_quantile = self._quantiles()
            all_trials = trial_runner.get_trials()
            not_in_quantile = []
            for t in all_trials:
                if t not in lower_quantile and t not in upper_quantile:
                    not_in_quantile.append(t)
            # Move upper quantile trials to beginning and lower quantile
            # to end. This ensures that checkpointing of strong trials
            # occurs before exploiting of weaker ones.
            all_trials = upper_quantile + not_in_quantile + lower_quantile
            for t in all_trials:
                logger.debug("Perturbing Trial {}".format(t))
                self._trial_state[t].last_perturbation_time = time
                self._perturb_trial(t, trial_runner, upper_quantile, lower_quantile)

            all_train_times = [
                self._trial_state[trial].last_train_time
                for trial in trial_runner.get_trials()
            ]
            max_last_train_time = max(all_train_times)
            self._next_perturbation_sync = max(
                self._next_perturbation_sync + self._perturbation_interval,
                max_last_train_time,
            )
        # In sync mode we should pause all trials once result comes in.
        # Once a perturbation step happens for all trials, they should
        # still all be paused.
        # choose_trial_to_run will then pick the next trial to run out of
        # the paused trials.
        return TrialScheduler.PAUSE
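# A hedged usage sketch (assuming Ray Tune's public PBT API, not part of the
# code above): constructing PBT in synchronous mode, which exercises the
# `self._synch` branch of on_trial_result() so every trial is paused until all
# trials reach the same perturbation boundary.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    time_attr="training_iteration",  # must appear in every reported result
    perturbation_interval=5,         # perturb once all trials pass 5 iterations
    hyperparam_mutations={"lr": tune.loguniform(1e-4, 1e-1)},
    synch=True,                      # enables the synchronous branch above
)
# metric/mode can be passed to the scheduler or supplied via tune.TuneConfig.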