def __init__(self,
             reuse_actors: bool = False,
             result_buffer_length: Optional[int] = None,
             refresh_period: Optional[float] = None,
             wait_for_placement_group: Optional[float] = None):
    """Initialize the executor's bookkeeping state and tunables.

    Args:
        reuse_actors: Stored as ``self._reuse_actors``.
        result_buffer_length: Stored unchanged as ``self._buffer_length``.
            When truthy it is also used as ``self._default_buffer_length``;
            otherwise that default is read from the
            ``TUNE_RESULT_BUFFER_LENGTH`` environment variable (1000 when
            unset).
        refresh_period: When ``None``, the value is read from the
            ``TUNE_STATE_REFRESH_PERIOD`` environment variable, falling
            back to the module-level ``TUNE_STATE_REFRESH_PERIOD``.
        wait_for_placement_group: When falsy, the value is read from the
            ``TUNE_PLACEMENT_GROUP_WAIT_S`` environment variable (``-1``
            when unset). A negative result is stored as ``None``.
    """
    super(RayTrialExecutor, self).__init__()

    self._running = {}
    # A trial resumed from pause does not call trial.train.remote() again,
    # so no new remote object ref is generated for it. Paused trials are
    # therefore tracked in self._paused.
    self._paused = {}

    cleanup_setting = os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0")
    self._trial_cleanup = _TrialCleanup(force_cleanup=int(cleanup_setting))
    self._has_cleaned_up_pgs = False

    self._reuse_actors = reuse_actors
    # maxlen is updated once `set_max_pending_trials()` is called.
    self._cached_actor_pg = deque(maxlen=1)

    self._avail_resources = Resources(cpu=0, gpu=0)
    self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
    self._staged_trials = set()
    self._just_staged_trials = set()
    self._trial_just_finished = False
    self._trial_just_finished_before = False
    self._resources_initialized = False

    # An explicit argument wins; otherwise consult the environment.
    if refresh_period is not None:
        self._refresh_period = refresh_period
    else:
        self._refresh_period = float(
            os.environ.get("TUNE_STATE_REFRESH_PERIOD",
                           TUNE_STATE_REFRESH_PERIOD))

    # Falsy arguments (None or 0) defer to the environment; negative
    # values are normalized to None.
    pg_wait = wait_for_placement_group
    if not pg_wait:
        pg_wait = float(os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1"))
    self._wait_for_pg = None if pg_wait < 0 else pg_wait

    self.last_pg_recon = 0
    self.pg_recon_interval = float(
        os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

    if result_buffer_length:
        self._default_buffer_length = result_buffer_length
    else:
        self._default_buffer_length = int(
            os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1000))
    self._buffer_length = result_buffer_length
    self._buffer_min_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
    self._buffer_max_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))

    never = float("-inf")
    self._last_resource_refresh = never
    self._last_ip_refresh = never
    self._last_ip_addresses = set()
    self._last_nontrivial_wait = time.time()

    # Resource information can only be queried once Ray is up.
    if ray.is_initialized():
        self._update_avail_resources()
def __init__(
    self,
    reuse_actors: bool = False,
    result_buffer_length: Optional[int] = None,
    refresh_period: Optional[float] = None,
    wait_for_placement_group: Optional[float] = None,
):
    """Initialize the executor's bookkeeping state and tunables.

    Args:
        reuse_actors: Stored as ``self._reuse_actors``.
        result_buffer_length: When truthy, used as ``self._buffer_length``;
            otherwise the value is read from the
            ``TUNE_RESULT_BUFFER_LENGTH`` environment variable (1 when
            unset).
        refresh_period: When ``None``, the value is read from the
            ``TUNE_STATE_REFRESH_PERIOD`` environment variable, falling
            back to the module-level ``TUNE_STATE_REFRESH_PERIOD``.
        wait_for_placement_group: Accepted but not read anywhere in this
            constructor.
    """
    super(RayTrialExecutor, self).__init__()

    # future --> (type, trial/pg)
    self._futures = {}

    cleanup_after = int(os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
    self._get_next_event_wait = int(
        os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5")
    )
    # A cleanup helper is only created when the env setting is non-zero.
    self._trial_cleanup = _TrialCleanup(cleanup_after) if cleanup_after else None
    self._has_cleaned_up_pgs = False

    self._reuse_actors = reuse_actors
    # maxlen is updated once `set_max_pending_trials()` is called.
    self._cached_actor_pg = deque(maxlen=1)

    self._avail_resources = Resources(cpu=0, gpu=0)
    self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
    self._staged_trials = set()
    self._trial_just_finished = False
    self._trial_just_finished_before = False
    self._resources_initialized = False

    # An explicit argument wins; otherwise consult the environment.
    if refresh_period is not None:
        self._refresh_period = refresh_period
    else:
        self._refresh_period = float(
            os.environ.get("TUNE_STATE_REFRESH_PERIOD", TUNE_STATE_REFRESH_PERIOD)
        )

    self.last_pg_recon = 0
    self.pg_recon_interval = float(
        os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5")
    )

    if result_buffer_length:
        self._buffer_length = result_buffer_length
    else:
        self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
    self._buffer_min_time_s = float(os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
    self._buffer_max_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0)
    )

    never = float("-inf")
    self._last_resource_refresh = never
    self._last_ip_refresh = never
    self._last_ip_addresses = set()
    self._last_nontrivial_wait = time.time()

    # Resource information can only be queried once Ray is up.
    if ray.is_initialized():
        self._update_avail_resources()
def __init__(self,
             queue_trials: bool = False,
             reuse_actors: bool = False,
             refresh_period: Optional[float] = None,
             wait_for_placement_group: Optional[float] = None):
    """Initialize the executor's bookkeeping state and tunables.

    Args:
        queue_trials: Forwarded to the superclass constructor.
        reuse_actors: Stored as ``self._reuse_actors``.
        refresh_period: When ``None``, the value is read from the
            ``TUNE_STATE_REFRESH_PERIOD`` environment variable, falling
            back to the module-level ``TUNE_STATE_REFRESH_PERIOD``.
        wait_for_placement_group: When falsy, the value is read from the
            ``TUNE_PLACEMENT_GROUP_WAIT_S`` environment variable (``-1``
            when unset). A negative result is stored as ``None``.
    """
    super(RayTrialExecutor, self).__init__(queue_trials)

    # Flag used to check whether a trial is being launched without
    # resources, in order to kick off the autoscaler.
    self._trial_queued = False

    self._running = {}
    # A trial resumed from pause does not call trial.train.remote() again,
    # so no new remote object ref is generated for it. Paused trials are
    # therefore tracked in self._paused.
    self._paused = {}

    self._trial_cleanup = _TrialCleanup()
    self._has_cleaned_up_pgs = False

    self._reuse_actors = reuse_actors
    self._cached_actor_pg = (None, None)

    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
    self._staged_trials = set()
    self._just_staged_trials = set()
    self._trial_just_finished = False
    self._trial_just_finished_before = False
    self._resources_initialized = False

    # An explicit argument wins; otherwise consult the environment.
    if refresh_period is not None:
        self._refresh_period = refresh_period
    else:
        self._refresh_period = float(
            os.environ.get("TUNE_STATE_REFRESH_PERIOD",
                           TUNE_STATE_REFRESH_PERIOD))

    # Falsy arguments (None or 0) defer to the environment; negative
    # values are normalized to None.
    pg_wait = wait_for_placement_group
    if not pg_wait:
        pg_wait = float(os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1"))
    self._wait_for_pg = None if pg_wait < 0 else pg_wait

    self.last_pg_recon = 0
    self.pg_recon_interval = float(
        os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

    self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1000))
    self._buffer_min_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
    self._buffer_max_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))

    never = float("-inf")
    self._last_resource_refresh = never
    self._last_ip_refresh = never
    self._last_ip_addresses = set()
    self._last_nontrivial_wait = time.time()

    # Resource information can only be queried once Ray is up.
    if ray.is_initialized():
        self._update_avail_resources()
def __init__(
    self,
    reuse_actors: bool = False,
    result_buffer_length: Optional[int] = None,
    refresh_period: Optional[float] = None,
):
    """Initialize the executor's bookkeeping state and tunables.

    Args:
        reuse_actors: Stored as ``self._reuse_actors``.
        result_buffer_length: When truthy, used as ``self._buffer_length``;
            otherwise the value is read from the
            ``TUNE_RESULT_BUFFER_LENGTH`` environment variable (1 when
            unset).
        refresh_period: Passed through unchanged to ``_ResourceUpdater``.
    """
    self._cached_trial_state = {}
    self._trials_to_cache = set()

    # future --> (type, trial/pg)
    self._futures = {}

    cleanup_after = int(os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
    self._get_next_event_wait = int(
        os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5"))
    # A cleanup helper is only created when the env setting is non-zero.
    self._trial_cleanup = (
        _TrialCleanup(cleanup_after) if cleanup_after else None)

    self._resource_updater = _ResourceUpdater(refresh_period)
    self._has_cleaned_up_pgs = False

    self._reuse_actors = reuse_actors
    # maxlen is updated once `set_max_pending_trials()` is called.
    self._cached_actor_pg = deque(maxlen=1)

    self._pg_manager = _PlacementGroupManager(prefix=get_tune_pg_prefix())
    self._staged_trials = set()
    self._trial_just_finished = False
    self._trial_just_finished_before = False

    self.last_pg_recon = 0
    self.pg_recon_interval = float(
        os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

    if result_buffer_length:
        self._buffer_length = result_buffer_length
    else:
        self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
    self._buffer_min_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
    self._buffer_max_time_s = float(
        os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))