def test_resource_updater(ray_start_cluster):
    """Check when `_ResourceUpdater` refreshes its CPU/GPU counts.

    An updater with a long refresh period (100) is expected to report zero
    resources before Ray is initialized, pick up the cluster's resources on
    the first query after `ray.init`, and then keep reporting those values
    even after another node joins. An updater with a refresh period of 0 is
    expected to reflect every newly added node.
    """
    cluster = ray_start_cluster
    updater = _ResourceUpdater(refresh_period=100)

    # Nothing has been initialized yet, so every resource count is 0.
    assert (updater.get_num_cpus(), updater.get_num_gpus()) == (0, 0)

    cluster.add_node(num_cpus=1, num_gpus=2)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # The first query after Ray is initialized triggers an immediate update.
    assert (updater.get_num_cpus(), updater.get_num_gpus()) == (1, 2)

    # A node added before "refresh_period" has elapsed must not be visible.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    assert (updater.get_num_cpus(), updater.get_num_gpus()) == (1, 2)

    # With a refresh period of 0 the updater sees both existing nodes ...
    updater = _ResourceUpdater(refresh_period=0)
    assert (updater.get_num_cpus(), updater.get_num_gpus()) == (2, 3)

    # ... and also any node that joins afterwards.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    assert (updater.get_num_cpus(), updater.get_num_gpus()) == (3, 4)
def __init__(
    self,
    reuse_actors: bool = False,
    result_buffer_length: Optional[int] = None,
    refresh_period: Optional[float] = None,
):
    """Set up the executor's bookkeeping state and helper objects.

    Args:
        reuse_actors: Stored as ``self._reuse_actors``.
        result_buffer_length: Result buffer length. When falsy (``None``
            or ``0``), the ``TUNE_RESULT_BUFFER_LENGTH`` environment
            variable is used instead, defaulting to 1.
        refresh_period: Forwarded unchanged to ``_ResourceUpdater``.

    Several other settings are read from ``TUNE_*`` environment variables;
    a value that cannot be parsed as a number raises ``ValueError``.
    """
    # Plain bookkeeping containers and flags.
    self._cached_trial_state = {}
    self._trials_to_cache = set()
    # future --> (type, trial/pg)
    self._futures = {}
    self._staged_trials = set()
    self._has_cleaned_up_pgs = False
    self._trial_just_finished = False
    self._trial_just_finished_before = False
    self._reuse_actors = reuse_actors
    self.last_pg_recon = 0

    env = os.environ

    cleanup_grace_s = int(env.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
    self._get_next_event_wait = int(
        env.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5"))

    # A cleanup helper is only created when a non-zero value is configured.
    self._trial_cleanup = (
        _TrialCleanup(cleanup_grace_s) if cleanup_grace_s else None)

    self._resource_updater = _ResourceUpdater(refresh_period)

    # The maxlen will be updated when `set_max_pending_trials()` is called
    self._cached_actor_pg = deque(maxlen=1)
    self._pg_manager = _PlacementGroupManager(prefix=get_tune_pg_prefix())

    self.pg_recon_interval = float(
        env.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

    # An explicit (truthy) argument wins over the environment variable.
    if result_buffer_length:
        self._buffer_length = result_buffer_length
    else:
        self._buffer_length = int(env.get("TUNE_RESULT_BUFFER_LENGTH", 1))

    self._buffer_min_time_s = float(
        env.get("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
    self._buffer_max_time_s = float(
        env.get("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))