def test_resource_updater(ray_start_cluster):
    """Check when `_ResourceUpdater` refreshes its CPU/GPU totals.

    Covers four stages: before Ray is initialized, right after the first
    initialization, after a node is added while a long refresh period is
    still running, and with a refresh period of zero.

    Args:
        ray_start_cluster: Fixture providing the cluster object that nodes
            are added to.
    """

    def check_totals(updater, cpus, gpus):
        # CPUs are queried before GPUs, one assertion each.
        assert updater.get_num_cpus() == cpus
        assert updater.get_num_gpus() == gpus

    cluster = ray_start_cluster

    slow_updater = _ResourceUpdater(refresh_period=100)
    # Before initialization, all resources are 0.
    check_totals(slow_updater, cpus=0, gpus=0)

    cluster.add_node(num_cpus=1, num_gpus=2)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # The first query after Ray is initialized picks up the resources
    # immediately.
    check_totals(slow_updater, cpus=1, gpus=2)

    # A node added before "refresh_period" has elapsed is not reflected yet.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    check_totals(slow_updater, cpus=1, gpus=2)

    # A fresh updater with a zero refresh period sees both nodes ...
    eager_updater = _ResourceUpdater(refresh_period=0)
    check_totals(eager_updater, cpus=2, gpus=3)

    # ... and also sees a third node as soon as it has joined.
    cluster.add_node(num_cpus=1, num_gpus=1)
    cluster.wait_for_nodes()
    check_totals(eager_updater, cpus=3, gpus=4)
# Example #2
0
    def __init__(
        self,
        reuse_actors: bool = False,
        result_buffer_length: Optional[int] = None,
        refresh_period: Optional[float] = None,
    ):
        """Set up the executor's bookkeeping state and env-driven settings.

        Args:
            reuse_actors: Stored as ``self._reuse_actors``.
            result_buffer_length: Used as the result buffer length when
                truthy; otherwise the ``TUNE_RESULT_BUFFER_LENGTH``
                environment variable is used (default 1).
            refresh_period: Passed through to ``_ResourceUpdater``.
        """
        # Trial state caching.
        self._cached_trial_state = {}
        self._trials_to_cache = set()

        # Maps future --> (type, trial/pg).
        self._futures = {}

        # A forced-cleanup helper is only created when the env var holds a
        # non-zero value.
        cleanup_threshold_s = int(
            os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
        self._get_next_event_wait = int(
            os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5"))
        self._trial_cleanup = (
            _TrialCleanup(cleanup_threshold_s)
            if cleanup_threshold_s else None)

        self._resource_updater = _ResourceUpdater(refresh_period)

        # Actor reuse and placement group state.
        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        # maxlen is a placeholder here; it is updated when
        # `set_max_pending_trials()` is called.
        self._cached_actor_pg = deque(maxlen=1)
        self._pg_manager = _PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()
        self._trial_just_finished = False
        self._trial_just_finished_before = False
        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

        # Result buffering: an explicit (truthy) argument takes precedence
        # over the environment variable.
        if result_buffer_length:
            self._buffer_length = result_buffer_length
        else:
            self._buffer_length = int(
                os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
        self._buffer_min_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))