Пример #1
0
    def __init__(self,
                 reuse_actors: bool = False,
                 result_buffer_length: Optional[int] = None,
                 refresh_period: Optional[float] = None,
                 wait_for_placement_group: Optional[float] = None):
        super(RayTrialExecutor, self).__init__()
        self._running = {}
        # Since trial resume after paused should not run
        # trial.train.remote(), thus no more new remote object ref generated.
        # We use self._paused to store paused trials here.
        self._paused = {}

        force_trial_cleanup = int(
            os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
        self._trial_cleanup = _TrialCleanup(force_cleanup=force_trial_cleanup)
        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        # The maxlen will be updated when `set_max_pending_trials()` is called
        self._cached_actor_pg = deque(maxlen=1)

        self._avail_resources = Resources(cpu=0, gpu=0)
        self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()
        self._just_staged_trials = set()
        self._trial_just_finished = False
        self._trial_just_finished_before = False

        self._resources_initialized = False

        if refresh_period is None:
            refresh_period = float(
                os.environ.get("TUNE_STATE_REFRESH_PERIOD",
                               TUNE_STATE_REFRESH_PERIOD))
        self._refresh_period = refresh_period

        self._wait_for_pg = wait_for_placement_group or float(
            os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1"))
        if self._wait_for_pg < 0:
            self._wait_for_pg = None

        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

        self._default_buffer_length = result_buffer_length or int(
            os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1000))
        self._buffer_length = result_buffer_length

        self._buffer_min_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.))

        self._last_resource_refresh = float("-inf")
        self._last_ip_refresh = float("-inf")
        self._last_ip_addresses = set()
        self._last_nontrivial_wait = time.time()

        if ray.is_initialized():
            self._update_avail_resources()
Пример #2
0
    def __init__(
        self,
        reuse_actors: bool = False,
        result_buffer_length: Optional[int] = None,
        refresh_period: Optional[float] = None,
        wait_for_placement_group: Optional[float] = None,
    ):
        super(RayTrialExecutor, self).__init__()
        # future --> (type, trial/pg)
        self._futures = {}

        force_trial_cleanup = int(os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
        self._get_next_event_wait = int(
            os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5")
        )
        if force_trial_cleanup:
            self._trial_cleanup = _TrialCleanup(force_trial_cleanup)
        else:
            self._trial_cleanup = None
        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        # The maxlen will be updated when `set_max_pending_trials()` is called
        self._cached_actor_pg = deque(maxlen=1)

        self._avail_resources = Resources(cpu=0, gpu=0)
        self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()
        self._trial_just_finished = False
        self._trial_just_finished_before = False

        self._resources_initialized = False

        if refresh_period is None:
            refresh_period = float(
                os.environ.get("TUNE_STATE_REFRESH_PERIOD", TUNE_STATE_REFRESH_PERIOD)
            )
        self._refresh_period = refresh_period

        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5")
        )

        self._buffer_length = result_buffer_length or int(
            os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1)
        )

        self._buffer_min_time_s = float(os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0)
        )

        self._last_resource_refresh = float("-inf")
        self._last_ip_refresh = float("-inf")
        self._last_ip_addresses = set()
        self._last_nontrivial_wait = time.time()

        if ray.is_initialized():
            self._update_avail_resources()
Пример #3
0
    def __init__(self,
                 queue_trials: bool = False,
                 reuse_actors: bool = False,
                 refresh_period: Optional[float] = None,
                 wait_for_placement_group: Optional[float] = None):
        super(RayTrialExecutor, self).__init__(queue_trials)
        # Check for if we are launching a trial without resources in kick off
        # autoscaler.
        self._trial_queued = False
        self._running = {}
        # Since trial resume after paused should not run
        # trial.train.remote(), thus no more new remote object ref generated.
        # We use self._paused to store paused trials here.
        self._paused = {}

        self._trial_cleanup = _TrialCleanup()
        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        self._cached_actor_pg = (None, None)

        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()
        self._just_staged_trials = set()
        self._trial_just_finished = False
        self._trial_just_finished_before = False

        self._resources_initialized = False

        if refresh_period is None:
            refresh_period = float(
                os.environ.get("TUNE_STATE_REFRESH_PERIOD",
                               TUNE_STATE_REFRESH_PERIOD))
        self._refresh_period = refresh_period

        self._wait_for_pg = wait_for_placement_group or float(
            os.environ.get("TUNE_PLACEMENT_GROUP_WAIT_S", "-1"))
        if self._wait_for_pg < 0:
            self._wait_for_pg = None

        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

        self._buffer_length = int(os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1000))
        self._buffer_min_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.))

        self._last_resource_refresh = float("-inf")
        self._last_ip_refresh = float("-inf")
        self._last_ip_addresses = set()
        self._last_nontrivial_wait = time.time()

        if ray.is_initialized():
            self._update_avail_resources()
Пример #4
0
    def __init__(
        self,
        reuse_actors: bool = False,
        result_buffer_length: Optional[int] = None,
        refresh_period: Optional[float] = None,
    ):
        self._cached_trial_state = {}
        self._trials_to_cache = set()

        # future --> (type, trial/pg)
        self._futures = {}

        force_trial_cleanup = int(
            os.environ.get("TUNE_FORCE_TRIAL_CLEANUP_S", "0"))
        self._get_next_event_wait = int(
            os.environ.get("TUNE_GET_EXECUTOR_EVENT_WAIT_S", "5"))
        if force_trial_cleanup:
            self._trial_cleanup = _TrialCleanup(force_trial_cleanup)
        else:
            self._trial_cleanup = None

        self._resource_updater = _ResourceUpdater(refresh_period)

        self._has_cleaned_up_pgs = False
        self._reuse_actors = reuse_actors
        # The maxlen will be updated when `set_max_pending_trials()` is called
        self._cached_actor_pg = deque(maxlen=1)
        self._pg_manager = _PlacementGroupManager(prefix=get_tune_pg_prefix())
        self._staged_trials = set()
        self._trial_just_finished = False
        self._trial_just_finished_before = False
        self.last_pg_recon = 0
        self.pg_recon_interval = float(
            os.environ.get("TUNE_PLACEMENT_GROUP_RECON_INTERVAL", "5"))

        self._buffer_length = result_buffer_length or int(
            os.getenv("TUNE_RESULT_BUFFER_LENGTH", 1))
        self._buffer_min_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MIN_TIME_S", 0.0))
        self._buffer_max_time_s = float(
            os.getenv("TUNE_RESULT_BUFFER_MAX_TIME_S", 100.0))