예제 #1
0
    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
        if app_id not in self._apps:
            return None

        local_app = self._apps[app_id]
        structured_error_msg = local_app.get_structured_error_msg()

        # check if the app is known to have finished
        if is_terminal(local_app.state):
            state = local_app.state
        else:
            running = False
            failed = False
            for replicas in local_app.role_replicas.values():
                for r in replicas:
                    running |= r.is_alive()
                    failed |= r.failed()

            if running:
                state = AppState.RUNNING
            elif failed:
                state = AppState.FAILED
            else:
                state = AppState.SUCCEEDED
            local_app.set_state(state)

        if is_terminal(local_app.state):
            local_app.close()

        resp = DescribeAppResponse()
        resp.app_id = app_id
        resp.structured_error_msg = structured_error_msg
        resp.state = state
        resp.num_restarts = 0
        return resp
예제 #2
0
    def _evict_lru(self) -> bool:
        """
        Evicts one least recently used element from the apps cache. LRU is defined as
        the oldest app in a terminal state (e.g. oldest finished app).

        Returns:
            ``True`` if an entry was evicted, ``False`` if no entries could be evicted
            (e.g. all apps are running)
        """
        lru_time = sys.maxsize
        lru_app_id = None
        for (app_id, app) in self._apps.items():
            if is_terminal(app.state):
                if app.last_updated <= lru_time:
                    lru_app_id = app_id

        if lru_app_id:
            # evict LRU finished app from the apps cache
            del self._apps[lru_app_id]

            log.debug(
                f"evicting app: {lru_app_id}, from local scheduler cache")
            return True
        else:
            log.debug(
                f"no apps evicted, all {len(self._apps)} apps are running")
            return False
예제 #3
0
    def wait(self, app_id: str, timeout=-1) -> Optional[DescribeAppResponse]:
        """
        Waits for the app to finish or raise TimeoutError upon timeout (in seconds).
        If no timeout is specified waits indefinitely.

        Returns:
            The last return value from ``describe()``
        """

        if timeout > 0:
            expiry = time.time()
            interval = timeout / 10
        else:
            expiry = sys.maxsize
            interval = 1

        while expiry > time.time():
            desc = self.describe(app_id)

            if desc is None:
                return None
            elif is_terminal(desc.state):
                return desc

            time.sleep(interval)

        raise TimeoutError(f"timed out waiting for app: {app_id} to finish")
예제 #4
0
    def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
        if app_id not in self._apps:
            return None

        local_app = self._apps[app_id]

        # check if the app has been known to have finished
        if is_terminal(local_app.state):
            state = local_app.state
        else:
            running = False
            failed = False
            for (_, procs) in local_app.role_procs.items():
                for p in procs:
                    running |= self._is_alive(p)
                    failed |= self._failed(p)

            if running:
                state = AppState.RUNNING
            elif failed:
                state = AppState.FAILED
                self._terminate(local_app)  # terminate danglers
            else:
                state = AppState.SUCCEEDED
            local_app.set_state(state)

        resp = DescribeAppResponse()
        resp.app_id = app_id
        resp.state = state
        resp.num_restarts = 0
        return resp
예제 #5
0
    def _evict_lru(self) -> bool:
        """
        Evicts one least recently used element from the apps cache. LRU is defined as
        the oldest app in a terminal state (e.g. oldest finished app).

        Returns:
            ``True`` if an entry was evicted, ``False`` if no entries could be evicted
            (e.g. all apps are running)
        """
        lru_time = sys.maxsize
        lru_app_id = None
        for (app_id, app) in self._apps.items():
            if is_terminal(app.state):
                if app.last_updated <= lru_time:
                    lru_app_id = app_id

        if lru_app_id:
            # evict LRU finished app from the apps cache
            # do not remove the app name from the ids map so that the ids
            # remain unique throughout the lifespan of this scheduler
            # for example if cache size == 1
            #     app_id1 = submit(app)
            #     app_id2 = submit(app) # app_id1 was evicted here
            #     app_id1 == "app.name_0"
            #     app_id2 == "app.name_1"
            del self._apps[lru_app_id]

            log.debug(
                f"evicting app: {lru_app_id}, from local scheduler cache")
            return True
        else:
            log.debug(
                f"no apps evicted, all {len(self._apps)} apps are running")
            return False
예제 #6
0
    def wait(
        self,
        app_id: str,
        scheduler: Optional[LocalScheduler] = None,
        timeout: float = 30,
    ) -> Optional[DescribeAppResponse]:
        """
        Waits for the app to finish or raise TimeoutError upon timeout (in seconds).
        If no timeout is specified waits indefinitely.

        Returns:
            The last return value from ``describe()``
        """
        scheduler_ = scheduler or self.scheduler

        interval = timeout / 100
        expiry = time.time() + timeout
        while expiry > time.time():
            desc = scheduler_.describe(app_id)

            if desc is None:
                return None
            elif is_terminal(desc.state):
                return desc

            time.sleep(interval)
        raise TimeoutError(f"timed out waiting for app: {app_id}")
예제 #7
0
 def _check_finished(self):
     # either the app (already finished) was evicted from the LRU cache
     # -- or -- the app reached a terminal state (and still in the cache)
     desc = self._scheduler.describe(self._app_id)
     if not desc or is_terminal(desc.state):
         self._app_finished = True
     else:
         self._app_finished = False