def guaranteed_lease_timeout(x, sem):
    """
    This function simulates a payload computation with some GIL
    locking in the beginning.

    To simulate this we will manually disable the refresh callback, i.e.
    all leases will eventually timeout. The function will only
    release/return once the "Event" is set, i.e. our observer is done.
    """
    sem.refresh_leases = False
    client = get_client()

    with sem:
        # This simulates a task which holds the GIL for longer than the
        # lease-timeout. This is twice the lease timeout to ensure that the
        # leases are actually timed out
        slowidentity(delay=0.2)

        assert sem._leases
        # Now the GIL is free again, i.e. we enable the callback again
        sem.refresh_leases = True
        sleep(0.1)

        # This is the poor man's Event.wait()
        while client.get_metadata("release") is not True:
            sleep(0.05)

        assert sem.get_value() >= 1
        return x
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """Get scheduler function

    There are various ways to specify the scheduler to use:

    1. Passing in scheduler= parameters
    2. Passing these into global configuration
    3. Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler
    to use from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(scheduler, "get"):
            return scheduler.get
        elif scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ("dask.distributed", "distributed"):
            from distributed.worker import get_client

            return get_client().get
        else:
            raise ValueError(
                "Expected one of [distributed, %s]"
                % ", ".join(sorted(named_schedulers))
            )
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get("scheduler", None):
        return get_scheduler(scheduler=config.get("scheduler", None))

    if config.get("get", None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, "key", False):
        from distributed.worker import get_worker

        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError(
                "Compute called on multiple collections with "
                "differing default schedulers. Please specify a "
                "`scheduler=` parameter explicitly in compute or "
                "globally with `dask.config.set`."
            )
        return get

    return None
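# A minimal usage sketch for get_scheduler above, assuming the module is
# importable as `dask.base` and that `named_schedulers` contains the usual
# "threads" and "synchronous" entries; the exact registry depends on what is
# installed, so treat these keys as illustrative.
from dask.base import get_scheduler

def my_get(dsk, keys, **kwargs):
    ...  # any callable passes through unchanged

assert get_scheduler(scheduler=my_get) is my_get
threaded_get = get_scheduler(scheduler="threads")   # resolved via named_schedulers
sync_get = get_scheduler(scheduler="synchronous")   # likewise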
def __setstate__(self, state):
    name, address = state
    try:
        client = get_client(address)
        assert client.scheduler.address == address
    except (AttributeError, AssertionError):
        client = Client(address, set_as_default=False)
    self.__init__(name=name, client=client)
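# A matching __getstate__ sketch for the __setstate__ above; it is an
# assumption (not shown in this section) that the object carries `self.name`
# and `self.client`, but the (name, scheduler_address) pair is exactly the
# shape that __setstate__ unpacks.
def __getstate__(self):
    return (self.name, self.client.scheduler.address)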
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """ Get scheduler function

    There are various ways to specify the scheduler to use:

    1. Passing in get= parameters (deprecated)
    2. Passing in scheduler= parameters
    3. Passing these into global configuration
    4. Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to
    use from those many options
    """
    if get is not None:
        if scheduler is not None:
            raise ValueError("Both get= and scheduler= provided. Choose one")
        warn_on_get(get)
        return get

    if scheduler is not None:
        if scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ('dask.distributed', 'distributed'):
            from distributed.worker import get_client
            return get_client().get
        else:
            raise ValueError("Expected one of [distributed, %s]"
                             % ', '.join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get('scheduler', None):
        return get_scheduler(scheduler=config.get('scheduler', None))

    if config.get('get', None):
        warn_on_get(config.get('get', None))
        return config.get('get', None)

    if getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `set_options`.")
        return get

    return None
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """ Get scheduler function

    There are various ways to specify the scheduler to use:

    1. Passing in scheduler= parameters
    2. Passing these into global configuration
    3. Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler to
    use from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(scheduler, 'get'):
            return scheduler.get
        elif scheduler.lower() in named_schedulers:
            return named_schedulers[scheduler.lower()]
        elif scheduler.lower() in ('dask.distributed', 'distributed'):
            from distributed.worker import get_client
            return get_client().get
        elif scheduler.lower() in ['processes', 'multiprocessing']:
            raise ValueError("Please install cloudpickle to use the '%s' scheduler."
                             % scheduler)
        else:
            raise ValueError("Expected one of [distributed, %s]"
                             % ', '.join(sorted(named_schedulers)))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get('scheduler', None):
        return get_scheduler(scheduler=config.get('scheduler', None))

    if config.get('get', None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, 'key', False):
        from distributed.worker import get_worker
        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "`scheduler=` parameter explicitly in compute or "
                             "globally with `dask.config.set`.")
        return get

    return None
@contextmanager
def worker_client(timeout=None, separate_thread=True):
    """Get client for this thread

    This context manager is intended to be called within functions that we run
    on workers. When run as a context manager it delivers a client
    ``Client`` object that can submit other tasks directly from that worker.

    Parameters
    ----------
    timeout : Number or String
        Timeout after which to error out. Defaults to the
        ``distributed.comm.timeouts.connect`` configuration value.
    separate_thread : bool, optional
        Whether to run this function outside of the normal thread pool
        defaults to True

    Examples
    --------
    >>> def func(x):
    ...     with worker_client(timeout="10s") as c:  # connect from worker back to scheduler
    ...         a = c.submit(inc, x)     # this task can submit more tasks
    ...         b = c.submit(dec, x)
    ...         result = c.gather([a, b])  # and gather results
    ...     return result

    >>> future = client.submit(func, 1)  # submit func(1) on cluster

    See Also
    --------
    get_worker
    get_client
    secede
    """
    if timeout is None:
        timeout = dask.config.get("distributed.comm.timeouts.connect")
    timeout = dask.utils.parse_timedelta(timeout, "s")

    worker = get_worker()
    client = get_client(timeout=timeout)
    if separate_thread:
        duration = time() - thread_state.start_time
        secede()  # have this thread secede from the thread pool
        worker.loop.add_callback(
            worker.transition,
            worker.tasks[thread_state.key],
            "long-running",
            stimulus_id=f"worker-client-secede-{time()}",
            compute_duration=duration,
        )

    yield client

    if separate_thread:
        rejoin()
def __init__(
    self,
    max_leases=1,
    name=None,
    register=True,
    scheduler_rpc=None,
    loop=None,
):
    try:
        worker = get_worker()
        self.scheduler = scheduler_rpc or worker.scheduler
        self.loop = loop or worker.loop
    except ValueError:
        client = get_client()
        self.scheduler = scheduler_rpc or client.scheduler
        self.loop = loop or client.io_loop

    self.name = name or "semaphore-" + uuid.uuid4().hex
    self.max_leases = max_leases
    self.id = uuid.uuid4().hex
    self._leases = deque()

    self.refresh_leases = True

    self._registered = None
    if register:
        self._registered = self.register()

    # this should give ample time to refresh without introducing another
    # config parameter since this *must* be smaller than the timeout anyhow
    refresh_leases_interval = (
        parse_timedelta(
            dask.config.get("distributed.scheduler.locks.lease-timeout"),
            default="s",
        )
        / 5
    )
    pc = PeriodicCallback(
        self._refresh_leases, callback_time=refresh_leases_interval * 1000
    )
    self.refresh_callback = pc

    # Need to start the callback using IOLoop.add_callback to ensure that the
    # PC uses the correct event loop.
    self.loop.add_callback(pc.start)
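# A worked example of the lease-refresh arithmetic above, assuming the stock
# default of "30s" for "distributed.scheduler.locks.lease-timeout": dividing
# by 5 yields a 6 s refresh interval, i.e. five refresh attempts per lease
# lifetime, converted to milliseconds for PeriodicCallback.
from dask.utils import parse_timedelta

interval = parse_timedelta("30s", default="s") / 5  # 6.0 seconds
assert interval * 1000 == 6000.0                    # callback_time in ms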
def __init__(self, storage=None, name: str = None, client: Client = None):
    self.name = name or f"dask-storage-{uuid.uuid4().hex}"
    self.client = client or get_client()
    if self.client.asynchronous or getattr(
        thread_state, "on_event_loop_thread", False
    ):

        async def _register():
            await self.client.run_on_scheduler(
                register_with_scheduler, storage=storage, name=self.name
            )
            return self

        self._started = asyncio.ensure_future(_register())
    else:
        self.client.run_on_scheduler(
            register_with_scheduler, storage=storage, name=self.name
        )
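# A usage sketch for the constructor above. The class name `DaskStorage` is
# an assumption made for illustration; only the signature and the sync/async
# registration split come from the code itself.
from distributed import Client

client = Client()                                      # synchronous client
store = DaskStorage(name="demo-store", client=client)  # registers immediately

async def main():
    aclient = await Client(asynchronous=True)
    astore = DaskStorage(name="demo-store", client=aclient)
    await astore._started  # registration future resolves on the event loop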
def observe_state(sem):
    """
    This function is 100% artificial and acts as an observer to verify
    our assumptions.

    The function will wait until both payload tasks are executing, i.e.
    we're in an oversubscription scenario. It will then try to acquire
    and hopefully fail, showing that the semaphore is protected if the
    oversubscription is recognized.
    """
    sem.refresh_callback.stop()
    # We wait until we're in an oversubscribed state, i.e. both tasks
    # are executed although there should only be one allowed
    while not sem.get_value() > 1:
        sleep(0.2)

    # Once we're in an oversubscribed state, we must not be able to
    # acquire a lease.
    assert not sem.acquire(timeout=0)

    client = get_client()
    client.set_metadata("release", True)
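# A wiring sketch for the two test helpers above (guaranteed_lease_timeout and
# observe_state), assuming a running `client`, a short configured lease
# timeout, and enough workers to oversubscribe; the exact test harness is not
# shown in this section.
from distributed import Semaphore

sem = Semaphore(max_leases=1, name="oversubscription-demo")

# Two payloads compete for a single lease; both end up running once their
# leases time out while the GIL is held.
payloads = client.map(guaranteed_lease_timeout, [1, 2], sem=sem)

# The observer waits for the oversubscribed state, asserts that acquire()
# fails, then sets the "release" flag the payloads poll for.
observer = client.submit(observe_state, sem)
client.gather(payloads + [observer])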
def __init__(self, cls, address, key, worker=None):
    super().__init__(key)
    self._cls = cls
    self._address = address
    self._future = None
    if worker:
        self._worker = worker
        self._client = None
    else:
        try:
            # TODO: `get_worker` may return the wrong worker instance for async local clusters (most tests)
            # when run outside of a task (when deserializing a key pointing to an Actor, etc.)
            self._worker = get_worker()
        except ValueError:
            self._worker = None
        try:
            self._client = get_client()
            self._future = Future(key, inform=self._worker is None)
            # ^ When running on a worker, only hold a weak reference to the
            # key, otherwise the key could become unreleasable.
        except ValueError:
            self._client = None
def get_scheduler(get=None, scheduler=None, collections=None, cls=None):
    """Get scheduler function

    There are various ways to specify the scheduler to use:

    1. Passing in scheduler= parameters
    2. Passing these into global configuration
    3. Using defaults of a dask collection

    This function centralizes the logic to determine the right scheduler
    to use from those many options
    """
    if get:
        raise TypeError(get_err_msg)

    if scheduler is not None:
        if callable(scheduler):
            return scheduler
        elif "Client" in type(scheduler).__name__ and hasattr(scheduler, "get"):
            return scheduler.get
        elif isinstance(scheduler, str):
            scheduler = scheduler.lower()

            if scheduler in named_schedulers:
                if config.get("scheduler", None) in ("dask.distributed", "distributed"):
                    warnings.warn(
                        "Running on a single-machine scheduler when a distributed client "
                        "is active might lead to unexpected results."
                    )
                return named_schedulers[scheduler]
            elif scheduler in ("dask.distributed", "distributed"):
                from distributed.worker import get_client

                return get_client().get
            else:
                raise ValueError(
                    "Expected one of [distributed, %s]"
                    % ", ".join(sorted(named_schedulers))
                )
        elif isinstance(scheduler, Executor):
            # Get `num_workers` from `Executor`'s `_max_workers` attribute.
            # If undefined, fallback to `config` or worst case CPU_COUNT.
            num_workers = getattr(scheduler, "_max_workers", None)
            if num_workers is None:
                num_workers = config.get("num_workers", CPU_COUNT)
            assert isinstance(num_workers, Integral) and num_workers > 0
            return partial(local.get_async, scheduler.submit, num_workers)
        else:
            raise ValueError("Unexpected scheduler: %s" % repr(scheduler))
        # else:  # try to connect to remote scheduler with this name
        #     return get_client(scheduler).get

    if config.get("scheduler", None):
        return get_scheduler(scheduler=config.get("scheduler", None))

    if config.get("get", None):
        raise ValueError(get_err_msg)

    if getattr(thread_state, "key", False):
        from distributed.worker import get_worker

        return get_worker().client.get

    if cls is not None:
        return cls.__dask_scheduler__

    if collections:
        collections = [c for c in collections if c is not None]
    if collections:
        get = collections[0].__dask_scheduler__
        if not all(c.__dask_scheduler__ == get for c in collections):
            raise ValueError(
                "Compute called on multiple collections with "
                "differing default schedulers. Please specify a "
                "`scheduler=` parameter explicitly in compute or "
                "globally with `dask.config.set`."
            )
        return get

    return None
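# A sketch of the Executor branch above, assuming `get_scheduler` is
# importable from `dask.base`: any concurrent.futures executor is adapted via
# its `submit` method, with parallelism read from `_max_workers`.
from concurrent.futures import ThreadPoolExecutor
from dask.base import get_scheduler

with ThreadPoolExecutor(max_workers=4) as pool:
    get = get_scheduler(scheduler=pool)  # partial(local.get_async, pool.submit, 4)
    # `get` now accepts the usual scheduler signature: get(dsk, keys, **kwargs)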