def test_Future_knows_status_immediately(c, s, a, b):
    """A future read back from a ``Variable`` already carries its status.

    A second client fetches futures published by the first one and checks
    that ``status`` is populated without waiting: ``'finished'`` for
    scattered data and ``'error'`` for a failed task.  The concrete
    ``ZeroDivisionError`` is allowed up to five seconds to surface.
    """
    payload = yield c.scatter(123)
    publisher = Variable('x')
    yield publisher.set(payload)

    # An independent client looks the variable up by name.
    second_client = yield Client(s.address, asynchronous=True)
    subscriber = Variable('x', client=second_client)
    seen = yield subscriber.get()
    assert seen.status == 'finished'

    # Replace the stored value with a task that has already failed.
    payload = c.submit(div, 1, 0)
    yield wait(payload)
    yield publisher.set(payload)

    erred = yield subscriber.get()
    assert erred.status == 'error'
    with pytest.raises(Exception):
        yield erred

    began = time()
    # we learn about the true error eventually
    while True:
        try:
            yield erred
        except ZeroDivisionError:
            break
        except Exception:
            assert time() < began + 5
            yield gen.sleep(0.05)

    yield second_client.close()
def test_sync(client):
    """Two synchronous ``Variable`` handles with the same name share a value."""
    submitted = client.submit(lambda x: x + 1, 10)
    writer = Variable('x')
    reader = Variable('x')

    writer.set(submitted)
    fetched = reader.get()

    assert fetched.result() == 11
def test_future_erred_sync(client):
    """A failed future stored in a ``Variable`` re-raises its error on read."""
    failing = client.submit(div, 1, 0)
    holder = Variable()
    holder.set(failing)

    sleep(0.1)

    fetched = holder.get()
    with pytest.raises(ZeroDivisionError):
        fetched.result()
def f(i):
    """Repeatedly increment the shared variable ``'x'`` from inside a task.

    ``NITERS`` is not defined here; it must be supplied by the enclosing
    scope.  The argument ``i`` is not used by the body.

    Returns the value read from the variable after the last iteration.
    """
    with worker_client() as wc:
        shared = Variable('x', client=wc)
        for _ in range(NITERS):
            current_future = shared.get()
            current = current_future.result()
            bumped = wc.submit(inc, current)
            shared.set(bumped)
            # Random jitter between rounds.
            sleep(0.01 * random.random())
        final_value = shared.get().result()
        sleep(0.1)  # allow fire-and-forget messages to clear
        return final_value
def test_race(c, s, *workers):
    """Many tasks racing on one ``Variable`` still make forward progress.

    Fifteen tasks each perform ``NITERS`` read-increment-write rounds on
    the shared variable.  Every task must finally observe a value above
    80% of ``NITERS``, and within two seconds the scheduler must report
    exactly one entry in ``wants_what['variable-x']``.
    """
    NITERS = 50

    def f(i):
        # Runs on a worker: bump the shared variable NITERS times.
        with worker_client() as wc:
            shared = Variable('x', client=wc)
            for _ in range(NITERS):
                current_future = shared.get()
                current = current_future.result()
                bumped = wc.submit(inc, current)
                shared.set(bumped)
                sleep(0.01 * random.random())
            final_value = shared.get().result()
            sleep(0.1)  # allow fire-and-forget messages to clear
            return final_value

    shared_var = Variable('x', client=c)
    seed = yield c.scatter(1)
    yield shared_var.set(seed)

    racers = c.map(f, range(15))
    outcomes = yield c.gather(racers)
    assert all(value > NITERS * 0.8 for value in outcomes)

    began = time()
    while len(s.wants_what['variable-x']) != 1:
        yield gen.sleep(0.01)
        assert time() - began < 2
def test_queue_with_data(c, s, a, b):
    """Plain data (a list) stored in a ``Variable`` round-trips unchanged."""
    writer = Variable('x')
    reader = Variable('x')
    assert writer.client is c

    yield writer.set([1, 'hello'])
    received = yield reader.get()

    assert received == [1, 'hello']
def test_queue_with_data(c, s, a, b):
    """Plain data (a tuple) stored in a ``Variable`` round-trips unchanged."""
    writer = Variable("x")
    reader = Variable("x")
    assert writer.client is c

    yield writer.set((1, "hello"))
    received = yield reader.get()

    assert received == (1, "hello")
def test_queue_with_data(c, s, a, b):
    """A tuple written through one handle is read back equal via another."""
    source = Variable('x')
    sink = Variable('x')
    assert source.client is c

    yield source.set((1, 'hello'))
    payload = yield sink.get()

    assert payload == (1, 'hello')
def test_timeout_get(c, s, a, b):
    """A ``get`` started before any ``set`` resolves once a value arrives."""
    reader = Variable('v')
    # Start the read first; it is only awaited after the write below.
    pending_get = reader.get()

    writer = Variable('v')
    yield writer.set(1)

    value = yield pending_get
    assert value == 1
def test_timeout_get(c, s, a, b):
    """An early ``get`` on an unset variable completes after a later ``set``."""
    waiting_handle = Variable("v")
    # Issue the read up front; the result is collected after the write.
    outstanding = waiting_handle.get()

    setting_handle = Variable("v")
    yield setting_handle.set(1)

    observed = yield outstanding
    assert observed == 1
def test_as_completed_distributed(loop):  # noqa
    """Grid search completes and no forgotten "score" key is seen again.

    Runs a ``GridSearchCV`` over ``AsCompletedEstimator`` on a nanny-backed
    cluster, then scans the scheduler's transition log: once a key
    containing ``"score"`` moves from ``memory`` to ``forgotten`` it must
    not appear in any later transition.
    """
    with cluster(active_rpc_timeout=10, nanny=Nanny) as (s, [a, b]):
        with Client(s["address"], loop=loop) as c:
            # Shared state handed to the estimator by name.
            counter_name = "counter_name"
            counter = Variable(counter_name, client=c)
            counter.set(0)
            lock_name = "lock"
            killed_workers_name = "killed_workers"
            killed_workers = Variable(killed_workers_name, client=c)
            killed_workers.set({})

            X, y = make_classification(n_samples=100, n_features=10, random_state=0)
            estimator = AsCompletedEstimator(
                killed_workers_name, lock_name, counter_name, 7
            )
            gs = dcv.GridSearchCV(
                estimator,
                param_grid={"foo_param": [0, 1, 2]},
                cv=3,
                refit=False,
                cache_cv=False,
                scheduler=c,
            )
            gs.fit(X, y)

            def fetch_transition_log(dask_scheduler):
                return dask_scheduler.transition_log

            def check_reprocess(transition_log):
                forgotten_scores = set()
                for entry in transition_log:
                    key = entry[0]
                    start_state = entry[1]
                    end_state = entry[2]
                    assert key not in forgotten_scores
                    if "score" not in key:
                        continue
                    if start_state == "memory" and end_state == "forgotten":
                        forgotten_scores.add(key)

            check_reprocess(c.run_on_scheduler(fetch_transition_log))
def test_erred_future(c, s, a, b):
    """An erred future fetched from a ``Variable`` exposes the original error.

    Both ``result()`` (which raises) and ``exception()`` (which returns the
    exception object) are checked.
    """
    failing = c.submit(div, 1, 0)
    holder = Variable()
    yield holder.set(failing)
    yield gen.sleep(0.1)

    fetched = yield holder.get()
    with pytest.raises(ZeroDivisionError):
        yield fetched.result()

    error = yield fetched.exception()
    assert isinstance(error, ZeroDivisionError)
def test_erred_future(c, s, a, b):
    """Reading a failed future back through a ``Variable`` keeps its error."""
    broken = c.submit(div, 1, 0)
    shared = Variable()
    yield shared.set(broken)
    yield gen.sleep(0.1)

    retrieved = yield shared.get()

    # Asking for the result must raise the task's own exception ...
    with pytest.raises(ZeroDivisionError):
        yield retrieved.result()

    # ... and exception() must hand back an instance of the same type.
    caught = yield retrieved.exception()
    assert isinstance(caught, ZeroDivisionError)
def fit(self, X, y=None):
    """Count executing tasks and kill this worker process once per task key.

    For every key in the current worker's ``executing`` collection the
    shared counter is incremented under the distributed lock.  If the
    counter value read was above ``self.min_complete`` and the parsed key
    is not yet recorded in the shared "killed workers" mapping, the key is
    recorded and the current process is sent signal 9.

    ``X`` and ``y`` are not used by the body.  Returns ``self``.
    """
    worker = get_worker()
    lock = Lock(self.lock_name, client=worker.client)
    counter = Variable(self.counter_name, client=worker.client)
    killed = Variable(self.killed_workers_name, client=worker.client)

    # Iterate over a snapshot of the executing keys.
    for task_key in list(worker.executing):
        parsed_key = literal_eval(task_key)
        with lock:
            count = counter.get()
            counter.set(count + 1)
            already_killed = killed.get()
            kill_now = (
                count > self.min_complete and parsed_key not in already_killed
            )
            if kill_now:
                already_killed[parsed_key] = True
                killed.set(already_killed)

        # The kill happens only after the lock block has been exited.
        if kill_now:
            os.kill(os.getpid(), 9)

    return self
def process(self, events):
    """Run the wrapped processor once per dataset key, else preload columns.

    A distributed ``Lock`` and ``Variable`` are both named
    ``self.prefix + <data_source>``.  The caller that obtains the lock
    without blocking runs ``self.proc.process(events)``, records which
    materialized columns resolve as attributes of ``events``, publishes the
    sorted list through the variable and returns it in an accumulator.
    Callers that fail to obtain the lock read the published list and
    materialize those columns instead.

    Args:
        events: object passed to ``self.proc.get_dataset`` and
            ``self.proc.process``; its ``materialized`` attribute is
            iterated for column names.

    Returns:
        ``dict_accumulator({data_source: set_accumulator(cols)})`` for the
        lock holder, otherwise an empty ``dict_accumulator``.
    """
    from distributed import worker_client, Variable, Lock

    assert isinstance(self.proc, BaseProcessor)
    assert not isinstance(self.proc, _Preheater)

    s = self.proc.get_dataset(events).data_source
    d = self.prefix + s

    with worker_client(separate_thread=False) as c:
        v = Variable(d, c)
        lock = Lock(d, c)

        # NOTE(review): the lock is never released in this method; this
        # looks intentional (only one caller per key runs the processor)
        # -- confirm.
        if lock.acquire(blocking=False):
            self.proc.process(events)

            cols = set()
            for col in events.materialized:
                # Only the first underscore is turned into a dot.
                col = col.replace("_", ".", 1)
                try:
                    attrgetter(col)(events)
                except AttributeError:
                    # Names that do not resolve on ``events`` are skipped.
                    pass
                else:
                    cols.add(col)
            cols = sorted(cols)
            v.set(cols)
            return dict_accumulator({s: set_accumulator(cols)})
        else:
            cols = v.get()

    for ag in map(attrgetter, cols):
        data = ag(events)
        data = getattr(data, "content", data)
        # Default to None so objects without ``materialize`` are skipped
        # rather than raising AttributeError before ``callable`` can run.
        materialize = getattr(data, "materialize", None)
        if callable(materialize):
            materialize()
    return dict_accumulator({})
def test_hold_futures(s, a, b):
    """A future stored in a ``Variable`` outlives the client that set it.

    The first client stores a future and closes; a second client then
    fetches the future through the same variable name and gets the result.
    """
    first_client = yield Client(s.address, asynchronous=True)
    submitted = first_client.submit(lambda x: x + 1, 10)
    first_handle = Variable("x")
    yield first_handle.set(submitted)
    del first_handle
    yield first_client.close()
    yield gen.sleep(0.1)

    second_client = yield Client(s.address, asynchronous=True)
    second_handle = Variable("x")
    recovered = yield second_handle.get()
    value = yield recovered
    assert value == 11

    yield second_client.close()
def test_hold_futures(s, a, b):
    """The variable keeps a future retrievable after its setter disconnects."""
    setter_client = yield Client(s.address, asynchronous=True)
    original = setter_client.submit(lambda x: x + 1, 10)
    setter_var = Variable('x')
    yield setter_var.set(original)

    # Drop the local handle and shut the first client down.
    del setter_var
    yield setter_client.close()
    yield gen.sleep(0.1)

    getter_client = yield Client(s.address, asynchronous=True)
    getter_var = Variable('x')
    restored = yield getter_var.get()
    outcome = yield restored
    assert outcome == 11

    yield getter_client.close()
async def test_cleanup(c, s, a, b):
    """A read that starts before a concurrent write returns the old future.

    The first future is stored and its local reference dropped.  A
    ``_get`` is scheduled, then a ``set`` with a second future; the read
    must still deliver the first future (same key) and its result.
    """
    writer = Variable("v")
    reader = Variable("v")
    first = c.submit(lambda x: x + 1, 10)
    second = c.submit(lambda x: x + 1, 20)
    first_key = first.key

    await writer.set(first)
    del first  # drop the local reference to the stored future
    await gen.sleep(0.1)

    pending_get = asyncio.ensure_future(reader._get())
    await gen.sleep(0)  # give the scheduled read a turn on the loop
    asyncio.ensure_future(writer.set(second))

    fetched = await pending_get
    assert fetched.key == first_key
    value = await fetched
    assert value == 11
def test_cleanup(c, s, a, b):
    """A read that starts before a concurrent write returns the old future.

    The first future is stored and its local reference dropped.  A
    ``_get`` is started, then a ``_set`` with a second future; the read
    must still deliver the first future (same key) and its result.
    """
    writer = Variable('v')
    reader = Variable('v')
    first = c.submit(lambda x: x + 1, 10)
    second = c.submit(lambda x: x + 1, 20)
    first_key = first.key

    yield writer.set(first)
    del first  # drop the local reference to the stored future
    yield gen.sleep(0.1)

    pending_get = reader._get()
    yield gen.moment
    # The overwrite is started but deliberately not waited on.
    writer._set(second)

    fetched = yield pending_get
    assert fetched.key == first_key
    value = yield fetched
    assert value == 11
def test_cleanup(c, s, a, b):
    """An in-flight ``_get`` is not disturbed by a later ``_set``."""
    set_handle = Variable('v')
    get_handle = Variable('v')
    old_future = c.submit(lambda x: x + 1, 10)
    new_future = c.submit(lambda x: x + 1, 20)
    old_key = old_future.key

    yield set_handle.set(old_future)
    del old_future  # only the key is kept locally from here on
    yield gen.sleep(0.1)

    in_flight = get_handle._get()
    yield gen.moment
    # Start replacing the value without waiting for it to complete.
    set_handle._set(new_future)

    delivered = yield in_flight
    assert delivered.key == old_key
    outcome = yield delivered
    assert outcome == 11
def test_variable(c, s, a, b):
    """A ``Variable`` holds its future until the variable is deleted.

    After both local future references are dropped the scheduler must
    still track tasks; once the variable is deleted the scheduler's task
    table must empty within five seconds.
    """
    owner = Variable('x')
    peer = Variable('x')
    assert owner.client is c

    submitted = c.submit(inc, 1)
    yield owner.set(submitted)
    fetched = yield peer.get()
    assert submitted.key == fetched.key

    del submitted, fetched
    yield gen.sleep(0.1)
    assert s.tasks  # future still present

    owner.delete()

    began = time()
    while s.tasks:
        yield gen.sleep(0.01)
        assert time() < began + 5
def test_variable(c, s, a, b):
    """Deleting a ``Variable`` lets the scheduler forget the stored task."""
    primary = Variable("x")
    secondary = Variable("x")
    assert primary.client is c

    task = c.submit(inc, 1)
    yield primary.set(task)
    mirrored = yield secondary.get()
    assert task.key == mirrored.key

    # Release every local reference; only the variable refers to the task.
    del task, mirrored
    yield gen.sleep(0.1)
    assert s.tasks  # future still present

    primary.delete()

    deadline_origin = time()
    while s.tasks:
        yield gen.sleep(0.01)
        assert time() < deadline_origin + 5
class DistStatusReporter(object):
    """Report status through the training scheduler.

    Reports travel through a distributed queue; a distributed variable
    carries the stop flag and a distributed semaphore makes the reporting
    side wait until the consumer lets it continue.

    Example
    -------
    >>> @autogluon_method
    >>> def train_func(config, reporter):
    ...     reporter(accuracy=0.1)
    """

    def __init__(self, remote=None):
        """Create the queue, stop flag and semaphore on client ``remote``."""
        self._queue = Queue(client=remote)
        self._stop = Variable(client=remote)
        self._stop.set(False)
        self._continue_semaphore = DistSemaphore(0, remote)
        self._last_report_time = time.time()

    def __call__(self, **kwargs):
        """Report updated training status.

        Pass in `done=True` when the training job is completed.

        Args:
            kwargs: Latest training result status.  ``time_this_iter`` is
                filled in with the time since the previous report when the
                caller did not supply it.

        Raises:
            AutoGluonEarlyStop: if the stop flag is set once the semaphore
                has been acquired.

        Example
        _______
        >>> reporter(accuracy=1, training_iters=4)
        """
        now = time.time()
        kwargs.setdefault('time_this_iter', now - self._last_report_time)
        self._last_report_time = now

        logger.debug('Reporting {}'.format(json.dumps(kwargs)))
        try:
            self._queue.put(kwargs.copy())
        except RuntimeError:
            # The report could not be queued; give up silently.
            return

        # Block until the consumer releases the semaphore.
        self._continue_semaphore.acquire()
        if self._stop.get():
            raise AutoGluonEarlyStop('Stopping!')

    def fetch(self, block=True):
        """Return the next queued report, or ``{}`` if the comm is closed.

        ``block`` is accepted but not used by this implementation.
        """
        try:
            return self._queue.get()
        except CommClosedError:
            return {}

    def terminate(self):
        """Set the stop flag and release the waiting reporter."""
        self._stop.set(True)
        self._continue_semaphore.release()

    def move_on(self):
        """Release the waiting reporter without setting the stop flag."""
        self._continue_semaphore.release()

    def _start(self):
        """Adjust the real starting time."""
        self._last_report_time = time.time()

    def save_dict(self, **state_dict):
        """Not implemented for this reporter."""
        raise NotImplementedError

    def get_dict(self):
        """Not implemented for this reporter."""
        raise NotImplementedError

    def __repr__(self):
        return self.__class__.__name__
class DaskExecutor(Executor):
    """
    An executor that runs all functions using the `dask.distributed` scheduler.

    By default a temporary `distributed.LocalCluster` is created (and
    subsequently torn down) within the `start()` contextmanager. To use a
    different cluster class (e.g.
    [`dask_kubernetes.KubeCluster`](https://kubernetes.dask.org/)), you can
    specify `cluster_class`/`cluster_kwargs`.

    Alternatively, if you already have a dask cluster running, you can provide
    the address of the scheduler via the `address` kwarg.

    Note that if you have tasks with tags of the form `"dask-resource:KEY=NUM"`
    they will be parsed and passed as
    [Worker Resources](https://distributed.dask.org/en/latest/resources.html)
    of the form `{"KEY": float(NUM)}` to the Dask Scheduler.

    Args:
        - address (string, optional): address of a currently running dask
            scheduler; if one is not provided, a temporary cluster will be
            created in `executor.start()`.  Defaults to `None`.
        - cluster_class (string or callable, optional): the cluster class to use
            when creating a temporary dask cluster. Can be either the full
            class name (e.g. `"distributed.LocalCluster"`), or the class itself.
        - cluster_kwargs (dict, optional): additional kwargs to pass to the
            `cluster_class` when creating a temporary dask cluster.
        - adapt_kwargs (dict, optional): additional kwargs to pass to
            `cluster.adapt` when creating a temporary dask cluster. Note that
            adaptive scaling is only enabled if `adapt_kwargs` are provided.
        - client_kwargs (dict, optional): additional kwargs to use when creating a
            [`dask.distributed.Client`](https://distributed.dask.org/en/latest/api.html#client).
        - debug (bool, optional): When running with a local cluster, setting
            `debug=True` will increase dask's logging level, providing
            potentially useful debug info. Defaults to the `debug` value in
            your Prefect configuration.
        - **kwargs: DEPRECATED

    Raises:
        - ValueError: if `address` is given together with `cluster_class` or
            `cluster_kwargs`

    Using a temporary local dask cluster:

    ```python
    executor = DaskExecutor()
    ```

    Using a temporary cluster running elsewhere. Any Dask cluster class should
    work, here we use [dask-cloudprovider](https://cloudprovider.dask.org):

    ```python
    executor = DaskExecutor(
        cluster_class="dask_cloudprovider.FargateCluster",
        cluster_kwargs={
            "image": "prefecthq/prefect:latest",
            "n_workers": 5,
            ...
        },
    )
    ```

    Connecting to an existing dask cluster

    ```python
    executor = DaskExecutor(address="192.0.2.255:8786")
    ```
    """

    def __init__(
        self,
        address: str = None,
        cluster_class: Union[str, Callable] = None,
        cluster_kwargs: dict = None,
        adapt_kwargs: dict = None,
        client_kwargs: dict = None,
        debug: bool = None,
        **kwargs: Any,
    ):
        # Fall back to the configured scheduler address; an empty/falsy
        # configured value is normalised to None.
        if address is None:
            address = context.config.engine.executor.dask.address or None
        # XXX: deprecated
        if address == "local":
            warnings.warn(
                "`address='local'` is deprecated. To use a local cluster, leave the "
                "`address` field empty.")
            address = None

        # XXX: deprecated
        local_processes = kwargs.pop("local_processes", None)
        if local_processes is None:
            local_processes = context.config.engine.executor.dask.get(
                "local_processes", None)
        if local_processes is not None:
            warnings.warn(
                "`local_processes` is deprecated, please use "
                "`cluster_kwargs={'processes': local_processes}`. The default is "
                "now `local_processes=True`.")

        if address is not None:
            # Connecting to an existing scheduler excludes cluster settings.
            if cluster_class is not None or cluster_kwargs is not None:
                raise ValueError(
                    "Cannot specify `address` and `cluster_class`/`cluster_kwargs`"
                )
        else:
            # A temporary cluster will be created: resolve its class and kwargs.
            if cluster_class is None:
                cluster_class = context.config.engine.executor.dask.cluster_class
            if isinstance(cluster_class, str):
                cluster_class = import_object(cluster_class)
            # Copy so the caller's dict is never mutated below.
            if cluster_kwargs is None:
                cluster_kwargs = {}
            else:
                cluster_kwargs = cluster_kwargs.copy()

            from distributed.deploy.local import LocalCluster

            if cluster_class == LocalCluster:
                if debug is None:
                    debug = context.config.debug
                cluster_kwargs.setdefault(
                    "silence_logs", logging.CRITICAL if not debug else logging.WARNING)
                if local_processes is not None:
                    cluster_kwargs.setdefault("processes", local_processes)
                # Any leftover kwarg that is not a known client kwarg is
                # treated as a (deprecated) cluster kwarg.
                for_cluster = set(kwargs).difference(_valid_client_kwargs)
                if for_cluster:
                    warnings.warn(
                        "Forwarding executor kwargs to `LocalCluster` is now handled by the "
                        "`cluster_kwargs` parameter, please update accordingly"
                    )
                    for k in for_cluster:
                        cluster_kwargs[k] = kwargs.pop(k)

            if adapt_kwargs is None:
                adapt_kwargs = {}

        # Copy so the caller's dict is never mutated below.
        if client_kwargs is None:
            client_kwargs = {}
        else:
            client_kwargs = client_kwargs.copy()
        # Whatever is still left in kwargs is forwarded (deprecated) to the client.
        if kwargs:
            warnings.warn(
                "Forwarding executor kwargs to `Client` is now handled by the "
                "`client_kwargs` parameter, please update accordingly")
            client_kwargs.update(kwargs)
        client_kwargs.setdefault("set_as_default", False)

        self.address = address
        self.cluster_class = cluster_class
        self.cluster_kwargs = cluster_kwargs
        self.adapt_kwargs = adapt_kwargs
        self.client_kwargs = client_kwargs
        # Runtime attributes
        self.client = None
        # These are coupled - they're either both None, or both non-None.
        # They're used in the case we can't forcibly kill all the dask workers,
        # and need to wait for all the dask tasks to cleanup before exiting.
        self._futures = None  # type: Optional[weakref.WeakSet[Future]]
        self._should_run_var = None  # type: Optional[Variable]

        super().__init__()

    @contextmanager
    def start(self) -> Iterator[None]:
        """
        Context manager for initializing execution.

        Creates a `dask.distributed.Client` and stores it on `self.client` for
        the duration of the context; nothing is yielded to the caller. When no
        `address` was configured, a temporary cluster is created first (with
        adaptive scaling if `adapt_kwargs` is non-empty). `self.client` is
        reset to `None` on exit.
        """
        from distributed import Client

        try:
            if self.address is not None:
                with Client(self.address, **self.client_kwargs) as client:
                    self.client = client
                    try:
                        self._pre_start_yield()
                        yield
                    finally:
                        self._post_start_yield()
            else:
                with self.cluster_class(
                        **self.cluster_kwargs) as cluster:  # type: ignore
                    if self.adapt_kwargs:
                        cluster.adapt(**self.adapt_kwargs)
                    with Client(cluster, **self.client_kwargs) as client:
                        self.client = client
                        try:
                            self._pre_start_yield()
                            yield
                        finally:
                            self._post_start_yield()
        finally:
            self.client = None

    def _pre_start_yield(self) -> None:
        """
        Set up the run-flag bookkeeping before control is handed to the caller.

        When an explicit `address` is in use, or the scheduler address starts
        with `"inproc"`, this creates the weak set of submitted futures and a
        uniquely named distributed `Variable` initialised to `True`.
        """
        from distributed import Variable

        is_inproc = self.client.scheduler.address.startswith(
            "inproc")  # type: ignore
        if self.address is not None or is_inproc:
            self._futures = weakref.WeakSet()
            self._should_run_var = Variable(f"prefect-{uuid.uuid4().hex}",
                                            client=self.client)
            self._should_run_var.set(True)

    def _post_start_yield(self) -> None:
        """
        Tear down the run-flag bookkeeping created by `_pre_start_yield`.

        Each cleanup stage is best-effort: exceptions are swallowed so that a
        failure in one stage does not prevent the following stages. Both
        `_should_run_var` and `_futures` are always reset to `None`.
        """
        from distributed import wait

        if self._should_run_var is not None:
            # Multipart cleanup, ignoring exceptions in each stage
            # 1.) Stop pending tasks from starting
            try:
                self._should_run_var.set(False)
            except Exception:
                pass
            # 2.) Wait for all running tasks to complete
            try:
                futures = [f for f in list(self._futures) if not f.done()]  # type: ignore
                if futures:
                    self.logger.info(
                        "Stopping executor, waiting for %d active tasks to complete",
                        len(futures),
                    )
                    wait(futures)
            except Exception:
                pass
            # 3.) Delete the distributed variable
            try:
                self._should_run_var.delete()
            except Exception:
                pass
        self._should_run_var = None
        self._futures = None

    def _prep_dask_kwargs(self, extra_context: dict = None) -> dict:
        """
        Build the keyword arguments passed to `client.submit` for one task.

        Args:
            - extra_context (dict, optional): task information; it is passed to
                `_make_task_key` and its `"task_tags"` entry is inspected for
                `dask-resource` tags

        Returns:
            - dict: always contains `pure=False`; contains `key` when
                `_make_task_key` returns one, and `resources` when at least one
                tag starts with `dask-resource` (case-insensitive)
        """
        if extra_context is None:
            extra_context = {}

        dask_kwargs = {"pure": False}  # type: dict

        # set a key for the dask scheduler UI
        key = _make_task_key(**extra_context)
        if key is not None:
            dask_kwargs["key"] = key

        # infer from context if dask resources are being utilized
        task_tags = extra_context.get("task_tags", [])
        dask_resource_tags = [
            tag for tag in task_tags if tag.lower().startswith("dask-resource")
        ]
        if dask_resource_tags:
            resources = {}
            for tag in dask_resource_tags:
                # "dask-resource:KEY=NUM" -> {"KEY": float(NUM)}
                prefix, val = tag.split("=")
                resources.update({prefix.split(":")[1]: float(val)})
            dask_kwargs.update(resources=resources)
        return dask_kwargs

    def __getstate__(self) -> dict:
        """Return a copy of the instance state with the runtime attributes
        (`client`, `_futures`, `_should_run_var`) set to `None`."""
        state = self.__dict__.copy()
        state.update(
            {k: None for k in ["client", "_futures", "_should_run_var"]})
        return state

    def __setstate__(self, state: dict) -> None:
        """Restore the instance attributes from `state`."""
        self.__dict__.update(state)

    def submit(self, fn: Callable, *args: Any, extra_context: dict = None,
               **kwargs: Any) -> "Future":
        """
        Submit a function to the executor for execution. Returns a Future object.

        Args:
            - fn (Callable): function that is being submitted for execution
            - *args (Any): arguments to be passed to `fn`
            - extra_context (dict, optional): an optional dictionary with extra
                information about the submitted task
            - **kwargs (Any): keyword arguments to be passed to `fn`

        Returns:
            - Future: a Future-like object that represents the computation of
                `fn(*args, **kwargs)`

        Raises:
            - ValueError: if the executor has not been started
        """
        if self.client is None:
            raise ValueError("This executor has not been started.")

        kwargs.update(self._prep_dask_kwargs(extra_context))
        if self._should_run_var is None:
            fut = self.client.submit(fn, *args, **kwargs)
        else:
            # Submit through `_maybe_run` with the run-flag variable's name,
            # and track the future so shutdown can wait for it.
            fut = self.client.submit(_maybe_run, self._should_run_var.name,
                                     fn, *args, **kwargs)
            self._futures.add(fut)
        return fut

    def wait(self, futures: Any) -> Any:
        """
        Resolves the Future objects to their values. Blocks until the
        computation is complete.

        Args:
            - futures (Any): single or iterable of future-like objects to compute

        Returns:
            - Any: an iterable of resolved futures with similar shape to the input

        Raises:
            - ValueError: if the executor has not been started
        """
        if self.client is None:
            raise ValueError("This executor has not been started.")
        return self.client.gather(futures)