async def test_close_async(c, s, a, b):
    """Check that closing a semaphore warns about outstanding leases.

    Two cases are exercised: closing while a lease is still held
    (unreleased), and closing while another task is waiting for a lease
    (pending). Afterwards the scheduler extension must hold no state for
    either semaphore.
    """
    unreleased_pattern = (
        "Closing semaphore .* but there remain unreleased leases .*"
    )
    pending_pattern = "Closing semaphore .* but there remain pending leases"

    # Case 1: close while we still hold a lease.
    held = await Semaphore(name="test")
    assert await held.acquire()
    with pytest.warns(RuntimeWarning, match=unreleased_pattern):
        await held.close()

    # A closed semaphore can no longer be acquired.
    with pytest.raises(
            RuntimeError,
            match="Semaphore `test` not known or already closed."):
        await held.acquire()

    # Case 2: close while a submitted task is blocked waiting for a lease.
    contended = await Semaphore(name="t2", max_leases=1)
    assert await contended.acquire()

    def f(sem_):
        return sem_.acquire()

    extension = s.extensions["semaphores"]
    fire_and_forget(c.submit(f, sem_=contended))

    # Wait for the pending lease
    while not extension.metrics["pending"]["t2"]:
        await asyncio.sleep(0.01)

    with pytest.warns(RuntimeWarning, match=pending_pattern):
        await contended.close()

    # Every piece of bookkeeping on the extension must now be empty.
    assert not extension.max_leases
    assert not extension.leases
    assert not extension.events
    for per_metric in extension.metrics.values():
        assert not per_metric
def submit(self, fn: Callable, *args: Any, **kwargs: Any) -> Future:
    """
    Submit a function to the executor for execution. Returns a Future object.

    Args:
        - fn (Callable): function that is being submitted for execution
        - *args (Any): arguments to be passed to `fn`
        - **kwargs (Any): keyword arguments to be passed to `fn`; a `task`
            entry is required and is forwarded to `fn` along with the rest

    Returns:
        - Future: a Future-like object that represents the computation of
            `fn(*args, **kwargs)`

    Raises:
        - KeyError: if `kwargs` contains no `"task"` entry
        - ValueError: if the executor has not been started
    """
    task = kwargs["task"]
    # NOTE(review): the returned stub is not used anywhere in this method. The
    # call is kept because it may have side effects; confirm whether it is
    # still needed. It also reads `self.client` before the `hasattr` check
    # below, so it fails with AttributeError when no client is attached.
    self.client.new_workflow_stub(get_workflow(task.worker_id))

    dask_kwargs = self._prep_dask_kwargs()
    kwargs.update(dask_kwargs)

    if self.is_started and hasattr(self, "client"):
        if hasattr(task, "hydra_task"):
            # Bind the result: previously this branch dropped it, leaving
            # `future` undefined and crashing at `fire_and_forget(future)`.
            future = self.hydra_client.submit(fn, *args, **kwargs)
        else:
            future = self.client.submit(fn, *args, **kwargs)
    elif self.is_started:
        with worker_client(separate_thread=True) as client:
            future = client.submit(fn, *args, **kwargs)
    else:
        raise ValueError("This executor has not been started.")

    # Ask the scheduler to run the task even if this future is dropped.
    fire_and_forget(future)
    return future
def map(self, fn: Callable, *args: Any, **kwargs: Any) -> List[Future]:
    """
    Map `fn` over its iterable arguments on the Dask cluster.

    Args:
        - fn (Callable): function that is being submitted for execution
        - *args (Any): iterables the function will be mapped over
        - **kwargs (Any): extra keyword arguments handed to the Dask client;
            the values from `self._prep_dask_kwargs()` are merged in

    Returns:
        - List[Future]: one Future-like object per element of `zip(*args)`
            when the executor owns a client. When running through a worker
            client, the gathered results of those futures are returned.
            An empty list is returned when no iterables are given.

    Raises:
        - ValueError: if the executor has not been started
    """
    if not args:
        return []

    kwargs.update(self._prep_dask_kwargs())

    if not self.is_started:
        raise ValueError("This executor has not been started.")

    if hasattr(self, "client"):
        mapped = self.client.map(fn, *args, **kwargs)
        # Keep the tasks alive on the scheduler even if the caller drops them.
        fire_and_forget(mapped)
        return mapped

    # No client of our own: borrow the one belonging to the current worker.
    with worker_client(separate_thread=True) as client:
        mapped = client.map(fn, *args, **kwargs)
        return client.gather(mapped)
def dask_endpoint(owner, app_name, action):
    """
    Route a dask simulation request to the appropriate dask scheduler.

    The JSON request body is used as the keyword arguments for `dask_sim`,
    extended with a freshly generated job id, the callback URL and API token
    read from the environment, and the timeout for this owner/app. The task
    is submitted fire-and-forget, so this returns as soon as it is queued.

    Returns a dict with the new `job_id` and a `qlength` of 1.
    """
    print(f"dask endpoint: {owner}/{app_name}/{action}")

    inputs = json.loads(request.get_data())
    print("inputs", inputs)

    addr = dask_scheduler_address(owner, app_name)
    job_id = str(uuid.uuid4())

    # The worker needs the job_id to push the results back to the webapp.
    # The url and api token travel as arguments instead of env variables so
    # that the wrapper has access to them but the model does not.
    inputs.update(
        job_id=job_id,
        comp_url=os.getenv("COMP_URL"),
        comp_api_token=os.getenv("COMP_API_TOKEN"),
        timeout=get_time_out(owner, app_name),
    )

    with Client(addr) as c:
        fire_and_forget(c.submit(dask_sim, **inputs))
        return {"job_id": job_id, "qlength": 1}
def submit(self, fn: Callable, *args: Any, **kwargs: Any) -> "Future":
    """
    Hand a single function call to the Dask cluster.

    Args:
        - fn (Callable): function that is being submitted for execution
        - *args (Any): arguments to be passed to `fn`
        - **kwargs (Any): keyword arguments to be passed to `fn`; the values
            from `self._prep_dask_kwargs()` are merged in before submission

    Returns:
        - Future: a Future-like object that represents the computation of
            `fn(*args, **kwargs)`

    Raises:
        - ValueError: if the executor has not been started
    """
    # import dask functions here to decrease our import times
    from distributed import fire_and_forget, worker_client

    kwargs.update(self._prep_dask_kwargs())

    if not self.is_started:
        raise ValueError("This executor has not been started.")

    if hasattr(self, "client"):
        submitted = self.client.submit(fn, *args, **kwargs)
    else:
        # No client of our own: borrow the one belonging to this worker.
        with worker_client(separate_thread=True) as client:
            submitted = client.submit(fn, *args, **kwargs)

    # Keep the task alive on the scheduler even if the future is dropped.
    fire_and_forget(submitted)
    return submitted
def run_job(self,
            job_type: str,
            spec: dict,
            scheduler: str,
            mode: str = "async") -> Union[ExecutedJob, InvalidJob]:
    """Build, validate and run a mason-dask job.

    `job_type` selects the job class ("format" or "query"); any other value
    raises NotImplementedError. The job built from `spec` is validated, and
    an invalid job is reported as an `InvalidJob` without being run.

    A valid job runs in-process when `scheduler` starts with "local".
    Otherwise it is submitted through the client: with `mode == "sync"` the
    result is gathered and converted, while any other mode queues the job
    fire-and-forget and returns an `ExecutedJob` describing the queued job.
    """
    from distributed import fire_and_forget
    import dask
    from mason_dask.jobs.executed import ExecutedJob as ExecutedDaskJob
    from mason_dask.jobs.executed import InvalidJob as InvalidDaskJob
    from mason_dask.utils.cluster_spec import ClusterSpec

    # Import only the job class that is actually requested.
    if job_type == "format":
        from mason_dask.jobs.format import FormatJob as job_class
    elif job_type == "query":
        from mason_dask.jobs.query import QueryJob as job_class
    else:
        raise NotImplementedError(f"Job not implemented: {job_type}")

    dask_job = job_class(spec).validate()

    def _as_mason_job(outcome: Result[ExecutedDaskJob, InvalidDaskJob]):
        # Translate a mason-dask result into the mason job types.
        computed = compute(outcome)
        if not isinstance(computed, ExecutedDaskJob):
            failure = outcome._inner_value
            assert (isinstance(failure, InvalidDaskJob))
            return InvalidJob(failure.message)
        return ExecutedJob("format-job", computed.message)

    final: Union[ExecutedJob, InvalidJob]
    with self.client() as client:
        cluster_spec = ClusterSpec(client, scheduler=self.scheduler)

        if isinstance(dask_job, InvalidDaskJob):
            final = InvalidJob(f"Invalid Dask Job: {dask_job.message}")
        elif scheduler.startswith("local"):
            final = _as_mason_job(dask_job.run(cluster_spec))
        else:
            dask.config.set({'distributed.scheduler.allowed-failures': 50})
            future = client.submit(dask_job.run, cluster_spec)
            if mode == "sync":
                final = _as_mason_job(client.gather(future))
            else:
                fire_and_forget(future)
                final = ExecutedJob(
                    f"Queued job {dask_job} to run against dask scheduler: {scheduler}"
                )

    return final
def run_flow(self, environment, config, context, **kwargs):
    """Submit `run_flow_in_worker` to the Dask client and detach from it.

    `environment`, `config` and `context` are forwarded as keyword arguments,
    and the task is submitted with `pure=False`. The resulting future is
    handed to `fire_and_forget`; nothing is returned. Additional `**kwargs`
    are accepted but not used.
    """
    submit_options = dict(
        environment=environment,
        config=config,
        context=context,
        pure=False,
    )
    pending = self.dask_client.submit(run_flow_in_worker, **submit_options)
    distributed.fire_and_forget(pending)
def test_cancel_fire_and_forget(c, s, a, b):
    """A fire-and-forget future can still be force-cancelled.

    A chain of four dependent `slowinc` calls is computed and released with
    `fire_and_forget`. After a short wait it is cancelled with `force=True`;
    the future must report 'cancelled' and `s.task_state` must be empty.
    """
    # Four-deep chain of delayed slowinc calls, each depending on the last.
    chain = 1
    for _ in range(4):
        chain = delayed(slowinc)(chain, delay=0.05)

    future = c.compute(chain)
    fire_and_forget(future)

    yield gen.sleep(0.05)
    yield future.cancel(force=True)

    assert future.status == 'cancelled'
    assert not s.task_state
def test_cancel_fire_and_forget(c, s, a, b):
    """A fire-and-forget future can still be force-cancelled.

    A chain of four dependent `slowinc` calls is computed and released with
    `fire_and_forget`. After a short wait it is cancelled with `force=True`;
    the future must report 'cancelled' and `s.tasks` must be empty.
    """
    # Four-deep chain of delayed slowinc calls, each depending on the last.
    chain = 1
    for _ in range(4):
        chain = delayed(slowinc)(chain, delay=0.05)

    future = c.compute(chain)
    fire_and_forget(future)

    yield gen.sleep(0.05)
    yield future.cancel(force=True)

    assert future.status == 'cancelled'
    assert not s.tasks
def run(spec: dict, scheduler: str):
    """Run a table-query job locally or queue it on a Dask scheduler.

    With `scheduler == "local"` a default `Client()` is created and the job
    runs in-process. Otherwise a client is connected to `scheduler`, the job
    is submitted, and (the mode being fixed to "async") released with
    `fire_and_forget`. Nothing is returned either way.
    """

    class CompleteDaskJob:
        # Outcome carrying the message of a job that was accepted.
        def __init__(self, message: str = ""):
            self.message = message

    class InvalidDaskJob():
        # Outcome carrying the reason a job was rejected.
        def __init__(self, message: str = ""):
            self.message = message

    class DaskQueryJob():
        # Query job described by the "query_string", "database" and
        # "output_path" entries of the spec.
        def __init__(self, job_spec: dict):
            read = job_spec.get
            self.query_string = read("query_string")
            self.database = read("database")
            self.output_path = read("output_path")

        def run_job(self) -> Union[CompleteDaskJob, InvalidDaskJob]:
            # The actual read_sql_table / to_parquet work is not implemented
            # yet; only the output path is checked.
            if not self.output_path:
                return InvalidDaskJob(
                    "Output path required for Dask implementation of table query"
                )
            return CompleteDaskJob(
                "Job to query via Dask succesfully queued to scheduler")

    dask_job = DaskQueryJob(spec)
    mode = "async"

    if scheduler == "local":
        client = Client()
        dask_job.run_job()
        return

    dask.config.set({'distributed.scheduler.allowed-failures': 50})
    client = Client(scheduler)
    future = client.submit(dask_job.run_job)
    if mode == "sync":
        client.gather(future)
    else:
        fire_and_forget(future)
def submit(self, fn: Callable, *args: Any, **kwargs: Any) -> Future:
    """
    Hand a single function call to the Dask cluster as an impure task.

    Args:
        - fn (Callable): function that is being submitted for execution
        - *args (Any): arguments to be passed to `fn`
        - **kwargs (Any): keyword arguments to be passed to `fn`

    Returns:
        - Future: a Future-like object which represents the computation of
            `fn(*args, **kwargs)`

    Raises:
        - ValueError: if the executor has not been started
    """
    if not self.is_started:
        raise ValueError("This executor has not been started.")

    if hasattr(self, "client"):
        submitted = self.client.submit(fn, *args, pure=False, **kwargs)
    else:
        # No client of our own: borrow the one belonging to this worker.
        with worker_client(separate_thread=True) as client:
            submitted = client.submit(fn, *args, pure=False, **kwargs)

    # Keep the task alive on the scheduler even if the future is dropped.
    fire_and_forget(submitted)
    return submitted
def submit(self,
           fn: Callable,
           *args: Any,
           extra_context: dict = None,
           **kwargs: Any) -> "Future":
    """
    Hand a single function call to the Dask cluster.

    Args:
        - fn (Callable): function that is being submitted for execution
        - *args (Any): arguments to be passed to `fn`
        - extra_context (dict, optional): extra information about the
            submitted task; its `task_full_name` and `task_tags` entries
            are passed to `self._prep_dask_kwargs`
        - **kwargs (Any): keyword arguments to be passed to `fn`

    Returns:
        - Future: a Future-like object that represents the computation of
            `fn(*args, **kwargs)`

    Raises:
        - ValueError: if the executor has not been started
    """
    # import dask functions here to decrease our import times
    from distributed import fire_and_forget, worker_client

    context = extra_context or {}
    kwargs.update(
        self._prep_dask_kwargs(
            task_name=context.get("task_full_name", ""),
            task_tags=context.get("task_tags", []),
        ))

    if not self.is_started:
        raise ValueError("This executor has not been started.")

    if hasattr(self, "client"):
        submitted = self.client.submit(fn, *args, **kwargs)
    else:
        # No client of our own: borrow the one belonging to this worker.
        with worker_client(separate_thread=True) as client:
            submitted = client.submit(fn, *args, **kwargs)

    # Keep the task alive on the scheduler even if the future is dropped.
    fire_and_forget(submitted)
    return submitted
def cleanup(self, badstatuslist=['cancelled', 'error', 'lost'], keep=None):
    """ Clean up job list.

    Scans futures, removes finished jobs, and pushes results to relevant
    indices. For each finished segment this submits fire-and-forget tasks to
    index mocks, noises and candidates (when self.indexresults is set) and
    to create products (when self.saveproducts is set), then drops the
    segment from self.futures. scanIds left with no futures are removed
    from the futures, states, finished, errors and known_segments dicts.

    badstatuslist can include 'cancelled', 'error', 'lost'; it is passed
    unchanged to self.removefutures.
    keep defines a scanId (string) key that should not be removed from dicts.

    Returns nothing; a summary is logged when anything was removed.
    """

    removed = 0
    # NOTE(review): cindexed is never incremented in this method, so the
    # summary at the end always reports 0 indexed cands.
    cindexed = 0
    sdms = 0

    scanIds = [scanId for scanId in self.futures]
    if len(scanIds):
        logger.info("Checking on scanIds: {0}".format(','.join(scanIds)))

    # clean futures and get finished jobs
    # presumably removefutures returns the number of futures it dropped;
    # verify against its definition
    removed = self.removefutures(badstatuslist)
    for scanId in self.futures:

        # check on finished
        # (walks every scanId's future list and keeps this scanId's entries
        # whose last element reports status 'finished')
        finishedlist = [
            (seg, data, cc, acc)
            for (scanId0, futurelist) in iteritems(self.futures)
            for seg, data, cc, acc in futurelist
            if (acc.status == 'finished') and (scanId0 == scanId)
        ]
        self.finished[scanId] += len(finishedlist)
        if self.indexresults:
            elastic.indexscanstatus(scanId,
                                    pending=self.pending[scanId],
                                    finished=self.finished[scanId],
                                    errors=self.errors[scanId],
                                    indexprefix=self.indexprefix)

        # TODO: check on error handling for fire_and_forget
        for futures in finishedlist:
            seg, data, cc, acc = futures
            ncands, mocks = acc.result()

            # index mocks
            if self.indexresults and mocks:
                distributed.fire_and_forget(
                    self.client.submit(elastic.indexmock,
                                       scanId,
                                       mocks,
                                       indexprefix=self.indexprefix))
            else:
                logger.debug(
                    "No mocks indexed from scanId {0}".format(scanId))

            # index noises
            noisefile = self.states[scanId].noisefile
            if self.indexresults and os.path.exists(noisefile):
                distributed.fire_and_forget(
                    self.client.submit(elastic.indexnoises,
                                       noisefile,
                                       scanId,
                                       indexprefix=self.indexprefix))
            else:
                logger.debug(
                    "No noises indexed from scanId {0}.".format(scanId))

            # index cands
            if self.indexresults and ncands:
                workdir = self.states[scanId].prefs.workdir
                distributed.fire_and_forget(
                    self.client.submit(util.indexcands_and_plots,
                                       cc,
                                       scanId,
                                       self.tags,
                                       self.indexprefix,
                                       workdir,
                                       priority=5))
            else:
                logger.debug(
                    "No cands indexed from scanId {0}".format(scanId))

            # optionally save and archive sdm/bdfs for segment
            if self.saveproducts and ncands:
                distributed.fire_and_forget(
                    self.client.submit(createproducts,
                                       cc,
                                       data,
                                       self.archiveproducts,
                                       indexprefix=self.indexprefix,
                                       priority=5))
                logger.info(
                    "Creating an SDM for {0}, segment {1}, with {2} candidates"
                    .format(scanId, seg, ncands))
                sdms += 1
            else:
                logger.debug(
                    "No SDMs plots moved for scanId {0}.".format(scanId))

            # remove job from list
            # (finishedlist is a separate list, so mutating the stored list
            # here does not disturb the loop)
            self.futures[scanId].remove(futures)
            removed += 1

    # clean up self.futures
    removeids = [
        scanId for scanId in self.futures
        if (len(self.futures[scanId]) == 0) and (scanId != keep)
    ]
    if removeids:
        # NOTE(review): the first fragment ends with "." and both
        # continuations start with ".", so the logged text has a doubled
        # period.
        logstr = ("No jobs left for scanIds: {0}.".format(
            ', '.join(removeids)))
        if keep is not None:
            logstr += (". Cleaning state and futures dicts (keeping {0})".
                       format(keep))
        else:
            logstr += ". Cleaning state and futures dicts."
        logger.info(logstr)

        for scanId in removeids:
            _ = self.futures.pop(scanId)
            _ = self.states.pop(scanId)
            _ = self.finished.pop(scanId)
            _ = self.errors.pop(scanId)
            # known_segments may have no entry for this scanId
            try:
                _ = self.known_segments.pop(scanId)
            except KeyError:
                pass

    # _ = self.client.run(gc.collect)
    if removed or cindexed or sdms:
        logger.info(
            'Removed {0} jobs, indexed {1} cands, made {2} SDMs.'.format(
                removed, cindexed, sdms))
def start_pipeline(self, scanId, cfile=None, segments=None):
    """ Start pipeline conditional on cluster state.

    Sets futures and state after submission keyed by scanId.
    segments arg can be used to select or slow segment submission; when it
    is None every segment index in range(st.nsegment) is submitted.
    cfile is passed through to pipeline.pipeline_seg.

    Segments are submitted one at a time in a loop. A segment is submitted
    only when the reader-memory, total-reader-memory and spilled-memory
    heuristics pass and, if self.requirecalibration is set, a telcal file is
    set. The loop ends when the segments run out or, for 'vys' data, when
    the submission timeout is exceeded.
    """

    st = self.states[scanId]
    w_memlim = self.read_overhead * st.vismem * 1e9
    if segments is None:
        segments = list(range(st.nsegment))

    # vys_timeout is either the fixed configured factor or, for vys/vyssim
    # data with no fixed value, a factor scaled by the number of spectra.
    vys_timeout = self.vys_timeout
    if st.metadata.datasource in ['vys', 'vyssim']:
        if self.vys_timeout is not None:
            logger.debug(
                "vys_timeout factor set to fixed value of {0:.1f}x".format(
                    vys_timeout))
        else:
            assert self.vys_sec_per_spec is not None, "Must define vys_sec_per_spec to estimate vys_timeout"
            nspec = st.readints * st.nbl * st.nspw * st.npol
            vys_timeout = (st.t_segment +
                           self.vys_sec_per_spec * nspec) / st.t_segment
            logger.debug(
                "vys_timeout factor scaled by nspec to {0:.1f}x".format(
                    vys_timeout))

    # With probability self.mockprob, pick one segment at random as mockseg.
    mockseg = random.choice(segments) if random.uniform(
        0, 1) < self.mockprob else None
    if mockseg is not None:
        logger.info("Mock set for scanId {0} in segment {1}".format(
            scanId, mockseg))

    # vys data means realtime operations must timeout within a scan time
    if st.metadata.datasource == 'vys':
        timeout = 0.9 * st.metadata.inttime * st.metadata.nints  # bit shorter than scan
    else:
        # 0 disables the timeout check at the bottom of the loop
        timeout = 0

    throttletime = self.throttle * st.metadata.inttime * st.metadata.nints / st.nsegment
    logger.info(
        'Submitting {0} segments for scanId {1} with {2:.1f}s per segment'.
        format(len(segments), scanId, throttletime))
    logger.debug('Read_overhead {0}, read_totfrac {1}, and '
                 'spill_limit {2} with timeout {3}s'.format(
                     self.read_overhead, self.read_totfrac,
                     self.spill_limit, timeout))

    # Total memory budget: read_totfrac times the summed MEMORY resource of
    # all workers that advertise a READER resource.
    tot_memlim = self.read_totfrac * sum([
        v['resources']['MEMORY']
        for v in itervalues(self.client.scheduler_info()['workers'])
        if 'READER' in v['resources']
    ])

    # submit segments
    t0 = time.Time.now().unix
    elapsedtime = 0
    nsubmitted = 0  # count number submitted from list segments
    segments = iter(segments)
    # NOTE(review): next() on an empty segments iterable raises
    # StopIteration here, before the loop's own handling is reached.
    segment = next(segments)
    telcalset = self.set_telcalfile(scanId)
    while True:
        segsubtime = time.Time.now().unix
        if st.metadata.datasource == 'vys':
            endtime = time.Time(st.segmenttimes[segment][1],
                                format='mjd').unix
            # NOTE(review): the check uses segsubtime - 2 but the message
            # below prints segsubtime - 1.
            if endtime < segsubtime - 2:  # TODO: define buffer delay better
                logger.warning(
                    "Segment {0} time window has passed ({1} < {2}). Skipping."
                    .format(segment, endtime, segsubtime - 1))
                try:
                    segment = next(segments)
                    continue
                except StopIteration:
                    logger.debug(
                        "No more segments for scanId {0}".format(scanId))
                    break

        # try setting telcal
        if not telcalset:
            telcalset = self.set_telcalfile(scanId)

        # submit if cluster ready and telcal available
        if (heuristics.reader_memory_ok(self.client, w_memlim)
                and heuristics.readertotal_memory_ok(
                    self.client, tot_memlim)
                and heuristics.spilled_memory_ok(limit=self.spill_limit,
                                                 daskdir=self.daskdir)
                and (telcalset if self.requirecalibration else True)):

            # first time initialize scan
            if scanId not in self.futures:
                self.futures[scanId] = []
                self.errors[scanId] = 0
                self.finished[scanId] = 0

                if self.indexresults:
                    elastic.indexscan(
                        inmeta=self.states[scanId].metadata,
                        preferences=self.states[scanId].prefs,
                        indexprefix=self.indexprefix)
                else:
                    logger.info("Not indexing scan or prefs.")

            futures = pipeline.pipeline_seg(st,
                                            segment,
                                            cl=self.client,
                                            cfile=cfile,
                                            vys_timeout=vys_timeout,
                                            mem_read=w_memlim,
                                            mem_search=2 * st.vismem * 1e9,
                                            mockseg=mockseg)
            self.futures[scanId].append(futures)
            nsubmitted += 1

            if self.data_logging:
                # NOTE(review): this rebinds `segment` to the first element
                # of futures; presumably the same value, confirm against
                # pipeline.pipeline_seg.
                segment, data, cc, acc = futures
                distributed.fire_and_forget(
                    self.client.submit(util.data_logger,
                                       st,
                                       segment,
                                       data,
                                       fifo_timeout='0s',
                                       priority=-1))
            if self.indexresults:
                elastic.indexscanstatus(scanId,
                                        pending=self.pending[scanId],
                                        finished=self.finished[scanId],
                                        errors=self.errors[scanId],
                                        indexprefix=self.indexprefix,
                                        nsegment=st.nsegment)

            try:
                segment = next(segments)
            except StopIteration:
                logger.info(
                    "No more segments for scanId {0}".format(scanId))
                break

        else:
            # Not submitted: log the first failing condition. The same
            # segment is retried on the next pass through the loop.
            if not heuristics.reader_memory_ok(self.client, w_memlim):
                logger.info(
                    "System not ready. No reader available with required memory {0}"
                    .format(w_memlim))
            elif not heuristics.readertotal_memory_ok(
                    self.client, tot_memlim):
                logger.info(
                    "System not ready. Total reader memory exceeds limit of {0}"
                    .format(tot_memlim))
            elif not heuristics.spilled_memory_ok(limit=self.spill_limit,
                                                  daskdir=self.daskdir):
                logger.info(
                    "System not ready. Spilled memory exceeds limit of {0}"
                    .format(self.spill_limit))
            elif not (self.set_telcalfile(scanId)
                      if self.requirecalibration else True):
                logger.info(
                    "System not ready. No telcalfile available for {0}".
                    format(scanId))

        # periodically check on submissions. always, if memory limited.
        # (runs for even segment numbers or when any memory check fails)
        if not (segment % 2) or not (
                heuristics.reader_memory_ok(self.client, w_memlim)
                and heuristics.readertotal_memory_ok(
                    self.client, tot_memlim)
                and heuristics.spilled_memory_ok(limit=self.spill_limit,
                                                 daskdir=self.daskdir)):
            self.cleanup(
                keep=scanId)  # do not remove keys of ongoing submission

        # check timeout and wait time for next segment
        elapsedtime = time.Time.now().unix - t0
        # `and timeout` skips this check when timeout is 0
        if elapsedtime > timeout and timeout:
            logger.info("Submission timed out. Submitted {0}/{1} segments "
                        "in ScanId {2}".format(nsubmitted, st.nsegment,
                                               scanId))
            break
        else:
            # Throttle: sleep out the rest of throttletime for this pass.
            dt = time.Time.now().unix - segsubtime
            if dt < throttletime:
                logger.debug("Waiting {0:.1f}s to submit segment.".format(
                    throttletime - dt))
                sleep(throttletime - dt)
# NOTE(review): the statements down to `return path` are the tail of a
# function whose `def` line is outside this view (presumably `main`, which is
# what the script below submits -- confirm). They are indented as a body.
    # Fit the grid search with joblib using its 'dask' parallel backend.
    with joblib.parallel_backend('dask'):
        grid_search.fit(X, y)

    # Dump the fitted grid search to a GCS path and return that path.
    fs = gcsfs.GCSFileSystem()
    path = "gcs://pangeo-scratch/tomaugspurger/model-1.pkl"
    with fs.open(path, "wb") as f:
        joblib.dump(grid_search, f)
    return path


if __name__ == "__main__":
    # The auth token is read from the PANGEO_TOKEN environment variable;
    # a missing variable raises KeyError.
    auth = JupyterHubAuth(os.environ["PANGEO_TOKEN"])
    # Proxy address will be made easier to find.
    print("Connecting to Gateway")
    gateway = Gateway(
        address=
        "https://staging.us-central1-b.gcp.pangeo.io/services/dask-gateway/",
        proxy_address="tls://104.197.142.28:8786",
        auth=auth)
    # Created with shutdown_on_close=False, presumably so the cluster
    # outlives this script -- confirm against the dask-gateway docs.
    cluster = gateway.new_cluster(shutdown_on_close=False)
    client = cluster.get_client()
    print("Dashboard:", client.dashboard_link)
    # Ask for four workers and block until they have joined.
    cluster.scale(4)
    client.wait_for_workers(4)
    print("Cluster ready")
    # Submit `main` and release the future with fire_and_forget; the script
    # does not wait for the result.
    fut = client.submit(main)
    fire_and_forget(fut)