from datetime import datetime
from typing import Iterable, Iterator, List, Optional, Set, Tuple, Union

import dask.distributed
import xarray as xr
from dask.distributed import Client, Future, TimeoutError, wait as dask_wait
from toolz import partition_all


def wait_for_future(
    future, poll_timeout: float = 1.0, t0: Optional[datetime] = None
) -> Iterator[Tuple[float, datetime]]:
    """
    Generate a sequence of (time_passed, timestamp) tuples, stopping once the
    future becomes ready.

    :param future: Dask future to wait on
    :param poll_timeout: How often to poll the future for completion, in seconds
    :param t0: From what point to start counting (defaults to right now)
    """
    if t0 is None:
        t0 = datetime.utcnow()

    while not future.done():
        try:
            dask_wait(future, timeout=poll_timeout, return_when="FIRST_COMPLETED")
            return
        except TimeoutError:
            pass
        t_now = datetime.utcnow()
        yield ((t_now - t0).total_seconds(), t_now)
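# Usage sketch (illustrative): drive a simple progress read-out while a
# submitted future computes.  Assumes a running dask.distributed Client;
# `slow_sum` is a made-up example task.
def _example_wait_for_future(client: Client) -> int:
    def slow_sum(n: int) -> int:
        return sum(range(n))

    fut = client.submit(slow_sum, 50_000_000)
    for elapsed, _ts in wait_for_future(fut, poll_timeout=1.0):
        print(f"..still waiting, {elapsed:.1f}s elapsed")
    return fut.result()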
def chunked_persist_ds(xx: xr.Dataset, client, verbose: bool = False) -> xr.Dataset:
    """
    Persist a dask-backed Dataset a few blocks at a time: blocks are grouped
    across data variables, persisted group by group, and the Dataset is then
    rebuilt from the already-persisted dask graphs.
    """
    names = list(xx.data_vars)
    data = [xx[n].data for n in names]
    delayed = [d.to_delayed().ravel() for d in data]
    delayed = list(zip(*delayed))

    persisted = []
    for chunk in delayed:
        chunk = client.persist(chunk)
        _ = dask_wait(chunk)
        persisted.extend(chunk)
        if verbose:
            print(".", end="")

    # at this point it should be almost a no-op
    data = client.persist(data)

    # reconstruct xr.Dataset from persisted chunks
    _vars = {}
    for n, d in zip(names, data):
        dv = xx[n]
        _vars[n] = xr.DataArray(data=d, dims=dv.dims, coords=dv.coords, name=n)

    return xr.Dataset(_vars)
def chunked_persist(data, n_concurrent, client, verbose=False):
    """
    Force limited concurrency when persisting a large collection.

    This is useful to control memory usage when operating close to capacity.

    Sometimes `client.persist(data)` will run out of memory, not because the
    fully-realized data is large, but because of the memory requirements of
    intermediate data. This is particularly common when using a local dask
    cluster with only one worker.

    This function forces the evaluation order of the dask graph to control
    peak memory usage.

    Say you have a largish task graph of 10x10 top-level sub-tasks and enough
    memory to process 5 sub-tasks concurrently; Dask might nonetheless decide
    to schedule more than that and cause worker restarts due to out-of-memory
    errors. With this function you can force the dask scheduler to persist
    this collection in batches of 5 concurrent sub-tasks, keeping the
    computation within the memory budget.
    """
    delayed = data.to_delayed().ravel()

    persisted = []
    for chunk in partition_all(n_concurrent, delayed):
        chunk = client.persist(chunk)
        _ = dask_wait(chunk)
        persisted.extend(chunk)
        if verbose:
            print(".", end="")

    # at this point it should be almost a no-op
    return client.persist(data)
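# Usage sketch (illustrative): persist a 10x10-chunk array no more than five
# top-level sub-tasks at a time, matching the scenario in the docstring above.
def _example_chunked_persist(client: Client):
    import dask.array as da

    # 100 top-level chunks; the squaring step stands in for an
    # intermediate-heavy computation.
    big = (da.random.random((10_000, 10_000), chunks=(1_000, 1_000)) + 1.0) ** 2
    return chunked_persist(big, n_concurrent=5, client=client, verbose=True)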
# Task, TaskProc, S3COGSink, chunked_persist_da and _with_lookahead1 are
# project-specific types/helpers provided by the surrounding package.
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:
    """
    Run ``proc`` over ``tasks``, write each result out as a COG via ``sink``,
    and yield output paths as they complete. When ``check_exists`` is True,
    tasks whose output already exists are skipped.
    """
    def prep_stage(tasks: Iterable[Task],
                   proc: TaskProc) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        for task in tasks:
            path = sink.uri(task)
            if check_exists:
                if sink.exists(task):
                    yield (None, task, path)
                    continue

            ds = proc(task)
            yield (ds, task, path)

    in_flight_cogs: Set[Future] = set()
    for ds, task, path in _with_lookahead1(prep_stage(tasks, proc)):
        if ds is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(ds, xr.DataArray)
            ds = chunked_persist_da(ds, chunked_persist, client)
        else:
            ds = client.persist(ds, fifo_timeout='1ms')

        if len(in_flight_cogs):
            done, in_flight_cogs = drain(in_flight_cogs, 1.0)
            for r in done:
                yield r

        if isinstance(ds, xr.DataArray):
            attrs = ds.attrs.copy()
            ds = ds.to_dataset(dim='band')
            for dv in ds.data_vars.values():
                dv.attrs.update(attrs)

        cog = client.compute(sink.dump(task, ds), fifo_timeout='1ms')
        rr = dask_wait(ds)
        assert len(rr.not_done) == 0
        del ds, rr

        in_flight_cogs.add(cog)

    done, _ = drain(in_flight_cogs)
    for r in done:
        yield r
def drain(futures: Set[Future],
          timeout: Optional[float] = None) -> Tuple[List[str], Set[Future]]:
    """
    Collect results from whatever futures have completed so far.

    Each future is expected to resolve to a (path, ok) tuple; paths of
    successful writes are returned together with the set of still-pending
    futures.
    """
    return_when = 'FIRST_COMPLETED'
    if timeout is None:
        return_when = 'ALL_COMPLETED'

    try:
        rr = dask_wait(futures, timeout=timeout, return_when=return_when)
    except dask.distributed.TimeoutError:
        return [], futures

    done: List[str] = []
    for f in rr.done:
        try:
            path, ok = f.result()
            if ok:
                done.append(path)
            else:
                print(f"Failed to write: {path}")
        except Exception as e:
            print(e)

    return done, rr.not_done
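# Usage sketch (illustrative): drain expects futures that resolve to
# (path, ok) tuples, the same contract used by process_tasks above.
# fake_write is a made-up stand-in for a real write task.
def _example_drain(client: Client) -> None:
    def fake_write(path: str) -> Tuple[str, bool]:
        return path, True

    futures: Set[Future] = {
        client.submit(fake_write, f"s3://bucket/item-{i}.tif") for i in range(4)
    }
    while futures:
        done, futures = drain(futures, timeout=1.0)
        for path in done:
            print(f"written: {path}")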
import math
import random

import numpy as np
import dask.dataframe as dd
import dask_cudf

# Requires helpers from the surrounding module: default_client, create_df,
# to_cudf and get_meta.


def dask_make_blobs(nrows, ncols, n_centers=8, n_parts=None,
                    cluster_std=1.0, center_box=(-10, 10),
                    random_state=None, verbose=False):
    """
    Make unlabeled dask.DataFrame and dask_cudf.DataFrames containing blobs
    for a randomly generated set of centroids.

    This function calls `make_blobs` from Scikit-learn on each Dask worker
    and aggregates the results into a single Dask DataFrame.

    For more information see Scikit-learn's
    `make_blobs <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html>`_.

    :param nrows: number of rows
    :param ncols: number of features
    :param n_centers: number of centers to generate
    :param n_parts: number of partitions to generate (this can be greater
        than the number of workers)
    :param cluster_std: how far each generated point may deviate from its
        closest centroid
    :param center_box: the bounding box which constrains all the centroids
    :param random_state: sets random seed
    :param verbose: enables / disables verbose printing
    :return: dask.DataFrame & dask_cudf.DataFrame
    """
    client = default_client()

    workers = list(client.has_what().keys())

    n_parts = n_parts if n_parts is not None else len(workers)
    parts = (workers * math.ceil(n_parts / len(workers)))[:n_parts]

    centers = np.random.uniform(center_box[0], center_box[1],
                                size=(n_centers, ncols)).astype(np.float32)

    if verbose:
        print("Generating %d samples across %d partitions on "
              "%d workers (total=%d samples)" %
              (math.ceil(nrows / len(workers)), len(parts), len(workers), nrows))

    # Create dfs on each worker (gpu)
    dfs = [client.submit(create_df, n, math.ceil(nrows / len(workers)), ncols,
                         centers, cluster_std, random_state, random.random(),
                         workers=[worker])
           for worker, n in list(zip(parts, list(range(len(workers)))))]

    # Wait for completion
    dask_wait(dfs)

    ddfs = [client.submit(to_cudf, df, random.random()) for df in dfs]

    # Wait for completion
    dask_wait(ddfs)

    meta_ddf = client.submit(get_meta, dfs[0]).result()
    meta_cudf = client.submit(get_meta, ddfs[0]).result()

    d_df = dd.from_delayed(dfs, meta=meta_ddf)
    d_cudf = dask_cudf.from_delayed(ddfs, meta=meta_cudf)

    return d_df, d_cudf
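# Usage sketch (illustrative): requires an already-running dask cluster with
# GPU workers, since the second return value is a dask_cudf.DataFrame.
def _example_make_blobs():
    ddf, dgdf = dask_make_blobs(nrows=1_000_000, ncols=20,
                                n_centers=8, cluster_std=0.5, verbose=True)
    print(ddf.head())
    print(dgdf.head())
    return ddf, dgdf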