Example #1
from datetime import datetime
from typing import Iterator, Optional, Tuple

from dask.distributed import TimeoutError, wait as dask_wait


def wait_for_future(
        future,
        poll_timeout: float = 1.0,
        t0: Optional[datetime] = None) -> Iterator[Tuple[float, datetime]]:
    """
    Generate a sequence of (time_passed, timestamp) tuples, stop when future becomes ready.

    :param future: Dask future
    :param poll_timeout: How often to poll the future for completion, in seconds
    :param t0: Time from which to start counting (defaults to right now)
    """
    if t0 is None:
        t0 = datetime.utcnow()

    while not future.done():
        try:
            dask_wait(future,
                      timeout=poll_timeout,
                      return_when="FIRST_COMPLETED")
            return
        except TimeoutError:
            pass
        t_now = datetime.utcnow()

        yield ((t_now - t0).total_seconds(), t_now)
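
A minimal usage sketch for the generator above, assuming a running `Client`; the progress printout in the loop body is purely illustrative:

from dask.distributed import Client

client = Client()
future = client.submit(sum, range(10**7))

# Poll once per second, reporting elapsed time until the future is ready
for elapsed, ts in wait_for_future(future, poll_timeout=1.0):
    print(f"still waiting: {elapsed:.1f}s as of {ts.isoformat()}")

print("result:", future.result())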
Example #2
import xarray as xr
from dask.distributed import wait as dask_wait


def chunked_persist_ds(xx: xr.Dataset,
                       client,
                       verbose: bool = False) -> xr.Dataset:
    names = list(xx.data_vars)
    data = [xx[n].data for n in names]
    delayed = [d.to_delayed().ravel() for d in data]
    # Group the i-th chunk of every variable together, so aligned chunks
    # across all variables are persisted as one unit
    delayed = list(zip(*delayed))

    persisted = []
    for chunk in delayed:
        chunk = client.persist(chunk)
        _ = dask_wait(chunk)
        # Keep references alive so already-persisted chunks are not released
        persisted.extend(chunk)
        if verbose:
            print(".", end="")

    # at this point it should be almost a no-op
    data = client.persist(data)

    # reconstruct xr.Dataset from persisted chunks
    _vars = {}
    for n, d in zip(names, data):
        dv = xx[n]
        _vars[n] = xr.DataArray(data=d, dims=dv.dims, coords=dv.coords, name=n)

    return xr.Dataset(_vars)
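
A hedged usage sketch, assuming a local `Client` and an `xr.Dataset` backed by chunked dask arrays; the shapes and variable names below are made up for illustration:

import dask.array as da
import xarray as xr
from dask.distributed import Client

client = Client(n_workers=1)

xx = xr.Dataset({
    "red": xr.DataArray(da.zeros((100, 100), chunks=(50, 50)), dims=("y", "x")),
    "nir": xr.DataArray(da.ones((100, 100), chunks=(50, 50)), dims=("y", "x")),
})

# Persist one chunk-group at a time; prints one dot per group
yy = chunked_persist_ds(xx, client, verbose=True)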
Example #3
from toolz import partition_all
from dask.distributed import wait as dask_wait


def chunked_persist(data, n_concurrent, client, verbose=False):
    """
    Force limited concurrency when persisting a large collection.

    This is useful to control memory usage when operating close to capacity.

    Sometimes `client.persist(data)` will run out of memory, not because
    the fully-realized data is large, but because of the memory
    requirements of intermediate data. This is particularly common when
    using a local dask cluster with only one worker.

    This function forces the evaluation order of the dask graph to control
    peak memory usage.

    Say you have a largish task graph of 10x10 top-level sub-tasks and
    enough memory to process 5 sub-tasks concurrently; Dask might decide
    to schedule more than that and cause worker restarts due to
    out-of-memory errors. With this function you can force the dask
    scheduler to persist this collection in batches of 5 concurrent
    sub-tasks, keeping the computation within the memory budget.
    """
    delayed = data.to_delayed().ravel()

    persisted = []
    for chunk in partition_all(n_concurrent, delayed):
        chunk = client.persist(chunk)
        _ = dask_wait(chunk)
        persisted.extend(chunk)
        if verbose:
            print(".", end="")

    # at this point it should be almost a no-op
    return client.persist(data)
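
For example, to persist a 64-chunk array while keeping at most 8 chunks in flight (the array shape and memory limit here are illustrative, and `partition_all` is assumed to be `toolz.partition_all`):

import dask.array as da
from dask.distributed import Client

client = Client(n_workers=1, memory_limit="2GB")

data = da.random.random((8000, 8000), chunks=(1000, 1000))  # 8x8 top-level chunks
data = chunked_persist(data, n_concurrent=8, client=client, verbose=True)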
Example #4
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:

    def prep_stage(tasks: Iterable[Task],
                   proc: TaskProc) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        for task in tasks:
            path = sink.uri(task)
            if check_exists:
                if sink.exists(task):
                    yield (None, task, path)
                    continue

            ds = proc(task)
            yield (ds, task, path)

    in_flight_cogs: Set[Future] = set()
    for ds, task, path in _with_lookahead1(prep_stage(tasks, proc)):
        if ds is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(ds, xr.DataArray)
            ds = chunked_persist_da(ds, chunked_persist, client)
        else:
            ds = client.persist(ds, fifo_timeout='1ms')

        if len(in_flight_cogs):
            done, in_flight_cogs = drain(in_flight_cogs, 1.0)
            for r in done:
                yield r

        if isinstance(ds, xr.DataArray):
            attrs = ds.attrs.copy()
            ds = ds.to_dataset(dim='band')
            for dv in ds.data_vars.values():
                dv.attrs.update(attrs)

        # Kick off the COG write, then block until the persisted dataset is
        # fully realized before prepping the next task
        cog = client.compute(sink.dump(task, ds),
                             fifo_timeout='1ms')
        rr = dask_wait(ds)
        assert len(rr.not_done) == 0
        del ds, rr
        in_flight_cogs.add(cog)

    done, _ = drain(in_flight_cogs)
    for r in done:
        yield r
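
`_with_lookahead1` is not shown in this example. Presumably it pulls exactly one element ahead of the consumer, so that `prep_stage` starts loading the next task while the current one is still being persisted and written out; a minimal sketch under that assumption:

def _with_lookahead1(it):
    # Fetch one element ahead: by the time item N is handed to the
    # consumer, item N+1 has already been produced by the source
    it = iter(it)
    try:
        prev = next(it)
    except StopIteration:
        return
    for x in it:
        yield prev
        prev = x
    yield prev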
Example #5
import dask.distributed
from typing import List, Optional, Set, Tuple
from dask.distributed import Future, wait as dask_wait


def drain(futures: Set[Future],
          timeout: Optional[float] = None) -> Tuple[List[str], Set[Future]]:
    return_when = 'FIRST_COMPLETED'
    if timeout is None:
        return_when = 'ALL_COMPLETED'

    try:
        rr = dask_wait(futures, timeout=timeout, return_when=return_when)
    except dask.distributed.TimeoutError:
        return [], futures

    done: List[str] = []
    for f in rr.done:
        try:
            path, ok = f.result()
            if ok:
                done.append(path)
            else:
                print(f"Failed to write: {path}")
        except Exception as e:
            print(e)

    return done, rr.not_done
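
A usage sketch, assuming each future resolves to the `(path, ok)` tuple that `drain` expects; `client`, `tasks` and `write_one` are hypothetical:

in_flight: Set[Future] = set()
for task in tasks:
    in_flight.add(client.compute(write_one(task)))  # write_one -> (path, ok)

    # Opportunistically harvest whatever finished within one second
    done, in_flight = drain(in_flight, timeout=1.0)
    for path in done:
        print("wrote:", path)

# Block until everything still outstanding has completed
done, _ = drain(in_flight)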
Example #6
import math
import random

import numpy as np
import dask.dataframe as dd
import dask_cudf
from dask.distributed import default_client, wait as dask_wait


def dask_make_blobs(nrows,
                    ncols,
                    n_centers=8,
                    n_parts=None,
                    cluster_std=1.0,
                    center_box=(-10, 10),
                    random_state=None,
                    verbose=False):
    """
    Makes unlabeled dask.Dataframe and dask_cudf.Dataframes containing blobs
    for a randomly generated set of centroids.

    This function calls `make_blobs` from Scikitlearn on each Dask worker
    and aggregates them into a single Dask Dataframe.

    For more information on Scikit-learn's `make_blobs:
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html>`_.

    :param nrows: number of rows
    :param ncols: number of features
    :param n_centers: number of centers to generate
    :param n_parts: number of partitions to generate (this can be greater
    than the number of workers)
    :param cluster_std: how far can each generated point deviate from its
    closest centroid?
    :param center_box: the bounding box which constrains all the centroids
    :param random_state: sets random seed
    :param verbose: enables / disables verbose printing.
    :return: dask.Dataframe & dask_cudf.Dataframe
    """

    client = default_client()

    workers = list(client.has_what().keys())

    n_parts = n_parts if n_parts is not None else len(workers)

    parts = (workers * math.ceil(n_parts / len(workers)))[:n_parts]

    centers = np.random.uniform(center_box[0],
                                center_box[1],
                                size=(n_centers, ncols)).astype(np.float32)

    if verbose:
        print(
            "Generating %d samples across %d partitions on "
            "%d workers (total=%d samples)" %
            (math.ceil(nrows / len(workers)), len(parts), len(workers), nrows))

    # Create dfs on each worker (gpu)
    dfs = [
        client.submit(create_df,
                      n,
                      math.ceil(nrows / len(workers)),
                      ncols,
                      centers,
                      cluster_std,
                      random_state,
                      random.random(),
                      workers=[worker])
        # enumerate over all parts so every partition is created even when
        # n_parts > len(workers) (zipping with range(len(workers)) would truncate)
        for n, worker in enumerate(parts)
    ]
    # Wait for completion
    dask_wait(dfs)

    ddfs = [client.submit(to_cudf, df, random.random()) for df in dfs]
    # Wait for completion
    dask_wait(ddfs)

    meta_ddf = client.submit(get_meta, dfs[0]).result()
    meta_cudf = client.submit(get_meta, ddfs[0]).result()

    d_df = dd.from_delayed(dfs, meta=meta_ddf)
    d_cudf = dask_cudf.from_delayed(ddfs, meta=meta_cudf)

    return d_df, d_cudf
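
Since the function builds cudf-backed frames on each worker, it presumably runs against a GPU cluster; a hedged usage sketch assuming `dask_cuda` is available and `default_client()` picks up the active client:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

client = Client(LocalCUDACluster())

ddf, dgdf = dask_make_blobs(nrows=100_000, ncols=16,
                            n_centers=8, verbose=True)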