Example No. 1
def test_from_dask_array_unknown_chunks():
    # Series
    dx = da.Array(
        {
            ("x", 0): np.arange(5),
            ("x", 1): np.arange(5, 11)
        },
        "x",
        ((np.nan, np.nan), ),
        np.arange(1).dtype,
    )
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.Series)
    assert not df.known_divisions
    assert_eq(df, pd.Series(np.arange(11)), check_index=False)

    # DataFrame
    dsk = {
        ("x", 0, 0): np.random.random((2, 3)),
        ("x", 1, 0): np.random.random((5, 3))
    }
    dx = da.Array(dsk, "x", ((np.nan, np.nan), (3, )), np.float64)
    df = dd.from_dask_array(dx)
    assert isinstance(df, dd.DataFrame)
    assert not df.known_divisions
    assert_eq(df, pd.DataFrame(dx.compute()), check_index=False)

    # Unknown width
    dx = da.Array(dsk, "x", ((np.nan, np.nan), (np.nan, )), np.float64)
    with pytest.raises(ValueError):
        df = dd.from_dask_array(dx)
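All of the examples on this page drive the low-level ``da.Array(graph, name, chunks, dtype/meta)`` constructor directly. As a minimal sketch (not taken from any of the listed projects), the constructor also accepts a plain dict as the task graph; the ``(name, block_index)`` key pattern and the ``chunks`` tuple-of-tuples must agree:

import numpy as np
import dask.array as da

# Two blocks of 5 elements along a single dimension
name = "x"
dsk = {(name, 0): np.arange(5), (name, 1): np.arange(5, 10)}
x = da.Array(dsk, name, chunks=((5, 5),), dtype=np.arange(1).dtype)

assert x.compute().tolist() == list(range(10))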
Example No. 2
def test__dask_array_collections(s, a, b):
    import dask.array as da
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield e._scatter(x_dsk)
    y_futures = yield e._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [
        lambda x, y: x.T + y, lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None]
    ]

    for expr in exprs:
        local = expr(x_local, y_local).compute(get=dask.get)

        remote, = e.compute(expr(x_remote, y_remote))
        remote = yield remote._result()

        assert np.all(local == remote)

    yield e._shutdown()
Example No. 3
async def test_dask_array_collections(c, s, a, b):
    import dask.array as da

    s.validate = False
    x_dsk = {("x", i, j): np.random.random((3, 3)) for i in range(3) for j in range(2)}
    y_dsk = {("y", i, j): np.random.random((3, 3)) for i in range(2) for j in range(3)}
    x_futures = await c.scatter(x_dsk)
    y_futures = await c.scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, "x", ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, "y", ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, "x", ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, "y", ((3, 3), (3, 3, 3)), dt)

    exprs = [
        lambda x, y: x.T + y,
        lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None],
    ]

    for expr in exprs:
        local = expr(x_local, y_local).compute(scheduler="sync")

        remote = c.compute(expr(x_remote, y_remote))
        remote = await remote

        assert np.all(local == remote)
Example No. 4
def test__dask_array_collections(c, s, a, b):
    import dask.array as da

    x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
             for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
             for j in range(3)}
    x_futures = yield c._scatter(x_dsk)
    y_futures = yield c._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
    y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

    x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
    y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

    exprs = [lambda x, y: x.T + y,
             lambda x, y: x.mean() + y.mean(),
             lambda x, y: x.dot(y).std(axis=0),
             lambda x, y: x - x.mean(axis=1)[:, None]]

    for expr in exprs:
        local = expr(x_local, y_local).compute(scheduler='sync')

        remote = c.compute(expr(x_remote, y_remote))
        remote = yield remote

        assert np.all(local == remote)
Example No. 5
def __read_template_as_dask(dd, tcPerf):
    """
    Read template binary data and return as a dask array
    """
    t, y, x = dd.tdef.length(), dd.ydef.length(), dd.xdef.length()

    totalNum = sum([
        reduce(lambda x, y: x * y, (tcPerf[0], v.zcount, y, x))
        for v in dd.vdef
    ])

    # print(totalNum * 4.0 / 1024.0 / 1024.0)

    binData = []

    dtype = '<f4' if dd.byteOrder == 'little' else '>f4'

    for m, v in enumerate(dd.vdef):
        if totalNum > (200 * 100 * 100 * 100):  # about 800 MB, chunk 2D slice
            # print('large')
            chunk = (1, 1, y, x)
            shape = (t, v.zcount, y, x)

            dsk = {(v.name + '_@miniufo', l + sum(tcPerf[:m]), k, 0, 0):
                   (__read_var, f, v, dd.tRecLength, l, k, dtype)
                   for m, f in enumerate(dd.dsetPath[:len(tcPerf)])
                   for l in range(tcPerf[m]) for k in range(v.zcount)}

            binData.append(
                dsa.Array(dsk,
                          v.name + '_@miniufo',
                          chunk,
                          dtype=dtype,
                          shape=shape))

        else:  # in between, chunk 3D slice
            # print('between')
            chunk = (1, v.zcount, y, x)
            shape = (t, v.zcount, y, x)

            dsk = {(v.name + '_@miniufo', l + sum(tcPerf[:m]), 0, 0, 0):
                   (__read_var, f, v, dd.tRecLength, l, None, dtype)
                   for m, f in enumerate(dd.dsetPath[:len(tcPerf)])
                   for l in range(tcPerf[m])}

            binData.append(
                dsa.Array(dsk,
                          v.name + '_@miniufo',
                          chunk,
                          dtype=dtype,
                          shape=shape))

    return binData
Example No. 6
def stream_reduction(time_index, antenna1, antenna2,
                     dde1_jones, source_coh, dde2_jones,
                     predict_check_tup, out_dtype, streams):
    """
    Reduces source coherencies + ddes over the source dimension in
    ``N`` parallel streams.

    This is accomplished by calling predict_vis on ddes and source
    coherencies to produce visibilities which are passed into
    the `base_vis` argument of ``predict_vis`` for the next chunk.
    """

    # Unique name and token for this operation
    token = tokenize(time_index, antenna1, antenna2,
                     dde1_jones, source_coh, dde2_jones,
                     streams)

    name = 'stream-coherency-reduction-' + token

    # Number of dim blocks
    blocks = _extract_blocks(time_index, dde1_jones, source_coh, dde2_jones)
    (src_blocks, row_blocks, _,
     chan_blocks), corr_blocks = blocks[:4], blocks[4:]

    # Total number of other dimension blocks
    nblocks = reduce(mul, (row_blocks, chan_blocks) + corr_blocks, 1)

    # Create the compressed mapping
    layers = CoherencyStreamReduction(time_index, antenna1, antenna2,
                                      dde1_jones, source_coh, dde2_jones,
                                      name, streams)

    # Create the graph
    extra_deps = [a for a in (dde1_jones, source_coh, dde2_jones)
                  if a is not None]
    deps = [time_index, antenna1, antenna2] + extra_deps

    graph = HighLevelGraph.from_collections(name, layers, deps)

    chunks = ((1,) * src_blocks, (1,)*nblocks)
    # This should never be directly computed, reported chunks
    # and dtype don't match the actual data. We create it
    # because it makes chaining HighLevelGraphs easier
    stream_reduction = da.Array(graph, name, chunks, dtype=np.int8)

    name = "coherency-reduction-" + tokenize(stream_reduction)
    layers = CoherencyFinalReduction(name, layers)
    graph = HighLevelGraph.from_collections(name, layers, [stream_reduction])

    chunks = _extract_chunks(time_index, dde1_jones, source_coh, dde2_jones)
    return da.Array(graph, name, chunks[1:], dtype=out_dtype)
Example No. 7
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    inline_names = set(a.name for a in inline_arrays)
    layers = agraph.layers.copy()
    deps = {k: v.copy() for k, v in agraph.dependencies.items()}
    # We want to inline layers that depend on the inlined arrays
    inline_layers = set(k for k, v in deps.items()
                        if len(inline_names.intersection(v)) > 0)

    for layer_name in inline_layers:
        dsk = dict(layers[layer_name])
        layer_keys = set(dsk.keys())
        inline_keys = set()

        for array in inline_arrays:
            dsk.update(layers[array.name])
            deps.pop(array.name, None)
            deps[layer_name].discard(array.name)
            inline_keys.update(layers[array.name].keys())

        dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
        layers[layer_name], _ = cull(dsk2, layer_keys)

    # Remove layers containing the inlined arrays
    for inline_name in inline_names:
        layers.pop(inline_name)

    return da.Array(HighLevelGraph(layers, deps), a.name, a.chunks, a.dtype)
Example No. 8
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove array from layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of a"
                             % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
Example No. 9
def test_write_dict_data(tmp_path, chunks, dtype):
    rs = np.random.RandomState(42)
    row_sum = 0

    def _vis_factory(chan, corr):
        # Variably sized-channels per row, as in BDA data
        nchan = rs.randint(chan)
        return (rs.normal(size=(1, nchan, corr)) +
                rs.normal(size=(1, nchan, corr))*1j)

    shapes = {k: sum(c) for k, c in chunks.items()}
    row_sum += shapes['row']

    # assert len(chunks['chan']) == 1
    assert len(chunks['corr']) == 1

    # Make some visibilities
    dims = ("row", "chan", "corr")
    row, chan, corr = (shapes[d] for d in dims)
    name = "vis-data-" + uuid.uuid4().hex

    nchunks = (len(chunks[d]) for d in dims)
    keys = product((name,), *(range(c) for c in nchunks))
    chunk_sizes = product(*(chunks[d] for d in dims))

    layer = {k: {'r%d' % (i + 1): _vis_factory(chan, corr)
                 for i in range(r)}
             for k, (r, _, _) in zip(keys, chunk_sizes)}

    hlg = HighLevelGraph.from_collections(name, layer, [])
    chunks = tuple(chunks[d] for d in dims)
    meta = np.empty((0,)*len(chunks), dtype=np.complex128)
    vis = da.Array(hlg, name, chunks, meta=meta)
    ds = Dataset({"DATA": (dims, vis)})

    table_name = os.path.join(str(tmp_path), 'test.table')
    writes, table_proxy = write_datasets(table_name, ds, ["DATA"],
                                         table_proxy=True,
                                         # No fixed shape columns
                                         descriptor="ms(False)")

    dask.compute(writes)

    data = table_proxy.getvarcol("DATA").result()

    # First row chunk
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r1'], data['r1'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r2'], data['r2'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r3'], data['r3'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r4'], data['r4'])
    assert_array_almost_equal(layer[(name, 0, 0, 0)]['r5'], data['r5'])

    # Second row chunk
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r1'], data['r6'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r2'], data['r7'])
    assert_array_almost_equal(layer[(name, 1, 0, 0)]['r3'], data['r8'])

    # Third row chunk
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r1'], data['r9'])
    assert_array_almost_equal(layer[(name, 2, 0, 0)]['r2'], data['r10'])
Example No. 10
def _da_from_mem(
    token: Delayed,
    shape: ShapeLike,
    dtype: DtypeLike,
    chunks: Tuple[int, ...],
    name: str = "from_mem",
) -> da.Array:
    """
    Construct a dask view of a yet-to-be-computed in-RAM store.

    :param token: Should evaluate to either a Token or a string key into the Cache,
                  which is expected to contain a ``numpy`` array of the supplied
                  ``shape`` and ``dtype``

    :param shape: Expected shape of the future array

    :param dtype: Expected dtype of the future array

    :param chunks: Tuple of integers describing chunk partitioning for output array

    :param name: Dask name

    Gotchas
    =======

    - The output array cannot be moved from one worker to another.
      - Works with in-process Client
      - Works with single worker cluster
      - Can work if scheduler is told to schedule this on a single worker

    - Cache life-cycle management can be tough. If token evaluates to a
      ``Token`` object then automatic cache cleanup should happen when the
      output array is destroyed. If it is just a string, then it is up to the
      caller to ensure that cleanup happens and there is no use after free.

    Returns
    =======
    Dask Array
    """
    if not isinstance(shape, tuple):
        shape = (shape, )

    assert dask.is_dask_collection(token)
    assert len(shape) == len(chunks)

    _chunks = unpack_chunks(chunks, shape)
    _rois = [tuple(_roi_from_chunks(ch)) for ch in _chunks]
    _roi = lambda idx: tuple(_rois[i][k] for i, k in enumerate(idx))

    shape_in_chunks = tuple(len(ch) for ch in _chunks)

    dsk = {}
    name = randomize(name)

    for idx in np.ndindex(shape_in_chunks):
        dsk[(name, *idx)] = (_chunk_extractor, token.key, _roi(idx))

    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[token])

    return da.Array(dsk, name, shape=shape, dtype=dtype, chunks=_chunks)
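The ``unpack_chunks`` helper used above is not shown here; presumably it expands a per-dimension chunk size into the explicit tuple-of-tuples form that ``da.Array`` expects. dask ships a comparable utility, ``normalize_chunks``, sketched below:

from dask.array.core import normalize_chunks

# Expand one chunk size per dimension into explicit block sizes for a given shape
print(normalize_chunks((2, 3), shape=(5, 9)))
# ((2, 2, 1), (3, 3, 3))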
Example No. 11
def test_gh_4176():
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        from dask.sharedict import ShareDict

    def foo(A):
        return A[None, ...]

    A = da.ones(shape=(10, 20, 4), chunks=(2, 5, 4))

    name = 'D'

    dsk = blockwise(foo,
                    name, ("nsrc", "ntime", "nbl", "npol"),
                    A.name, ("ntime", "nbl", "npol"),
                    new_axes={"nsrc": 1},
                    numblocks={a.name: a.numblocks
                               for a in (A, )})

    array_dsk = ShareDict()
    array_dsk.update(dsk)
    array_dsk.update(A.__dask_graph__())

    chunks = ((1, ), ) + A.chunks

    D = da.Array(array_dsk, name, chunks, dtype=A.dtype)
    D.sum(axis=0).compute()
Example No. 12
    def get_dask_array(self, array_name, chunks, dtype, offset=()):
        """Get dask array from the store.

        Any missing chunks are replaced with zeros, suppressing any
        :exc:`ChunkNotFound` errors.

        Parameters
        ----------
        array_name : string
            Identifier of array in chunk store
        chunks : tuple of tuples of ints
            Chunk specification
        dtype : :class:`numpy.dtype` object or equivalent
            Data type of array
        offset : tuple of int, optional
            Offset to add to each dimension when addressing chunks in store

        Returns
        -------
        array : :class:`dask.array.Array` object
            Dask array of given dtype
        """
        getter = functools.partial(self.get_chunk_or_zeros, dtype=dtype)
        if offset:
            getter = _add_offset_to_slices(getter, offset)
        # Use dask utility function that forms the core of da.from_array
        dask_graph = da.core.getem(array_name, chunks, getter)
        return da.Array(dask_graph, array_name, chunks, dtype)
Example No. 13
def build_array(bag, n_features, meta):
    name = "from-bag-" + bag.name
    layer = {(name, i, 0): (k, i) for k, i in bag.__dask_keys__()}
    dsk = dask.highlevelgraph.HighLevelGraph.from_collections(
        name, layer, dependencies=[bag])
    chunks = ((np.nan, ) * bag.npartitions, (n_features, ))
    return da.Array(dsk, name, chunks, meta=meta)
Example No. 14
def _create_window_dask(name,
                        ntime,
                        nchan,
                        nbl,
                        ncorr,
                        token,
                        dtype,
                        default=0,
                        backend="numpy",
                        path=None):
    if backend == "zarr-disk" and path is None:
        path = mkdtemp(prefix='-'.join(('tricolour', name, 'windows', '')))

    # Include name and token in new token
    token = dask.base.tokenize(name, ntime, nchan, nbl, ncorr, token, dtype,
                               default, backend, path)

    collection_name = '-'.join(("create", name, "windows", token))
    layers = {
        (collection_name, 0): (_create_window, name, ntime, nchan, nbl, ncorr,
                               dtype, default, token, backend, path)
    }

    graph = HighLevelGraph.from_collections(collection_name, layers, ())
    chunks = ((0, ), )  # One chunk containing single zarr array object
    return da.Array(graph, collection_name, chunks, dtype=np.object)
Example No. 15
def chan_metadata(row_chan_arrays, chan_arrays, chan_bin_size):
    """ Create dask array with channel metadata for each chunk channel """
    chan_chunks = None

    for array in row_chan_arrays:
        if array is not None:
            chan_chunks = array.chunks[1]
            break

    if chan_chunks is None:
        for array in chan_arrays:
            if array is not None:
                chan_chunks = array.chunks[0]
                break

    if chan_chunks is None:
        return None

    # Create a dask channel mapping structure
    name = "channel-mapper-" + tokenize(chan_chunks, chan_bin_size)
    layers = {(name, i): (np_channel_mapper, c, chan_bin_size)
              for i, c in enumerate(chan_chunks)}
    graph = HighLevelGraph.from_collections(name, layers, ())
    chunks = (chan_chunks, )
    chan_mapper = da.Array(graph, name, chunks, dtype=np.object)

    return chan_mapper
Example No. 16
    def download_dask_array(self, object_name, dask_name='array'):
        """Downloads a split matrix as a ``dask.array.Array`` object

        This uses the stored object metadata to reconstruct the full
        n-dimensional array uploaded using ``upload_dask_array``.

        Examples
        --------
        >>> s3_response = cci.upload_dask_array('test_dim', arr, axis=-1)
        >>> dask_object = cci.download_dask_array('test_dim')
        >>> dask_object
        dask.array<array, shape=(100, 600, 1000), dtype=float64, chunksize=(100, 600, 100)>
        >>> dask_slice = dask_object[..., :200]
        >>> dask_slice
        dask.array<getitem..., shape=(100, 600, 1000), dtype=float64, chunksize=(100, 600, 100)>
        >>> downloaded_data = np.asarray(dask_slice) # this downloads the array
        >>> downloaded_data.shape
        (100, 600, 200)
        """
        from dask import array as da

        metadata = self.download_json(self.pathjoin(object_name, 'metadata.json'))
        chunks = metadata['chunks']
        shape = metadata['shape']
        dtype = np.dtype(metadata['dtype'])

        dask = {(dask_name,) + tuple(shape): (self.download_raw_array, part_name) \
                for shape, part_name in metadata['dask']}

        return da.Array(dask, dask_name, chunks, shape = shape, dtype = dtype)
Example No. 17
        def wrapped(shape, *args, **kwargs):
            if isinstance(shape, collections.abc.Iterable):
                shape = tuple(int(s) for s in shape)
            else:
                shape = (int(shape), )

            # Estimate 100 Mi elements per block
            blocksize = int((100 * (2**20))**(1 / len(shape)))

            chunks = []
            for l in shape:
                chunks.append([])
                while l > 0:
                    s = max(min(blocksize, l), 0)
                    chunks[-1].append(s)
                    l -= s

            name = func.__name__ + "-" + hex(random.randrange(2**64))
            dsk = {}
            with set_backend(self._inner):
                for chunk_id in itertools.product(
                        *map(lambda x: range(len(x)), chunks)):
                    shape = tuple(chunks[i][j] for i, j in enumerate(chunk_id))
                    dsk[(name, ) + chunk_id] = func(shape, *args, **kwargs)

                meta = func(tuple(0 for _ in shape), *args, **kwargs)
                dtype = str(meta.dtype)

            return da.Array(dsk, name, chunks, dtype=dtype, meta=meta)
Example No. 18
def cascaded_compute(callback, arrays, optimize=True):
    """Dask helper function for iterating over computed dask arrays.

    Args:
        callback (callable): Called with a single numpy array computed from
                             the provided dask arrays.
        arrays (list, tuple): Dask arrays to pass to callback.
        optimize (bool): Whether to try to optimize the dask graphs of the
                         provided arrays.

    Returns: `dask.Delayed` object to be computed

    """
    if optimize:
        # optimize Dask graph over all objects
        dsk = da.Array.__dask_optimize__(
            # combine all Dask Array graphs
            dask.sharedict.merge(*[e.__dask_graph__() for e in arrays]),
            # get Dask Array keys in result
            list(dask.core.flatten([e.__dask_keys__() for e in arrays]))
        )
        # rebuild Dask Arrays
        arrays = [da.Array(dsk, e.name, e.chunks, e.dtype) for e in arrays]

    def _callback_wrapper(arr, cb=callback, previous_call=None):
        del previous_call  # used only for task ordering
        return cb(arr)

    current_write = None
    for dask_arr in arrays:
        current_write = dask.delayed(_callback_wrapper)(
            dask_arr, previous_call=current_write)
    return current_write
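A minimal usage sketch for the helper above, assuming it is importable from the surrounding module; ``save`` is a hypothetical callback standing in for a file writer. The returned ``dask.Delayed`` chains the callback calls so the arrays are handled one after another:

import dask
import dask.array as da

def save(arr):
    # hypothetical stand-in for writing a computed array to disk
    print(arr.shape)

arrays = [da.zeros((4, 4), chunks=2), da.ones((4, 4), chunks=2)]
final = cascaded_compute(save, arrays, optimize=False)
dask.compute(final)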
Example No. 19
    def read_band_blocks(self, blocksize=CHUNK_SIZE):
        """Read the band in native blocks."""
        # For Sentinel-1 data, the blocks are one line each, and dask seems to choke on that.
        band = self.filehandle

        shape = band.shape
        token = tokenize(blocksize, band)
        name = 'read_band-' + token
        dskx = dict()
        if len(band.block_shapes) != 1:
            raise NotImplementedError(
                'Bands with multiple shapes not supported.')
        else:
            chunks = band.block_shapes[0]

        def do_read(the_band, the_window, the_lock):
            with the_lock:
                return the_band.read(1, None, window=the_window)

        for ji, window in band.block_windows(1):
            dskx[(name, ) + ji] = (do_read, band, window, self.read_lock)

        res = da.Array(dskx,
                       name,
                       shape=list(shape),
                       chunks=chunks,
                       dtype=band.dtypes[0])
        return DataArray(res, dims=('y', 'x'))
Example No. 20
def write_blocks(source, target, region: Optional[Tuple[slice, ...]]) -> da.Array:
    """
    Return a dask array where each chunk contains the result of writing
    the corresponding chunk of `source` to `target`.
    """

    slices = slices_from_chunks(source.chunks)
    if region:
        slices = [fuse_slice(region, slc) for slc in slices]

    source_name = 'store-source-' + tokenize(source)
    store_name = 'store-' + tokenize(source)
    
    layers = {source_name: source.__dask_graph__()}
    deps = {source_name: set()}
    
    dsk = {}
    chunks = tuple((1,) * s for s in source.blocks.shape)
    
    for slice, key in zip(slices, flatten(source.__dask_keys__())):
        dsk[(store_name,) + key[1:]] = (ndwrapper, store_chunk, source.ndim, key, target, slice)
    
    layers[store_name] = dsk
    deps[store_name] = {source_name}
    store_dsk = HighLevelGraph(layers, deps)
    
    return da.Array(store_dsk,
                    store_name,
                    shape=source.blocks.shape,
                    chunks=chunks,
                    dtype=int)
Example No. 21
def concatenate_row_chunks(array, group_every=1000):
    """
    When averaging, the output arrays are substantially smaller, which
    can affect disk I/O since many small operations are submitted.
    This operation concatenates row chunks together so that more rows
    are submitted at once.
    """

    # Single chunk already
    if len(array.chunks[0]) == 1:
        return array

    data = partial_reduce(np.concatenate,
                          array,
                          split_every={0: group_every},
                          reduced_meta=None,
                          keepdims=True)

    # NOTE(sjperkins)
    # partial_reduce sets the number of rows in each chunk
    # to 1, which is untrue. Correctly set the row chunks to nan,
    # steal the graph and recreate the array
    row_chunks = tuple(np.nan for _ in data.chunks[0])
    chunks = (row_chunks, ) + data.chunks[1:]
    graph = data.__dask_graph__()

    return da.Array(graph, data.name, chunks, dtype=data.dtype)
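The NOTE above rebuilds the array with NaN row chunks to mark the row counts as unknown. A small self-contained sketch of that rebuild step, using a toy array rather than the averaged data:

import math
import dask.array as da

x = da.ones((10, 4), chunks=(5, 4))

# Steal the graph and re-declare the row chunks as unknown (NaN)
row_chunks = tuple(float("nan") for _ in x.chunks[0])
y = da.Array(x.__dask_graph__(), x.name, (row_chunks,) + x.chunks[1:], dtype=x.dtype)

assert all(math.isnan(c) for c in y.chunks[0])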
Example No. 22
    def _imread(filenames, imread=None, preprocess=None):
        """
        modified dask imread method, accepts list of file names instead of glob string
        """

        def add_leading_dimension(x):
            return x[None, ...]

        imread = imread or sk_imread
        name = 'imread-%s' % tokenize(filenames, map(os.path.getmtime, filenames))

        sample = imread(filenames[0])
        if preprocess:
            sample = preprocess(sample)

        keys = [(name, i) + (0,) * len(sample.shape) for i in range(len(filenames))]
        if preprocess:
            values = [(add_leading_dimension, (preprocess, (imread, fn)))
                      for fn in filenames]
        else:
            values = [(add_leading_dimension, (imread, fn))
                      for fn in filenames]
        dsk = dict(zip(keys, values))

        chunks = ((1,) * len(filenames),) + tuple((d,) for d in sample.shape)

        return da.Array(dsk, name, chunks, sample.dtype)
Example No. 23
    def _run_fornav_single(self, data, out_chunks, target_geo_def, fill_value,
                           **kwargs):
        ll2cr_result = self.cache['ll2cr_result']
        ll2cr_blocks = self.cache['ll2cr_blocks'].items()
        ll2cr_numblocks = ll2cr_result.shape if isinstance(
            ll2cr_result, np.ndarray) else ll2cr_result.numblocks
        fornav_task_name = f"fornav-{data.name}-{ll2cr_result.name}"
        maximum_weight_mode = kwargs.setdefault('maximum_weight_mode', False)
        weight_sum_min = kwargs.setdefault('weight_sum_min', -1.0)
        output_stack = self._generate_fornav_dask_tasks(
            out_chunks, ll2cr_blocks, fornav_task_name, data.name,
            target_geo_def, fill_value, kwargs)

        dsk_graph = HighLevelGraph.from_collections(
            fornav_task_name, output_stack, dependencies=[data, ll2cr_result])
        stack_chunks = (
            (1, ) * (ll2cr_numblocks[0] * ll2cr_numblocks[1]), ) + out_chunks
        out_stack = da.Array(dsk_graph, fornav_task_name, stack_chunks,
                             data.dtype)
        combine_fornav_with_kwargs = partial(
            _combine_fornav, maximum_weight_mode=maximum_weight_mode)
        average_fornav_with_kwargs = partial(
            _average_fornav,
            maximum_weight_mode=maximum_weight_mode,
            weight_sum_min=weight_sum_min,
            dtype=data.dtype,
            fill_value=fill_value)
        out = da.reduction(out_stack,
                           _chunk_callable,
                           average_fornav_with_kwargs,
                           combine=combine_fornav_with_kwargs,
                           axis=(0, ),
                           dtype=data.dtype,
                           concatenate=False)
        return out
Example No. 24
    def _dask_array(self, nfacet, varname, iters, klevels, k_chunksize):
        # return a dask array for a single facet
        facet_shape = _facet_shape(nfacet, self.nx)
        time_chunks = (len(iters) * (1, ), ) if iters is not None else ()
        k_chunks = (tuple([len(c) for c in _chunks(klevels, k_chunksize)]), )
        chunks = time_chunks + k_chunks + tuple([(s, ) for s in facet_shape])

        # manually build dask graph
        dsk = {}
        token = tokenize(varname, self.store, nfacet)
        name = '-'.join([varname, token])

        # iters == None for grid variables
        if iters is not None:
            for n_iter, iternum in enumerate(iters):
                for n_k, these_klevels in enumerate(
                        _chunks(klevels, k_chunksize)):
                    key = name, n_iter, n_k, 0, 0, 0
                    task = (_get_facet_chunk, self.store, varname, iternum,
                            nfacet, these_klevels, self.nx, self.nz,
                            self.dtype, self.mask_override)
                    dsk[key] = task
        else:
            for n_k, these_klevels in enumerate(_chunks(klevels, k_chunksize)):
                key = name, n_k, 0, 0, 0
                task = (_get_facet_chunk, self.store, varname, None, nfacet,
                        these_klevels, self.nx, self.nz, self.dtype,
                        self.mask_override)
                dsk[key] = task

        return dsa.Array(dsk, name, chunks, self.dtype)
Example No. 25
def _make_dask_array(sources,
                     geobox,
                     measurement,
                     skip_broken_datasets=False,
                     fuse_func=None,
                     dask_chunks=None):
    dsk_name = 'datacube_' + measurement['name']

    irr_chunks, grid_chunks = _calculate_chunk_sizes(sources, geobox,
                                                     dask_chunks)
    sliced_irr_chunks = (1, ) * sources.ndim

    dsk = {}
    geobox_subsets = _chunk_geobox(geobox, grid_chunks)

    for irr_index, datasets in numpy.ndenumerate(sources.values):
        for grid_index, subset_geobox in geobox_subsets.items():
            dsk[(dsk_name, ) + irr_index +
                grid_index] = (fuse_lazy, datasets, subset_geobox, measurement,
                               skip_broken_datasets, fuse_func, sources.ndim)

    data = da.Array(dsk,
                    dsk_name,
                    chunks=(sliced_irr_chunks + grid_chunks),
                    dtype=measurement['dtype'],
                    shape=(sources.shape + geobox.shape))

    if irr_chunks != sliced_irr_chunks:
        data = data.rechunk(chunks=(irr_chunks + grid_chunks))
    return data
Example No. 26
def _rechunk_2x2(xx, name="2x2"):
    """
    this is for testing only, ignore it, it's not robust
    """
    assert xx.ndim == 2
    name = randomize(name)
    ny, nx = (len(ch) // 2 for ch in xx.chunks[:2])

    dsk = {}
    chunks = _chunk_getter(xx)

    for r, c in np.ndindex((ny, nx)):
        r2 = r * 2
        c2 = c * 2
        ch_idx = np.s_[r2:r2 + 2, c2:c2 + 2]
        _xx = chunks(ch_idx)
        dsk[(name, r, c)] = (_stack_2d_np, (2, 2), *_xx)

    chy = tuple(xx.chunks[0][i * 2] + xx.chunks[0][i * 2 + 1]
                for i in range(ny))
    chx = tuple(xx.chunks[1][i * 2] + xx.chunks[1][i * 2 + 1]
                for i in range(nx))

    chunks = (chy, chx)
    dsk = HighLevelGraph.from_collections(name, dsk, dependencies=(xx, ))

    return da.Array(dsk, name, chunks=chunks, dtype=xx.dtype, shape=xx.shape)
Example No. 27
    def _get_solar_flux_old(self, band):
        # TODO: this could be replaced with vectorized indexing in the future.
        from dask.base import tokenize
        blocksize = CHUNK_SIZE

        solar_flux = self.cal['solar_flux'].isel(bands=band).values
        d_index = self.cal['detector_index'].fillna(0).astype(int)

        shape = d_index.shape
        vchunks = range(0, shape[0], blocksize)
        hchunks = range(0, shape[1], blocksize)

        token = tokenize(band, d_index, solar_flux)
        name = 'solar_flux_' + token

        def get_items(array, slices):
            return solar_flux[d_index[slices].values]

        dsk = {(name, i, j): (get_items,
                              d_index,
                              (slice(vcs, min(vcs + blocksize, shape[0])),
                               slice(hcs, min(hcs + blocksize, shape[1]))))
               for i, vcs in enumerate(vchunks)
               for j, hcs in enumerate(hchunks)
               }

        res = da.Array(dsk, name, shape=shape,
                       chunks=(blocksize, blocksize),
                       dtype=solar_flux.dtype)
        return res
Example No. 28
def _make_dask_array(sources, geobox, measurement,
                     skip_broken_datasets=False,
                     fuse_func=None,
                     dask_chunks=None):
    dsk_name = 'datacube_load_{name}-{token}'.format(name=measurement['name'], token=uuid.uuid4().hex)

    irr_chunks, grid_chunks = _calculate_chunk_sizes(sources, geobox, dask_chunks)
    sliced_irr_chunks = (1,) * sources.ndim

    dsk = {}
    geobox_subsets = _chunk_geobox(geobox, grid_chunks)

    for irr_index, datasets in numpy.ndenumerate(sources.values):
        for dataset in datasets:
            ds_token = _tokenize_dataset(dataset)
            dsk[ds_token] = dataset

        for grid_index, subset_geobox in geobox_subsets.items():
            dataset_keys = [_tokenize_dataset(d) for d in
                            select_datasets_inside_polygon(datasets, subset_geobox.extent)]
            dsk[(dsk_name,) + irr_index + grid_index] = (fuse_lazy,
                                                         dataset_keys, subset_geobox, measurement,
                                                         skip_broken_datasets, fuse_func,
                                                         sources.ndim)

    data = da.Array(dsk, dsk_name,
                    chunks=(sliced_irr_chunks + grid_chunks),
                    dtype=measurement['dtype'],
                    shape=(sources.shape + geobox.shape))

    if irr_chunks != sliced_irr_chunks:
        data = data.rechunk(chunks=(irr_chunks + grid_chunks))
    return data
Example No. 29
def interpolate_xarray(xpoints,
                       ypoints,
                       values,
                       shape,
                       kind='cubic',
                       blocksize=CHUNK_SIZE):
    """Interpolate, generating a dask array."""
    vchunks = range(0, shape[0], blocksize)
    hchunks = range(0, shape[1], blocksize)

    token = tokenize(blocksize, xpoints, ypoints, values, kind, shape)
    name = 'interpolate-' + token

    from scipy.interpolate import interp2d
    interpolator = interp2d(xpoints, ypoints, values, kind=kind)

    dskx = {(name, i, j):
            (interpolate_slice, slice(vcs, min(vcs + blocksize, shape[0])),
             slice(hcs, min(hcs + blocksize, shape[1])), interpolator)
            for i, vcs in enumerate(vchunks) for j, hcs in enumerate(hchunks)}

    res = da.Array(dskx,
                   name,
                   shape=list(shape),
                   chunks=(blocksize, blocksize),
                   dtype=values.dtype)
    return DataArray(res, dims=('y', 'x'))
Example No. 30
    def as_daskarray(self):
        return da.Array(
            self.dask,
            self.name,
            self.chunks,
            self.dtype,
            self.shape)