Example #1
    def _slice_padded(self, _bounds):
        pads = (max(-_bounds[0], 0), max(-_bounds[1], 0),
                max(_bounds[2]-self.shape[2], 0), max(_bounds[3]-self.shape[1], 0))
        bounds = (max(_bounds[0], 0),
                  max(_bounds[1], 0),
                  max(min(_bounds[2], self.shape[2]), 0),
                  max(min(_bounds[3], self.shape[1]), 0))
        result = self[:, bounds[1]:bounds[3], bounds[0]:bounds[2]]
        if pads[0] > 0:
            dims = (result.shape[0], result.shape[1], pads[0])
            result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                     result], axis=2)
        if pads[2] > 0:
            dims = (result.shape[0], result.shape[1], pads[2])
            result = da.concatenate([result,
                                     da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=2)
        if pads[1] > 0:
            dims = (result.shape[0], pads[1], result.shape[2])
            result = da.concatenate([da.zeros(dims, chunks=dims, dtype=result.dtype),
                                     result], axis=1)
        if pads[3] > 0:
            dims = (result.shape[0], pads[3], result.shape[2])
            result = da.concatenate([result,
                                     da.zeros(dims, chunks=dims, dtype=result.dtype)], axis=1)

        return (result, _bounds[0], _bounds[1])
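Below is a minimal standalone sketch of the zero-padding pattern used in _slice_padded above, with a made-up (band, y, x) array and pad width rather than the original class:

import dask.array as da

arr = da.ones((3, 4, 5), chunks=(3, 2, 2))    # hypothetical (band, y, x) array
pad_left = 2                                  # assumed pad width on the x axis
zeros = da.zeros((arr.shape[0], arr.shape[1], pad_left),
                 chunks=(arr.shape[0], arr.shape[1], pad_left),
                 dtype=arr.dtype)
# prepend a zero block along the x axis, as the pads[0] > 0 branch does
padded = da.concatenate([zeros, arr], axis=2)
assert padded.shape == (3, 4, 7)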
Example #2
def test_mixed_concatenate(func):
    x = da.random.random((2, 3, 4), chunks=(1, 2, 2))
    y = da.random.random((2, 3, 4), chunks=(1, 2, 2))

    y[y < 0.4] = 0
    yy = da.ma.masked_equal(y, 0)

    d = da.concatenate([x, y], axis=0)
    s = da.concatenate([x, yy], axis=0)

    dd = func(d)
    ss = func(s)
    assert_eq(dd, ss)
Example #3
def test_mixed_concatenate(func):
    x = da.random.random((2, 3, 4), chunks=(1, 2, 2))

    y = da.random.random((2, 3, 4), chunks=(1, 2, 2))
    y[y < 0.8] = 0
    yy = y.map_blocks(sparse.COO.from_numpy)

    d = da.concatenate([x, y], axis=0)
    s = da.concatenate([x, yy], axis=0)

    dd = func(d)
    ss = func(s)

    assert_eq(dd, ss)
Example #4
def est_sh_part(varr, max_sh, npart, local):
    if varr.shape[0] <= 1:
        return varr.squeeze(), np.array([[0, 0]])
    idx_spt = np.array_split(np.arange(varr.shape[0]), npart)
    fm_ls, sh_ls = [], []
    for idx in idx_spt:
        if len(idx) > 0:
            fm, sh = est_sh_part(varr[idx, :, :], max_sh, npart, local)
            fm_ls.append(fm)
            sh_ls.append(sh)
    mid = int(len(sh_ls) / 2)
    sh_add_ls = [np.array([0, 0])] * len(sh_ls)
    for i, fm in enumerate(fm_ls):
        if i < mid:
            temp = fm_ls[i + 1]
            sh_idx = np.arange(i + 1)
        elif i > mid:
            temp = fm_ls[i - 1]
            sh_idx = np.arange(i, len(sh_ls))
        else:
            continue
        sh_add = darr.from_delayed(
            delayed(match_temp)(fm, temp, max_sh, local), (2,), float
        )
        for j in sh_idx:
            sh_ls[j] = sh_ls[j] + sh_add.reshape((1, -1))
            sh_add_ls[j] = sh_add_ls[j] + sh_add
    for i, (fm, sh) in enumerate(zip(fm_ls, sh_add_ls)):
        fm_ls[i] = darr.nan_to_num(
            darr.from_delayed(delayed(shift_perframe)(fm, sh), fm.shape, fm.dtype)
        )
    sh_ret = darr.concatenate(sh_ls)
    fm_ret = darr.stack(fm_ls)
    return fm_ret.max(axis=0), sh_ret
Example #5
def euclidean(XA, XB):
    """Returns the distance between points using
       Euclidean distance (2-norm) as the distance metric between the
       points.

       Find the Euclidean distances between four 2-D coordinates:
        >>> coords = [(35.0456, -85.2672),
        ...           (35.1174, -89.9711),
        ...           (35.9728, -83.9422),
        ...           (36.1667, -86.7833)]
        >>> euclidean(coords, coords)
        array([[ 0.    ,  4.7044,  1.6172,  1.8856],
            [ 4.7044,  0.    ,  6.0893,  3.3561],
            [ 1.6172,  6.0893,  0.    ,  2.8477],
            [ 1.8856,  3.3561,  2.8477,  0.    ]])

    """
    mA = (XA.shape)[0]
    mB = (XB.shape)[0]

    distances = []

    for i in range(0, mA):
        dm = np.zeros(shape=(1, mB), dtype=np.double)
        for j in range(0, mB):
            XA_XB = XA[i, :] - XB[j, :]
            dm[0, j] = da.sqrt(da.dot(XA_XB, XA_XB))

        distances.append(
            # integer chunk size, guarded against a zero result on many-core machines
            da.from_array(dm, chunks=max(1, (mA + mB) // multiprocessing.cpu_count())))

    return da.concatenate(distances, axis=0)
Example #6
def f_oneway(*args):
    # args = [np.asarray(arg, dtype=float) for arg in args]
    # ANOVA on N groups, each in its own array
    num_groups = len(args)
    alldata = da.concatenate(args)
    bign = len(alldata)

    # Determine the mean of the data, and subtract that from all inputs to a
    # variance (via sum_of_sq / sq_of_sum) calculation.  Variance is invariant
    # to a shift in location, and centering all data around zero vastly
    # improves numerical stability.
    offset = alldata.mean()
    alldata -= offset

    sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / float(bign))
    ssbn = 0
    for a in args:
        ssbn += _square_of_sums(a - offset) / float(len(a))

    # Naming: variables ending in bn/b are for "between treatments", wn/w are
    # for "within treatments"
    ssbn -= (_square_of_sums(alldata) / float(bign))
    sswn = sstot - ssbn
    dfbn = num_groups - 1
    dfwn = bign - num_groups
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw

    prob = _fdtrc(dfbn, dfwn, f)   # equivalent to stats.f.sf

    return delayed(F_onewayResult, nout=2)(f, prob)
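The helpers _sum_of_squares, _square_of_sums, _fdtrc and F_onewayResult are not shown in this snippet. A hedged sketch of the two sum helpers, assuming they follow the usual scipy.stats meaning (these definitions are an assumption, not the project's own code):

def _sum_of_squares(a, axis=0):
    # sum of the squared elements along the given axis
    return (a * a).sum(axis=axis)


def _square_of_sums(a, axis=0):
    # square of the element sum along the given axis
    s = a.sum(axis=axis)
    return s * s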
Example #7
 def get_lazy_well() -> da.Array:
     lazy_rows = []
     # For level 0, return whole image for each tile
     for row in range(row_count):
         lazy_row: List[da.Array] = []
         for col in range(column_count):
             tile_name = f"{row},{col}"
             LOGGER.debug(f"creating lazy_reader. row:{row} col:{col}")
             lazy_tile = da.from_delayed(
                 lazy_reader(tile_name),
                 shape=self.img_shape,
                 dtype=self.numpy_type,
             )
             lazy_row.append(lazy_tile)
         lazy_rows.append(da.concatenate(lazy_row, axis=4))
     return da.concatenate(lazy_rows, axis=3)
Example #8
    def get_array(self, key):
        """Get all data from file for the given BUFR key."""
        with open(self.filename, "rb") as fh:
            msgCount = 0
            while True:
                bufr = ec.codes_bufr_new_from_file(fh)
                if bufr is None:
                    break

                ec.codes_set(bufr, 'unpack', 1)

                # if it is the first message, initialise our final array
                if (msgCount == 0):
                    arr = da.from_array(ec.codes_get_array(bufr, key, float),
                                        chunks=CHUNK_SIZE)
                else:
                    tmpArr = da.from_array(ec.codes_get_array(
                        bufr, key, float),
                                           chunks=CHUNK_SIZE)
                    arr = da.concatenate((arr, tmpArr))

                msgCount = msgCount + 1
                ec.codes_release(bufr)

        if arr.size == 1:
            arr = arr[0]

        return arr
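A possible variant of the loop above, sketched with stand-in message data: appending each per-message array to a list and calling da.concatenate once keeps the task graph shallow instead of growing it on every iteration (CHUNK_SIZE is assumed here).

import numpy as np
import dask.array as da

CHUNK_SIZE = 4096  # assumed; the reader above imports its own value

pieces = []
for msg_values in ([1.0, 2.0], [3.0], [4.0, 5.0, 6.0]):  # stand-in messages
    pieces.append(da.from_array(np.array(msg_values), chunks=CHUNK_SIZE))
arr = da.concatenate(pieces)
assert arr.shape == (6,)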
Example #9
def test_bag_array_conversion():
    import dask.bag as db
    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10, ), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)
Example #10
def test_bag_array_conversion():
    import dask.bag as db
    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10,), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)
Example #11
def test_cupy_sparse_concatenate(axis):
    pytest.importorskip("cupyx")

    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    meta = cupyx.scipy.sparse.csr_matrix((0, 0))

    xs = []
    ys = []
    for i in range(2):
        x = rs.random((1000, 10), chunks=(100, 10))
        x[x < 0.9] = 0
        xs.append(x)
        ys.append(x.map_blocks(cupyx.scipy.sparse.csr_matrix, meta=meta))

    z = da.concatenate(ys, axis=axis)
    z = z.compute()

    if axis == 0:
        sp_concatenate = cupyx.scipy.sparse.vstack
    elif axis == 1:
        sp_concatenate = cupyx.scipy.sparse.hstack
    z_expected = sp_concatenate(
        [cupyx.scipy.sparse.csr_matrix(e.compute()) for e in xs])

    assert (z.toarray() == z_expected.toarray()).all()
Example #12
    def transform(self, raw_X):
        msg = "'X' should be a 1-dimensional array with length 'num_samples'."

        if not dask.is_dask_collection(raw_X):
            return self._hasher(**self.get_params()).transform(raw_X)

        if isinstance(raw_X, db.Bag):
            bag2 = raw_X.map_partitions(self._transformer)
            objs = bag2.to_delayed()
            arrs = [
                da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
                for obj in objs
            ]
            result = da.concatenate(arrs, axis=0)
        elif isinstance(raw_X, dd.Series):
            result = raw_X.map_partitions(self._transformer)
        elif isinstance(raw_X, da.Array):
            # dask.Array
            chunks = ((np.nan, ) * raw_X.numblocks[0], (self.n_features, ))
            if raw_X.ndim == 1:
                result = raw_X.map_blocks(self._transformer,
                                          dtype="f8",
                                          chunks=chunks,
                                          new_axis=1)
            else:
                raise ValueError(msg)
        else:
            raise ValueError(msg)

        meta = scipy.sparse.eye(0, format="csr")
        result._meta = meta
        return result
Example #13
def compute_center_of_geometry(traj):
    """Daskified version of mdtraj.geometry.compute_center_of_geometry

    This mimics :py:func:`mdtraj.compute_center_of_geometry` but returns the
    answer as a :py:class:`dask.array` object

    Parameters
    ----------
    traj : :py:class:`dask_traj.Trajectory`
        The trajectory to compute the center of geometry for.

    Returns
    -------
    com : dask.array, shape(n_frames, 3)
        Dask array with the delayed calculated Coordinates of center of
        geometry for each frame.
    """

    xyz = traj.xyz
    length = len(xyz)
    lazy_results = []
    current_frame = 0
    for frames in xyz.chunks[0]:
        chunk_size = (frames, 3)
        next_frame = current_frame + frames
        lazy_results.append(
            wrap_da(
                f=_compute_center_of_geometry_chunk,
                chunk_size=chunk_size,
                xyz=xyz[current_frame:next_frame],
            ))
        current_frame = next_frame
    max_result = da.concatenate(lazy_results)
    results = max_result[:length]
    return results
Example #14
    def _project(self, X_dask):
        """Compute hidden layer output with Dask functionality.
        """
        H_list = []
        for hl, W in zip(self.hidden_layers_, self.W_):
            if hl.hidden_layer_ == HiddenLayerType.PAIRWISE:
                H0 = X_dask.map_blocks(pairwise_distances,
                                       W,
                                       dtype=X_dask.dtype,
                                       chunks=(X_dask.chunks[0],
                                               (W.shape[0], )),
                                       metric=hl.pairwise_metric)
            else:
                XW_dask = da.dot(X_dask, W.transpose())
                if hl.ufunc_ is dummy:
                    H0 = XW_dask
                elif hl.ufunc_ is np.tanh:
                    H0 = da.tanh(XW_dask)
                else:
                    H0 = XW_dask.map_blocks(hl.ufunc_)
            H_list.append(H0)

        if self.include_original_features:
            H_list.append(X_dask)
        H_list.append(da.ones((X_dask.shape[0], 1)))

        H_dask = da.concatenate(H_list, axis=1).rechunk(self.bsize_)
        return H_dask
Example #15
    def transform(self, X):
        """Transform a sequence of documents to a document-term matrix.

        Transformation is done in parallel, and correctly handles dask
        collections.

        Parameters
        ----------
        X : dask.Bag of raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : dask.array.Array, shape = (n_samples, self.n_features)
            Document-term matrix. Each block of the array is a scipy sparse
            matrix.

        Notes
        -----
        The returned dask Array is composed of scipy sparse matrices. If you need
        to compute on the result immediately, you may need to convert the individual
        blocks to ndarrays or pydata/sparse matrices.

        >>> import sparse
        >>> X.map_blocks(sparse.COO.from_scipy_sparse)  # doctest: +SKIP

        See the :doc:`examples/text-vectorization` for more.
        """
        transformer = super(HashingVectorizer, self).transform

        msg = "'X' should be a 1-dimensional array with length 'num_samples'."

        if not dask.is_dask_collection(X):
            return transformer(X)

        if isinstance(X, db.Bag):
            bag2 = X.map_partitions(transformer)
            objs = bag2.to_delayed()
            arrs = [
                da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
                for obj in objs
            ]
            result = da.concatenate(arrs, axis=0)
        elif isinstance(X, dd.Series):
            result = X.map_partitions(transformer)
        elif isinstance(X, da.Array):
            # dask.Array
            chunks = ((np.nan,) * X.numblocks[0], (self.n_features,))
            if X.ndim == 1:
                result = X.map_blocks(
                    transformer, dtype="f8", chunks=chunks, new_axis=1
                )
            else:
                raise ValueError(msg)
        else:
            raise ValueError(msg)

        return result
Example #16
def f_oneway(*args):
    # args = [np.asarray(arg, dtype=float) for arg in args]
    # ANOVA on N groups, each in its own array
    num_groups = len(args)
    alldata = da.concatenate(args)
    bign = len(alldata)

    # Determine the mean of the data, and subtract that from all inputs to a
    # variance (via sum_of_sq / sq_of_sum) calculation.  Variance is invariant
    # to a shift in location, and centering all data around zero vastly
    # improves numerical stability.
    offset = alldata.mean()
    alldata -= offset

    sstot = _sum_of_squares(alldata) - (_square_of_sums(alldata) / float(bign))
    ssbn = 0
    for a in args:
        ssbn += _square_of_sums(a - offset) / float(len(a))

    # Naming: variables ending in bn/b are for "between treatments", wn/w are
    # for "within treatments"
    ssbn -= _square_of_sums(alldata) / float(bign)
    sswn = sstot - ssbn
    dfbn = num_groups - 1
    dfwn = bign - num_groups
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw

    prob = _fdtrc(dfbn, dfwn, f)  # equivalent to stats.f.sf

    return delayed(F_onewayResult, nout=2)(f, prob)
Example #17
def test_taql_where(ms, index_cols):
    # three cases are tested here, corresponding to the
    # if-elif-else ladder in xds_from_table

    # No group_cols case
    xds = xds_from_table(ms, taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         columns=["FIELD_ID"])

    assert len(xds) == 1
    assert_array_equal(xds[0].FIELD_ID.data, [0, 0, 0, 1, 1, 1, 1])

    # Group columns case
    xds = xds_from_table(ms, taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["DATA_DESC_ID", "SCAN_NUMBER"],
                         columns=["FIELD_ID"])

    assert len(xds) == 2

    # Check group id's
    assert xds[0].DATA_DESC_ID == 0 and xds[0].SCAN_NUMBER == 0
    assert xds[1].DATA_DESC_ID == 0 and xds[1].SCAN_NUMBER == 1

    # Check field id's in each group
    fields = da.concatenate([ds.FIELD_ID.data for ds in xds])
    assert_array_equal(fields, [0, 0, 1, 1, 0, 1, 1])

    # Group columns case
    xds = xds_from_table(ms, taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["DATA_DESC_ID", "FIELD_ID"],
                         columns=["FIELD_ID"])

    assert len(xds) == 2

    # Check group id's, no DATA_DESC_ID == 1 because it only
    # contains FIELD_ID == 2
    assert xds[0].DATA_DESC_ID == 0 and xds[0].FIELD_ID == 0
    assert xds[1].DATA_DESC_ID == 0 and xds[1].FIELD_ID == 1

    # Group on each row
    xds = xds_from_table(ms, taql_where="FIELD_ID >= 0 AND FIELD_ID < 2",
                         group_cols=["__row__"],
                         columns=["FIELD_ID"])

    assert len(xds) == 7

    fields = da.concatenate([ds.FIELD_ID.data for ds in xds])
    assert_array_equal(fields, [0, 0, 0, 1, 1, 1, 1])
Example #18
    def __init__(self, parent_ds):
        """
        The plotter is constructed from a parent data set. From the parent's 
        `XC` and `YC` variables, it constructs its own coordinates XC and YC, 
        and an internal xarray Dataset object based on these coordinates.
        """
        if not isinstance(parent_ds, xr.Dataset):
            raise TypeError(
                'LLC_plotter must be constructed from an xarray dataset')

        self.parent = parent_ds
        XC = concatenate([parent_ds.XC[i,:,:].data \
                          for i in range(parent_ds.XC.shape[0])])
        YC = concatenate([parent_ds.YC[i,:,:].data \
                          for i in range(parent_ds.YC.shape[0])])
        XG = concatenate([parent_ds.XG[i,:,:].data \
                          for i in range(parent_ds.XG.shape[0])])
        YG = concatenate([parent_ds.YG[i,:,:].data \
                          for i in range(parent_ds.YG.shape[0])])

        # Important assumption - this certainly *should* be the case for any
        # sane data set.
        assert XC.shape == YC.shape and XG.shape == YG.shape
        assert XC.shape == XG.shape

        jdim, idim = XC.shape
        i = xr.DataArray(np.arange(idim), coords=[('i', np.arange(idim))])
        j = xr.DataArray(np.arange(jdim), coords=[('j', np.arange(jdim))])
        i_g = xr.DataArray(np.arange(idim), coords=[('i_g', np.arange(idim))])
        j_g = xr.DataArray(np.arange(jdim), coords=[('j_g', np.arange(jdim))])

        XC = xr.DataArray(XC, coords=[('j', j), ('i', i)])
        YC = xr.DataArray(YC, coords=[('j', j), ('i', i)])
        XG = xr.DataArray(XG, coords=[('j_g', j_g), ('i_g', i_g)])
        YG = xr.DataArray(YG, coords=[('j_g', j_g), ('i_g', i_g)])

        self.ds = xr.Dataset(
            coords={
                'i': i,
                'j': j,
                'i_g': i_g,
                'j_g': j_g,
                'XC': XC,
                'XG': XG,
                'YC': YC,
                'YG': YG
            })
Example #19
    def _split(self, test_start, test_stop, n_samples, chunks, seeds):
        train_objs = []
        test_objs = []
        train_sizes = []
        test_sizes = []

        offset = 0
        for chunk, seed in zip(chunks, seeds):
            start, stop = offset, offset + chunk

            test_id_start = max(test_start, start)
            test_id_stop = min(test_stop, stop)

            if test_id_start < test_id_stop:
                test_objs.append(
                    dask.delayed(_generate_offset_idx)(chunk, test_id_start,
                                                       test_id_stop, offset,
                                                       seed))
                test_sizes.append(test_id_stop - test_id_start)

            train_id_stop = min(test_id_start, stop)
            if train_id_stop > start:
                train_objs.append(
                    dask.delayed(_generate_offset_idx)(chunk, start,
                                                       train_id_stop, offset,
                                                       seed))
                train_sizes.append(train_id_stop - start)

            train_id_start = max(test_id_stop, start)
            if train_id_start < stop:
                train_objs.append(
                    dask.delayed(_generate_offset_idx)(chunk, train_id_start,
                                                       stop, offset, seed))
                train_sizes.append(stop - train_id_start)
            offset = stop

        train_idx = da.concatenate([
            da.from_delayed(obj, (train_size, ), np.dtype("int"))
            for obj, train_size in zip(train_objs, train_sizes)
        ])

        test_idx = da.concatenate([
            da.from_delayed(obj, (test_size, ), np.dtype("int"))
            for obj, test_size in zip(test_objs, test_sizes)
        ])

        return train_idx, test_idx
Example #20
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)
    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))
    # the total size
    size = numpy.sum([src.size for src in sources], dtype='intp')

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
Example #21
def ConcatenateSources(*sources, **kwargs):
    """
    Concatenate CatalogSource objects together, optionally including only
    certain columns in the returned source.

    .. note::
        The returned catalog object carries the meta-data from only
        the first catalog supplied to this function (in the ``attrs`` dict).

    Parameters
    ----------
    *sources : subclass of :class:`~nbodykit.base.catalog.CatalogSource`
        the catalog source objects to concatenate together
    columns : str, list of str, optional
        the columns to include in the concatenated catalog

    Returns
    -------
    CatalogSource :
        the concatenated catalog source object

    Examples
    --------
    >>> from nbodykit.lab import *
    >>> source1 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> source2 = UniformCatalog(nbar=100, BoxSize=1.0)
    >>> print(source1.csize, source2.csize)
    >>> combined = transform.ConcatenateSources(source1, source2, columns=['Position', 'Velocity'])
    >>> print(combined.csize)
    """
    from nbodykit.base.catalog import CatalogSource

    columns = kwargs.get('columns', None)
    if isinstance(columns, string_types):
        columns = [columns]

    # concatenate all columns, if none provided
    if columns is None or columns == []:
        columns = sources[0].columns

    # check comms
    if not all(src.comm == sources[0].comm for src in sources):
        raise ValueError("cannot concatenate sources: comm mismatch")

    # check all columns are there
    for source in sources:
        if not all(col in source for col in columns):
            raise ValueError(("cannot concatenate sources: columns are missing "
                              "from some sources"))
    # the total size
    size = sum(src.size for src in sources)

    data = {}
    for col in columns:
        data[col] = da.concatenate([src[col] for src in sources], axis=0)

    toret = CatalogSource._from_columns(size, sources[0].comm, **data)
    toret.attrs.update(sources[0].attrs)
    return toret
Example #22
 def _expand_tiepoint_array_1km(self, arr, lines, cols):
     arr = da.repeat(arr, lines, axis=1)
     arr = da.concatenate(
         (arr[:, :lines // 2, :], arr, arr[:, -(lines // 2):, :]), axis=1)
     arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)),
                     cols,
                     axis=1)
     return da.hstack((arr, arr[:, -cols:]))
Example #23
 def _rmatvec(self, x):
     y = []
     for iop, oper in enumerate(self.ops):
         y.append(
             oper.rmatvec(x[self.nnops[iop]:self.nnops[iop + 1]]).squeeze())
     y = da.concatenate(y)
     y = y.rechunk(self.chunks[0])
     return y
Example #24
    def from_iterator(  # type: ignore[override]
        cls,
        name: str,
        iterator: Iterable[Tuple[str, str]],
        batch_size: int = 64,
        overwrite: bool = False,
    ) -> LabeledDataset:
        #
        # An alternative implementation can use itertools.tee + threading/async
        # https://stackoverflow.com/questions/50039223/how-to-execute-two-aggregate-functions-like-sum-concurrently-feeding-them-f
        # https://github.com/omnilib/aioitertools
        #
        # (Label, Doc)

        dataset = cls(name, overwrite=overwrite)
        data_path = common.PROJDIR / name / (name + ".raw.zarr.zip")
        label_path = common.PROJDIR / name / (name + ".label.raw.zarr.zip")

        labels, docs = unzip(iterator)

        if data_path.is_file() and (not overwrite):
            raise RuntimeError("File already exists")
        else:
            dataset.data["raw"] = Raw.from_dask_array(
                data_path,
                da.concatenate([
                    da.from_array(np.array(chunk))
                    for chunk in chunked(docs, batch_size)
                ]),
                overwrite=overwrite,
            )

        if label_path.is_file() and (not overwrite):
            raise RuntimeError("File already exists")
        else:
            dataset.labels["raw"] = Raw.from_dask_array(
                label_path,
                da.concatenate([
                    da.from_array(np.array(chunk))
                    for chunk in chunked(labels, batch_size)
                ]),
                overwrite=overwrite,
            )

        dataset.save()
        return dataset
Example #25
def svs2dask_array(svs_file, tile_size=1000, overlap=0, remove_last=True, allow_unknown_chunksizes=False, transpose=False):
	"""Convert SVS, TIF or TIFF to dask array.
	Parameters
	----------
	svs_file : str
			Image file.
	tile_size : int
			Size of chunk to be read in.
	overlap : int
			Do not modify, overlap between neighboring tiles.
	remove_last : bool
			Remove last tile because it has a custom size.
	allow_unknown_chunksizes : bool
			Allow different chunk sizes, more flexible, but slowdown.
	Returns
	-------
	arr : dask.array.Array
			A Dask Array representing the contents of the image file.
	>>> arr = svs2dask_array(svs_file, tile_size=1000, overlap=0, remove_last=True, allow_unknown_chunksizes=False)
	>>> arr2 = arr.compute()
	>>> arr3 = to_pil(cv2.resize(arr2, dsize=(1440, 700), interpolation=cv2.INTER_CUBIC))
	>>> arr3.save(test_image_name)
	"""
	# https://github.com/jlevy44/PathFlowAI/blob/master/pathflowai/utils.py
	img = openslide.open_slide(svs_file)
	if type(img) is openslide.OpenSlide:
		gen = deepzoom.DeepZoomGenerator(
			img, tile_size=tile_size, overlap=overlap, limit_bounds=True)
		max_level = len(gen.level_dimensions) - 1
		n_tiles_x, n_tiles_y = gen.level_tiles[max_level]

		@dask.delayed(pure=True)
		def get_tile(level, column, row):
			tile = gen.get_tile(level, (column, row))
			return np.array(tile).transpose((1, 0, 2))

		sample_tile_shape = get_tile(max_level, 0, 0).shape.compute()
		rows = range(n_tiles_y - (0 if not remove_last else 1))
		cols = range(n_tiles_x - (0 if not remove_last else 1))
		arr = da.concatenate([da.concatenate([da.from_delayed(get_tile(max_level, col, row), sample_tile_shape, np.uint8) for row in rows],
											 allow_unknown_chunksizes=allow_unknown_chunksizes, axis=1) for col in cols], allow_unknown_chunksizes=allow_unknown_chunksizes)
		if transpose:
			arr=arr.transpose([1, 0, 2])
		return arr
	else:  # img is instance of openslide.ImageSlide
		return dask_image.imread.imread(svs_file)
Example #26
def make_da(delayed_list, length):
    sample = delayed_list[0].compute()
    arrays = [
        da.from_delayed(item, dtype=sample.dtype, shape=sample.shape)
        for item in delayed_list
    ]
    result = da.concatenate(arrays, axis=0)[:length]
    return result
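A usage sketch for a helper like make_da above, with a hypothetical load_chunk delayed function standing in for the real loader:

import numpy as np
import dask

@dask.delayed
def load_chunk(i):
    # hypothetical loader returning one fixed-size chunk
    return np.full((4, 2), i, dtype=np.float64)

delayed_list = [load_chunk(i) for i in range(3)]
result = make_da(delayed_list, length=10)   # helper defined above
assert result.compute().shape == (10, 2)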
Example #27
def as_known(X, lengths):
    blocks = X.to_delayed().flatten()
    P = X.shape[1]
    arrays = [
        da.from_delayed(x, dtype=X.dtype, shape=(length, P))
        for x, length in zip(blocks, lengths)
    ]
    return da.concatenate(arrays, axis=0)
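A usage sketch for as_known above: a dask array derived from a dask dataframe has unknown (nan) chunk lengths along axis 0, and the helper re-declares them from known partition lengths (the toy dataframe here is an assumption):

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"a": range(6), "b": range(6)})
ddf = dd.from_pandas(pdf, npartitions=2)
X = ddf.to_dask_array()                               # nan row chunks
lengths = ddf.map_partitions(len).compute().tolist()  # e.g. [3, 3]
X_known = as_known(X, lengths)                        # helper defined above
assert X_known.chunks[0] == (3, 3)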
Example #28
    def compute(self,
                data,
                cache_id=None,
                rows_per_scan=None,
                chunks=None,
                fill_value=None,
                weight_count=10000,
                weight_min=0.01,
                weight_distance_max=1.0,
                weight_delta_max=1.0,
                weight_sum_min=-1.0,
                maximum_weight_mode=None,
                **kwargs):
        """Resample the data according to the precomputed X/Y coordinates."""
        # not used in this step
        kwargs.pop("persist", None)
        data_in, xr_obj = self._get_input_tuples(data)
        rows_per_scan = self._get_rows_per_scan(rows_per_scan)
        data_in = tuple(self._convert_to_dask(data_in, rows_per_scan))
        out_chunks = normalize_chunks(chunks or 'auto',
                                      shape=self.target_geo_def.shape,
                                      dtype=data.dtype)
        fornav_kwargs = kwargs.copy()
        maximum_weight_mode = self._handle_mwm(data, maximum_weight_mode)
        fornav_kwargs.update(
            dict(
                weight_count=weight_count,
                weight_min=weight_min,
                weight_distance_max=weight_distance_max,
                weight_delta_max=weight_delta_max,
                weight_sum_min=weight_sum_min,
                maximum_weight_mode=maximum_weight_mode,
                rows_per_scan=rows_per_scan,
            ))

        # determine a fill value if they didn't tell us what they have as a
        # fill value in the numpy arrays
        if fill_value is None:
            fill_value = self._get_default_fill(data_in[0])

        data_out = []
        for data_subarr in data_in:
            res = self._run_fornav_single(data_subarr, out_chunks,
                                          self.target_geo_def, fill_value,
                                          **fornav_kwargs)
            data_out.append(res)
        if data.ndim == 2:
            out = data_out[0]
        else:
            out = da.concatenate([arr[None, ...] for arr in data_out], axis=0)

        if xr_obj is not None:
            dims = [d for d in xr_obj.dims if d not in ('y', 'x')] + ['y', 'x']
            out = xr.DataArray(out, attrs=xr_obj.attrs.copy(), dims=dims)
            out = update_resampled_coords(xr_obj, out, self.target_geo_def)
        if isinstance(data, np.ndarray):
            return out.compute()
        return out
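A small equivalence sketch for the stacking step above: concatenating along a new leading axis inserted with arr[None, ...] gives the same shape as da.stack (the dummy band arrays are assumptions).

import dask.array as da

bands = [da.ones((4, 5), chunks=(2, 5)) * i for i in range(3)]  # dummy 2-D bands
stacked_a = da.concatenate([b[None, ...] for b in bands], axis=0)
stacked_b = da.stack(bands, axis=0)
assert stacked_a.shape == stacked_b.shape == (3, 4, 5)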
Example #29
    def work(self):
        import dask.array as da
        import numpy as np
        import h5py
        from luigi.file import atomic_file

        fs = [h5py.File(f.path, mode='r') for f in self.input()]

        # Verify all H5s have the same structure
        datasets, groups, samples = [[] for x in fs], [[] for x in fs
                                                       ], [[] for x in fs]
        for i, f in enumerate(fs):
            f.visititems(lambda n, o: datasets[i].append(n) if isinstance(
                o, h5py.Dataset) else groups[i].append(n))
            samples[i] = f['samples'][:]
        if not (all(set(datasets[0]) == set(x) for x in datasets)
                and all(np.array_equal(s, samples[0]) for s in samples)):
            raise Exception(
                "All HDF5 files must have the same groups/datasets/samples!")
        datasets, groups, samples = datasets[0], groups[0], samples[0]

        # Drop Samples dataset and handle separately
        datasets = [x for x in datasets if x != 'samples']
        combined = {
            d: da.concatenate([da.from_array(f[d], chunks=100000) for f in fs])
            for d in datasets
        }

        shapes = [(np.sum([f.get(d).shape
                           for f in fs], axis=0)[0], *fs[0].get(d).shape[1:])
                  for d in datasets]
        dtypes = [fs[0].get(d).dtype for d in datasets]

        # Handles Samples dataset
        datasets.append('samples')
        combined.update({'samples': da.from_array(fs[0]['samples'], chunks=1)})
        shapes.append(samples.shape)
        dtypes.append(samples.dtype)

        af = atomic_file(self.output().path)
        fout = h5py.File(af.tmp_path, 'w')

        # Set up group structure
        for g in groups:
            fout.create_group(g)

        # Create the datasets
        out_datasets = {}
        for p, dtype, shape in zip(datasets, dtypes, shapes):
            g, d = os.path.split(p)
            out_datasets[p] = (fout[g] if g else fout).create_dataset(
                d, shape=shape, dtype=dtype, chunks=True, compression='gzip')
        for k in combined.keys():
            s = da.store(combined[k], out_datasets[k], compute=False)
            s.compute(num_workers=self.n_cpu)
            print("Done " + k)

        af.move_to_final_destination()
Example #30
def missing_spectrum(  # pylint: disable=too-many-locals
        df: DataArray, bins: int) -> Dict[str, da.Array]:
    """Calculate a missing spectrum for each column."""

    nrows, ncols = df.shape
    data = df.nulls

    if nrows > 1:
        num_bins = min(bins, nrows - 1)
        bin_size = nrows // num_bins
        chunk_size = min(1024 * 1024 * 128, nrows *
                         ncols)  # max 1024 x 1024 x 128 Bytes bool values
        nbins_per_chunk = max(chunk_size // (bin_size * data.shape[1]), 1)
        chunk_size = nbins_per_chunk * bin_size
        data = data.rechunk((chunk_size, None))
        sep = nrows // chunk_size * chunk_size
    else:
        # avoid division or module by zero
        bin_size = 1
        nbins_per_chunk = 1
        chunk_size = 1
        data = data.rechunk((chunk_size, None))
        sep = 1

    spectrum_missing_percs = data[:sep].map_blocks(
        missing_perc_blockwise(bin_size),
        chunks=(nbins_per_chunk, *data.chunksize[1:]),
        dtype=float,
    )

    # calculation for the last chunk
    if sep != nrows:
        spectrum_missing_percs_remain = data[sep:].map_blocks(
            missing_perc_blockwise(bin_size),
            chunks=(int(np.ceil((nrows - sep) / bin_size)), *data.shape[1:]),
            dtype=float,
        )
        spectrum_missing_percs = da.concatenate(
            [spectrum_missing_percs, spectrum_missing_percs_remain], axis=0)

    num_bins = spectrum_missing_percs.shape[0]

    locs0 = da.arange(num_bins) * bin_size
    locs1 = da.minimum(locs0 + bin_size, nrows)
    locs_middle = locs0 + bin_size / 2

    return {
        "column":
        da.repeat(da.from_array(df.columns.values, (1, )), num_bins),
        "location":
        da.tile(locs_middle, ncols),
        "missing_rate":
        spectrum_missing_percs.T.ravel().rechunk(locs_middle.shape[0]),
        "loc_start":
        da.tile(locs0, ncols),
        "loc_end":
        da.tile(locs1, ncols),
    }
Example #31
 def as_stitched_array(self,
                       channel_index=0,
                       channel_name=None,
                       t_index=0,
                       verbose=True):
     if channel_name is not None:
         channel_index = self._channel_name_to_index(channel_name)
     z_list = []
     for z in self.z_indices:
         # this doesn't work with explore acquisitions and would need to be updated
         rows, cols = self.get_num_rows_and_cols()
         empty_tile = np.zeros((self.image_height, self.image_width),
                               self.pixel_type)
         row_list = []
         for row in range(rows):
             if verbose:
                 print('stitching row {} of {}'.format(row + 1, rows))
             col_list = []
             for col in range(cols):
                 pos_index_array = np.nonzero(
                     np.logical_and(self.row_col_array[:, 0] == row,
                                    self.row_col_array[:, 1] == col))[0]
                 pos_index = None if pos_index_array.size == 0 else pos_index_array[
                     0]
                 if pos_index is not None and self.has_image(
                         channel_index=channel_index,
                         z_index=z,
                         t_index=t_index,
                         pos_index=pos_index):
                     img = self.read_image(channel_index=channel_index,
                                           z_index=z,
                                           t_index=t_index,
                                           pos_index=pos_index,
                                           memmapped=True)
                 else:
                     img = empty_tile
                 # crop to center of tile
                 col_list.append(
                     img[self.overlap[0] // 2:-self.overlap[0] // 2,
                         self.overlap[1] // 2:-self.overlap[1] // 2])
             stitched_col = da.concatenate(col_list, axis=1)
             row_list.append(stitched_col)
         stitched = da.concatenate(row_list, axis=0)
         z_list.append(stitched)
     return da.stack(z_list)
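The stitching above boils down to nested concatenations; a toy sketch with dummy tiles (tile size assumed):

import dask.array as da

tile = da.ones((8, 8), chunks=(8, 8))            # dummy tile
row0 = da.concatenate([tile, tile], axis=1)      # stitch columns within a row
row1 = da.concatenate([tile * 2, tile * 2], axis=1)
mosaic = da.concatenate([row0, row1], axis=0)    # stitch the rows together
assert mosaic.shape == (16, 16)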
Example #32
def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
    rnd = np.random.RandomState(42)
    w = rnd.rand(X.shape[0]) * 0.01
    g_rle = np.array([len(list(grp)) for _, grp in groupby(g)])

    if output == 'dataframe':
        # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        X = X_df.copy()
        X_df = X_df.assign(y=y, g=g, w=w)

        # set_index ensures partitions are based on group id.
        # See https://stackoverflow.com/questions/49532824/dask-dataframe-split-partitions-based-on-a-column-or-function.
        X_df.set_index('g', inplace=True)
        dX = dd.from_pandas(X_df, chunksize=chunk_size)

        # separate target, weight from features.
        dy = dX['y']
        dw = dX['w']
        dX = dX.drop(columns=['y', 'w'])
        dg = dX.index.to_series()

        # encode group identifiers into run-length encoding, the format LightGBMRanker is expecting
        # so that within each partition, sum(g) = n_samples.
        dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))
    elif output == 'array':
        # ranking arrays: one chunk per group. Each chunk must include all columns.
        p = X.shape[1]
        dX, dy, dw, dg = [], [], [], []
        for g_idx, rhs in enumerate(np.cumsum(g_rle)):
            lhs = rhs - g_rle[g_idx]
            dX.append(da.from_array(X[lhs:rhs, :], chunks=(rhs - lhs, p)))
            dy.append(da.from_array(y[lhs:rhs]))
            dw.append(da.from_array(w[lhs:rhs]))
            dg.append(da.from_array(np.array([g_rle[g_idx]])))

        dX = da.concatenate(dX, axis=0)
        dy = da.concatenate(dy, axis=0)
        dw = da.concatenate(dw, axis=0)
        dg = da.concatenate(dg, axis=0)
    else:
        raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')

    return X, y, w, g_rle, dX, dy, dw, dg
Example #33
def _convert_C_to_F_order(client, X, chunksizes, n_features, dtype):
    X_ddh = DistributedDataHandler.create(data=X, client=client)
    X_converted = [client.submit(cp.array, X_part, copy=False, order='F',
                                 workers=[w])
                   for idx, (w, X_part) in enumerate(X_ddh.gpu_futures)]

    X_dela = _create_delayed(X_converted, dtype, chunksizes, n_features)

    return da.concatenate(X_dela, axis=0)
Example #34
def pad_chunks(darray, chunklen):
    ''' make sure chunks are the right shape'''
    padlen = chunklen - np.mod(darray.shape[0], chunklen)
    if padlen == 0:
        return darray
    else:
        pad = da.zeros((padlen, ), dtype=np.complex64)
        padded = da.concatenate([darray, pad], axis=0)
        return padded
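A quick usage check of pad_chunks above, assuming a length-10 complex array and chunklen=4:

import numpy as np
import dask.array as da

darray = da.arange(10, chunks=10).astype(np.complex64)
padded = pad_chunks(darray, chunklen=4)   # pads 10 -> 12 samples
assert padded.shape == (12,)
uniform = padded.rechunk(4)               # every chunk now has length 4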
Example #35
    def parallel_request_OA(self) -> da.array:
        """
        Requests elevation data from OpenAltimetry API in parallel.
        Currently supports OA_Products ['ATL06','ATL07','ATL08','ATL10','ATL12','ATL13']

        For ATL03 Photon Data, OA only supports single date request
        according to: https://openaltimetry.org/data/swagger-ui/#/Public/getATL08DataByDate,
        with geospatial limitation of 1 degree lat/lon. Visualization of ATL03 data
        is not implemented within this module at this time.

        Returns
        -------
        OA_data_da : dask.Array
            A dask array containing the ICESat-2 data.
        """
        print("Generating urls")

        # generate parameter lists for OA requesting
        OA_para_list = self.generate_OA_parameters()

        url_number = len(OA_para_list)

        if url_number > 200:
            answer = user_check(
                "Too many API requests, this may take a long time, do you still want to continue: "
                "please enter yes/no\n")

            if answer == "yes":
                pass
            else:
                return

        print("Sending request to OpenAltimetry, please wait...")

        # Parallel processing
        requested_OA_data = []

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=len(OA_para_list)) as executor:
            parallel_OA_data = {
                executor.submit(self.request_OA_data, para): para
                for para in OA_para_list
            }

            for future in tqdm(
                    iterable=concurrent.futures.as_completed(parallel_OA_data),
                    total=len(parallel_OA_data),
            ):
                r = future.result()
                if r is not None:
                    requested_OA_data.append(r)

        if not requested_OA_data:
            return
        else:
            OA_data_da = da.concatenate(requested_OA_data, axis=0)
            return OA_data_da
Example #36
def _stage_1(G: Array,
             X: Array,
             Y: Array,
             alphas: Optional[NDArray[np.float_]] = None) -> Array:
    """Stage 1 - WGR Base Regression

    This stage will predict outcomes separately for each alpha parameter and variant
    block. This "compresses" the variant dimension into a smaller space that is
    much more amenable to efficient blockwise regressions in stage 2. Another
    interpretation for this operation is that all sample blocks are treated
    as folds in a K-fold CV fit within one single variant block. Predictions for
    any one combination of variant and sample block then correspond to a
    regression model fit across all sample blocks for that range of variants
    except for a single sample block. In other words, the predictions are
    out of sample which enables training of a stage 2 regressor based on
    these predictions, a technique commonly referred to as stacking.

    For more details, see the level 0 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert G.ndim == 2
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert G.shape[0] == X.shape[0] == Y.shape[0]
    assert G.numblocks[0] == X.numblocks[0] == Y.numblocks[0]
    assert G.chunks[0] == X.chunks[0] == Y.chunks[0]
    assert X.numblocks[1] == Y.numblocks[1] == 1
    if alphas is None:
        alphas = get_alphas(G.shape[1], like=G)
    # Extract shape statistics
    n_sample = G.shape[0]
    n_outcome = Y.shape[1]
    n_alpha = alphas.size
    n_sample_block = G.numblocks[0]
    n_variant_block = G.numblocks[1]
    sample_chunks = Y.chunks[0]

    YP = []
    for i in range(n_variant_block):
        # Extract all sample blocks for one variant block
        GB = G.blocks[:, i]
        # Prepend covariates and chunk along first dim only
        XGB = da.concatenate((X, GB), axis=1)
        XGB = XGB.rechunk(chunks=(None, -1))
        # Fit and predict folds for each parameter and outcome
        YPB = _ridge_regression_cv(XGB, Y, alphas, n_zero_reg=X.shape[1])[-1]
        assert_block_shape(YPB, 1, n_sample_block, 1)
        assert_chunk_shape(YPB, n_alpha, sample_chunks[0], n_outcome)
        assert_array_shape(YPB, n_alpha, n_sample, n_outcome)
        YP.append(YPB)
    # Stack as (n_variant_block, n_alpha, n_sample, n_outcome)
    YP = da.stack(YP, axis=0)
    assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
    assert_chunk_shape(YP, 1, n_alpha, sample_chunks[0], n_outcome)
    assert_array_shape(YP, n_variant_block, n_alpha, n_sample, n_outcome)
    return YP
Example #37
def test_gh3937():
    # test for github issue #3937
    x = da.from_array([1, 2, 3.], (2,))
    x = da.concatenate((x, [x[-1]]))
    y = x.rechunk((2,))
    # This will produce Integral type indices that are not ints (np.int64), failing
    # the optimizer
    y = da.coarsen(np.sum, y, {0: 2})
    # How to trigger the optimizer explicitly?
    y.compute()
Example #38
def test_mixed_output_type():
    y = da.random.random((10, 10), chunks=(5, 5))
    y[y < 0.4] = 0

    y = da.ma.masked_equal(y, 0)
    x = da.zeros((10, 1), chunks=(5, 1))

    z = da.concatenate([x, y], axis=1)
    assert z.shape == (10, 11)
    zz = z.compute()
    assert isinstance(zz, np.ma.masked_array)
Example #39
def interleaved_concat(arrays, indices, axis=0):
    """Concatenate each array along the given axis, but also assign each array
    element into the location given by indices. This operation is used for
    groupby.transform.
    """
    if has_dask and isinstance(arrays[0], da.Array):
        if not _interleaved_indices_required(indices):
            return da.concatenate(arrays, axis)
        else:
            return _interleaved_concat_slow(arrays, indices, axis)
    else:
        return _interleaved_concat_numpy(arrays, indices, axis)
Example #40
def coarsen_destagger_dask(x, blocks, stagger=None, mode='wrap'):
    """


    Examples
    --------
    >>> x = da.arange(6, chunks=6)
    >>> xc = coarsen_destagger_dask(x, {0: 2}, stagger=0)
    >>> xc.compute()
    array([ 1. ,  3. ,  3.5])
    >>> x = da.from_array(x, chunks=x.shape)
    >>> xc = coarsen_destagger_dask(x, {0: 2}, stagger=0)
    >>> xc.compute()
    array([ 1. ,  3. ,  3.5])
    """
    output_numpy = False

    try:
        x._keys
    except AttributeError:
        output_numpy = True
        x = da.from_array(x, x.shape)

    xcoarse = coarsen_centered_np(x, blocks)
    # TODO refactor this code to another function
    if stagger is not None:
        blk = {key: val
               for key, val in blocks.items()
               if key != stagger}

        left_inds = np.arange(0, x.shape[stagger], blocks[stagger])
        left = da.coarsen(np.sum, da.take(x, left_inds, stagger), blk)
        n = left.shape[stagger]
        # handle boundary conditions
        if mode == 'wrap':
            bc = da.take(left, [0], axis=stagger)
        elif mode == 'clip':
            bc = da.take(left, [-1], axis=stagger)
        else:
            raise ValueError(f"Unknown boundary `mode` given: {mode}")

        right = da.take(left, np.arange(1, n), axis=stagger)
        right = da.concatenate((right, bc), axis=stagger)
        xcoarse = xcoarse + (right - left)/2

    n = np.prod(list(blocks.values()))
    ans = xcoarse/n

    if output_numpy:
        return ans.compute()
    else:
        return ans
Example #41
 def _create_global_data(self):
     self._result_data = self._scheme.prepared_data()
     for index, problem in self._global_problem.items():
         if isinstance(problem, list):
             data = [da.from_array(self._result_data[p.dataset_descriptor.label]
                     .data.sel({self._scheme.model.global_dimension: p.index}).values,
                                   chunks='auto')
                     for p in problem]
             self._global_data[index] = da.concatenate(data).persist()
         else:
             data = self._result_data[problem.dataset_descriptor.label].data
             data = data.sel({self._scheme.model.global_dimension: problem.index}).values
             self._global_data[index] = ds.delayed(data).persist()
Example #42
    def _build_data(self):
        """
        Generate the data payload for the new concatenated cube.

        Returns:
            The concatenated :class:`iris.cube.Cube` data payload.

        """
        skeletons = self._skeletons
        data = [skeleton.data for skeleton in skeletons]

        data = da.concatenate(data, self.axis)

        return data
Example #43
def test_mixed_output_type():
    y = da.random.random((10, 10), chunks=(5, 5))
    y[y < 0.8] = 0
    y = y.map_blocks(sparse.COO.from_numpy)

    x = da.zeros((10, 1), chunks=(5, 1))

    z = da.concatenate([x, y], axis=1)

    assert z.shape == (10, 11)

    zz = z.compute()
    assert isinstance(zz, sparse.COO)
    assert zz.nnz == y.compute().nnz
Example #44
def _reshape_llc_data(data, jdim):
    """Fix the weird problem with llc data array order."""
    # Can we do this without copying any data?
    # If not, we need to go upstream and implement this at the MDS level
    # Or can we fudge it with dask?
    # this is all very specific to the llc file output
    # would be nice to generalize more, but how?
    nside = data.shape[jdim] // LLC_NUM_FACES
    # how the LLC data is laid out along the j dimension
    strides = ((0,3), (3,6), (6,7), (7,10), (10,13))
    # whether to reshape each face
    reshape = (False, False, False, True, True)
    # this will slice the data into 5 facets
    slices = [jdim * (slice(None),) + (slice(nside*st[0], nside*st[1]),)
              for st in strides]
    facet_arrays = [data[sl] for sl in slices]
    face_arrays = []
    for ar, rs, st in zip(facet_arrays, reshape, strides):
        nfaces_in_facet = st[1] - st[0]
        shape = list(ar.shape)
        if rs:
            # we assume the other horizontal dimension is immediately after jdim
            shape[jdim] = ar.shape[jdim+1]
            shape[jdim+1] = ar.shape[jdim]
        # insert a length-1 dimension along which to concatenate
        shape.insert(jdim, 1)
        # modify the array shape in place, no copies allowed
        ar.shape = shape
        # now ar is properly shaped, but we still need to slice it into faces
        face_slice_dim = jdim + 1 + rs
        for n in range(nfaces_in_facet):
            face_slice = (face_slice_dim * (slice(None),) +
                          (slice(nside*n, nside*(n+1)),))
            data_face = ar[face_slice]
            face_arrays.append(data_face)

    # We can't concatenate using numpy (hcat etc.) because it makes a copy,
    # presumably loading the memmaps into memory.
    # Using dask gets around this.
    # But what if we want different chunks, or already chunked the data
    # upstream? Doesn't seem like this is ideal
    # TODO: Refactor handling of dask arrays and chunking
    #return np.concatenate(face_arrays, axis=jdim)
    # the dask version doesn't work because of this:
    # https://github.com/dask/dask/issues/1645
    face_arrays_dask = [da.from_array(fa, chunks=fa.shape)
                        for fa in face_arrays]
    concat = da.concatenate(face_arrays_dask, axis=jdim)
    return concat
Example #45
    def __init__(self, fasulist, check=None, hdfdir=None):

        if not isinstance(fasulist, list):
            fasulist = [fasulist,]

        if isinstance(fasulist[0], str):
            self.hflist = fasulist
        else:
            b = db.from_sequence([f._process(hdfdir=hdfdir) for f in fasulist])
            self.hflist = b.compute()

        self.hlist = [h5py.File(h, 'r+') for h in self.hflist]
        
        if check == "names":
            nref = self.hlist[0]['n']
            for i in range(1, len(self.hlist)):
                if not (nref[:] == self.hlist[i]['n'][:]).all():
                    raise ValueError('Fasus with mismatched atom names')

        elif check == "masses":
            mref = self.hlist[0]['m']
            for i in range(1, len(self.hlist)):
                if not (mref[:] == self.hlist[i]['m'][:]).all():
                    raise ValueError('Fasus with mismatched atom masses')

        xs = [da.from_array(h['x'], chunks=CHUNKS) for h in self.hlist]
        self.x = da.concatenate(xs)
        if 'b' in self.hlist[0]:  # box data present
            xb = [da.from_array(h['b'], chunks=CHUNKS) for h in self.hlist]
            self.box = da.concatenate(xb)
        else:
            self.box = None
        self.fasulist = fasulist
        self.masses = self.hlist[0]['m']
        self.shape = self.shape()
        self.top = self.fasulist[0].top
Example #46
def get_valid_images(directory, width=224, height=224, channels=3):
    '''
    Function to build needed arrays for training or validating the neural network using out of core processing.
    If labels are passed, get a list of training image files, their labels
    '''    
    
    validationList, _ = get_list_of_validation_files(directory)  # Pass directory containing validation images
    print('There are ', len(validationList), ' files in the validation list.')
    print('Breaking the list into chunks to handle size of request.')
    chunkedList = get_chunks(validationList, 8000)
    print('The length of the chunkedList is:  ', len(chunkedList))
    if channels == 3:  
        for i in range(len(chunkedList)):
            print('i =', i)
            validation_sublist = chunkedList[i][:]
            X = create_holding_array(validation_sublist, width = width, height=height, channels=channels)    # Create empty array
            print('Shape of the holding array is:  ', X.shape)
            print('Resizing 3-channel images for validation...')
            count = 0                                                       # Set counter for empty array
            filenames = []
            for validFile in validation_sublist:
                filenames.append(os.path.basename(validFile))
                img = misc.imread(validFile)                                # Read the image
                img = misc.imresize(img, size = (width, height, channels))  # Resize image with color channel = 3
                X[count] = img                                              # Store resized image in empty array
                count += 1                                                  # Advance index counter
            print('Shape of X is:  ', X.shape)
            print('Transposing X...')
            X1 = np.transpose(X, (2,0,1))
            print('Transposed shape for X1 is:  ', X1.shape)
            if i == 0:
                print('Creating a dask array for images...')
                X_array = da.from_array(X1, chunks=4000)
            else:
                print('Concatenating the dask arrays...')
                X_array = da.concatenate([X_array, da.from_array(X1, chunks=4000)], axis=0)
            del X, X1
        
        return X_array, filenames

    else:     # Number of channels is not 3

        print('Could not create dataset and resize validation images...')
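
The concatenation fix above is easier to see in isolation: da.concatenate expects a sequence of arrays, so the usual pattern is to collect the per-chunk arrays in a list and concatenate once at the end, which also keeps the task graph small. A sketch with dummy data in place of the image files (shapes and chunk sizes are illustrative only):

import numpy as np
import dask.array as da

pieces = []
for i in range(3):                                   # stands in for the chunked file list
    X1 = np.random.random((100, 3, 32, 32)).astype('float32')
    pieces.append(da.from_array(X1, chunks=50))

X_array = da.concatenate(pieces, axis=0)             # one call over the whole list
print(X_array.shape)                                 # (300, 3, 32, 32)
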
Example #47
0
File: core.py Project: elaeon/ML
    def concat(da_groups, axis=0) -> 'GroupManager':
        if axis == 0:
            all_groups = [da_group.groups for da_group in da_groups]
            da_group_dict = GroupManager()
            intersection_groups = set(all_groups[0])
            for group in all_groups[1:]:
                intersection_groups = intersection_groups.intersection(set(group))

            if len(intersection_groups) > 0:
                # to maintain connection order
                groups = [group for group in all_groups[0] if group in intersection_groups]
                for group in groups:
                    da_arrays = [da_group.conn[group] for da_group in da_groups]
                    da_array_c = da.concatenate(da_arrays, axis=axis)
                    da_group_dict[group] = da_array_c
                return da_group_dict
            else:
                return sum(da_groups)
        else:
            raise NotImplementedError
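
The method above only keeps groups that are present in every input manager and concatenates them group by group; if nothing intersects, the managers are summed instead. A simplified sketch of the intersect-and-concatenate step, with plain dicts standing in for GroupManager and made-up group names:

import dask.array as da

g1 = {'x': da.ones((4, 3), chunks=2), 'y': da.zeros((4,), chunks=2)}
g2 = {'x': da.ones((2, 3), chunks=2), 'z': da.zeros((2,), chunks=2)}

common = [name for name in g1 if name in g2]         # keep the first manager's ordering
merged = {name: da.concatenate([g1[name], g2[name]], axis=0) for name in common}
print(merged['x'].shape)                             # (6, 3); 'y' and 'z' are dropped
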
Example #48
0
    def _from_p(self, mode):
        """Convert the image from P or PA to RGB or RGBA."""
        self._check_modes(("P", "PA"))

        if not self.palette:
            raise RuntimeError("Can't convert palettized image, missing palette.")
        pal = np.array(self.palette)
        pal = da.from_array(pal, chunks=pal.shape)

        if pal.shape[1] == 4:
            # colormap's alpha overrides data alpha
            mode = "RGBA"
            alpha = None
        elif self.mode.endswith("A"):
            # add a new/fake 'bands' dimension to the end
            alpha = self.data.sel(bands="A").data[..., None]
            mode = mode + "A" if not mode.endswith("A") else mode
        else:
            alpha = None

        flat_indexes = self.data.sel(bands='P').data.ravel().astype('int64')
        dim_sizes = ((key, val) for key, val in self.data.sizes.items() if key != 'bands')
        dims, new_shape = zip(*dim_sizes)
        dims = dims + ('bands',)
        new_shape = new_shape + (pal.shape[1],)
        new_data = pal[flat_indexes].reshape(new_shape)
        coords = dict(self.data.coords)
        coords["bands"] = list(mode)

        if alpha is not None:
            new_arr = da.concatenate((new_data, alpha), axis=-1)
            data = xr.DataArray(new_arr, coords=coords, attrs=self.data.attrs, dims=dims)
        else:
            data = xr.DataArray(new_data, coords=coords, attrs=self.data.attrs, dims=dims)

        return data
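
The conversion above is essentially a palette lookup: the 'P' band holds integer indices, fancy-indexing the palette expands each index into an RGB(A) row, the result is reshaped back to the image shape with 'bands' last, and any alpha band is concatenated on afterwards. A bare-bones sketch of the same steps without the xarray bookkeeping (palette contents and image size are made up):

import numpy as np
import dask.array as da

palette = da.from_array(np.array([[0, 0, 0], [255, 0, 0], [0, 255, 0]]), chunks=3)
indices = da.from_array(np.random.randint(0, 3, size=(4, 5)), chunks=(2, 5))

flat = indices.ravel().astype('int64')
rgb = palette[flat].reshape(indices.shape + (palette.shape[1],))   # (4, 5, 3)

alpha = da.ones_like(rgb[..., :1])                                 # full-opacity alpha band
rgba = da.concatenate((rgb, alpha), axis=-1)                       # (4, 5, 4)
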
Example #49
0
    def palettize(self, colormap):
        """Palettize the current image using `colormap`.

        .. note::

            Works only on "L" or "LA" images.

        """

        if self.mode not in ("L", "LA"):
            raise ValueError("Image should be grayscale to palettize")

        l_data = self.data.sel(bands=['L'])
        new_data = l_data.data.map_blocks(self._palettize, colormap, dtype=l_data.dtype)
        self.palette = tuple(colormap.colors)

        if self.mode == "L":
            mode = "P"
        else:
            mode = "PA"
            new_data = da.concatenate([new_data, self.data.sel(bands=['A'])], axis=0)

        self.data.data = new_data
        self.data.coords['bands'] = list(mode)
Example #50
0
    def colorize(self, colormap):
        """Colorize the current image using `colormap`.

        .. note::

            Works only on "L" or "LA" images.

        """

        if self.mode not in ("L", "LA"):
            raise ValueError("Image should be grayscale to colorize")

        if self.mode == "LA":
            alpha = self.data.sel(bands=['A'])
        else:
            alpha = None

        l_data = self.data.sel(bands=['L'])
        new_data = l_data.data.map_blocks(self._colorize, colormap,
                                          chunks=(colormap.colors.shape[1],) + l_data.data.chunks[1:],
                                          dtype=np.float64)

        if colormap.colors.shape[1] == 4:
            mode = "RGBA"
        elif alpha is not None:
            new_data = da.concatenate([new_data, alpha.data], axis=0)
            mode = "RGBA"
        else:
            mode = "RGB"

        # copy the coordinates so we don't affect the original
        coords = dict(self.data.coords)
        coords['bands'] = list(mode)
        attrs = self.data.attrs
        dims = self.data.dims
        self.data = xr.DataArray(new_data, coords=coords, attrs=attrs, dims=dims)
Example #51
0
    def reset(self):
        '''
        Removes any alignment from the trajectories.
        '''
        xs = [da.from_array(h['x'], chunks=CHUNKS) for h in self.hlist]
        self.x = da.concatenate(xs)
Example #52
0
def stack(signal_list, axis=None, new_axis_name='stack_element',
          lazy=None, **kwargs):
    """Concatenate the signals in the list over a given axis or a new axis.

    The title is set to that of the first signal in the list.

    Parameters
    ----------
    signal_list : list of BaseSignal instances
    axis : {None, int, str}
        If None, the signals are stacked over a new axis. The data must
        have the same dimensions. Otherwise the
        signals are stacked over the axis given by its integer index or
        its name. The data must have the same shape, except in the dimension
        corresponding to `axis`.
    new_axis_name : string
        The name of the new axis when `axis` is None.
        If an axis with this name already
        exists, '-i' is appended automatically, where `i` is an integer,
        until a name that is not yet in use is found.
    lazy : {bool, None}
        Returns a LazySignal if True. If None, only returns a lazy result if at
        least one of the signals is lazy.

    Returns
    -------
    signal : BaseSignal instance (or subclass, determined by the objects in
        signal list)

    Examples
    --------
    >>> data = np.arange(20)
    >>> s = hs.stack([hs.signals.Signal1D(data[:10]),
    ...               hs.signals.Signal1D(data[10:])])
    >>> s
    <Signal1D, title: Stack of , dimensions: (2, 10)>
    >>> s.data
    array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

    """
    from itertools import zip_longest
    from hyperspy.signals import BaseSignal
    import dask.array as da
    from numbers import Number
    # TODO: remove next time
    deprecated = ['mmap', 'mmap_dir']
    warn_str = "'{}' argument is deprecated, please use 'lazy' instead"
    for k in deprecated:
        if k in kwargs:
            lazy=True
            warnings.warn(warn_str.format(k), VisibleDeprecationWarning)

    axis_input = copy.deepcopy(axis)
    signal_list = list(signal_list)
    # Get the real signal with the most axes to get metadata/class/etc
    # first = sorted(filter(lambda _s: isinstance(_s, BaseSignal), signal_list),
    #                key=lambda _s: _s.data.ndim)[-1]
    first = next(filter(lambda _s: isinstance(_s, BaseSignal), signal_list))

    # Cast numbers as signals. Will broadcast later

    for i, _s in enumerate(signal_list):
        if isinstance(_s, BaseSignal):
            pass
        elif isinstance(_s, Number):
            sig = BaseSignal(_s)
            signal_list[i] = sig
        else:
            raise ValueError("{} type cannot be stacked.".format(type(_s)))

    if lazy is None:
        lazy = any(_s._lazy for _s in signal_list)
    if not isinstance(lazy, bool):
        raise ValueError("'lazy' argument has to be None, True or False")

    # Cast all as lazy if required
    for i, _s in enumerate(signal_list):
        if not _s._lazy:
            signal_list[i] = _s.as_lazy()
    if len(signal_list) > 1:
        newlist = broadcast_signals(*signal_list, ignore_axis=axis_input)
        if axis is not None:
            step_sizes = [s.axes_manager[axis].size for s in newlist]
            axis = newlist[0].axes_manager[axis]
        datalist = [s.data for s in newlist]
        newdata = da.stack(datalist, axis=0) if axis is None else \
            da.concatenate(datalist, axis=axis.index_in_array)
        if axis_input is None:
            signal = first.__class__(newdata)
            signal._lazy = True
            signal._assign_subclass()
            signal.axes_manager._axes[1:] = copy.deepcopy(newlist[0].axes_manager._axes)
            axis_name = new_axis_name
            axis_names = [axis_.name for axis_ in signal.axes_manager._axes[1:]]
            j = 1
            while axis_name in axis_names:
                axis_name = new_axis_name + "_%i" % j
                j += 1
            eaxis = signal.axes_manager._axes[0]
            eaxis.name = axis_name
            eaxis.navigate = True  # This triggers _update_parameters
            signal.metadata = copy.deepcopy(first.metadata)
            # Get the title from 1st object
            signal.metadata.General.title = (
                "Stack of " + first.metadata.General.title)
            signal.original_metadata = DictionaryTreeBrowser({})
        else:
            signal = newlist[0]._deepcopy_with_new_data(newdata)
            signal._lazy = True
            signal._assign_subclass()
        signal.get_dimensions_from_data()
        signal.original_metadata.add_node('stack_elements')

        for i, obj in enumerate(signal_list):
            signal.original_metadata.stack_elements.add_node('element%i' % i)
            node = signal.original_metadata.stack_elements['element%i' % i]
            node.original_metadata = \
                obj.original_metadata.as_dictionary()
            node.metadata = \
                obj.metadata.as_dictionary()

        if axis_input is None:
            axis_input = signal.axes_manager[-1 + 1j].index_in_axes_manager
            step_sizes = 1

        signal.metadata._HyperSpy.set_item('Stacking_history.axis', axis_input)
        signal.metadata._HyperSpy.set_item('Stacking_history.step_sizes',
                                           step_sizes)
        if np.all([
                s.metadata.has_item('Signal.Noise_properties.variance')
                for s in signal_list
        ]):
            variance = stack([
                s.metadata.Signal.Noise_properties.variance for s in signal_list
            ], axis)
            signal.metadata.set_item('Signal.Noise_properties.variance', variance)
    else:
        signal = signal_list[0]

    # Leave as lazy or compute
    if lazy:
        signal = signal.as_lazy()
    else:
        signal.compute(False)

    return signal
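
The choice made at the `newdata = ...` line mirrors plain dask semantics: `da.stack` introduces a new leading axis when `axis` is None, while `da.concatenate` joins the data along an existing axis. A toy comparison outside of HyperSpy:

import dask.array as da

parts = [da.ones((3, 4), chunks=2), da.zeros((3, 4), chunks=2)]
stacked = da.stack(parts, axis=0)                    # new axis: shape (2, 3, 4)
joined = da.concatenate(parts, axis=0)               # existing axis: shape (6, 4)
print(stacked.shape, joined.shape)
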
Example #53
0
def DAFT(x, axis=-1, chunksize=2**26):
    """Disk-Array Fourier Transform
    
    This function enables Fourier transforms of a very large series, where the
    entire series will not fit in memory.  The standard radix-2 Cooley–Tukey
    algorithm is used to split the series up into smaller pieces until a given
    piece can be done entirely in memory.  This smaller result is then stored
    as a `dask.array`, and combined with other similar results out of memory,
    using dask.
    
    Parameters
    ----------
    x : array_like
        Input array, can be complex.
    axis : int, optional
        Axis over which to compute the FFT. If not given, the last axis is used.
    chunksize : int, optional
        Chunksize to use when splitting up the input array.  Default is 2**26,
        a reasonable target that keeps each in-memory FFT manageable.

    Returns
    -------
    X_da : dask Array object
        The Fourier transform is not yet computed; you must call
        `X_da.compute()` on the result to perform the computation.

    Example
    -------
    >>> import numpy as np
    >>> from chest import Chest  # For more flexible caching
    >>> cache = Chest(available_memory=(4 * 1024**3))  # Use 4GB at most
    >>> N = 2**26
    >>> chunksize = N//(2**2)
    >>> np.random.seed(1234)
    >>> x = np.random.random(N) + 1j*np.random.random(N)
    >>> X_dask = DAFT(x, chunksize=chunksize)
    >>> %tic
    >>> X_DAFT = X_dask.compute(cache=cache)
    >>> %toc
    >>> %tic
    >>> X_np = np.fft.fft(x)
    >>> %toc
    >>> np.allclose(X_DAFT, X_np)

    """
    import numpy as np
    import dask.array as da
    
    if axis<0:
        axis = x.ndim + axis
    N = x.shape[axis]
    chunks = tuple(dim if ax != axis else chunksize for ax, dim in enumerate(x.shape))
    if isinstance(x, da.Array):
        x_da = x.rechunk(chunks=chunks)
    else:
        x_da = da.from_array(x, chunks=chunks)

    W = np.exp(-2j * np.pi * np.arange(N) / N)

    # print(x.shape, axis, x_da.chunks, x_da.chunks[axis]); sys.stdout.flush()
    slice_even = tuple(slice(None) if ax!=axis else slice(None, None, 2) for ax in range(x_da.ndim))
    slice_odd =  tuple(slice(None) if ax!=axis else slice(1, None, 2)    for ax in range(x_da.ndim))
    if len(x_da.chunks[axis]) != 1:
        # TODO: Fix the following lines to be correct when x is multi-dimensional
        FFT_even = DAFT(x_da[slice_even], axis, chunksize=chunksize)
        FFT_odd = DAFT(x_da[slice_odd], axis, chunksize=chunksize)
    else:
        # TODO: Fix the following lines to be correct when x is multi-dimensional
        FFT_even = da.fft.fft(x_da[slice_even], n=None, axis=axis)
        FFT_odd = da.fft.fft(x_da[slice_odd], n=None, axis=axis)

    # TODO: Fix the following line to broadcast W correctly when x is multi-dimensional
    return da.concatenate([FFT_even + W[:N//2] * FFT_odd, FFT_even + W[N//2:] * FFT_odd], axis=axis)
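
A quick way to sanity-check DAFT on a size that fits in memory is to compare it against numpy's FFT, with a chunksize small enough that the recursive splitting actually happens. This sketch assumes the DAFT definition above has already been executed in the session:

import numpy as np

np.random.seed(0)
x = np.random.random(2**12) + 1j * np.random.random(2**12)

X_daft = DAFT(x, chunksize=2**9).compute()           # 2**12 / 2**9 forces several levels of splitting
X_np = np.fft.fft(x)
print(np.allclose(X_daft, X_np))                     # expected: True
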
Example #54
0
    def _expand_tiepoint_array_1km(self, arr, lines, cols):
        arr = da.repeat(arr, lines, axis=1)
        arr = da.concatenate((arr[:, :lines//2, :], arr, arr[:, -(lines//2):, :]), axis=1)
        arr = da.repeat(arr.reshape((-1, self.cscan_full_width - 1)), cols, axis=1)
        return da.hstack((arr, arr[:, -cols:]))
Example #55
0
def valid_images_to_hdf5(directory, width=224, height=224, channels=3):
    '''
    Build the arrays needed for validating the neural network using out-of-core processing,
    storing the resized images and their names in an HDF5 file.
    '''
    
    validationList, _ = get_list_of_validation_files(directory)  # Pass directory containing validation images
    print('Creating the hdf5 file...')
    len_array = len(validationList)
    with h5py.File('validation_files.h5', 'w') as hf:
        dset = hf.create_dataset('validation_array', (len_array, channels, width, height), chunks=True)
        img_names = hf.create_dataset('image_names', (len_array,), chunks=True, dtype='S40')

    with h5py.File('validation_files.h5', 'r+') as hf:
        x = hf['validation_array']
        X = da.from_array(x, chunks=1000)
        image_names = list(hf['image_names'])

    print('There are ', len(validationList), ' files in the validation list.')
    print('Breaking the validation list into chunks of 10,000...')
    chunkedList = get_chunks(validationList, 10000)    # Break the list of files in to chunks of 10000

    if channels == 3:
        for i, chunk in enumerate(chunkedList):
#            print(chunk)
            count = i + len(chunk[i][:])*i                 # Set counter for empty array
#            valid_sublist = chunk[i][:]
            print('Create empty list to store image names..')
            filenames = []
            print('Creating an empty array to store images...')
            X = create_holding_array(chunk, width = width, height=height, channels=channels)    # Create empty array
            for j, validFile in enumerate(chunk):
                print('Reading file #:  ', j)
                filenames.append(os.path.basename(validFile))
#                print(chunk)
#                input('')
                img = misc.imread(validFile)                                # Read the image
                img = misc.imresize(img, size = (width, height, channels))  # Resize image with color channel = 3
#                img = np.transpose(img, (2,0,1))    # Store resized image in empty array
                X[j] = img
            asciiList = []
            asciiList = [n.encode("ascii", "ignore") for n in filenames]
            X1 = np.transpose(X, (0, 3, 1, 2))
            del X, filenames
            print(X1.shape)
            X_da = da.from_array(X1, chunks=1000)
            print('Opening validation_files.h5...')
            with h5py.File('validation_files.h5', 'r+') as hf:
                print('Putting validation_array in x...')
                x = hf['validation_array']
                print('Putting validation_array in dask array...')
                dset = da.from_array(x, chunks=1000)
                print('Concatenating the two dask arrays...')
                X2 = da.concatenate([dset, X_da], axis=0)
                print('Storing the dask array in the hdf5 file...')
                da.store(X2, x)
                print('Put image_names dset into a list...')
                image_names = list(hf['image_names'])
                print('Extend the list with additional image names...')
                image_names.extend(asciiList)
                

            print('Done.')    
        return filenames

    else:     # Number of channels is not 3

        print('Could not create dataset and resize validation images...')
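
The storage loop above writes a concatenated array back into the fixed-size `validation_array` dataset, which only works while the total number of images fits in the preallocated length. A more robust append pattern is sketched below: a resizable HDF5 dataset is grown before each write, and every dask array is stored into its own region with `da.store`. The dataset name, shapes, and dummy data are made up.

import numpy as np
import h5py
import dask.array as da

with h5py.File('validation_files.h5', 'w') as hf:
    dset = hf.create_dataset('validation_array', shape=(0, 3, 224, 224),
                             maxshape=(None, 3, 224, 224), chunks=(50, 3, 224, 224))
    for _ in range(2):                               # stands in for the chunked file list
        block = np.zeros((100, 3, 224, 224), dtype='float32')
        X_da = da.from_array(block, chunks=50)
        start = dset.shape[0]
        dset.resize(start + block.shape[0], axis=0)  # grow the dataset along the image axis
        da.store(X_da, dset, regions=(slice(start, start + block.shape[0]),))
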
Example #56
0
def read_PD0_bytes_ensembles(PD0_BYTES, return_pd0=False, headerid='\x7f\x7f',
                             format='sentinel', use_dask=True, chunks=1e4,
                             verbose=True, print_every=1000):
    """
    Finds the hex positions in the bytearray that identify the header of each
    ensemble. Then read each ensemble into a dictionary and accumulates them
    in a list.
    """
    chunks = int(chunks)
    if format=='workhorse':
        parsepd0 = parse_pd0_bytearray
    elif format=='sentinel':
        parsepd0 = parse_sentinelVpd0_bytearray
    else:
        print('Unknown *.pd0 format')

    # Split segments of the byte array per ensemble.
    ensbytes = PD0_BYTES.split(headerid)
    ensbytes = [headerid + ens for ens in ensbytes] # Prepend header id back.
    ensbytes = ensbytes[1:] # First entry is empty, cap it off.
    nens = len(ensbytes)
    nensm = nens - 1
    fbad_ens = []
    BAD_ENS = []
    # embed()

    # Get timestamps for all ensembles.
    # Note that these timestamps indicate the Janus' (i.e., beams 1-4) pings,
    # which will not necessarily be the same as the vertical beam's timestamp.
    t = np.empty(nens, dtype=object)

    if use_dask:
        DATA = darr.from_array(np.array([], dtype=object, ndmin=1), chunks=chunks)
        ntotalchunks = nens//chunks
        rem_ens = nens%chunks
        has_tail=rem_ens>0
        if has_tail: ntotalchunks+=1 # Last chunk takes remaining ensembles.
        DATAbuffskel = np.empty(chunks, dtype=object)
        DATAbuff = DATAbuffskel.copy()
        daNaN = darr.from_array(np.array(np.nan, ndmin=1), chunks=1)
        cont_inchnk=0
    else:
        DATA = np.empty(nens, dtype=object)

    nChecksumError, nReadChecksumError, nReadHeaderError = 0, 0, 0
    cont=0
    cont_inchnk=0
    for ensb in ensbytes:
        try:
            if use_dask:
                dd = delayed(parsepd0)(ensb)
            else:
                dd = parsepd0(ensb)
            # embed()
            t[cont] = dd['timestamp']
        except (ChecksumError, ReadChecksumError, ReadHeaderError) as E:
            t[cont] = np.nan
            fbad_ens.append(cont) # Store index of bad ensemble.
            # BAD_ENS.append(ens)   # Store bytes of the bad ensemble.

            # Which type of error was it?
            if isinstance(E, ChecksumError):
                nChecksumError += 1
            elif isinstance(E, ReadChecksumError):
                nReadChecksumError += 1
            elif isinstance(E, ReadHeaderError):
                nReadHeaderError += 1

            if use_dask:
                if cont_inchnk==chunks:
                    DATA = darr.concatenate((DATA, daNaN.copy()))
                    DATAbuff = DATAbuffskel.copy()
                    cont_inchnk=0
                else:
                    DATAbuff[cont_inchnk] = np.nan
                    cont_inchnk+=1
                    if has_tail and cont==nensm: # Save the last chunk.
                        DATA = darr.concatenate((DATA, daNaN.copy()))
            else:
                DATA[cont] = np.nan

            cont+=1
            continue

        if use_dask:
            if cont_inchnk==chunks:
                DATA = darr.concatenate((DATA, darr.from_array(DATAbuff, chunks=chunks)))
                DATAbuff = DATAbuffskel.copy()
                cont_inchnk=0
                # embed()
            else:
                DATAbuff[cont_inchnk] = dd
                cont_inchnk+=1
                if has_tail and cont==nensm: # Save the last chunk.
                    DATA = darr.concatenate((DATA, darr.from_array(DATAbuff, chunks=chunks)))
        else:
            DATA[cont] = dd

        cont+=1
        if verbose and not cont%print_every: print("Ensemble %d"%cont)

    errortype_count = dict(bad_checksum=nChecksumError,
                           read_checksum=nReadChecksumError,
                           read_header=nReadHeaderError)

    # Extract ensemble-independent fields (store in xr.Dataset attributes).
    # fixed_attrs = _pick_misc(DATA) # FIXME
    fixed_attrs = []
    # embed()

    if return_pd0:
        ret = (DATA, t, fixed_attrs, BAD_ENS, fbad_ens, errortype_count, PD0_BYTES)
    else:
        ret = (DATA, t, fixed_attrs, BAD_ENS, fbad_ens, errortype_count)

    return ret
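
When each ensemble is parsed through `delayed`, the hand-rolled chunk buffering above can also be expressed as one one-element, object-dtype dask array per ensemble, concatenated at the end; the accumulation stays lazy without tracking buffer counters. A simplified sketch with a toy parser standing in for parse_pd0_bytearray / parse_sentinelVpd0_bytearray:

import numpy as np
import dask.array as darr
from dask import delayed

def toy_parse(ens_bytes):                            # hypothetical stand-in parser
    return {'timestamp': len(ens_bytes), 'raw': ens_bytes}

ensbytes = [b'\x7f\x7f' + bytes([i]) * (i + 1) for i in range(5)]

pieces = [darr.from_delayed(delayed(lambda e: np.array([toy_parse(e)], dtype=object))(ens),
                            shape=(1,), dtype=object)
          for ens in ensbytes]
DATA = darr.concatenate(pieces)                      # lazy object array, one entry per ensemble
print(DATA.compute()[0]['timestamp'])
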
Example #57
0
def rolling_window(a, axis, window, center, fill_value):
    """ Dask's equivalence to np.utils.rolling_window """
    orig_shape = a.shape
    # inputs for ghost
    if axis < 0:
        axis = a.ndim + axis
    depth = {d: 0 for d in range(a.ndim)}
    depth[axis] = int(window / 2)
    # For evenly sized window, we need to crop the first point of each block.
    offset = 1 if window % 2 == 0 else 0

    if depth[axis] > min(a.chunks[axis]):
        raise ValueError(
            "For window size %d, every chunk should be larger than %d, "
            "but the smallest chunk size is %d. Rechunk your array\n"
            "with a larger chunk size or a chunk size that\n"
            "more evenly divides the shape of your array." %
            (window, depth[axis], min(a.chunks[axis])))

    # Although dask.ghost pads values to boundaries of the array,
    # the size of the generated array is smaller than what we want
    # if center == False.
    if center:
        start = int(window / 2)  # 10 -> 5,  9 -> 4
        end = window - 1 - start
    else:
        start, end = window - 1, 0
    pad_size = max(start, end) + offset - depth[axis]
    drop_size = 0
    # pad_size becomes more than 0 when the ghosted array is smaller than
    # needed. In this case, we need to enlarge the original array by padding
    # before ghosting.
    if pad_size > 0:
        if pad_size < depth[axis]:
            # Ghosting requires each chunk larger than depth. If pad_size is
            # smaller than the depth, we enlarge this and truncate it later.
            drop_size = depth[axis] - pad_size
            pad_size = depth[axis]
        shape = list(a.shape)
        shape[axis] = pad_size
        chunks = list(a.chunks)
        chunks[axis] = (pad_size, )
        fill_array = da.full(shape, fill_value, dtype=a.dtype, chunks=chunks)
        a = da.concatenate([fill_array, a], axis=axis)

    boundary = {d: fill_value for d in range(a.ndim)}

    # create ghosted arrays
    ag = da.ghost.ghost(a, depth=depth, boundary=boundary)

    # apply rolling func
    def func(x, window, axis=-1):
        x = np.asarray(x)
        rolling = nputils._rolling_window(x, window, axis)
        return rolling[(slice(None), ) * axis + (slice(offset, None), )]

    chunks = list(a.chunks)
    chunks.append(window)
    out = ag.map_blocks(func, dtype=a.dtype, new_axis=a.ndim, chunks=chunks,
                        window=window, axis=axis)

    # crop boundary.
    index = (slice(None),) * axis + (slice(drop_size,
                                           drop_size + orig_shape[axis]), )
    return out[index]
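
The `da.concatenate` call in the padding branch is the central trick here: when the overlap depth is larger than what the rolling window provides, a block of `fill_value` is prepended along the rolling axis so the boundary crop at the end lines up. In isolation, with made-up sizes, the padding step looks like this:

import numpy as np
import dask.array as da

a = da.from_array(np.arange(24, dtype=float).reshape(4, 6), chunks=(2, 3))
axis, pad_size, fill_value = 1, 2, np.nan

shape = list(a.shape)
shape[axis] = pad_size
chunks = list(a.chunks)
chunks[axis] = (pad_size,)
fill_array = da.full(shape, fill_value, dtype=a.dtype, chunks=chunks)

padded = da.concatenate([fill_array, a], axis=axis)
print(padded.shape)                                  # (4, 8): two NaN columns prepended to the data
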