Example #1
def missing_spectrum(  # pylint: disable=too-many-locals
        data: da.Array, cols: np.ndarray, bins: int) -> dd.DataFrame:
    """
    Calculate the missing-value spectrum (per-bin missing rate) for each column.
    """
    nrows, ncols = data.shape
    num_bins = min(bins, nrows - 1)
    bin_size = nrows // num_bins
    chunk_size = min(1024 * 1024 * 128,
                     nrows * ncols)  # cap at 128 MiB worth of boolean values
    nbins_per_chunk = max(chunk_size // (bin_size * data.shape[1]), 1)

    chunk_size = nbins_per_chunk * bin_size

    data = data.rechunk((chunk_size, None))

    sep = nrows // chunk_size * chunk_size  # largest multiple of chunk_size within nrows
    spectrum_missing_percs = data[:sep].map_blocks(
        missing_perc_blockwise(bin_size),
        chunks=(nbins_per_chunk, *data.shape[1:]),
        dtype=float,
    )

    # calculation for the last chunk
    if sep != nrows:
        spectrum_missing_percs_remain = data[sep:].map_blocks(
            missing_perc_blockwise(bin_size),
            chunks=(int(np.ceil((nrows - sep) / bin_size)), *data.shape[1:]),
            dtype=float,
        )
        spectrum_missing_percs = da.concatenate(
            [spectrum_missing_percs, spectrum_missing_percs_remain], axis=0)

    num_bins = spectrum_missing_percs.shape[0]

    locs0 = da.arange(num_bins) * bin_size
    locs1 = da.minimum(locs0 + bin_size, nrows)
    locs_middle = locs0 + bin_size / 2

    df = dd.from_dask_array(
        da.repeat(da.from_array(cols, (1, )), num_bins),
        columns=["column"],
    )

    df = df.assign(
        location=da.tile(locs_middle, ncols),
        missing_rate=spectrum_missing_percs.T.ravel().rechunk(
            locs_middle.shape[0]),
        loc_start=da.tile(locs0, ncols),
        loc_end=da.tile(locs1, ncols),
    )

    return df
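
A minimal usage sketch; `missing_perc_blockwise` is not shown above, so a hypothetical stand-in (per-bin NaN fraction per column) is included here:

import dask.array as da
import dask.dataframe as dd
import numpy as np

def missing_perc_blockwise(bin_size: int):
    # Hypothetical stand-in for the helper used above: the fraction of
    # NaN cells per column within each bin of `bin_size` rows.
    def imp(blk: np.ndarray) -> np.ndarray:
        nbins = int(np.ceil(blk.shape[0] / bin_size))
        return np.stack([
            np.isnan(blk[i * bin_size:(i + 1) * bin_size]).mean(axis=0)
            for i in range(nbins)
        ])
    return imp

arr = np.random.rand(10_000, 3)
arr[arr < 0.1] = np.nan  # roughly 10% missing cells
df = missing_spectrum(da.from_array(arr, chunks=(1_000, 3)),
                      cols=np.array(["a", "b", "c"], dtype=object),
                      bins=50)
print(df.head())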
Example #2
def scatter_with_regression(
    x: da.Array,
    y: da.Array,
    sample_size: int,
    k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array],
           Optional[da.Array]]:
    """Calculate pearson correlation on 2 given arrays.

    Parameters
    ----------
    xarr : da.Array
    yarr : da.Array
    sample_size : int
    k : Optional[int] = None
        Highlight k points which influence pearson correlation most
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    xp1 = da.vstack([x, da.ones_like(x)]).T
    xp1 = xp1.rechunk((xp1.chunks[0], -1))

    mask = ~(da.isnan(x) | da.isnan(y))

    # If there is only one chunk along the first dimension, lstsq uses sfqr
    # instead of tsqr, and sfqr does not support the unknown (nan) chunk
    # sizes that the boolean masking below produces.
    if len(xp1.chunks[0]) == 1:
        xp1 = xp1.rechunk((2, -1))
        y = y.rechunk((2, -1))
        mask = mask.rechunk((2, -1))

    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])

    if sample_size < x.shape[0]:
        samplesel = da.random.choice(x.shape[0],
                                     int(sample_size),
                                     chunks=x.chunksize)
        x = x[samplesel]
        y = y[samplesel]

    if k is None:
        return (coeffa, coeffb), (x, y), None

    influences = pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
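
A quick sketch with k=None, assuming `from typing import Optional, Tuple` is in scope (the `pearson_influence` helper is only reached when k is given):

import dask.array as da

x = da.random.random(10_000, chunks=1_000)
y = 2 * x + 0.1 * da.random.random(10_000, chunks=1_000)
(slope, intercept), (xs, ys), _ = scatter_with_regression(x, y, sample_size=1_000)
print(float(slope), float(intercept))  # approximately 2.0 and 0.05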
Example #3
def _rechunk(dask_array: da.Array) -> da.Array:
    """Collapse the trailing axes of `dask_array` into single chunks.

    2-D arrays become one chunk; 3-D and 4-D arrays keep "auto" chunking
    on their leading one or two axes respectively.
    """
    ndim_to_chunks = {
        2: {0: -1, 1: -1},
        3: {0: "auto", 1: -1, 2: -1},
        4: {0: "auto", 1: "auto", 2: -1, 3: -1},
    }
    return dask_array.rechunk(ndim_to_chunks[dask_array.ndim])
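
For example, a 3-D array keeps automatic chunking on the leading axis while the trailing two axes collapse to single chunks:

import dask.array as da

vol = da.zeros((100, 64, 64), chunks=(10, 32, 32))
print(_rechunk(vol).chunksize)  # trailing axes become 64 and 64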
Example #4
def _unchunk_ifneeded(data: da.Array, axis: int) -> da.Array:
    """Returns `data` unchunked along `axis`.

    Parameters
    ----------
    data : :class:`dask.array.Array`
        Data which may be chunked along `axis`.
    axis : :class:`int`
        Axis number which specifies the axis to unchunk.

    Returns
    -------
    data : :class:`dask.array.Array`
        A dask array which is not chunked along the specified axis.
    """
    if not isinstance(data, da.Array):
        raise TypeError("data must be a dask array.")
    axis = _check_axis(axis, data.ndim)
    if data.shape[axis] != data.chunksize[axis]:
        data = data.rechunk({axis: -1})
    return data
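
A small sketch; `_check_axis` is not shown above, so a hypothetical stand-in that validates and normalizes the axis number is included:

import dask.array as da

def _check_axis(axis: int, ndim: int) -> int:
    # Hypothetical stand-in: range-check and normalize negative axes.
    if not -ndim <= axis < ndim:
        raise ValueError(f"axis {axis} is out of range for {ndim}-D data")
    return axis % ndim

x = da.zeros((1_000, 8), chunks=(100, 8))
print(_unchunk_ifneeded(x, 0).chunksize)  # (1000, 8): the ten row chunks merged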
Example #5
def est_motion_part(varr: darr.Array,
                    npart: int,
                    chunk_nfm: int,
                    alt_error=5,
                    **kwargs) -> Tuple[darr.Array, darr.Array]:
    """
    Construct the dask graph for the recursive motion estimation algorithm.

    Parameters
    ----------
    varr : darr.Array
        Input dask array representing movie data.
    npart : int
        Number of frames/chunks to combine for the recursive algorithm.
    chunk_nfm : int
        Number of frames in each parallel task.
    alt_error : int, optional
        Error threshold between estimated shifts from two alternative methods,
        specified in pixels. By default `5`.

    Returns
    -------
    temps : darr.Array
        Registration template for the movie.
    shifts : darr.Array
        Estimated motion.

    See Also
    --------
    estimate_motion
    """
    if chunk_nfm is None:
        chunk_nfm = varr.chunksize[0]
    varr = varr.rechunk((chunk_nfm, None, None))
    arr_opt = fct.partial(custom_arr_optimize,
                          keep_patterns=["^est_motion_chunk"])
    if kwargs.get("mesh_size", None):
        param = get_bspline_param(varr[0].compute(), kwargs["mesh_size"])
    tmp_ls = []
    sh_ls = []
    # First pass: estimate motion independently within each chunk of frames.
    for blk in varr.blocks:
        res = da.delayed(est_motion_chunk)(blk,
                                           None,
                                           alt_error=alt_error,
                                           npart=npart,
                                           **kwargs)
        if alt_error:
            tmp = darr.from_delayed(res[0],
                                    shape=(3, blk.shape[1], blk.shape[2]),
                                    dtype=blk.dtype)
        else:
            tmp = darr.from_delayed(res[0],
                                    shape=(blk.shape[1], blk.shape[2]),
                                    dtype=blk.dtype)
        if kwargs.get("mesh_size", None):
            sh = darr.from_delayed(
                res[1],
                shape=(blk.shape[0], 2, int(param[1]), int(param[0])),
                dtype=float,
            )
        else:
            sh = darr.from_delayed(res[1],
                                   shape=(blk.shape[0], 2),
                                   dtype=float)
        tmp_ls.append(tmp)
        sh_ls.append(sh)
    with da.config.set(array_optimize=arr_opt):
        temps = da.optimize(darr.stack(tmp_ls, axis=0))[0]
        shifts = da.optimize(darr.concatenate(sh_ls, axis=0))[0]
    # Recursive passes: aggregate `npart` templates at a time until a single
    # template remains, updating the shifts along the way.
    while temps.shape[0] > 1:
        tmp_ls = []
        sh_ls = []
        for idx in np.arange(0, temps.numblocks[0], npart):
            tmps = temps.blocks[idx:idx + npart]
            sh_org = shifts.blocks[idx:idx + npart]
            sh_org_ls = [sh_org.blocks[i] for i in range(sh_org.numblocks[0])]
            res = da.delayed(est_motion_chunk)(tmps,
                                               sh_org_ls,
                                               alt_error=alt_error,
                                               npart=npart,
                                               **kwargs)
            if alt_error:
                tmp = darr.from_delayed(res[0],
                                        shape=(3, tmps.shape[1],
                                               tmps.shape[2]),
                                        dtype=tmps.dtype)
            else:
                tmp = darr.from_delayed(res[0],
                                        shape=(tmps.shape[1], tmps.shape[2]),
                                        dtype=tmps.dtype)
            sh_new = darr.from_delayed(res[1],
                                       shape=sh_org.shape,
                                       dtype=sh_org.dtype)
            tmp_ls.append(tmp)
            sh_ls.append(sh_new)
        temps = darr.stack(tmp_ls, axis=0)
        shifts = darr.concatenate(sh_ls, axis=0)
    return temps, shifts
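
Here `darr` aliases `dask.array`, `da` the top-level `dask` module, and `fct` `functools`; the function also leans on minian helpers (`est_motion_chunk`, `custom_arr_optimize`, `get_bspline_param`) that are not shown. Assuming those are importable, a call might look like:

import dask.array as darr

movie = darr.random.random((3_000, 256, 256), chunks=(500, 256, 256))
temps, shifts = est_motion_part(movie, npart=3, chunk_nfm=500)
print(shifts.shape)  # (3000, 2): one 2-D shift estimate per frame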