def missing_spectrum(  # pylint: disable=too-many-locals
    data: da.Array, cols: np.ndarray, bins: int
) -> dd.DataFrame:
    """
    Calculate a missing spectrum for each column
    """
    nrows, ncols = data.shape
    num_bins = min(bins, nrows - 1)

    bin_size = nrows // num_bins
    chunk_size = min(
        1024 * 1024 * 128, nrows * ncols
    )  # max 1024 * 1024 * 128 bytes of boolean values
    nbins_per_chunk = max(chunk_size // (bin_size * data.shape[1]), 1)
    chunk_size = nbins_per_chunk * bin_size

    data = data.rechunk((chunk_size, None))

    # `sep` is the largest multiple of `chunk_size` that fits in `nrows`;
    # any rows past it form a final, shorter chunk handled separately below.
    sep = nrows // chunk_size * chunk_size

    spectrum_missing_percs = data[:sep].map_blocks(
        missing_perc_blockwise(bin_size),
        chunks=(nbins_per_chunk, *data.shape[1:]),
        dtype=float,
    )

    # calculation for the last chunk
    if sep != nrows:
        spectrum_missing_percs_remain = data[sep:].map_blocks(
            missing_perc_blockwise(bin_size),
            chunks=(int(np.ceil((nrows - sep) / bin_size)), *data.shape[1:]),
            dtype=float,
        )
        spectrum_missing_percs = da.concatenate(
            [spectrum_missing_percs, spectrum_missing_percs_remain], axis=0
        )

    num_bins = spectrum_missing_percs.shape[0]

    # Bin boundaries and midpoints, used as plot locations.
    locs0 = da.arange(num_bins) * bin_size
    locs1 = da.minimum(locs0 + bin_size, nrows)
    locs_middle = locs0 + bin_size / 2

    df = dd.from_dask_array(
        da.repeat(da.from_array(cols, (1,)), num_bins),
        columns=["column"],
    )
    df = df.assign(
        location=da.tile(locs_middle, ncols),
        missing_rate=spectrum_missing_percs.T.ravel().rechunk(locs_middle.shape[0]),
        loc_start=da.tile(locs0, ncols),
        loc_end=da.tile(locs1, ncols),
    )
    return df
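
# Usage sketch (illustrative, not part of the library): compute the missing
# spectrum of a small random dataset. Assumes this module's imports
# (`np` = numpy, `da` = dask.array) and its `missing_perc_blockwise` helper.
def _demo_missing_spectrum() -> None:
    rng = np.random.default_rng(0)
    arr = rng.standard_normal((10_000, 3))
    arr[rng.random(arr.shape) < 0.1] = np.nan  # inject ~10% missing values
    data = da.from_array(arr, chunks=(1_000, 3))
    cols = np.array(["a", "b", "c"])
    spectrum = missing_spectrum(data, cols, bins=100)
    print(spectrum.compute().head())
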
def scatter_with_regression(
    x: da.Array, y: da.Array, sample_size: int, k: Optional[int] = None
) -> Tuple[Tuple[da.Array, da.Array], Tuple[da.Array, da.Array], Optional[da.Array]]:
    """Fit a linear regression line and sample points for a scatter plot.

    Parameters
    ----------
    x : da.Array
    y : da.Array
    sample_size : int
        Number of points to sample for the scatter plot.
    k : Optional[int] = None
        Highlight the k points that influence the Pearson correlation most.
    """
    if k == 0:
        raise ValueError("k should be larger than 0")

    xp1 = da.vstack([x, da.ones_like(x)]).T
    xp1 = xp1.rechunk((xp1.chunks[0], -1))

    mask = ~(da.isnan(x) | da.isnan(y))
    # If the chunk size in the first dimension is 1, lstsq will use sfqr
    # instead of tsqr, and the former does not support nan in shape.
    if len(xp1.chunks[0]) == 1:
        xp1 = xp1.rechunk((2, -1))
        y = y.rechunk(2)
        mask = mask.rechunk(2)

    (coeffa, coeffb), _, _, _ = da.linalg.lstsq(xp1[mask], y[mask])

    if sample_size < x.shape[0]:
        samplesel = da.random.choice(x.shape[0], int(sample_size), chunks=x.chunksize)
        x = x[samplesel]
        y = y[samplesel]

    if k is None:
        return (coeffa, coeffb), (x, y), None

    influences = pearson_influence(x, y)
    return (coeffa, coeffb), (x, y), influences
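
# Usage sketch (illustrative): recover a known slope/intercept from noisy
# points. Uses the module's `np`/`da` aliases; `pearson_influence` is only
# exercised when `k` is given.
def _demo_scatter_with_regression() -> None:
    rng = np.random.default_rng(0)
    xs = rng.uniform(0.0, 10.0, 5_000)
    ys = 2.0 * xs + 1.0 + rng.standard_normal(5_000)
    x = da.from_array(xs, chunks=1_000)
    y = da.from_array(ys, chunks=1_000)
    (slope, intercept), (sx, sy), _ = scatter_with_regression(x, y, sample_size=1_000)
    print(slope.compute(), intercept.compute())  # close to 2.0 and 1.0
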
def _rechunk(dask_array: da.Array) -> da.Array:
    """Rechunk so that the trailing two axes each span a single chunk,
    leaving any leading axes to be chunked automatically."""
    ndim_to_chunks = {
        2: {0: -1, 1: -1},
        3: {0: "auto", 1: -1, 2: -1},
        4: {0: "auto", 1: "auto", 2: -1, 3: -1},
    }
    return dask_array.rechunk(ndim_to_chunks[dask_array.ndim])
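
# Usage sketch (illustrative): make the trailing two axes contiguous so each
# frame of a (frame, height, width) stack lives in a single block.
def _demo_rechunk() -> None:
    movie = da.zeros((100, 64, 64), chunks=(10, 32, 32))
    print(_rechunk(movie).chunksize)  # e.g. (10, 64, 64): full frames per block
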
def _unchunk_ifneeded(data: da.Array, axis: int) -> da.Array:
    """Returns `data` unchunked along `axis`.

    Parameters
    ----------
    data : :class:`dask.array.Array`
        Data which may be chunked along `axis`.
    axis : :class:`int`
        Axis number which specifies the axis to unchunk.

    Returns
    -------
    data : :class:`dask.array.Array`
        A dask array which is not chunked along the specified axis.
    """
    if not isinstance(data, da.Array):
        raise TypeError("data must be a dask array.")
    axis = _check_axis(axis, data.ndim)
    # Rechunk only when the axis is actually split across blocks.
    if data.shape[axis] != data.chunksize[axis]:
        data = data.rechunk({axis: -1})
    return data
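
# Usage sketch (illustrative): many per-axis operations (e.g. dask FFTs)
# require the working axis to be a single chunk; `_unchunk_ifneeded` is a
# no-op when that already holds. Assumes the module's `_check_axis` helper.
def _demo_unchunk_ifneeded() -> None:
    signal = da.ones((8, 1024), chunks=(8, 256))
    prepared = _unchunk_ifneeded(signal, axis=1)
    print(prepared.chunksize)  # (8, 1024): axis 1 is now one chunk
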
def est_motion_part(
    varr: darr.Array, npart: int, chunk_nfm: int, alt_error: int = 5, **kwargs
) -> Tuple[darr.Array, darr.Array]:
    """
    Construct dask graph for the recursive motion estimation algorithm.

    Parameters
    ----------
    varr : darr.Array
        Input dask array representing movie data.
    npart : int
        Number of frames/chunks to combine for the recursive algorithm.
    chunk_nfm : int
        Number of frames in each parallel task. If `None`, the first chunk
        size of `varr` is used.
    alt_error : int, optional
        Error threshold between estimated shifts from two alternative methods,
        specified in pixels. By default `5`.

    Returns
    -------
    temps : darr.Array
        Registration template for the movie.
    shifts : darr.Array
        Estimated motion.

    See Also
    --------
    estimate_motion
    """
    if chunk_nfm is None:
        chunk_nfm = varr.chunksize[0]
    varr = varr.rechunk((chunk_nfm, None, None))
    arr_opt = fct.partial(custom_arr_optimize, keep_patterns=["^est_motion_chunk"])
    if kwargs.get("mesh_size", None):
        param = get_bspline_param(varr[0].compute(), kwargs["mesh_size"])
    # First pass: estimate a template and shifts independently for each block.
    tmp_ls = []
    sh_ls = []
    for blk in varr.blocks:
        res = da.delayed(est_motion_chunk)(
            blk, None, alt_error=alt_error, npart=npart, **kwargs
        )
        if alt_error:
            tmp = darr.from_delayed(
                res[0], shape=(3, blk.shape[1], blk.shape[2]), dtype=blk.dtype
            )
        else:
            tmp = darr.from_delayed(
                res[0], shape=(blk.shape[1], blk.shape[2]), dtype=blk.dtype
            )
        if kwargs.get("mesh_size", None):
            sh = darr.from_delayed(
                res[1],
                shape=(blk.shape[0], 2, int(param[1]), int(param[0])),
                dtype=float,
            )
        else:
            sh = darr.from_delayed(res[1], shape=(blk.shape[0], 2), dtype=float)
        tmp_ls.append(tmp)
        sh_ls.append(sh)
    with da.config.set(array_optimize=arr_opt):
        temps = da.optimize(darr.stack(tmp_ls, axis=0))[0]
        shifts = da.optimize(darr.concatenate(sh_ls, axis=0))[0]
    # Recursive passes: merge templates `npart` at a time, updating the
    # accumulated shifts, until a single template remains.
    while temps.shape[0] > 1:
        tmp_ls = []
        sh_ls = []
        for idx in np.arange(0, temps.numblocks[0], npart):
            tmps = temps.blocks[idx : idx + npart]
            sh_org = shifts.blocks[idx : idx + npart]
            sh_org_ls = [sh_org.blocks[i] for i in range(sh_org.numblocks[0])]
            res = da.delayed(est_motion_chunk)(
                tmps, sh_org_ls, alt_error=alt_error, npart=npart, **kwargs
            )
            if alt_error:
                tmp = darr.from_delayed(
                    res[0], shape=(3, tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype
                )
            else:
                tmp = darr.from_delayed(
                    res[0], shape=(tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype
                )
            sh_new = darr.from_delayed(res[1], shape=sh_org.shape, dtype=sh_org.dtype)
            tmp_ls.append(tmp)
            sh_ls.append(sh_new)
        temps = darr.stack(tmp_ls, axis=0)
        shifts = darr.concatenate(sh_ls, axis=0)
    return temps, shifts
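
# Usage sketch (illustrative only): run the recursive estimator on a tiny
# synthetic movie. Assumes the module-level `est_motion_chunk` and
# `custom_arr_optimize` used above; in practice this function is driven by
# `estimate_motion` with project-specific kwargs.
def _demo_est_motion_part() -> None:
    movie = darr.random.random((120, 64, 64), chunks=(30, 64, 64))
    temps, shifts = est_motion_part(movie, npart=3, chunk_nfm=30)
    print(temps.shape, shifts.shape)  # final template(s) and one shift per frame
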