def test_broadcast_arrays_dask(d1_chunks):
    """Check shape and chunking of ``broadcast_arrays`` on two lazy arrays.

    A (5, 25) array chunked as ``d1_chunks`` is broadcast against a single
    chunk (1, 25) array; both outputs must end up with shape (5, 25) and
    chunks equal to ``d1_chunks``.
    """
    full_shape = (5, 25)
    wide = dsa.empty(full_shape, chunks=d1_chunks)
    narrow = dsa.empty((1, 25), chunks=(1, 25))

    wide_b, narrow_b = broadcast_arrays(wide, narrow)
    outputs = (wide_b, narrow_b)

    # Shapes are verified for both outputs before any chunk comparison.
    for out in outputs:
        assert out.shape == full_shape
    for out in outputs:
        assert out.chunks == d1_chunks
def setup(self):
    """Prepare the array and the two index sets stored on the instance.

    ``self.a`` is an uninitialised (2000000, 200, 2) ``i1`` array, ``self.c``
    a random boolean selector over its first axis and ``self.s`` a sorted
    list of 100 distinct positions along its second axis.
    """
    # rnd() is defined elsewhere; it must offer randint() and choice().
    rng = rnd()
    depth = 2
    self.a = da.empty(
        shape=(2000000, 200, depth),
        dtype='i1',
        chunks=(10000, 100, depth),
    )
    n_rows, n_cols = self.a.shape[0], self.a.shape[1]
    self.c = rng.randint(0, 2, size=n_rows, dtype=bool)
    picked_columns = rng.choice(n_cols, size=100, replace=False)
    self.s = sorted(picked_columns)
def convert_probability_to_call(
    ds: Dataset,
    call_genotype_probability: str = variables.call_genotype_probability,
    threshold: float = 0.9,
    merge: bool = True,
) -> Dataset:
    """Turn genotype probabilities into hard genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype probabilities, such as from
        :func:`sgkit.io.bgen.read_bgen`.
    call_genotype_probability
        Name of the genotype probability variable to convert, as defined by
        :data:`sgkit.variables.call_genotype_probability_spec`.
    threshold
        Probability threshold in [0, 1] that at least one genotype
        probability must meet or exceed for any call to be made; otherwise
        all values are -1 (missing). Setting this value to less than or
        equal to 0 disables any effect it has. Default value is 0.9.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only the
        computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `call_genotype` (variants, samples, ploidy): Converted hard calls.
      Defined by :data:`sgkit.variables.call_genotype_spec`.

    - `call_genotype_mask` (variants, samples, ploidy): Mask for converted
      hard calls. Defined by
      :data:`sgkit.variables.call_genotype_mask_spec`.

    Raises
    ------
    ValueError
        If ``threshold`` is not within [0, 1].
    NotImplementedError
        If the ``genotypes`` dimension does not have exactly 3 entries.
    """
    threshold_ok = 0 <= threshold <= 1
    if not threshold_ok:
        raise ValueError(f"Threshold must be float in [0, 1], not {threshold}.")

    spec_by_name = {
        call_genotype_probability: variables.call_genotype_probability_spec
    }
    variables.validate(ds, spec_by_name)

    n_genotypes = ds.dims["genotypes"]
    if n_genotypes != 3:
        raise NotImplementedError(
            f"Hard call conversion only supported for diploid, biallelic genotypes; "
            f"num genotypes in provided probabilities array = {ds.dims['genotypes']}."
        )

    probabilities = da.asarray(ds[call_genotype_probability])
    # The genotypes axis (last one) must live in a single chunk.
    genotype_chunks = probabilities.chunks[2]
    if len(genotype_chunks) > 1:
        probabilities = probabilities.rechunk((None, None, -1))

    # Two-element placeholder array handed to the conversion kernel.
    ploidy_template = da.empty(2, dtype=np.uint8)
    hard_calls = _convert_probability_to_call(
        probabilities, ploidy_template, threshold
    )

    call_dims = ("variants", "samples", "ploidy")
    outputs = {
        variables.call_genotype: (call_dims, hard_calls),
        variables.call_genotype_mask: (call_dims, hard_calls < 0),
    }
    return conditional_merge_datasets(ds, create_dataset(outputs), merge)
def cf_y_x():
    """Build a (Y_DIM_SIZE, X_DIM_SIZE) DataArray with dims ("y", "x").

    The array carries a ``grid_mapping`` attribute set to
    ``"a_grid_map_var"``.
    """
    values = da.empty((Y_DIM_SIZE, X_DIM_SIZE))
    metadata = {"grid_mapping": "a_grid_map_var"}
    return xr.DataArray(values, dims=("y", "x"), attrs=metadata)
def duck_empty(dims, sizes, dtype="float64", chunks=None):
    """Create an uninitialised DataArray backed by numpy or by ``dsk``.

    Parameters
    ----------
    dims
        Dimension names, in order.
    sizes
        Mapping from dimension name to its length.
    dtype
        Element type of the array.
    chunks
        Optional mapping from dimension name to chunk specification. When
        falsy a numpy array is allocated; otherwise ``dsk.empty`` is used and
        any dimension missing from ``chunks`` gets one chunk spanning its
        full length.
    """
    shape = [sizes[name] for name in dims]

    if not chunks:
        return xr.DataArray(np.empty(shape, dtype=dtype), dims=dims)

    per_dim_chunks = []
    for name in dims:
        whole_dim = (sizes[name],)
        per_dim_chunks.append(chunks.get(name, whole_dim))
    lazy = dsk.empty(shape, chunks=per_dim_chunks, dtype=dtype)
    return xr.DataArray(lazy, dims=dims)
def raw_coords_lats1d_lons1d():
    """Build a ("lats", "lons") DataArray with 1-D coordinates.

    Both coordinates are evenly spaced between 25 and 35; "lats" has
    Y_DIM_SIZE points and "lons" has X_DIM_SIZE points.
    """
    values = da.empty((Y_DIM_SIZE, X_DIM_SIZE))
    coordinates = {
        "lons": da.linspace(25, 35, X_DIM_SIZE),
        "lats": da.linspace(25, 35, Y_DIM_SIZE),
    }
    return xr.DataArray(values, dims=("lats", "lons"), coords=coordinates)
def label_adjacency_graph(labels, nlabels, depth, iou_threshold):
    """Build the label adjacency matrix across the chunk faces of ``labels``.

    For every (slice, axis) pair returned by ``get_slices_and_axes`` the
    corresponding face of ``labels`` is passed to
    ``_across_block_iou_delayed``. The resulting 2-row blocks are
    concatenated and the two rows are handed to ``_label._to_csr_matrix``
    together with ``nlabels + 1``.
    """
    # Start with an empty (2, 0) block so the concatenation is never empty.
    pair_blocks = [da.empty((2, 0), dtype=np.int32, chunks=1)]
    face_specs = get_slices_and_axes(labels.chunks, labels.shape, depth)
    pair_blocks.extend(
        _across_block_iou_delayed(labels[face_slice], axis, iou_threshold)
        for face_slice, axis in face_specs
    )
    rows, cols = da.concatenate(pair_blocks, axis=1)
    return _label._to_csr_matrix(rows, cols, nlabels + 1)
def regrid_chunk(
    block_src_data,
    block_src_y_pnts,
    block_src_low_y_bnds,
    block_src_upp_y_bnds,
    src_y_coord_metadata,
    src_x_coord,
    src_cube_metadata,
    tgt_y_coord,
    tgt_x_coord,
    tgt_y_slices,
    tgt_cube_metadata,
    y_dim,
    x_dim,
    scheme,
    block_info=None,
):
    """Regrid one block of source data onto the matching target rows.

    A source cube is assembled from the block data, a y coordinate rebuilt
    from the supplied points and lower/upper bounds, and the full x
    coordinate. The target y slice is selected from ``tgt_y_slices`` using
    the first entry of this block's ``"chunk-location"`` in ``block_info``.
    The regridded data is returned reshaped to (target y size, target x
    size).

    ``block_info`` is indexed unconditionally, so it must be provided by
    the caller despite its ``None`` default.
    """
    # Source side: rebuild the y coordinate for this block.
    y_bounds = np.hstack((block_src_low_y_bnds, block_src_upp_y_bnds))
    src_y = iris.coords.DimCoord(block_src_y_pnts.ravel(), bounds=y_bounds)
    src_y.metadata = src_y_coord_metadata

    source = iris.cube.Cube(
        block_src_data,
        dim_coords_and_dims=[(src_y, y_dim), (src_x_coord, x_dim)],
    )
    source.metadata = src_cube_metadata

    # Target side: pick the rows that belong to this chunk.
    chunk_row = block_info[0]["chunk-location"][0]
    tgt_y = tgt_y_coord[tgt_y_slices[chunk_row]]
    tgt_shape = (tgt_y.shape[0], tgt_x_coord.shape[0])

    target = iris.cube.Cube(
        da.empty(tgt_shape),
        dim_coords_and_dims=[(tgt_y, y_dim), (tgt_x_coord, x_dim)],
    )
    target.metadata = tgt_cube_metadata

    # Regrid and force the result to exactly two dimensions.
    regridded = source.regrid(target, scheme)
    return regridded.data.reshape(tgt_shape)
def read_5d(filename: str,
            sizes: Tuple[int, int, int, int, int],
            s: int,
            mdata: czimd.CziMetadata,
            remove_Adim: bool = True) -> np.ndarray:
    """Read every (T, Z, C) plane of a CZI file into one array.

    Parameters
    ----------
    filename
        Path of the CZI file to open.
    sizes
        Five extents. The first three are iterated as the T, Z and C plane
        indices; the last two size the remaining image axes
        (presumably Y and X -- TODO confirm against the caller).
    s
        Scene index, used only when ``mdata.image.SizeS`` is not None.
    mdata
        Metadata object providing ``isRGB``, ``npdtype``, ``pyczi_dims`` and
        ``image.SizeS`` / ``image.SizeX`` / ``image.SizeY``.
    remove_Adim
        If True, drop the trailing sample axis when it has length 1.

    Returns
    -------
    The filled array. NOTE(review): the buffer is created with ``da.empty``,
    so the returned object is whatever that produces rather than necessarily
    a plain ``np.ndarray`` -- confirm against callers.
    """
    n_samples = 3 if mdata.isRGB else 1
    array_md = da.empty(
        [sizes[0], sizes[1], sizes[2], sizes[3], sizes[4], n_samples],
        dtype=mdata.npdtype,
    )

    # open the CZI document to read the planes
    with pyczi.open_czi(filename) as czidoc:
        for t, z, c in product(range(sizes[0]),
                               range(sizes[1]),
                               range(sizes[2])):
            plane = {'T': t, 'Z': z, 'C': c}

            # The plane must be requested in both branches: without it the
            # reader returns its default plane for every (t, z, c).
            if mdata.image.SizeS is None:
                image2d = czidoc.read(plane=plane)
            else:
                image2d = czidoc.read(plane=plane, scene=s)

            # crop if the returned image is larger than the stated size
            too_wide = mdata.pyczi_dims["X"][1] > mdata.image.SizeX
            too_tall = mdata.pyczi_dims["Y"][1] > mdata.image.SizeY
            if too_wide or too_tall:
                image2d = image2d[...,
                                  0:mdata.image.SizeY,
                                  0:mdata.image.SizeX,
                                  :]

            array_md[t, z, c, ...] = image2d

    # Squeezing an axis whose length is not 1 raises, so RGB data (three
    # samples) keeps its last axis even when remove_Adim is requested.
    if remove_Adim and array_md.shape[-1] == 1:
        array_md = np.squeeze(array_md, axis=-1)

    return array_md
def gx_y_x():
    """Build a ("y", "x") DataArray with a CRS coordinate and 1-D y/x coords.

    The data has shape (Y_DIM_SIZE, X_DIM_SIZE). A scalar ``spatial_ref``
    coordinate carries the EPSG:4326 WKT in both its ``crs_wkt`` and
    ``spatial_ref`` attributes, and the array's ``grid_mapping`` attribute
    points at it.
    """
    crs = CRS.from_epsg(4326)
    wkt_attrs = {
        "crs_wkt": crs.to_wkt(),
        "spatial_ref": crs.to_wkt(),
    }
    return xr.DataArray(
        da.empty((Y_DIM_SIZE, X_DIM_SIZE)),
        dims=("y", "x"),
        attrs={
            "grid_mapping": "spatial_ref",
        },
        coords={
            "spatial_ref": xr.DataArray(0, attrs=wkt_attrs),
            # Each coordinate length must match its own dimension: "y" is the
            # first axis (Y_DIM_SIZE) and "x" the second (X_DIM_SIZE).
            "y": da.linspace(0, 15000, Y_DIM_SIZE),
            "x": da.linspace(-15000, 10000, X_DIM_SIZE),
        },
    )
def label_adjacency_graph(labels, structure, nlabels):
    """
    Adjacency graph of labels between chunks of ``labels``.

    Every chunk of ``labels`` was labeled on its own, with label values
    guaranteed not to repeat between chunks. This function links labels
    from different chunks that belong to the same logical label in the
    global volume, i.e. labels that "touch" across a block face according
    to ``structure``.

    Parameters
    ----------
    labels : dask array of int
        The input labeled array, where each chunk is independently labeled.
    structure : array of bool
        Structuring element, shape (3,) * labels.ndim.
    nlabels : delayed int
        The total number of labels in ``labels`` *before* correcting for
        global consistency.

    Returns
    -------
    mat : delayed scipy.sparse.csr_matrix
        This matrix has value 1 at (i, j) if label i is connected to
        label j in the global volume, 0 everywhere else.
    """
    block_faces = _chunk_faces(labels.chunks, labels.shape)

    # Start with an empty (2, 0) block so the concatenation is never empty.
    pair_blocks = [da.empty((2, 0), dtype=LABEL_DTYPE, chunks=1)]
    pair_blocks.extend(
        _across_block_label_grouping_delayed(labels[face_slice], structure)
        for face_slice in block_faces
    )

    rows, cols = da.concatenate(pair_blocks, axis=1)
    return _to_csr_matrix(rows, cols, nlabels + 1)
def strfunc_from_pdf1(rxs, pdf, values, order, absolute=False):
    """Compute the structure function of a given order from a pdf.

    For each row ``irx`` the result is
    ``sum(pdf[irx] * values[irx]**order)`` multiplied by the absolute
    spacing between the first two entries of ``values[irx]``.

    Parameters
    ----------
    rxs
        Array whose ``size`` gives the number of rows processed and whose
        ``shape`` gives the shape of the result.
    pdf, values
        Two-dimensional arrays indexed by row, then by bin.
    order
        Exponent applied to ``values``.
    absolute
        If True, ``values`` is replaced by its absolute value first.

    Returns
    -------
    The computed (non-lazy) result array.
    """
    if absolute:
        values = abs(values)

    n_bins = pdf.shape[1]
    row_chunks = (1, n_bins)
    lazy_pdf = da.from_array(pdf, chunks=row_chunks)
    lazy_values = da.from_array(values, chunks=row_chunks)

    S_order = da.empty(rxs.shape, chunks=1)
    print(f"S_order {S_order.shape}")
    print(f"pdf {pdf.shape}")
    print(f"values {values.shape}")

    for irx in range(rxs.size):
        weighted_sum = da.sum(lazy_pdf[irx] * lazy_values[irx]**order)
        bin_width = np.abs(lazy_values[irx, 1] - lazy_values[irx, 0])
        S_order[irx] = weighted_sum * bin_width

    return S_order.compute()
def count_cohort_alleles(
    ds: Dataset,
    *,
    call_allele_count: Hashable = variables.call_allele_count,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort allele counts from per-sample allele counts, or genotype calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_allele_count
        Input variable name holding call_allele_count as defined by
        :data:`sgkit.variables.call_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_call_alleles`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.cohort_allele_count_spec`
    of allele counts with shape (variants, cohorts, alleles) and values
    corresponding to the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1   S2   S3
    variants
    0         0/0  1/0  1/0  0/1
    1         1/0  0/1  0/0  1/0
    2         1/1  0/0  1/0  0/1
    3         1/0  1/1  1/1  1/0
    4         1/0  0/0  1/0  1/1

    >>> sg.count_cohort_alleles(ds)["cohort_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[3, 1],
            [2, 2]],
    <BLANKLINE>
           [[2, 2],
            [3, 1]],
    <BLANKLINE>
           [[2, 2],
            [2, 2]],
    <BLANKLINE>
           [[1, 3],
            [1, 3]],
    <BLANKLINE>
           [[3, 1],
            [1, 3]]])
    """
    ds = define_variable_if_absent(
        ds, variables.call_allele_count, call_allele_count, count_call_alleles
    )
    variables.validate(ds, {call_allele_count: variables.call_allele_count_spec})

    n_variants = ds.dims["variants"]
    n_alleles = ds.dims["alleles"]

    AC, SC = da.asarray(ds[call_allele_count]), da.asarray(ds[sample_cohort])
    n_cohorts = SC.max().compute() + 1  # 0-based indexing
    # Placeholder array of length n_cohorts handed to the block function.
    C = da.empty(n_cohorts, dtype=np.uint8)

    # The output blocks follow the blocks of AC, so the variants chunking
    # must come from AC itself rather than from a separate genotype
    # variable, which may be absent or chunked differently.
    shape = (AC.chunks[0], n_cohorts, n_alleles)

    AC = da.map_blocks(_count_cohort_alleles, AC, SC, C, chunks=shape, dtype=np.int32)
    assert_array_shape(
        AC, n_variants, n_cohorts * AC.numblocks[1], n_alleles * AC.numblocks[2]
    )

    # Stack the blocks and sum across them
    # (which will only work because each chunk is guaranteed to have same size)
    AC = da.stack([AC.blocks[:, i] for i in range(AC.numblocks[1])]).sum(axis=0)
    assert_array_shape(AC, n_variants, n_cohorts, n_alleles)

    new_ds = create_dataset(
        {variables.cohort_allele_count: (("variants", "cohorts", "alleles"), AC)}
    )
    return conditional_merge_datasets(ds, new_ds, merge)
def count_call_alleles(ds: Dataset, merge: bool = True) -> Dataset:
    """Count, for every sample, how often each allele occurs in its calls.

    Parameters
    ----------
    ds : Dataset
        Genotype call dataset such as from
        `sgkit.create_genotype_call_dataset`.
    merge : bool, optional
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset. Output variables will
        overwrite any input variables with the same name, and a warning
        will be issued in this case.
        If False, return only the computed output variables.

    Returns
    -------
    Dataset
        Array `call_allele_count` of allele counts with
        shape (variants, samples, alleles) and values corresponding to
        the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> from sgkit.testing import simulate_genotype_call_dataset
    >>> ds = simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    n_alleles = ds.dims["alleles"]
    genotypes = da.asarray(ds["call_genotype"])

    # Variants/samples chunking is inherited; the last axis becomes alleles.
    out_chunks = (genotypes.chunks[0], genotypes.chunks[1], n_alleles)
    # Placeholder array of length n_alleles handed to the block function.
    allele_template = da.empty(n_alleles, dtype=np.uint8)

    counts = da.map_blocks(
        count_alleles,
        genotypes,
        allele_template,
        chunks=out_chunks,
        drop_axis=2,
        new_axis=2,
    )
    new_ds = Dataset(
        {"call_allele_count": (("variants", "samples", "alleles"), counts)}
    )

    if merge:
        return merge_datasets(ds, new_ds)
    return new_ds
def concatenate_and_rechunk(
    zarrs: Sequence[zarr.Array],
    chunks: Optional[Tuple[int, ...]] = None,
    dtype: DType = None,
) -> da.Array:
    """Concatenate Zarr arrays along the first axis with uniform chunking.

    The result is suitable for saving as a single Zarr array. In contrast
    to Dask's ``rechunk`` method, the Dask computation graph is
    embarrassingly parallel and will make efficient use of memory, since
    no Zarr chunks are cached by the Dask scheduler.

    The Zarr arrays must have matching shapes except in the first
    dimension.

    Parameters
    ----------
    zarrs
        Collection of Zarr arrays to concatenate.
    chunks : Optional[Tuple[int, ...]], optional
        The chunks to apply to the concatenated arrays. If not specified
        the chunks for the first array will be applied to the concatenated
        array.
    dtype
        The dtype of the concatenated array, by default the same as the
        first array.

    Returns
    -------
    A Dask array, suitable for saving as a single Zarr array.

    Raises
    ------
    ValueError
        If the Zarr arrays do not have matching shapes (except in the
        first dimension).
    """
    trailing_shapes = {z.shape[1:] for z in zarrs}
    if len(trailing_shapes) > 1:
        shapes = [z.shape for z in zarrs]
        raise ValueError(
            f"Zarr arrays must have matching shapes (except in the first dimension): {shapes}"
        )

    # offsets[k] is the row at which zarrs[k] starts in the result;
    # the final entry is the total number of rows.
    lengths = np.array([z.shape[0] for z in zarrs])
    offsets = np.cumsum(
        np.insert(lengths, 0, 0, axis=0)  # type: ignore[no-untyped-call]
    )

    first = zarrs[0]
    shape = (offsets[-1], *first.shape[1:])
    if not chunks:
        chunks = first.chunks
    if not dtype:
        dtype = first.dtype

    # Template array: only its block layout is used, never its contents.
    template = da.empty(shape, chunks=chunks)

    def load_chunk(
        x: ArrayLike,
        zarrs: Sequence[zarr.Array],
        offsets: ArrayLike,
        block_info: Dict[Any, Any],
    ) -> ArrayLike:
        location = block_info[0]["array-location"]
        return _slice_zarrs(zarrs, offsets, location)

    return template.map_blocks(
        load_chunk, zarrs=zarrs, offsets=offsets, dtype=dtype
    )
def computeTimeChunk(self, time, dt):
    """Load a chunk of three data time steps into the FieldSet.

    This is used when FieldSet uses data imported from netcdf,
    with default option deferred_load. The loaded time steps are at or
    immediately before time and the two time steps immediately following
    time if dt is positive (and inversely for negative dt).

    :param time: Time around which the FieldSet chunks are to be loaded.
                 Time is provided as a double, relatively to
                 Fieldset.time_origin
    :param dt: time step of the integration scheme
    :returns: ``nextTime`` when it is infinite or NaN, or when fewer than
              one whole ``dt`` step fits before it; otherwise ``time`` plus
              a whole number of ``dt`` steps
    :raises TimeExtrapolationError: if a grid reports ``time`` itself as its
              next time while ``dt`` is non-zero
    """
    signdt = np.sign(dt)
    # np.inf is used instead of the np.infty alias, which NumPy 2.0 removed.
    nextTime = np.inf if dt > 0 else -np.inf

    # First pass: let every deferred grid advance once and collect the
    # earliest (or latest, for backward integration) next time.
    for g in self.gridset.grids:
        g.update_status = 'not_updated'
    for f in self.get_fields():
        if type(f) in [VectorField, NestedField, SummedField] or not f.grid.defer_load:
            continue
        if f.grid.update_status == 'not_updated':
            nextTime_loc = f.grid.computeTimeChunk(f, time, signdt)
            if time == nextTime_loc and signdt != 0:
                raise TimeExtrapolationError(time, field=f, msg='In fset.computeTimeChunk')
        nextTime = min(nextTime, nextTime_loc) if signdt >= 0 else max(nextTime, nextTime_loc)

    # Second pass: (re)load the field data according to the grid status.
    for f in self.get_fields():
        if type(f) in [VectorField, NestedField, SummedField] or not f.grid.defer_load or f.dataFiles is None:
            continue
        g = f.grid
        if g.update_status == 'first_updated':  # First load of data
            data = da.empty((g.tdim, g.zdim, g.ydim-2*g.meridional_halo, g.xdim-2*g.zonal_halo), dtype=np.float32)
            f.loaded_time_indices = range(3)
            for tind in f.loaded_time_indices:
                for fb in f.filebuffers:
                    if fb is not None:
                        fb.dataset.close()
                data = f.computeTimeChunk(data, tind)
            data = f.rescale_and_set_minmax(data)
            f.data = f.reshape(data)
            if not f.chunk_set:
                f.chunk_setup()
            if len(g.load_chunk) > 0:
                g.load_chunk = np.where(g.load_chunk == 2, 1, g.load_chunk)
                g.load_chunk = np.where(g.load_chunk == 3, 0, g.load_chunk)
        elif g.update_status == 'updated':
            data = da.empty((g.tdim, g.zdim, g.ydim-2*g.meridional_halo, g.xdim-2*g.zonal_halo), dtype=np.float32)
            if signdt >= 0:
                # Forward: drop the oldest buffer and load time index 2.
                f.loaded_time_indices = [2]
                f.filebuffers[0].dataset.close()
                f.filebuffers[:2] = f.filebuffers[1:]
                data = f.computeTimeChunk(data, 2)
            else:
                # Backward: drop the newest buffer and load time index 0.
                f.loaded_time_indices = [0]
                f.filebuffers[2].dataset.close()
                f.filebuffers[1:] = f.filebuffers[:2]
                data = f.computeTimeChunk(data, 0)
            data = f.rescale_and_set_minmax(data)
            if signdt >= 0:
                data = f.reshape(data)[2:, :]
                f.data = da.concatenate([f.data[1:, :], data], axis=0)
            else:
                data = f.reshape(data)[0:1, :]
                f.data = da.concatenate([data, f.data[:2, :]], axis=0)
            if len(g.load_chunk) > 0:
                if signdt >= 0:
                    for block_id in range(len(g.load_chunk)):
                        if g.load_chunk[block_id] == 2:
                            if f.data_chunks[block_id] is None:
                                # file chunks were never loaded.
                                # happens when field not called by kernel, but shares a grid with another field called by kernel
                                break
                            block = f.get_block(block_id)
                            f.data_chunks[block_id][:2] = f.data_chunks[block_id][1:]
                            f.data_chunks[block_id][2] = np.array(f.data.blocks[(slice(3),)+block][2])
                else:
                    for block_id in range(len(g.load_chunk)):
                        if g.load_chunk[block_id] == 2:
                            if f.data_chunks[block_id] is None:
                                # file chunks were never loaded.
                                # happens when field not called by kernel, but shares a grid with another field called by kernel
                                break
                            block = f.get_block(block_id)
                            f.data_chunks[block_id][1:] = f.data_chunks[block_id][:2]
                            f.data_chunks[block_id][0] = np.array(f.data.blocks[(slice(3),)+block][0])

    # do user-defined computations on fieldset data
    if self.compute_on_defer:
        self.compute_on_defer(self)

    if abs(nextTime) == np.inf or np.isnan(nextTime):  # Second happens when dt=0
        return nextTime
    else:
        nSteps = int((nextTime - time) / dt)
        if nSteps == 0:
            return nextTime
        else:
            return time + nSteps * dt
def misc_t_z_y_x():
    """Build a 4-D DataArray with dims ("t", "z", "y", "x").

    The shape is (TIME_DIM_SIZE, ALT_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE).
    """
    shape = (TIME_DIM_SIZE, ALT_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("t", "z", "y", "x"))
def misc_y_x_z():
    """Build a 3-D DataArray with dims ("y", "x", "z").

    The shape is (Y_DIM_SIZE, X_DIM_SIZE, ALT_DIM_SIZE).
    """
    shape = (Y_DIM_SIZE, X_DIM_SIZE, ALT_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("y", "x", "z"))
def geotiff_y_x_bands():
    """Build a 3-D DataArray with dims ("y", "x", "bands").

    The shape is (Y_DIM_SIZE, X_DIM_SIZE, OTHER_DIM_SIZE).
    """
    shape = (Y_DIM_SIZE, X_DIM_SIZE, OTHER_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("y", "x", "bands"))
def geotiff_bands_y_x():
    """Build a 3-D DataArray with dims ("bands", "y", "x").

    The shape is (OTHER_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE).
    """
    shape = (OTHER_DIM_SIZE, Y_DIM_SIZE, X_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("bands", "y", "x"))
def geotiff_b_a():
    """Build a 2-D DataArray whose dims are named ("a", "b").

    The shape is (Y_DIM_SIZE, X_DIM_SIZE).
    """
    shape = (Y_DIM_SIZE, X_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("a", "b"))
def wrapper(func, time, params=1, chunks=None, dtype='float', output='xarray', name='z', **kwargs):
    """ Wraps timeseries generation code in order to distribute the generation

    Parameters
    ----------
        func: method
            Method wrapped, signature needs to be
            func(p1, p2, ..., time, draws=1, **kwargs)
            where p1, p2 are dimensioning parameters that are nor time nor draw
            Minimal signature is func(time, draws=1, **kwargs)
        time: int, np.ndarray, tuple
            Number of time steps, time array, tuple (T, dt)
        params: int, dict, optional
            Parameters that will lead to dimensions or required to generate
            time series. An int is taken as the number of draws.
        chunks: dict, optional
            Associated chunks, keyed by dimension name
        dtype: optional
            dtype announced for the generated blocks
        output: str, optional
            'xarray' wraps the result in a DataArray, 'dask_dd' converts it
            to a dask dataframe; any other value returns the dask array
        seed: int, optional
            numpy seed, passed through **kwargs
        name: str, optional
            output name, may be required if multiple variables are correlated
            (dask otherwise will assume they are one and the same)
        **kwargs:
            passed to func
    """

    if isinstance(time, int):
        time = np.arange(time)
    elif isinstance(time, tuple):
        time = np.arange(0., time[0], time[1])
    else:
        time = np.array(time)

    if isinstance(params, dict):
        dims = {}
        for d, v in params.items():
            if d=='draw' and isinstance(v, int):
                dims[d] = np.arange(v)
            else:
                dims[d] = np.array(v, ndmin=1)
    else:
        dims = {'draw': np.array(range(params), ndmin=1)}
    # time is always the last dimension
    dims['time'] = time
    Nd = len(dims)
    shape = tuple(v.size for d, v in dims.items())

    # default chunking: automatic everywhere, a single chunk along time
    xr_chunks = {d: 'auto' for d in dims}
    xr_chunks['time'] = -1
    if chunks:
        xr_chunks.update(**chunks)
    da_chunks = tuple(xr_chunks[d] for d in dims)

    # transform dimensions into dask arrays with appropriate forms
    # Note: adding name to dimension names below is pretty critical if
    #   multiple calls to wrapper are made.
    #   dask will create a single object ... danger
    dims_da = tuple(da.from_array(dims[d]
                                  .reshape(tuple(dims[d].size if i==j else 1 for j in range(Nd))),
                                  chunks=tuple(xr_chunks[d] if i==j else -1 for j in range(Nd)),
                                  name=name+d
                                  )
                    for i, d in enumerate(dims)
                    )

    # wraps func to reinit numpy seed from chunk number
    def _func(*args, seed=None, block_info=None, **kwargs):
        if seed is None:
            seed = np.random.randint(0,2**32-1)
        # 'num-chunks' is identical for every block, so the offset must be
        # derived from 'chunk-location' instead: flatten it to one running
        # chunk number so each block gets its own seed.
        info = block_info[0]
        chunk_number = 0
        for location, count in zip(info['chunk-location'], info['num-chunks']):
            chunk_number = chunk_number * count + location
        # np.random.seed only accepts values below 2**32
        np.random.seed((seed + chunk_number) % 2**32)
        return func(*args[1:], draws=args[0].shape[-2], seed=seed, **kwargs)

    x = da.empty(shape=shape, chunks=da_chunks)
    # the draw dimension is conveyed through the block shape, not as an input
    dims_da = tuple(d for d in dims_da if d.name!=name+'draw')
    x = x.map_blocks(_func, *dims_da, **kwargs, dtype=dtype)
    x = x.squeeze()
    # squeeze() removed the length-1 axes, drop the matching dimensions
    dims = {d: v for d, v in dims.items() if v.size>1}

    # put result in an xarray DataArray
    if output=='xarray':
        x = xr.DataArray(x, dims=tuple(dims), coords=dims).rename(name)
    elif output=='dask_dd':
        assert x.ndim<3, 'Data generated is not 2D and cannot be transformed' \
            +' into a dataframe'
        to_index = lambda d: (dd
                              .from_array(dims[d], columns=d)
                              .to_frame()
                              .set_index(d)
                              .index
                              )
        if shape[0]==1:
            i=to_index('time')
            c='draw'
        else:
            i=to_index('draw')
            c=time
        x = dd.from_dask_array(x, index=i, columns=c)

    return x
def test_non_square_datasets(self):
    """Run ``_intensity_peaks_image`` on a non-square (6, 16) navigation grid.

    The data array and the peak array share the same (2, 2) chunking on the
    first two axes. The test only checks that the call completes.
    """
    data_array_dask = da.ones((6, 16, 100, 50), chunks=(2, 2, 25, 25))
    # The builtin ``object`` replaces the ``np.object`` alias, which was
    # removed from NumPy.
    peak_array_dask = da.empty((6, 16), chunks=(2, 2), dtype=object)
    dt._intensity_peaks_image(data_array_dask, peak_array_dask, 5)
def count_call_alleles(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    merge: bool = True,
) -> Dataset:
    """Count, for every sample, how often each allele occurs in its calls.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing :data:`sgkit.variables.call_allele_count_spec`
    of allele counts with shape (variants, samples, alleles) and values
    corresponding to the number of non-missing occurrences of each allele.

    Examples
    --------

    >>> import sgkit as sg
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=2, seed=1)
    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
    samples    S0   S1
    variants
    0         1/0  1/0
    1         1/0  1/1
    2         0/1  1/0
    3         0/0  0/0

    >>> sg.count_call_alleles(ds)["call_allele_count"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[1, 1],
            [0, 2]],
    <BLANKLINE>
           [[1, 1],
            [1, 1]],
    <BLANKLINE>
           [[2, 0],
            [2, 0]]], dtype=uint8)
    """
    variables.validate(ds, {call_genotype: variables.call_genotype_spec})

    n_alleles = ds.dims["alleles"]
    genotypes = da.asarray(ds[call_genotype])

    # Variants/samples chunking is inherited; the last axis becomes alleles.
    out_chunks = (genotypes.chunks[0], genotypes.chunks[1], n_alleles)
    # Placeholder array of length n_alleles handed to the block function.
    allele_template = da.empty(n_alleles, dtype=np.uint8)

    counts = da.map_blocks(
        count_alleles,
        genotypes,
        allele_template,
        chunks=out_chunks,
        drop_axis=2,
        new_axis=2,
    )
    outputs = {
        variables.call_allele_count: (("variants", "samples", "alleles"), counts)
    }
    return conditional_merge_datasets(ds, create_dataset(outputs), merge)
def test_different_chunks(self):
    """Run ``_intensity_peaks_image`` when data and peaks are chunked differently.

    The data array uses (6, 4) chunks on its first two axes while the peak
    array uses (3, 2). The test only checks that the call completes.
    """
    data_array_dask = da.ones((6, 16, 100, 50), chunks=(6, 4, 50, 25))
    # The builtin ``object`` replaces the ``np.object`` alias, which was
    # removed from NumPy.
    peak_array_dask = da.empty((6, 16), chunks=(3, 2), dtype=object)
    dt._intensity_peaks_image(data_array_dask, peak_array_dask, 5)
def time_fancy(self):
    """Index a large array with a boolean row mask, then a sorted column list.

    The indexing expression is built and discarded; nothing is returned.
    """
    depth = 2
    arr = da.empty(
        shape=(2000000, 200, depth),
        dtype='i1',
        chunks=(10000, 100, depth),
    )
    row_mask = np.random.randint(0, 2, size=arr.shape[0], dtype=bool)
    picked_columns = np.random.choice(arr.shape[1], size=100, replace=False)
    column_index = sorted(picked_columns)

    selected_rows = arr[row_mask]
    selected_rows[:, column_index]
def statistic(stat, images, band, num_process, chunksize, feedback):
    """Compute a per-pixel statistic over a stack of images, block by block.

    A template array of shape ``Image.wrapper_shape`` is split into blocks.
    For every block the matching chunk is read from each image, the chunks
    are stacked along a third axis and reduced along that axis with the
    function selected by ``stat``.

    Parameters
    ----------
    stat
        Name of the statistic. Recognised values: 'median', 'mean', 'gmean',
        'max', 'min', 'std', 'valid_pixels', 'percentile_<NN>', 'last_pixel',
        'jday_last_pixel', 'jday_median', 'trim_mean_<lower>_<upper>' and
        'linear_trend'.
    images
        Objects providing ``get_chunk_in_wrapper(band, xc, xc_size, yc,
        yc_size)`` and, for the date based statistics, ``date`` and ``jday``.
    band
        Band identifier forwarded to ``get_chunk_in_wrapper``.
    num_process
        Number of workers for the threaded computation.
    chunksize
        Chunk specification for the template array.
    feedback
        Object providing ``isCanceled()``; also passed to ``ProgressBar``.

    Returns
    -------
    The computed result array.

    NOTE(review): ``stat_func`` is only defined when ``stat`` matches one of
    the branches below; an unrecognised name surfaces as a NameError inside
    ``calc`` -- confirm that callers validate ``stat`` beforehand.
    """
    # create a empty initial wrapper raster for managed dask parallel
    # in chunks and storage result
    wrapper_array = da.empty(Image.wrapper_shape, chunks=chunksize)
    # from here on chunksize is the length of the first chunk along axis 0;
    # calc() uses it to turn block ids into pixel offsets
    chunksize = wrapper_array.chunks[0][0]

    # call built in numpy statistical functions, with a specified axis. if
    # axis=2 means it will Compute along the 'depth' axis, per pixel.
    # with the return being n by m, the shape of each band.
    #
    # Compute the median
    if stat == 'median':
        def stat_func(stack_chunk, metadata):
            return np.nanmedian(stack_chunk, axis=2)

    # Compute the arithmetic mean
    if stat == 'mean':
        def stat_func(stack_chunk, metadata):
            return np.nanmean(stack_chunk, axis=2)

    # Compute the geometric mean
    if stat == 'gmean':
        def stat_func(stack_chunk, metadata):
            product = np.nanprod(stack_chunk, axis=2)
            # number of entries that are neither NaN nor zero
            count = np.count_nonzero(np.nan_to_num(stack_chunk), axis=2)
            gmean = np.array([p**(1.0 / c) for p, c in zip(product, count)])
            # a result of exactly 1 is reported as NaN
            gmean[gmean == 1] = np.nan
            return gmean

    # Compute the maximum value
    if stat == 'max':
        def stat_func(stack_chunk, metadata):
            return np.nanmax(stack_chunk, axis=2)

    # Compute the minimum value
    if stat == 'min':
        def stat_func(stack_chunk, metadata):
            return np.nanmin(stack_chunk, axis=2)

    # Compute the standard deviation
    if stat == 'std':
        def stat_func(stack_chunk, metadata):
            return np.nanstd(stack_chunk, axis=2)

    # Compute the valid pixels
    # this count the valid data (no nans) across the z-axis
    if stat == 'valid_pixels':
        def stat_func(stack_chunk, metadata):
            return stack_chunk.shape[2] - np.isnan(stack_chunk).sum(axis=2)

    # Compute the percentile NN
    if stat.startswith('percentile_'):
        # the percentile is the integer after the underscore
        p = int(stat.split('_')[1])

        def stat_func(stack_chunk, metadata):
            return np.nanpercentile(stack_chunk, p, axis=2)

    # Compute the last valid pixel
    if stat == 'last_pixel':
        def last_pixel(pixel_time_series, index_sort):
            if np.isnan(pixel_time_series).all():
                return np.nan
            # first non-NaN value in the order given by index_sort
            for index in index_sort:
                if not np.isnan(pixel_time_series[index]):
                    return pixel_time_series[index]

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])[::-1]  # from the most recent to the oldest
            return np.apply_along_axis(last_pixel, 2, stack_chunk, index_sort)

    # Compute the julian day of the last valid pixel
    if stat == 'jday_last_pixel':
        def jday_last_pixel(pixel_time_series, index_sort, jdays):
            if np.isnan(pixel_time_series).all():
                return 0  # better np.nan but there is bug with multiprocessing with return nan value here
            # julian day of the first non-NaN value in index_sort order
            for index in index_sort:
                if not np.isnan(pixel_time_series[index]):
                    return jdays[index]

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])[::-1]  # from the most recent to the oldest
            return np.apply_along_axis(jday_last_pixel, 2, stack_chunk,
                                       index_sort, metadata['jday'])

    # Compute the julian day of the median value
    if stat == 'jday_median':
        def jday_median(pixel_time_series, index_sort, jdays):
            if np.isnan(pixel_time_series).all():
                return 0  # better np.nan but there is bug with multiprocessing with return nan value here
            # julian days of the non-NaN observations, in date order
            jdays = [
                jdays[index] for index in index_sort
                if not np.isnan(pixel_time_series[index])
            ]
            return np.ceil(np.median(jdays))

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])  # from the oldest to most recent
            return np.apply_along_axis(jday_median, 2, stack_chunk,
                                       index_sort, metadata['jday'])

    # Compute the trimmed median with lower limit and upper limit
    if stat.startswith('trim_mean_'):
        # TODO: check this stats when the time series have few data
        # name format: trim_mean_<lower>_<upper>, both percentiles
        lower = int(stat.split('_')[2])
        upper = int(stat.split('_')[3])

        def trim_mean(pixel_time_series):
            if np.isnan(pixel_time_series).all():
                return 0  # better np.nan but there is bug with multiprocessing with return nan value here
            pts = pixel_time_series[~np.isnan(pixel_time_series)]
            # with at most two values, use the percentile midway between
            # the two limits instead of trimming
            if len(pts) <= 2:
                return np.percentile(pts, (lower + upper) / 2)
            return np.mean(pts[(pts >= np.percentile(pts, lower))
                               & (pts <= np.percentile(pts, upper))])

        def stat_func(stack_chunk, metadata):
            return np.apply_along_axis(trim_mean, 2, stack_chunk)

    # Compute the linear trend using least-squares method
    if stat == 'linear_trend':
        def linear_trend(pixel_time_series, index_sort, date_list):
            # no trend for an all-NaN series or a single valid value
            if np.isnan(pixel_time_series).all() or len(
                    pixel_time_series[~np.isnan(pixel_time_series)]) == 1:
                return np.nan
            # Unix timestamp in days
            x = [
                int(int(date_list[index].strftime("%s")) / 86400)
                for index in index_sort
            ]
            x = [i - x[0] for i in x]  # diff from minimum
            pts = np.array([pixel_time_series[index] for index in index_sort])
            # mask the NaN values so they are ignored by the covariance
            y = np.ma.array(pts, mask=np.isnan(pts))
            ssxm, ssxym, ssyxm, ssym = np.ma.cov(x, y, bias=1).flat
            slope = ssxym / ssxm
            # the slope is returned scaled by 1e6
            return slope * 1000000

        def stat_func(stack_chunk, metadata):
            index_sort = np.argsort(
                metadata['date'])  # from the oldest to most recent
            return np.apply_along_axis(linear_trend, 2, stack_chunk,
                                       index_sort, metadata['date'])

    # Compute the statistical for the respective chunk
    def calc(block, block_id=None, chunksize=None):
        # a cancelled run returns None for the block
        if feedback.isCanceled():
            return
        # pixel offset and extent of this block inside the wrapper
        yc = block_id[0] * chunksize
        yc_size = block.shape[0]
        xc = block_id[1] * chunksize
        xc_size = block.shape[1]

        # make stack reading all images only in specific chunk
        chunks_list = [
            image.get_chunk_in_wrapper(band, xc, xc_size, yc, yc_size)
            for image in images
        ]
        # delete empty chunks
        # mask_none keeps track of which images contributed a chunk so the
        # metadata arrays below can be filtered the same way
        mask_none = [False if x is None else True for x in chunks_list]
        chunks_list = np.array([i for i in chunks_list if i is not None])

        if not chunks_list.size:
            # all chunks are empty, return the chunk with nan
            return np.full((yc_size, xc_size), np.nan)

        # for some statistics that required filename as metadata
        metadata = {}
        if stat in [
                "last_pixel", "jday_last_pixel", "jday_median", "linear_trend"
        ]:
            metadata["date"] = np.array([image.date
                                         for image in images])[mask_none]
        if stat in ["jday_last_pixel", "jday_median"]:
            metadata["jday"] = np.array([image.jday
                                         for image in images])[mask_none]

        stack_chunk = np.stack(chunks_list, axis=2)
        return stat_func(stack_chunk, metadata)

    # process
    with ProgressBar(feedback=feedback):
        map_blocks = da.map_blocks(calc,
                                   wrapper_array,
                                   chunks=wrapper_array.chunks,
                                   chunksize=chunksize,
                                   dtype=float)
        result_array = map_blocks.compute(num_workers=num_process,
                                          scheduler="threads")

    return result_array
def geotiff_y_x():
    """Build a 2-D DataArray with dims ("y", "x").

    The shape is (Y_DIM_SIZE, X_DIM_SIZE).
    """
    shape = (Y_DIM_SIZE, X_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("y", "x"))
def setup(self):
    """Create a 1-D ``i1`` array of ``self.N`` elements, one per chunk."""
    self.N = 100000
    one_element_chunks = [1] * self.N
    self.a = da.empty(shape=(self.N, ), dtype='i1', chunks=one_element_chunks)
def geotiff_x_y():
    """Build a 2-D DataArray with dims ("x", "y"), i.e. transposed data.

    The shape is (X_DIM_SIZE, Y_DIM_SIZE).
    """
    # transposed data
    shape = (X_DIM_SIZE, Y_DIM_SIZE)
    values = da.empty(shape)
    return xr.DataArray(values, dims=("x", "y"))