def create_counts_mat(
    assay,
    store: hierarchy,
    cross_map: np.ndarray,
    scalar_coeff: float,
    renormalization: bool,
) -> None:
    """
    Populate the count matrix in the Zarr store.

    Each block of the assay's raw count matrix is TF-IDF transformed, its
    columns (peaks) are summed into their mapped features according to
    ``cross_map``, and the resulting per-block matrix is written sparsely
    into ``store`` at the correct row offset.

    Args:
        assay: Scarf Assay object which contains the rawData attribute
            representing Dask array of count matrix
        store: Output Zarr Dataset
        cross_map: Mapping of indices. as obtained from get_feature_mappings
            function. Each entry is expected to be a list of peak (column)
            indices for that feature; empty entries mean "no mapping".
        scalar_coeff: An arbitrary scalar multiplier. Only used when
            renormalization is True.
        renormalization: Whether to rescale the sum of feature values for
            each cell to `scalar_coeff`

    Returns:
        None
    """
    from sparse import COO

    # Indices of features that map to at least one peak (non-empty lists
    # are truthy, so np.where keeps only mapped features).
    idx = np.where(cross_map)[0]
    # Repeat each feature index once per mapped peak so that feat_idx and
    # peak_idx align element-wise: (feature, peak) pairs.
    feat_idx = np.repeat(idx, list(map(len, cross_map[idx])))
    # Flatten the per-feature peak lists into one array.
    peak_idx = np.array(
        sum(list(cross_map[idx]), [])
    )  # There is no guarantee that these are in sorted order
    assert feat_idx.shape == peak_idx.shape
    # TF-IDF ingredients: per-cell feature totals ("documents"), total cell
    # count, and per-feature cell counts ("document frequency").
    n_term_per_doc = assay.cells.fetch_all(assay.name + "_nFeatures")
    n_docs = n_term_per_doc.shape[0]
    n_docs_per_term = assay.feats.fetch_all("nCells")
    # Running row (cell) offset of the current dask block within the store.
    s = 0
    for a in tqdmbar(assay.rawData.blocks, total=assay.rawData.numblocks[0]):
        a = controlled_compute(a, assay.nthreads)
        # Term frequency: normalize each cell's row by its feature total.
        tf = a / n_term_per_doc[s:s + a.shape[0]].reshape(-1, 1)
        # Smoothed inverse document frequency (log2, +1 in both places to
        # avoid division by zero and log of values below 1).
        idf = np.log2(1 + (n_docs / (n_docs_per_term + 1)))
        a = tf * idf
        # Gather the TF-IDF values of all mapped peaks, then sum peaks that
        # belong to the same feature via a groupby on the feature index.
        df = pd.DataFrame(a[:, peak_idx]).T
        df["fidx"] = feat_idx
        df = df.groupby("fidx").sum().T
        if renormalization:
            # Rescale each cell's row so its feature values sum to
            # scalar_coeff.
            df = (scalar_coeff * df) / df.sum(axis=1).values.reshape(-1, 1)
        assert df.shape[1] == idx.shape[0]
        # The groupby produced positional columns; map them back to the
        # global feature indices before writing.
        coord_renamer = dict(enumerate(df.columns))
        coo = COO(df.values)
        coo.coords[1] = np.array([coord_renamer[x] for x in coo.coords[1]])
        coo.shape = (coo.shape[0], store.shape[1])
        # Write only the nonzero entries, shifting rows by the block offset.
        store.set_coordinate_selection((s + coo.coords[0], coo.coords[1]), coo.data)
        s += a.shape[0]
def _get_size(self, zgrp: zarr_hierarchy, strict_mode: bool = False) -> int: """ Args: zgrp: strict_mode: Returns: """ sizes = [] for i in zgrp.keys(): sizes.append(zgrp[i].shape[0]) if len(sizes) > 0: if len(set(sizes)) != 1: raise ValueError( "ERROR: Metadata table is corrupted. Not all columns are of same length" ) return sizes[0] else: if strict_mode: raise ValueError("Attempted to get size of empty zarr group") else: return self.N
def mount_location(self, zgrp: zarr_hierarchy, identifier: str) -> None:
    """Attach an external zarr group as a named metadata location.

    Args:
        zgrp: Zarr group to mount.
        identifier: Unique name under which the group is registered.

    Raises:
        ValueError: If the identifier is already in use, the group's index
            size differs from the primary size, or any of its (renamed)
            column names clash with existing columns.

    Returns:
        None
    """
    # Guard 1: the identifier must be unused.
    if identifier in self.locations:
        raise ValueError(
            f"ERROR: a location with identifier '{identifier}' already mounted"
        )
    # Guard 2: the mounted table must index the same number of records.
    size = self._get_size(zgrp)
    if size != self.N:
        raise ValueError(
            f"ERROR: The index size of the mount location ({size}) is not same as primary ({self.N})"
        )
    # Guard 3: renamed columns must not collide with existing ones.
    existing_cols = self.columns
    renamed = (self._col_renamer(identifier, key) for key in zgrp.keys())
    clashes = [name for name in renamed if name in existing_cols]
    if clashes:
        conflict_names = ' '.join(clashes)
        raise ValueError(
            f"ERROR: These names in location conflict with existing names: {conflict_names}\n. "
            f"Please try with a different identifier value.")
    self.locations[identifier] = zgrp
def create_zarr_dataset(g: zarr.hierarchy, name: str, chunks: tuple,
                        dtype: Any, shape: Tuple, overwrite: bool = True) -> zarr.hierarchy:
    """Create a Blosc(LZ4)-compressed dataset inside a zarr group.

    Args:
        g: Parent zarr group.
        name: Name of the dataset to create.
        chunks: Chunk shape for the dataset.
        dtype: Element dtype of the dataset.
        shape: Full shape of the dataset.
        overwrite: Whether to replace an existing dataset of the same name.

    Returns:
        The newly created zarr dataset.
    """
    from numcodecs import Blosc

    # LZ4 with bit-shuffle: fast compression suited to numeric matrices.
    lz4_codec = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(
        name,
        chunks=chunks,
        dtype=dtype,
        shape=shape,
        compressor=lz4_codec,
        overwrite=overwrite,
    )
def create_zarr_obj_array(g: zarr.hierarchy, name: str, data,
                          dtype: Union[str, Any] = None,
                          overwrite: bool = True) -> zarr.hierarchy:
    """Save a 1D object/string array as a fixed-width dataset in a zarr group.

    Args:
        g: Parent zarr group.
        name: Name of the dataset to create.
        data: 1D sequence of values to store.
        dtype: Target dtype. When None or ``object``, a fixed-width unicode
            dtype wide enough for the longest element is inferred.
        overwrite: Whether to replace an existing dataset of the same name.

    Returns:
        The newly created zarr dataset.
    """
    data = np.array(data)
    if dtype is None or dtype == object:
        # Infer a unicode dtype wide enough for the longest element.
        # `default=1` guards against empty input, which previously raised
        # ValueError from max() over an empty sequence.
        dtype = 'U' + str(max((len(str(x)) for x in data), default=1))
    if np.issubdtype(data.dtype, np.dtype('S')):
        # Bytes arrays are stored as unicode for portability.
        data = data.astype('U')
    return g.create_dataset(name,
                            data=data,
                            chunks=(100000, ),
                            shape=len(data),
                            dtype=dtype,
                            overwrite=overwrite)
def create_zarr_count_assay(z: zarr.hierarchy, assay_name: str,
                            chunk_size: Tuple[int, int], n_cells: int,
                            feat_ids: Union[np.ndarray, List[str]],
                            feat_names: Union[np.ndarray, List[str]],
                            dtype: str = 'uint32') -> zarr.hierarchy:
    """Set up a fresh assay group with feature metadata and a counts matrix.

    Args:
        z: Root zarr group in which to create the assay.
        assay_name: Name of the assay group (replaced if present).
        chunk_size: Chunk shape for the counts dataset.
        n_cells: Number of rows (cells) in the counts matrix.
        feat_ids: Feature identifiers, one per column.
        feat_names: Feature display names, one per column.
        dtype: Element dtype of the counts matrix.

    Returns:
        The newly created 'counts' zarr dataset.
    """
    assay_grp = z.create_group(assay_name, overwrite=True)
    assay_grp.attrs['is_assay'] = True
    assay_grp.attrs['misc'] = {}
    n_feats = len(feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/ids', feat_ids)
    create_zarr_obj_array(assay_grp, 'featureData/names', feat_names)
    # All features start out marked as valid (I == True).
    create_zarr_obj_array(assay_grp, 'featureData/I', [True] * n_feats, 'bool')
    return create_zarr_dataset(assay_grp, 'counts', chunk_size, dtype,
                               (n_cells, n_feats), overwrite=True)