示例#1
0
def create_counts_mat(
    assay,
    store: hierarchy,
    cross_map: np.ndarray,
    scalar_coeff: float,
    renormalization: bool,
) -> None:
    """
    Populate the count matrix in the Zarr store.

    Each chunk of the raw count matrix is TF-IDF transformed, peak columns
    are collapsed (summed) into their mapped features according to
    `cross_map`, and the resulting sparse values are written into `store`.

    Args:
        assay: Scarf Assay object which contains the rawData attribute representing Dask array of count matrix
        store: Output Zarr Dataset
        cross_map: Mapping of indices. as obtained from get_feature_mappings function
        scalar_coeff: An arbitrary scalar multiplier. Only used when renormalization is True.
        renormalization: Whether to rescale the sum of feature values for each cell to `scalar_coeff`

    Returns:
        None

    """

    from itertools import chain

    from sparse import COO

    idx = np.where(cross_map)[0]
    # One entry per (feature, peak) pair: feature index repeated once for
    # each of its mapped peaks.
    feat_idx = np.repeat(idx, [len(cross_map[i]) for i in idx])
    # chain.from_iterable flattens in linear time (sum(list, []) was
    # quadratic). There is no guarantee that these are in sorted order.
    peak_idx = np.array(list(chain.from_iterable(cross_map[idx])))
    assert feat_idx.shape == peak_idx.shape

    n_term_per_doc = assay.cells.fetch_all(assay.name + "_nFeatures")
    n_docs = n_term_per_doc.shape[0]
    n_docs_per_term = assay.feats.fetch_all("nCells")
    # IDF depends only on the document/term totals, so it is hoisted out of
    # the per-chunk loop (it was previously recomputed identically on every
    # iteration).
    idf = np.log2(1 + (n_docs / (n_docs_per_term + 1)))

    s = 0
    for a in tqdmbar(assay.rawData.blocks, total=assay.rawData.numblocks[0]):

        a = controlled_compute(a, assay.nthreads)
        # Term frequency: each cell's counts normalized by its feature count.
        tf = a / n_term_per_doc[s:s + a.shape[0]].reshape(-1, 1)
        a = tf * idf

        # Collapse peak columns into their target features by summation.
        df = pd.DataFrame(a[:, peak_idx]).T
        df["fidx"] = feat_idx
        df = df.groupby("fidx").sum().T
        if renormalization:
            df = (scalar_coeff * df) / df.sum(axis=1).values.reshape(-1, 1)
        assert df.shape[1] == idx.shape[0]

        # groupby compacted the columns; map local column positions back to
        # global feature indices before writing sparse coordinates.
        coord_renamer = dict(enumerate(df.columns))
        coo = COO(df.values)
        coo.coords[1] = np.array([coord_renamer[x] for x in coo.coords[1]])
        coo.shape = (coo.shape[0], store.shape[1])

        store.set_coordinate_selection((s + coo.coords[0], coo.coords[1]),
                                       coo.data)
        s += a.shape[0]
示例#2
0
    def _get_size(self,
                  zgrp: zarr_hierarchy,
                  strict_mode: bool = False) -> int:
        """
        Return the common row count of all columns stored under `zgrp`.

        Args:
            zgrp: Zarr group whose member arrays are inspected.
            strict_mode: If True, an empty group is an error instead of
                falling back to the primary size `self.N`.

        Returns:
            Number of rows shared by every column in the group.
        """
        column_lengths = {zgrp[key].shape[0] for key in zgrp.keys()}
        if not column_lengths:
            # Empty group: either a hard error (strict) or defer to the
            # primary table size.
            if strict_mode:
                raise ValueError("Attempted to get size of empty zarr group")
            return self.N
        if len(column_lengths) > 1:
            raise ValueError(
                "ERROR: Metadata table is corrupted. Not all columns are of same length"
            )
        return column_lengths.pop()
示例#3
0
    def mount_location(self, zgrp: zarr_hierarchy, identifier: str) -> None:
        """
        Attach an external Zarr group as an extra metadata location.

        Args:
            zgrp: Zarr group to mount.
            identifier: Unique label under which the group is registered.

        Returns:
            None
        """
        if identifier in self.locations:
            raise ValueError(
                f"ERROR: a location with identifier '{identifier}' already mounted"
            )
        size = self._get_size(zgrp)
        if size != self.N:
            raise ValueError(
                f"ERROR: The index size of the mount location ({size}) is not same as primary ({self.N})"
            )
        renamed = [self._col_renamer(identifier, key) for key in zgrp.keys()]
        existing = set(self.columns)
        clashes = [name for name in renamed if name in existing]
        if clashes:
            conflict_names = ' '.join(clashes)
            raise ValueError(
                f"ERROR: These names in location conflict with existing names: {conflict_names}\n. "
                f"Please try with a different identifier value.")
        self.locations[identifier] = zgrp
示例#4
0
def create_zarr_dataset(g: zarr.hierarchy,
                        name: str,
                        chunks: tuple,
                        dtype: Any,
                        shape: Tuple,
                        overwrite: bool = True) -> zarr.hierarchy:
    """
    Create a Blosc/LZ4-compressed dataset under the given Zarr group.

    Args:
        g: Parent Zarr group.
        name: Name of the new dataset.
        chunks: Chunk shape for the dataset.
        dtype: Element dtype.
        shape: Full shape of the dataset.
        overwrite: Whether to replace an existing dataset of the same name.

    Returns:
        The newly created Zarr dataset.
    """
    from numcodecs import Blosc

    lz4_compressor = Blosc(cname='lz4', clevel=5, shuffle=Blosc.BITSHUFFLE)
    return g.create_dataset(name,
                            shape=shape,
                            chunks=chunks,
                            dtype=dtype,
                            compressor=lz4_compressor,
                            overwrite=overwrite)
示例#5
0
def create_zarr_obj_array(g: zarr.hierarchy,
                          name: str,
                          data,
                          dtype: Union[str, Any] = None,
                          overwrite: bool = True) -> zarr.hierarchy:
    """
    Save an object (string-like) array as a fixed-width unicode Zarr dataset.

    Args:
        g: Parent Zarr group.
        name: Name of the new dataset.
        data: Sequence of values; converted with `np.array` before saving.
        dtype: Target dtype. When None or `object`, a unicode dtype wide
            enough for the longest stringified element is chosen.
        overwrite: Whether to replace an existing dataset of the same name.

    Returns:
        The newly created Zarr dataset.
    """
    data = np.array(data)
    if dtype is None or dtype == object:
        # Width of the longest element; default=1 keeps empty input from
        # raising ValueError in max().
        dtype = 'U' + str(max((len(str(x)) for x in data), default=1))
    if np.issubdtype(data.dtype, np.dtype('S')):
        # Bytes arrays must be decoded to unicode before storing.
        data = data.astype('U')
    return g.create_dataset(name,
                            data=data,
                            chunks=(100000, ),
                            shape=len(data),
                            dtype=dtype,
                            overwrite=overwrite)
示例#6
0
def create_zarr_count_assay(z: zarr.hierarchy,
                            assay_name: str,
                            chunk_size: Tuple[int, int],
                            n_cells: int,
                            feat_ids: Union[np.ndarray, List[str]],
                            feat_names: Union[np.ndarray, List[str]],
                            dtype: str = 'uint32') -> zarr.hierarchy:
    """
    Set up a new assay group with feature metadata and an empty count matrix.

    Args:
        z: Parent Zarr group.
        assay_name: Name of the assay group to create.
        chunk_size: Chunk shape (cells, features) for the counts dataset.
        n_cells: Number of cells (rows) in the count matrix.
        feat_ids: Identifiers of the features (columns).
        feat_names: Display names of the features.
        dtype: Element dtype of the counts dataset.

    Returns:
        The newly created 'counts' Zarr dataset.
    """
    n_feats = len(feat_ids)
    grp = z.create_group(assay_name, overwrite=True)
    grp.attrs['is_assay'] = True
    grp.attrs['misc'] = {}
    create_zarr_obj_array(grp, 'featureData/ids', feat_ids)
    create_zarr_obj_array(grp, 'featureData/names', feat_names)
    # All features start as "valid" (boolean I column all True).
    create_zarr_obj_array(grp, 'featureData/I', [True] * n_feats, 'bool')
    return create_zarr_dataset(grp,
                               'counts',
                               chunk_size,
                               dtype, (n_cells, n_feats),
                               overwrite=True)