示例#1
0
 def aggregate(self) -> MultimodalData:
     """ Aggregate all data together """
     data = MultimodalData()
     for key in list(self.aggr):
         unidata = self._aggregate_unidata(self.aggr.pop(key))
         data.add_data(unidata)
     return data
def load_mtx_file(path: str, genome: str = None, modality: str = None) -> MultimodalData:
    """Load gene-count matrix from Market Matrix files (10x v2, v3 and HCA DCP formats)

    Parameters
    ----------

    path : `str`
        Path to mtx files. The directory implied by path should either contain matrix, feature and barcode information, or folders containing these information.
    genome : `str`, optional (default: None)
        Genome name of the matrix. If None, genome will be inferred from path.
    modality: `str`, optional (default: None)
        Modality, choosing from 'rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr' or 'atac'. If None, use 'rna' as default.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pairs.

    Examples
    --------
    >>> io.load_mtx_file('example.mtx.gz', genome = 'mm10')
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} does not exist!")

    orig_file = path
    if os.path.isdir(orig_file):
        path = orig_file.rstrip('/')
        file_name = _locate_mtx_file(path)
    else:
        if (not orig_file.endswith(".mtx")) and (not orig_file.endswith(".mtx.gz")):
            raise ValueError(f"File {orig_file} does not end with suffix .mtx or .mtx.gz!")
        path, file_name = os.path.split(orig_file)

    data = MultimodalData()

    if modality is None:
        modality = "rna"

    if file_name is not None:
        if genome is None:
            genome = "unknown"
        data.add_data(
            load_one_mtx_file(
                path,
                file_name,
                genome,
                modality
            ),
        )
    else:
        for dir_entry in os.scandir(path):
            if dir_entry.is_dir():
                file_name = _locate_mtx_file(dir_entry.path)
                if file_name is None:
                    raise ValueError(f"Folder {dir_entry.path} does not contain a mtx file!")
                dgenome, dmodality = _parse_dir_name(dir_entry.name, modality)
                data.add_data(load_one_mtx_file(dir_entry.path, file_name, dgenome, dmodality))

    return data
    def read_multimodal_data(self, attach_zarrobj = False) -> MultimodalData:
        """ Read MultimodalData
        """
        data = MultimodalData()
        for key, group in self.root.groups():
            unidata = self.read_unimodal_data(group)
            data.add_data(unidata)

        if self.root.attrs.get('_selected', None) is not None:
            data.select_data(self.root.attrs['_selected'])

        if attach_zarrobj:
            data._zarrobj = self

        return data
示例#4
0
def load_10x_h5_file_v2(h5_in: h5py.Group) -> MultimodalData:
    """Load 10x v2 format matrix from hdf5 file

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v2 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v2(h5_in)
    """
    data = MultimodalData()
    for genome in h5_in.keys():
        group = h5_in[genome]

        M, N = group["shape"][...]
        mat = csr_matrix(
            (
                group["data"][...],
                group["indices"][...],
                group["indptr"][...],
            ),
            shape=(N, M),
        )

        barcodes = group["barcodes"][...].astype(str)
        ids = group["genes"][...].astype(str)
        names = group["gene_names"][...].astype(str)

        unidata = UnimodalData({"barcodekey": barcodes}, {
            "featurekey": names,
            "featureid": ids
        }, {"X": mat}, {
            "modality": "rna",
            "genome": genome
        })
        unidata.separate_channels()

        data.add_data(unidata)

    return data
示例#5
0
def load_10x_h5_file_v3(h5_in: h5py.Group) -> MultimodalData:
    """Load 10x v3 format matrix from hdf5 file, allowing detection of crispr and citeseq libraries

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v3 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v3(h5_in)
    """
    M, N = h5_in["matrix/shape"][...]
    bigmat = csr_matrix(
        (
            h5_in["matrix/data"][...],
            h5_in["matrix/indices"][...],
            h5_in["matrix/indptr"][...],
        ),
        shape=(N, M),
    )
    barcodes = h5_in["matrix/barcodes"][...].astype(str)
    df = pd.DataFrame(
        data={
            "genome": h5_in["matrix/features/genome"][...].astype(str),
            "feature_type": h5_in["matrix/features/feature_type"][...].astype(
                str),
            "id": h5_in["matrix/features/id"][...].astype(str),
            "name": h5_in["matrix/features/name"][...].astype(str)
        })

    genomes = list(df["genome"].unique())
    if "" in genomes:
        genomes.remove("")
    default_genome = genomes[0] if len(genomes) == 1 else None

    data = MultimodalData()
    gb = df.groupby(by=["genome", "feature_type"])
    for name, group in gb:
        barcode_metadata = {"barcodekey": barcodes}
        feature_metadata = {
            "featurekey": group["name"].values,
            "featureid": group["id"].values
        }
        mat = bigmat[:, gb.groups[name]]

        genome = name[0] if (name[0] != ""
                             or default_genome is None) else default_genome
        modality = "custom"
        if name[1] == "Gene Expression":
            modality = "rna"
        elif name[1] == "CRISPR Guide Capture":
            modality = "crispr"
        elif name[1] == "Antibody Capture":
            modality = "citeseq"

        if modality == "citeseq":
            unidata = CITESeqData(barcode_metadata, feature_metadata,
                                  {"raw.count": mat}, {
                                      "genome": genome,
                                      "modality": modality
                                  })
        else:
            unidata = UnimodalData(barcode_metadata, feature_metadata,
                                   {"X": mat}, {
                                       "genome": genome,
                                       "modality": modality
                                   })
        unidata.separate_channels()

        data.add_data(unidata)

    return data
def pseudobulk(
    data: MultimodalData,
    sample: str,
    attrs: Optional[Union[List[str], str]] = None,
    mat_key: Optional[str] = "counts",
    cluster: Optional[str] = None,
) -> UnimodalData:
    """Generate Pseudo-bulk count matrices.

    Parameters
    -----------
    data: ``MultimodalData`` or ``UnimodalData`` object
        Annotated data matrix with rows for cells and columns for genes.

    sample: ``str``
        Specify the cell attribute used for aggregating pseudo-bulk data.
        Key must exist in ``data.obs``.

    attrs: ``str`` or ``List[str]``, optional, default: ``None``
        Specify additional cell attributes to remain in the pseudo bulk data.
        If set, all attributes' keys must exist in ``data.obs``.
        Notice that for a categorical attribute, each pseudo-bulk's value is the one of highest frequency among its cells,
        and for a numeric attribute, each pseudo-bulk's value is the mean among its cells.

    mat_key: ``str``, optional, default: ``counts``
        Specify the single-cell count matrix used for aggregating pseudo-bulk counts:
        If specified, use the count matrix with key ``mat_key`` from matrices of ``data``; otherwise, default is ``counts``.

    cluster: ``str``, optional, default: ``None``
        If set, additionally generate pseudo-bulk matrices per cluster specified in ``data.obs[cluster]``.

    Returns
    -------
    A UnimodalData object ``udata`` containing pseudo-bulk information:
        * It has the following count matrices:

          * ``X``: The pseudo-bulk count matrix over all cells.
          * If ``cluster`` is set, a number of pseudo-bulk count matrices of cells belonging to the clusters, respectively.
        * ``udata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.
        * ``udata.var``: Gene names and Ensembl IDs are maintained.

    Update ``data``:
        * Add the returned UnimodalData object above to ``data`` with key ``<sample>-pseudobulk``, where ``<sample>`` is replaced by the actual value of ``sample`` argument.

    Examples
    --------
    >>> pg.pseudobulk(data, sample="Channel")
    """
    X = data.get_matrix(mat_key)

    assert sample in data.obs.columns, f"Sample key '{sample}' must exist in data.obs!"

    sample_vec = (data.obs[sample] if is_categorical_dtype(data.obs[sample])
                  else data.obs[sample].astype("category"))
    bulk_list = sample_vec.cat.categories

    df_barcode = data.obs.reset_index()

    mat_dict = {
        "counts": get_pseudobulk_count(X, df_barcode, sample, bulk_list)
    }

    # Generate pseudo-bulk attributes if specified
    bulk_attr_list = []

    if attrs is not None:
        if isinstance(attrs, str):
            attrs = [attrs]
        for attr in attrs:
            assert (attr in data.obs.columns
                    ), f"Cell attribute key '{attr}' must exist in data.obs!"

    for bulk in bulk_list:
        df_bulk = df_barcode.loc[df_barcode[sample] == bulk]
        if attrs is not None:
            bulk_attr = df_bulk[attrs].apply(set_bulk_value, axis=0)
            bulk_attr["barcodekey"] = bulk
        else:
            bulk_attr = pd.Series({"barcodekey": bulk})
        bulk_attr_list.append(bulk_attr)

    df_pseudobulk = pd.DataFrame(bulk_attr_list)

    df_feature = pd.DataFrame(index=data.var_names)
    if "featureid" in data.var.columns:
        df_feature["featureid"] = data.var["featureid"]

    if cluster is not None:
        assert (cluster in data.obs.columns
                ), f"Cluster key '{attr}' must exist in data.obs!"

        cluster_list = data.obs[cluster].astype("category").cat.categories
        for cls in cluster_list:
            mat_dict[f"{cluster}_{cls}.X"] = get_pseudobulk_count(
                X, df_barcode.loc[df_barcode[cluster] == cls], sample,
                bulk_list)

    udata = UnimodalData(
        barcode_metadata=df_pseudobulk,
        feature_metadata=df_feature,
        matrices=mat_dict,
        genome=sample,
        modality="pseudobulk",
        cur_matrix="counts",
    )

    data.add_data(udata)

    return udata