Example #1
def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate a file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors used to estimate each cell's local density for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight each cell's down-sampling probability to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        Number of ForceAtlas2 iterations used to polish the coordinates predicted by the deep regressor.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_diffmap_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    # Down sample cells, weighting each cell by the local density estimated from its kNN distances
    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(X,
                                                     K=K,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state,
                                                     full_speed=full_speed)
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    # Run ForceAtlas2 on the down-sampled kNN graph to lay out the selected cells
    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    # Train the deep regressor on the selected cells, predict coordinates for the remaining
    # cells, and use the combined coordinates to initialize the final polishing run
    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_fle,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
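
The helper net_train_and_predict used above is not shown in this example. As a rough, hedged illustration only, a step of that kind could be written with scikit-learn's MLPRegressor (the model named in the docstring) roughly as follows; the function name, hidden-layer sizes, and other hyperparameters here are illustrative assumptions, not Pegasus' actual implementation.

from sklearn.neural_network import MLPRegressor


def net_train_and_predict_sketch(X_train, Y_train, X_pred, alpha, random_state, verbose=False):
    # Fit an MLP that maps the high-dimensional representation (e.g. diffusion map
    # coordinates) of the down-sampled cells to their ForceAtlas2 layout coordinates.
    regressor = MLPRegressor(
        hidden_layer_sizes=(100, 70, 50, 25),  # illustrative architecture, an assumption
        alpha=alpha,  # L2 penalty, corresponding to net_alpha above
        random_state=random_state,
        verbose=verbose,
    )
    regressor.fit(X_train, Y_train)
    # Predict layout coordinates for the cells that were not selected during down sampling.
    return regressor.predict(X_pred)
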
Example #2
def fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    out_basis: str = "fle",
) -> None:
    """Construct the Force-directed (FLE) graph.

    This implementation uses the forceatlas2-python_ package, which is a Python wrapper of ForceAtlas2_.

    See [Jacomy14]_ for details on FLE.

    .. _forceatlas2-python: https://github.com/klarman-cell-observatory/forceatlas2-python
    .. _ForceAtlas2: https://github.com/klarman-cell-observatory/forceatlas2

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate a file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FLE coordinates of the data.

    Examples
    --------
    >>> pg.fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )
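
For context, a typical interactive Pegasus session computes the diffusion map before calling fle, since fle defaults to rep="diffmap". The sketch below is a hedged outline of such a session using the public pg.* counterparts of the tools.* calls seen in the pipeline examples that follow; the input file name is hypothetical and all parameters are left at their defaults.

import pegasus as pg

data = pg.read_input("example.h5ad")  # hypothetical input file
pg.log_norm(data)
pg.highly_variable_features(data)
pg.pca(data)
pg.neighbors(data)
pg.diffmap(data)  # provides the "diffmap" representation that fle uses by default
pg.fle(data)      # writes data.obsm["X_fle"]
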
Example #3
def run_pipeline(input_file, output_name, **kwargs):
    is_raw = not kwargs["processed"]

    if "seurat_compatible" not in kwargs:
        kwargs["seurat_compatible"] = False

    # load input data
    adata = io.read_input(
        input_file,
        genome=kwargs["genome"],
        concat_matrices=False if kwargs["cite_seq"] else True,
        h5ad_mode=("a" if (is_raw or kwargs["subcluster"]) else "r+"),
        select_singlets=kwargs["select_singlets"],
        channel_attr=kwargs["channel_attr"],
        black_list=(
            kwargs["black_list"].split(",") if kwargs["black_list"] is not None else []
        ),
    )

    if not kwargs["cite_seq"]:
        if is_raw:
            values = adata.X.getnnz(axis=1)
            if values.min() == 0:  # 10x raw data
                adata._inplace_subset_obs(values >= kwargs["min_genes_on_raw"])
    else:
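        # CITE-Seq input: io.read_input returned a list of two matrices. Identify
        # which one holds the antibody (ADT) counts (genome name starting with
        # "CITE_Seq") and which one holds the RNA counts.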
        data_list = adata
        assert len(data_list) == 2
        adata = cdata = None
        for i in range(len(data_list)):
            if data_list[i].uns["genome"].startswith("CITE_Seq"):
                cdata = data_list[i]
            else:
                adata = data_list[i]
        assert adata is not None and cdata is not None
    print("Inputs are loaded.")

    if kwargs["seurat_compatible"]:
        assert is_raw and kwargs["select_hvf"]

    if kwargs["subcluster"]:
        adata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
        is_raw = True  # get submat and then set is_raw to True

    if is_raw:
        if not kwargs["subcluster"]:
            # filter out low quality cells/genes
            tools.run_filter_data(
                adata,
                output_filt=kwargs["output_filt"],
                plot_filt=kwargs["plot_filt"],
                plot_filt_figsize=kwargs["plot_filt_figsize"],
                mito_prefix=kwargs["mito_prefix"],
                min_genes=kwargs["min_genes"],
                max_genes=kwargs["max_genes"],
                min_umis=kwargs["min_umis"],
                max_umis=kwargs["max_umis"],
                percent_mito=kwargs["percent_mito"],
                percent_cells=kwargs["percent_cells"],
            )

            if kwargs["seurat_compatible"]:
                raw_data = adata.copy()  # raw as count

            # normalize counts and then transform to log space
            tools.log_norm(adata, kwargs["norm_count"])

            # set group attribute
            if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
                tools.set_group_attribute(adata, kwargs["group_attribute"])

        # select highly variable features
        if kwargs["select_hvf"]:
            tools.highly_variable_features(
                adata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "pegasus":
                if kwargs["plot_hvf"] is not None:
                    from pegasus.plotting import plot_hvf

                    robust_idx = adata.var["robust"].values
                    plot_hvf(
                        adata.var.loc[robust_idx, "mean"],
                        adata.var.loc[robust_idx, "var"],
                        adata.var.loc[robust_idx, "hvf_loess"],
                        adata.var.loc[robust_idx, "highly_variable_features"],
                        kwargs["plot_hvf"] + ".hvf.pdf",
                    )

        # batch correction
        if kwargs["batch_correction"]:
            tools.correct_batch(adata, features="highly_variable_features")

        # PCA
        tools.pca(
            adata,
            n_components=kwargs["nPC"],
            features="highly_variable_features",
            random_state=kwargs["random_state"],
        )

        # Find K neighbors
        tools.neighbors(
            adata,
            K=kwargs["K"],
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

        # calculate diffmap
        if (
            kwargs["fle"]
            or kwargs["net_fle"]
        ):
            if not kwargs["diffmap"]:
                print("Turn on --diffmap option!")
            kwargs["diffmap"] = True

        if kwargs["diffmap"]:
            tools.diffmap(
                adata,
                n_components=kwargs["diffmap_ndc"],
                rep="pca",
                solver=kwargs["diffmap_solver"],
                random_state=kwargs["random_state"],
                max_t=kwargs["diffmap_maxt"],
            )
            if kwargs["diffmap_to_3d"]:
                tools.reduce_diffmap_to_3d(adata, random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            adata,
            kwargs["kBET_batch"],
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
        )
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format(
                stat_mean, pvalue_mean, accept_rate
            )
        )

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            adata,
            algo="spectral_louvain",
            rep="pca",
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            adata,
            algo="spectral_leiden",
            rep="pca",
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            adata,
            algo="louvain",
            rep="pca",
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            adata,
            algo="leiden",
            rep="pca",
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            adata,
            rep="pca",
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(adata, kwargs["pseudotime"])

    # merge cite-seq data and run t-SNE
    if kwargs["cite_seq"]:
        adt_matrix = np.zeros((adata.shape[0], cdata.shape[1]), dtype="float32")
        idx = adata.obs_names.isin(cdata.obs_names)
        adt_matrix[idx, :] = cdata[adata.obs_names[idx],].X.toarray()
        if abs(100.0 - kwargs["cite_seq_capping"]) > 1e-4:
            cite_seq.capping(adt_matrix, kwargs["cite_seq_capping"])

        var_names = np.concatenate(
            [adata.var_names, ["AD-" + x for x in cdata.var_names]]
        )

        new_data = anndata.AnnData(
            X=hstack([adata.X, csr_matrix(adt_matrix)], format="csr"),
            obs=adata.obs,
            obsm=adata.obsm,
            uns=adata.uns,
            var={
                "var_names": var_names,
                "gene_ids": var_names,
                "n_cells": np.concatenate(
                    [adata.var["n_cells"].values, [0] * cdata.shape[1]]
                ),
                "percent_cells": np.concatenate(
                    [adata.var["percent_cells"].values, [0.0] * cdata.shape[1]]
                ),
                "robust": np.concatenate(
                    [adata.var["robust"].values, [False] * cdata.shape[1]]
                ),
                "highly_variable_features": np.concatenate(
                    [
                        adata.var["highly_variable_features"].values,
                        [False] * cdata.shape[1],
                    ]
                ),
            },
        )
        new_data.obsm["X_CITE-Seq"] = adt_matrix
        adata = new_data
        print("ADT count matrix is attached.")

        tools.fitsne(
            adata,
            rep="CITE-Seq",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            out_basis="citeseq_fitsne",
        )
        print("Antibody embedding is done.")

    if kwargs["seurat_compatible"]:
        seurat_data = adata.copy()
        seurat_data.raw = raw_data
        seurat_data.uns["scale.data"] = adata.uns["fmat_highly_variable_features"] # assign by reference
        seurat_data.uns["scale.data.rownames"] = adata.var_names[
            adata.var["highly_variable_features"]
        ].values
        io.write_output(seurat_data, output_name + ".seurat.h5ad")

    # write out results
    io.write_output(adata, output_name + ".h5ad")

    if kwargs["output_loom"]:
        io.write_output(adata, output_name + ".loom")

    print("Results are written.")
Example #4
def split_one_cluster(
    data: MultimodalData,
    clust_label: str,
    clust_id: str,
    n_clust: int,
    res_label: str,
    rep: str = "pca",
    random_state: int = 0,
) -> None:
    """
    Use the Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' clusters and write the new clustering results to 'res_label'. Assume clusters in 'clust_label' are named by numbers (in str format).

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    clust_label: `str`
        Use existing clustering stored in data.obs['clust_label'].

    clust_id: `str`
        Cluster ID in data.obs['clust_label'].

    n_clust: `int`
        Split 'clust_id' into 'n_clust' subclusters.

    res_label: `str`
        Write new clustering in data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for Kmeans clustering. Keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[res_label]``: New cluster labels of cells as categorical data.

    Examples
    --------
    >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
    """
    idx = np.where(data.obs[clust_label] == clust_id)[0]
    tmpdat = data[idx].copy()
    from pegasus.tools import neighbors
    neighbors(tmpdat, rep=rep, use_cache=False)
    leiden(tmpdat,
           rep=rep,
           resolution=None,
           n_clust=n_clust,
           random_state=random_state)
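    # Keep the original clust_id for the largest subcluster (value_counts index 0);
    # give the remaining subclusters fresh labels numbered after the existing clusters.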
    new_clust = data.obs[clust_label].values.astype(int)
    new_label = new_clust.max() + 1
    for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
        new_clust[idx[(
            tmpdat.obs['leiden_labels'] == label).values]] = new_label
        new_label += 1
    data.obs[res_label] = pd.Categorical(values=new_clust.astype(str),
                                         categories=np.array(
                                             range(1, new_label)).astype(str))
    data.register_attr(res_label, "cluster")
    del tmpdat
Example #5
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
                         append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])

        # select highly variable features
        standardize = False  # stays False if HVF selection is skipped
        if kwargs["select_hvf"]:
            if unidata.shape[1] <= kwargs["hvf_ngenes"]:
                logger.warning(
                    f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
                )
            else:
                standardize = True
                tools.highly_variable_features(
                    unidata,
                    kwargs["batch_attr"]
                    if kwargs["batch_correction"] else None,
                    flavor=kwargs["hvf_flavor"],
                    n_top=kwargs["hvf_ngenes"],
                    n_jobs=kwargs["n_jobs"],
                )
                if kwargs["hvf_flavor"] == "pegasus":
                    if kwargs["plot_hvf"] is not None:
                        from pegasus.plotting import hvfplot
                        fig = hvfplot(unidata, return_fig=True)
                        fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

        n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
        if n_pc < kwargs["pca_n"]:
            logger.warning(
                f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}."
            )

        # Run PCA irrespective of which batch correction method is applied
        tools.pca(
            unidata,
            n_components=n_pc,
            features="highly_variable_features",
            standardize=standardize,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
        )
        dim_key = "pca"

        if kwargs["nmf"] or (kwargs["batch_correction"]
                             and kwargs["correction_method"] == "inmf"):
            n_nmf = min(kwargs["nmf_n"], unidata.shape[0], unidata.shape[1])
            if n_nmf < kwargs["nmf_n"]:
                logger.warning(
                    f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of NMF components {kwargs['nmf_n']}. Reduce the number of NMF components to {n_nmf}."
                )

        if kwargs["nmf"]:
            if kwargs["batch_correction"] and kwargs[
                    "correction_method"] == "inmf":
                logger.warning(
                    "NMF is skipped because integrative NMF is run instead.")
            else:
                tools.nmf(
                    unidata,
                    n_components=n_nmf,
                    features="highly_variable_features",
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"],
                )

        if kwargs["batch_correction"]:
            if kwargs["correction_method"] == "harmony":
                dim_key = tools.run_harmony(
                    unidata,
                    batch=kwargs["batch_attr"],
                    rep="pca",
                    n_jobs=kwargs["n_jobs"],
                    n_clusters=kwargs["harmony_nclusters"],
                    random_state=kwargs["random_state"])
            elif kwargs["correction_method"] == "inmf":
                dim_key = tools.integrative_nmf(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_nmf,
                    features="highly_variable_features",
                    lam=kwargs["inmf_lambda"],
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"])
            elif kwargs["correction_method"] == "scanorama":
                dim_key = tools.run_scanorama(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_pc,
                    features="highly_variable_features",
                    standardize=standardize,
                    random_state=kwargs["random_state"])
            else:
                raise ValueError(
                    f"Unknown batch correction method {kwargs['correction_method']}!"
                )

        # Find K neighbors
        tools.neighbors(
            unidata,
            K=kwargs["K"],
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

    if kwargs["calc_sigscore"] is not None:
        sig_files = kwargs["calc_sigscore"].split(",")
        for sig_file in sig_files:
            tools.calc_signature_score(unidata, sig_file)

    # calculate diffmap
    if (kwargs["fle"] or kwargs["net_fle"]):
        if not kwargs["diffmap"]:
            print("Turn on --diffmap option!")
        kwargs["diffmap"] = True

    if kwargs["diffmap"]:
        tools.diffmap(
            unidata,
            n_components=kwargs["diffmap_ndc"],
            rep=dim_key,
            solver=kwargs["diffmap_solver"],
            max_t=kwargs["diffmap_maxt"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
        )

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            unidata,
            kwargs["kBET_batch"],
            rep=dim_key,
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"])
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}."
            .format(stat_mean, pvalue_mean, accept_rate))

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            unidata,
            algo="spectral_louvain",
            rep=dim_key,
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            unidata,
            algo="spectral_leiden",
            rep=dim_key,
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            unidata,
            algo="louvain",
            rep=dim_key,
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            unidata,
            algo="leiden",
            rep=dim_key,
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_umap"]:
        tools.net_umap(
            unidata,
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            unidata,
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            initialization=kwargs["tsne_init"],
        )

    if kwargs["umap"]:
        tools.umap(
            unidata,
            rep=dim_key,
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            n_jobs=kwargs["n_jobs"],
            full_speed=kwargs["full_speed"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    if kwargs["infer_doublets"]:
        channel_attr = "Channel"
        if (channel_attr not in unidata.obs) or (
                unidata.obs["Channel"].cat.categories.size == 1):
            channel_attr = None
        clust_attr = kwargs["dbl_cluster_attr"]
        if (clust_attr is None) or (clust_attr not in unidata.obs):
            clust_attr = None
            for value in [
                    "leiden_labels", "louvain_labels",
                    "spectral_leiden_labels", "spectral_louvain_labels"
            ]:
                if value in unidata.obs:
                    clust_attr = value
                    break

        if channel_attr is not None:
            logger.info(f"For doublet inference, channel_attr={channel_attr}.")
        if clust_attr is not None:
            logger.info(f"For doublet inference, clust_attr={clust_attr}.")

        tools.infer_doublets(
            unidata,
            channel_attr=channel_attr,
            clust_attr=clust_attr,
            expected_doublet_rate=kwargs["expected_doublet_rate"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            plot_hist=output_name)

        dbl_clusts = None
        if clust_attr is not None:
            clusts = []
            for idx, row in unidata.uns["pred_dbl_cluster"].iterrows():
                if row["percentage"] >= 50.0:
                    logger.info(
                        f"Cluster {row['cluster']} (percentage={row['percentage']:.2f}%, q-value={row['qval']:.6g}) is identified as a doublet cluster."
                    )
                    clusts.append(row["cluster"])
            if len(clusts) > 0:
                dbl_clusts = f"{clust_attr}:{','.join(clusts)}"

        tools.mark_doublets(unidata, dbl_clusts=dbl_clusts)

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)),
                       shape=(unidata.shape[0], append_data.shape[1])).tocsr()

        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            if kwargs["citeseq"]:
                append_df = append_df.copy()
                append_df.index = append_df.index.map(lambda x: f"Ab-{x}")

            rawX = hstack([unidata.get_matrix("counts"), Z], format="csr")

            Zt = Z.astype(np.float32)
            if not kwargs["citeseq"]:
                Zt.data *= np.repeat(unidata.obs["scale"].values,
                                     np.diff(Zt.indptr))
                Zt.data = np.log1p(Zt.data)
            else:
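                # CITE-Seq antibody counts: apply an arcsinh transform (cofactor 5)
                # instead of the per-cell scaling + log1p used for RNA counts above.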
                Zt.data = np.arcsinh(Zt.data / 5.0, dtype=np.float32)

            X = hstack([unidata.get_matrix(unidata.current_matrix()), Zt],
                       format="csr")

            new_genome = unidata.get_genome()
            if new_genome != append_data.get_genome():
                new_genome = f"{new_genome}_and_{append_data.get_genome()}"

            feature_metadata = pd.concat([unidata.feature_metadata, append_df],
                                         axis=0)
            feature_metadata.reset_index(inplace=True)
            _fillna(feature_metadata)
            unidata = UnimodalData(
                unidata.barcode_metadata, feature_metadata, {
                    unidata.current_matrix(): X,
                    "counts": rawX
                }, unidata.uns.mapping, unidata.obsm.mapping,
                unidata.varm.mapping
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

            if kwargs["citeseq"] and kwargs["citeseq_umap"]:
                umap_index = append_df.index.difference(
                    [f"Ab-{x}" for x in kwargs["citeseq_umap_exclude"]])
                unidata.obsm["X_citeseq"] = unidata.X[:,
                                                      unidata.var_names.
                                                      isin(umap_index
                                                           )].toarray()
                tools.umap(
                    unidata,
                    rep="citeseq",
                    n_neighbors=kwargs["umap_K"],
                    min_dist=kwargs["umap_min_dist"],
                    spread=kwargs["umap_spread"],
                    n_jobs=kwargs["n_jobs"],
                    full_speed=kwargs["full_speed"],
                    random_state=kwargs["random_state"],
                    out_basis="citeseq_umap",
                )

    if kwargs["output_h5ad"]:
        import time
        start_time = time.perf_counter()
        adata = unidata.to_anndata()
        if "_tmp_fmat_highly_variable_features" in adata.uns:
            adata.uns["scale.data"] = adata.uns.pop(
                "_tmp_fmat_highly_variable_features")  # assign by reference
            adata.uns["scale.data.rownames"] = unidata.var_names[
                unidata.var["highly_variable_features"] == True].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata
        end_time = time.perf_counter()
        logger.info(
            f"H5AD file {output_name}.h5ad is written. Time spent = {end_time - start_time:.2f}s."
        )

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data was provided
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate objects starting with _tmp from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)
Example #6
File: pipeline.py    Project: slowkow/pegasus
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool,
                         append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")
    if kwargs["channel_attr"] is not None:
        unidata.obs["Channel"] = unidata.obs[kwargs["channel_attr"]]

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])
        # set group attribute
        if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
            tools.set_group_attribute(unidata, kwargs["group_attribute"])

    # select highly variable features
    standardize = False  # stays False if HVF selection is skipped
    if kwargs["select_hvf"]:
        if unidata.shape[1] <= kwargs["hvf_ngenes"]:
            logger.warning(
                f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
            )
        else:
            standardize = True
            tools.highly_variable_features(
                unidata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "pegasus":
                if kwargs["plot_hvf"] is not None:
                    from pegasus.plotting import hvfplot
                    fig = hvfplot(unidata, return_fig=True)
                    fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

    # batch correction: L/S
    if kwargs["batch_correction"] and kwargs["correction_method"] == "L/S":
        tools.correct_batch(unidata, features="highly_variable_features")

    if kwargs["calc_sigscore"] is not None:
        sig_files = kwargs["calc_sigscore"].split(",")
        for sig_file in sig_files:
            tools.calc_signature_score(unidata, sig_file)

    n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
    if n_pc < kwargs["pca_n"]:
        logger.warning(
            f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}."
        )

    if kwargs["batch_correction"] and kwargs[
            "correction_method"] == "scanorama":
        pca_key = tools.run_scanorama(unidata,
                                      n_components=n_pc,
                                      features="highly_variable_features",
                                      standardize=standardize,
                                      random_state=kwargs["random_state"])
    else:
        # PCA
        tools.pca(
            unidata,
            n_components=n_pc,
            features="highly_variable_features",
            standardize=standardize,
            robust=kwargs["pca_robust"],
            random_state=kwargs["random_state"],
        )
        pca_key = "pca"

    # batch correction: Harmony
    if kwargs["batch_correction"] and kwargs["correction_method"] == "harmony":
        pca_key = tools.run_harmony(unidata,
                                    rep="pca",
                                    n_jobs=kwargs["n_jobs"],
                                    n_clusters=kwargs["harmony_nclusters"],
                                    random_state=kwargs["random_state"])

    # Find K neighbors
    tools.neighbors(
        unidata,
        K=kwargs["K"],
        rep=pca_key,
        n_jobs=kwargs["n_jobs"],
        random_state=kwargs["random_state"],
        full_speed=kwargs["full_speed"],
    )

    # calculate diffmap
    if (kwargs["fle"] or kwargs["net_fle"]):
        if not kwargs["diffmap"]:
            print("Turn on --diffmap option!")
        kwargs["diffmap"] = True

    if kwargs["diffmap"]:
        tools.diffmap(
            unidata,
            n_components=kwargs["diffmap_ndc"],
            rep=pca_key,
            solver=kwargs["diffmap_solver"],
            random_state=kwargs["random_state"],
            max_t=kwargs["diffmap_maxt"],
        )
        if kwargs["diffmap_to_3d"]:
            tools.reduce_diffmap_to_3d(unidata,
                                       random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            unidata,
            kwargs["kBET_batch"],
            rep=pca_key,
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"])
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}."
            .format(stat_mean, pvalue_mean, accept_rate))

    # clustering
    if kwargs["spectral_louvain"]:
        tools.cluster(
            unidata,
            algo="spectral_louvain",
            rep=pca_key,
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_clusters2=kwargs["spectral_louvain_nclusters2"],
            n_init=kwargs["spectral_louvain_ninit"],
            random_state=kwargs["random_state"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.cluster(
            unidata,
            algo="spectral_leiden",
            rep=pca_key,
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_clusters2=kwargs["spectral_leiden_nclusters2"],
            n_init=kwargs["spectral_leiden_ninit"],
            random_state=kwargs["random_state"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.cluster(
            unidata,
            algo="louvain",
            rep=pca_key,
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.cluster(
            unidata,
            algo="leiden",
            rep=pca_key,
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            unidata,
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            unidata,
            rep=pca_key,
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            unidata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
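        # Append extra features (e.g. antibody counts) to unidata: map the appended
        # barcodes onto unidata's barcodes, drop appended features with no nonzero
        # counts, and stack the result onto the raw and log-normalized matrices.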
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)),
                       shape=(unidata.shape[0], append_data.shape[1])).tocsr()

        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            rawX = hstack([unidata.get_matrix("raw.X"), Z], format="csr")

            Zt = Z.astype(np.float32)
            Zt.data *= np.repeat(unidata.obs["scale"].values,
                                 np.diff(Zt.indptr))
            Zt.data = np.log1p(Zt.data)

            X = hstack([unidata.get_matrix("X"), Zt], format="csr")

            new_genome = unidata.get_genome() + "_and_" + append_data.get_genome()

            feature_metadata = pd.concat([unidata.feature_metadata, append_df],
                                         axis=0)
            feature_metadata.reset_index(inplace=True)
            feature_metadata.fillna(value=_get_fillna_dict(
                unidata.feature_metadata),
                                    inplace=True)

            unidata = UnimodalData(
                unidata.barcode_metadata, feature_metadata, {
                    "X": X,
                    "raw.X": rawX
                }, unidata.uns.mapping, unidata.obsm.mapping,
                unidata.varm.mapping
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

    if kwargs["output_h5ad"]:
        adata = unidata.to_anndata()
        adata.uns["scale.data"] = adata.uns.pop(
            "_tmp_fmat_highly_variable_features")  # assign by reference
        adata.uns["scale.data.rownames"] = unidata.var_names[
            unidata.var["highly_variable_features"]].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data was provided
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate objects starting with _tmp from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)