def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct the Net-Force-directed (FLE) graph.

    Net-FLE approximates the FLE graph with a deep-learning model to improve speed. Specifically, the model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate a file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing the ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Fraction of cells to down-sample.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors used to estimate the local density of each data point for down-sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down-sampling probability of each cell to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        Number of ForceAtlas2 iterations used to polish the layout after the deep regressor predicts the new coordinates.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_diffmap_selected']``: Boolean array to indicate which cells are selected during the down-sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    # Run ForceAtlas2 on the down-sampled cells only.
    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    # Predict coordinates of the unselected cells with the deep regressor,
    # using the down-sampled layout as training data.
    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(
        X, X_fle, X_full[~selected, :], net_alpha, random_state, verbose=True
    )

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    # Polish the predicted layout with a short ForceAtlas2 run on all cells.
    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
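# A rough, stand-alone sketch of the regression step delegated to
# `net_train_and_predict` above: fit scikit-learn's MLPRegressor (the model named
# in the docstring) on the down-sampled cells' (representation -> layout) pairs
# and predict layout coordinates for the remaining cells. The hidden layer sizes
# and max_iter below are illustrative assumptions, not pegasus defaults.
def _sketch_net_predict(X_train, Y_train, X_rest, alpha, random_state):
    from sklearn.neural_network import MLPRegressor

    regressor = MLPRegressor(
        hidden_layer_sizes=(100, 70, 50, 25),  # assumed architecture
        alpha=alpha,  # L2 penalty, corresponds to net_alpha
        random_state=random_state,
        max_iter=500,
    )
    regressor.fit(X_train, Y_train)  # MLPRegressor supports multi-output targets
    return regressor.predict(X_rest)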
def fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    out_basis: str = "fle",
) -> None:
    """Construct the Force-directed (FLE) graph.

    This implementation uses the forceatlas2-python_ package, which is a Python wrapper of ForceAtlas2_.

    See [Jacomy14]_ for details on FLE.

    .. _forceatlas2-python: https://github.com/klarman-cell-observatory/forceatlas2-python
    .. _ForceAtlas2: https://github.com/klarman-cell-observatory/forceatlas2

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate a file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing the ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FLE coordinates of the data.

    Examples
    --------
    >>> pg.fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )
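# `calc_force_directed_layout` hands the affinity graph to the Java ForceAtlas2
# component through a temporary file. Below is a minimal sketch of how a sparse
# affinity matrix could be dumped as a weighted edge list; the exact file format
# expected by the FA2 wrapper is an assumption here, not taken from pegasus.
def _sketch_write_edge_list(W, path: str) -> None:
    """Write the upper triangle of a scipy.sparse affinity matrix as
    'source target weight' lines, one edge per line."""
    from scipy.sparse import triu

    coo = triu(W, k=1).tocoo()
    with open(path, "w") as fout:
        for i, j, w in zip(coo.row, coo.col, coo.data):
            fout.write(f"{i} {j} {w}\n")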
def run_pipeline(input_file, output_name, **kwargs): is_raw = not kwargs["processed"] if "seurat_compatible" not in kwargs: kwargs["seurat_compatible"] = False # load input data adata = io.read_input( input_file, genome=kwargs["genome"], concat_matrices=False if kwargs["cite_seq"] else True, h5ad_mode=("a" if (is_raw or kwargs["subcluster"]) else "r+"), select_singlets=kwargs["select_singlets"], channel_attr=kwargs["channel_attr"], black_list=( kwargs["black_list"].split(",") if kwargs["black_list"] is not None else [] ), ) if not kwargs["cite_seq"]: if is_raw: values = adata.X.getnnz(axis=1) if values.min() == 0: # 10x raw data adata._inplace_subset_obs(values >= kwargs["min_genes_on_raw"]) else: data_list = adata assert len(data_list) == 2 adata = cdata = None for i in range(len(data_list)): if data_list[i].uns["genome"].startswith("CITE_Seq"): cdata = data_list[i] else: adata = data_list[i] assert adata is not None and cdata is not None print("Inputs are loaded.") if kwargs["seurat_compatible"]: assert is_raw and kwargs["select_hvf"] if kwargs["subcluster"]: adata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"]) is_raw = True # get submat and then set is_raw to True if is_raw: if not kwargs["subcluster"]: # filter out low quality cells/genes tools.run_filter_data( adata, output_filt=kwargs["output_filt"], plot_filt=kwargs["plot_filt"], plot_filt_figsize=kwargs["plot_filt_figsize"], mito_prefix=kwargs["mito_prefix"], min_genes=kwargs["min_genes"], max_genes=kwargs["max_genes"], min_umis=kwargs["min_umis"], max_umis=kwargs["max_umis"], percent_mito=kwargs["percent_mito"], percent_cells=kwargs["percent_cells"], ) if kwargs["seurat_compatible"]: raw_data = adata.copy() # raw as count # normailize counts and then transform to log space tools.log_norm(adata, kwargs["norm_count"]) # set group attribute if kwargs["batch_correction"] and kwargs["group_attribute"] is not None: tools.set_group_attribute(adata, kwargs["group_attribute"]) # select highly variable features if kwargs["select_hvf"]: tools.highly_variable_features( adata, kwargs["batch_correction"], flavor=kwargs["hvf_flavor"], n_top=kwargs["hvf_ngenes"], n_jobs=kwargs["n_jobs"], ) if kwargs["hvf_flavor"] == "pegasus": if kwargs["plot_hvf"] is not None: from pegasus.plotting import plot_hvf robust_idx = adata.var["robust"].values plot_hvf( adata.var.loc[robust_idx, "mean"], adata.var.loc[robust_idx, "var"], adata.var.loc[robust_idx, "hvf_loess"], adata.var.loc[robust_idx, "highly_variable_features"], kwargs["plot_hvf"] + ".hvf.pdf", ) # batch correction if kwargs["batch_correction"]: tools.correct_batch(adata, features="highly_variable_features") # PCA tools.pca( adata, n_components=kwargs["nPC"], features="highly_variable_features", random_state=kwargs["random_state"], ) # Find K neighbors tools.neighbors( adata, K=kwargs["K"], rep="pca", n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], full_speed=kwargs["full_speed"], ) # calculate diffmap if ( kwargs["fle"] or kwargs["net_fle"] ): if not kwargs["diffmap"]: print("Turn on --diffmap option!") kwargs["diffmap"] = True if kwargs["diffmap"]: tools.diffmap( adata, n_components=kwargs["diffmap_ndc"], rep="pca", solver=kwargs["diffmap_solver"], random_state=kwargs["random_state"], max_t=kwargs["diffmap_maxt"], ) if kwargs["diffmap_to_3d"]: tools.reduce_diffmap_to_3d(adata, random_state=kwargs["random_state"]) # calculate kBET if ("kBET" in kwargs) and kwargs["kBET"]: stat_mean, pvalue_mean, accept_rate = tools.calc_kBET( adata, kwargs["kBET_batch"], 
K=kwargs["kBET_K"], alpha=kwargs["kBET_alpha"], n_jobs=kwargs["n_jobs"], ) print( "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format( stat_mean, pvalue_mean, accept_rate ) ) # clustering if kwargs["spectral_louvain"]: tools.cluster( adata, algo="spectral_louvain", rep="pca", resolution=kwargs["spectral_louvain_resolution"], rep_kmeans=kwargs["spectral_louvain_basis"], n_clusters=kwargs["spectral_louvain_nclusters"], n_clusters2=kwargs["spectral_louvain_nclusters2"], n_init=kwargs["spectral_louvain_ninit"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], class_label="spectral_louvain_labels", ) if kwargs["spectral_leiden"]: tools.cluster( adata, algo="spectral_leiden", rep="pca", resolution=kwargs["spectral_leiden_resolution"], rep_kmeans=kwargs["spectral_leiden_basis"], n_clusters=kwargs["spectral_leiden_nclusters"], n_clusters2=kwargs["spectral_leiden_nclusters2"], n_init=kwargs["spectral_leiden_ninit"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], class_label="spectral_leiden_labels", ) if kwargs["louvain"]: tools.cluster( adata, algo="louvain", rep="pca", resolution=kwargs["louvain_resolution"], random_state=kwargs["random_state"], class_label=kwargs["louvain_class_label"], ) if kwargs["leiden"]: tools.cluster( adata, algo="leiden", rep="pca", resolution=kwargs["leiden_resolution"], n_iter=kwargs["leiden_niter"], random_state=kwargs["random_state"], class_label=kwargs["leiden_class_label"], ) # visualization if kwargs["net_tsne"]: tools.net_tsne( adata, rep="pca", n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], net_alpha=kwargs["net_l2"], polish_learning_frac=kwargs["net_tsne_polish_learing_frac"], polish_n_iter=kwargs["net_tsne_polish_niter"], out_basis=kwargs["net_tsne_basis"], ) if kwargs["net_umap"]: tools.net_umap( adata, rep="pca", n_jobs=kwargs["n_jobs"], n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], full_speed=kwargs["full_speed"], net_alpha=kwargs["net_l2"], polish_learning_rate=kwargs["net_umap_polish_learing_rate"], polish_n_epochs=kwargs["net_umap_polish_nepochs"], out_basis=kwargs["net_umap_basis"], ) if kwargs["net_fle"]: tools.net_fle( adata, output_name, n_jobs=kwargs["n_jobs"], K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, memory=kwargs["fle_memory"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], net_alpha=kwargs["net_l2"], polish_target_steps=kwargs["net_fle_polish_target_steps"], out_basis=kwargs["net_fle_basis"], ) if kwargs["tsne"]: tools.tsne( adata, rep="pca", n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], ) if kwargs["fitsne"]: tools.fitsne( adata, rep="pca", n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], ) if kwargs["umap"]: tools.umap( adata, rep="pca", n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], random_state=kwargs["random_state"], ) if kwargs["fle"]: tools.fle( adata, output_name, n_jobs=kwargs["n_jobs"], 
K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, memory=kwargs["fle_memory"], random_state=kwargs["random_state"], ) # calculate diffusion-based pseudotime from roots if len(kwargs["pseudotime"]) > 0: tools.calc_pseudotime(adata, kwargs["pseudotime"]) # merge cite-seq data and run t-SNE if kwargs["cite_seq"]: adt_matrix = np.zeros((adata.shape[0], cdata.shape[1]), dtype="float32") idx = adata.obs_names.isin(cdata.obs_names) adt_matrix[idx, :] = cdata[adata.obs_names[idx],].X.toarray() if abs(100.0 - kwargs["cite_seq_capping"]) > 1e-4: cite_seq.capping(adt_matrix, kwargs["cite_seq_capping"]) var_names = np.concatenate( [adata.var_names, ["AD-" + x for x in cdata.var_names]] ) new_data = anndata.AnnData( X=hstack([adata.X, csr_matrix(adt_matrix)], format="csr"), obs=adata.obs, obsm=adata.obsm, uns=adata.uns, var={ "var_names": var_names, "gene_ids": var_names, "n_cells": np.concatenate( [adata.var["n_cells"].values, [0] * cdata.shape[1]] ), "percent_cells": np.concatenate( [adata.var["percent_cells"].values, [0.0] * cdata.shape[1]] ), "robust": np.concatenate( [adata.var["robust"].values, [False] * cdata.shape[1]] ), "highly_variable_features": np.concatenate( [ adata.var["highly_variable_features"].values, [False] * cdata.shape[1], ] ), }, ) new_data.obsm["X_CITE-Seq"] = adt_matrix adata = new_data print("ADT count matrix is attached.") tools.fitsne( adata, rep="CITE-Seq", n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], out_basis="citeseq_fitsne", ) print("Antibody embedding is done.") if kwargs["seurat_compatible"]: seurat_data = adata.copy() seurat_data.raw = raw_data seurat_data.uns["scale.data"] = adata.uns["fmat_highly_variable_features"] # assign by reference seurat_data.uns["scale.data.rownames"] = adata.var_names[ adata.var["highly_variable_features"] ].values io.write_output(seurat_data, output_name + ".seurat.h5ad") # write out results io.write_output(adata, output_name + ".h5ad") if kwargs["output_loom"]: io.write_output(adata, output_name + ".loom") print("Results are written.")
def split_one_cluster(
    data: MultimodalData,
    clust_label: str,
    clust_id: str,
    n_clust: int,
    res_label: str,
    rep: str = "pca",
    random_state: int = 0,
) -> None:
    """
    Use the Leiden algorithm to split 'clust_id' in 'clust_label' into 'n_clust' subclusters and write the new clustering results to 'res_label'. Assume clusters in 'clust_label' are named as numbers (in str format).

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    clust_label: ``str``
        Use existing clustering stored in data.obs['clust_label'].

    clust_id: ``str``
        Cluster ID in data.obs['clust_label'].

    n_clust: ``int``
        Split 'clust_id' into 'n_clust' subclusters.

    res_label: ``str``
        Write the new clustering to data.obs['res_label']. The largest subcluster will use 'clust_id' as its cluster ID, while other subclusters will be numbered after existing clusters.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation used for neighbor-graph construction and Leiden clustering. The keyword ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[res_label]``: New cluster labels of cells as categorical data.

    Examples
    --------
    >>> pg.split_one_cluster(data, 'leiden_labels', '15', 2, 'leiden_labels_split')
    """
    idx = np.where(data.obs[clust_label] == clust_id)[0]
    tmpdat = data[idx].copy()

    from pegasus.tools import neighbors

    neighbors(tmpdat, rep=rep, use_cache=False)
    leiden(tmpdat, rep=rep, resolution=None, n_clust=n_clust, random_state=random_state)

    new_clust = data.obs[clust_label].values.astype(int)
    new_label = new_clust.max() + 1
    # value_counts() sorts subclusters by size; skip the largest one so that it
    # keeps 'clust_id', and assign new labels to the remaining subclusters.
    for label in tmpdat.obs['leiden_labels'].value_counts().index[1:]:
        new_clust[idx[(tmpdat.obs['leiden_labels'] == label).values]] = new_label
        new_label += 1

    data.obs[res_label] = pd.Categorical(
        values=new_clust.astype(str),
        categories=np.array(range(1, new_label)).astype(str),
    )
    data.register_attr(res_label, "cluster")

    del tmpdat
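# Toy illustration of the relabeling convention documented above: the largest
# subcluster keeps the original cluster ID, every other subcluster receives the
# next unused integer label. Inputs are hypothetical plain lists: `orig_labels`
# holds one numeric label (as int or str) per cell, `sub_labels` holds one Leiden
# label per cell of the split cluster, in the same order those cells appear in
# `orig_labels`.
def _sketch_relabel(orig_labels, split_id, sub_labels):
    import numpy as np
    import pandas as pd

    new_clust = np.asarray(orig_labels, dtype=int)
    idx = np.where(new_clust == int(split_id))[0]
    sub = pd.Series(sub_labels)
    next_label = new_clust.max() + 1
    # index[0] of value_counts() is the largest subcluster; it keeps split_id.
    for label in sub.value_counts().index[1:]:
        new_clust[idx[(sub == label).values]] = next_label
        next_label += 1
    return new_clust.astype(str)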
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool, append_data: UnimodalData, **kwargs) -> None: print() logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.") if is_raw: # normailize counts and then transform to log space tools.log_norm(unidata, kwargs["norm_count"]) # select highly variable features standardize = False # if no select HVF, False if kwargs["select_hvf"]: if unidata.shape[1] <= kwargs["hvf_ngenes"]: logger.warning( f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted." ) else: standardize = True tools.highly_variable_features( unidata, kwargs["batch_attr"] if kwargs["batch_correction"] else None, flavor=kwargs["hvf_flavor"], n_top=kwargs["hvf_ngenes"], n_jobs=kwargs["n_jobs"], ) if kwargs["hvf_flavor"] == "pegasus": if kwargs["plot_hvf"] is not None: from pegasus.plotting import hvfplot fig = hvfplot(unidata, return_fig=True) fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf") n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1]) if n_pc < kwargs["pca_n"]: logger.warning( f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}." ) # Run PCA irrespect of which batch correction method would apply tools.pca( unidata, n_components=n_pc, features="highly_variable_features", standardize=standardize, n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], ) dim_key = "pca" if kwargs["nmf"] or (kwargs["batch_correction"] and kwargs["correction_method"] == "inmf"): n_nmf = min(kwargs["nmf_n"], unidata.shape[0], unidata.shape[1]) if n_nmf < kwargs["nmf_n"]: logger.warning( f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of NMF components {kwargs['nmf_n']}. Reduce the number of NMF components to {n_nmf}." ) if kwargs["nmf"]: if kwargs["batch_correction"] and kwargs[ "correction_method"] == "inmf": logger.warning( "NMF is skipped because integrative NMF is run instead.") else: tools.nmf( unidata, n_components=n_nmf, features="highly_variable_features", n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], ) if kwargs["batch_correction"]: if kwargs["correction_method"] == "harmony": dim_key = tools.run_harmony( unidata, batch=kwargs["batch_attr"], rep="pca", n_jobs=kwargs["n_jobs"], n_clusters=kwargs["harmony_nclusters"], random_state=kwargs["random_state"]) elif kwargs["correction_method"] == "inmf": dim_key = tools.integrative_nmf( unidata, batch=kwargs["batch_attr"], n_components=n_nmf, features="highly_variable_features", lam=kwargs["inmf_lambda"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"]) elif kwargs["correction_method"] == "scanorama": dim_key = tools.run_scanorama( unidata, batch=kwargs["batch_attr"], n_components=n_pc, features="highly_variable_features", standardize=standardize, random_state=kwargs["random_state"]) else: raise ValueError( f"Unknown batch correction method {kwargs['correction_method']}!" 
) # Find K neighbors tools.neighbors( unidata, K=kwargs["K"], rep=dim_key, n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], full_speed=kwargs["full_speed"], ) if kwargs["calc_sigscore"] is not None: sig_files = kwargs["calc_sigscore"].split(",") for sig_file in sig_files: tools.calc_signature_score(unidata, sig_file) # calculate diffmap if (kwargs["fle"] or kwargs["net_fle"]): if not kwargs["diffmap"]: print("Turn on --diffmap option!") kwargs["diffmap"] = True if kwargs["diffmap"]: tools.diffmap( unidata, n_components=kwargs["diffmap_ndc"], rep=dim_key, solver=kwargs["diffmap_solver"], max_t=kwargs["diffmap_maxt"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], ) # calculate kBET if ("kBET" in kwargs) and kwargs["kBET"]: stat_mean, pvalue_mean, accept_rate = tools.calc_kBET( unidata, kwargs["kBET_batch"], rep=dim_key, K=kwargs["kBET_K"], alpha=kwargs["kBET_alpha"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"]) print( "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}." .format(stat_mean, pvalue_mean, accept_rate)) # clustering if kwargs["spectral_louvain"]: tools.cluster( unidata, algo="spectral_louvain", rep=dim_key, resolution=kwargs["spectral_louvain_resolution"], rep_kmeans=kwargs["spectral_louvain_basis"], n_clusters=kwargs["spectral_louvain_nclusters"], n_clusters2=kwargs["spectral_louvain_nclusters2"], n_init=kwargs["spectral_louvain_ninit"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], class_label="spectral_louvain_labels", ) if kwargs["spectral_leiden"]: tools.cluster( unidata, algo="spectral_leiden", rep=dim_key, resolution=kwargs["spectral_leiden_resolution"], rep_kmeans=kwargs["spectral_leiden_basis"], n_clusters=kwargs["spectral_leiden_nclusters"], n_clusters2=kwargs["spectral_leiden_nclusters2"], n_init=kwargs["spectral_leiden_ninit"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], class_label="spectral_leiden_labels", ) if kwargs["louvain"]: tools.cluster( unidata, algo="louvain", rep=dim_key, resolution=kwargs["louvain_resolution"], random_state=kwargs["random_state"], class_label=kwargs["louvain_class_label"], ) if kwargs["leiden"]: tools.cluster( unidata, algo="leiden", rep=dim_key, resolution=kwargs["leiden_resolution"], n_iter=kwargs["leiden_niter"], random_state=kwargs["random_state"], class_label=kwargs["leiden_class_label"], ) # visualization if kwargs["net_umap"]: tools.net_umap( unidata, rep=dim_key, n_jobs=kwargs["n_jobs"], n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], full_speed=kwargs["full_speed"], net_alpha=kwargs["net_l2"], polish_learning_rate=kwargs["net_umap_polish_learing_rate"], polish_n_epochs=kwargs["net_umap_polish_nepochs"], out_basis=kwargs["net_umap_basis"], ) if kwargs["net_fle"]: tools.net_fle( unidata, output_name, n_jobs=kwargs["n_jobs"], K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, memory=kwargs["fle_memory"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], net_alpha=kwargs["net_l2"], polish_target_steps=kwargs["net_fle_polish_target_steps"], out_basis=kwargs["net_fle_basis"], ) if kwargs["tsne"]: tools.tsne( unidata, rep=dim_key, n_jobs=kwargs["n_jobs"], 
perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], initialization=kwargs["tsne_init"], ) if kwargs["umap"]: tools.umap( unidata, rep=dim_key, n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], n_jobs=kwargs["n_jobs"], full_speed=kwargs["full_speed"], random_state=kwargs["random_state"], ) if kwargs["fle"]: tools.fle( unidata, output_name, n_jobs=kwargs["n_jobs"], K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, memory=kwargs["fle_memory"], random_state=kwargs["random_state"], ) if kwargs["infer_doublets"]: channel_attr = "Channel" if (channel_attr not in unidata.obs) or ( unidata.obs["Channel"].cat.categories.size == 1): channel_attr = None clust_attr = kwargs["dbl_cluster_attr"] if (clust_attr is None) or (clust_attr not in unidata.obs): clust_attr = None for value in [ "leiden_labels", "louvain_labels", "spectral_leiden_labels", "spectral_louvain_labels" ]: if value in unidata.obs: clust_attr = value break if channel_attr is not None: logger.info(f"For doublet inference, channel_attr={channel_attr}.") if clust_attr is not None: logger.info(f"For doublet inference, clust_attr={clust_attr}.") tools.infer_doublets( unidata, channel_attr=channel_attr, clust_attr=clust_attr, expected_doublet_rate=kwargs["expected_doublet_rate"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], plot_hist=output_name) dbl_clusts = None if clust_attr is not None: clusts = [] for idx, row in unidata.uns["pred_dbl_cluster"].iterrows(): if row["percentage"] >= 50.0: logger.info( f"Cluster {row['cluster']} (percentage={row['percentage']:.2f}%, q-value={row['qval']:.6g}) is identified as a doublet cluster." 
) clusts.append(row["cluster"]) if len(clusts) > 0: dbl_clusts = f"{clust_attr}:{','.join(clusts)}" tools.mark_doublets(unidata, dbl_clusts=dbl_clusts) # calculate diffusion-based pseudotime from roots if len(kwargs["pseudotime"]) > 0: tools.calc_pseudotime(unidata, kwargs["pseudotime"]) genome = unidata.uns["genome"] if append_data is not None: locs = unidata.obs_names.get_indexer(append_data.obs_names) idx = locs >= 0 locs = locs[idx] Y = append_data.X[idx, :].tocoo(copy=False) Z = coo_matrix((Y.data, (locs[Y.row], Y.col)), shape=(unidata.shape[0], append_data.shape[1])).tocsr() idy = Z.getnnz(axis=0) > 0 n_nonzero = idy.sum() if n_nonzero > 0: if n_nonzero < append_data.shape[1]: Z = Z[:, idy] append_df = append_data.feature_metadata.loc[idy, :] else: append_df = append_data.feature_metadata if kwargs["citeseq"]: append_df = append_df.copy() append_df.index = append_df.index.map(lambda x: f"Ab-{x}") rawX = hstack([unidata.get_matrix("counts"), Z], format="csr") Zt = Z.astype(np.float32) if not kwargs["citeseq"]: Zt.data *= np.repeat(unidata.obs["scale"].values, np.diff(Zt.indptr)) Zt.data = np.log1p(Zt.data) else: Zt.data = np.arcsinh(Zt.data / 5.0, dtype=np.float32) X = hstack([unidata.get_matrix(unidata.current_matrix()), Zt], format="csr") new_genome = unidata.get_genome() if new_genome != append_data.get_genome(): new_genome = f"{new_genome}_and_{append_data.get_genome()}" feature_metadata = pd.concat([unidata.feature_metadata, append_df], axis=0) feature_metadata.reset_index(inplace=True) _fillna(feature_metadata) unidata = UnimodalData( unidata.barcode_metadata, feature_metadata, { unidata.current_matrix(): X, "counts": rawX }, unidata.uns.mapping, unidata.obsm.mapping, unidata.varm.mapping ) # uns.mapping, obsm.mapping and varm.mapping are passed by reference unidata.uns["genome"] = new_genome if kwargs["citeseq"] and kwargs["citeseq_umap"]: umap_index = append_df.index.difference( [f"Ab-{x}" for x in kwargs["citeseq_umap_exclude"]]) unidata.obsm["X_citeseq"] = unidata.X[:, unidata.var_names. isin(umap_index )].toarray() tools.umap( unidata, rep="citeseq", n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], n_jobs=kwargs["n_jobs"], full_speed=kwargs["full_speed"], random_state=kwargs["random_state"], out_basis="citeseq_umap", ) if kwargs["output_h5ad"]: import time start_time = time.perf_counter() adata = unidata.to_anndata() if "_tmp_fmat_highly_variable_features" in adata.uns: adata.uns["scale.data"] = adata.uns.pop( "_tmp_fmat_highly_variable_features") # assign by reference adata.uns["scale.data.rownames"] = unidata.var_names[ unidata.var["highly_variable_features"] == True].values adata.write(f"{output_name}.h5ad", compression="gzip") del adata end_time = time.perf_counter() logger.info( f"H5AD file {output_name}.h5ad is written. Time spent = {end_time - start_time:.2f}s." ) # write out results if kwargs["output_loom"]: write_output(unidata, f"{output_name}.loom") # Change genome name back if append_data is True if unidata.uns["genome"] != genome: unidata.uns["genome"] = genome # Eliminate objects starting with _tmp from uns unidata.uns.pop("_tmp_fmat_highly_variable_features", None)
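# The append_data merge above scatters the appended matrix onto the full set of
# retained barcodes by rewriting the COO row indices. The same row-remapping
# trick on a stand-alone sparse matrix (a sketch with hypothetical inputs):
def _sketch_remap_rows(Y_csr, locs, n_rows_full):
    """Place row i of `Y_csr` at position locs[i] of a matrix with
    `n_rows_full` rows; rows not covered by `locs` remain empty."""
    from scipy.sparse import coo_matrix

    Y = Y_csr.tocoo(copy=False)
    return coo_matrix(
        (Y.data, (locs[Y.row], Y.col)),
        shape=(n_rows_full, Y_csr.shape[1]),
    ).tocsr()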
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool, append_data: UnimodalData, **kwargs) -> None: print() logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.") if kwargs["channel_attr"] is not None: unidata.obs["Channel"] = unidata.obs[kwargs["channel_attr"]] if is_raw: # normailize counts and then transform to log space tools.log_norm(unidata, kwargs["norm_count"]) # set group attribute if kwargs["batch_correction"] and kwargs["group_attribute"] is not None: tools.set_group_attribute(unidata, kwargs["group_attribute"]) # select highly variable features standardize = False # if no select HVF, False if kwargs["select_hvf"]: if unidata.shape[1] <= kwargs["hvf_ngenes"]: logger.warning( f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted." ) else: standardize = True tools.highly_variable_features( unidata, kwargs["batch_correction"], flavor=kwargs["hvf_flavor"], n_top=kwargs["hvf_ngenes"], n_jobs=kwargs["n_jobs"], ) if kwargs["hvf_flavor"] == "pegasus": if kwargs["plot_hvf"] is not None: from pegasus.plotting import hvfplot fig = hvfplot(unidata, return_fig=True) fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf") # batch correction: L/S if kwargs["batch_correction"] and kwargs["correction_method"] == "L/S": tools.correct_batch(unidata, features="highly_variable_features") if kwargs["calc_sigscore"] is not None: sig_files = kwargs["calc_sigscore"].split(",") for sig_file in sig_files: tools.calc_signature_score(unidata, sig_file) n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1]) if n_pc < kwargs["pca_n"]: logger.warning( f"UnimodalData {unidata.get_uid()} has either dimension ({unidata.shape[0]}, {unidata.shape[1]}) less than the specified number of PCs {kwargs['pca_n']}. Reduce the number of PCs to {n_pc}." ) if kwargs["batch_correction"] and kwargs[ "correction_method"] == "scanorama": pca_key = tools.run_scanorama(unidata, n_components=n_pc, features="highly_variable_features", standardize=standardize, random_state=kwargs["random_state"]) else: # PCA tools.pca( unidata, n_components=n_pc, features="highly_variable_features", standardize=standardize, robust=kwargs["pca_robust"], random_state=kwargs["random_state"], ) pca_key = "pca" # batch correction: Harmony if kwargs["batch_correction"] and kwargs["correction_method"] == "harmony": pca_key = tools.run_harmony(unidata, rep="pca", n_jobs=kwargs["n_jobs"], n_clusters=kwargs["harmony_nclusters"], random_state=kwargs["random_state"]) # Find K neighbors tools.neighbors( unidata, K=kwargs["K"], rep=pca_key, n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"], full_speed=kwargs["full_speed"], ) # calculate diffmap if (kwargs["fle"] or kwargs["net_fle"]): if not kwargs["diffmap"]: print("Turn on --diffmap option!") kwargs["diffmap"] = True if kwargs["diffmap"]: tools.diffmap( unidata, n_components=kwargs["diffmap_ndc"], rep=pca_key, solver=kwargs["diffmap_solver"], random_state=kwargs["random_state"], max_t=kwargs["diffmap_maxt"], ) if kwargs["diffmap_to_3d"]: tools.reduce_diffmap_to_3d(unidata, random_state=kwargs["random_state"]) # calculate kBET if ("kBET" in kwargs) and kwargs["kBET"]: stat_mean, pvalue_mean, accept_rate = tools.calc_kBET( unidata, kwargs["kBET_batch"], rep=pca_key, K=kwargs["kBET_K"], alpha=kwargs["kBET_alpha"], n_jobs=kwargs["n_jobs"], random_state=kwargs["random_state"]) print( "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}." 
.format(stat_mean, pvalue_mean, accept_rate)) # clustering if kwargs["spectral_louvain"]: tools.cluster( unidata, algo="spectral_louvain", rep=pca_key, resolution=kwargs["spectral_louvain_resolution"], rep_kmeans=kwargs["spectral_louvain_basis"], n_clusters=kwargs["spectral_louvain_nclusters"], n_clusters2=kwargs["spectral_louvain_nclusters2"], n_init=kwargs["spectral_louvain_ninit"], random_state=kwargs["random_state"], class_label="spectral_louvain_labels", ) if kwargs["spectral_leiden"]: tools.cluster( unidata, algo="spectral_leiden", rep=pca_key, resolution=kwargs["spectral_leiden_resolution"], rep_kmeans=kwargs["spectral_leiden_basis"], n_clusters=kwargs["spectral_leiden_nclusters"], n_clusters2=kwargs["spectral_leiden_nclusters2"], n_init=kwargs["spectral_leiden_ninit"], random_state=kwargs["random_state"], class_label="spectral_leiden_labels", ) if kwargs["louvain"]: tools.cluster( unidata, algo="louvain", rep=pca_key, resolution=kwargs["louvain_resolution"], random_state=kwargs["random_state"], class_label=kwargs["louvain_class_label"], ) if kwargs["leiden"]: tools.cluster( unidata, algo="leiden", rep=pca_key, resolution=kwargs["leiden_resolution"], n_iter=kwargs["leiden_niter"], random_state=kwargs["random_state"], class_label=kwargs["leiden_class_label"], ) # visualization if kwargs["net_tsne"]: tools.net_tsne( unidata, rep=pca_key, n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], net_alpha=kwargs["net_l2"], polish_learning_frac=kwargs["net_tsne_polish_learing_frac"], polish_n_iter=kwargs["net_tsne_polish_niter"], out_basis=kwargs["net_tsne_basis"], ) if kwargs["net_umap"]: tools.net_umap( unidata, rep=pca_key, n_jobs=kwargs["n_jobs"], n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], full_speed=kwargs["full_speed"], net_alpha=kwargs["net_l2"], polish_learning_rate=kwargs["net_umap_polish_learing_rate"], polish_n_epochs=kwargs["net_umap_polish_nepochs"], out_basis=kwargs["net_umap_basis"], ) if kwargs["net_fle"]: tools.net_fle( unidata, output_name, n_jobs=kwargs["n_jobs"], K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, memory=kwargs["fle_memory"], random_state=kwargs["random_state"], select_frac=kwargs["net_ds_frac"], select_K=kwargs["net_ds_K"], select_alpha=kwargs["net_ds_alpha"], net_alpha=kwargs["net_l2"], polish_target_steps=kwargs["net_fle_polish_target_steps"], out_basis=kwargs["net_fle_basis"], ) if kwargs["tsne"]: tools.tsne( unidata, rep=pca_key, n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], ) if kwargs["fitsne"]: tools.fitsne( unidata, rep=pca_key, n_jobs=kwargs["n_jobs"], perplexity=kwargs["tsne_perplexity"], random_state=kwargs["random_state"], ) if kwargs["umap"]: tools.umap( unidata, rep=pca_key, n_neighbors=kwargs["umap_K"], min_dist=kwargs["umap_min_dist"], spread=kwargs["umap_spread"], random_state=kwargs["random_state"], ) if kwargs["fle"]: tools.fle( unidata, output_name, n_jobs=kwargs["n_jobs"], K=kwargs["fle_K"], full_speed=kwargs["full_speed"], target_change_per_node=kwargs["fle_target_change_per_node"], target_steps=kwargs["fle_target_steps"], is3d=False, 
memory=kwargs["fle_memory"], random_state=kwargs["random_state"], ) # calculate diffusion-based pseudotime from roots if len(kwargs["pseudotime"]) > 0: tools.calc_pseudotime(unidata, kwargs["pseudotime"]) genome = unidata.uns["genome"] if append_data is not None: locs = unidata.obs_names.get_indexer(append_data.obs_names) idx = locs >= 0 locs = locs[idx] Y = append_data.X[idx, :].tocoo(copy=False) Z = coo_matrix((Y.data, (locs[Y.row], Y.col)), shape=(unidata.shape[0], append_data.shape[1])).tocsr() idy = Z.getnnz(axis=0) > 0 n_nonzero = idy.sum() if n_nonzero > 0: if n_nonzero < append_data.shape[1]: Z = Z[:, idy] append_df = append_data.feature_metadata.loc[idy, :] else: append_df = append_data.feature_metadata rawX = hstack([unidata.get_matrix("raw.X"), Z], format="csr") Zt = Z.astype(np.float32) Zt.data *= np.repeat(unidata.obs["scale"].values, np.diff(Zt.indptr)) Zt.data = np.log1p(Zt.data) X = hstack([unidata.get_matrix("X"), Zt], format="csr") new_genome = unidata.get_genome( ) + "_and_" + append_data.get_genome() feature_metadata = pd.concat([unidata.feature_metadata, append_df], axis=0) feature_metadata.reset_index(inplace=True) feature_metadata.fillna(value=_get_fillna_dict( unidata.feature_metadata), inplace=True) unidata = UnimodalData( unidata.barcode_metadata, feature_metadata, { "X": X, "raw.X": rawX }, unidata.uns.mapping, unidata.obsm.mapping, unidata.varm.mapping ) # uns.mapping, obsm.mapping and varm.mapping are passed by reference unidata.uns["genome"] = new_genome if kwargs["output_h5ad"]: adata = unidata.to_anndata() adata.uns["scale.data"] = adata.uns.pop( "_tmp_fmat_highly_variable_features") # assign by reference adata.uns["scale.data.rownames"] = unidata.var_names[ unidata.var["highly_variable_features"]].values adata.write(f"{output_name}.h5ad", compression="gzip") del adata # write out results if kwargs["output_loom"]: write_output(unidata, f"{output_name}.loom") # Change genome name back if append_data is True if unidata.uns["genome"] != genome: unidata.uns["genome"] = genome # Eliminate objects starting with fmat_ from uns unidata.uns.pop("_tmp_fmat_highly_variable_features", None)
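# The appended counts above are brought onto the same log-normalized scale as the
# main matrix by multiplying each stored value with its cell's scale factor and
# applying log1p. A stand-alone sketch of that per-row CSR scaling (hypothetical
# helper, not part of the pipeline):
def _sketch_log_norm_rows(Z_csr, scale_per_cell):
    """Multiply every stored value in row i of a CSR matrix by scale_per_cell[i],
    then log1p-transform. np.diff(indptr) gives the number of stored entries per
    row, so np.repeat expands the per-cell scale vector to match Z.data."""
    import numpy as np

    Z = Z_csr.astype(np.float32)
    Z.data *= np.repeat(np.asarray(scale_per_cell), np.diff(Z.indptr))
    Z.data = np.log1p(Z.data)
    return Z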