def _write_scp_expression(unidata: UnimodalData, output_name: str, is_sparse: bool, precision: int = 2) -> None:
    """ Only write the main matrix X """
    try:
        from pegasusio.cylib.io import write_mtx, write_dense
    except ModuleNotFoundError:
        print("No module named 'pegasusio.cylib.io'")

    matrix = unidata.get_matrix("X")
    if is_sparse:
        barcode_file = f"{output_name}.scp.barcodes.tsv"
        with open(barcode_file, "w") as fout:
            fout.write("\n".join(unidata.obs_names) + "\n")
        logger.info(f"Barcode file {barcode_file} is written.")

        feature_file = f"{output_name}.scp.features.tsv"
        gene_names = unidata.var_names.values
        gene_ids = unidata.var["featureid"].values if "featureid" in unidata.var else (unidata.var["gene_ids"] if "gene_ids" in unidata.var else gene_names)
        df = pd.DataFrame({"gene_names": gene_names, "gene_ids": gene_ids})[["gene_ids", "gene_names"]]
        df.to_csv(feature_file, sep="\t", header=False, index=False)
        logger.info(f"Feature file {feature_file} is written.")

        mtx_file = f"{output_name}.scp.matrix.mtx"
        # matrix is a cell x gene csr_matrix; it will be written as gene x cell
        write_mtx(mtx_file, matrix.data, matrix.indices, matrix.indptr, matrix.shape[0], matrix.shape[1], precision=precision)
        logger.info(f"Matrix file {mtx_file} is written.")
    else:
        expr_file = f"{output_name}.scp.expr.txt"
        matrix = matrix.T.tocsr()  # convert to gene x cell
        write_dense(expr_file, unidata.obs_names.values, unidata.var_names.values, matrix.data, matrix.indices, matrix.indptr, matrix.shape[0], matrix.shape[1], precision=precision)
        logger.info(f"Dense expression file {expr_file} is written.")
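
# A minimal usage sketch (illustrative only; `unidata` is a hypothetical UnimodalData with a
# log-normalized "X" matrix, and the compiled writers in pegasusio.cylib.io must be importable):
#
#     _write_scp_expression(unidata, "my_sample", is_sparse=True, precision=2)
#     # -> my_sample.scp.barcodes.tsv, my_sample.scp.features.tsv, my_sample.scp.matrix.mtx
#
#     _write_scp_expression(unidata, "my_sample", is_sparse=False)
#     # -> my_sample.scp.expr.txt (dense gene x cell matrix)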
def deseq2(
    pseudobulk: UnimodalData,
    design: str,
    contrast: Tuple[str, str, str],
    de_key: str = "deseq2",
    replaceOutliers: bool = True,
) -> None:
    """Perform Differential Expression (DE) Analysis on pseudobulk data using DESeq2. This function calls the R package DESeq2 via rpy2 and requires DESeq2 to be installed in R.

    DE analysis will be performed on all pseudo-bulk matrices in pseudobulk.

    Parameters
    ----------
    pseudobulk: ``UnimodalData``
        Pseudobulk data with rows for samples and columns for genes. If pseudobulk contains multiple matrices, DESeq2 will be applied to all of them.

    design: ``str``
        Design formula that will be passed to DESeq2.

    contrast: ``Tuple[str, str, str]``
        A tuple of three elements passed to DESeq2: a factor in the design formula, a level of that factor used as the numerator of the fold change, and a level used as the denominator of the fold change.

    de_key: ``str``, optional, default: ``"deseq2"``
        Key name under which the DE analysis results are stored. For a matrix named cluster.X, the stored key will be cluster.de_key.

    replaceOutliers: ``bool``, optional, default: ``True``
        Whether to execute DESeq2's replaceOutliers step. If set to ``False``, we will set minReplicatesForReplace=Inf in the ``DESeq`` function and cooksCutoff=False in the ``results`` function.

    Returns
    -------
    ``None``

    Update ``pseudobulk.varm``:
        ``pseudobulk.varm[de_key]``: DE analysis result for the pseudo-bulk count matrix.
        ``pseudobulk.varm[cluster.de_key]``: DE results for cluster-specific pseudo-bulk count matrices.

    Examples
    --------
    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'))
    """
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri, numpy2ri, Formula
        from rpy2.robjects.packages import importr
        from rpy2.robjects.conversion import localconverter
    except ModuleNotFoundError as e:
        import sys
        logger.error(f"{e}\nNeed rpy2! Try 'pip install rpy2'.")
        sys.exit(-1)

    try:
        deseq2 = importr('DESeq2')
    except ModuleNotFoundError:
        import sys
        text = """Please install DESeq2 in order to run this function.\n
                To install this package, start R and enter:\n
                if (!require("BiocManager", quietly = TRUE))
                    install.packages("BiocManager")
                BiocManager::install("DESeq2")"""
        logger.error(text)
        sys.exit(-1)

    import math
    to_dataframe = ro.r('function(x) data.frame(x)')

    for mat_key in pseudobulk.list_keys():
        with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
            dds = deseq2.DESeqDataSetFromMatrix(
                countData=pseudobulk.get_matrix(mat_key).T,
                colData=pseudobulk.obs,
                design=Formula(design),
            )

        if replaceOutliers:
            dds = deseq2.DESeq(dds)
            res = deseq2.results(dds, contrast=ro.StrVector(contrast))
        else:
            dds = deseq2.DESeq(dds, minReplicatesForReplace=math.inf)
            res = deseq2.results(dds, contrast=ro.StrVector(contrast), cooksCutoff=False)

        with localconverter(ro.default_converter + pandas2ri.converter):
            res_df = ro.conversion.rpy2py(to_dataframe(res))
            res_df.fillna({'log2FoldChange': 0.0, 'lfcSE': 0.0, 'stat': 0.0, 'pvalue': 1.0, 'padj': 1.0}, inplace=True)

        de_res_key = de_key if mat_key.find('.') < 0 else f"{mat_key.partition('.')[0]}.{de_key}"
        pseudobulk.varm[de_res_key] = res_df.to_records(index=False)
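
# A minimal usage sketch (assumes `pseudobulk` is a UnimodalData of sample-level counts with a
# categorical "gender" column in .obs, as in the docstring example; variable names are illustrative):
#
#     import pandas as pd
#     import pegasus as pg
#
#     pg.deseq2(pseudobulk, "~gender", ("gender", "female", "male"))
#     # Results are stored as a structured record array in .varm; wrap it in a DataFrame
#     # indexed by gene names for easier inspection.
#     res = pd.DataFrame(pseudobulk.varm["deseq2"], index=pseudobulk.var_names)
#     print(res.sort_values("padj").head())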
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool, append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])

        # select highly variable features
        standardize = False  # stays False if HVF selection is skipped
        if kwargs["select_hvf"]:
            if unidata.shape[1] <= kwargs["hvf_ngenes"]:
                logger.warning(
                    f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
                )
            else:
                standardize = True
                tools.highly_variable_features(
                    unidata,
                    kwargs["batch_attr"] if kwargs["batch_correction"] else None,
                    flavor=kwargs["hvf_flavor"],
                    n_top=kwargs["hvf_ngenes"],
                    n_jobs=kwargs["n_jobs"],
                )
                if kwargs["hvf_flavor"] == "pegasus":
                    if kwargs["plot_hvf"] is not None:
                        from pegasus.plotting import hvfplot
                        fig = hvfplot(unidata, return_fig=True)
                        fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

        n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
        if n_pc < kwargs["pca_n"]:
            logger.warning(
                f"UnimodalData {unidata.get_uid()} has a dimension ({unidata.shape[0]}, {unidata.shape[1]}) smaller than the specified number of PCs {kwargs['pca_n']}. Reducing the number of PCs to {n_pc}."
            )

        # Run PCA irrespective of which batch correction method will be applied
        tools.pca(
            unidata,
            n_components=n_pc,
            features="highly_variable_features",
            standardize=standardize,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
        )
        dim_key = "pca"

        if kwargs["nmf"] or (kwargs["batch_correction"] and kwargs["correction_method"] == "inmf"):
            n_nmf = min(kwargs["nmf_n"], unidata.shape[0], unidata.shape[1])
            if n_nmf < kwargs["nmf_n"]:
                logger.warning(
                    f"UnimodalData {unidata.get_uid()} has a dimension ({unidata.shape[0]}, {unidata.shape[1]}) smaller than the specified number of NMF components {kwargs['nmf_n']}. Reducing the number of NMF components to {n_nmf}."
                )

        if kwargs["nmf"]:
            if kwargs["batch_correction"] and kwargs["correction_method"] == "inmf":
                logger.warning("NMF is skipped because integrative NMF is run instead.")
            else:
                tools.nmf(
                    unidata,
                    n_components=n_nmf,
                    features="highly_variable_features",
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"],
                )

        if kwargs["batch_correction"]:
            if kwargs["correction_method"] == "harmony":
                dim_key = tools.run_harmony(
                    unidata,
                    batch=kwargs["batch_attr"],
                    rep="pca",
                    n_jobs=kwargs["n_jobs"],
                    n_clusters=kwargs["harmony_nclusters"],
                    random_state=kwargs["random_state"],
                )
            elif kwargs["correction_method"] == "inmf":
                dim_key = tools.integrative_nmf(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_nmf,
                    features="highly_variable_features",
                    lam=kwargs["inmf_lambda"],
                    n_jobs=kwargs["n_jobs"],
                    random_state=kwargs["random_state"],
                )
            elif kwargs["correction_method"] == "scanorama":
                dim_key = tools.run_scanorama(
                    unidata,
                    batch=kwargs["batch_attr"],
                    n_components=n_pc,
                    features="highly_variable_features",
                    standardize=standardize,
                    random_state=kwargs["random_state"],
                )
            else:
                raise ValueError(f"Unknown batch correction method {kwargs['correction_method']}!")

        # Find K nearest neighbors
        tools.neighbors(
            unidata,
            K=kwargs["K"],
            rep=dim_key,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

        if kwargs["calc_sigscore"] is not None:
            sig_files = kwargs["calc_sigscore"].split(",")
            for sig_file in sig_files:
                tools.calc_signature_score(unidata, sig_file)

        # calculate diffmap
        if (kwargs["fle"] or kwargs["net_fle"]):
            if not kwargs["diffmap"]:
                print("Turn on --diffmap option!")
                kwargs["diffmap"] = True

        if kwargs["diffmap"]:
            tools.diffmap(
                unidata,
                n_components=kwargs["diffmap_ndc"],
                rep=dim_key,
                solver=kwargs["diffmap_solver"],
                max_t=kwargs["diffmap_maxt"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
            )

        # calculate kBET
        if ("kBET" in kwargs) and kwargs["kBET"]:
            stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
                unidata,
                kwargs["kBET_batch"],
                rep=dim_key,
                K=kwargs["kBET_K"],
                alpha=kwargs["kBET_alpha"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
            )
            print(
                "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format(
                    stat_mean, pvalue_mean, accept_rate
                )
            )

        # clustering
        if kwargs["spectral_louvain"]:
            tools.cluster(
                unidata,
                algo="spectral_louvain",
                rep=dim_key,
                resolution=kwargs["spectral_louvain_resolution"],
                rep_kmeans=kwargs["spectral_louvain_basis"],
                n_clusters=kwargs["spectral_louvain_nclusters"],
                n_clusters2=kwargs["spectral_louvain_nclusters2"],
                n_init=kwargs["spectral_louvain_ninit"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
                class_label="spectral_louvain_labels",
            )

        if kwargs["spectral_leiden"]:
            tools.cluster(
                unidata,
                algo="spectral_leiden",
                rep=dim_key,
                resolution=kwargs["spectral_leiden_resolution"],
                rep_kmeans=kwargs["spectral_leiden_basis"],
                n_clusters=kwargs["spectral_leiden_nclusters"],
                n_clusters2=kwargs["spectral_leiden_nclusters2"],
                n_init=kwargs["spectral_leiden_ninit"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
                class_label="spectral_leiden_labels",
            )

        if kwargs["louvain"]:
            tools.cluster(
                unidata,
                algo="louvain",
                rep=dim_key,
                resolution=kwargs["louvain_resolution"],
                random_state=kwargs["random_state"],
                class_label=kwargs["louvain_class_label"],
            )

        if kwargs["leiden"]:
            tools.cluster(
                unidata,
                algo="leiden",
                rep=dim_key,
                resolution=kwargs["leiden_resolution"],
                n_iter=kwargs["leiden_niter"],
                random_state=kwargs["random_state"],
                class_label=kwargs["leiden_class_label"],
            )

        # visualization
        if kwargs["net_umap"]:
            tools.net_umap(
                unidata,
                rep=dim_key,
                n_jobs=kwargs["n_jobs"],
                n_neighbors=kwargs["umap_K"],
                min_dist=kwargs["umap_min_dist"],
                spread=kwargs["umap_spread"],
                random_state=kwargs["random_state"],
                select_frac=kwargs["net_ds_frac"],
                select_K=kwargs["net_ds_K"],
                select_alpha=kwargs["net_ds_alpha"],
                full_speed=kwargs["full_speed"],
                net_alpha=kwargs["net_l2"],
                polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
                polish_n_epochs=kwargs["net_umap_polish_nepochs"],
                out_basis=kwargs["net_umap_basis"],
            )

        if kwargs["net_fle"]:
            tools.net_fle(
                unidata,
                output_name,
                n_jobs=kwargs["n_jobs"],
                K=kwargs["fle_K"],
                full_speed=kwargs["full_speed"],
                target_change_per_node=kwargs["fle_target_change_per_node"],
                target_steps=kwargs["fle_target_steps"],
                is3d=False,
                memory=kwargs["fle_memory"],
                random_state=kwargs["random_state"],
                select_frac=kwargs["net_ds_frac"],
                select_K=kwargs["net_ds_K"],
                select_alpha=kwargs["net_ds_alpha"],
                net_alpha=kwargs["net_l2"],
                polish_target_steps=kwargs["net_fle_polish_target_steps"],
                out_basis=kwargs["net_fle_basis"],
            )

        if kwargs["tsne"]:
            tools.tsne(
                unidata,
                rep=dim_key,
                n_jobs=kwargs["n_jobs"],
                perplexity=kwargs["tsne_perplexity"],
                random_state=kwargs["random_state"],
                initialization=kwargs["tsne_init"],
            )

        if kwargs["umap"]:
            tools.umap(
                unidata,
                rep=dim_key,
                n_neighbors=kwargs["umap_K"],
                min_dist=kwargs["umap_min_dist"],
                spread=kwargs["umap_spread"],
                n_jobs=kwargs["n_jobs"],
                full_speed=kwargs["full_speed"],
                random_state=kwargs["random_state"],
            )

        if kwargs["fle"]:
            tools.fle(
                unidata,
                output_name,
                n_jobs=kwargs["n_jobs"],
                K=kwargs["fle_K"],
                full_speed=kwargs["full_speed"],
                target_change_per_node=kwargs["fle_target_change_per_node"],
                target_steps=kwargs["fle_target_steps"],
                is3d=False,
                memory=kwargs["fle_memory"],
                random_state=kwargs["random_state"],
            )

        if kwargs["infer_doublets"]:
            channel_attr = "Channel"
            if (channel_attr not in unidata.obs) or (unidata.obs["Channel"].cat.categories.size == 1):
                channel_attr = None
            clust_attr = kwargs["dbl_cluster_attr"]
            if (clust_attr is None) or (clust_attr not in unidata.obs):
                clust_attr = None
                for value in ["leiden_labels", "louvain_labels", "spectral_leiden_labels", "spectral_louvain_labels"]:
                    if value in unidata.obs:
                        clust_attr = value
                        break

            if channel_attr is not None:
                logger.info(f"For doublet inference, channel_attr={channel_attr}.")
            if clust_attr is not None:
                logger.info(f"For doublet inference, clust_attr={clust_attr}.")

            tools.infer_doublets(
                unidata,
                channel_attr=channel_attr,
                clust_attr=clust_attr,
                expected_doublet_rate=kwargs["expected_doublet_rate"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
                plot_hist=output_name,
            )

            dbl_clusts = None
            if clust_attr is not None:
                clusts = []
                for idx, row in unidata.uns["pred_dbl_cluster"].iterrows():
                    if row["percentage"] >= 50.0:
                        logger.info(
                            f"Cluster {row['cluster']} (percentage={row['percentage']:.2f}%, q-value={row['qval']:.6g}) is identified as a doublet cluster."
                        )
                        clusts.append(row["cluster"])
                if len(clusts) > 0:
                    dbl_clusts = f"{clust_attr}:{','.join(clusts)}"

            tools.mark_doublets(unidata, dbl_clusts=dbl_clusts)

        # calculate diffusion-based pseudotime from roots
        if len(kwargs["pseudotime"]) > 0:
            tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)), shape=(unidata.shape[0], append_data.shape[1])).tocsr()
        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            if kwargs["citeseq"]:
                append_df = append_df.copy()
                append_df.index = append_df.index.map(lambda x: f"Ab-{x}")

            rawX = hstack([unidata.get_matrix("counts"), Z], format="csr")

            Zt = Z.astype(np.float32)
            if not kwargs["citeseq"]:
                Zt.data *= np.repeat(unidata.obs["scale"].values, np.diff(Zt.indptr))
                Zt.data = np.log1p(Zt.data)
            else:
                Zt.data = np.arcsinh(Zt.data / 5.0, dtype=np.float32)
            X = hstack([unidata.get_matrix(unidata.current_matrix()), Zt], format="csr")

            new_genome = unidata.get_genome()
            if new_genome != append_data.get_genome():
                new_genome = f"{new_genome}_and_{append_data.get_genome()}"

            feature_metadata = pd.concat([unidata.feature_metadata, append_df], axis=0)
            feature_metadata.reset_index(inplace=True)
            _fillna(feature_metadata)
            unidata = UnimodalData(
                unidata.barcode_metadata,
                feature_metadata,
                {unidata.current_matrix(): X, "counts": rawX},
                unidata.uns.mapping,
                unidata.obsm.mapping,
                unidata.varm.mapping,
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

            if kwargs["citeseq"] and kwargs["citeseq_umap"]:
                umap_index = append_df.index.difference([f"Ab-{x}" for x in kwargs["citeseq_umap_exclude"]])
                unidata.obsm["X_citeseq"] = unidata.X[:, unidata.var_names.isin(umap_index)].toarray()
                tools.umap(
                    unidata,
                    rep="citeseq",
                    n_neighbors=kwargs["umap_K"],
                    min_dist=kwargs["umap_min_dist"],
                    spread=kwargs["umap_spread"],
                    n_jobs=kwargs["n_jobs"],
                    full_speed=kwargs["full_speed"],
                    random_state=kwargs["random_state"],
                    out_basis="citeseq_umap",
                )

    if kwargs["output_h5ad"]:
        import time
        start_time = time.perf_counter()
        adata = unidata.to_anndata()
        if "_tmp_fmat_highly_variable_features" in adata.uns:
            adata.uns["scale.data"] = adata.uns.pop("_tmp_fmat_highly_variable_features")  # assign by reference
            adata.uns["scale.data.rownames"] = unidata.var_names[unidata.var["highly_variable_features"] == True].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata
        end_time = time.perf_counter()
        logger.info(f"H5AD file {output_name}.h5ad is written. Time spent = {end_time - start_time:.2f}s.")

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data was used
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate objects starting with _tmp from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)
def analyze_one_modality(unidata: UnimodalData, output_name: str, is_raw: bool, append_data: UnimodalData, **kwargs) -> None:
    print()
    logger.info(f"Begin to analyze UnimodalData {unidata.get_uid()}.")

    if kwargs["channel_attr"] is not None:
        unidata.obs["Channel"] = unidata.obs[kwargs["channel_attr"]]

    if is_raw:
        # normalize counts and then transform to log space
        tools.log_norm(unidata, kwargs["norm_count"])

        # set group attribute
        if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
            tools.set_group_attribute(unidata, kwargs["group_attribute"])

        # select highly variable features
        standardize = False  # stays False if HVF selection is skipped
        if kwargs["select_hvf"]:
            if unidata.shape[1] <= kwargs["hvf_ngenes"]:
                logger.warning(
                    f"Number of genes {unidata.shape[1]} is no greater than the target number of highly variable features {kwargs['hvf_ngenes']}. HVF selection is omitted."
                )
            else:
                standardize = True
                tools.highly_variable_features(
                    unidata,
                    kwargs["batch_correction"],
                    flavor=kwargs["hvf_flavor"],
                    n_top=kwargs["hvf_ngenes"],
                    n_jobs=kwargs["n_jobs"],
                )
                if kwargs["hvf_flavor"] == "pegasus":
                    if kwargs["plot_hvf"] is not None:
                        from pegasus.plotting import hvfplot
                        fig = hvfplot(unidata, return_fig=True)
                        fig.savefig(f"{kwargs['plot_hvf']}.hvf.pdf")

        # batch correction: L/S
        if kwargs["batch_correction"] and kwargs["correction_method"] == "L/S":
            tools.correct_batch(unidata, features="highly_variable_features")

        if kwargs["calc_sigscore"] is not None:
            sig_files = kwargs["calc_sigscore"].split(",")
            for sig_file in sig_files:
                tools.calc_signature_score(unidata, sig_file)

        n_pc = min(kwargs["pca_n"], unidata.shape[0], unidata.shape[1])
        if n_pc < kwargs["pca_n"]:
            logger.warning(
                f"UnimodalData {unidata.get_uid()} has a dimension ({unidata.shape[0]}, {unidata.shape[1]}) smaller than the specified number of PCs {kwargs['pca_n']}. Reducing the number of PCs to {n_pc}."
            )

        if kwargs["batch_correction"] and kwargs["correction_method"] == "scanorama":
            pca_key = tools.run_scanorama(
                unidata,
                n_components=n_pc,
                features="highly_variable_features",
                standardize=standardize,
                random_state=kwargs["random_state"],
            )
        else:
            # PCA
            tools.pca(
                unidata,
                n_components=n_pc,
                features="highly_variable_features",
                standardize=standardize,
                robust=kwargs["pca_robust"],
                random_state=kwargs["random_state"],
            )
            pca_key = "pca"

        # batch correction: Harmony
        if kwargs["batch_correction"] and kwargs["correction_method"] == "harmony":
            pca_key = tools.run_harmony(
                unidata,
                rep="pca",
                n_jobs=kwargs["n_jobs"],
                n_clusters=kwargs["harmony_nclusters"],
                random_state=kwargs["random_state"],
            )

        # Find K nearest neighbors
        tools.neighbors(
            unidata,
            K=kwargs["K"],
            rep=pca_key,
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

        # calculate diffmap
        if (kwargs["fle"] or kwargs["net_fle"]):
            if not kwargs["diffmap"]:
                print("Turn on --diffmap option!")
                kwargs["diffmap"] = True

        if kwargs["diffmap"]:
            tools.diffmap(
                unidata,
                n_components=kwargs["diffmap_ndc"],
                rep=pca_key,
                solver=kwargs["diffmap_solver"],
                random_state=kwargs["random_state"],
                max_t=kwargs["diffmap_maxt"],
            )
            if kwargs["diffmap_to_3d"]:
                tools.reduce_diffmap_to_3d(unidata, random_state=kwargs["random_state"])

        # calculate kBET
        if ("kBET" in kwargs) and kwargs["kBET"]:
            stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
                unidata,
                kwargs["kBET_batch"],
                rep=pca_key,
                K=kwargs["kBET_K"],
                alpha=kwargs["kBET_alpha"],
                n_jobs=kwargs["n_jobs"],
                random_state=kwargs["random_state"],
            )
            print(
                "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format(
                    stat_mean, pvalue_mean, accept_rate
                )
            )

        # clustering
        if kwargs["spectral_louvain"]:
            tools.cluster(
                unidata,
                algo="spectral_louvain",
                rep=pca_key,
                resolution=kwargs["spectral_louvain_resolution"],
                rep_kmeans=kwargs["spectral_louvain_basis"],
                n_clusters=kwargs["spectral_louvain_nclusters"],
                n_clusters2=kwargs["spectral_louvain_nclusters2"],
                n_init=kwargs["spectral_louvain_ninit"],
                random_state=kwargs["random_state"],
                class_label="spectral_louvain_labels",
            )

        if kwargs["spectral_leiden"]:
            tools.cluster(
                unidata,
                algo="spectral_leiden",
                rep=pca_key,
                resolution=kwargs["spectral_leiden_resolution"],
                rep_kmeans=kwargs["spectral_leiden_basis"],
                n_clusters=kwargs["spectral_leiden_nclusters"],
                n_clusters2=kwargs["spectral_leiden_nclusters2"],
                n_init=kwargs["spectral_leiden_ninit"],
                random_state=kwargs["random_state"],
                class_label="spectral_leiden_labels",
            )

        if kwargs["louvain"]:
            tools.cluster(
                unidata,
                algo="louvain",
                rep=pca_key,
                resolution=kwargs["louvain_resolution"],
                random_state=kwargs["random_state"],
                class_label=kwargs["louvain_class_label"],
            )

        if kwargs["leiden"]:
            tools.cluster(
                unidata,
                algo="leiden",
                rep=pca_key,
                resolution=kwargs["leiden_resolution"],
                n_iter=kwargs["leiden_niter"],
                random_state=kwargs["random_state"],
                class_label=kwargs["leiden_class_label"],
            )

        # visualization
        if kwargs["net_tsne"]:
            tools.net_tsne(
                unidata,
                rep=pca_key,
                n_jobs=kwargs["n_jobs"],
                perplexity=kwargs["tsne_perplexity"],
                random_state=kwargs["random_state"],
                select_frac=kwargs["net_ds_frac"],
                select_K=kwargs["net_ds_K"],
                select_alpha=kwargs["net_ds_alpha"],
                net_alpha=kwargs["net_l2"],
                polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
                polish_n_iter=kwargs["net_tsne_polish_niter"],
                out_basis=kwargs["net_tsne_basis"],
            )

        if kwargs["net_umap"]:
            tools.net_umap(
                unidata,
                rep=pca_key,
                n_jobs=kwargs["n_jobs"],
                n_neighbors=kwargs["umap_K"],
                min_dist=kwargs["umap_min_dist"],
                spread=kwargs["umap_spread"],
                random_state=kwargs["random_state"],
                select_frac=kwargs["net_ds_frac"],
                select_K=kwargs["net_ds_K"],
                select_alpha=kwargs["net_ds_alpha"],
                full_speed=kwargs["full_speed"],
                net_alpha=kwargs["net_l2"],
                polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
                polish_n_epochs=kwargs["net_umap_polish_nepochs"],
                out_basis=kwargs["net_umap_basis"],
            )

        if kwargs["net_fle"]:
            tools.net_fle(
                unidata,
                output_name,
                n_jobs=kwargs["n_jobs"],
                K=kwargs["fle_K"],
                full_speed=kwargs["full_speed"],
                target_change_per_node=kwargs["fle_target_change_per_node"],
                target_steps=kwargs["fle_target_steps"],
                is3d=False,
                memory=kwargs["fle_memory"],
                random_state=kwargs["random_state"],
                select_frac=kwargs["net_ds_frac"],
                select_K=kwargs["net_ds_K"],
                select_alpha=kwargs["net_ds_alpha"],
                net_alpha=kwargs["net_l2"],
                polish_target_steps=kwargs["net_fle_polish_target_steps"],
                out_basis=kwargs["net_fle_basis"],
            )

        if kwargs["tsne"]:
            tools.tsne(
                unidata,
                rep=pca_key,
                n_jobs=kwargs["n_jobs"],
                perplexity=kwargs["tsne_perplexity"],
                random_state=kwargs["random_state"],
            )

        if kwargs["fitsne"]:
            tools.fitsne(
                unidata,
                rep=pca_key,
                n_jobs=kwargs["n_jobs"],
                perplexity=kwargs["tsne_perplexity"],
                random_state=kwargs["random_state"],
            )

        if kwargs["umap"]:
            tools.umap(
                unidata,
                rep=pca_key,
                n_neighbors=kwargs["umap_K"],
                min_dist=kwargs["umap_min_dist"],
                spread=kwargs["umap_spread"],
                random_state=kwargs["random_state"],
            )

        if kwargs["fle"]:
            tools.fle(
                unidata,
                output_name,
                n_jobs=kwargs["n_jobs"],
                K=kwargs["fle_K"],
                full_speed=kwargs["full_speed"],
                target_change_per_node=kwargs["fle_target_change_per_node"],
                target_steps=kwargs["fle_target_steps"],
                is3d=False,
                memory=kwargs["fle_memory"],
                random_state=kwargs["random_state"],
            )

        # calculate diffusion-based pseudotime from roots
        if len(kwargs["pseudotime"]) > 0:
            tools.calc_pseudotime(unidata, kwargs["pseudotime"])

    genome = unidata.uns["genome"]

    if append_data is not None:
        locs = unidata.obs_names.get_indexer(append_data.obs_names)
        idx = locs >= 0
        locs = locs[idx]
        Y = append_data.X[idx, :].tocoo(copy=False)
        Z = coo_matrix((Y.data, (locs[Y.row], Y.col)), shape=(unidata.shape[0], append_data.shape[1])).tocsr()
        idy = Z.getnnz(axis=0) > 0
        n_nonzero = idy.sum()
        if n_nonzero > 0:
            if n_nonzero < append_data.shape[1]:
                Z = Z[:, idy]
                append_df = append_data.feature_metadata.loc[idy, :]
            else:
                append_df = append_data.feature_metadata

            rawX = hstack([unidata.get_matrix("raw.X"), Z], format="csr")

            Zt = Z.astype(np.float32)
            Zt.data *= np.repeat(unidata.obs["scale"].values, np.diff(Zt.indptr))
            Zt.data = np.log1p(Zt.data)
            X = hstack([unidata.get_matrix("X"), Zt], format="csr")

            new_genome = unidata.get_genome() + "_and_" + append_data.get_genome()

            feature_metadata = pd.concat([unidata.feature_metadata, append_df], axis=0)
            feature_metadata.reset_index(inplace=True)
            feature_metadata.fillna(value=_get_fillna_dict(unidata.feature_metadata), inplace=True)
            unidata = UnimodalData(
                unidata.barcode_metadata,
                feature_metadata,
                {"X": X, "raw.X": rawX},
                unidata.uns.mapping,
                unidata.obsm.mapping,
                unidata.varm.mapping,
            )  # uns.mapping, obsm.mapping and varm.mapping are passed by reference
            unidata.uns["genome"] = new_genome

    if kwargs["output_h5ad"]:
        adata = unidata.to_anndata()
        adata.uns["scale.data"] = adata.uns.pop("_tmp_fmat_highly_variable_features")  # assign by reference
        adata.uns["scale.data.rownames"] = unidata.var_names[unidata.var["highly_variable_features"]].values
        adata.write(f"{output_name}.h5ad", compression="gzip")
        del adata

    # write out results
    if kwargs["output_loom"]:
        write_output(unidata, f"{output_name}.loom")

    # Change genome name back if append_data was used
    if unidata.uns["genome"] != genome:
        unidata.uns["genome"] = genome
    # Eliminate objects starting with _tmp_fmat from uns
    unidata.uns.pop("_tmp_fmat_highly_variable_features", None)