def plot_down_sampling(
    rna_file,
    adt_file,
    out_file,
    probs=[i / 10.0 for i in range(9, 0, -1)],
    n_threads=1,
    dpi=500,
    figsize=None,
):
    data_gt = read_input(rna_file)
    adt_gt = read_input(adt_file)
    fracs, accuracy = down_sampling(data_gt, adt_gt, probs, n_threads=n_threads)
    plt.plot(fracs, accuracy, ".-")
    ax = plt.gca()
    ax.set_xlim(1.0, 0.0)
    ax.set_ylim(0.79, 1.01)
    vals = ax.get_yticks()
    ax.set_yticklabels(["{:.0%}".format(v) for v in vals])
    ax.set_xlabel("Fraction of hashtag UMIs")
    ax.set_ylabel("Consistency")
    if figsize is not None:
        plt.gcf().set_size_inches(*figsize)
    plt.savefig(out_file, dpi=dpi)
    plt.close()
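
# A minimal usage sketch for plot_down_sampling (the file names below are
# hypothetical; any demultiplexed RNA h5ad plus matching hashtag ADT h5ad
# pair works). It plots demultiplexing consistency against the fraction of
# hashtag UMIs retained:
#
#   plot_down_sampling(
#       "sample_demux.h5ad",
#       "sample_ADTs.h5ad",
#       "sample.down_sampling.pdf",
#       n_threads=4,
#       figsize=(6, 4),
#   )
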
def show_attributes(
    input_file: str,
    show_attributes: bool,
    show_gene_attributes: bool,
    show_values_for_attributes: str,
) -> None:
    """ Show data attributes. For command line use.
    """
    data = read_input(input_file, h5ad_mode="r")
    if show_attributes:
        print(
            "Available sample attributes in input dataset: {0}".format(
                ", ".join(data.obs.columns.values)
            )
        )
    if show_gene_attributes:
        print(
            "Available gene attributes in input dataset: {0}".format(
                ", ".join(data.var.columns.values)
            )
        )
    if show_values_for_attributes is not None:
        for attr in show_values_for_attributes.split(","):
            print(
                "Available values for attribute {0}: {1}.".format(
                    attr, ", ".join(np.unique(data.obs[attr]))
                )
            )
def run_annotate_cluster(
    input_file: str,
    output_file: str,
    marker_file: str,
    de_test: str,
    de_alpha: float = 0.05,
    de_key: str = "de_res",
    threshold: float = 0.5,
    ignore_nonde: bool = False,
) -> None:
    """ For command line use.
    """
    import time
    from sccloud.io import read_input

    start = time.time()
    data = read_input(input_file, h5ad_mode="r")
    infer_cell_types(
        data,
        marker_file,
        de_test,
        de_alpha=de_alpha,
        de_key=de_key,
        threshold=threshold,
        ignore_nonde=ignore_nonde,
        output_file=output_file,
    )
    data.file.close()
    end = time.time()
    logger.info("Time spent for annotating clusters is {:.2f}s.".format(end - start))
def make_interactive_plots(input_file, plot_type, output_file, **kwargs):
    adata = read_input(input_file, h5ad_mode="r")
    basis = transform_basis(plot_type)
    if plot_type == "diffmap" or plot_type == "diffmap_pca":
        df = pd.DataFrame(
            adata.obsm["X_{}".format(plot_type)][:, 0:3],
            index=adata.obs.index,
            columns=[basis + i for i in ["1", "2", "3"]],
        )
        if kwargs["isgene"]:
            coln = adata.var.index.get_loc(kwargs["attr"])
            df.insert(0, "Annotation", adata.X[:, coln].toarray().ravel())
        else:
            df.insert(0, "Annotation", adata.obs[kwargs["attr"]])
        if not kwargs["isreal"]:
            iplot_library.scatter3d(df, output_file)
        else:
            iplot_library.scatter3d_real(df, output_file, kwargs["log10"])
    else:
        df = pd.DataFrame(
            adata.obsm["X_{}".format(plot_type)],
            index=adata.obs.index,
            columns=[basis + i for i in ["1", "2"]],
        )
        if kwargs["isgene"]:
            coln = adata.var.index.get_loc(kwargs["attr"])
            df.insert(0, "Annotation", adata.X[:, coln].toarray().ravel())
        else:
            df.insert(0, "Annotation", adata.obs[kwargs["attr"]])
        if not kwargs["isreal"]:
            iplot_library.scatter(df, output_file)
        else:
            iplot_library.scatter_real(df, output_file, kwargs["log10"])
    print(output_file + " is generated.")
    adata.file.close()
def merge_rna_and_adt_data(input_raw_h5, input_csv, antibody_control_csv, output_name):
    data = read_input(input_raw_h5, return_type="MemData")
    print("Loaded the RNA matrix.")

    keyword = "CITE_Seq_" + data.listKeys()[0]
    data_citeseq = read_input(input_csv, return_type="MemData", genome=keyword)
    print("Loaded the ADT matrix.")

    array2d = data_citeseq.getData(keyword)
    if antibody_control_csv is None:
        array2d.matrix = array2d.matrix.log1p()
    else:
        size = array2d.feature_metadata.shape[0]
        idx = np.zeros(size, dtype=bool)
        antibody_to_pos = pd.Series(
            data=range(size), index=array2d.feature_metadata.index
        )

        adt_matrix = array2d.matrix.toarray().astype(float)

        series = pd.read_csv(antibody_control_csv, header=0, index_col=0, squeeze=True)
        for antibody, control in series.items():
            pos_a = antibody_to_pos[antibody]
            pos_c = antibody_to_pos[control]
            idx[pos_a] = True
            # convert to log expression
            adt_matrix[:, pos_a] = np.maximum(
                np.log(adt_matrix[:, pos_a] + 1.0) - np.log(adt_matrix[:, pos_c] + 1.0),
                0.0,
            )

        array2d.feature_metadata = array2d.feature_metadata[idx]
        array2d.matrix = csr_matrix(adt_matrix[:, idx])

    data.addData(keyword, array2d)
    write_output(data, output_name)
    print("Merged output is written.")
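
# Sketch of the expected antibody_control_csv layout (the header names are
# illustrative; the code only relies on column 0 holding the antibody name
# and column 1 holding its control antibody name):
#
#   Antibody,Control
#   CD3,IgG1_control
#   CD8,IgG1_control
#
# Each listed antibody's counts are transformed to
# max(log(antibody + 1) - log(control + 1), 0), i.e. a log fold change over
# its control, floored at zero; control antibodies themselves are dropped
# from the output matrix.
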
def annotate_anndata_object(input_file: str, annotation: str) -> None:
    """ For command line use.
    annotation: anno_name:clust_name:cell_type1;...cell_typen
    """
    from sccloud.io import read_input, write_output

    data = read_input(input_file, h5ad_mode="r+")
    anno_name, clust_name, anno_str = annotation.split(":")
    anno_dict = {str(i + 1): x for i, x in enumerate(anno_str.split(";"))}
    annotate(data, anno_name, clust_name, anno_dict)
    write_output(data, input_file, whitelist=["obs"])
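
# Example annotation string (the cell type names are hypothetical). The i-th
# cell type in the semicolon-separated list is assigned to cluster label
# str(i + 1) of the clustering attribute named in the middle field:
#
#   annotate_anndata_object(
#       "example.h5ad",
#       "anno:louvain_labels:T cells;B cells;NK cells;Monocytes",
#   )
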
def run_conversion(input_h5ad_file, output_name, nthreads):
    start = time.time()
    data = read_input(input_h5ad_file)
    end = time.time()
    print(
        "Time spent for loading the expression matrix is {:.2f}s.".format(end - start)
    )

    start = time.time()
    convert_to_parquet(data, output_name, nthreads)
    end = time.time()
    print("Time spent on generating the PARQUET file is {:.2f}s.".format(end - start))
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    result_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: str = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """ For command line only.
    """
    start = time.time()

    from sccloud.io import read_input, write_output

    data = read_input(input_file, h5ad_mode="r+")

    de_analysis(
        data,
        cluster,
        result_key=result_key,
        n_jobs=n_jobs,
        auc=auc,
        t=t,
        fisher=fisher,
        mwu=mwu,
        temp_folder=temp_folder,
        verbose=verbose,
    )
    write_output(data, input_file, whitelist=["varm/{}".format(result_key)])
    logger.info(
        "Differential expression results are written to varm/{} in h5ad file.".format(
            result_key
        )
    )

    results = markers(data, de_key=result_key, alpha=alpha)

    write_results_to_excel(results, output_excel_file, ndigits=ndigits)

    end = time.time()
    logger.info("run_de_analysis is finished in {:.2f}s.".format(end - start))
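
# A minimal call sketch for run_de_analysis (hypothetical file names), using
# the default AUROC and t tests on clusters stored in obs["louvain_labels"]
# and writing significant markers (FDR alpha = 0.05) to an Excel workbook:
#
#   run_de_analysis(
#       "example.h5ad",
#       "example.de.xlsx",
#       "louvain_labels",
#       fisher=True,  # additionally run Fisher's exact test
#   )
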
def make_static_plots(input_file, plot_type, output_file, dpi=500, **kwargs):
    adata = read_input(input_file, h5ad_mode="r")
    if plot_type == "qc_violin":
        if kwargs["attr"] is None:
            plot_qc_violin(
                adata,
                kwargs["qc_type"],
                output_file,
                xattr=kwargs["cluster"],
                xlabel=kwargs["cluster"],
                xtick_font=kwargs["qc_xtick_font"],
                xtick_rotation=kwargs["qc_xtick_rotation"],
                figsize=kwargs["subplot_size"],
                linewidth=kwargs["qc_line_width"],
            )
        else:
            plot_qc_violin(
                adata,
                kwargs["qc_type"],
                output_file,
                xattr=kwargs["cluster"],
                hue=kwargs["attr"],
                xlabel=kwargs["cluster"],
                xtick_font=kwargs["qc_xtick_font"],
                xtick_rotation=kwargs["qc_xtick_rotation"],
                split=True,
                figsize=kwargs["subplot_size"],
                linewidth=kwargs["qc_line_width"],
            )
    else:
        fig = getattr(plot_library, "plot_" + plot_type)(adata, **kwargs)
        fig.savefig(output_file, dpi=dpi)
    print(output_file + " is generated.")
    adata.file.close()
def run_pipeline(input_file, output_name, **kwargs):
    is_raw = not kwargs["processed"]

    if "seurat_compatible" not in kwargs:
        kwargs["seurat_compatible"] = False

    # load input data
    adata = io.read_input(
        input_file,
        genome=kwargs["genome"],
        concat_matrices=False if kwargs["cite_seq"] else True,
        h5ad_mode=("a" if (is_raw or kwargs["subcluster"]) else "r+"),
        select_singlets=kwargs["select_singlets"],
        channel_attr=kwargs["channel_attr"],
        black_list=(
            kwargs["black_list"].split(",") if kwargs["black_list"] is not None else []
        ),
    )

    if not kwargs["cite_seq"]:
        if is_raw:
            values = adata.X.getnnz(axis=1)
            if values.min() == 0:  # 10x raw data
                adata._inplace_subset_obs(values >= kwargs["min_genes_on_raw"])
    else:
        data_list = adata
        assert len(data_list) == 2
        adata = cdata = None
        for i in range(len(data_list)):
            if data_list[i].uns["genome"].startswith("CITE_Seq"):
                cdata = data_list[i]
            else:
                adata = data_list[i]
        assert adata is not None and cdata is not None

    print("Inputs are loaded.")

    if kwargs["seurat_compatible"]:
        assert is_raw and kwargs["select_hvf"]

    if kwargs["subcluster"]:
        adata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
        is_raw = True  # get submat and then set is_raw to True

    if is_raw:
        if not kwargs["subcluster"]:
            # filter out low quality cells/genes
            tools.run_filter_data(
                adata,
                output_filt=kwargs["output_filt"],
                plot_filt=kwargs["plot_filt"],
                plot_filt_figsize=kwargs["plot_filt_figsize"],
                mito_prefix=kwargs["mito_prefix"],
                min_genes=kwargs["min_genes"],
                max_genes=kwargs["max_genes"],
                min_umis=kwargs["min_umis"],
                max_umis=kwargs["max_umis"],
                percent_mito=kwargs["percent_mito"],
                percent_cells=kwargs["percent_cells"],
            )

            if kwargs["seurat_compatible"]:
                raw_data = adata.copy()  # raw as count

            # normalize counts and then transform to log space
            tools.log_norm(adata, kwargs["norm_count"])

            # set group attribute
            if kwargs["batch_correction"] and kwargs["group_attribute"] is not None:
                tools.set_group_attribute(adata, kwargs["group_attribute"])

        # select highly variable features
        if kwargs["select_hvf"]:
            tools.highly_variable_features(
                adata,
                kwargs["batch_correction"],
                flavor=kwargs["hvf_flavor"],
                n_top=kwargs["hvf_ngenes"],
                n_jobs=kwargs["n_jobs"],
            )
            if kwargs["hvf_flavor"] == "sccloud":
                if kwargs["plot_hvf"] is not None:
                    from sccloud.plotting import plot_hvf

                    robust_idx = adata.var["robust"].values
                    plot_hvf(
                        adata.var.loc[robust_idx, "mean"],
                        adata.var.loc[robust_idx, "var"],
                        adata.var.loc[robust_idx, "hvf_loess"],
                        adata.var.loc[robust_idx, "highly_variable_features"],
                        kwargs["plot_hvf"] + ".hvf.pdf",
                    )

        # batch correction
        if kwargs["batch_correction"]:
            tools.correct_batch(adata, features="highly_variable_features")

        # PCA
        tools.pca(
            adata,
            n_components=kwargs["nPC"],
            features="highly_variable_features",
            random_state=kwargs["random_state"],
        )

        # Find K neighbors
        tools.neighbors(
            adata,
            K=kwargs["K"],
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            full_speed=kwargs["full_speed"],
        )

    # calculate diffmap
    if kwargs["fle"] or kwargs["net_fle"]:
        if not kwargs["diffmap"]:
            print("Turn on --diffmap option!")
            kwargs["diffmap"] = True

    if kwargs["diffmap"]:
        tools.diffmap(
            adata,
            n_components=kwargs["diffmap_ndc"],
            rep="pca",
            solver=kwargs["diffmap_solver"],
            random_state=kwargs["random_state"],
            max_t=kwargs["diffmap_maxt"],
        )
        if kwargs["diffmap_to_3d"]:
            tools.reduce_diffmap_to_3d(adata, random_state=kwargs["random_state"])

    # calculate kBET
    if ("kBET" in kwargs) and kwargs["kBET"]:
        stat_mean, pvalue_mean, accept_rate = tools.calc_kBET(
            adata,
            kwargs["kBET_batch"],
            K=kwargs["kBET_K"],
            alpha=kwargs["kBET_alpha"],
            n_jobs=kwargs["n_jobs"],
        )
        print(
            "kBET stat_mean = {:.2f}, pvalue_mean = {:.4f}, accept_rate = {:.2%}.".format(
                stat_mean, pvalue_mean, accept_rate
            )
        )

    # clustering
    if kwargs["spectral_louvain"]:
        tools.spectral_louvain(
            adata,
            rep="pca",
            resolution=kwargs["spectral_louvain_resolution"],
            rep_kmeans=kwargs["spectral_louvain_basis"],
            n_clusters=kwargs["spectral_louvain_nclusters"],
            n_init=kwargs["spectral_louvain_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            temp_folder=kwargs["temp_folder"],
            class_label="spectral_louvain_labels",
        )

    if kwargs["spectral_leiden"]:
        tools.spectral_leiden(
            adata,
            rep="pca",
            resolution=kwargs["spectral_leiden_resolution"],
            rep_kmeans=kwargs["spectral_leiden_basis"],
            n_clusters=kwargs["spectral_leiden_nclusters"],
            n_init=kwargs["spectral_leiden_ninit"],
            n_jobs=kwargs["n_jobs"],
            random_state=kwargs["random_state"],
            temp_folder=kwargs["temp_folder"],
            class_label="spectral_leiden_labels",
        )

    if kwargs["louvain"]:
        tools.louvain(
            adata,
            rep="pca",
            resolution=kwargs["louvain_resolution"],
            random_state=kwargs["random_state"],
            class_label=kwargs["louvain_class_label"],
        )

    if kwargs["leiden"]:
        tools.leiden(
            adata,
            rep="pca",
            resolution=kwargs["leiden_resolution"],
            n_iter=kwargs["leiden_niter"],
            random_state=kwargs["random_state"],
            class_label=kwargs["leiden_class_label"],
        )

    # visualization
    if kwargs["net_tsne"]:
        tools.net_tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_learning_frac=kwargs["net_tsne_polish_learing_frac"],
            polish_n_iter=kwargs["net_tsne_polish_niter"],
            out_basis=kwargs["net_tsne_basis"],
        )

    if kwargs["net_umap"]:
        tools.net_umap(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            full_speed=kwargs["full_speed"],
            net_alpha=kwargs["net_l2"],
            polish_learning_rate=kwargs["net_umap_polish_learing_rate"],
            polish_n_epochs=kwargs["net_umap_polish_nepochs"],
            out_basis=kwargs["net_umap_basis"],
        )

    if kwargs["net_fle"]:
        tools.net_fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
            select_frac=kwargs["net_ds_frac"],
            select_K=kwargs["net_ds_K"],
            select_alpha=kwargs["net_ds_alpha"],
            net_alpha=kwargs["net_l2"],
            polish_target_steps=kwargs["net_fle_polish_target_steps"],
            out_basis=kwargs["net_fle_basis"],
        )

    if kwargs["tsne"]:
        tools.tsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fitsne"]:
        tools.fitsne(
            adata,
            rep="pca",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
        )

    if kwargs["umap"]:
        tools.umap(
            adata,
            rep="pca",
            n_neighbors=kwargs["umap_K"],
            min_dist=kwargs["umap_min_dist"],
            spread=kwargs["umap_spread"],
            random_state=kwargs["random_state"],
        )

    if kwargs["fle"]:
        tools.fle(
            adata,
            output_name,
            n_jobs=kwargs["n_jobs"],
            K=kwargs["fle_K"],
            full_speed=kwargs["full_speed"],
            target_change_per_node=kwargs["fle_target_change_per_node"],
            target_steps=kwargs["fle_target_steps"],
            is3d=False,
            memory=kwargs["fle_memory"],
            random_state=kwargs["random_state"],
        )

    # calculate diffusion-based pseudotime from roots
    if len(kwargs["pseudotime"]) > 0:
        tools.calc_pseudotime(adata, kwargs["pseudotime"])

    # merge cite-seq data and run t-SNE
    if kwargs["cite_seq"]:
        adt_matrix = np.zeros((adata.shape[0], cdata.shape[1]), dtype="float32")
        idx = adata.obs_names.isin(cdata.obs_names)
        adt_matrix[idx, :] = cdata[adata.obs_names[idx],].X.toarray()
        if abs(100.0 - kwargs["cite_seq_capping"]) > 1e-4:
            cite_seq.capping(adt_matrix, kwargs["cite_seq_capping"])

        var_names = np.concatenate(
            [adata.var_names, ["AD-" + x for x in cdata.var_names]]
        )

        new_data = anndata.AnnData(
            X=hstack([adata.X, csr_matrix(adt_matrix)], format="csr"),
            obs=adata.obs,
            obsm=adata.obsm,
            uns=adata.uns,
            var={
                "var_names": var_names,
                "gene_ids": var_names,
                "n_cells": np.concatenate(
                    [adata.var["n_cells"].values, [0] * cdata.shape[1]]
                ),
                "percent_cells": np.concatenate(
                    [adata.var["percent_cells"].values, [0.0] * cdata.shape[1]]
                ),
                "robust": np.concatenate(
                    [adata.var["robust"].values, [False] * cdata.shape[1]]
                ),
                "highly_variable_features": np.concatenate(
                    [
                        adata.var["highly_variable_features"].values,
                        [False] * cdata.shape[1],
                    ]
                ),
            },
        )
        new_data.obsm["X_CITE-Seq"] = adt_matrix
        adata = new_data
        print("ADT count matrix is attached.")

        tools.fitsne(
            adata,
            rep="CITE-Seq",
            n_jobs=kwargs["n_jobs"],
            perplexity=kwargs["tsne_perplexity"],
            random_state=kwargs["random_state"],
            out_basis="citeseq_fitsne",
        )
        print("Antibody embedding is done.")

    if kwargs["seurat_compatible"]:
        seurat_data = adata.copy()
        seurat_data.raw = raw_data
        seurat_data.uns["scale.data"] = adata.uns["fmat_highly_variable_features"]
        seurat_data.uns["scale.data.rownames"] = adata.var_names[
            adata.var["highly_variable_features"]
        ].values
        io.write_output(seurat_data, output_name + ".seurat.h5ad")

    # write out results
    io.write_output(adata, output_name + ".h5ad")

    if kwargs["output_loom"]:
        io.write_output(adata, output_name + ".loom")

    print("Results are written.")
def run_demuxEM_pipeline(input_adt_file, input_rna_file, output_name, **kwargs):
    # load input data
    adt = io.read_input(input_adt_file, genome="_ADT_")
    print("ADT file is loaded.")
    data = io.read_input(input_rna_file, genome=kwargs["genome"], concat_matrices=True)
    print("RNA file is loaded.")

    # Filter the RNA matrix
    data.obs["n_genes"] = data.X.getnnz(axis=1)
    data.obs["n_counts"] = data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce(
        (
            data.obs["n_genes"] >= kwargs["min_num_genes"],
            data.obs["n_counts"] >= kwargs["min_num_umis"],
        )
    )
    data._inplace_subset_obs(obs_index)
    data.var["robust"] = True

    # run demuxEM
    demuxEM.estimate_background_probs(adt, random_state=kwargs["random_state"])
    print("Background probability distribution is estimated.")
    demuxEM.demultiplex(
        data,
        adt,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )
    print("Demultiplexing is done.")

    # annotate raw matrix with demuxEM results
    genome_indexed_raw_data = io.read_input(
        input_rna_file, return_type="MemData", concat_matrices=False
    )
    for keyword in genome_indexed_raw_data.listKeys():
        array2d = genome_indexed_raw_data.getData(keyword)
        barcodes = array2d.barcode_metadata.index
        idx = barcodes.isin(data.obs_names)
        selected = barcodes[idx]

        demux_type = np.empty(barcodes.size, dtype="object")
        demux_type[:] = ""
        demux_type[idx] = data.obs.loc[selected, "demux_type"]
        array2d.barcode_metadata["demux_type"] = demux_type

        assignment = np.empty(barcodes.size, dtype="object")
        assignment[:] = ""
        assignment[idx] = data.obs.loc[selected, "assignment"]
        array2d.barcode_metadata["assignment"] = assignment

        if "assignment.dedup" in data.obs:
            assignment_dedup = np.empty(barcodes.size, dtype="object")
            assignment_dedup[:] = ""
            assignment_dedup[idx] = data.obs.loc[selected, "assignment.dedup"]
            array2d.barcode_metadata["assignment.dedup"] = assignment_dedup

    print("Demultiplexing results are added to raw expression matrices.")

    # generate plots
    if kwargs["gen_plots"]:
        demuxEM.plot_adt_hist(
            adt, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0
        )
        demuxEM.plot_bar(
            adt.uns["background_probs"],
            adt.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        demuxEM.plot_adt_hist(
            adt, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5
        )
        demuxEM.plot_rna_hist(data, output_name + ".rna_demux.hist.pdf")
        print("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        tools.log_norm(data, 1e5)
        for gene_name in kwargs["gen_gender_plot"]:
            demuxEM.plot_violin(
                data,
                {"gene": gene_name},
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name
                ),
                title="{gene_name}: a gender-specific gene".format(
                    gene_name=gene_name
                ),
            )
        print("Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(adt, output_name + "_ADTs.h5ad")
    print(
        "Hashtag count information is written to {output_name}_ADTs.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(data, output_name + "_demux.h5ad")
    print(
        "Demultiplexed RNA expression information is written to {output_name}_demux.h5ad .".format(
            output_name=output_name
        )
    )
    io.write_output(genome_indexed_raw_data, output_name + "_demux")
    print(
        "Raw sccloud-format hdf5 file with demultiplexing results is written to {output_name}_demux.h5sc .".format(
            output_name=output_name
        )
    )

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(data.shape[0]))
    for name, value in data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))
def aggregate_matrices(
    csv_file: str,
    what_to_return: str = "AnnData",
    restrictions: List[str] = [],
    attributes: List[str] = [],
    google_cloud: bool = False,
    select_singlets: bool = False,
    ngene: int = None,
    concat_matrices: bool = False,
) -> "None or AnnData or MemData":
    """Aggregate channel-specific count matrices into one big count matrix.

    This function takes as input a csv_file, which contains at least two columns: Sample, the sample name, and Location, the file that contains the count matrices (e.g. filtered_gene_bc_matrices_h5.h5). It merges matrices from the same genome together. Depending on what_to_return, it either writes the merged results into a sccloud-formatted HDF5 file or returns them as an AnnData or MemData object.

    Parameters
    ----------

    csv_file : `str`
        The CSV file containing information about each channel.
    what_to_return : `str`, optional (default: 'AnnData')
        If this value is 'AnnData' or 'MemData', an AnnData or MemData object will be returned. Otherwise, results will be written into the 'what_to_return.sccloud.h5' file and None is returned.
    restrictions : `list[str]`, optional (default: [])
        A list of restrictions used to select channels. Each restriction takes the format of name:value,...,value or name:~value,...,value, where ~ refers to not.
    attributes : `list[str]`, optional (default: [])
        A list of attributes to be incorporated into the output count matrix.
    google_cloud : `bool`, optional (default: False)
        If the channel-specific count matrices are stored in a google bucket.
    select_singlets : `bool`, optional (default: False)
        If we have demultiplexed data, turning on this option will make sccloud only include barcodes that are predicted as singlets.
    ngene : `int`, optional (default: None)
        The minimum number of expressed genes to keep one barcode.
    concat_matrices : `bool`, optional (default: False)
        Whether to concatenate multiple matrices. If True, only one AnnData object is returned; otherwise, a list of AnnData objects might be returned.

    Returns
    -------
    `None` or `AnnData` or `MemData`
        Either `None` or an `AnnData` object or a `MemData` object.

    Examples
    --------
    >>> scc.aggregate_matrices('example.csv', 'example_10x.h5', ['Source:pbmc', 'Donor:1'], ['Source', 'Platform', 'Donor'])
    """
    df = pd.read_csv(csv_file, header=0, index_col="Sample")
    df["Sample"] = df.index

    # Select channels
    rvec = [parse_restriction_string(x) for x in restrictions]

    idx = pd.Series([True] * df.shape[0], index=df.index, name="selected")
    for name, isin, content in rvec:
        assert name in df.columns
        if isin:
            idx = idx & df[name].isin(content)
        else:
            idx = idx & (~(df[name].isin(content)))

    df = df.loc[idx]

    if df.shape[0] == 0:
        raise ValueError("No channels pass the restrictions!")

    # Load channels
    tot = 0
    aggrData = MemData()
    dest_paths = []
    for sample_name, row in df.iterrows():
        input_file = os.path.expanduser(
            os.path.expandvars(row["Location"].rstrip(os.sep))
        )
        file_format, copy_path, copy_type = infer_file_format(input_file)
        if google_cloud:
            base_name = os.path.basename(copy_path)
            dest_path = sample_name + "_tmp_" + base_name
            if copy_type == "directory":
                check_call(["mkdir", "-p", dest_path])
                call_args = ["gsutil", "-m", "cp", "-r", copy_path, dest_path]
            else:
                call_args = ["gsutil", "-m", "cp", copy_path, dest_path]
            check_call(call_args)
            dest_paths.append(dest_path)

            input_file = dest_path
            if file_format == "csv" and copy_type == "directory":
                input_file = os.path.join(dest_path, os.path.basename(input_file))

        genome = None
        if file_format in ["dge", "csv", "mtx", "loom"]:
            assert "Reference" in row
            genome = row["Reference"]

        data = read_input(
            input_file,
            genome=genome,
            return_type="MemData",
            ngene=ngene,
            select_singlets=select_singlets,
        )
        data.update_barcode_metadata_info(sample_name, row, attributes)
        aggrData.addAggrData(data)

        tot += 1
        print("Processed {}.".format(input_file))

    # Delete temporary files
    for dest_path in dest_paths:
        check_call(["rm", "-rf", dest_path])

    # Merge channels
    t1 = time.time()
    aggrData.aggregate()
    t2 = time.time()
    print("Data aggregation is finished in {:.2f}s.".format(t2 - t1))

    if what_to_return == "AnnData":
        aggrData = aggrData.convert_to_anndata(concat_matrices)
    elif what_to_return != "MemData":
        write_output(aggrData, what_to_return)
        aggrData = None

    print("Aggregated {tot} files.".format(tot=tot))

    return aggrData
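
# Sketch of a minimal aggregation CSV (the values are hypothetical; Sample and
# Location are required, Reference is only needed for dge/csv/mtx/loom inputs,
# and any extra column can be used in restrictions or attributes):
#
#   Sample,Source,Platform,Donor,Reference,Location
#   sample_1,pbmc,10x,1,GRCh38,gs://my-bucket/sample_1/filtered_gene_bc_matrices_h5.h5
#   sample_2,pbmc,10x,2,GRCh38,gs://my-bucket/sample_2/filtered_gene_bc_matrices_h5.h5
#
# With restrictions=['Donor:1'] only sample_1 is kept; with
# restrictions=['Donor:~1'] it is excluded instead.
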
def run_find_markers(
    input_h5ad_file: str,
    output_file: str,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> None:
    """ For command line use.
    """
    import xlsxwriter
    from natsort import natsorted

    data = read_input(input_h5ad_file)
    markers = find_markers(
        data,
        label_attr,
        de_key=de_key,
        n_jobs=n_jobs,
        min_gain=min_gain,
        random_state=random_state,
        remove_ribo=remove_ribo,
    )

    keywords = [("strong", "strong_gain"), ("weak", "weak_gain"), ("down", "down_gain")]

    writer = pd.ExcelWriter(output_file, engine="xlsxwriter")
    for clust_id in natsorted(markers.keys()):
        clust_markers = markers[clust_id]

        sizes = []
        for keyword in keywords:
            sizes.append(len(clust_markers[keyword[0]]))

        arr = np.zeros((max(sizes), 8), dtype=object)
        arr[:] = ""

        for i in range(3):
            arr[0:sizes[i], i * 3] = clust_markers[keywords[i][0]]
            arr[0:sizes[i], i * 3 + 1] = clust_markers[keywords[i][1]]

        df = pd.DataFrame(
            data=arr,
            columns=[
                "strongly up-regulated",
                "gain",
                "",
                "weakly up-regulated",
                "gain",
                "",
                "down-regulated",
                "gain",
            ],
        )
        df.to_excel(writer, sheet_name=clust_id, index=False)
    writer.save()
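
# A minimal call sketch for run_find_markers (hypothetical file names). One
# Excel sheet is written per cluster, with strong, weak, and down-regulated
# markers and their gain scores laid out side by side:
#
#   run_find_markers(
#       "example.h5ad",
#       "example.markers.xlsx",
#       "louvain_labels",
#       remove_ribo=True,  # drop ribosomal genes before finding markers
#   )
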