def test_zarr(self):
    """Round-trip a Zarr dataset through write/read and check its metadata."""
    source_path = "pegasusio-test-data/case4/MantonBM1_1_dbls.zarr"
    roundtrip_path = "pegasusio-test-data/case4/MantonBM_out.zarr"

    # Write the loaded data back out, then inspect the freshly written copy.
    io.write_output(io.read_input(source_path), roundtrip_path)
    reloaded = io.read_input(roundtrip_path)

    self.assertEqual(reloaded.shape, (4274, 19360), "Count matrix shape differs!")
    self.assertEqual(reloaded.get_genome(), "GRCh38", "Genome differs!")
    self.assertEqual(reloaded.get_modality(), "rna", "Modality differs!")
def test_loom(self):
    """Round-trip a loom dataset through write/read and check its metadata."""
    source_path = "pegasusio-test-data/case3/pancreas.loom"
    roundtrip_path = "pegasusio-test-data/case3/pancreas_out.loom"

    # The genome is supplied explicitly on the first read only; the re-read
    # must recover it from the written file.
    original = io.read_input(source_path, genome='hg19')
    io.write_output(original, roundtrip_path)
    reloaded = io.read_input(roundtrip_path)

    self.assertEqual(reloaded.shape, (2544, 58347), "Count matrix shape differs!")
    self.assertEqual(reloaded.get_genome(), "hg19", "Genome differs!")
    self.assertEqual(reloaded.get_modality(), "rna", "Modality differs!")
def test_h5ad(self):
    """Round-trip an h5ad dataset through write/read and check its metadata."""
    source_path = "pegasusio-test-data/case1/pbmc3k.h5ad"
    roundtrip_path = "pegasusio-test-data/case1/pbmc3k_out.h5ad"

    # The genome is supplied explicitly on the first read only; the re-read
    # must recover it from the written file.
    original = io.read_input(source_path, genome='hg19')
    io.write_output(original, roundtrip_path)
    reloaded = io.read_input(roundtrip_path)

    self.assertEqual(reloaded.shape, (2638, 1838), "Count matrix shape differs!")
    self.assertEqual(reloaded.get_genome(), "hg19", "Genome differs!")
    self.assertEqual(reloaded.get_modality(), "rna", "Modality differs!")
def test_10x_mtx(self):
    """Round-trip an mtx dataset through write/read and check its metadata."""
    source_path = "pegasusio-test-data/case3/42468c97-1c5a-4c9f-86ea-9eaa1239445a.mtx"
    roundtrip_path = "pegasusio-test-data/case3/test.mtx"

    # The genome is supplied explicitly on the first read only; the re-read
    # must recover it from the written file.
    original = io.read_input(source_path, genome='hg19')
    io.write_output(original, roundtrip_path)
    reloaded = io.read_input(roundtrip_path)

    self.assertEqual(reloaded.shape, (2544, 58347), "Count matrix shape differs!")
    self.assertEqual(reloaded.get_genome(), "hg19", "Genome differs!")
    self.assertEqual(reloaded.get_modality(), "rna", "Modality differs!")
def plot_down_sampling(
    demuxEM_res_file: str,
    out_file: str,
    probs: List[float] = [i / 10.0 for i in range(9, 0, -1)],
    n_threads: int = 1,
    dpi: int = 500,
    figsize: Tuple[float, float] = None,
):
    """Plot the down-sampling result of a demuxEM output file and save it.

    Parameters
    ----------
    demuxEM_res_file: ``str``
        File read with ``read_input``; its "rna" and "hashing" modalities are
        handed to ``down_sampling``.
    out_file: ``str``
        Path the figure is saved to.
    probs: ``List[float]``
        Passed through to ``down_sampling`` unchanged.
    n_threads: ``int``
        Passed through to ``down_sampling``.
    dpi: ``int``
        Resolution used by ``plt.savefig``.
    figsize: ``Tuple[float, float]``, optional
        If given, the current figure is resized to this size (in inches)
        before saving.
    """
    result = read_input(demuxEM_res_file)
    rna_gt = result.get_data(modality="rna")
    hto_gt = result.get_data(modality="hashing")

    fracs, accuracy = down_sampling(rna_gt, hto_gt, probs, n_threads=n_threads)

    plt.plot(fracs, accuracy, ".-")
    axes = plt.gca()

    # The x axis runs from 1.0 down to 0.0 (limits given in reversed order).
    axes.set_xlim(1.0, 0.0)
    axes.set_ylim(0.79, 1.01)

    # Relabel the existing y ticks as whole-number percentages.
    percent_labels = ["{:.0%}".format(tick) for tick in axes.get_yticks()]
    axes.set_yticklabels(percent_labels)

    axes.set_xlabel("Fraction of hashtag UMIs")
    axes.set_ylabel("Consistency")

    if figsize is not None:
        plt.gcf().set_size_inches(*figsize)

    plt.savefig(out_file, dpi=dpi)
    plt.close()
def run_annotate_cluster(
    input_file: str,
    output_file: str,
    markers: str,
    de_test: str,
    de_alpha: float = 0.05,
    de_key: str = "de_res",
    threshold: float = 0.5,
    ignore_nonde: bool = False,
) -> None:
    """Command-line entry point: load ``input_file`` and run ``infer_cell_types``.

    The file is opened with ``mode="r"``; every other argument is forwarded to
    ``infer_cell_types`` unchanged, with ``output_file`` passed as a keyword.
    """
    from pegasusio import read_input

    loaded = read_input(input_file, mode="r")

    inference_options = dict(
        de_alpha=de_alpha,
        de_key=de_key,
        threshold=threshold,
        ignore_nonde=ignore_nonde,
        output_file=output_file,
    )
    infer_cell_types(loaded, markers, de_test, **inference_options)
def write_output(assignment_file: str, input_mat_file: str, output_zarr_file: str, matching: dict) -> None:
    """Merge donor assignments into a count matrix and write it as a zipped Zarr.

    Parameters
    ----------
    assignment_file: ``str``
        Tab-separated file with a header row; the first column is used as the
        barcode index and the ``status`` and ``assignment`` columns are read.
    input_mat_file: ``str``
        Count matrix loaded with ``pegasusio.read_input``.
    output_zarr_file: ``str``
        Destination path, written with ``zarr_zipstore = True``.
    matching: ``dict``
        Passed as the second argument to ``translate_donor_name`` for every
        assignment value.

    Notes
    -----
    Summary counts are printed to stdout. A status (or donor) that does not
    occur in the assignment file is reported as 0 instead of raising
    ``KeyError``.
    """
    df = pd.read_csv(assignment_file, sep = '\t', header = 0, index_col = 0)
    # Drop the last two characters of every barcode
    # (presumably a "-1"-style suffix -- TODO confirm against the input format).
    df.index = pd.Index([x[:-2] for x in df.index])

    f = np.vectorize(translate_donor_name)
    df['assignment'] = f(df['assignment'].values, matching)

    # Rows with status 'unassigned' are relabelled 'unknown' and lose their assignment.
    idx = df['status'].values == 'unassigned'
    df.loc[idx, 'status'] = 'unknown'
    df.loc[idx, 'assignment'] = ''

    # value_counts() omits categories with zero occurrences, so look them up
    # with a default rather than by direct indexing.
    type_counts = df['status'].value_counts()
    n_singlets = type_counts.get('singlet', 0)
    n_doublets = type_counts.get('doublet', 0)
    n_unknown = type_counts.get('unknown', 0)
    print("\nSinglets = {}, doublets = {}, unknown = {}.".format(n_singlets, n_doublets, n_unknown))

    idx = df['status'] == 'singlet'
    singlet_counts = df.loc[idx, 'assignment'].value_counts()
    print("Among {} singlets, we have the following statistics:".format(n_singlets))
    for donor in natsorted(singlet_counts.index):
        print("  Reference donor {}: {}".format(donor, singlet_counts.get(donor, 0)))
    print()

    data = pegasusio.read_input(input_mat_file)
    data.obs['demux_type'] = ''
    data.obs['assignment'] = ''

    # Only barcodes present in the assignment table are annotated; the rest
    # keep the empty-string defaults set above.
    idx = data.obs_names.isin(df.index)
    barcodes = data.obs_names[idx]
    ndf = df.loc[barcodes, ['status', 'assignment']]
    data.obs.loc[idx, 'demux_type'] = ndf['status'].values
    data.obs.loc[idx, 'assignment'] = ndf['assignment'].values

    pegasusio.write_output(data, output_zarr_file, zarr_zipstore = True)
def run_pipeline(input_file: str, output_name: str, **kwargs):
    """Load ``input_file``, analyze each focused modality and write the result.

    The output is written to ``<output_name>.zarr.zip``. All options are read
    from ``kwargs``; the keys accessed directly here are ``processed``,
    ``black_list``, ``focus``, ``append``, ``subcluster`` and the filtering
    options forwarded to ``tools._run_filter_data``. The complete ``kwargs``
    mapping is also forwarded to ``analyze_one_modality``.
    """
    is_raw = not kwargs["processed"]

    excluded = kwargs["black_list"]
    black_list = set(excluded.split(",")) if excluded is not None else set()

    # load input data
    data = read_input(input_file, black_list=black_list)

    # process focus_list: fall back to the currently selected key when empty
    focus_list = kwargs["focus"]
    if len(focus_list) == 0:
        focus_list = [data.current_key()]

    append_key = kwargs["append"]
    append_data = data.get_data(append_key) if append_key is not None else None

    logger.info("Inputs are loaded.")

    if is_raw and not kwargs["subcluster"]:
        # filter out low quality cells/genes
        passthrough = (
            "output_filt",
            "plot_filt",
            "plot_filt_figsize",
            "min_genes_before_filt",
            "select_singlets",
            "min_genes",
            "max_genes",
            "min_umis",
            "max_umis",
            "mito_prefix",
            "percent_mito",
            "percent_cells",
        )
        filter_options = {name: kwargs[name] for name in passthrough}
        # These two options are named differently on the filter function.
        filter_options["remap_string"] = kwargs["remap_singlets"]
        filter_options["subset_string"] = kwargs["subset_singlets"]

        tools._run_filter_data(data, focus_list=focus_list, **filter_options)

    for key in focus_list:
        unidata = data.get_data(key)
        analyze_one_modality(unidata, f"{output_name}.{unidata.get_uid()}", is_raw, append_data, **kwargs)

    print()

    # if kwargs["subcluster"]:
    #     unidata = tools.get_anndata_for_subclustering(adata, kwargs["subset_selections"])
    #     is_raw = True  # get submat and then set is_raw to True

    # write out results
    write_output(data, f"{output_name}.zarr.zip")

    print("Results are written.")
def test_zarr(self):
    """Check the shapes of the iNMF outputs stored in the result file."""
    import pegasusio as io

    result = io.read_input("tests/inmf_result.zarr.zip")

    self.assertEqual(
        result.shape,
        (self.n_cells, self.n_features),
        "Count matrix shape not correct!",
    )

    h_matrix = result.obsm['H']
    self.assertEqual(h_matrix.shape, (self.n_cells, self.n_factors), "H shape not correct!")

    v_tensor = result.uns['V']
    self.assertEqual(
        v_tensor.shape,
        (self.n_batches, self.n_factors, self.n_hvfs),
        "V shape not correct!",
    )

    w_matrix = result.uns['W']
    self.assertEqual(w_matrix.shape, (self.n_hvfs, self.n_factors), "W shape not correct!")

    embedding = result.obsm['X_inmf']
    self.assertEqual(
        embedding.shape,
        (self.n_cells, self.n_factors),
        "iNMF embedding shape not correct!",
    )
def attach_demux_results(input_rna_file: str, rna_data: UnimodalData) -> MultimodalData:
    """ Copy demultiplexing results onto the raw gene expression matrices.

    The raw file is re-read and restricted to its 'rna' modality; each of its
    matrices receives ``demux_type`` and ``assignment`` columns (plus
    ``assignment.dedup`` when ``rna_data`` has one). Barcodes that are absent
    from ``rna_data`` get an empty string.

    Parameters
    ----------
    input_rna_file: ``str``
        Input file for the raw gene count matrix.

    rna_data: ``UnimodalData``
        Processed gene count matrix containing demultiplexing results

    Returns
    -------
    ``MultimodalData``
    A multimodal data object.

    Examples
    --------
    >>> data = attach_demux_results('raw_data.h5', rna_data)
    """
    demux_results = read_input(input_rna_file)
    demux_results.subset_data(modality_subset=['rna'])

    # Assume all matrices are of the same dimension
    assert demux_results.uns["modality"] == "rna"
    barcodes = demux_results.obs_names
    idx = barcodes.isin(rna_data.obs_names)
    selected = barcodes[idx]

    def _expand_column(column):
        # One value per raw barcode: "" by default, overwritten for the
        # barcodes that are also present in rna_data.
        values = np.full(barcodes.size, "", dtype="object")
        values[idx] = rna_data.obs.loc[selected, column]
        return values

    demux_type = _expand_column("demux_type")
    assignment = _expand_column("assignment")

    has_dedup = "assignment.dedup" in rna_data.obs
    assignment_dedup = _expand_column("assignment.dedup") if has_dedup else None

    for keyword in demux_results.list_data():
        unidata = demux_results.get_data(keyword)
        assert unidata.uns["modality"] == "rna"
        unidata.obs["demux_type"] = demux_type
        unidata.obs["assignment"] = assignment
        if assignment_dedup is not None:
            unidata.obs["assignment.dedup"] = assignment_dedup

    logger.info("Demultiplexing results are added to raw expression matrices.")

    return demux_results
def annotate_data_object(input_file: str, annotation: str) -> None:
    """Command-line entry point: annotate clusters and overwrite ``input_file``.

    ``annotation`` has the form ``anno_name:clust_name:cell_type1;...cell_typen``.
    The semicolon-separated cell types are keyed by their 1-based position
    (as strings) before being handed to ``annotate``.
    """
    from pegasusio import read_input, write_output

    data = read_input(input_file, mode="r")

    anno_name, clust_name, anno_str = annotation.split(":")
    cell_types = anno_str.split(";")
    anno_dict = {str(rank): label for rank, label in enumerate(cell_types, start=1)}

    annotate(data, anno_name, clust_name, anno_dict)
    write_output(data, input_file)
def test_mixture_data(self):
    """Check shape, genome and modality of both species in a mixture file."""
    data = io.read_input(
        "pegasusio-test-data/case2/1k_hgmm_v3_filtered_feature_bc_matrix.h5"
    )

    # (key to select, expected shape, expected genome, failure messages)
    expectations = [
        (
            'mm10-rna',
            (1063, 54232),
            "mm10",
            (
                "Mouse data shape differs!",
                "Mouse data genome differs!",
                "Mouse data modality differs!",
            ),
        ),
        (
            'hg19-rna',
            (1063, 57905),
            "hg19",
            (
                "Human data shape differs!",
                "Human data genome differs!",
                "Human data modality differs!",
            ),
        ),
    ]

    for key, shape, genome, messages in expectations:
        shape_msg, genome_msg, modality_msg = messages
        data.select_data(key)
        self.assertEqual(data.shape, shape, shape_msg)
        self.assertEqual(data.get_genome(), genome, genome_msg)
        self.assertEqual(data.get_modality(), "rna", modality_msg)
def execute(self):
    """Build plotting options from the parsed CLI arguments and save the figure.

    Raises
    ------
    KeyError
        If an option required by the requested plot type is missing
        (``--attributes`` for scatter, ``--groupby``/``--condition`` for
        compo, ``--factor`` for wordcloud).
    """
    opts = self.args
    as_list = self.convert_to_list
    as_int = self.convert_to_int
    as_float = self.convert_to_float

    kwargs = {
        "restrictions": opts["--restriction"],
        "attrs": as_list(opts["--attributes"]),
        "basis": opts["--basis"],
        "alpha": as_list(opts["--alpha"], converter=float),
        "legend_loc": as_list(opts["--legend-loc"]),
        "palettes": opts["--palette"],
        "show_background": opts["--show-background"],
        "nrows": as_int(opts["--nrows"]),
        "ncols": as_int(opts["--ncols"]),
        "panel_size": as_list(opts["--panel-size"], converter=float),
        "left": as_float(opts["--left"]),
        "bottom": as_float(opts["--bottom"]),
        "wspace": as_float(opts["--wspace"]),
        "hspace": as_float(opts["--hspace"]),
        "groupby": opts["--groupby"],
        "condition": opts["--condition"],
        "style": opts["--style"],
        "factor": None if opts["--factor"] is None else int(opts["--factor"]),
        "max_words": int(opts["--max-words"]),
        "return_fig": True,
        "dpi": int(opts["--dpi"]),
    }

    # Layout options that were not given are dropped entirely rather than
    # being passed on as None.
    layout_keys = ("nrows", "ncols", "panel_size", "left", "bottom", "wspace", "hspace")
    for key in layout_keys:
        if kwargs[key] is None:
            kwargs.pop(key)

    plot_type = opts["<plot_type>"]
    if plot_type == "scatter" and kwargs["attrs"] is None:
        raise KeyError("--attributes must be provided for scatter plots!")
    if plot_type == "compo" and (kwargs["groupby"] is None or kwargs["condition"] is None):
        raise KeyError("--groupby and --condition must be provided for composition plots!")
    if plot_type == "wordcloud" and kwargs["factor"] is None:
        raise KeyError("--factor must be provided for word cloud plots!")

    # Maps the CLI plot type to the function name in pegasus.plotting.
    plot_type2keyword = {"scatter": "scatter", "compo": "compo_plot", "wordcloud": "wordcloud"}

    data = read_input(opts["<input_file>"])
    plot_func = getattr(pegasus.plotting, plot_type2keyword[plot_type])
    fig = plot_func(data, **kwargs)

    output_file = opts["<output_file>"]
    fig.savefig(output_file)
    logger.info(f"{output_file} is generated.")
def run_de_analysis(
    input_file: str,
    output_excel_file: str,
    cluster: str,
    condition: Optional[str] = None,
    de_key: str = "de_res",
    n_jobs: int = -1,
    auc: bool = True,
    t: bool = True,
    fisher: bool = False,
    mwu: bool = False,
    temp_folder: str = None,
    verbose: bool = True,
    alpha: float = 0.05,
    ndigits: int = 3,
) -> None:
    """Command-line entry point for differential expression analysis.

    Runs ``de_analysis`` on ``input_file``, writes the updated data back to
    ``input_file``, then exports the markers to ``output_excel_file``.

    NOTE(review): ``auc`` and ``mwu`` are accepted but are not forwarded to
    ``de_analysis`` by this function.
    """
    from pegasusio import read_input, write_output

    data = read_input(input_file, mode='r')

    de_options = dict(
        condition=condition,
        de_key=de_key,
        n_jobs=n_jobs,
        t=t,
        fisher=fisher,
        temp_folder=temp_folder,
        verbose=verbose,
    )
    de_analysis(data, cluster, **de_options)

    # The DE results are stored by overwriting the input file.
    write_output(data, input_file)
    logger.info(f"Differential expression results are written to varm/{de_key}.")

    results = markers(data, de_key=de_key, alpha=alpha)
    write_results_to_excel(results, output_excel_file, ndigits=ndigits)
def write_output(assignment_file: str, input_mat_file: str, output_zarr_file: str) -> None:
    """Merge droplet assignments into a count matrix and write it out.

    ``assignment_file`` is a tab-separated table indexed by ``BARCODE`` with
    ``DROPLET.TYPE``, ``SNG.BEST.GUESS`` and ``DBL.BEST.GUESS`` columns.
    Barcodes of ``input_mat_file`` that are missing from the table keep empty
    ``demux_type`` and ``assignment`` values.
    """
    calls = pd.read_csv(assignment_file, sep='\t', header=0, index_col='BARCODE')
    # Drop the last two characters of every barcode
    # (presumably a "-1"-style suffix -- TODO confirm against the input format).
    calls.index = pd.Index([barcode[:-2] for barcode in calls.index])

    calls['demux_type'] = calls['DROPLET.TYPE'].apply(lambda s: demux_type_dict[s])
    calls['assignment'] = ''

    # Singlets take the single best guess as-is.
    is_singlet = calls['demux_type'] == 'singlet'
    calls.loc[is_singlet, 'assignment'] = calls.loc[is_singlet, 'SNG.BEST.GUESS']

    # Doublets take the best guess with its last comma-separated field removed.
    is_doublet = calls['demux_type'] == 'doublet'
    doublet_guess = calls.loc[is_doublet, 'DBL.BEST.GUESS']
    calls.loc[is_doublet, 'assignment'] = doublet_guess.apply(lambda s: ','.join(s.split(',')[:-1]))

    data = io.read_input(input_mat_file)
    data.obs['demux_type'] = ''
    data.obs['assignment'] = ''

    known = data.obs_names.isin(calls.index)
    matched = calls.loc[data.obs_names[known], ['demux_type', 'assignment']]
    data.obs.loc[known, 'demux_type'] = matched['demux_type'].values
    data.obs.loc[known, 'assignment'] = matched['assignment'].values

    io.write_output(data, output_zarr_file)
#!/usr/bin/env python
# Write the barcodes of a count matrix, each suffixed with "-1", one per line.

from sys import argv, exit

import pegasusio

# Exactly three arguments are expected after the script name.
if len(argv) != 4:
    print(
        "Usage: python extract_barcodes_from_rna.py input_raw.h5 output_barcodes.tsv ngene"
    )
    exit(-1)

input_path = argv[1]
output_path = argv[2]

# The third argument is forwarded to read_input as its `ngene` option.
data = pegasusio.read_input(input_path, ngene=int(argv[3]))

with open(output_path, "w") as fout:
    suffixed = [name + '-1' for name in data.obs_names]
    fout.write('\n'.join(suffixed) + '\n')
def run_pipeline(input_rna_file, input_hto_file, output_name, **kwargs):
    """Run demultiplexing on an RNA / hashtag pair and write results and plots.

    Parameters
    ----------
    input_rna_file:
        Raw RNA count matrix file. It is read twice: once here and once by
        ``attach_demux_results``.
    input_hto_file:
        Hashtag count file, loaded with genome and modality set to "hashing".
    output_name:
        Prefix for every output file (plots and the two Zarr archives).
    **kwargs:
        Options read here: ``genome``, ``min_num_genes``, ``min_num_umis``,
        ``random_state``, ``min_signal``, ``alpha``, ``n_jobs``,
        ``gen_plots`` and ``gen_gender_plot`` (a sequence of gene names).

    Outputs
    -------
    ``<output_name>_demux.zarr`` and ``<output_name>.out.demuxEM.zarr`` (both
    written with ``zarr_zipstore=True``), optional PDF plots, and summary
    statistics printed to stdout.
    """
    # load input rna data
    data = io.read_input(input_rna_file, genome=kwargs["genome"], modality="rna")
    data.concat_data()  # in case of multi-organism mixing data
    rna_key = data.uns["genome"]

    # load input hashing data
    data.update(
        io.read_input(input_hto_file, genome="hashing", modality="hashing"))
    hashing_key = "hashing"

    # Extract rna and hashing data
    rna_data = data.get_data(rna_key)
    hashing_data = data.get_data(hashing_key)

    # Filter the RNA matrix
    rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1)
    rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1
    obs_index = np.logical_and.reduce((
        rna_data.obs["n_genes"] >= kwargs["min_num_genes"],
        rna_data.obs["n_counts"] >= kwargs["min_num_umis"],
    ))
    rna_data._inplace_subset_obs(obs_index)

    # run demuxEM
    estimate_background_probs(hashing_data, random_state=kwargs["random_state"])

    demultiplex(
        rna_data,
        hashing_data,
        min_signal=kwargs["min_signal"],
        alpha=kwargs["alpha"],
        n_threads=kwargs["n_jobs"],
    )

    # annotate raw matrix with demuxEM results
    demux_results = attach_demux_results(input_rna_file, rna_data)

    # generate plots
    if kwargs["gen_plots"]:
        plot_hto_hist(hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0)
        plot_bar(
            hashing_data.uns["background_probs"],
            hashing_data.var_names,
            "Sample ID",
            "Background probability",
            output_name + ".background_probabilities.bar.pdf",
        )
        plot_hto_hist(hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5)
        plot_rna_hist(rna_data, output_name + ".rna_demux.hist.pdf")
        logger.info("Diagnostic plots are generated.")

    if len(kwargs["gen_gender_plot"]) > 0:
        # Keep the raw counts, then scale each cell to 1e5 total and log1p.
        rna_data.matrices["raw.X"] = rna_data.X.copy()
        rna_data.as_float()
        scale = 1e5 / rna_data.X.sum(axis=1).A1
        # Expand the per-row scale to one entry per stored value. The row
        # lengths must come from the matrix being scaled (rna_data.X), not
        # from whichever matrix the multimodal container currently exposes.
        rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr))
        rna_data.X.data = np.log1p(rna_data.X.data)

        for gene_name in kwargs["gen_gender_plot"]:
            plot_gene_violin(
                rna_data,
                gene_name,
                "{output_name}.{gene_name}.violin.pdf".format(
                    output_name=output_name, gene_name=gene_name),
                title="{gene_name}: a gender-specific gene".format(
                    gene_name=gene_name),
            )

        logger.info(
            "Gender-specific gene expression violin plots are generated.")

    # output results
    io.write_output(demux_results, output_name + "_demux.zarr", zarr_zipstore=True)
    io.write_output(data, output_name + ".out.demuxEM.zarr", zarr_zipstore=True)

    # output summary statistics
    print("\nSummary statistics:")
    print("total\t{}".format(rna_data.shape[0]))
    # Series.iteritems() was removed in pandas 2.0; items() works on all versions.
    for name, value in rna_data.obs["demux_type"].value_counts().items():
        print("{}\t{}".format(name, value))
def aggregate_matrices(
    csv_file: Union[str, Dict[str, np.ndarray], pd.DataFrame],
    restrictions: Optional[Union[List[str], str]] = [],
    attributes: Optional[Union[List[str], str]] = [],
    default_ref: Optional[str] = None,
    append_sample_name: Optional[bool] = True,
    select_singlets: Optional[bool] = False,
    remap_string: Optional[str] = None,
    subset_string: Optional[str] = None,
    min_genes: Optional[int] = None,
    max_genes: Optional[int] = None,
    min_umis: Optional[int] = None,
    max_umis: Optional[int] = None,
    mito_prefix: Optional[str] = None,
    percent_mito: Optional[float] = None,
) -> MultimodalData:
    """Aggregate channel-specific count matrices into one big count matrix.

    This function takes as input a csv_file, which contains at least 2 columns — Sample, sample name; Location, file that contains the count matrices (e.g. filtered_gene_bc_matrices_h5.h5), and merges matrices from the same genome together. If multi-modality exists, a third Modality column might be required. An aggregated Multimodal Data will be returned.

    Rows that share the same Sample value are merged into one data object before filtering. Locations starting with ``gs://`` are first copied locally with ``gsutil``; these local copies are removed before the function returns.

    Parameters
    ----------

    csv_file : `str`
        The CSV file containing information about each channel. Alternatively, a dictionary or pd.DataFrame can be passed.
    restrictions : `list[str]` or `str`, optional (default: [])
        A list of restrictions used to select channels, each restriction takes the format of name:value,…,value or name:~value,..,value, where ~ refers to not. If only one restriction is provided, it can be provided as a string instead of a list.
    attributes : `list[str]` or `str`, optional (default: [])
        A list of attributes need to be incorporated into the output count matrix. If only one attribute is provided, this attribute can be provided as a string instead of a list.
    default_ref : `str`, optional (default: None)
        Default reference name to use. If there is no Reference column in the csv_file, a Reference column will be added with default_ref as its value. This argument can also be used for replacing genome names. For example, if default_ref is 'hg19:GRCh38,GRCh38', we will change any genome with name 'hg19' to 'GRCh38' and if no genome is provided, 'GRCh38' is the default.
    append_sample_name : `bool`, optional (default: True)
        By default, append sample_name to each channel. Turn this option off if each channel has distinct barcodes.
    select_singlets : `bool`, optional (default: False)
        If we have demultiplexed data, turning on this option will make pegasus only include barcodes that are predicted as singlets.
    remap_string: ``str``, optional, default ``None``
        Remap singlet names using <remap_string>, where <remap_string> takes the format "new_name_i:old_name_1,old_name_2;new_name_ii:old_name_3;...". For example, if we hashed 5 libraries from 3 samples sample1_lib1, sample1_lib2, sample2_lib1, sample2_lib2 and sample3, we can remap them to 3 samples using this string: "sample1:sample1_lib1,sample1_lib2;sample2:sample2_lib1,sample2_lib2". In this way, the new singlet names will be in metadata field with key 'assignment', while the old names will be kept in metadata field with key 'assignment.orig'.
    subset_string: ``str``, optional, default ``None``
        If select singlets, only select singlets in the <subset_string>, which takes the format "name1,name2,...". Note that if --remap-singlets is specified, subsetting happens after remapping. For example, we can only select singlets from sample 1 and 3 using "sample1,sample3".
    min_genes: ``int``, optional, default: None
        Only keep cells with at least ``min_genes`` genes.
    max_genes: ``int``, optional, default: None
        Only keep cells with less than ``max_genes`` genes.
    min_umis: ``int``, optional, default: None
        Only keep cells with at least ``min_umis`` UMIs.
    max_umis: ``int``, optional, default: None
        Only keep cells with less than ``max_umis`` UMIs.
    mito_prefix: ``str``, optional, default: None
        Prefix for mitochondrial genes.
    percent_mito: ``float``, optional, default: None
        Only keep cells with percent mitochondrial genes less than ``percent_mito`` % of total counts. Only when both mito_prefix and percent_mito set, the mitochondrial filter will be triggered.

    Returns
    -------
    `MultimodalData` object.
        The aggregated count matrix as a MultimodalData object.

    Raises
    ------
    ValueError
        If no row of the sample sheet passes the restrictions.

    Examples
    --------
    >>> data = aggregate_matrices('example.csv', restrictions=['Source:pbmc', 'Donor:1'], attributes=['Source', 'Platform', 'Donor'])
    """
    if isinstance(csv_file, str):
        df = pd.read_csv(csv_file, header=0, index_col=False)  # load sample sheet
    elif isinstance(csv_file, dict):
        df = pd.DataFrame(csv_file)
    else:
        df = csv_file

    # Remove duplicated items (also converts the inputs to fresh sets, so the
    # list defaults in the signature are never mutated)
    if isinstance(restrictions, str):
        restrictions = [restrictions]
    restrictions = set(restrictions)
    if isinstance(attributes, str):
        attributes = [attributes]
    attributes = set(attributes)

    # Select data
    rvec = [_parse_restriction_string(x) for x in restrictions]

    # Start with every row selected and AND in one mask per restriction.
    idx = pd.Series([True] * df.shape[0], index=df.index, name="selected")
    for name, isin, content in rvec:
        assert name in df.columns
        if isin:
            idx = idx & df[name].isin(content)
        else:
            idx = idx & (~(df[name].isin(content)))

    if idx.sum() == 0:
        raise ValueError("No data pass the restrictions!")

    # Sorting groups rows of the same sample together, which the
    # sample-change detection in the loop below relies on.
    df = df.loc[idx].sort_values(by="Sample")  # sort by sample_name

    # parse default_ref
    def_genome, genome_dict = _parse_genome_string(default_ref)

    # Load data
    tot = 0
    dest_paths = [
    ]  # record localized file paths so that we can remove them later
    curr_sample = ""
    curr_row = curr_data = None
    aggrData = AggrData()

    for idx_num, row in df.iterrows():
        input_file = os.path.expanduser(
            os.path.expandvars(row["Location"].rstrip(
                os.sep)))  # extend all user variables
        file_type, copy_path, copy_type = infer_file_type(
            input_file)  # infer file type

        if row["Location"].lower().startswith('gs://'):  # if Google bucket
            base_name = os.path.basename(copy_path)
            dest_path = f"{idx_num}_tmp_{base_name}"  # idx_num will make sure dest_path is unique in the sample sheet
            if not os.path.exists(
                    dest_path
            ):  # if dest_path exists, we may try to localize it once and may have the file cached
                if copy_type == "directory":
                    check_call(["mkdir", "-p", dest_path])
                    call_args = [
                        "gsutil", "-m", "rsync", "-r", copy_path, dest_path
                    ]
                else:
                    call_args = ["gsutil", "-m", "cp", copy_path, dest_path]
                check_call(call_args)
            dest_paths.append(dest_path)

            # Point input_file at the local copy: either the copy itself, or
            # the matching entry inside the copied directory.
            if input_file == copy_path:
                input_file = dest_path
            else:
                input_file = os.path.join(dest_path,
                                          os.path.basename(input_file))

        genome = row.get("Reference", None)
        if (genome is not None) and (not isinstance(genome, str)):  # to avoid NaN
            genome = None
        if genome is None:
            genome = def_genome
        modality = row.get("Modality", None)
        data = read_input(input_file,
                          file_type=file_type,
                          genome=genome,
                          modality=modality)
        if len(genome_dict) > 0:
            data._update_genome(genome_dict)

        if row["Sample"] != curr_sample:
            # A new sample starts: finalize the previous one (if any) before
            # replacing it.
            if curr_data is not None:
                curr_data._propogate_genome()
                curr_data.filter_data(select_singlets=select_singlets,
                                      remap_string=remap_string,
                                      subset_string=subset_string,
                                      min_genes=min_genes,
                                      max_genes=max_genes,
                                      min_umis=min_umis,
                                      max_umis=max_umis,
                                      mito_prefix=mito_prefix,
                                      percent_mito=percent_mito)
                curr_data._update_barcode_metadata_info(
                    curr_row, attributes, append_sample_name)
                aggrData.add_data(curr_data)
            curr_data = data
            curr_row = row
            curr_sample = row["Sample"]
        else:
            # Same sample as the previous row: merge into the current object.
            curr_data.update(data)

        tot += 1

    # Finalize the last sample; inside the loop, finalization is only
    # triggered by a change in Sample.
    if curr_data is not None:
        curr_data._propogate_genome()
        curr_data.filter_data(select_singlets=select_singlets,
                              remap_string=remap_string,
                              subset_string=subset_string,
                              min_genes=min_genes,
                              max_genes=max_genes,
                              min_umis=min_umis,
                              max_umis=max_umis,
                              mito_prefix=mito_prefix,
                              percent_mito=percent_mito)
        curr_data._update_barcode_metadata_info(curr_row, attributes,
                                                append_sample_name)
        aggrData.add_data(curr_data)

    # Merge data
    aggregated_data = aggrData.aggregate()
    attributes.add("Channel")
    aggregated_data._convert_attributes_to_categorical(attributes)
    logger.info(f"Aggregated {tot} files.")

    # Delete temporary file
    if len(dest_paths) > 0:
        for dest_path in dest_paths:
            check_call(["rm", "-rf", dest_path])
        logger.info("Temporary files are deleted.")

    return aggregated_data
def run_find_markers(
    input_h5ad_file: str,
    output_file: str,
    label_attr: str,
    de_key: str = "de_res",
    n_jobs: int = -1,
    min_gain: float = 1.0,
    random_state: int = 0,
    remove_ribo: bool = False,
) -> None:
    """Command-line entry point: find markers and write them to an Excel file.

    Parameters
    ----------
    input_h5ad_file: ``str``
        Input file, loaded with ``read_input``.
    output_file: ``str``
        Excel workbook to create; one sheet is written per cluster.
    label_attr, de_key, n_jobs, min_gain, random_state, remove_ribo:
        Forwarded unchanged to ``find_markers``.

    Notes
    -----
    Each sheet has eight columns: three (markers, gain) pairs for strongly
    up-regulated, weakly up-regulated and down-regulated markers, separated
    by an empty column. Shorter lists are padded with empty strings.
    """
    import xlsxwriter  # imported up front so a missing engine fails before any work is done
    from natsort import natsorted

    data = read_input(input_h5ad_file)
    markers = find_markers(
        data,
        label_attr,
        de_key=de_key,
        n_jobs=n_jobs,
        min_gain=min_gain,
        random_state=random_state,
        remove_ribo=remove_ribo,
    )

    # (marker list key, gain list key) for each of the three column groups.
    keywords = [("strong", "strong_gain"), ("weak", "weak_gain"), ("down", "down_gain")]
    columns = [
        "strongly up-regulated",
        "gain",
        "",
        "weakly up-regulated",
        "gain",
        "",
        "down-regulated",
        "gain",
    ]

    # ExcelWriter.save() was removed in pandas 2.0. The context manager
    # saves and closes the workbook on every pandas version, including
    # when an exception is raised part-way through.
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        for clust_id in natsorted(markers.keys()):
            clust_markers = markers[clust_id]

            sizes = [len(clust_markers[name_key]) for name_key, _ in keywords]

            arr = np.zeros((max(sizes), 8), dtype=object)
            arr[:] = ""

            # Group i occupies columns 3*i (names) and 3*i + 1 (gains);
            # column 3*i + 2 is left empty as a spacer.
            for i, (name_key, gain_key) in enumerate(keywords):
                arr[0:sizes[i], i * 3] = clust_markers[name_key]
                arr[0:sizes[i], i * 3 + 1] = clust_markers[gain_key]

            df = pd.DataFrame(data=arr, columns=columns)
            df.to_excel(writer, sheet_name=clust_id, index=False)
#!/usr/bin/env python
# Write the barcodes that survive a minimum-gene filter, each suffixed with
# "-1", one per line.

from sys import argv, exit

import pegasusio as io

# Exactly three arguments are expected after the script name.
if len(argv) != 4:
    print(
        "Usage: python extract_barcodes_from_rna.py input_raw.h5 output_barcodes.tsv ngene"
    )
    exit(-1)

input_path = argv[1]
output_path = argv[2]

data = io.read_input(input_path)
# The third argument is the minimum number of genes passed to filter_data.
data.filter_data(min_genes=int(argv[3]))

with open(output_path, "w") as fout:
    suffixed = [name + '-1' for name in data.obs_names]
    fout.write('\n'.join(suffixed) + '\n')
type=int, help='Random seed', default=0) parser_plot = subparsers.add_parser('plot', help='Plot topic modelling stats') parser_plot.add_argument('stats', type=str, nargs='+') args = parser.parse_args() if args.sub_parser == 'prepare': prefix_exclude = None if args.prefix_exclude is not None: prefix_exclude = args.prefix_exclude.split(',') input_path = args.input d = pio.read_input(input_path) lda_setup(adata=d, prefix_exclude=prefix_exclude, min_percent=args.min_percent, max_percent=args.max_percent) elif args.sub_parser == 'run': dictionary = gensim.corpora.Dictionary.load(args.dictionary) corpus = gensim.corpora.MmCorpus(args.corpus) compute_lda(corpus=corpus, cell_ids=pd.read_csv(args.cell_ids, index_col=0).index.values, dictionary=dictionary, topics=args.topics, random_state=args.random_seed) elif args.sub_parser == 'plot': stats = []