def maelstrom(args):
    """Run the maelstrom method using the settings carried by ``args``.

    ``args`` must expose the attributes ``inputfile``, ``genome``,
    ``outdir``, ``pfmfile``, ``filter_redundant``, ``filter_cutoff``,
    ``methods``, ``ncpus``, ``zscore``, ``center``, ``gc`` and
    ``aggregation``.  A non-empty ``methods`` value is treated as a
    comma-separated string and split into a list of stripped names.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    input_path = args.inputfile
    positional = (input_path, args.genome, args.outdir, args.pfmfile)

    # Everything that is forwarded to run_maelstrom by keyword.
    options = {
        "filter_redundant": args.filter_redundant,
        "filter_cutoff": args.filter_cutoff,
        "methods": args.methods,
        "ncpus": args.ncpus,
        "zscore": args.zscore,
        "center": args.center,
        "gc": args.gc,
        "aggregation": args.aggregation,
    }

    if not os.path.exists(input_path):
        raise ValueError("file {} does not exist".format(input_path))

    requested = options["methods"]
    if requested:
        # "a, b" -> ["a", "b"]
        options["methods"] = [name.strip() for name in requested.split(",")]

    run_maelstrom(*positional, **options)
def maelstrom(args):
    """Run the maelstrom method using the settings carried by ``args``.

    ``args`` must expose the attributes ``inputfile``, ``genome``,
    ``outdir``, ``pfmfile``, ``methods``, ``ncpus``, ``zscore`` and ``gc``.
    A non-empty ``methods`` value is treated as a comma-separated string and
    split into a list of stripped names.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    input_path = args.inputfile
    positional = (input_path, args.genome, args.outdir, args.pfmfile)

    # Everything that is forwarded to run_maelstrom by keyword.
    options = {
        "methods": args.methods,
        "ncpus": args.ncpus,
        "zscore": args.zscore,
        "gc": args.gc,
    }

    if not os.path.exists(input_path):
        raise ValueError("file {} does not exist".format(input_path))

    requested = options["methods"]
    if requested:
        # "a, b" -> ["a", "b"]
        options["methods"] = [name.strip() for name in requested.split(",")]

    run_maelstrom(*positional, **options)
def test1_maelstrom(self):
    """Test Motif Activity by Ensemble Learning (maelstrom).

    Runs ``run_maelstrom`` on the fixture tables and checks the shape of the
    table read back from ``self.outfile``.
    """
    try:
        run_maelstrom(
            self.clusters,
            "mm10",
            self.outdir,
            score_table=self.score_table,
            count_table=self.count_table,
        )
        df = pd.read_table(self.outfile, index_col=0, comment="#")
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual((623, 4), df.shape)
    finally:
        # Clean up even when the run or the assertion fails, so stale output
        # files do not leak into later tests.
        for fname in glob(os.path.join(self.outdir, "activity*")):
            os.unlink(fname)
        # The output file may be missing if the run itself failed.
        if os.path.exists(self.outfile):
            os.unlink(self.outfile)
def maelstrom(args):
    """Check the input file and genome, then run maelstrom.

    ``args`` must expose the attributes ``inputfile``, ``genome`` and
    ``outdir``.  The genome is passed to ``check_genome`` before
    ``run_maelstrom`` is called.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    input_path, genome_name, output_dir = args.inputfile, args.genome, args.outdir

    if not os.path.exists(input_path):
        raise ValueError("file {} does not exist".format(input_path))

    check_genome(genome_name)
    run_maelstrom(input_path, genome_name, output_dir)
def test1_maelstrom(self):
    """Test Motif Activity by Ensemble Learning (maelstrom).

    Runs ``run_maelstrom`` with plotting disabled on the fixture tables and
    checks the shape of the table read back from ``self.outfile``.
    """
    try:
        run_maelstrom(
            self.clusters,
            "mm10",
            self.outdir,
            score_table=self.score_table,
            count_table=self.count_table,
            plot=False,
        )
        df = pd.read_table(self.outfile, index_col=0, comment="#")
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual((623, 4), df.shape)
    finally:
        # Clean up even when the run or the assertion fails, so stale output
        # files do not leak into later tests.
        for fname in glob(os.path.join(self.outdir, "activity*")):
            os.unlink(fname)
        # The output file may be missing if the run itself failed.
        if os.path.exists(self.outfile):
            os.unlink(self.outfile)
def test1_maelstrom(self):
    """Test Motif Activity by Ensemble Learning (maelstrom).

    Runs ``run_maelstrom`` twice on the fixture tables, first with
    ``filter_redundant=False`` and then with ``filter_redundant=True``, and
    checks the shape of the table read back from ``self.outfile`` each time.
    """
    try:
        run_maelstrom(
            self.clusters,
            "mm10",
            self.outdir,
            filter_redundant=False,
            score_table=self.score_table,
            count_table=self.count_table,
            plot=False,
        )
        df = pd.read_table(self.outfile, index_col=0, comment="#")
        print(df.shape)
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual((623, 8), df.shape)

        # Filter redundant motifs
        run_maelstrom(
            self.clusters,
            "mm10",
            self.outdir,
            filter_redundant=True,
            score_table=self.score_table,
            count_table=self.count_table,
            plot=False,
        )
        df = pd.read_table(self.outfile, index_col=0, comment="#")
        print(df.shape)
        self.assertEqual((156, 8), df.shape)
    finally:
        # Clean up even when a run or an assertion fails, so stale output
        # files do not leak into later tests.
        for fname in glob(os.path.join(self.outdir, "activity*")):
            os.unlink(fname)
        for fname in glob(os.path.join(self.outdir, "gimme.verte*")):
            os.unlink(fname)
        # The output file may be missing if the first run itself failed.
        if os.path.exists(self.outfile):
            os.unlink(self.outfile)
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified in place. The following fields are used or
    written:

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients (read here; presumably written by
        `annotate_cells` -- confirm).
    **scepia** : `adata.uns` field
        Dictionary that receives the `pfm` location and the
        `motif_activity` table.
    **motif activity columns** : `adata.obs` columns
        Cell-specific motif activity, one column per motif; existing
        columns with the same names are replaced.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater
        than the number of hypervariable genes in `adata` then all variable
        genes are used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used if
        this parameter is not specified. This can be a filename, or a pfm
        name supported by GimmeMotifs such as `JASPAR2018_vertebrates`. If a
        custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will
        be annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity
        analysis.
    indirect : `boolean`, optional (default: True)
        Passed unchanged to `correlate_tf_motifs`.
    n_sketch : `int`, optional (default: 2500)
        Passed unchanged to `correlate_tf_motifs`.
    n_permutations : `int`, optional (default: 100000)
        Passed unchanged to `correlate_tf_motifs`.
    """
    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping

    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        # Re-index the gene/enhancer links by gene name instead of identifier.
        ens2name = pd.read_csv(
            gene_map_file, sep="\t", index_col=0, names=["identifier", "name"]
        )
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    # Gene names are matched case-insensitively.
    link.index = link.index.str.upper()
    enh_genes = adata.var_names[
        adata.var_names.str.upper().isin(link.index)
    ].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]]
    # Average columns that share the same name.
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)
    # Select top most variable enhancers
    enhancer_df = enhancer_df.loc[
        enhancer_df.var(1).sort_values().tail(num_enhancers).index
    ]
    # Center by mean of the most important cell types
    # Here we chose the majority cell type per cluster
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    # Only a path is needed: close the handle right away and remove the file
    # once the motif activity has been computed, so no temp file is leaked.
    with NamedTemporaryFile(delete=False) as tmp_table:
        fname = tmp_table.name
    try:
        enhancer_df.to_csv(fname, sep="\t")
        logger.info("inferring motif activity")

        pfm = pfmfile_location(pfm)
        if maelstrom:
            with TemporaryDirectory() as tmpdir:
                # NOTE(review): the resolved `pfm` is not passed on here, so
                # maelstrom appears to run with its own default motif file --
                # confirm whether that is intended.
                run_maelstrom(
                    fname,
                    data.genome,
                    tmpdir,
                    center=False,
                    filter_redundant=True,
                )

                motif_act = pd.read_csv(
                    os.path.join(tmpdir, "final.out.txt"),
                    sep="\t",
                    comment="#",
                    index_col=0,
                )
                # regex=True must be explicit: since pandas 2.0 the default
                # is a literal match, which would leave the "z-score " prefix
                # in place and break the cell type column selection below.
                motif_act.columns = motif_act.columns.str.replace(
                    r"z-score\s+", "", regex=True
                )
                # NOTE(review): this location is inside `tmpdir`, which is
                # removed when the `with` block exits; the path stored in
                # adata.uns below may no longer exist afterwards -- confirm.
                pfm = pfmfile_location(
                    os.path.join(tmpdir, "nonredundant.motifs.pfm")
                )
        else:
            logger.info(f"Activity based on genome {data.genome}")
            # NOTE(review): ncpus is hard-coded to 12.
            motif_act = moap(
                fname,
                scoring="score",
                genome=data.genome,
                method="bayesianridge",
                pfmfile=pfm,
                ncpus=12,
            )
    finally:
        if os.path.exists(fname):
            os.unlink(fname)

    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[
        adata.uns["scepia"]["cell_types"]
    ]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T
    ).T
    cell_motif_activity.index = adata.obs_names
    # Drop columns left over from a previous run before joining the new ones.
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns)
    )
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(
        adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations
    )

    add_activity(adata)

    logger.info("Done with motif inference.")