def moap_with_table(input_table, motif_table, data_dir, method, scoring):
    """Run ``moap`` on a precomputed motif table.

    The result is written to ``activity.<method>.<scoring>.out.txt`` inside
    ``data_dir``. ``motif_table`` is handed to ``moap`` as ``motiffile``.
    Nothing is returned.
    """
    filename = "activity.{}.{}.out.txt".format(method, scoring)
    moap(
        input_table,
        outfile=os.path.join(data_dir, filename),
        method=method,
        scoring=scoring,
        motiffile=motif_table,
    )
def moap_with_bg(input_table, genome, data_dir, method, scoring, pwmfile=None, ncpus=None):
    """Run ``moap`` against a genome and write the result into ``data_dir``.

    The output file is named ``activity.<method>.<scoring>.out.txt``. The
    module-level ``FPR`` constant is passed to ``moap`` as ``fpr``.

    Parameters
    ----------
    input_table : passed to ``moap`` as its first positional argument.
    genome : passed to ``moap`` as ``genome``.
    data_dir : directory that receives the output file.
    method, scoring : passed to ``moap`` and used in the output file name.
    pwmfile : optional motif file. Forwarded to ``moap`` only when given.
    ncpus : passed to ``moap`` as ``ncpus``.
    """
    outfile = os.path.join(
        data_dir, "activity.{}.{}.out.txt".format(method, scoring)
    )

    # Previously `pwmfile` was accepted but silently dropped, so a custom
    # motif file never reached moap. Forward it only when supplied so the
    # default call is unchanged.
    # NOTE(review): assumes moap accepts a `pwmfile` keyword -- confirm
    # against the moap signature of this version.
    extra = {}
    if pwmfile is not None:
        extra["pwmfile"] = pwmfile

    moap(
        input_table,
        outfile=outfile,
        genome=genome,
        method=method,
        scoring=scoring,
        fpr=FPR,
        ncpus=ncpus,
        **extra
    )
def moap_with_bg(input_table, genome, data_dir, method, scoring):
    """Run ``moap`` against a genome using a threshold file as cutoff.

    The threshold file comes from ``check_threshold(data_dir, genome,
    scoring)`` and is passed to ``moap`` as ``cutoff``. The result is written
    to ``activity.<method>.<scoring>.out.txt`` inside ``data_dir``.
    """
    # Resolve the threshold first, as the original call order did.
    cutoff_file = check_threshold(data_dir, genome, scoring)
    filename = "activity.{}.{}.out.txt".format(method, scoring)
    moap(
        input_table,
        outfile=os.path.join(data_dir, filename),
        genome=genome,
        method=method,
        scoring=scoring,
        cutoff=cutoff_file,
    )
def moap_with_table(input_table, motif_table, data_dir, method, scoring):
    """Call ``moap`` with a motif table and store the output in ``data_dir``.

    Output goes to ``<data_dir>/activity.<method>.<scoring>.out.txt``;
    ``motif_table`` is supplied to ``moap`` via its ``motiffile`` argument.
    """
    moap_kwargs = {
        "outfile": os.path.join(
            data_dir, "activity.{}.{}.out.txt".format(method, scoring)
        ),
        "method": method,
        "scoring": scoring,
        "motiffile": motif_table,
    }
    moap(input_table, **moap_kwargs)
def moap_with_bg(input_table, genome, data_dir, method, scoring):
    """Call ``moap`` for a genome, with the cutoff taken from a threshold file.

    ``check_threshold(data_dir, genome, scoring)`` supplies the value passed
    as ``cutoff``. Output goes to
    ``<data_dir>/activity.<method>.<scoring>.out.txt``.
    """
    cutoff = check_threshold(data_dir, genome, scoring)
    moap_kwargs = {
        "outfile": os.path.join(
            data_dir, "activity.{}.{}.out.txt".format(method, scoring)
        ),
        "genome": genome,
        "method": method,
        "scoring": scoring,
        "cutoff": cutoff,
    }
    moap(input_table, **moap_kwargs)
def moap_with_bg(
    input_table, genome, data_dir, method, scoring, pfmfile=None, ncpus=None
):
    """Run ``moap`` against a genome and write the result into ``data_dir``.

    The output file is ``activity.<method>.<scoring>.out.txt``. ``pfmfile``
    and ``ncpus`` are forwarded to ``moap`` unchanged, and the module-level
    ``FPR`` constant is passed as ``fpr``.
    """
    filename = "activity.{}.{}.out.txt".format(method, scoring)
    target = os.path.join(data_dir, filename)
    moap(
        input_table,
        outfile=target,
        pfmfile=pfmfile,
        genome=genome,
        method=method,
        scoring=scoring,
        fpr=FPR,
        ncpus=ncpus,
    )
def test2_moap(self):
    """Test motif activity prediction for two clusters."""
    # Score-based methods run on the motif score table.
    for method in ["mwu", "rf", "lightningclassification"]:
        df = moap(
            self.clusters2,
            method=method,
            scoring="score",
            motiffile=self.motifs_score2,
        )
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual((623, 2), df.shape)

    # Count-based methods run on the motif count table.
    for method in ["hypergeom"]:
        df = moap(
            self.clusters2,
            method=method,
            scoring="count",
            motiffile=self.motifs_count2,
        )
        self.assertEqual((623, 2), df.shape)
def predict_factor_activity(self, outfile, nregions=20000):
    """Predict TF activity and write it to a tab-separated file.

    Motif activity is estimated with ``moap`` (``method="bayesianridge"``)
    for every column of the ATAC and histone tables that are set. The
    per-column activities are rank-aggregated, and each factor gets the
    maximum activity among its motifs.

    Parameters
    ----------
    outfile : str
        Name of outputfile. Written with columns ``factor`` and ``activity``.
    nregions : int, optional
        A table with at least this many rows is randomly sampled down to
        ``nregions`` rows before it is passed to ``moap``.
    """
    # Run ridge regression using motif score to predict (relative)
    # ATAC/H3K27ac signal
    motif_activity = pd.DataFrame()
    tables = [t for t in (self._atac_data, self._histone_data) if t is not None]
    for table in tables:
        use_all_rows = table.shape[0] < nregions
        for column in table.columns:
            with NamedTemporaryFile() as tmp:
                # float16 will give NaN's
                values = table[column].astype("float32")
                scaled = pd.DataFrame({column: scale(values)}, index=table.index)
                subset = scaled if use_all_rows else scaled.sample(nregions)
                subset.to_csv(tmp.name, sep="\t")

                # Best effort: a failing column is reported and skipped.
                try:
                    column_activity = moap(
                        tmp.name,
                        genome=self.genome,
                        method="bayesianridge",
                        pfmfile=self.pfmfile,
                    )
                    motif_activity = motif_activity.join(column_activity, how="outer")
                except Exception as err:
                    print(err)

    print(motif_activity)

    # Rank aggregation: rank within each column, average the ranks per
    # motif, then rescale with minmax_scale.
    for column in motif_activity.columns:
        motif_activity[column] = rankdata(motif_activity[column])
    mean_rank = motif_activity.mean(axis=1)
    mean_rank[:] = minmax_scale(mean_rank)

    # Take the maximum activity from the motifs of each factor
    rows = [
        [factor, mean_rank.loc[motifs].max()]
        for factor, motifs in self.f2m.items()
    ]
    result = pd.DataFrame(rows, columns=["factor", "activity"])
    result.to_csv(outfile, sep="\t", index=False)
def test1_moap(self):
    """Test motif activity prediction."""
    # Score-based methods run on the motif score table.
    for method in ["mwu", "rf"]:
        df = moap(
            self.clusters,
            method=method,
            scoring="score",
            motiffile=self.motifs_score,
        )
        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual((623, 4), df.shape)

    # Count-based methods run on the motif count table.
    for method in ["hypergeom"]:
        df = moap(
            self.clusters,
            method=method,
            scoring="count",
            motiffile=self.motifs_count,
        )
        self.assertEqual((623, 4), df.shape)
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified in place:

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients. Read here to compute per-cell motif
        activity; presumably filled in by `annotate_cells` (confirm).
    **scepia** : `adata.uns` field
        Receives the keys `pfm` (motif file location) and `motif_activity`
        (motif activity restricted to the selected cell types).
    `adata.obs`
        Per-cell motif activity is joined in, replacing any existing
        columns with the same names.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater
        than the number of hypervariable genes in `adata` then all variable
        genes are used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used if
        this parameter is not specified. This can be a filename, or a pfm
        name supported by GimmeMotifs such as `JASPAR2018_vertebrates`. If a
        custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will
        be annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity
        analysis.
    indirect : `boolean`, optional (default: True)
        Passed through to `correlate_tf_motifs`.
    n_sketch : `int`, optional (default: 2500)
        Passed through to `correlate_tf_motifs`.
    n_permutations : `int`, optional (default: 100000)
        Passed through to `correlate_tf_motifs`.
    """
    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping
    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        ens2name = pd.read_csv(
            gene_map_file, sep="\t", index_col=0, names=["identifier", "name"]
        )
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    # Gene names are matched case-insensitively.
    link.index = link.index.str.upper()
    enh_genes = adata.var_names[
        adata.var_names.str.upper().isin(link.index)
    ].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]]
    # Average columns that share a name.
    # NOTE(review): groupby(axis=1) is deprecated in recent pandas releases;
    # left unchanged here to keep the result identical.
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)

    # Select top most variable enhancers
    enhancer_df = enhancer_df.loc[
        enhancer_df.var(1).sort_values().tail(num_enhancers).index
    ]

    # Center by mean of the most important cell types
    # Here we chose the majority cell type per cluster
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    # delete=False so the file can be reopened by name; it is removed
    # explicitly in the `finally` below instead of being left on disk.
    tmp = NamedTemporaryFile(delete=False)
    tmp.close()
    fname = tmp.name
    try:
        enhancer_df.to_csv(fname, sep="\t")
        logger.info("inferring motif activity")

        pfm = pfmfile_location(pfm)
        if maelstrom:
            with TemporaryDirectory() as tmpdir:
                run_maelstrom(
                    fname,
                    data.genome,
                    tmpdir,
                    center=False,
                    filter_redundant=True,
                )

                motif_act = pd.read_csv(
                    os.path.join(tmpdir, "final.out.txt"),
                    sep="\t",
                    comment="#",
                    index_col=0,
                )
                # The pattern is a regular expression. pandas >= 2.0 defaults
                # to regex=False, so it must be requested explicitly or the
                # "z-score " prefix is never stripped.
                motif_act.columns = motif_act.columns.str.replace(
                    r"z-score\s+", "", regex=True
                )
                # NOTE(review): this location lies inside tmpdir, which is
                # deleted when the `with` exits -- confirm that later users
                # of adata.uns["scepia"]["pfm"] do not need the file.
                pfm = pfmfile_location(
                    os.path.join(tmpdir, "nonredundant.motifs.pfm")
                )
        else:
            logger.info(f"Activity based on genome {data.genome}")
            motif_act = moap(
                fname,
                scoring="score",
                genome=data.genome,
                method="bayesianridge",
                pfmfile=pfm,
                ncpus=12,
            )
    finally:
        os.unlink(fname)

    adata.uns["scepia"]["pfm"] = pfm
    adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]["cell_types"]]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T
    ).T
    cell_motif_activity.index = adata.obs_names
    # Drop stale motif columns first so the join cannot collide.
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns)
    )
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(
        adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations
    )

    add_activity(adata)

    logger.info("Done with motif inference.")