Пример #1
0
def moap_with_table(input_table, motif_table, data_dir, method, scoring):
    """Call ``moap`` with a motif table and an output path inside ``data_dir``.

    The output path is ``<data_dir>/activity.<method>.<scoring>.out.txt`` and
    is handed to ``moap`` as ``outfile``.

    Parameters
    ----------
    input_table
        Forwarded to ``moap`` as its first positional argument.
    motif_table
        Forwarded to ``moap`` as ``motiffile``.
    data_dir : str
        Directory used to build the output path.
    method, scoring
        Forwarded to ``moap`` and also embedded in the output file name.
    """
    filename = "activity.{}.{}.out.txt".format(method, scoring)
    moap(
        input_table,
        outfile=os.path.join(data_dir, filename),
        method=method,
        scoring=scoring,
        motiffile=motif_table,
    )
Пример #2
0
def moap_with_bg(input_table, genome, data_dir, method, scoring, pwmfile=None, ncpus=None):
    """Call ``moap`` for a genome with an output path inside ``data_dir``.

    The output path is ``<data_dir>/activity.<method>.<scoring>.out.txt`` and
    is handed to ``moap`` as ``outfile``. The module-level ``FPR`` constant is
    passed as ``fpr``.

    Parameters
    ----------
    input_table
        Forwarded to ``moap`` as its first positional argument.
    genome
        Forwarded to ``moap`` as ``genome``.
    data_dir : str
        Directory used to build the output path.
    method, scoring
        Forwarded to ``moap`` and also embedded in the output file name.
    pwmfile : optional
        Custom motif file. When given it is forwarded to ``moap`` as
        ``pwmfile``; previously this argument was accepted but ignored.
        NOTE(review): assumes ``moap`` accepts a ``pwmfile`` keyword -- confirm
        against the ``moap`` signature of the installed version.
    ncpus : optional
        Forwarded to ``moap`` as ``ncpus``.
    """
    outfile = os.path.join(
        data_dir, "activity.{}.{}.out.txt".format(method, scoring)
    )

    options = {
        "outfile": outfile,
        "genome": genome,
        "method": method,
        "scoring": scoring,
        "fpr": FPR,
        "ncpus": ncpus,
    }
    # Only forward the motif file when the caller supplied one, so calls that
    # rely on the default behave exactly as before.
    if pwmfile is not None:
        options["pwmfile"] = pwmfile

    moap(input_table, **options)
Пример #3
0
def moap_with_bg(input_table, genome, data_dir, method, scoring):
    """Call ``moap`` for a genome using the cutoff from ``check_threshold``.

    ``check_threshold(data_dir, genome, scoring)`` is evaluated first and its
    return value is handed to ``moap`` as ``cutoff``. The output path
    ``<data_dir>/activity.<method>.<scoring>.out.txt`` is passed as
    ``outfile``.

    Parameters
    ----------
    input_table
        Forwarded to ``moap`` as its first positional argument.
    genome
        Forwarded to both ``check_threshold`` and ``moap``.
    data_dir : str
        Directory used for the threshold lookup and the output path.
    method, scoring
        Forwarded to ``moap`` and also embedded in the output file name.
    """
    cutoff = check_threshold(data_dir, genome, scoring)
    filename = "activity.{}.{}.out.txt".format(method, scoring)

    moap(
        input_table,
        outfile=os.path.join(data_dir, filename),
        genome=genome,
        method=method,
        scoring=scoring,
        cutoff=cutoff,
    )
Пример #4
0
def moap_with_table(input_table, motif_table, data_dir, method, scoring):
    """Run ``moap`` with a motif table, targeting a file in ``data_dir``.

    Parameters
    ----------
    input_table
        First positional argument for ``moap``.
    motif_table
        Passed to ``moap`` as ``motiffile``.
    data_dir : str
        Directory in which the output path
        ``activity.<method>.<scoring>.out.txt`` is located.
    method, scoring
        Passed through to ``moap``; both are part of the output file name.
    """
    options = {
        "outfile": os.path.join(
            data_dir, "activity.{}.{}.out.txt".format(method, scoring)
        ),
        "method": method,
        "scoring": scoring,
        "motiffile": motif_table,
    }
    moap(input_table, **options)
Пример #5
0
def moap_with_bg(input_table, genome, data_dir, method, scoring):
    """Run ``moap`` for a genome with a cutoff obtained from ``check_threshold``.

    Parameters
    ----------
    input_table
        First positional argument for ``moap``.
    genome
        Passed to ``check_threshold`` and to ``moap``.
    data_dir : str
        Passed to ``check_threshold``; also the directory of the output path
        ``activity.<method>.<scoring>.out.txt``.
    method, scoring
        Passed through to ``moap``; both are part of the output file name.
    """
    # Resolve the threshold before anything else, as the original did.
    options = {"cutoff": check_threshold(data_dir, genome, scoring)}
    options["outfile"] = os.path.join(
        data_dir, "activity.{}.{}.out.txt".format(method, scoring)
    )
    options.update(genome=genome, method=method, scoring=scoring)

    moap(input_table, **options)
Пример #6
0
def moap_with_bg(
    input_table, genome, data_dir, method, scoring, pfmfile=None, ncpus=None
):
    """Run ``moap`` for a genome, targeting a file in ``data_dir``.

    The module-level ``FPR`` constant is passed to ``moap`` as ``fpr``.

    Parameters
    ----------
    input_table
        First positional argument for ``moap``.
    genome
        Passed to ``moap`` as ``genome``.
    data_dir : str
        Directory of the output path ``activity.<method>.<scoring>.out.txt``.
    method, scoring
        Passed through to ``moap``; both are part of the output file name.
    pfmfile : optional
        Passed to ``moap`` as ``pfmfile``.
    ncpus : optional
        Passed to ``moap`` as ``ncpus``.
    """
    filename = "activity.{}.{}.out.txt".format(method, scoring)
    options = {
        "outfile": os.path.join(data_dir, filename),
        "pfmfile": pfmfile,
        "genome": genome,
        "method": method,
        "scoring": scoring,
        "fpr": FPR,
        "ncpus": ncpus,
    }
    moap(input_table, **options)
Пример #7
0
    def test2_moap(self):
        """Test motif activity prediction for two clusters."""
        # Score-based methods use the score motif table.
        for method in ["mwu", "rf", "lightningclassification"]:
            df = moap(
                self.clusters2,
                method=method,
                scoring="score",
                motiffile=self.motifs_score2,
            )
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual((623, 2), df.shape)

        # Count-based methods use the count motif table.
        for method in ["hypergeom"]:
            df = moap(
                self.clusters2,
                method=method,
                scoring="count",
                motiffile=self.motifs_count2,
            )
            self.assertEqual((623, 2), df.shape)
Пример #8
0
    def predict_factor_activity(self, outfile, nregions=20000):
        """Predict TF activity and write it to a tab-separated file.

        Every column of the ATAC and histone tables (when present) is scaled,
        written to a temporary file and passed to ``moap`` with
        ``method="bayesianridge"``. The resulting motif activities are
        rank-aggregated, min-max scaled, and the maximum over the motifs of
        each factor in ``self.f2m`` is reported.

        Parameters
        ----------
        outfile : str
            Name of outputfile. It receives two columns, ``factor`` and
            ``activity``.
        nregions : int, optional
            Tables with at least this many rows are randomly subsampled to
            ``nregions`` rows before being passed to ``moap``.
        """
        # Collect one set of motif activity columns per signal column.
        motif_activity = pd.DataFrame()
        for table in (self._atac_data, self._histone_data):
            if table is None:
                continue

            needs_sampling = table.shape[0] >= nregions
            for name in table.columns:
                with NamedTemporaryFile() as tmp:
                    # float16 will give NaN's
                    values = table[name].astype("float32")
                    scaled = pd.DataFrame({name: scale(values)}, index=table.index)
                    if needs_sampling:
                        scaled = scaled.sample(nregions)
                    scaled.to_csv(tmp.name, sep="\t")
                    try:
                        result = moap(
                            tmp.name,
                            genome=self.genome,
                            method="bayesianridge",
                            pfmfile=self.pfmfile,
                        )
                        motif_activity = motif_activity.join(result, how="outer")
                    except Exception as e:
                        # Best effort: a failing column is reported and skipped.
                        print(e)
                    print(motif_activity)

        # Rank aggregation: rank within each column, average across columns,
        # then rescale the averaged ranks.
        for name in motif_activity:
            motif_activity[name] = rankdata(motif_activity[name])
        motif_activity = motif_activity.mean(1)
        motif_activity[:] = minmax_scale(motif_activity)

        # Take the maximum activity from the motifs of each factor
        rows = [
            [factor, motif_activity.loc[motifs].max()]
            for factor, motifs in self.f2m.items()
        ]
        result_table = pd.DataFrame(rows, columns=["factor", "activity"])
        result_table.to_csv(outfile, sep="\t", index=False)
Пример #9
0
    def test1_moap(self):
        """Test motif activity prediction."""
        # Score-based methods use the score motif table.
        for method in ["mwu", "rf"]:
            df = moap(
                self.clusters,
                method=method,
                scoring="score",
                motiffile=self.motifs_score,
            )
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual((623, 4), df.shape)

        # Count-based methods use the count motif table.
        for method in ["hypergeom"]:
            df = moap(
                self.clusters,
                method=method,
                scoring="count",
                motiffile=self.motifs_count,
            )
            self.assertEqual((623, 4), df.shape)
Пример #10
0
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified with the following fields.

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients.

    In addition this function stores the motif file location in
    `adata.uns["scepia"]["pfm"]`, the per-cell-type motif activity in
    `adata.uns["scepia"]["motif_activity"]`, and joins the cell-specific
    motif activity as columns onto `adata.obs`.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater than the
        number of hypervariable genes in `adata` then all variable genes are
        used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used
        if this parameter is not specified. This can be a filename, or a
        pfm name supported by GimmeMotifs such as `JASPAR2018_vertebrates`.
        If a custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will be
        annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity analysis.
    indirect : `boolean`, optional (default: True)
        Forwarded to `correlate_tf_motifs`.
    n_sketch : `int`, optional (default: 2500)
        Forwarded to `correlate_tf_motifs`.
    n_permutations : `int`, optional (default: 100000)
        Forwarded to `correlate_tf_motifs`.
    """

    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping

    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        # Re-index the gene/enhancer links by gene name instead of identifier.
        ens2name = pd.read_csv(gene_map_file,
                               sep="\t",
                               index_col=0,
                               names=["identifier", "name"])
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    # Match genes case-insensitively by upper-casing both sides.
    link.index = link.index.str.upper()
    enh_genes = adata.var_names[adata.var_names.str.upper().isin(
        link.index)].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers,
                                  adata.uns["scepia"]["cell_types"]]
    # Average columns that share the same name.
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)
    # Select top most variable enhancers
    enhancer_df = enhancer_df.loc[enhancer_df.var(1).sort_values().tail(
        num_enhancers).index]
    # Center by mean of the most important cell types
    # Here we chose the majority cell type per cluster
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    # Reserve a temporary path for the enhancer table. The handle is closed
    # right away so the path can be reopened by name, and the file is removed
    # once the motif activity has been computed.
    with NamedTemporaryFile(delete=False) as tmp:
        fname = tmp.name
    try:
        enhancer_df.to_csv(fname, sep="\t")
        logger.info("inferring motif activity")

        pfm = pfmfile_location(pfm)
        if maelstrom:
            with TemporaryDirectory() as tmpdir:
                run_maelstrom(
                    fname,
                    data.genome,
                    tmpdir,
                    center=False,
                    filter_redundant=True,
                )

                motif_act = pd.read_csv(
                    os.path.join(tmpdir, "final.out.txt"),
                    sep="\t",
                    comment="#",
                    index_col=0,
                )
                # The pattern is a regular expression; regex=True is explicit
                # because pandas >= 2.0 treats the pattern as a literal by
                # default.
                motif_act.columns = motif_act.columns.str.replace(
                    r"z-score\s+", "", regex=True)
                # NOTE(review): this path lies inside tmpdir, which is removed
                # when the with-block exits -- confirm that later users of
                # adata.uns["scepia"]["pfm"] do not need the file itself.
                pfm = pfmfile_location(
                    os.path.join(tmpdir, "nonredundant.motifs.pfm"))
        else:
            logger.info(f"Activity based on genome {data.genome}")
            motif_act = moap(
                fname,
                scoring="score",
                genome=data.genome,
                method="bayesianridge",
                pfmfile=pfm,
                ncpus=12,
            )
    finally:
        # Best-effort cleanup; never mask an error from the analysis itself.
        try:
            os.remove(fname)
        except OSError:
            pass
    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]
                                                      ["cell_types"]]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T).T
    cell_motif_activity.index = adata.obs_names
    # Drop columns left over from a previous run so the join below cannot
    # collide on column names.
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns))
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(adata,
                        indirect=indirect,
                        n_sketch=n_sketch,
                        n_permutations=n_permutations)

    add_activity(adata)

    logger.info("Done with motif inference.")