예제 #1
0
def maelstrom(args):
    """Command-line entry point: unpack ``args`` and call ``run_maelstrom``.

    ``args`` is expected to expose the attributes ``inputfile``, ``genome``,
    ``outdir``, ``pfmfile``, ``filter_redundant``, ``filter_cutoff``,
    ``methods``, ``ncpus``, ``zscore``, ``center``, ``gc`` and
    ``aggregation``.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    # Positional arguments of run_maelstrom.
    infile, genome, outdir, pfmfile = (
        args.inputfile,
        args.genome,
        args.outdir,
        args.pfmfile,
    )
    # Keyword options; all attributes are read before any validation happens.
    options = {
        "filter_redundant": args.filter_redundant,
        "filter_cutoff": args.filter_cutoff,
        "methods": args.methods,
        "ncpus": args.ncpus,
        "zscore": args.zscore,
        "center": args.center,
        "gc": args.gc,
        "aggregation": args.aggregation,
    }

    input_present = os.path.exists(infile)
    if not input_present:
        raise ValueError("file {} does not exist".format(infile))

    # A non-empty, comma-separated string becomes a list of stripped names;
    # a falsy value (None, "") is passed through untouched.
    if options["methods"]:
        options["methods"] = [
            name.strip() for name in options["methods"].split(",")
        ]

    run_maelstrom(infile, genome, outdir, pfmfile, **options)
예제 #2
0
def maelstrom(args):
    """Command-line entry point: unpack ``args`` and call ``run_maelstrom``.

    ``args`` is expected to expose the attributes ``inputfile``, ``genome``,
    ``outdir``, ``pfmfile``, ``methods``, ``ncpus``, ``zscore`` and ``gc``.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    # Positional arguments of run_maelstrom.
    infile, genome, outdir, pfmfile = (
        args.inputfile,
        args.genome,
        args.outdir,
        args.pfmfile,
    )
    # Keyword options; all attributes are read before any validation happens.
    options = {
        "methods": args.methods,
        "ncpus": args.ncpus,
        "zscore": args.zscore,
        "gc": args.gc,
    }

    input_present = os.path.exists(infile)
    if not input_present:
        raise ValueError("file {} does not exist".format(infile))

    # A non-empty, comma-separated string becomes a list of stripped names;
    # a falsy value (None, "") is passed through untouched.
    if options["methods"]:
        options["methods"] = [
            name.strip() for name in options["methods"].split(",")
        ]

    run_maelstrom(infile, genome, outdir, pfmfile, **options)
예제 #3
0
    def test1_maelstrom(self):
        """Test Motif Activity by Ensemble Learning (maelstrom).

        Runs ``run_maelstrom`` on the fixture tables and checks the shape of
        the table read back from ``self.outfile``.
        """
        try:
            run_maelstrom(
                self.clusters,
                "mm10",
                self.outdir,
                score_table=self.score_table,
                count_table=self.count_table,
            )
            df = pd.read_table(self.outfile, index_col=0, comment="#")
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual((623, 4), df.shape)
        finally:
            # Remove generated files even when the run or the assertion
            # fails, so later tests start from a clean output directory.
            for fname in glob(os.path.join(self.outdir, "activity*")):
                os.unlink(fname)
            if os.path.exists(self.outfile):
                os.unlink(self.outfile)
예제 #4
0
def maelstrom(args):
    """Command-line entry point: unpack ``args`` and call ``run_maelstrom``.

    Reads ``inputfile``, ``genome`` and ``outdir`` from ``args``, passes the
    genome to ``check_genome`` and then starts the run.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    infile, genome, outdir = args.inputfile, args.genome, args.outdir

    input_present = os.path.exists(infile)
    if not input_present:
        raise ValueError("file {} does not exist".format(infile))

    # The genome is checked only after the input file is known to exist.
    check_genome(genome)

    run_maelstrom(infile, genome, outdir)
예제 #5
0
def maelstrom(args):
    """Run maelstrom for the input file, genome and output directory in ``args``.

    The attributes ``inputfile``, ``genome`` and ``outdir`` are read from
    ``args``. The genome is handed to ``check_genome`` before
    ``run_maelstrom`` is called.

    Raises
    ------
    ValueError
        If ``args.inputfile`` does not exist on disk.
    """
    # (infile, genome, outdir) in the positional order run_maelstrom expects.
    run_args = (args.inputfile, args.genome, args.outdir)
    infile = run_args[0]

    if os.path.exists(infile):
        check_genome(run_args[1])
        run_maelstrom(*run_args)
    else:
        raise ValueError("file {} does not exist".format(infile))
예제 #6
0
    def test1_maelstrom(self):
        """Test Motif Activity by Ensemble Learning (maelstrom).

        Runs ``run_maelstrom`` with plotting disabled on the fixture tables
        and checks the shape of the table read back from ``self.outfile``.
        """
        try:
            run_maelstrom(
                self.clusters,
                "mm10",
                self.outdir,
                score_table=self.score_table,
                count_table=self.count_table,
                plot=False,
            )
            df = pd.read_table(self.outfile, index_col=0, comment="#")
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12.
            self.assertEqual((623, 4), df.shape)
        finally:
            # Remove generated files even when the run or the assertion
            # fails, so later tests start from a clean output directory.
            for fname in glob(os.path.join(self.outdir, "activity*")):
                os.unlink(fname)
            if os.path.exists(self.outfile):
                os.unlink(self.outfile)
    def test1_maelstrom(self):
        """Test Motif Activity by Ensemble Learning (maelstrom).

        Runs ``run_maelstrom`` twice on the same fixture tables, first with
        ``filter_redundant=False`` and then with ``filter_redundant=True``,
        and checks the shape of the table read back from ``self.outfile``
        after each run.
        """
        try:
            # Without filtering redundant motifs
            run_maelstrom(
                self.clusters,
                "mm10",
                self.outdir,
                filter_redundant=False,
                score_table=self.score_table,
                count_table=self.count_table,
                plot=False,
            )
            df = pd.read_table(self.outfile, index_col=0, comment="#")
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed in Python 3.12; on failure it reports both shapes,
            # so no separate debug print is needed.
            self.assertEqual((623, 8), df.shape)

            # Filter redundant motifs
            run_maelstrom(
                self.clusters,
                "mm10",
                self.outdir,
                filter_redundant=True,
                score_table=self.score_table,
                count_table=self.count_table,
                plot=False,
            )
            df = pd.read_table(self.outfile, index_col=0, comment="#")
            self.assertEqual((156, 8), df.shape)
        finally:
            # Remove generated files even when a run or an assertion fails,
            # so later tests start from a clean output directory.
            for pattern in ("activity*", "gimme.verte*"):
                for fname in glob(os.path.join(self.outdir, pattern)):
                    os.unlink(fname)
            if os.path.exists(self.outfile):
                os.unlink(self.outfile)
예제 #8
0
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified in place:

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients (read here to project motif activity onto
        cells).
    **scepia** : `adata.uns` field
        Gains the keys ``"pfm"`` (motif file location) and
        ``"motif_activity"`` (motif activity per cell type).
    `adata.obs`
        Gains one column per motif with the cell-specific motif activity;
        existing columns with the same names are replaced.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater than the
        number of hypervariable genes in `adata` then all variable genes are
        used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used
        if this parameter is not specified. This can be a filename, or a
        pfm name supported by GimmeMotifs such as `JASPAR2018_vertebrates`.
        If a custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will be
        annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity analysis.
    indirect : `boolean`, optional (default: True)
        Passed through to `correlate_tf_motifs`.
    n_sketch : `int`, optional (default: 2500)
        Passed through to `correlate_tf_motifs`.
    n_permutations : `int`, optional (default: 100000)
        Passed through to `correlate_tf_motifs`.

    Returns
    -------
    None
        All results are stored on `adata`.
    """

    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping

    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        # Map gene identifiers to gene names and index the links by name.
        ens2name = pd.read_csv(gene_map_file,
                               sep="\t",
                               index_col=0,
                               names=["identifier", "name"])
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    # Match genes case-insensitively by upper-casing both sides.
    link.index = link.index.str.upper()
    enh_genes = adata.var_names[adata.var_names.str.upper().isin(
        link.index)].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers,
                                  adata.uns["scepia"]["cell_types"]]
    # Average columns that share the same label.
    # NOTE(review): groupby(..., axis=1) is deprecated in recent pandas
    # releases; left unchanged here to keep the numerical path identical.
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)
    # Select top most variable enhancers
    enhancer_df = enhancer_df.loc[enhancer_df.var(1).sort_values().tail(
        num_enhancers).index]
    # Center by mean of the most important cell types
    # Here we chose the majority cell type per cluster
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    # Only the path of the temporary file is needed, so close the handle
    # right away; the file itself is removed in the ``finally`` below.
    with NamedTemporaryFile(delete=False) as tmp_handle:
        fname = tmp_handle.name
    try:
        enhancer_df.to_csv(fname, sep="\t")
        logger.info("inferring motif activity")

        pfm = pfmfile_location(pfm)
        if maelstrom:
            with TemporaryDirectory() as tmpdir:
                # NOTE(review): the user-supplied ``pfm`` is not forwarded to
                # run_maelstrom here — confirm this is intentional.
                run_maelstrom(
                    fname,
                    data.genome,
                    tmpdir,
                    center=False,
                    filter_redundant=True,
                )

                motif_act = pd.read_csv(
                    os.path.join(tmpdir, "final.out.txt"),
                    sep="\t",
                    comment="#",
                    index_col=0,
                )
                # regex=True must be explicit: since pandas 2.0 str.replace
                # treats the pattern as a literal string by default, which
                # would leave the "z-score " prefix on every column name.
                motif_act.columns = motif_act.columns.str.replace(
                    r"z-score\s+", "", regex=True)
                # NOTE(review): this path lies inside ``tmpdir``, which is
                # deleted when the ``with`` block exits; the value stored in
                # adata.uns["scepia"]["pfm"] may point to a removed file
                # unless pfmfile_location copies it — confirm.
                pfm = pfmfile_location(
                    os.path.join(tmpdir, "nonredundant.motifs.pfm"))
        else:
            logger.info(f"Activity based on genome {data.genome}")
            motif_act = moap(
                fname,
                scoring="score",
                genome=data.genome,
                method="bayesianridge",
                pfmfile=pfm,
                ncpus=12,
            )
    finally:
        # The enhancer table is no longer needed once activity is computed
        # (or the computation failed); do not leave it behind on disk.
        if os.path.exists(fname):
            os.unlink(fname)

    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]
                                                      ["cell_types"]]

    logger.info("calculating cell-specific motif activity")
    # (motifs x cell types) @ (cell types x cells), transposed to cells x motifs.
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T).T
    cell_motif_activity.index = adata.obs_names
    # Drop motif columns left over from an earlier run so the join below
    # does not fail on overlapping column names.
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns))
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(adata,
                        indirect=indirect,
                        n_sketch=n_sketch,
                        n_permutations=n_permutations)

    add_activity(adata)

    logger.info("Done with motif inference.")