Exemplo n.º 1
0
    def __init__(self, ncore=1, genome="hg38", gene_bed=None, pfmfile=None, include_notfs=False, rm_curated=True, etype="hg38H3K27ac", tffile=None):
        """Resolve the model, transcription factor and motif files for this instance.

        Parameters
        ----------
        ncore : int, optional
            Stored as ``self.ncore``.
        genome : str, optional
            Genome name, stored as ``self.genome``.
        gene_bed : optional
            Accepted but not used by this constructor.
        pfmfile : str, optional
            Handed to ``pfmfile_location`` to resolve the motif file.
        include_notfs : bool, optional
            Forwarded to ``clear_tfs``.
        rm_curated : bool, optional
            Forwarded to ``clear_tfs``.
        etype : str, optional
            Enhancer data type: "hg38H3K27ac" (only together with genome
            "hg38"), "p300" or "ATAC".
        tffile : str, optional
            Transcription factor file. Defaults to ``db/tfs.txt`` inside the
            ananse package directory.

        Raises
        ------
        TypeError
            If the genome / enhancer type combination is not supported.
        """
        self.ncore = ncore
        self.genome = genome
        self.etype = etype

        # Bundled data files are looked up relative to the ananse package.
        package_dir = os.path.dirname(ananse.__file__)

        # Pick the model file that matches the enhancer data type.
        if self.etype == "hg38H3K27ac" and self.genome == "hg38":
            model_name = "dream_model_h3k27ac.txt"
        elif self.etype in ("p300", "ATAC"):
            model_name = "dream_model_p300.txt"
        else:
            message = (
                "The input enhancer data type should hg38H3K27ac, p300 or ATAC. \n"
                "            It is not possible set -e to hg38H3K27ac if the genome is not hg38. \n"
                "            Please provide a enhancer type with -e argument. By default is hg38H3K27ac."
            )
            raise TypeError(message)
        self.model = os.path.join(package_dir, "db", model_name)

        # Filtering options that are passed on to clear_tfs below.
        self.include_notfs = include_notfs
        self.rm_curated = rm_curated

        # Fall back to the packaged transcription factor list.
        if tffile is None:
            tffile = os.path.join(package_dir, "db", "tfs.txt")
        self.tffile = tffile

        # Motif file and the motif-to-factor table that sits next to it.
        self.pfmfile = pfmfile_location(pfmfile)
        self.motifs2factors = self.pfmfile.replace(".pfm", ".motif2factors.txt")
        self.filtermotifs2factors = clear_tfs(
            self.motifs2factors,
            self.tffile,
            self.include_notfs,
            self.rm_curated,
        )
Exemplo n.º 2
0
    def __init__(
        self,
        peak_weights,
        motif_weights,
        pfmfile=None,
        model=None,
        curation_filter=None,
        tf_list=None,
        whitelist=True,
        ncore=1,
        verbose=True,
    ):
        """Store the scoring inputs and resolve the motif-to-factor table and model.

        Parameters
        ----------
        peak_weights
            Output from ScorePeaks.
        motif_weights
            Output from ScoreMotifs.
        pfmfile : str, optional
            Handed to ``pfmfile_location``; the ``.pfm`` part of the resulting
            path is replaced by ``.motif2factors.txt``.
        model : str, optional
            Model file. Defaults to ``db/dream_model_p300.pickle`` next to
            this module.
        curation_filter, tf_list, whitelist
            Forwarded unchanged to ``self.filter_transcription_factors``.
        ncore : int, optional
            Stored as ``self.ncore``.
        verbose : bool, optional
            Stored as ``self.verbose``.
        """
        self.peak_weights = peak_weights
        self.motif_weights = motif_weights

        # The file attribute is set before the filter call, as in the
        # original statement order.
        pfm_path = pfmfile_location(pfmfile)
        self.motifs2factors_file = pfm_path.replace(".pfm", ".motif2factors.txt")
        self.motifs2factors = self.filter_transcription_factors(
            curation_filter, tf_list, whitelist
        )

        # Use the model file shipped alongside this module when none is given.
        if model is None:
            module_dir = os.path.dirname(__file__)
            model = os.path.join(module_dir, "db", "dream_model_p300.pickle")
        self.model = model

        self.ncore = ncore
        self.verbose = verbose
Exemplo n.º 3
0
def logo(args):
    """Write a PNG sequence logo for every selected motif.

    ``args`` must provide ``pfmfile``, ``ids``, ``kind`` and ``title``.
    When neither a PFM file nor motif ids are given, a usage hint naming the
    default PFM file is printed and the process exits with status 1.
    ``args.ids`` is a comma-separated string of motif ids; when set, only
    motifs with one of those ids are plotted. Each logo is saved as
    ``<motif id>.png``.
    """
    if args.pfmfile is None and args.ids is None:
        default_pfm = os.path.basename(pfmfile_location(None))
        name, _ext = os.path.splitext(default_pfm)
        usage_lines = (
            "Use the -i argument to specify which motif ids you want to use for logos.",
            "If you really want to create logos for all of the motifs in the default",
            "PFM file use the following command:",
            f"gimme logo -p {name}",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    motifs = read_motifs(args.pfmfile)
    if args.ids:
        wanted = args.ids.split(",")
        motifs = [motif for motif in motifs if motif.id in wanted]

    for motif in motifs:
        motif.plot_logo(fname=f"{motif.id}.png", kind=args.kind, title=args.title)
Exemplo n.º 4
0
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values. A filename ending in
        "feather" is read as a feather file, anything else as a
        tab-separated table.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring:  str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values. If this file
        already exists and its shape matches the expected result, it is
        returned as-is and nothing is recomputed.

    motiffile : str or pandas.DataFrame, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.

    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning. Currently not used by this function.

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the config.

    subsample : float, optional
        Fraction of the input regions to sample before fitting.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        If `scoring` is invalid, the input table does not fit the method, or
        `genome` is missing while motifs have to be scanned.
    """

    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    elif np.dtype("object") in set(df.dtypes):
        raise ValueError("columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            motif_list = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # Reuse the motifs parsed above instead of reading the file again.
        motif_names = [m.id for m in motif_list]

        # scan for motifs
        if method == "classic" or scoring == "count":
            scan_type = "count"
            logger.info("motif scanning (counts)")
        else:
            scan_type = "score"
            logger.info("motif scanning (scores)")
        scores = scan_regionfile_to_table(
            inputfile,
            genome,
            scan_type,
            pfmfile=pfmfile,
            ncpus=ncpus,
            zscore=zscore,
            gc=gc,
        )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)

    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        # A single input column holds cluster labels: one output column per
        # unique label is expected.
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    # Align the motif table with the (possibly subsampled) input regions.
    motifs = motifs.loc[df.index]

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the activity table are written in one pass.
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
Exemplo n.º 5
0
def infer_motifs(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    pfm: Optional[str] = None,
    min_annotated: Optional[int] = 50,
    num_enhancers: Optional[int] = 10000,
    maelstrom: Optional[bool] = False,
    indirect: Optional[bool] = True,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
) -> None:
    """Infer motif activity for single cell RNA-seq data.

    The adata object is modified with the following fields.

    **X_cell_types** : `adata.obsm` field
        Cell type coefficients.

    In addition, `adata.uns["scepia"]` receives the entries `pfm` and
    `motif_activity`, and the per-cell motif activity is joined onto
    `adata.obs`.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    dataset : `str`
        Name of reference data set or directory with reference data.
    cluster : `str`, optional (default: "louvain")
        Name of the clustering, can be either louvain or leiden.
    n_top_genes : `int`, optional (default: 1000)
        Number of variable genes that is used. If `n_top_genes` is greater than the
        number of hypervariable genes in `adata` then all variable genes are
        used.
    max_cell_types : `int`, optional (default: 50)
        Maximum number of cell types to select.
    pfm : `str`, optional (default: None)
        Name of motif file in PFM format. The GimmeMotifs default is used
        if this parameter is not specified. This can be a filename, or a
        pfm name support by GimmeMotifs such as `JASPAR2018_vertebrates`.
        If a custom PFM file is specified, there should also be an associated
        `.motif2factors.txt` file.
    min_annotated : `int`, optional (default: 50)
        Cells that are annotated with cell types less than this number will be
        annotated as "other".
    num_enhancers : `int`, optional (default: 10000)
        Number of enhancers to use for motif activity analysis.
    maelstrom : `boolean`, optional (default: False)
        Use maelstrom instead of ridge regression for motif activity analysis.
    indirect : `boolean`, optional (default: True)
        Passed on to `correlate_tf_motifs`.
    n_sketch : `int`, optional (default: 2500)
        Passed on to `correlate_tf_motifs`.
    n_permutations : `int`, optional (default: 100000)
        Passed on to `correlate_tf_motifs`.
    """

    use_name = True

    validate_adata(adata)

    data = ScepiaDataset(dataset)

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    # Annotate each cell with H3K27ac reference
    if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs:
        annotate_cells(
            adata,
            dataset=dataset,
            cluster=cluster,
            n_top_genes=n_top_genes,
            min_annotated=min_annotated,
            max_cell_types=max_cell_types,
        )

    logger.info("Linking variable genes to differential enhancers.")
    gene_map_file = data.gene_mapping

    link_file = data.link_file
    link = pd.read_feather(link_file)
    if use_name:
        # Translate gene identifiers to gene names and index the links by name.
        ens2name = pd.read_csv(gene_map_file,
                               sep="\t",
                               index_col=0,
                               names=["identifier", "name"])
        link = link.join(ens2name, on="gene").dropna()
        link = link.set_index("name")

    # Match genes case-insensitively by upper-casing both sides.
    link.index = link.index.str.upper()
    enh_genes = adata.var_names[adata.var_names.str.upper().isin(
        link.index)].str.upper()
    var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique()

    enhancer_df = data.load_reference_data(reftype="enhancer")
    enhancer_df.index = change_region_size(enhancer_df.index)
    enhancer_df = enhancer_df.loc[var_enhancers,
                                  adata.uns["scepia"]["cell_types"]]
    # Average columns that share the same cell type name.
    enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean()
    enhancer_df.loc[:, :] = scale(enhancer_df)
    # Select top most variable enhancers
    enhancer_df = enhancer_df.loc[enhancer_df.var(1).sort_values().tail(
        num_enhancers).index]
    # Center by mean of the most import cell types
    # Here we chose the majority cell type per cluster
    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    mean_value = enhancer_df[cluster_cell_types].mean(1)
    enhancer_df = enhancer_df.sub(mean_value, axis=0)

    # The enhancer table is handed to the activity methods through a
    # temporary file. Only the name is needed, so the handle is closed
    # right away and the file is removed once the activity is computed.
    with NamedTemporaryFile(delete=False) as tmp:
        fname = tmp.name
    try:
        enhancer_df.to_csv(fname, sep="\t")
        logger.info("inferring motif activity")

        pfm = pfmfile_location(pfm)
        if maelstrom:
            with TemporaryDirectory() as tmpdir:
                # Pass the resolved motif file so that a user-specified `pfm`
                # is honoured here as well.
                run_maelstrom(
                    fname,
                    data.genome,
                    tmpdir,
                    pfmfile=pfm,
                    center=False,
                    filter_redundant=True,
                )

                motif_act = pd.read_csv(
                    os.path.join(tmpdir, "final.out.txt"),
                    sep="\t",
                    comment="#",
                    index_col=0,
                )
                # The pattern is a regular expression; say so explicitly,
                # because pandas >= 2.0 treats patterns literally by default.
                motif_act.columns = motif_act.columns.str.replace(
                    r"z-score\s+", "", regex=True)
                # NOTE(review): tmpdir is removed when this with-block exits,
                # so the stored path presumably no longer exists afterwards
                # -- confirm how adata.uns["scepia"]["pfm"] is used.
                pfm = pfmfile_location(
                    os.path.join(tmpdir, "nonredundant.motifs.pfm"))
        else:
            logger.info(f"Activity based on genome {data.genome}")
            motif_act = moap(
                fname,
                scoring="score",
                genome=data.genome,
                method="bayesianridge",
                pfmfile=pfm,
                ncpus=12,
            )
    finally:
        os.unlink(fname)

    adata.uns["scepia"]["pfm"] = pfm

    adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"]
                                                      ["cell_types"]]

    logger.info("calculating cell-specific motif activity")
    cell_motif_activity = (
        adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T).T
    cell_motif_activity.index = adata.obs_names
    # Drop columns left over from an earlier run before joining the new ones.
    adata.obs = adata.obs.drop(
        columns=cell_motif_activity.columns.intersection(adata.obs.columns))
    adata.obs = adata.obs.join(cell_motif_activity)

    correlate_tf_motifs(adata,
                        indirect=indirect,
                        n_sketch=n_sketch,
                        n_permutations=n_permutations)

    add_activity(adata)

    logger.info("Done with motif inference.")
Exemplo n.º 6
0
def select_nonredundant_motifs(roc_report,
                               pfmfile,
                               fg_table,
                               bg_table,
                               tolerance=0.001):
    """Select a small set of non-redundant motifs that separates fg from bg.

    Motifs with at least 2-fold enrichment at 1% FPR are ranked on their
    average rank over four metrics. Going down the ranking, every motif
    similar to an already kept motif (seqcor score >= 0.7) is marked as
    redundant. The kept motifs are then reduced further by recursive feature
    elimination with an L1-penalized logistic regression.

    Parameters
    ----------
    roc_report : str
        Tab-separated file indexed by motif id, with the columns "ROC AUC",
        "PR AUC", "Enr. at 1% FPR" and "Recall at 10% FDR".

    pfmfile : str
        Motif file, resolved with `pfmfile_location`.

    fg_table : str
        Tab-separated table with motif scan results of the foreground
        sequences ("#" lines are treated as comments).

    bg_table : str
        Same as `fg_table`, for the background sequences.

    tolerance : float, optional
        Feature elimination stops at the first feature count for which the
        cross-validated score improves by less than `tolerance` times the
        score obtained with all kept motifs.

    Returns
    -------
    pandas Index with the ids of the selected motifs.
    """
    pfmfile = pfmfile_location(pfmfile)
    motifs = read_motifs(pfmfile)
    motif_dict = read_motifs(pfmfile, as_dict=True)

    mc = MotifComparer()

    df = pd.read_csv(roc_report, sep="\t", index_col=0)
    df = df[df["Enr. at 1% FPR"] >= 2]
    motifs = [m for m in motifs if m.id in df.index]

    cols = ["ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR"]
    rank = df[cols].rank().mean(1).sort_values(ascending=False)

    redundant_motifs = []
    keep = []
    while df[~df.index.isin(redundant_motifs)].shape[0] > 0:
        # Best-ranked motif that has not been marked as redundant yet.
        motif = rank[~rank.index.isin(redundant_motifs)].head(1).index[0]
        keep.append(motif)

        result = mc.get_all_scores(
            [motif_dict[motif]],
            [m for m in motifs if m.id not in redundant_motifs],
            "partial",
            "seqcor",
            "mean",
        )
        result = result[motif]
        redundant_motifs += [m for m in result.keys() if result[m][0] >= 0.7]
    logger.debug(f"Selected {len(keep)} motifs for feature elimination")

    # Read motif scan results
    fg_table = pd.read_csv(fg_table, index_col=0, comment="#", sep="\t")
    bg_table = pd.read_csv(bg_table, index_col=0, comment="#", sep="\t")

    # Foreground rows are labelled 1, background rows 0.
    X = pd.concat((fg_table, bg_table), axis=0)
    y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.4,
        random_state=2,
        shuffle=True,
    )

    X_keep = X_train[keep]
    model = LogisticRegression(solver="liblinear", max_iter=500, penalty="l1")
    # Reference score using all kept motifs; scales the tolerance below.
    max_score = np.mean(
        cross_val_score(model,
                        X_keep,
                        y_train,
                        cv=5,
                        scoring="average_precision"))
    mean_scores = []
    step = 1

    logger.info("selecting non-redundant motifs")
    # NOTE(review): n_features stays 1 when the loop below never hits the
    # break, i.e. when every added feature keeps improving the score.
    # Confirm that this is intended.
    n_features = 1
    for i in range(1, X_keep.shape[1], step):
        # n_features_to_select has to be passed by keyword: scikit-learn
        # >= 1.0 no longer accepts it as a positional argument.
        rfe = RFE(model, n_features_to_select=i)
        fit = rfe.fit(X_keep, y_train)
        mean_score = np.mean(
            cross_val_score(
                model,
                X_keep.loc[:, fit.support_],
                y_train,
                cv=5,
                scoring="average_precision",
            ))
        if i > 1 and mean_score - mean_scores[-1] < (max_score * tolerance):
            n_features = i - 1
            break
        mean_scores.append(mean_score)

    rfe = RFE(model, n_features_to_select=n_features)
    fit = rfe.fit(X_keep, y_train)

    selected_features = X_keep.columns[fit.support_]
    # Report performance of the final feature set on the held-out split.
    model.fit(X_train.loc[:, selected_features], y_train)
    y_pred = model.predict_proba(X_test.loc[:, selected_features])[:, 1]
    pr_auc = average_precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    logger.info(
        f"selected {len(selected_features)} non-redundant motifs: ROC AUC {roc_auc:.3f}, PR AUC {pr_auc:.3f}"
    )
    return selected_features
Exemplo n.º 7
0
def run_maelstrom(
    infile,
    genome,
    outdir,
    pfmfile=None,
    plot=True,
    cluster=False,
    score_table=None,
    count_table=None,
    methods=None,
    ncpus=None,
    zscore=True,
    gc=True,
):
    """Run maelstrom on an input table.

    All results are written to `outdir`; nothing is returned. The process
    exits with status 1 when none of the requested methods can be run on the
    input.

    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    outdir : str
        Output directory for all results. Created if it does not exist.

    pfmfile : str, optional
        Specify a PFM file for scanning.

    plot : bool, optional
        Create heatmaps.

    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not
        well-tested.

    score_table : str, optional
        Filename of pre-calculated table with motif scores.

    count_table : str, optional
        Filename of pre-calculated table with motif counts.

    methods : list, optional
        Activity methods to use. By default are all used.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins to normalize motif scores.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")

    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        logger.warning("Input file contains duplicate regions!")
        logger.warning("These will be removed.")
        df = df.iloc[~df.index.duplicated(keep=False)]

    # Also creates missing parent directories and does not fail when the
    # directory already exists.
    os.makedirs(outdir, exist_ok=True)

    if methods is None:
        methods = Moap.list_predictors()
    methods = [m.lower() for m in methods]

    # From here on, work with the de-duplicated copy inside the output dir.
    df.to_csv(os.path.join(outdir, "input.table.txt"), sep="\t")
    infile = os.path.join(outdir, "input.table.txt")

    # Copy the motif information
    pfmfile = pfmfile_location(pfmfile)
    if pfmfile:
        shutil.copy2(pfmfile, outdir)
        # Escape the dot so that only a real ".pfm" / ".pwm" extension matches.
        mapfile = re.sub(r"\.p[fw]m$", ".motif2factors.txt", pfmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)

    # Create a file with the number of motif matches
    if count_table is None:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("motif scanning (counts)")
            counts = scan_to_table(
                infile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            counts.to_csv(count_table, sep="\t", compression="gzip")
        else:
            logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if score_table is None:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("motif scanning (scores)")
            scores = scan_to_table(
                infile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            scores.to_csv(score_table,
                          sep="\t",
                          float_format="%.3f",
                          compression="gzip")
        else:
            logger.info("Scores, using: %s", score_table)

    if cluster:
        # Clustering is only useful when at least one of the requested
        # methods is a classification method.
        cluster = False
        for method in methods:
            m = Moap.create(method, ncpus=ncpus)
            if m.ptype == "classification":
                cluster = True
                break
        if not cluster:
            logger.info("Skipping clustering, no classification methods")

    # Each entry: [method name, preferred table type, input file].
    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(
                outdir,
                os.path.basename(infile) + ".cluster.txt")

            # A region is assigned to a column when its scaled value exceeds
            # the maximum of all other columns by more than 0.5.
            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[(df[name] -
                                df.loc[:, df.columns != name].max(1)) > 0.5,
                               "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count" and count_table is not None:
                moap_with_table(fname,
                                count_table,
                                outdir,
                                method,
                                scoring,
                                ncpus=ncpus)
            elif scoring == "score" and score_table is not None:
                moap_with_table(fname,
                                score_table,
                                outdir,
                                method,
                                scoring,
                                ncpus=ncpus)
            else:
                moap_with_bg(fname,
                             genome,
                             outdir,
                             method,
                             scoring,
                             pfmfile=pfmfile,
                             ncpus=ncpus)

        except Exception as e:
            # The failure is logged and then re-raised, so a failing method
            # still aborts the run.
            logger.warning("Method %s with scoring %s failed", method, scoring)
            logger.warning(e)
            logger.warning("Skipping")
            raise
    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir,
                             "activity.{}.{}.out.txt".format(method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except FileNotFoundError:
            logger.warning("Activity file for %s not found!\n", t)

    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps)
        df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")
    # df_p = df_p.join(m2f)

    # Write motif frequency table

    if df.shape[1] == 1:
        # Per group in the single input column: summed motif counts divided
        # by the number of regions in that group.
        mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(outdir, os.path.join(outdir, "final.out.txt"),
                              pfmfile)
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))
Exemplo n.º 8
0
def calc_stats_iterator(
    fg_file=None,
    bg_file=None,
    fg_table=None,
    bg_table=None,
    motifs=None,
    stats=None,
    genome=None,
    zscore=True,
    gc=True,
    ncpus=None,
):
    """Calculate motif enrichment metrics.

    This is a generator: the motifs are processed in chunks and one result
    dictionary is yielded per chunk. Argument validation therefore only
    happens once iteration starts.

    Parameters
    ----------
    fg_file : str, optional
        Filename of a FASTA, BED or region file with positive sequences.

    bg_file : str, optional
        Filename of a FASTA, BED or region file with negative sequences.

    fg_table : str, optional
        Filename of a table with motif scan results of positive sequences.

    bg_table : str, optional
        Filename of a table with motif scan results of negative sequences.

    motifs : str, list or Motif instance, optional
        A file with motifs in pfm format, a list of Motif instances or a
        single Motif instance. If motifs is `None`, the default motif
        database is used.

    stats : list, optional
        Names of metrics to calculate. See gimmemotifs.rocmetrics.__all__
        for available metrics.

    genome : str, optional
        Genome or index directory in case of BED/regions.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins for the z-score.

    ncpus : int, optional
        Number of cores to use.

    Yields
    ------
    result : dict
        Dictionary with results where keys are motif ids and the values are
        dictionary with metric name and value pairs.

    Raises
    ------
    ValueError
        If, for foreground or background, neither or both of the file and
        table arguments are given.
    """
    if not stats:
        stats = rocmetrics.__all__

    if fg_table is None:
        if fg_file is None:
            raise ValueError("Need either fg_table or fg_file argument")
    elif fg_file is not None:
        raise ValueError("Need either fg_table or fg_file argument, not both")

    if bg_table is None:
        if bg_file is None:
            raise ValueError("Need either bg_table or bg_file argument")
    elif bg_file is not None:
        raise ValueError("Need either bg_table or bg_file argument, not both")

    # Pre-computed score tables that were actually supplied (zero, one or two).
    tables = [table for table in (fg_table, bg_table) if table is not None]

    if tables:
        # Score tables carry no match positions, so position-based metrics
        # cannot be calculated.
        remove_stats = [
            s for s in stats if getattr(rocmetrics, s).input_type == "pos"
        ]
        if remove_stats:
            logger.warning(
                "Cannot calculate stats that require position from table of motif scores."
            )
            logger.warning(
                "Skipping the following statistics: %s", ", ".join(remove_stats)
            )
            stats = [s for s in stats if s not in remove_stats]

    if isinstance(motifs, Motif):
        all_motifs = [motifs]
    elif isinstance(motifs, list):
        all_motifs = motifs
    else:
        motifs = pfmfile_location(motifs)
        all_motifs = read_motifs(motifs, fmt="pwm")

    if tables:
        # Keep only motifs present in every supplied table. Only the tables
        # that were given are read, so combining a table on one side with a
        # sequence file on the other works.
        shared_columns = None
        for table in tables:
            columns = pd.read_csv(
                table, sep="\t", index_col=0, nrows=1, comment="#"
            ).columns
            if shared_columns is None:
                shared_columns = columns
            else:
                shared_columns = shared_columns.intersection(columns)
        all_motifs = [m for m in all_motifs if m.id in shared_columns]

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    if fg_file is not None or bg_file is not None:
        if zscore or gc:
            # Precalculate mean and stddev for z-score calculation
            scanner = Scanner(ncpus=ncpus)
            scanner.set_motifs(all_motifs)
            scanner.set_genome(genome)
            scanner.set_meanstd(gc=gc)

    chunksize = 240
    n_chunks = (len(all_motifs) + chunksize - 1) // chunksize
    for i in range(0, len(all_motifs), chunksize):
        result = {}
        logger.debug("chunk %s of %s", i // chunksize + 1, n_chunks)
        chunk = all_motifs[i : i + chunksize]

        if fg_table is None:
            fg_total = scan_to_best_match(
                fg_file, chunk, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            fg_total = pd.read_csv(
                fg_table, sep="\t", usecols=[m.id for m in chunk], comment="#"
            ).to_dict(orient="list")
            # Pair every score with None as the position placeholder.
            for m in fg_total:
                fg_total[m] = [(x, None) for x in fg_total[m]]

        if bg_table is None:
            bg_total = scan_to_best_match(
                bg_file, chunk, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            bg_total = pd.read_csv(
                bg_table, sep="\t", usecols=[m.id for m in chunk], comment="#"
            ).to_dict(orient="list")
            for m in bg_total:
                bg_total[m] = [(x, None) for x in bg_total[m]]

        logger.debug("calculating statistics")

        if ncpus == 1:
            it = _single_stats(chunk, stats, fg_total, bg_total)
        else:
            it = _mp_stats(chunk, stats, fg_total, bg_total, ncpus)

        for motif_id, s, ret in it:
            if motif_id not in result:
                result[motif_id] = {}
            result[motif_id][s] = ret
        yield result
Exemplo n.º 9
0
 def __init__(self, genome, bed, pfmfile=None, ncore=1, verbose=True):
     """Store the scan settings and resolve the motif file location.

     Parameters
     ----------
     genome
         Stored as ``self.genome``.
     bed
         Putative enhancer regions in format chr:start-end (in column 0 with
         header).
     pfmfile : str, optional
         Handed to ``pfmfile_location``; the result is stored as
         ``self.pfm_file``.
     ncore : int, optional
         Stored as ``self.ncore``.
     verbose : bool, optional
         Stored as ``self.verbose``.
     """
     self.genome, self.bed = genome, bed
     self.pfm_file = pfmfile_location(pfmfile)
     self.ncore, self.verbose = ncore, verbose
Exemplo n.º 10
0
def run_maelstrom(
    infile,
    genome,
    outdir,
    pfmfile=None,
    filter_redundant=True,
    filter_cutoff=0.8,
    plot=True,
    cluster=False,
    score_table=None,
    count_table=None,
    methods=None,
    ncpus=None,
    zscore=True,
    gc=True,
    center=False,
    aggregation="int_stouffer",
):
    """Run maelstrom on an input table.

    All results are written to ``outdir``: a copy of the (optionally
    centered, de-duplicated) input table, the motif count and score tables,
    the optional non-redundant motif set with its factor mapping, the
    per-method activity files, the aggregated ``final.out.txt`` and, when
    requested, the HTML report.

    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    outdir : str
        Output directory for all results. Created (including missing parent
        directories) if it does not exist.

    pfmfile : str, optional
        Specify a PFM file for scanning.

    filter_redundant : bool, optional
        Create a non-redundant set of motifs based on correlation of motif
        scores in the input data.

    filter_cutoff : float, optional
        Cutoff to use for non-redundant motif selection. Default is 0.8.

    plot : bool, optional
        Create heatmaps.

    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not
        well-tested.

    score_table : str, optional
        Filename of pre-calculated table with motif scores.

    count_table : str, optional
        Filename of pre-calculated table with motif counts.

    methods : list, optional
        Activity methods to use. By default are all used.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool, optional
        Use GC% bins to normalize motif scores.

    center : bool, optional
        Mean-center the input table.

    aggregation: str, optional
        How to combine scores of the predictors. The default is "int_stouffer", for
        inverse normal transform followed by Stouffer's methods to combine z-scores.
        Alternatively, "stuart" performs rank aggregation and reports the -log10 of
        the rank aggregation p-value.

    Raises
    ------
    SystemExit
        If none of the requested methods can be run on the input.
    Exception
        Any error raised by an activity method is logged and re-raised.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")

    # Check if the input is mean-centered
    if df.shape[1] > 1 and not np.allclose(df.mean(1), 0):
        if center:
            logger.info(
                "Input is not mean-centered, setting the mean of all rows to 0."
            )
            logger.info(
                "Use --nocenter if you know what you're doing and want to change this behavior."
            )
            logger.info(
                "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to "
                "first transform your data, for instance using log2(), and to normalize "
                "between samples. To create a table suitable for maelstrom you can use the "
                "coverage_table script included with GimmeMotifs."
            )
            df = df.sub(df.mean(axis=1), axis=0)
        else:
            logger.info("Input is not mean-centered, but --nocenter was specified.")
            logger.info(
                "Leaving the data as-is, but make sure this is what your really want."
            )

    # Check for duplicates; every copy of a duplicated region is dropped.
    if df.index.duplicated(keep=False).any():
        logger.warning("Input file contains duplicate regions!")
        logger.warning("These will be removed.")
        df = df.iloc[~df.index.duplicated(keep=False)]

    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # handles nested output paths.
    os.makedirs(outdir, exist_ok=True)

    if methods is None:
        methods = Moap.list_predictors()
    methods = [m.lower() for m in methods]

    # From here on the cleaned copy in outdir is used as the input file.
    df.to_csv(os.path.join(outdir, "input.table.txt"), sep="\t")
    infile = os.path.join(outdir, "input.table.txt")

    # Copy the motif information
    pfmfile = pfmfile_location(pfmfile)
    if pfmfile:
        shutil.copy2(pfmfile, outdir)
        mapfile = re.sub(r"\.p[fw]m$", ".motif2factors.txt", pfmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)

    # Create a file with the number of motif matches
    if count_table is None:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("motif scanning (counts)")
            counts = scan_regionfile_to_table(
                infile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            counts.to_csv(count_table, sep="\t", compression="gzip")
        else:
            logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if score_table is None:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("motif scanning (scores)")
            scores = scan_regionfile_to_table(
                infile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
            scores.to_csv(
                score_table, sep="\t", float_format="%.3f", compression="gzip"
            )
        else:
            logger.info("Scores, using: %s", score_table)

    # Always reload from disk so freshly computed and pre-calculated tables
    # are treated identically.
    counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
    scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")

    if filter_redundant:
        logger.info("Selecting non-redundant motifs")
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric`; confirm against the pinned scikit-learn version.
        fa = FeatureAgglomeration(
            distance_threshold=filter_cutoff,
            n_clusters=None,
            affinity="correlation",
            linkage="complete",
            compute_full_tree=True,
        )
        fa.fit(scores)
        X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
        X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif")

        # Per cluster, keep the motif with the highest score variance.
        representatives = X_cluster.sort_values("var").drop_duplicates(
            subset=["label"], keep="last"
        )
        selected_motifs = representatives["motif"].values
        nr_motif = representatives[["label", "motif"]].set_index("label")

        # Map every motif to the representative of its cluster.
        X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label")
        motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif")

        scores = scores[selected_motifs]
        counts = counts[selected_motifs]
        score_table = os.path.join(outdir, "motif.nr.score.txt.gz")
        scores.to_csv(score_table, sep="\t", compression="gzip")
        count_table = os.path.join(outdir, "motif.nr.count.txt.gz")
        counts.to_csv(count_table, sep="\t", compression="gzip")

        # Read the copy placed in outdir above. Joining outdir with the full
        # original path only works when that path is absolute.
        m2f = pd.read_table(
            os.path.join(outdir, os.path.basename(mapfile)), comment="#"
        )
        m2f = m2f.join(motif_map, on="Motif")
        # Factors inherited from a merged (similar) motif are not curated
        # for the representative motif.
        m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
        m2f["Motif"] = m2f["motif_nr"]
        m2f = m2f.drop(columns=["motif_nr"])

        # Only the selected representatives belong in the non-redundant file.
        selected_ids = set(selected_motifs)
        motifs = [m for m in read_motifs(pfmfile) if m.id in selected_ids]
        pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")
        with open(pfmfile, "w") as f:
            for motif in motifs:
                f.write(f"{motif.to_pfm()}\n")

        mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
        with open(mapfile, "w") as f:
            f.write(
                "# Note: this mapping is specifically created for this non-redundant set of motifs.\n"
            )
            f.write(
                "# It also includes factors for motifs that were similar, but this can be\n"
            )
            f.write("# specific to this analysis.\n")
            m2f.to_csv(f, index=False, sep="\t")
        logger.info(f"Selected {len(selected_motifs)} motifs")
        logger.info(f"Motifs: {pfmfile}")
        logger.info(f"Factor mappings: {mapfile}")

    if cluster:
        # Clustering is only useful if at least one requested method is a
        # classification method.
        cluster = any(
            Moap.create(method, ncpus=ncpus).ptype == "classification"
            for method in methods
        )
        if not cluster:
            logger.info("Skipping clustering, no classification methods")

    # Each experiment is [method name, preferred table type, input file].
    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(
                outdir, os.path.basename(infile) + ".cluster.txt"
            )

            # Note: this scales df in place; the scaled values are also what
            # the aggregation and correlation steps below see.
            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            # Assign a region to a column when it exceeds every other column
            # by more than 0.5 (scaled units).
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5, "cluster"
                ] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count":
                moap_with_table(
                    fname, count_table, outdir, method, scoring, ncpus=ncpus
                )
            elif scoring == "score":
                moap_with_table(
                    fname, score_table, outdir, method, scoring, ncpus=ncpus
                )

        except Exception as e:
            logger.warning("Method %s with scoring %s failed", method, scoring)
            logger.warning(e)
            logger.warning("Skipping")
            # The failure is logged for context and then propagated.
            raise

    # Collect the activity tables written by the individual methods.
    dfs = {}
    for method, scoring, fname in exps:
        t = f"{method}.{scoring}"
        fname = os.path.join(outdir, f"activity.{method}.{scoring}.out.txt")
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except FileNotFoundError:
            logger.warning("Activity file for %s not found!\n", t)

    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)

        # Add percentage of input sequences with motif
        if df.shape[1] > 1:
            df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
        else:
            pct_with_motif = counts.join(df).groupby(df.columns[0]).mean() * 100
            pct_with_motif = pct_with_motif.T
            pct_with_motif = pct_with_motif.rename(
                columns={col: f"{col} % with motif" for col in pct_with_motif.columns}
            )
            df_p = df_p.join(pct_with_motif)

        if df.shape[1] > 1:
            # Add correlation between motif score and signal
            logger.info("Correlation")
            for col in df.columns:
                # Build the whole column at once: avoids per-cell .loc writes
                # and writing floats into an integer-initialised column.
                df_p[f"corr {col}"] = [
                    pearsonr(df[col], scores[motif])[0] for motif in df_p.index
                ]

        df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")

    # Write motif frequency table
    if df.shape[1] == 1:
        mcount = df.join(counts)
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    # The report is built from final.out.txt, which only exists when more
    # than one method was requested.
    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(outdir, os.path.join(outdir, "final.out.txt"), pfmfile)
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))