示例#1
0
def gimme_motifs(
    inputfile,
    outdir,
    params=None,
    filter_significant=True,
    cluster=True,
    create_report=True,
):
    """De novo motif prediction based on an ensemble of different tools.

    The function prepares the input, creates background files, predicts
    motifs, optionally filters them for significance, optionally clusters
    them, writes the final motifs to ``<outdir>/gimme.denovo.pfm`` and the
    parameters to ``<outdir>/params.txt``. A log file
    (``<outdir>/gimmemotifs.log``) is written as well. The
    ``<outdir>/intermediate`` directory is removed at the end unless
    ``params["keep_intermediate"]`` is set.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory name based on
        today's date is used. Missing directories are created.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. An empty list is returned when no motifs
        were predicted, or when filtering is enabled and none of the motifs
        is significant.

    Notes
    -----
    The process is terminated with ``sys.exit(1)`` when the input type is
    not one of "bed", "narrowpeak" or "fasta".

    Examples
    --------

    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(
            datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories. makedirs (instead of mkdir) also creates
    # missing parent directories, so a nested outdir works.
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.makedirs(d)

    # Log to file
    # NOTE(review): this handler is never removed from the "gimme" logger, so
    # every call adds another file handler. Consider removing/closing it when
    # the function finishes if it is called repeatedly in one process.
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    # From here on, log via the child logger; records propagate to "gimme".
    logger = logging.getLogger("gimme.denovo")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    params["size"] = int(params["size"])
    if params["size"] > 0:
        logger.info(
            "using size of {}, set size to 0 to use original region size".
            format(params["size"]))
    else:
        logger.info("using original size")

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        logger.info("preparing input from BED")
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        logger.info("preparing input from narrowPeak")
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        logger.info("preparing input from FASTA")
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("unknown input file format!")
        sys.exit(1)

    # "genome" is optional; read it once and use the same value everywhere.
    genome = params.get("genome", None)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        genome,
        params["size"],
        params.get("custom_background", None),
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, "validation.fa"),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics. The "{}" placeholder is passed on as-is;
    # presumably write_stats fills it in per background -- confirm.
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # Background with the lowest BG_RANK value is used for filtering.
    bg = sorted(background, key=lambda x: BG_RANK[x])[0]
    if filter_significant:
        motifs = filter_significant_motifs(
            os.path.join(tmpdir, "significant_motifs.pfm"), result, bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list (not None), consistent with the
            # documented return type and the "no motifs" exit above.
            return []

        pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pfmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clustered_pfm = os.path.join(tmpdir, "clustered_motifs.pfm")
        clusters = cluster_motifs_with_report(
            pfmfile,
            clustered_pfm,
            outdir,
            0.95,
            title=inputfile,
        )

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
            pfmfile,
            clustered_pfm,
            clusters,
            os.path.join(tmpdir, "validation.fa"),
            background,
            genome,
            result.stats,
        )

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs,
                               key=lambda x: rank[str(x)],
                               reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    final_pfm = os.path.join(outdir, "gimme.denovo.pfm")
    with open(final_pfm, "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Map every background name to its FASTA file in the intermediate dir.
        bg_files = {
            b: os.path.join(tmpdir, "bg.{}.fa".format(b))
            for b in background
        }

        create_denovo_motif_report(
            inputfile,
            final_pfm,
            os.path.join(tmpdir, "validation.fa"),
            bg_files,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not (params.get("keep_intermediate")):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    if cluster:
        logger.info("de novo report: %s",
                    os.path.join(outdir, "gimme.denovo.html"))

    return final_motifs
示例#2
0
def gimme_motifs(inputfile, outdir, params=None, filter_significant=True, cluster=True, create_report=True):
    """De novo motif prediction based on an ensemble of different tools.

    The function prepares the input, creates background files, predicts
    motifs, optionally filters them for significance, optionally clusters
    them, writes the final motifs to ``<outdir>/motifs.pwm`` and the
    parameters to ``<outdir>/params.txt``. A log file
    (``<outdir>/gimmemotifs.log``) is written as well. The
    ``<outdir>/intermediate`` directory is removed at the end unless
    ``params["keep_intermediate"]`` is set.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory name based on
        today's date is used. Missing directories are created.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. An empty list is returned when no motifs
        were predicted, or when filtering is enabled and none of the motifs
        is significant.

    Notes
    -----
    The process is terminated with ``sys.exit(1)`` when the input type is
    not one of "bed", "narrowpeak" or "fasta".

    Examples
    --------

    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories. makedirs (instead of mkdir) also creates
    # missing parent directories, so a nested outdir works.
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.makedirs(d)

    # Log to file
    # NOTE(review): this handler is never removed from the "gimme" logger, so
    # every call adds another file handler. Consider removing/closing it when
    # the function finishes if it is called repeatedly in one process.
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    # From here on, log via the child logger; records propagate to "gimme".
    logger = logging.getLogger("gimme.denovo.gimme_motifs")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("Unknown input file.")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        params.get("genome", None),
        params["width"],
        params.get("custom_background", None),
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, 'validation.fa'),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics. The "{}" placeholder is passed on as-is;
    # presumably write_stats fills it in per background -- confirm.
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # Background with the lowest BG_RANK value is used for filtering.
    bg = sorted(background, key=lambda x: BG_RANK[x])[0]
    if filter_significant:
        motifs = filter_significant_motifs(
            os.path.join(tmpdir, "significant_motifs.pfm"),
            result,
            bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list (not None), consistent with the
            # documented return type and the "no motifs" exit above.
            return []

        pwmfile = os.path.join(tmpdir, "significant_motifs.pfm")
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pwmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clustered_pfm = os.path.join(tmpdir, "clustered_motifs.pfm")
        clusters = cluster_motifs_with_report(
            pwmfile,
            clustered_pfm,
            outdir,
            0.95,
            title=inputfile)

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
            pwmfile,
            clustered_pfm,
            clusters,
            os.path.join(tmpdir, 'validation.fa'),
            background,
            result.stats)

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    final_pwm = os.path.join(outdir, "motifs.pwm")
    with open(final_pwm, "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Map every background name to its FASTA file in the intermediate dir.
        bg_files = {b: os.path.join(tmpdir, "bg.{}.fa".format(b)) for b in background}

        create_denovo_motif_report(
            inputfile,
            final_pwm,
            os.path.join(tmpdir, "validation.fa"),
            bg_files,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not(params.get("keep_intermediate")):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    if cluster:
        logger.info("report: %s", os.path.join(outdir, "motif_report.html"))

    return final_motifs
示例#3
0
def best_motif_in_cluster(
        single_pwm,
        clus_pwm,
        clusters,
        fg_fa,
        background,
        genome,
        stats=None,
        metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result. Iterated as ``(cluster_motif, members)``
        pairs, where ``members`` is a list of motifs. The input is not
        modified.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names, keyed by background name.

    genome : str
        Genome name.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dictionary is updated in place: newly computed statistics are added
        and ``"num_cluster"`` is set, per background, for the best motif of
        every cluster.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances, best-ranked motif first.
    """
    # Without this guard the documented default (stats=None) fails on the
    # membership test below; start from an empty dict so that statistics
    # for all motifs are computed.
    if stats is None:
        stats = {}

    # combine original and clustered motifs, keyed by their string form
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = {str(m): m for m in motifs}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        bg_stats = calc_stats(fg_file=fg_fa,
                              bg_file=bg_fa,
                              motifs=clustered_motifs,
                              genome=genome)
        for m, s in bg_stats.items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # Work on a copy: appending to `singles` itself would modify the
            # caller's clustering result and inflate the member count that
            # is stored as "num_cluster" below.
            candidates = list(singles)
            # NOTE(review): `motifs` is keyed by str(motif) while `clus` is
            # tested as-is; the check is kept unchanged -- confirm whether
            # `str(clus)` was intended.
            if clus not in motifs:
                candidates.append(clus)
            candidates = [motifs[str(e)] for e in candidates]
            # Last element after an ascending sort is the highest-ranked one.
            best_motif = sorted(candidates, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        # Record the number of member motifs of this cluster.
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs
示例#4
0
def best_motif_in_cluster(single_pwm, clus_pwm, clusters, fg_fa, background, stats=None, metrics=("roc_auc", "recall_at_fdr")):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result. Iterated as ``(cluster_motif, members)``
        pairs, where ``members`` is a list of motifs. The input is not
        modified.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names, keyed by background name.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dictionary is updated in place: newly computed statistics are added
        and ``"num_cluster"`` is set, per background, for the best motif of
        every cluster.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances, best-ranked motif first.
    """
    # Without this guard the documented default (stats=None) fails on the
    # membership test below; start from an empty dict so that statistics
    # for all motifs are computed.
    if stats is None:
        stats = {}

    # combine original and clustered motifs, keyed by their string form
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = {str(m): m for m in motifs}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        bg_stats = calc_stats(clustered_motifs, fg_fa, bg_fa)
        for m, s in bg_stats.items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # Work on a copy: appending to `singles` itself would modify the
            # caller's clustering result and inflate the member count that
            # is stored as "num_cluster" below.
            candidates = list(singles)
            # NOTE(review): `motifs` is keyed by str(motif) while `clus` is
            # tested as-is; the check is kept unchanged -- confirm whether
            # `str(clus)` was intended.
            if clus not in motifs:
                candidates.append(clus)
            candidates = [motifs[str(e)] for e in candidates]
            # Last element after an ascending sort is the highest-ranked one.
            best_motif = sorted(candidates, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        # Record the number of member motifs of this cluster.
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs