示例#1
0
 def test1_stats(self):
     """Run calc_stats with 1 and 2 CPUs and sanity-check both test motifs.

     Each iteration also requests only "roc_auc" and checks that the first
     motif then carries exactly one statistic.
     """
     weak_id = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
     strong_id = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

     for cpu_count in (1, 2):
         full = calc_stats(self.motifs, self.fg_fa, self.bg_fa, ncpus=cpu_count)

         # Every configured statistic must be reported for the first motif
         for stat_name in self.stat_functions:
             self.assertIn(stat_name, list(full.values())[0])

         # Two motifs
         self.assertEqual(2, len(full))

         # (assertion, motif, statistic, bound), evaluated in this order
         bounds = (
             (self.assertLess, weak_id, "roc_auc", 0.9),
             (self.assertGreater, strong_id, "roc_auc", 0.5),
             (self.assertEqual, weak_id, "recall_at_fdr", 0.0),
             (self.assertGreater, strong_id, "recall_at_fdr", 0.8),
             (self.assertGreater, weak_id, "ks_pvalue", 0.01),
             (self.assertLess, strong_id, "ks_pvalue", 0.001),
             (self.assertGreater, weak_id, "phyper_at_fpr", 0.1),
             (self.assertLess, strong_id, "phyper_at_fpr", 1e-16),
         )
         for check, motif_id, stat_name, limit in bounds:
             check(full[motif_id][stat_name], limit)

         # Only calculate specific statistic
         auc_only = calc_stats(self.motifs, self.fg_fa, self.bg_fa, stats=["roc_auc"])

         self.assertEqual(1, len(list(auc_only.values())[0]))
         self.assertLess(auc_only[weak_id]["roc_auc"], 0.9)
         self.assertGreater(auc_only[strong_id]["roc_auc"], 0.5)
示例#2
0
def get_roc_values(motif, fg_file, bg_file):
    """Calculate the ROC curve values of one motif for ROC plots.

    Parameters
    ----------
    motif :
        Motif to evaluate; passed to calc_stats as-is.

    fg_file : str
        Foreground input passed to calc_stats.

    bg_file : str
        Background input passed to calc_stats.

    Returns
    -------
    tuple
        ``(None, x, y)`` where ``(x, y)`` is the "roc_values" entry of the
        first motif in the calc_stats result. The first element is always
        None on success; failures are raised, not returned.

    Raises
    ------
    Exception
        Any error from calc_stats or from unpacking its result is re-raised
        after the motif and its id have been printed.
    """
    try:
        stats = calc_stats(motif,
                           fg_file,
                           bg_file,
                           stats=["roc_values"],
                           ncpus=1)
        (x, y) = list(stats.values())[0]["roc_values"]
        return None, x, y
    except Exception:
        # Identify the failing motif on stdout before propagating the error.
        print(motif)
        print(motif.id)
        raise
示例#3
0
def test1_stats(kwargs, stat_functions):
    """Check calc_stats output for both test motifs, using 1 and 2 CPUs.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded to calc_stats. "ncpus" is overridden per
        iteration on a copy, so the caller's dict is left untouched.

    stat_functions : iterable of str
        Names of statistics (attributes of ``rocmetrics``) expected in the
        result.
    """
    m1 = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
    m2 = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched
    table_input = "fg_table" in kwargs

    for ncpus in [1, 2]:
        stats = calc_stats(**dict(kwargs, ncpus=ncpus))

        for f in stat_functions:
            # With an fg_table, statistics whose input_type is "pos" are not
            # required to be present.
            if not table_input or getattr(rocmetrics, f).input_type != "pos":
                assert f in list(stats.values())[0]

        # Two motifs
        assert 2 == len(stats)

        assert stats[m1]["roc_auc"] < 0.9
        assert stats[m2]["roc_auc"] > 0.5

        assert stats[m1]["recall_at_fdr"] == 0.0
        assert stats[m2]["recall_at_fdr"] > 0.8

        # ks_pvalue is only checked when no fg_table is supplied
        if not table_input:
            assert stats[m1]["ks_pvalue"] > 0.01
            assert stats[m2]["ks_pvalue"] < 0.001

        assert stats[m1]["phyper_at_fpr"] > 0.1
        assert stats[m2]["phyper_at_fpr"] < 1e-13
示例#4
0
def test_one_statistic(kwargs):
    """Run calc_stats with the given kwargs and expect a single statistic.

    The first motif in the result must carry exactly one entry, and the
    roc_auc bounds must hold for both test motifs.
    """
    weak_id = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
    strong_id = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

    # Only calculate specific statistic
    result = calc_stats(**kwargs)

    first_entry = list(result.values())[0]
    assert 1 == len(first_entry)

    assert result[weak_id]["roc_auc"] < 0.9
    assert result[strong_id]["roc_auc"] > 0.5
示例#5
0
    def test2_stats_single_motif(self):
        """Compute roc_auc for a single motif object read from the motif file."""
        target_id = "p53_Average_8_CATGyCnGGrCATGy"

        with open(self.motifs) as handle:
            candidates = read_motifs(handle)

        # Select the motif whose string form equals the target id
        matching = [cand for cand in candidates if str(cand) == target_id]
        single = matching[0]

        result = calc_stats(single, self.fg_fa, self.bg_fa, stats=["roc_auc"])
        self.assertGreater(result[target_id]["roc_auc"], 0.9)
示例#6
0
 def test2_stats_single_motif(self):
     """Compute roc_auc for a single motif object read from the motif file."""
     target_id = "p53_Average_8_CATGyCnGGrCATGy"

     with open(self.motifs) as handle:
         candidates = read_motifs(handle)

     # Select the motif whose string form equals the target id
     matching = [cand for cand in candidates if str(cand) == target_id]
     single = matching[0]

     result = calc_stats(single, self.fg_fa, self.bg_fa, stats=["roc_auc"])
     self.assertGreater(result[target_id]["roc_auc"], 0.9)
示例#7
0
def test2_stats_single_motif(kwargs):
    """Run calc_stats with a single motif object instead of a motif source.

    ``kwargs["motifs"]`` is replaced in place by the selected motif before
    the call.
    """
    target_id = "p53_Average_8_CATGyCnGGrCATGy"

    candidates = read_motifs(kwargs["motifs"])
    matching = [cand for cand in candidates if str(cand) == target_id]
    kwargs["motifs"] = matching[0]

    result = calc_stats(**kwargs)
    assert result[target_id]["roc_auc"] > 0.9
示例#8
0
def mp_calc_stats(motifs, fg_fa, bg_fa, bg_name=None):
    """Calculate motif statistics on a single CPU, for use from a worker.

    Parameters
    ----------
    motifs :
        Motifs passed to calc_stats as-is.

    fg_fa : str
        Foreground input passed to calc_stats.

    bg_fa : str
        Background input passed to calc_stats.

    bg_name : str, optional
        Label for the background; "default" is used when it is falsy.

    Returns
    -------
    tuple
        ``(bg_name, stats)`` where ``stats`` is the calc_stats result.

    Raises
    ------
    Exception
        Errors from calc_stats propagate unchanged. The previous handler only
        re-raised, so its stderr message and empty-dict fallback were
        unreachable and have been removed.
    """
    stats = calc_stats(motifs, fg_fa, bg_fa, ncpus=1)
    return bg_name or "default", stats
示例#9
0
def mp_calc_stats(motifs, fg_fa, bg_fa, bg_name=None):
    """Calculate motif statistics on a single CPU, for use from a worker.

    Parameters
    ----------
    motifs :
        Motifs passed to calc_stats as-is.

    fg_fa : str
        Foreground input passed to calc_stats.

    bg_fa : str
        Background input passed to calc_stats.

    bg_name : str, optional
        Label for the background; "default" is used when it is falsy.

    Returns
    -------
    tuple
        ``(bg_name, stats)`` where ``stats`` is the calc_stats result.

    Raises
    ------
    Exception
        Errors from calc_stats propagate unchanged. The previous handler only
        re-raised, so its stderr message and empty-dict fallback were
        unreachable and have been removed.
    """
    stats = calc_stats(motifs, fg_fa, bg_fa, ncpus=1)
    return bg_name or "default", stats
示例#10
0
    def test1_stats(self):
        """Run calc_stats with 1 and 2 CPUs and sanity-check both test motifs.

        Each iteration also requests only "roc_auc" and checks that the first
        motif then carries exactly one statistic.
        """
        weak_id = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
        strong_id = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

        for cpu_count in (1, 2):
            full = calc_stats(
                self.motifs, self.fg_fa, self.bg_fa, ncpus=cpu_count
            )

            # Every configured statistic must be reported for the first motif
            for stat_name in self.stat_functions:
                self.assertIn(stat_name, list(full.values())[0])

            # Two motifs
            self.assertEqual(2, len(full))

            # (assertion, motif, statistic, bound), evaluated in this order
            bounds = (
                (self.assertLess, weak_id, "roc_auc", 0.9),
                (self.assertGreater, strong_id, "roc_auc", 0.5),
                (self.assertEqual, weak_id, "recall_at_fdr", 0.0),
                (self.assertGreater, strong_id, "recall_at_fdr", 0.8),
                (self.assertGreater, weak_id, "ks_pvalue", 0.01),
                (self.assertLess, strong_id, "ks_pvalue", 0.001),
                (self.assertGreater, weak_id, "phyper_at_fpr", 0.1),
                (self.assertLess, strong_id, "phyper_at_fpr", 1e-16),
            )
            for check, motif_id, stat_name, limit in bounds:
                check(full[motif_id][stat_name], limit)

            # Only calculate specific statistic
            auc_only = calc_stats(
                self.motifs, self.fg_fa, self.bg_fa, stats=["roc_auc"]
            )

            self.assertEqual(1, len(list(auc_only.values())[0]))
            self.assertLess(auc_only[weak_id]["roc_auc"], 0.9)
            self.assertGreater(auc_only[strong_id]["roc_auc"], 0.5)
示例#11
0
def create_denovo_motif_report(inputfile,
                               pfmfile,
                               fgfa,
                               background,
                               locfa,
                               outdir,
                               params,
                               stats=None):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile :
        Passed through to the text and graphical report writers.

    pfmfile : str
        Motif file; read with ``fmt="pwm"``.

    fgfa : str
        Foreground FASTA file.

    background : dict
        Maps a background name to its background file.

    locfa : str
        FASTA file used for the localization plots.

    outdir : str
        Output directory; histograms go to ``images/`` beneath it.

    params : dict or None
        Optional settings. "genome" and "cutoff_fpr" (default 0.9) are read.

    stats : dict, optional
        Precomputed statistics, keyed by motif and then by background name.
        Computed with calc_stats when None.
    """
    logger.info("creating de novo reports")

    # Normalise params before its first use. Previously "genome" was read
    # before this guard, so a None or empty params crashed even though the
    # guard existed.
    if not params:
        params = {}

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): genome is None when params has no "genome" entry; confirm
    # create_roc_plots accepts that.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa,
                                   motifs=motifs).items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir,
                               "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match,
                             outdir, stats)
示例#12
0
def get_roc_values(motif, fg_file, bg_file, genome):
    """Return ``(None, x, y)`` with the ROC curve values of a single motif.

    The values are the "roc_values" entry of the first motif in the
    calc_stats result. On any error the motif, its id and the error message
    are printed and the exception is re-raised.
    """
    try:
        result = calc_stats(
            fg_file=fg_file,
            bg_file=bg_file,
            motifs=motif,
            genome=genome,
            stats=["roc_values"],
            ncpus=1,
        )
        first_entry = list(result.values())[0]
        xs, ys = first_entry["roc_values"]
    except Exception as err:
        print(motif)
        print(motif.id)
        print(str(err))
        raise
    else:
        return None, xs, ys
示例#13
0
def mp_calc_stats(motifs, fg_fa, bg_fa, zscore, gc, genome, bg_name=None):
    """Calculate motif statistics on a single CPU, for use from a worker.

    Parameters
    ----------
    motifs, fg_fa, bg_fa, zscore, gc, genome :
        Forwarded to calc_stats as ``motifs``, ``fg_file``, ``bg_file``,
        ``zscore``, ``gc`` and ``genome``.

    bg_name : str, optional
        Label for the background; "default" is used when it is falsy.

    Returns
    -------
    tuple
        ``(bg_name, stats)`` where ``stats`` is the calc_stats result.

    Raises
    ------
    Exception
        Any calc_stats error is written to stderr and then re-raised.
    """
    try:
        stats = calc_stats(
            motifs=motifs,
            fg_file=fg_fa,
            bg_file=bg_fa,
            ncpus=1,
            zscore=zscore,
            gc=gc,
            genome=genome,
        )
    except Exception as e:
        # Log, then propagate. An empty-dict fallback here would be a dead
        # store because the exception always leaves the function.
        sys.stderr.write("ERROR: {}\n".format(str(e)))
        raise

    return bg_name or "default", stats
示例#14
0
def get_roc_values(motif, fg_file, bg_file):
    """Calculate the ROC curve values of one motif for ROC plots.

    Parameters
    ----------
    motif :
        Motif to evaluate; passed to calc_stats as-is.

    fg_file : str
        Foreground input passed to calc_stats.

    bg_file : str
        Background input passed to calc_stats.

    Returns
    -------
    tuple
        ``(None, x, y)`` where ``(x, y)`` is the "roc_values" entry of the
        first motif in the calc_stats result. The first element is always
        None on success; failures are raised, not returned.

    Raises
    ------
    Exception
        Any error from calc_stats or from unpacking its result is re-raised
        after the motif and its id have been printed.
    """
    try:
        stats = calc_stats(motif, fg_file, bg_file, stats=["roc_values"], ncpus=1)
        (x, y) = list(stats.values())[0]["roc_values"]
        return None, x, y
    except Exception:
        # Identify the failing motif on stdout before propagating the error.
        print(motif)
        print(motif.id)
        raise
示例#15
0
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Write the text and graphical (.html) reports for de novo motifs.

    ROC plots and per-motif localization histograms are produced first.
    When ``stats`` is None, statistics are computed with calc_stats for every
    background and stored as ``stats[motif][background_name]``.
    """
    logger.info("creating reports")

    motif_list = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    comparer = MotifComparer()
    closest_match = comparer.get_closest_match(motif_list)

    if stats is None:
        stats = {}
        for bg_name, bg_file in background.items():
            bg_stats = calc_stats(motif_list, fgfa, bg_file)
            for motif_key, motif_stats in bg_stats.items():
                stats.setdefault(motif_key, {})[bg_name] = motif_stats

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)
    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    median_width = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motif_list:
        logger.debug("  {} {}".format(motif.id, motif))
        histogram_name = "images/{}_histogram.svg".format(motif.id)
        histogram_path = os.path.join(outdir, histogram_name)
        motif_localization(locfa, motif, median_width, histogram_path, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motif_list, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pwmfile, background, closest_match, outdir, stats)
示例#16
0
def best_motif_in_cluster(
        single_pwm,
        clus_pwm,
        clusters,
        fg_fa,
        background,
        genome,
        stats=None,
        metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result: an iterable of ``(cluster_motif, singles)``
        pairs, where ``singles`` is a list. It is iterated twice and is not
        modified.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    genome : str
        Genome name.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dict is updated in place with the newly computed statistics and with
        a "num_cluster" entry for every selected motif.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances, sorted from highest to lowest rank.
    """
    # The signature advertises stats as optional; without this guard the
    # membership test below raised TypeError for the default None.
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = {str(m): m for m in read_motifs(single_pwm) + read_motifs(clus_pwm)}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(fg_file=fg_fa,
                               bg_file=bg_fa,
                               motifs=clustered_motifs,
                               genome=genome).items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # Work on a copy: appending to `singles` itself mutated the
            # caller's clusters and inflated num_cluster below by one.
            eval_motifs = list(singles)
            # NOTE(review): `motifs` is keyed by str(motif) while `clus` is
            # tested as-is; confirm whether str(clus) was intended.
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            # sorted(...)[-1] keeps the last of equally ranked motifs
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs
示例#17
0
def roc(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Parameters
    ----------
    args :
        Object with the attributes ``outfile``, ``pwmfile``, ``ids``,
        ``sample``, ``background``, ``genome`` and ``outdir``.

    Notes
    -----
    The metrics table is written to ``<outdir>/gimme.roc.report.txt`` when
    ``args.outdir`` is set (an HTML report is then created as well), and to
    stdout otherwise. stdout is left open so the caller can keep writing to
    it. When ``args.outfile`` is set, a ROC plot is written to it, with
    ".png" appended if missing.
    """
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    # Close the motif file once it has been parsed
    with open(args.pwmfile) as pwm_handle:
        motifs = read_motifs(pwm_handle, fmt="pwm")

    # Restrict to the requested ids; without a selection all motifs are kept
    if args.ids:
        ids = set(args.ids.split(","))
        motifs = [m for m in motifs if m.id in ids]

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "enr_at_fpr",
        "max_enrichment",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    motif_stats = calc_stats(motifs,
                             args.sample,
                             args.background,
                             genome=args.genome,
                             stats=stats)

    plot_x = []
    plot_y = []
    legend = []

    report_path = None
    if args.outdir:
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
        report_path = args.outdir + "/gimme.roc.report.txt"

    f_out = open(report_path, "w") if report_path else sys.stdout
    try:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )
        for motif in motifs:
            entry = motif_stats[str(motif)]
            if outputfile:
                x, y = entry["roc_values"]
                plot_x.append(x)
                plot_y.append(y)
                legend.append(motif.id)
            # A p-value of 0 has no finite log; report infinity instead
            log_pvalue = np.inf
            if entry["phyper_at_fpr"] > 0:
                log_pvalue = -np.log10(entry["phyper_at_fpr"])
            f_out.write(
                "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                    motif.id,
                    entry["matches_at_fpr"][0],
                    entry["matches_at_fpr"][1],
                    entry["phyper_at_fpr"],
                    log_pvalue,
                    entry["roc_auc"],
                    entry["enr_at_fpr"],
                    entry["recall_at_fdr"],
                ))
    finally:
        # Only close a file this function opened; never close sys.stdout.
        if f_out is not sys.stdout:
            f_out.close()

    if report_path:
        html_report(
            args.outdir,
            report_path,
            args.pwmfile,
            0.01,
        )

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=legend)
示例#18
0
def best_motif_in_cluster(single_pwm, clus_pwm, clusters, fg_fa, background, stats=None, metrics=("roc_auc", "recall_at_fdr")):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result: an iterable of ``(cluster_motif, singles)``
        pairs, where ``singles`` is a list. It is iterated twice and is not
        modified.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dict is updated in place with the newly computed statistics and with
        a "num_cluster" entry for every selected motif.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances, sorted from highest to lowest rank.
    """
    # The signature advertises stats as optional; without this guard the
    # membership test below raised TypeError for the default None.
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = {str(m): m for m in read_motifs(single_pwm) + read_motifs(clus_pwm)}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(clustered_motifs, fg_fa, bg_fa).items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # Work on a copy: appending to `singles` itself mutated the
            # caller's clusters and inflated num_cluster below by one.
            eval_motifs = list(singles)
            # NOTE(review): `motifs` is keyed by str(motif) while `clus` is
            # tested as-is; confirm whether str(clus) was intended.
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            # sorted(...)[-1] keeps the last of equally ranked motifs
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs