    def __str__(self):
        """return string representation."""
        return "%i\t%i\t%s\t%i\t%i\t%s\t%s\t%6.4e\t%6.4e\t%6.4e" % \
            (self.mSampleCountsCategory,
             self.mSampleCountsTotal,
             IOTools.prettyPercent(self.mSampleCountsCategory,
                                   self.mSampleCountsTotal),
             self.mBackgroundCountsCategory,
             self.mBackgroundCountsTotal,
             IOTools.prettyPercent(self.mBackgroundCountsCategory,
                                   self.mBackgroundCountsTotal),
             IOTools.val2str(self.mRatio),
             self.mPValue,
             self.mProbabilityOverRepresentation,
             self.mProbabilityUnderRepresentation)
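

# main() parses command-line options, reads the workspace and segment files,
# accumulates observed and sampled counts per workspace label, and then runs
# the requested analyses (proximity, area-under-curve) with optional plots.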
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--annotations-tsv-file",
                      dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a "
                      "two-column, tab-separated table) [default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector "
                      "[default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples", type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace-bed-file", dest="filename_workspace",
                      type="string",
                      help="filename with workspace information "
                      "[default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder",
                      type="choice",
                      choices=("gff", "gtf-intergenic", "gtf-intronic",
                               "gtf-genic"),
                      help="given a gff/gtf file, build a workspace "
                      "[default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels",
                      type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace "
                      "[default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the null "
                      "model of how segments are distributed in the "
                      "workspace [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice",
                      action="append",
                      choices=("transcription", "closest-distance",
                               "all-distances"),
                      help="counter to use. The counter computes the "
                      "quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="transformation to apply to counts "
                      "[default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments",
                      type="string",
                      help="filename with segment information "
                      "[default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a workspace "
                      "[default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs",
                      action="store_true",
                      help="remove segments extending beyond a workspace "
                      "[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous",
                      action="store_true",
                      help="keep segments extending to more than one "
                      "workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file if "
            "--workspace-labels=annotation.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(
            open(options.filename_annotations, "r"),
            dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        # choose the smallest integer dtype that can hold counts up to
        # nsegments
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int

        E.debug("chosen dtype %s" % str(dtype))

        E.info("sample space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize *
                (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution,
                        dtype=dtype))

        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

    E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0

    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1

        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end, left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end,
                        left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts
    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(("label", "observed", "pvalue",
                                           "expected", "CIlower", "CIupper",
                                           "qvalue", "segments",
                                           "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
    obs_pvalues.sort()

    # compute FDR
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:
        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0

        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label,
                             counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), "
               "simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    """yield blocks of consecutive bins in which a1 exceeds
                    a2 and the block total of a1 exceeds that of a3."""
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end,
                                   total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label,
                             start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution,
                             total_obs,
                             total_mean,
                             delta,
                             total_obs / total_mean,
                             100.0 * (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write(
                    "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                    (label,
                     start * options.resolution,
                     end * options.resolution,
                     (end - start) * options.resolution,
                     total_obs,
                     total_mean,
                     delta,
                     total_obs / total_mean,
                     100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:
                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                    (label,
                     st.observed * options.resolution,
                     st.pvalue,
                     st.expected * options.resolution,
                     st.ci95lower * options.resolution,
                     st.ci95upper * options.resolution,
                     IOTools.val2str(st.qvalue),
                     segments_per_label[label],
                     workspaces_per_label[label],
                     ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100))
        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(
            segments_per_workspace,
            bins=numpy.arange(0, max(segments_per_workspace), 1))
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
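

# Entry-point guard: a minimal sketch following the usual convention for
# scripts of this kind; it assumes no equivalent guard exists elsewhere in
# this module.
if __name__ == "__main__":
    sys.exit(main(sys.argv))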