示例#1
0
    def __init__(self,
                 num_samples,
                 samples,
                 samples_outfile,
                 sampler,
                 workspace_generator,
                 counters,
                 outfiles,
                 num_threads=1):
        self.num_samples = num_samples
        self.samples = samples
        self.samples_outfile = samples_outfile
        self.sampler = sampler
        self.workspace_generator = workspace_generator
        self.counters = counters

        self.outfile_sample_stats = outfiles.get("sample_stats", None)
        self.outfile_sample_metrics = outfiles.get("sample_metrics", None)

        if self.outfile_sample_stats:
            E.debug("sample stats go to %s" % self.outfile_sample_stats)
            self.outfile_sample_stats.write(
                "sample\tisochore\tnsegments\tnnucleotides\tmean\t"
                "std\tmin\tq1\tmedian\tq3\tmax\n")

        self.last_sample_id = None
        self.all_lengths = []
        self.num_threads = num_threads
示例#2
0
    def computeSamples(self, work, report_interval=100):
        '''compute samples according to work.

        returns a list of results.
        '''
        n = len(work)

        E.debug('sampling will work on %i items' % n)

        results = []

        if self.num_threads == 0:
            for i, w in enumerate(work):
                r = computeSample(
                    (w, self.samples_outfile, self.outfile_sample_metrics,
                     None))
                if i % report_interval == 0:
                    E.info("%i/%i done (%5.2f)" % (i, n, 100.0 * i / n))
                results.append(r)
        else:
            E.info("generating processpool with %i threads for %i items" %
                   (self.num_threads, len(work)))

            manager = multiprocessing.Manager()

            lock = manager.Lock()

            pool = multiprocessing.Pool(self.num_threads)

            # use file names - not files when multiprocessing
            samples_outfile, metrics_outfile = None, None
            if self.samples_outfile:
                samples_outfile = self.samples_outfile.name
                self.samples_outfile.flush()
            if self.outfile_sample_metrics:
                metrics_outfile = self.outfile_sample_metrics.name
                self.outfile_sample_metrics.flush()

            ww = [(w, samples_outfile, metrics_outfile, lock) for w in work]

            for i, r in enumerate(pool.imap_unordered(computeSample, ww)):
                if i % report_interval == 0:
                    E.info("%i/%i done (%5.2f)" % (i, n, 100.0 * i / n))
                results.append(r)

            pool.close()
            pool.join()

        return results
示例#3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = GatSegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    if options.output_tables_pattern != None:
        if "%s" not in options.output_tables_pattern:
            raise ValueError(
                "output_tables_pattern should contain at least one '%s'")

    if options.output_samples_pattern != None:
        if "%s" not in options.output_samples_pattern:
            raise ValueError(
                "output_samples_pattern should contain at least one '%s'")

    if options.output_counts_pattern != None:
        if "%s" not in options.output_counts_pattern:
            raise ValueError(
                "output_counts_pattern should contain at least one '%s'")

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = GatEngine.fromCounts(options.input_filename_counts)

    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)

    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        GatEngine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     GatEngine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
示例#4
0
def run(segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator,
        **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samles to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    ##################################################
    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    ##################################################
    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    ##################################################
    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex
        regex = re.compile(re.sub("%s", "(\S+)", output_samples_pattern))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            filename = re.sub("%s", track, output_samples_pattern)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        if workspace_generator.is_conditional:
            outer_sampler = ConditionalSampler(num_samples,
                                               samples,
                                               samples_outfile,
                                               sampler,
                                               workspace_generator,
                                               counters,
                                               outfiles,
                                               num_threads=num_threads)
        else:
            outer_sampler = UnconditionalSampler(num_samples,
                                                 samples,
                                                 samples_outfile,
                                                 sampler,
                                                 workspace_generator,
                                                 counters,
                                                 outfiles,
                                                 num_threads=num_threads)

        counts_per_track = outer_sampler.sample(
            track, counts, counters, segs, annotations, workspace, outfiles)

        # skip empty tracks
        if counts_per_track is None:
            continue

        if samples_outfile:
            samples_outfile.close()

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    counter_id = 0
    for counter, observed_count in zip(counters, observed_counts):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))
        counter_id += 1

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = re.sub("%s", name, output_counts_pattern)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            outfile.write("track\tannotation\tobserved\tcounts\n")

            for o in output:
                outfile.write("%s\t%s\t%i\t%s\n" %
                              (o.track, o.annotation,
                               o.observed,
                               ",".join(["%i" % x for x in o.samples])))

    return annotator_results
示例#5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue", "observed"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                      " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern", type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        ##################################################
        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # Test is if relative fold change rfc is different from 1
            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
            #                       = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the magnitude
            # of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation, data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x
            
            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                        index1, index2))
                
            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:

                    # if not annotation.startswith("Ram:"): continue

                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    # note that fold changes can be very large if there are 0 samples
                    # this is fine for getting the distributional params (mean,
                    # stddev)
                    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
                    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

                    # add a separate fc pseudo-count to avoid 0 values
                    fold_changes1 += 0.0001
                    fold_changes2 += 0.0001

                    # Test is if relative fold change rfc is different from 1
                    # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
                    #                       = obs1 / obs2 * exp2 / exp1
                    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
                    #
                    # Convert to log space for easier plotting
                    # Move the observed fold ratio in order to get an idea of the magnitude
                    # of the underlying fold change
                    delta_fold = data2.fold - data1.fold
                    sampled_delta_fold = numpy.log(
                        fold_changes1 / fold_changes2) + delta_fold
                    observed_delta_fold = 0.0 + delta_fold

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)

                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
示例#6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--order",
        dest="output_order",
        type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue",
                 "observed"),
        help="order results in output by fold, track, etc. [default=%default]."
    )

    parser.add_option("-p",
                      "--pvalue-method",
                      dest="pvalue_method",
                      type="choice",
                      choices=(
                          "empirical",
                          "norm",
                      ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q",
        "--qvalue-method",
        dest="qvalue_method",
        type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help=
        "method to perform multiple testing correction by controlling the fdr [default=%default]."
    )

    parser.add_option("--qvalue-lambda",
                      dest="qvalue_lambda",
                      type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method",
        dest="qvalue_pi0_method",
        type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions",
        dest="input_filename_descriptions",
        type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--pseudo-count",
        dest="pseudo_count",
        type="float",
        help=
        "pseudo count. The pseudo count is added to both the observed and expected overlap. "
        " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."
    )

    parser.add_option("--output-plots-pattern",
                      dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        ##################################################
        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # Test is if relative fold change rfc is different from 1
            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
            #                       = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the magnitude
            # of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:

                    # if not annotation.startswith("Ram:"): continue

                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    # note that fold changes can be very large if there are 0 samples
                    # this is fine for getting the distributional params (mean,
                    # stddev)
                    fold_changes1 = data1.observed / (data1.samples +
                                                      pseudo_count)
                    fold_changes2 = data2.observed / (data2.samples +
                                                      pseudo_count)

                    # add a separate fc pseudo-count to avoid 0 values
                    fold_changes1 += 0.0001
                    fold_changes2 += 0.0001

                    # Test is if relative fold change rfc is different from 1
                    # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
                    #                       = obs1 / obs2 * exp2 / exp1
                    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
                    #
                    # Convert to log space for easier plotting
                    # Move the observed fold ratio in order to get an idea of the magnitude
                    # of the underlying fold change
                    delta_fold = data2.fold - data1.fold
                    sampled_delta_fold = numpy.log(
                        fold_changes1 / fold_changes2) + delta_fold
                    observed_delta_fold = 0.0 + delta_fold

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)

                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
示例#7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = SegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2**(8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    if options.output_tables_pattern is not None:
        if "%s" not in options.output_tables_pattern:
            raise ValueError(
                "output_tables_pattern should contain at least one '%s'")

    if options.output_samples_pattern is not None:
        if "%s" not in options.output_samples_pattern:
            raise ValueError(
                "output_samples_pattern should contain at least one '%s'")

    if options.output_counts_pattern is not None:
        if "%s" not in options.output_counts_pattern:
            raise ValueError(
                "output_counts_pattern should contain at least one '%s'")

    if options.random_seed is not None:
        # initialize python random number generator
        random.seed(options.random_seed)
        # initialize numpy random number generator
        numpy.random.seed(options.random_seed)

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = Engine.fromCounts(options.input_filename_counts)

    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)

    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        Engine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results, options,
                     Engine.AnnotatorResultExtended.headers,
                     description_header, description_width, descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()