Пример #1
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue", "observed"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                      " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern", type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        ##################################################
        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # Test is if relative fold change rfc is different from 1
            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
            #                       = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the magnitude
            # of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation, data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            if len(aa.keys()) != 1 or len(bb.keys()) != 1:
                raise NotImplementedError("multiple segments of interest")

            track = "merged"
            # get shared annotations
            annotations1 = aa[track].keys()
            annotations2 = bb[track].keys()
            shared_annotations = list(
                set(annotations1).intersection(set(annotations2)))
            E.info("%i shared annotations" % len(shared_annotations))

            for annotation in shared_annotations:

                # if not annotation.startswith("Ram:"): continue

                data1 = aa[track][annotation]
                data2 = bb[track][annotation]

                # note that fold changes can be very large if there are 0 samples
                # this is fine for getting the distributional params (mean,
                # stddev)
                fold_changes1 = data1.observed / (data1.samples + pseudo_count)
                fold_changes2 = data2.observed / (data2.samples + pseudo_count)

                # add a separate fc pseudo-count to avoid 0 values
                fold_changes1 += 0.0001
                fold_changes2 += 0.0001

                # Test is if relative fold change rfc is different from 1
                # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
                #                       = obs1 / obs2 * exp2 / exp1
                # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
                #
                # Convert to log space for easier plotting
                # Move the observed fold ratio in order to get an idea of the magnitude
                # of the underlying fold change
                delta_fold = data2.fold - data1.fold
                sampled_delta_fold = numpy.log(
                    fold_changes1 / fold_changes2) + delta_fold
                observed_delta_fold = 0.0 + delta_fold

                result = GatEngine.AnnotatorResult(track, annotation,
                                                   "na",
                                                   observed_delta_fold,
                                                   sampled_delta_fold,
                                                   reference=None,
                                                   pseudo_count=0)

                results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
Пример #2
0
def buildSegments(options):
    '''load segments, annotations and workspace from parameters
    defined in *options*.

    The workspace will be split by isochores.

    returns segments, annotations and workspace.
    '''

    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments",
                               options.segment_files,
                               ignore_tracks=options.ignore_segment_tracks)
    segments.normalize()

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError("too many (%i) segment files - use track definitions "
                         "or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations",
        options.annotation_files,
        enable_split_tracks=options.enable_split_tracks,
        ignore_tracks=options.annotations_label is not None)

    if options.annotations_label is not None:
        annotations.setName(options.annotations_label)

    if options.annotations_to_points:
        annotations.toPositions(options.annotations_to_points)

    if options.overlapping_annotations:
        # only sort, do not merge
        annotations.sort()
    else:
        annotations.normalize()

    workspaces = readSegmentList("workspaces", options.workspace_files,
                                 options, options.enable_split_tracks)
    workspaces.normalize()

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = Engine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores
Пример #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--order",
        dest="output_order",
        type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue",
                 "observed"),
        help="order results in output by fold, track, etc. [default=%default]."
    )

    parser.add_option("-p",
                      "--pvalue-method",
                      dest="pvalue_method",
                      type="choice",
                      choices=(
                          "empirical",
                          "norm",
                      ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q",
        "--qvalue-method",
        dest="qvalue_method",
        type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help=
        "method to perform multiple testing correction by controlling the fdr [default=%default]."
    )

    parser.add_option("--qvalue-lambda",
                      dest="qvalue_lambda",
                      type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method",
        dest="qvalue_pi0_method",
        type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions",
        dest="input_filename_descriptions",
        type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--pseudo-count",
        dest="pseudo_count",
        type="float",
        help=
        "pseudo count. The pseudo count is added to both the observed and expected overlap. "
        " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."
    )

    parser.add_option("--output-plots-pattern",
                      dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        ##################################################
        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # Test is if relative fold change rfc is different from 1
            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
            #                       = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the magnitude
            # of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:

                    # if not annotation.startswith("Ram:"): continue

                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    # note that fold changes can be very large if there are 0 samples
                    # this is fine for getting the distributional params (mean,
                    # stddev)
                    fold_changes1 = data1.observed / (data1.samples +
                                                      pseudo_count)
                    fold_changes2 = data2.observed / (data2.samples +
                                                      pseudo_count)

                    # add a separate fc pseudo-count to avoid 0 values
                    fold_changes1 += 0.0001
                    fold_changes2 += 0.0001

                    # Test is if relative fold change rfc is different from 1
                    # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
                    #                       = obs1 / obs2 * exp2 / exp1
                    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
                    #
                    # Convert to log space for easier plotting
                    # Move the observed fold ratio in order to get an idea of the magnitude
                    # of the underlying fold change
                    delta_fold = data2.fold - data1.fold
                    sampled_delta_fold = numpy.log(
                        fold_changes1 / fold_changes2) + delta_fold
                    observed_delta_fold = 0.0 + delta_fold

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)

                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
Пример #4
0
def buildSegments(options):
    '''load segments, annotations and workspace from parameters
    defined in *options*.

    The workspace will be split by isochores.

    returns segments, annotations and workspace.
    '''

    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments", options.segment_files, options)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations", options.annotation_files, options, options.enable_split_tracks)
    workspaces = readSegmentList(
        "workspaces", options.workspace_files, options, options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = GatEngine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores