Example #1
def runRegexMotifSearch(infiles, outfile):
    '''run a regular expression motif search on sequences
    and compute match counts.
    '''

    motif = "[AG]G[GT]T[CG]A"
    reverse_motif = "T[GC]A[CA]C[TC]"

    controlfile, dbfile = infiles
    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    motifs = []
    for x in range(0, 15):
        motifs.append(
            ("DR%i" % x, re.compile(motif + "." * x + motif, re.IGNORECASE)))
    for x in range(0, 15):
        motifs.append(("ER%i" % x,
                       re.compile(motif + "." * x + reverse_motif,
                                  re.IGNORECASE)))

    db_positions = Motifs.countMotifs(iotools.open_file(dbfile, "r"), motifs)
    control_positions = Motifs.countMotifs(iotools.open_file(controlfile, "r"),
                                           motifs)

    db_counts, control_counts = Motifs.getCounts(
        db_positions), Motifs.getCounts(control_positions)
    db_seqcounts, control_seqcounts = Motifs.getOccurances(
        db_positions), Motifs.getOccurances(control_positions)

    ndb, ncontrol = len(db_positions), len(control_positions)
    outf = iotools.open_file(outfile, "w")
    outf.write(
        "motif\tmotifs_db\tmotifs_control\tseq_db\tseq_db_percent\tseq_control\tseq_control_percent\tfold\n"
    )
    for motif, pattern in motifs:
        try:
            fold = float(db_seqcounts[motif]) * \
                ncontrol / (ndb * control_seqcounts[motif])
        except ZeroDivisionError:
            fold = 0

        outf.write(
            "%s\t%i\t%i\t%i\t%s\t%i\t%s\t%5.2f\n" %
            (motif, db_counts[motif], control_counts[motif],
             db_seqcounts[motif],
             iotools.pretty_percent(db_seqcounts[motif],
                                    ndb), control_seqcounts[motif],
             iotools.pretty_percent(control_seqcounts[motif], ncontrol), fold))
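
For reference, the "DR%i" patterns above are simply the half-site motif repeated with a variable-length spacer. A minimal standalone sketch (the test sequence is hypothetical) of what the DR3 pattern matches:

import re

motif = "[AG]G[GT]T[CG]A"
# DR3: two half-sites separated by any three bases, mirroring motif + "." * 3 + motif above
dr3 = re.compile(motif + "." * 3 + motif, re.IGNORECASE)

# hypothetical test sequence: AGGTCA, a three-base spacer, then AGGTCA again
print(bool(dr3.search("ttAGGTCAtgaAGGTCAcc")))  # True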
Example #2
    def __str__(self):

        return "\t".join(map(str, (
            self.mGenes1, self.mGenes2,
            self.mGenesOverlapping1, self.mGenesOverlapping2,
            self.mGenesUnique1, self.mGenesUnique2,
            self.mExons1, self.mExons2,
            self.mExonsOverlapping1, self.mExonsOverlapping2,
            self.mExonsUnique1, self.mExonsUnique2,
            self.mBases1, self.mBases2,
            self.mBasesOverlapping1, self.mBasesOverlapping2,
            self.mBasesUnique1, self.mBasesUnique2))) + "\t" +\
            "\t".join([iotools.pretty_percent(*x) for x in (
                        (self.mGenesOverlapping1, self.mGenes1),
                        (self.mGenesOverlapping2, self.mGenes2),
                        (self.mGenesUnique1, self.mGenes1),
                        (self.mGenesUnique2, self.mGenes2),
                        (self.mExonsOverlapping1, self.mExons1),
                        (self.mExonsOverlapping2, self.mExons2),
                        (self.mExonsUnique1, self.mExons1),
                        (self.mExonsUnique2, self.mExons2),
                        (self.mBasesOverlapping1, self.mBases1),
                        (self.mBasesOverlapping2, self.mBases2),
                        (self.mBasesUnique1, self.mBases1),
                        (self.mBasesUnique2, self.mBases2))])
Example #3
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter to separate columns [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=["row-describe", "column-describe"],
                      help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.methods:
        options.methods = ["summary"]

    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                options.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""), info))) + "\n")
        elif method == "column-describe":
            df = table.describe().T.stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")
        elif method == "row-describe":
            df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
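
For reference, a minimal sketch of the column-describe transform used above on a toy data frame (the column names are hypothetical; only pandas is assumed): describe().T.stack() yields one entry per (column, statistic) pair, which to_csv(sep="\t") then writes as label/category/value lines.

import pandas

table = pandas.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
# one (column, statistic) -> value entry per summary statistic
stacked = table.describe().T.stack()
print(stacked.loc[("a", "mean")])  # 2.0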
Example #4
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--species", dest="species", type="string",
        help="species to use [default=%default].")

    parser.add_option(
        "-i", "--slims", dest="filename_slims", type="string",
        help="filename with GO SLIM categories "
        "[default=%default].")

    parser.add_option(
        "-g", "--genes-tsv-file", dest="filename_genes", type="string",
        help="filename with genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-b", "--background-tsv-file", dest="filename_background",
        type="string",
        help="filename with background genes to analyse "
        "[default=%default].")

    parser.add_option(
        "-m", "--min-counts", dest="minimum_counts",
        type="int",
        help="minimum count - ignore all categories that have "
        "fewer than # number of genes"
        " [default=%default].")

    parser.add_option(
        "-o", "--sort-order", dest="sort_order", type="choice",
        choices=("fdr", "pvalue", "ratio"),
        help="output sort order [default=%default].")

    parser.add_option(
        "--ontology", dest="ontology", type="string",
        action="append",
        help="go ontologies to analyze. Ontologies are tested "
        "separately [default=%default].")

    parser.add_option(
        "-t", "--threshold", dest="threshold", type="float",
        help="significance threshold [>1.0 = all ]. If --fdr is set, this "
        "refers to the fdr, otherwise it is a cutoff for p-values.")

    parser.add_option(
        "--filename-dump", dest="filename_dump", type="string",
        help="dump GO category assignments into a flatfile "
        "[default=%default].")

    parser.add_option(
        "--gene2name-map-tsv-file", dest="filename_gene2name", type="string",
        help="optional filename mapping gene identifiers to gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-ontology", dest="filename_ontology", type="string",
        help="filename with ontology in OBO format [default=%default].")

    parser.add_option(
        "--filename-input", dest="filename_input", type="string",
        help="read GO category assignments from a flatfile "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample", type="int",
        help="do sampling (with # samples) [default=%default].")

    parser.add_option(
        "--filename-output-pattern", "--output-filename-pattern",
        dest="output_filename_pattern", type="string",
        help="pattern with output filename pattern "
        "(should contain: %(go)s and %(section)s ) [default=%default]")

    parser.add_option(
        "--fdr", dest="fdr", action="store_true",
        help="calculate and filter by FDR default=%default].")

    parser.add_option(
        "--go2goslim", dest="go2goslim", action="store_true",
        help="convert go assignments in STDIN to goslim assignments and "
        "write to STDOUT [default=%default].")

    parser.add_option(
        "--gene-pattern", dest="gene_pattern", type="string",
        help="pattern to transform identifiers to GO gene names "
        "[default=%default].")

    parser.add_option(
        "--filename-map-slims", dest="filename_map_slims", type="string",
        help="write mapping between GO categories and GOSlims "
        "[default=%default].")

    parser.add_option(
        "--get-genes", dest="get_genes", type="string",
        help="list all genes in the with a certain GOID [default=%default].")

    parser.add_option(
        "--strict", dest="strict", action="store_true",
        help="require all genes in foreground to be part of background. "
        "If not set, genes in foreground will be added to the background "
        "[default=%default].")

    parser.add_option(
        "-q", "--fdr-method", dest="qvalue_method", type="choice",
        choices=("empirical", "storey", "BH"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option(
        "--pairwise", dest="compute_pairwise", action="store_true",
        help="compute pairwise enrichment for multiple gene lists. "
        "[default=%default].")

    # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float",
    #                   help="fdr computation: lambda [default=%default]."  )

    # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
    #                    choices = ("smoother", "bootstrap" ),
    # help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.set_defaults(species=None,
                        filename_genes="-",
                        filename_background=None,
                        filename_slims=None,
                        minimum_counts=0,
                        ontology=[],
                        filename_dump=None,
                        sample=0,
                        fdr=False,
                        output_filename_pattern=None,
                        threshold=0.05,
                        filename_map_slims=None,
                        gene_pattern=None,
                        sort_order="ratio",
                        get_genes=None,
                        strict=False,
                        qvalue_method="empirical",
                        pairs_min_observed_counts=3,
                        compute_pairwise=False,
                        filename_gene2name=None
                        )

    (options, args) = E.start(parser, add_database_options=True)

    if options.go2goslim:
        GO.convertGo2Goslim(options)
        E.stop()
        sys.exit(0)

    if options.fdr and options.sample == 0:
        E.warn("fdr will be computed without sampling")

    #############################################################
    # dump GO
    if options.filename_dump:
        # set default ontologies
        if not options.ontology:
            options.ontology = [
                "biol_process", "mol_function", "cell_location"]

        E.info("dumping GO categories to %s" % (options.filename_dump))

        dbhandle = database.connect(url=options.database_url)

        outfile = iotools.open_file(options.filename_dump, "w", create_dir=True)
        GO.DumpGOFromDatabase(outfile,
                              dbhandle,
                              options)
        outfile.close()
        E.stop()
        sys.exit(0)

    #############################################################
    # read GO categories from file
    if options.filename_input:
        E.info("reading association of categories and genes from %s" %
               (options.filename_input))
        infile = iotools.open_file(options.filename_input)
        gene2gos, go2infos = GO.ReadGene2GOFromFile(infile)
        infile.close()

    if options.filename_gene2name:
        E.info("reading gene identifier to gene name mapping from %s" %
               options.filename_gene2name)
        infile = iotools.open_file(options.filename_gene2name)
        gene2name = iotools.read_map(infile, has_header=True)
        infile.close()
        E.info("read %i gene names for %i gene identifiers" %
               (len(set(gene2name.values())),
                len(gene2name)))
    else:
        # use identity mapping
        gene2name = dict([(x, x) for x in list(gene2gos.keys())])

    #############################################################
    # read GO ontology from file
    if options.filename_ontology:
        E.info("reading ontology from %s" % (options.filename_ontology))

        infile = iotools.open_file(options.filename_ontology)
        ontology = GO.readOntology(infile)
        infile.close()

        def _g():
            return collections.defaultdict(GO.GOInfo)
        go2infos = collections.defaultdict(_g)

        # substitute go2infos
        for go in list(ontology.values()):
            go2infos[go.mNameSpace][go.mId] = GO.GOInfo(
                go.mId,
                go_type=go.mNameSpace,
                description=go.mName)

    #############################################################
    # get foreground gene list
    input_foreground, genelists = GO.ReadGeneLists(
        options.filename_genes,
        gene_pattern=options.gene_pattern)

    E.info("read %i genes for forground in %i gene lists" %
           (len(input_foreground), len(genelists)))

    #############################################################
    # get background
    if options.filename_background:

        # nick - bug fix: background is the first tuple element from
        # ReadGeneLists
        input_background = GO.ReadGeneLists(
            options.filename_background,
            gene_pattern=options.gene_pattern)[0]
        E.info("read %i genes for background" % len(input_background))
    else:
        input_background = None

    #############################################################
    # sort out which ontologies to test
    if not options.ontology:
        if options.filename_input:
            options.ontology = list(gene2gos.keys())

    E.info("found %i ontologies: %s" %
           (len(options.ontology), options.ontology))

    summary = []
    summary.append("\t".join((
        "genelist",
        "ontology",
        "significant",
        "threshold",
        "ngenes",
        "ncategories",
        "nmaps",
        "nforegound",
        "nforeground_mapped",
        "nbackground",
        "nbackground_mapped",
        "nsample_counts",
        "nbackground_counts",
        "psample_assignments",
        "pbackground_assignments",
        "messages")) + "\n")

    #############################################################
    # get go categories for genes
    for test_ontology in sorted(options.ontology):

        # store results for aggregate output of multiple gene lists
        all_results = []
        all_significant_results = []
        all_genelists_with_results = []

        E.info("working on ontology %s" % test_ontology)
        #############################################################
        # get/read association of GO categories to genes
        if options.filename_input:
            gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology]
        else:
            E.info("reading data from database ...")

            dbhandle.Connect(options)
            gene2go, go2info = GO.ReadGene2GOFromDatabase(
                dbhandle,
                test_ontology,
                options.database, options.species)

            E.info("finished")

        if len(go2info) == 0:
            E.warn(
                "could not find information for terms - "
                "could be mismatch between ontologies")

        ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go)
        E.info("assignments found: %i genes mapped to %i categories "
               "(%i maps)" %
               (ngenes, ncategories, nmaps))

        if options.minimum_counts > 0:
            to_remove = set(
                [x for x, y in counts_per_category.items()
                 if y < options.minimum_counts])
            E.info("removing %i categories with less than %i genes" %
                   (len(to_remove), options.minimum_counts))
            GO.removeCategories(gene2go, to_remove)

            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)
            E.info("assignments after filtering: %i genes mapped "
                   "to %i categories (%i maps)" % (
                       ngenes, ncategories, nmaps))

        for genelist_name, foreground in sorted(genelists.items()):

            msgs = []
            E.info("processing %s with %i genes" %
                   (genelist_name, len(foreground)))
            ##################################################################
            ##################################################################
            ##################################################################
            # build background - reconcile with foreground
            ##################################################################
            if input_background is None:
                background = list(gene2go.keys())
            else:
                background = list(input_background)

            # nick - bug-fix: background included the foreground in a tuple.
            # background is the first tuple element
            missing = foreground.difference(set(background))

            if options.strict:
                assert len(missing) == 0, \
                    "%i genes in foreground but not in background: %s" % (
                        len(missing), str(missing))
            else:
                if len(missing) != 0:
                    E.warn("%i genes in foreground that are not in "
                           "background - added to background of %i" %
                           (len(missing), len(background)))

                background.extend(missing)

            E.info("(unfiltered) foreground=%i, background=%i" %
                   (len(foreground), len(background)))

            # sort foreground and background, important for reproducibility
            # under random seed
            foreground = sorted(foreground)
            background = sorted(background)

            #############################################################
            # sanity checks:
            # are all of the foreground genes in the dataset
            # missing = set(genes).difference( set(gene2go.keys()) )
            # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing))

            #############################################################
            # read GO slims and map GO categories to GO slim categories
            if options.filename_slims:
                go_slims = GO.GetGOSlims(
                    iotools.open_file(options.filename_slims, "r"))

                if options.loglevel >= 1:
                    v = set()
                    for x in list(go_slims.values()):
                        for xx in x:
                            v.add(xx)
                    options.stdlog.write(
                        "# read go slims from %s: go=%i, slim=%i\n" %
                        (options.filename_slims,
                         len(go_slims),
                         len(v)))

                if options.filename_map_slims:
                    if options.filename_map_slims == "-":
                        outfile = options.stdout
                    else:
                        outfile = iotools.open_file(
                            options.filename_map_slims, "w")

                    outfile.write("GO\tGOSlim\n")
                    for go, go_slim in sorted(list(go_slims.items())):
                        outfile.write("%s\t%s\n" % (go, go_slim))

                    if outfile != options.stdout:
                        outfile.close()

                gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology)

                if options.loglevel >= 1:
                    ngenes, ncategories, nmaps, counts_per_category = \
                        GO.CountGO(gene2go)
                    options.stdlog.write(
                        "# after go slim filtering: %i genes mapped to "
                        "%i categories (%i maps)\n" % (
                            ngenes, ncategories, nmaps))

            #############################################################
            # Just dump out the gene list
            if options.get_genes:
                fg, bg, ng = [], [], []

                for gene, vv in list(gene2go.items()):
                    for v in vv:
                        if v.mGOId == options.get_genes:
                            if gene in foreground:
                                fg.append(gene)
                            elif gene in background:
                                bg.append(gene)
                            else:
                                ng.append(gene)

                # skip to next GO class
                if not (bg or ng):
                    continue

                options.stdout.write(
                    "# genes in GO category %s\n" % options.get_genes)
                options.stdout.write("gene\tset\n")
                for x in sorted(fg):
                    options.stdout.write("%s\t%s\n" % (x, "fg"))
                for x in sorted(bg):
                    options.stdout.write("%s\t%s\n" % (x, "bg"))
                for x in sorted(ng):
                    options.stdout.write("%s\t%s\n" % (x, "ng"))

                E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng)))

                E.stop()
                sys.exit(0)

            #############################################################
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='foreground',
                                     set=genelist_name)

            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground))))
            if options.output_filename_pattern:
                outfile.close()

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='background',
                                     set=genelist_name)

            # Jethro bug fix - see section 'build background' for assignment
            outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background))))
            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            # do the analysis
            go_results = GO.AnalyseGO(gene2go, foreground, background)

            if len(go_results.mSampleGenes) == 0:
                E.warn("%s: no genes with GO categories - analysis aborted" %
                       genelist_name)
                continue

            pairs = list(go_results.mResults.items())

            #############################################################
            # calculate fdr for each hypothesis
            if options.fdr:
                fdrs, samples, method = GO.computeFDRs(go_results,
                                                       foreground,
                                                       background,
                                                       options,
                                                       test_ontology,
                                                       gene2go,
                                                       go2info)
                for x, v in enumerate(pairs):
                    v[1].mQValue = fdrs[v[0]][0]
            else:
                fdrs, samples, method = {}, {}, None

            msgs.append("fdr=%s" % method)

            if options.sort_order == "fdr":
                pairs.sort(key=lambda x: x[1].mQValue)
            elif options.sort_order == "ratio":
                pairs.sort(key=lambda x: x[1].mRatio)
            elif options.sort_order == "pvalue":
                pairs.sort(key=lambda x: x[1].mPValue)

            #############################################################
            #############################################################
            #############################################################
            # output the full result
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='overall',
                                     set=genelist_name)

            GO.outputResults(
                outfile, pairs, go2info, options, fdrs=fdrs, samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # filter significant results and output
            filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options)

            nselected = len(filtered_pairs)
            nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1])
            nselected_down = len(
                [x for x in filtered_pairs if x[1].mRatio < 1])

            assert nselected_up + nselected_down == nselected

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='results',
                                     set=genelist_name)

            GO.outputResults(outfile,
                             filtered_pairs,
                             go2info,
                             options,
                             fdrs=fdrs,
                             samples=samples)

            if options.output_filename_pattern:
                outfile.close()

            #############################################################
            #############################################################
            #############################################################
            # save results for multi-gene-list analysis
            all_results.append(pairs)
            all_significant_results.append(filtered_pairs)
            all_genelists_with_results.append(genelist_name)

            #############################################################
            #############################################################
            #############################################################
            # output parameters
            ngenes, ncategories, nmaps, counts_per_category = \
                GO.CountGO(gene2go)

            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='parameters',
                                     set=genelist_name)

            nbackground = len(background)
            if nbackground == 0:
                nbackground = len(go_results.mBackgroundGenes)

            outfile.write(
                "# input go mappings for gene list '%s' and category '%s'\n" %
                (genelist_name, test_ontology))
            outfile.write("parameter\tvalue\tdescription\n")
            outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes)
            outfile.write(
                "mapped_categories\t%i\tmapped categories\n" % ncategories)
            outfile.write("mappings\t%i\tmappings\n" % nmaps)
            outfile.write("genes_in_fg\t%i\tgenes in foreground\n" %
                          len(foreground))
            outfile.write(
                "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" %
                (len(go_results.mSampleGenes)))
            outfile.write(
                "genes_in_bg\t%i\tinput background\n" % nbackground)
            outfile.write(
                "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % (
                    len(go_results.mBackgroundGenes)))
            outfile.write(
                "associations_in_fg\t%i\tassociations in sample\n" %
                go_results.mSampleCountsTotal)
            outfile.write(
                "associations_in_bg\t%i\tassociations in background\n" %
                go_results.mBackgroundCountsTotal)
            outfile.write(
                "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % (
                    iotools.pretty_percent(len(go_results.mSampleGenes),
                                           len(foreground), "%5.2f")))
            outfile.write(
                "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" % (
                    iotools.pretty_percent(len(go_results.mBackgroundGenes),
                                           nbackground, "%5.2f")))
            outfile.write(
                "significant\t%i\tsignificant results reported\n" % nselected)
            outfile.write(
                "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up)
            outfile.write(
                "significant_down\t%i\tsignificant up-regulated results reported\n" % nselected_down)
            outfile.write(
                "threshold\t%6.4f\tsignificance threshold\n" % options.threshold)

            if options.output_filename_pattern:
                outfile.close()

            summary.append("\t".join(map(str, (
                genelist_name,
                test_ontology,
                nselected,
                options.threshold,
                ngenes,
                ncategories,
                nmaps,
                len(foreground),
                len(go_results.mSampleGenes),
                nbackground,
                len(go_results.mBackgroundGenes),
                go_results.mSampleCountsTotal,
                go_results.mBackgroundCountsTotal,
                iotools.pretty_percent(
                    len(go_results.mSampleGenes), len(foreground), "%5.2f"),
                iotools.pretty_percent(
                    len(go_results.mBackgroundGenes), nbackground, "%5.2f"),
                ",".join(msgs)))) + "\n")

            #############################################################
            #############################################################
            #############################################################
            # output the fg patterns
            outfile = GO.getFileName(options,
                                     go=test_ontology,
                                     section='withgenes',
                                     set=genelist_name)

            GO.outputResults(outfile, pairs, go2info, options,
                             fdrs=fdrs,
                             samples=samples,
                             gene2go=gene2go,
                             foreground=foreground,
                             gene2name=gene2name)

            if options.output_filename_pattern:
                outfile.close()

        if len(genelists) > 1:

            ###################################################################
            # output various summary files
            # significant results
            GO.outputMultipleGeneListResults(all_significant_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='significant')

            # all results
            GO.outputMultipleGeneListResults(all_results,
                                             all_genelists_with_results,
                                             test_ontology,
                                             go2info,
                                             options,
                                             section='all')

            if options.compute_pairwise:
                GO.pairwiseGOEnrichment(all_results,
                                        all_genelists_with_results,
                                        test_ontology,
                                        go2info,
                                        options)

    outfile_summary = options.stdout
    outfile_summary.write("".join(summary))

    E.stop()
Example #5
def _write(outs, text, numerator, denominator, base):
    percent = iotools.pretty_percent(numerator, denominator)
    outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))
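
A minimal sketch of calling the helper above, assuming iotools.pretty_percent(numerator, denominator) returns the percentage formatted as a string and that iotools comes from the cgatcore package (an assumption; the examples only show the iotools name):

import sys
from cgatcore import iotools  # assumed module path

# hypothetical call: writes a tab-separated line such as "mapped\t45\t45.00\ttotal"
_write(sys.stdout, "mapped", 45, 100, "total")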
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--output-mismatches", dest="output_mismatches", action="store_true",
                      help="output mismatches [%default]")

    parser.add_option("-a", "--output-matches", dest="output_matches", action="store_true",
                      help="output matches [%default]")

    parser.add_option("-u", "--output-unique", dest="output_unique", action="store_true",
                      help="output unique positions [%default]")

    parser.add_option("-r", "--restrict", dest="restrict", type="string",
                      help="restrict analysis to a chromosome pair (chr1:chr1:+) [%default]")

    parser.set_defaults(
        output_mismatches=False,
        output_matches=False,
        output_unique=False,
        restrict=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("expected two chain files")

    filename_chain1, filename_chain2 = args

    E.info("validating chain 1")
    if not validateChain(iotools.open_file(filename_chain1)):
        E.warn("validation failed - exiting")
        return 1

    E.info("validating chain 2")
    if not validateChain(iotools.open_file(filename_chain2)):
        E.warn("validation failed - exiting")
        return 1

    E.info("building pairs for %s" % filename_chain1)
    pairs1 = buildPairs(iotools.open_file(filename_chain1))
    E.info("read %i pairs" % len(pairs1))

    E.info("building pairs for %s" % filename_chain2)
    pairs2 = buildPairs(iotools.open_file(filename_chain2))
    E.info("read %i pairs" % len(pairs2))

    if options.restrict:
        restrict = tuple(options.restrict.split(":"))
        pairs1 = {restrict: pairs1[restrict]}
        pairs2 = {restrict: pairs2[restrict]}

    E.info("comparing 1 -> 2")
    comparison1 = compareChains(pairs1, pairs2)
    E.info("comparing 2 -> 1")
    comparison2 = compareChains(pairs2, pairs1)

    all_keys = sorted(list(set(list(comparison1.keys()) + list(comparison2.keys()))))

    outfile = options.stdout
    headers = ("mapped", "identical", "different", "unique")
    outfile.write("contig1\tcontig2\tstrand\t%s\t%s\t%s\t%s\n" %
                  (
                      "\t".join(["%s1" % x for x in headers]),
                      "\t".join(["p%s1" % x for x in headers]),
                      "\t".join(["%s2" % x for x in headers]),
                      "\t".join(["p%s2" % x for x in headers])))

    totals = E.Counter()

    for key in all_keys:
        outfile.write("%s\t%s\t%s" % key)

        if key in comparison1:
            c = comparison1[key]
            outfile.write("\t%i\t%i\t%i\t%i\t" %
                          (c.total, c.same, c.different, c.unique))
            outfile.write(
                "\t".join([iotools.pretty_percent(x, c.total) for x in c]))

            totals.total1 += c.total
            totals.same1 += c.same
            totals.different1 += c.different
            totals.unique1 += c.unique
        else:
            outfile.write("\t%i\t%i\t%i\t%i\t" % (0, 0, 0, 0))
            outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))

        if key in comparison2:
            c = comparison2[key]
            outfile.write("\t%i\t%i\t%i\t%i\t" %
                          (c.total, c.same, c.different, c.unique))
            outfile.write(
                "\t".join([iotools.pretty_percent(x, c.total) for x in c]))

            totals.same2 += c.same
            totals.total2 += c.total
            totals.different2 += c.different
            totals.unique2 += c.unique
        else:
            outfile.write("\t%i\t%i\t%i\t%i\t" % (0, 0, 0, 0))
            outfile.write("\t%i\t%i\t%i\t%i" % (0, 0, 0, 0))

        outfile.write("\n")

    outfile.write("total\ttotal\t.\t")
    outfile.write("\t".join(map(str, (totals.total1,
                                      totals.same1,
                                      totals.different1,
                                      totals.unique1,
                                      iotools.pretty_percent(
                                          totals.total1, totals.total1),
                                      iotools.pretty_percent(
                                          totals.same1, totals.total1),
                                      iotools.pretty_percent(
                                          totals.different1, totals.total1),
                                      iotools.pretty_percent(
                                          totals.unique1, totals.total1),
                                      totals.total2,
                                      totals.same2,
                                      totals.different2,
                                      totals.unique2,
                                      iotools.pretty_percent(
                                          totals.total2, totals.total2),
                                      iotools.pretty_percent(
                                          totals.same2, totals.total2),
                                      iotools.pretty_percent(
                                          totals.different2, totals.total2),
                                      iotools.pretty_percent(
                                          totals.unique2, totals.total2),
                                      ))) + "\n")

    # output mismapped residues
    if options.output_mismatches or options.output_unique:
        outputMismatches(pairs1, pairs2,
                         output_mismatches=options.output_mismatches,
                         output_unique=options.output_unique,
                         output_matches=options.output_matches,
                         )

    # write footer and output benchmark information.
    E.stop()