Exemplo n.º 1
0
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
            help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
            help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
            help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
            help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                loci.join(gene.id, (gene.id, dbn))
            else:
                if (gene.id, dbn) not in loci:
                    loci.join((gene.id, dbn))
                    gene_cds = list(gffdbx[dbn].children(gene, \
                        featuretype='CDS', order_by=('start')))
                    gene_cds_start, gene_cds_stop = gene_cds[0].start, \
                        gene_cds[-1].stop
                    for odbn in odbns:
                        for ogene_cds in gffdbx[odbn].region(seqid=gene.seqid, \
                                start=gene_cds_start, end=gene_cds_stop, \
                                strand=gene.strand, featuretype='CDS'):
                            for ogene in gffdbx[odbn].parents(ogene_cds, featuretype='gene'):
                                loci.join((gene.id, dbn), (ogene.id, odbn))

    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        gene = "gene_{0:0{pad}}".format(i, pad=6) \
                if mode == "coords" else None

        for elem in locus:
            if type(elem) == tuple:
                _gene, dbn = elem
                if gene is None: gene = _gene

                g = gffdbx[dbn][_gene]
                if gene not in gfeats:
                    gfeats[gene] = g
                    gfeats[gene].attributes['ID'] = [gene]
                else:
                    if g.start < gfeats[gene].start:
                        gfeats[gene].start = g.start
                    if g.stop > gfeats[gene].stop:
                        gfeats[gene].stop = g.stop

                c = list(gffdbx[dbn].children(_gene, featuretype='mRNA', order_by='start'))
                if len(c) > 0:
                    mrna[gene][dbn] = c

    fw = must_open(opts.outfile, "w")
    print("##gff-version	3", file=fw)
    seen = {}
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print("\t".join(str(x) for x in summary), file=sfw)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print("\t".join(str(x) for x in clusters), file=cfw)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype='CDS'):
                        res = []
                        ftypes = ['exon'] if inferUTR else ['five_prime_UTR', 'three_prime_UTR']
                        for ftype in ftypes:
                            res.append(match_subfeats(mrna1, mrna2, dbx1, dbx2, featuretype=ftype, slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s), (dbn2, mrna2.id, mrna2s))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print(gfeats[gene], file=fw)

        for group in g:
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            dbid, _mrnaid = "|".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid: _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            _mrna.attributes['Parent'] = [gene]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            if opts.summary:
                summary = [mrnaid]
                summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
                print("\t".join(str(x) for x in summary), file=sfw)

            if opts.clusters:
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print("\t".join(str(x) for x in clusters), file=cfw)

    fw.close()
    if opts.summary: sfw.close()
    if opts.clusters: cfw.close()
Exemplo n.º 2
0
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    gene_coords = {}
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene,
                                          featuretype='mRNA',
                                          order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version	3"
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print >> sys.stderr, "\t".join(str(x) for x in summary)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    mrna1s, mrna2s = mrna1.stop - mrna1.start + 1, \
                            mrna2.stop - mrna2.start + 1
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2):
                        res = []
                        for ftype in ('five_prime_UTR', 'three_prime_UTR'):
                            res.append(
                                match_subfeats(mrna1,
                                               mrna2,
                                               dbx1,
                                               dbx2,
                                               featuretype=ftype,
                                               slop=slop))

                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s),
                                   (dbn2, mrna2.id, mrna2s))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        dbn = mrna[gene].keys()[0]
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print >> fw, _gene

        for group in g:
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            dbid, _mrnaid = "".join(str(x) for x in set(dbs)), []
            for x in mrnas:
                if x not in _mrnaid: _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print >> sys.stderr, "\t".join(str(x) for x in summary)

    fw.close()