def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    # Developer notes (kept out of the docstring on purpose: the docstring is
    # handed to OptionParser below and is therefore the command-line usage
    # text shown to users).
    #
    # args: list of command-line tokens (options followed by >= 2 GFF paths).
    # Returns None.  Writes the consolidated GFF3 to --outfile and, when
    # requested, "<outfile minus extension>.summary.txt" / ".clusters.txt".
    # Prints the help text and exits when fewer than two inputs are given.
    import os.path
    from itertools import combinations, product

    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
                 help="allow minor variation in terminal 5'/3' UTR"
                      " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
                 help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
                 help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
                 help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
                 help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    # One indexed database per input file, keyed by the file's unique prefix.
    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    # Step 1: group genes from all databases into loci.
    loci = Grouper()
    for dbn, db in gffdbx.items():
        odbns = [odbn for odbn in gffdbx if odbn != dbn]
        for gene in db.features_of_type('gene', order_by=('seqid', 'start')):
            if mode == "name":
                # The bare gene id acts as the shared join key, so the group
                # ends up holding that string plus one tuple per database.
                loci.join(gene.id, (gene.id, dbn))
                continue

            key = (gene.id, dbn)
            if key in loci:
                # Already pulled into a locus via an overlap found earlier.
                continue
            loci.join(key)

            gene_cds = list(db.children(gene, featuretype='CDS',
                                        order_by='start'))
            if not gene_cds:
                # No CDS means no span to test for overlap; the gene stays a
                # singleton locus instead of raising IndexError below.
                continue
            gene_cds_start = gene_cds[0].start
            gene_cds_stop = gene_cds[-1].stop

            for odbn in odbns:
                odb = gffdbx[odbn]
                for ogene_cds in odb.region(seqid=gene.seqid,
                                            start=gene_cds_start,
                                            end=gene_cds_stop,
                                            strand=gene.strand,
                                            featuretype='CDS'):
                    for ogene in odb.parents(ogene_cds, featuretype='gene'):
                        loci.join(key, (ogene.id, odbn))

    # Step 2: per locus, build one representative gene feature spanning all
    # member genes and collect each database's mRNA children.
    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        # coords mode mints a synthetic id; name mode reuses the gene's own.
        gene = "gene_{0:0{pad}}".format(i, pad=6) if mode == "coords" else None

        for elem in locus:
            if not isinstance(elem, tuple):
                # Skip the bare-string join key added in name mode.
                continue
            _gene, dbn = elem
            if gene is None:
                gene = _gene

            feat = gffdbx[dbn][_gene]
            if gene not in gfeats:
                gfeats[gene] = feat
                gfeats[gene].attributes['ID'] = [gene]
            else:
                # Widen the representative gene to cover every member.
                if feat.start < gfeats[gene].start:
                    gfeats[gene].start = feat.start
                if feat.stop > gfeats[gene].stop:
                    gfeats[gene].stop = feat.stop

            transcripts = list(gffdbx[dbn].children(_gene, featuretype='mRNA',
                                                    order_by='start'))
            if transcripts:
                mrna[gene][dbn] = transcripts

    # Step 3: cluster isoforms per locus and write the results.
    # splitext strips only the final extension and never cuts inside a
    # directory component (rsplit(".")[0] truncated at the *first* dot).
    outprefix = os.path.splitext(opts.outfile)[0]

    fw = must_open(opts.outfile, "w")
    sfw = cfw = None
    try:
        print("##gff-version 3", file=fw)
        seen = {}

        if opts.summary:
            summaryfile = "{0}.summary.txt".format(outprefix)
            sfw = must_open(summaryfile, "w")
            summary = ["id"] + list(gffdbx)
            print("\t".join(str(x) for x in summary), file=sfw)

        if opts.clusters:
            clustersfile = "{0}.clusters.txt".format(outprefix)
            cfw = must_open(clustersfile, "w")
            clusters = ["id", "dbns", "members", "trlens"]
            print("\t".join(str(x) for x in clusters), file=cfw)

        for gene in mrna:
            isoforms = Grouper()
            db_pairs = list(combinations(mrna[gene], 2))
            if db_pairs:
                for dbn1, dbn2 in db_pairs:
                    dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                    for mrna1, mrna2 in product(mrna[gene][dbn1],
                                                mrna[gene][dbn2]):
                        mrna1s = mrna1.stop - mrna1.start + 1
                        mrna2s = mrna2.stop - mrna2.start + 1
                        # Register both so unmatched isoforms still appear.
                        isoforms.join((dbn1, mrna1.id, mrna1s))
                        isoforms.join((dbn2, mrna2.id, mrna2s))

                        if not match_subfeats(mrna1, mrna2, dbx1, dbx2,
                                              featuretype='CDS'):
                            continue

                        if inferUTR:
                            ftypes = ['exon']
                        else:
                            ftypes = ['five_prime_UTR', 'three_prime_UTR']
                        res = [match_subfeats(mrna1, mrna2, dbx1, dbx2,
                                              featuretype=ftype, slop=slop)
                               for ftype in ftypes]
                        if all(res):
                            isoforms.join((dbn1, mrna1.id, mrna1s),
                                          (dbn2, mrna2.id, mrna2s))
            else:
                # Locus seen in a single database: every mRNA is its own group.
                for dbn1 in mrna[gene]:
                    for mrna1 in mrna[gene][dbn1]:
                        isoforms.join((dbn1, mrna1.id,
                                       mrna1.stop - mrna1.start + 1))

            print(gfeats[gene], file=fw)

            for group in isoforms:
                # Longest transcript first; it becomes the representative.
                group.sort(key=lambda x: x[2], reverse=True)
                dbs = [el[0] for el in group]
                mrnas = [el[1] for el in group]
                d, m = dbs[0], mrnas[0]

                # Sort the database names so emitted IDs are reproducible
                # (bare set iteration order changes with hash randomisation).
                dbset = set(dbs)
                dbid = "|".join(sorted(str(x) for x in dbset))

                _mrnaid = []
                for x in mrnas:
                    if x not in _mrnaid:
                        _mrnaid.append(x)
                mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))

                # Disambiguate repeated IDs with a running numeric suffix.
                if mrnaid not in seen:
                    seen[mrnaid] = 0
                else:
                    seen[mrnaid] += 1
                    mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

                _mrna = gffdbx[d][m]
                _mrna.attributes['ID'] = [mrnaid]
                _mrna.attributes['Parent'] = [gene]
                children = gffdbx[d].children(m, order_by='start')
                print(_mrna, file=fw)
                for child in children:
                    child.attributes['ID'] = ["{0}|{1}".format(dbid, child.id)]
                    child.attributes['Parent'] = [mrnaid]
                    print(child, file=fw)

                if sfw is not None:
                    summary = [mrnaid]
                    summary.extend('Y' if db in dbset else 'N'
                                   for db in gffdbx)
                    print("\t".join(str(x) for x in summary), file=sfw)

                if cfw is not None:
                    clusters = [mrnaid]
                    clusters.append(",".join(str(el[0]) for el in group))
                    clusters.append(",".join(str(el[1]) for el in group))
                    clusters.append(",".join(str(el[2]) for el in group))
                    print("\t".join(str(x) for x in clusters), file=cfw)
    finally:
        # Close whatever was opened, even if consolidation failed part-way.
        for handle in (fw, sfw, cfw):
            if handle is not None:
                handle.close()
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    # Developer notes (kept out of the docstring on purpose: the docstring is
    # handed to OptionParser below and is therefore the command-line usage
    # text shown to users).
    #
    # args: list of command-line tokens (options followed by >= 2 GFF paths).
    # Returns None.  Writes the consolidated GFF3 to --outfile and a
    # tab-separated presence/absence summary to stderr.  Prints the help text
    # and exits when fewer than two inputs are given.
    from itertools import combinations, product

    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
                 help="allow minor variation in terminal 5'/3' UTR"
                      " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}        # database name -> indexed GFF database
    gene_coords = {}   # gene id -> every start/stop seen across databases
    mrna = AutoVivification()  # gene id -> database name -> list of mRNAs
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            gene_coords.setdefault(gene.id, []).extend([gene.start, gene.stop])

            transcripts = list(gffdbx[dbn].children(gene, featuretype='mRNA',
                                                    order_by='start'))
            if transcripts:
                mrna[gene.id][dbn] = transcripts

    fw = must_open(opts.outfile, "w")
    try:
        # print() with file= replaces the Python 2 "print >> fh, x" statement,
        # which raises TypeError once print is a function.
        print("##gff-version 3", file=fw)
        summary = ["id"] + list(gffdbx)
        print("\t".join(str(x) for x in summary), file=sys.stderr)

        for gene in mrna:
            isoforms = Grouper()
            db_pairs = list(combinations(mrna[gene], 2))
            if db_pairs:
                for dbn1, dbn2 in db_pairs:
                    dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                    for mrna1, mrna2 in product(mrna[gene][dbn1],
                                                mrna[gene][dbn2]):
                        mrna1s = mrna1.stop - mrna1.start + 1
                        mrna2s = mrna2.stop - mrna2.start + 1
                        # Register both so unmatched isoforms still appear.
                        isoforms.join((dbn1, mrna1.id, mrna1s))
                        isoforms.join((dbn2, mrna2.id, mrna2s))

                        if not match_subfeats(mrna1, mrna2, dbx1, dbx2):
                            continue

                        res = [match_subfeats(mrna1, mrna2, dbx1, dbx2,
                                              featuretype=ftype, slop=slop)
                               for ftype in ('five_prime_UTR',
                                             'three_prime_UTR')]
                        if all(res):
                            isoforms.join((dbn1, mrna1.id, mrna1s),
                                          (dbn2, mrna2.id, mrna2s))
            else:
                # Gene seen in a single database: every mRNA is its own group.
                for dbn1 in mrna[gene]:
                    for mrna1 in mrna[gene][dbn1]:
                        isoforms.join((dbn1, mrna1.id,
                                       mrna1.stop - mrna1.start + 1))

            # Use the gene record from the first database that has mRNAs for
            # it, stretched to the outermost coordinates seen anywhere.
            # next(iter(...)) works where .keys()[0] fails on a dict view.
            dbn = next(iter(mrna[gene]))
            _gene = gffdbx[dbn][gene]
            _gene.start = min(gene_coords[gene])
            _gene.stop = max(gene_coords[gene])
            print(_gene, file=fw)

            for group in isoforms:
                # Longest transcript first; it becomes the representative.
                group.sort(key=lambda x: x[2], reverse=True)
                dbs = [el[0] for el in group]
                mrnas = [el[1] for el in group]
                d, m = dbs[0], mrnas[0]

                # Sort the database names so emitted IDs are reproducible
                # (bare set iteration order changes with hash randomisation).
                dbset = set(dbs)
                dbid = "".join(sorted(str(x) for x in dbset))

                _mrnaid = []
                for x in mrnas:
                    if x not in _mrnaid:
                        _mrnaid.append(x)
                mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

                _mrna = gffdbx[d][m]
                _mrna.attributes['ID'] = [mrnaid]
                children = gffdbx[d].children(m, order_by='start')
                print(_mrna, file=fw)
                for child in children:
                    child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                    child.attributes['Parent'] = [mrnaid]
                    print(child, file=fw)

                summary = [mrnaid]
                summary.extend('Y' if db in dbset else 'N' for db in gffdbx)
                print("\t".join(str(x) for x in summary), file=sys.stderr)
    finally:
        # Close the output even if consolidation failed part-way.
        fw.close()