def nmd(args):
    """
    %prog nmd gffile

    Identify transcript variants which might be candidates for nonsense
    mediated decay (NMD)

    A transcript is considered to be a candidate for NMD when the CDS stop
    codon is located more than 50nt upstream of terminal splice site donor

    References:
    http://www.nature.com/horizon/rna/highlights/figures/s2_spec1_f3.html
    http://www.biomedcentral.com/1741-7007/7/23/figure/F1
    """
    # The docstring above doubles as the command-line usage text.
    #
    # Output: one tab-separated row per mRNA with
    #   gene id, mRNA id, total CDS bp, distance, NMD flag (True/False).
    from jcvi.utils.cbook import enumerate_reversed

    p = OptionParser(nmd.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    fw = must_open(opts.outfile, "w")
    for gene in gff.features_of_type("gene", order_by=("seqid", "start")):
        # Exons are sorted by start coordinate: scan forward on the minus
        # strand, otherwise use enumerate_reversed (presumably a reverse-order
        # enumerate that keeps the original indices -- confirm in cbook).
        # The builtin `enumerate` replaces the Python 2-only `__builtin__`.
        _enumerate = enumerate if gene.strand == "-" else enumerate_reversed
        for mrna in gff.children(gene, featuretype="mRNA", order_by="start"):
            exons = list(gff.children(mrna, featuretype="exon", order_by="start"))

            # Find the first exon (in scan order) that fully contains a CDS
            # belonging to this mRNA.
            tcds, tcds_pos = None, None
            for i, exon in _enumerate(exons):
                for cds in gff.region(
                    region=exon, featuretype="CDS", completely_within=True
                ):
                    if mrna.id in cds["Parent"]:
                        tcds, tcds_pos = cds, i
                        break
                # Must test against None: index 0 is a valid hit, and a plain
                # truthiness test would keep scanning past it.
                if tcds_pos is not None:
                    break

            NMD, distance = False, 0
            # Transcripts without any CDS (tcds_pos is None) cannot be NMD
            # candidates; they are reported with distance 0 instead of crashing.
            if tcds_pos is not None and (
                (mrna.strand == "+" and tcds_pos + 1 < len(exons))
                or (mrna.strand == "-" and tcds_pos - 1 >= 0)
            ):
                texon = exons[tcds_pos]

                # Strand-aware far edge of the CDS and of its enclosing exon.
                PTC = tcds.end if mrna.strand == "+" else tcds.start
                TDSS = texon.end if mrna.strand == "+" else texon.start
                distance = abs(TDSS - PTC)
                NMD = distance > 50

            row = (
                gene.id,
                mrna.id,
                gff.children_bp(mrna, child_featuretype="CDS"),
                distance,
                NMD,
            )
            print("\t".join(str(x) for x in row), file=fw)

    fw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    g = make_index(gffile)  # indexed view, used to look up CDS extents
    gff = Gff(gffile)  # sequential reader over the same file
    mRNA_register = {}  # mRNA id -> (start, end) of its CDS span
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            # Relies on the parent mRNA having been read earlier in the file;
            # an unregistered parent raises KeyError.
            start, end = mRNA_register[c.parent]
            trim(c, start, end)

        if c.start > c.end:
            # The feature has inverted coordinates after trimming; report it
            # on stderr instead of writing it out.
            print(
                cid,
                "destroyed [{0} > {1}]".format(c.start, c.end),
                file=sys.stderr,
            )
        else:
            print(c, file=fw)

    # The original never closed the output handle.
    fw.close()
def stats(args):
    """
    %prog stats infile.gff

    Collect gene statistics based on gff file. There are some terminology issues
    here and so normally we call "gene" are actually mRNA, and sometimes "exon"
    are actually CDS, but they are configurable.

    The numbers are written to text file in four separate folders,
    corresponding to the four metrics:

    Exon length, Intron length, Gene length, Exon count

    With data written to disk then you can run %prog histogram
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(stats.__doc__)
    p.add_option("--gene", default="mRNA", help="The gene type [default: %default]")
    p.add_option("--exon", default="CDS", help="The exon type [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gff_file,) = args
    g = make_index(gff_file)
    exon_lengths = []
    intron_lengths = []
    gene_lengths = []
    exon_counts = []
    for feat in g.features_of_type(opts.gene):
        # Direct children (level 1) of the configured "exon" type.
        exons = [
            (c.chrom, c.start, c.stop)
            for c in g.children(feat.id, 1)
            if c.featuretype == opts.exon
        ]
        introns = range_interleave(exons)
        feat_exon_lengths = [(stop - start + 1) for (chrom, start, stop) in exons]
        feat_intron_lengths = [
            (stop - start + 1) for (chrom, start, stop) in introns
        ]
        exon_lengths += feat_exon_lengths
        intron_lengths += feat_intron_lengths
        # "Gene length" here is the summed exon length, not the genomic span.
        gene_lengths.append(sum(feat_exon_lengths))
        exon_counts.append(len(feat_exon_lengths))

    a = SummaryStats(exon_lengths)
    b = SummaryStats(intron_lengths)
    c = SummaryStats(gene_lengths)
    d = SummaryStats(exon_counts)
    # `metrics` is a module-level sequence; zip pairs the first four titles
    # with the four summaries and the title is reused as the directory name.
    for x, title in zip((a, b, c, d), metrics):
        x.title = title
        print(x, file=sys.stderr)

    # Use the file's base name only: the raw path would put directory parts
    # (or an empty string for "./x.gff") into the output file name.
    prefix = op.basename(gff_file).split(".")[0]
    for x in (a, b, c, d):
        dirname = x.title
        mkdir(dirname)
        txtfile = op.join(dirname, prefix + ".txt")
        x.tofile(txtfile)
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({"chr": f.chrom, "start": f.start, "stop": f.stop})
        geneseqs.append(fseq)

        # A set collapses exons shared by several transcripts of the gene
        # (level-2 children), so each distinct exon is counted once.
        exons = set(
            (c.chrom, c.start, c.stop)
            for c in g.children(fid, 2)
            if c.featuretype == "exon"
        )
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({"chr": chrom, "start": start, "stop": stop})
            exonseqs.append(fseq)

        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({"chr": chrom, "start": start, "stop": stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (feature class, metric name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"), (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(
            tsummary.sum, precision=0, target="Mb"
        )
        r[t, "% of genome"] = percentage(
            tsummary.sum, s.totalsize, precision=0, mode=-1
        )
        r[t, "% GC"] = gc(tseqs)

    # Print function instead of the Python 2 `print >>` statement.
    print(tabulate(r), file=sys.stderr)
def batcheval(args):
    """
    %prog batcheval model.ids gff_file evidences.bed fastafile

    Get the accuracy for a list of models against evidences in the range of the
    genes. For example:

    $ %prog batcheval isoforms.ids all.gff3 proteins.bed scaffolds.fasta

    Scores for the models are written to a `.scores` file named after the
    model.ids file (e.g. isoforms.scores)
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.bed import evaluate
    from jcvi.formats.gff import make_index

    # Show this command's own usage; the original passed evaluate.__doc__,
    # which describes a different command with a different argument count.
    p = OptionParser(batcheval.__doc__)
    p.add_option(
        "--type",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    model_ids, gff_file, evidences_bed, fastafile = args
    ftypes = set(opts.type.split(","))

    g = make_index(gff_file)
    prefix = model_ids.rsplit(".", 1)[0]
    # One scratch BED file, rewritten for every model.
    cidbed = prefix + ".bed"

    with open(model_ids) as fp, open(prefix + ".scores", "w") as fwscores:
        for row in fp:
            cid = row.strip()
            if not cid:
                # Tolerate blank lines in the id list.
                continue

            # The level-1 parent gives the query window for the evaluation.
            b = next(g.parents(cid, 1))
            query = "{0}:{1}-{2}".format(b.chrom, b.start, b.stop)

            with open(cidbed, "w") as fw:
                for c in g.children(cid, 1):
                    if c.featuretype in ftypes:
                        fw.write(c.to_bed())

            b = evaluate(
                [cidbed, evidences_bed, fastafile, "--query={0}".format(query)]
            )
            print("\t".join((cid, b.score)), file=fwscores)
            # Flush per model so partial results survive an interrupted run.
            fwscores.flush()
def batcheval(args):
    """
    %prog batcheval model.ids gff_file evidences.bed fastafile

    Get the accuracy for a list of models against evidences in the range of the
    genes. For example:

    $ %prog batcheval isoforms.ids all.gff3 proteins.bed scaffolds.fasta

    Scores for the models are written to a `.scores` file named after the
    model.ids file (e.g. isoforms.scores)
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.bed import evaluate
    from jcvi.formats.gff import make_index

    # Show this command's own usage; the original passed evaluate.__doc__,
    # which describes a different command with a different argument count.
    p = OptionParser(batcheval.__doc__)
    p.add_option(
        "--type",
        default="CDS",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    model_ids, gff_file, evidences_bed, fastafile = args
    ftypes = set(opts.type.split(","))

    g = make_index(gff_file)
    prefix = model_ids.rsplit(".", 1)[0]
    # One scratch BED file, rewritten for every model.
    cidbed = prefix + ".bed"

    with open(model_ids) as fp, open(prefix + ".scores", "w") as fwscores:
        for row in fp:
            cid = row.strip()
            if not cid:
                # Tolerate blank lines in the id list.
                continue

            # next() works on Python 2.6+ and 3; the `.next()` method used
            # originally exists on Python 2 iterators only.
            b = next(g.parents(cid, 1))
            query = "{0}:{1}-{2}".format(b.chrom, b.start, b.stop)

            with open(cidbed, "w") as fw:
                for c in g.children(cid, 1):
                    if c.featuretype in ftypes:
                        fw.write(c.to_bed())

            b = evaluate(
                [cidbed, evidences_bed, fastafile, "--query={0}".format(query)]
            )
            print("\t".join((cid, b.score)), file=fwscores)
            # Flush per model so partial results survive an interrupted run.
            fwscores.flush()
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring above doubles as the command-line usage text.
    parser = OptionParser(summary.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    gff_file, ref = args
    fasta = Fasta(ref)
    db = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull the sequence for one (chrom, start, stop) interval.
        return fasta.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are collected per feature class; their lengths and base
    # composition feed the report below.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in db.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # Distinct exon intervals across all level-2 children of the gene.
        distinct = {
            (child.chrom, child.start, child.stop)
            for child in db.children(gene.id, 2)
            if child.featuretype == "exon"
        }
        exons = list(distinct)
        exonseqs.extend(fetch(chrom, start, stop) for chrom, start, stop in exons)

        # Introns are derived from the exon intervals of the same gene.
        introns = range_interleave(exons)
        intronseqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in introns
        )

    # Report table keyed by (feature class, metric name).
    report = {}
    labelled = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for label, seqs in labelled:
        lengths = [len(seq) for seq in seqs]
        st = SummaryStats(lengths, dtype="int")
        report[label, "Number"] = st.size
        report[label, "Average size (bp)"] = st.mean
        report[label, "Median size (bp)"] = st.median
        report[label, "Total length (Mb)"] = human_size(
            st.sum, precision=0, target="Mb"
        )
        report[label, "% of genome"] = percentage(
            st.sum, fasta.totalsize, precision=0, mode=-1
        )
        report[label, "% GC"] = gc(seqs)

    print(tabulate(report), file=sys.stderr)
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gff_file,) = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Cache of "<mRNA id> <summed exon bp> <group>" rows, rebuilt only when
    # need_update() says the GFF is newer than the cache.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                # Attribute values may come back as a list of strings; take
                # the first one so the join below always receives a string.
                if isinstance(conf_class, (list, tuple)):
                    conf_class = conf_class[0] if conf_class else "all"
                tsize = sum(
                    (c.stop - c.start + 1)
                    for c in g.children(fid, 1)
                    if c.featuretype == "exon"
                )
                print("\t".join((fid, str(tsize), conf_class)), file=fw)

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1) if c.featuretype == "mRNA"]
        if not transcripts:
            # Genes without mRNA children have no group to be assigned to.
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set(
            (c.chrom, c.start, c.stop)
            for c in g.children(fid, 2)
            if c.featuretype == "exon"
        )
        # A gene takes the group of its first transcript.
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report, keyed by (metric name, group)
    distinct_groups = set(conf_classes.values())
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if not num_genes:
            # A group can exist among transcripts yet own no gene (genes are
            # grouped by their first transcript only); skip it rather than
            # divide by zero.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = (
            cum_transcript_size * 1. / num_transcripts if num_transcripts else 0.
        )
        mean_exon_size = cum_exon_size * 1. / num_exons if num_exons else 0.

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = mean_num_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    print(tabulate(r), file=sys.stderr)
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    # The docstring above doubles as the command-line usage text.
    #
    # Record layout used throughout for each mRNA:
    #   (mRNA id, reference isoform or None, new isoform label,
    #    start, summed exon length, len(prefix))
    # Records are ordered by start, then longer first, then larger prefix.
    import os
    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores", type="str",
                 help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    def rank(k):
        # Sort key shared by every ordering step below.
        return (k[3], -k[4], -k[5])

    run_needle = not opts.scores
    if run_needle:
        fh, pairsfile = mkstemp(prefix="pairs", suffix=".txt", dir=".")
        # mkstemp hands back an open OS-level descriptor; the file is reopened
        # by name below, so release the descriptor instead of leaking it.
        os.close(fh)
        fw = must_open(pairsfile, "w")
        # Build the reference id lookup once rather than once per isoform.
        ref_ids = set(reffasta.keys())

    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")):
        geneid = atg_name(gene.id, retval="locus")
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene, featuretype="mRNA",
                                   order_by=("seqid", "start")):
            if re.match(atg_name_pat, mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype="exon")
                # IDs containing "-" are collapsed (hybrid) transcripts.
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        for mrna in sorted(updated_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval="iso")
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            if iso == newiso:
                # No extra suffix: the ID was inherited from the reference
                # and may collide with siblings claiming the same isoform.
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append(
                    (mrna[0], iso, newiso, mstart, mlen, len(pf)))
            else:
                novel[geneid].append(
                    (mrna[0], None, newiso, mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            _iso, _newiso = [], []
            for member in sorted(mrnaid.split("-")):
                a = atg_name(member, retval="iso")
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", member))
                _iso.append(a)
                _newiso.append(b)

            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    if iso not in conflict[geneid]:
                        # Claim the first still-unclaimed reference isoform.
                        conflict[geneid][iso] = \
                            [(mrna[0], iso, newiso, mstart, mlen, len(pf))]
                        _novel = None
                        break

                _novel = True

            if _novel is not None:
                novel[geneid].append(
                    (mrna[0], None, newiso, mstart, mlen, len(pf)))

        if run_needle:
            # Queue (reference id, candidate id) pairs for alignment.
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in ref_ids:
                    for mrna in conflict[geneid][isoform]:
                        print("\t".join(str(x) for x in (mrnaid, mrna[0])),
                              file=fw)

    if run_needle:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        # NOTE(review): splits at the first "." of the path, presumably to
        # match how needle names its output -- confirm before changing.
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=rank)
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                novel[geneid].extend(conflict[geneid][iso])
                continue
            # First entry of the sorted score list for this reference isoform;
            # its second field is matched against candidate mRNA ids.
            top_score = scores[_iso][0][1]
            result = next(
                (i for i, v in enumerate(conflict[geneid][iso])
                 if v[0] == top_score),
                None,
            )
            if result is not None:
                # The best-scoring candidate keeps the reference isoform
                # number; the rest are treated as novel.
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=rank)

    fw = must_open(opts.outfile, "w")
    for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")):
        geneid = gene.id
        print(gene, file=fw)

        seen = []
        if geneid in primary:
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    seen.append(int(mrna[1]))
                else:
                    # Number novel isoforms after the highest retained one.
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                _mrna["ID"], _mrna["_old_ID"] = [_mrnaid], [_mrna.id]

                print(_mrna, file=fw)
                for c in gffdb.children(_mrna, order_by="start"):
                    c["Parent"] = [_mrnaid]
                    print(c, file=fw)
        else:
            # Genes with nothing to reindex are written out unchanged.
            for feat in gffdb.children(gene, order_by=("seqid", "start")):
                print(feat, file=fw)

    fw.close()
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through each locus (shared locus name or overlapping CDS)
    and identify same/different isoforms (shared splicing structure)
    across the input datasets.

    If `slop` is enabled, consolidation will collapse any variation
    in terminal UTR lengths, keeping the longest as representative.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index, match_subfeats
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    supported_modes = ["name", "coords"]
    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
                 help="allow minor variation in terminal 5'/3' UTR" +
                      " start/stop position [default: %default]")
    p.add_option("--inferUTR", default=False, action="store_true",
                 help="infer presence of UTRs from exon coordinates")
    p.add_option("--mode", default="name", choices=supported_modes,
                 help="method used to determine overlapping loci")
    p.add_option("--summary", default=False, action="store_true",
                 help="Generate summary table of consolidation process")
    p.add_option("--clusters", default=False, action="store_true",
                 help="Generate table of cluster members after consolidation")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop
    inferUTR = opts.inferUTR
    mode = opts.mode

    if len(args) < 2:
        sys.exit(not p.print_help())

    # One indexed database per input file, keyed by a short unique name.
    gffdbx = {}
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)

    # Step 1: group genes from all inputs into loci.
    loci = Grouper()
    for dbn in gffdbx:
        odbns = [odbn for odbn in gffdbx if dbn != odbn]
        for gene in gffdbx[dbn].features_of_type("gene",
                                                 order_by=("seqid", "start")):
            if mode == "name":
                # The bare gene id links same-named genes across inputs.
                loci.join(gene.id, (gene.id, dbn))
                continue

            if (gene.id, dbn) in loci:
                continue
            loci.join((gene.id, dbn))
            gene_cds = list(gffdbx[dbn].children(gene, featuretype="CDS",
                                                 order_by="start"))
            if not gene_cds:
                # No CDS, so no coding span to overlap with; the gene stays
                # in its own locus instead of raising IndexError.
                continue
            gene_cds_start, gene_cds_stop = gene_cds[0].start, gene_cds[-1].stop
            for odbn in odbns:
                for ogene_cds in gffdbx[odbn].region(
                        seqid=gene.seqid, start=gene_cds_start,
                        end=gene_cds_stop, strand=gene.strand,
                        featuretype="CDS"):
                    for ogene in gffdbx[odbn].parents(ogene_cds,
                                                      featuretype="gene"):
                        loci.join((gene.id, dbn), (ogene.id, odbn))

    # Step 2: one merged gene feature per locus, plus its mRNAs per input.
    gfeats = {}
    mrna = AutoVivification()
    for i, locus in enumerate(loci):
        gene = "gene_{0:0{pad}}".format(i, pad=6) if mode == "coords" else None

        for elem in locus:
            # Only (gene id, db name) tuples are members; the bare-string
            # elements added in "name" mode are just linking keys.
            if not isinstance(elem, tuple):
                continue
            _gene, dbn = elem
            if gene is None:
                gene = _gene

            g = gffdbx[dbn][_gene]
            if gene not in gfeats:
                gfeats[gene] = g
                gfeats[gene].attributes["ID"] = [gene]
            else:
                # Widen the merged gene to span every member.
                if g.start < gfeats[gene].start:
                    gfeats[gene].start = g.start
                if g.stop > gfeats[gene].stop:
                    gfeats[gene].stop = g.stop

            c = list(gffdbx[dbn].children(_gene, featuretype="mRNA",
                                          order_by="start"))
            if len(c) > 0:
                mrna[gene][dbn] = c

    # Step 3: cluster equivalent transcripts and emit one representative each.
    fw = must_open(opts.outfile, "w")
    print("##gff-version 3", file=fw)
    seen = {}
    if opts.summary:
        summaryfile = "{0}.summary.txt".format(opts.outfile.rsplit(".")[0])
        sfw = must_open(summaryfile, "w")
        summary = ["id"]
        summary.extend(gffdbx.keys())
        print("\t".join(str(x) for x in summary), file=sfw)
    if opts.clusters:
        clustersfile = "{0}.clusters.txt".format(opts.outfile.rsplit(".")[0])
        cfw = must_open(clustersfile, "w")
        clusters = ["id", "dbns", "members", "trlens"]
        print("\t".join(str(x) for x in clusters), file=cfw)

    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                dbx1, dbx2 = gffdbx[dbn1], gffdbx[dbn2]
                for mrna1, mrna2 in product(mrna[gene][dbn1], mrna[gene][dbn2]):
                    mrna1s = mrna1.stop - mrna1.start + 1
                    mrna2s = mrna2.stop - mrna2.start + 1
                    # Register both transcripts even if they end up unmatched.
                    g.join((dbn1, mrna1.id, mrna1s))
                    g.join((dbn2, mrna2.id, mrna2s))

                    if match_subfeats(mrna1, mrna2, dbx1, dbx2,
                                      featuretype="CDS"):
                        ftypes = ["exon"] if inferUTR \
                            else ["five_prime_UTR", "three_prime_UTR"]
                        res = [
                            match_subfeats(mrna1, mrna2, dbx1, dbx2,
                                           featuretype=ftype, slop=slop)
                            for ftype in ftypes
                        ]

                        # Strict `== True` is kept from the original, since
                        # match_subfeats' return type is not visible here.
                        if all(r == True for r in res):
                            g.join((dbn1, mrna1.id, mrna1s),
                                   (dbn2, mrna2.id, mrna2s))
        else:
            # Locus present in a single input: every transcript stands alone.
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id, mrna1.stop - mrna1.start + 1))

        print(gfeats[gene], file=fw)

        for group in g:
            # The longest transcript represents the cluster.
            group.sort(key=lambda x: x[2], reverse=True)
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]

            # Sort the database names so the emitted IDs are reproducible;
            # iterating a raw set gives no stable order.
            dbset = set(dbs)
            dbid = "|".join(str(x) for x in sorted(dbset, key=str))
            _mrnaid = []
            for x in mrnas:
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}|{1}".format(dbid, "-".join(_mrnaid))
            # Disambiguate repeated IDs with a running numeric suffix.
            if mrnaid not in seen:
                seen[mrnaid] = 0
            else:
                seen[mrnaid] += 1
                mrnaid = "{0}-{1}".format(mrnaid, seen[mrnaid])

            _mrna = gffdbx[d][m]
            _mrna.attributes["ID"] = [mrnaid]
            _mrna.attributes["Parent"] = [gene]
            children = gffdbx[d].children(m, order_by="start")
            print(_mrna, file=fw)
            for child in children:
                child.attributes["ID"] = ["{0}|{1}".format(dbid, child.id)]
                child.attributes["Parent"] = [mrnaid]
                print(child, file=fw)

            if opts.summary:
                summary = [mrnaid]
                summary.extend(["Y" if db in dbset else "N" for db in gffdbx])
                print("\t".join(str(x) for x in summary), file=sfw)

            if opts.clusters:
                clusters = [mrnaid]
                clusters.append(",".join(str(el[0]) for el in group))
                clusters.append(",".join(str(el[1]) for el in group))
                clusters.append(",".join(str(el[2]) for el in group))
                print("\t".join(str(x) for x in clusters), file=cfw)

    fw.close()
    if opts.summary:
        sfw.close()
    if opts.clusters:
        cfw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # The docstring above doubles as the command-line usage text.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back"
        + "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    # With neither list given, both ends of every transcript are trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Optional per-feature coordinate ranges: "<id>\t<start>\t<stop>" lines.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                if not tr.strip():
                    continue  # tolerate blank lines
                fields = tr.split("\t")
                # Explicit check: an assert would be stripped under -O.
                if len(fields) != 3:
                    raise ValueError(
                        "Must specify (start, stop) coordinate range")
                fid, start, stop = fields
                trimrange[fid] = (int(start), int(stop))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(
        featuretype="gene", order_by=("seqid", "start"), level=1
    ):
        for c in feat:
            cid, ctype, cparent = (
                c.id,
                c.featuretype,
                c.attributes.get("Parent", [None])[0],
            )
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # An mRNA is trimmed when it or its parent gene is listed; the
                # mRNA id is added to the set as well.
                if cid in trim5 or cparent in trim5:
                    t5 = True
                    trim5.add(cid)
                if cid in trim3 or cparent in trim3:
                    t3 = True
                    trim3.add(cid)

                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[
                            refc.attributes["Parent"][0]
                        ].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            # Reuse reference UTRs only when the CDS
                            # children compare equal.
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                        refc, featuretype=utr_type
                                    ):
                                        extras.add(utr)
                                        # Also carry over the reference exons
                                        # in the UTR's region for this mRNA.
                                        for exon in refgff.region(
                                            region=utr, featuretype="exon"
                                        ):
                                            if exon.attributes["Parent"][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Feature absent from the reference: plain trimming.
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    if _ctype in utr_types:
                        # Replaced by the reference UTRs written below.
                        continue
                    if _ctype == "CDS":
                        fprint(cc, fw)
                        continue
                    if _ctype == "exon" and any(
                        range_overlap(to_range(cc), to_range(x))
                        for x in extras
                        if x.featuretype == "exon"
                    ):
                        # Superseded by an overlapping reference exon.
                        continue
                    trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # The docstring above doubles as the command-line usage text.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gff_file,) = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Cache of "<mRNA id> <summed exon bp> <group>" rows, rebuilt only when
    # need_update() says the GFF is newer than the cache.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                conf_class = feat.attributes.get(gb, "all")
                # Attribute values may come back as a list of strings; take
                # the first one so the join below always receives a string.
                if isinstance(conf_class, (list, tuple)):
                    conf_class = conf_class[0] if conf_class else "all"
                tsize = sum(
                    (c.stop - c.start + 1)
                    for c in g.children(fid, 1)
                    if c.featuretype == "exon"
                )
                print("\t".join((fid, str(tsize), conf_class)), file=fw)

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1) if c.featuretype == "mRNA"]
        if not transcripts:
            # Genes without mRNA children have no group to be assigned to.
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set(
            (c.chrom, c.start, c.stop)
            for c in g.children(fid, 2)
            if c.featuretype == "exon"
        )
        # A gene takes the group of its first transcript.
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report, keyed by (metric name, group)
    distinct_groups = set(conf_classes.values())
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if not num_genes:
            # A group can exist among transcripts yet own no gene (genes are
            # grouped by their first transcript only); skip it rather than
            # divide by zero.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = (
            cum_transcript_size * 1. / num_transcripts if num_transcripts else 0.
        )
        mean_exon_size = cum_exon_size * 1. / num_exons if num_exons else 0.

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = mean_num_transcripts
        r[("Max number of transcripts per gene", group)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    print(tabulate(r), file=fw)
    fw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # The docstring above is handed to OptionParser as usage text.
    #
    # args: command-line argument list; exactly one positional argument
    #       (the GFF file) is required, otherwise help is printed and the
    #       process exits.
    # Output: features are written through fprint() to opts.outfile, gene
    #       by gene. Returns None.
    # Helpers trim, reinstate, cmp_children, get_cds_minmax, range_minmax,
    # range_overlap, to_range and fprint are defined elsewhere in the file;
    # their exact semantics are not visible here.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str", \
        help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str", \
        help="File containing gene list for 3' UTR trimming")
    # NOTE(review): the two help fragments below are concatenated without a
    # separating space, so the help text reads "trim backbased on ...".
    p.add_option("--trimrange", default=None, type="str", \
        help="File containing gene list for UTR trim back" + \
             "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str", \
        help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # With neither --trim5 nor --trim3 given, both ends are trimmed.
    trim_both = False if (opts.trim5 or opts.trim3) else True
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()
    # Optional per-feature coordinate ranges: three tab-separated columns
    # (id, start, stop) per line.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        for tr in trf:
            assert len(tr.split("\t")) == 3, \
                "Must specify (start, stop) coordinate range"
            # NOTE(review): `id` shadows the builtin of the same name.
            id, start, stop = tr.split("\t")
            trimrange[id] = (int(start), int(stop))
        trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    # Each `feat` is iterated below as a group of features; only "gene" and
    # "mRNA" members are handled directly, other children are reached
    # through gff.children() of the mRNA.
    for feat in gff.iter_by_parent_childs(featuretype="gene", order_by=("seqid", "start"), level=1):
        for c in feat:
            cid, ctype, cparent = c.id, c.featuretype, \
                c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = True if cid in trim5 else False
                t3 = True if cid in trim3 else False
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                # utr_types: UTR feature types taken from the reference;
                # extras: reference features to emit after this mRNA's
                # own children.
                utr_types, extras = [], set()
                # An mRNA is selected when either its own id or its parent's
                # id is listed; its id is then added to the set as well.
                if any(id in trim5 for id in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(id in trim3 for id in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)
                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        if refctype == "mRNA" and refptype == "gene":
                            # Only reuse reference UTRs when cmp_children()
                            # accepts the CDS children of both databases
                            # (presumably: identical CDS structure - TODO confirm).
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3, both=trim_both)
                                if t5: utr_types.append('five_prime_UTR')
                                if t3: utr_types.append('three_prime_UTR')
                                for utr_type in utr_types:
                                    for utr in refgff.children(refc, featuretype=utr_type):
                                        extras.add(utr)
                                        # Also collect reference exons in the
                                        # UTR's region that belong to this mRNA.
                                        for exon in refgff.region(region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        # NOTE(review): this else pairs with the mRNA/gene type
                        # check. When the CDS comparison above fails, refc stays
                        # set and the mRNA itself is not trimmed below, while its
                        # children still are - confirm this is intended.
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Feature missing from the reference database. refc is
                        # still None if the first lookup failed, but remains set
                        # if only the parent lookup failed.
                        pass
                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    # Combine the user-supplied range with the CDS extent.
                    start, end = range_minmax([trimrange[cid], (start, end)])
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    # Children whose type was reinstated from the reference are
                    # not printed from `gff`; the reference copies in `extras`
                    # are printed instead.
                    if _ctype not in utr_types:
                        if _ctype != "CDS":
                            if _ctype == "exon":
                                # Skip an exon that overlaps any reference exon
                                # queued in `extras`.
                                eskip = [range_overlap(to_range(cc), to_range(x)) \
                                            for x in extras if x.featuretype == 'exon']
                                if any(skip for skip in eskip): continue
                            trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                            fprint(cc, fw)
                        else:
                            # CDS features are written unchanged.
                            fprint(cc, fw)
                for x in extras:
                    fprint(x, fw)
    fw.close()
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    # The docstring above is handed to OptionParser as usage text.
    #
    # args: command-line argument list; at least two GFF files are
    #       required, otherwise help is printed and the process exits.
    # Output: consolidated GFF3 records go to opts.outfile; a tab-separated
    #       presence/absence summary (one row per merged mRNA, one Y/N
    #       column per input database) goes to stderr. Returns None.
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
                 help="allow minor variation in terminal 5'/3' UTR" + \
                      " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}       # database name -> indexed GFF
    gene_coords = {}  # gene id -> start/stop values seen in every database
    mrna = AutoVivification()  # gene id -> database name -> list of mRNAs
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            gene_coords.setdefault(gene.id, []).extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene, featuretype='mRNA',
                                          order_by='start'))
            if c:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version 3"
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print >> sys.stderr, "\t".join(str(x) for x in summary)
    for gene in mrna:
        # One Grouper per gene collects (database, mRNA id) pairs; pairs
        # joined together end up in the same group (presumably a
        # disjoint-set structure - see jcvi.utils.grouper).
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if dbns:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    # Register both transcripts so each appears in the
                    # output even when it matches nothing.
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))
                    fUTR, tUTR = None, None
                    # First check uses match_subfeats' default feature type
                    # (defined elsewhere); UTRs are compared only if it passes.
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2,
                                              gffdbx[dbn1], gffdbx[dbn2],
                                              featuretype='five_prime_UTR',
                                              slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2,
                                              gffdbx[dbn1], gffdbx[dbn2],
                                              featuretype='three_prime_UTR',
                                              slop=slop)

                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            # Gene present in a single database: nothing to compare, just
            # register its transcripts.
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        # Emit the gene record from the first database that has it, widened
        # to the smallest start / largest stop seen across all databases.
        dbn = next(iter(mrna[gene]))
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print >> fw, _gene

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            # Representative transcript: the first member, or with --slop
            # the longest one (first wins on ties).
            d, m = dbs[0], mrnas[0]
            if slop:
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbset = set(dbs)
            dbid = "".join(str(x) for x in dbset)
            # Order-preserving de-duplication. The previous list
            # comprehension tested membership against an empty list, so an
            # mRNA id shared by several databases was repeated in the
            # merged ID (e.g. "a-a").
            _mrnaid = []
            for x in mrnas:
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]

            children = gffdbx[d].children(m, order_by='start')
            print >> fw, _mrna
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print >> fw, child

            summary = [mrnaid]
            summary.extend(['Y' if db in dbset else 'N' for db in gffdbx])
            print >> sys.stderr, "\t".join(str(x) for x in summary)

    fw.close()