def report(args): """ %prog report [--options] ace_file > report Prepare a report of read location, consensus location or quality segment per contig """ from jcvi.utils.table import tabulate p = OptionParser(report.__doc__) types = {"read": ["padded_start", "padded_end", "orient"], "consensus": ["padded_consensus_start", "padded_consensus_end"], "quality" : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"] } valid_types = tuple(types.keys()) p.add_option("--type", default="read", choices=valid_types, help="choose report type [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) acefile, = args ace = Ace.read(must_open(acefile)) logging.debug('Loaded ace file {0}'.format(acefile)) for c in ace.contigs: print c.name table = dict() if opts.type == "read": ps, pe = [], [] ps = [read.padded_start for read in c.af] for i in xrange(1, len(ps)): pe.append(ps[i] - ps[i-1]) pe.append(c.nbases) map = dict(zip(ps, pe)) for i, read in enumerate(c.af): values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "consensus": for read in c.bs: values = [str(x) for x in (read.padded_start, read.padded_end)] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "quality": for read in c.reads: (r1, r2) = (read.rd, read.qa) values = [str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end)] for i, label in enumerate(types[opts.type]): table[(str(r1.name), label)] = values[i] print tabulate(table), "\n"
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    # Companion file names are derived from the input BED name with its last
    # extension stripped.
    prefix = inputbed.rsplit(".", 1)[0]
    table_opts = dict(sep=opts.sep, align=opts.align)

    cc = Map(prefix + ".bed")
    mapnames = cc.mapnames
    sizes = Sizes(scaffolds)
    total, l50, n50 = sizes.summary

    fw = must_open(opts.outfile, "w")

    # Section 1: one table column per individual map.
    print >> fw, "*** Summary for each individual map ***"
    per_map = {}
    maps = []
    for mapname in mapnames:
        ms = MapSummary([m for m in cc if m.mapname == mapname], l50, sizes)
        per_map["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(per_map, mapname, total)
        maps.append(ms)
    print >> fw, tabulate(per_map, **table_opts)

    # Section 2: scaffold sets taken from the chromosome-level AGP.
    agp = AGP(prefix + ".chr.agp")
    print >> fw, "*** Summary for consensus map ***"

    def component_ids(keep):
        # Component ids of the AGP lines accepted by `keep`.
        return set(line.component_id for line in agp if keep(line))

    consensus_scaffolds = component_ids(lambda line: not line.is_gap)
    oriented_scaffolds = component_ids(
        lambda line: (not line.is_gap) and line.orientation != '?')
    # Everything with a known size that never appears as a non-gap component.
    unplaced_scaffolds = set(sizes.mapping.keys()) - consensus_scaffolds

    consensus = {}
    categories = (("Anchored", consensus_scaffolds),
                  ("Oriented", oriented_scaffolds),
                  ("Unplaced", unplaced_scaffolds))
    for category, members in categories:
        markers = [m for m in cc if m.seqid in members]
        ms = MapSummary(markers, l50, sizes, scaffolds=members)
        ms.export_table(consensus, category, total)
    print >> fw, tabulate(consensus, **table_opts)
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    # Strip the last extension; the map BED and the AGP share this prefix.
    stem = inputbed.rsplit(".", 1)[0]
    sep, align = opts.sep, opts.align

    cc = Map(stem + ".bed")
    mapnames = cc.mapnames
    scaffold_sizes = Sizes(scaffolds)
    total, l50, n50 = scaffold_sizes.summary

    fw = must_open(opts.outfile, "w")

    # First table: statistics for every individual map.
    print >> fw, "*** Summary for each individual map ***"
    rows = {}
    maps = []
    for mapname in mapnames:
        own_markers = [marker for marker in cc if marker.mapname == mapname]
        map_summary = MapSummary(own_markers, l50, scaffold_sizes)
        rows["Linkage Groups", mapname] = map_summary.num_lgs
        map_summary.export_table(rows, mapname, total)
        maps.append(map_summary)
    print >> fw, tabulate(rows, sep=sep, align=align)

    # Second table: anchored vs. unplaced scaffolds according to the AGP.
    rows = {}
    agp = AGP(stem + ".chr.agp")
    print >> fw, "*** Summary for consensus map ***"

    consensus_scaffolds = set()
    for line in agp:
        if not line.is_gap:
            consensus_scaffolds.add(line.component_id)
    # Scaffolds with a known size that are not a non-gap AGP component.
    unplaced_scaffolds = set(scaffold_sizes.mapping.keys()) \
        - consensus_scaffolds

    for title, members in (("Anchored", consensus_scaffolds),
                           ("Unplaced", unplaced_scaffolds)):
        hits = [marker for marker in cc if marker.seqid in members]
        map_summary = MapSummary(hits, l50, scaffold_sizes, scaffolds=members)
        map_summary.export_table(rows, title, total)
    print >> fw, tabulate(rows, sep=sep, align=align)
def allstats(args):
    """
    %prog allstats fastafiles

    Summarize multiple FASTA in a table.
    """
    # The docstring doubles as the usage text given to OptionParser.
    from jcvi.utils.table import tabulate

    p = OptionParser(allstats.__doc__)
    p.add_option("--exclude", help="Exclude statistics, must be {0}, "
                 "multiple separated by comma [default: %default]".
                 format("|".join(header)))
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    # --exclude has no default, so opts.exclude is None when the flag is not
    # given; treat that as "exclude nothing" instead of calling .split() on
    # None.
    exclude = opts.exclude.split(",") if opts.exclude else []
    unknown = [x for x in exclude if x not in header]
    assert not unknown, \
        "Unknown statistics in --exclude: {0}".format(",".join(unknown))

    # One table row per FASTA file (keyed by the file name without its last
    # extension), one column per statistic yielded by n50().
    tabledict = {}
    for fastafile in fastafiles:
        pf = fastafile.rsplit(".", 1)[0]
        for key, val in n50([fastafile]):
            if key in exclude:
                continue
            tabledict[(pf, key)] = val

    table = tabulate(tabledict)
    print >> sys.stderr, table
def test_tabulate():
    # Checks the string rendering of tabulate() for a small dict keyed by
    # (row, column) tuples, both as-is and with transpose=True.
    #
    # NOTE(review): the expected table literals below appear
    # whitespace-collapsed in this view (a rendered table would normally span
    # several lines); verify them against the real tabulate() output before
    # relying on this test.
    from jcvi.utils.table import tabulate

    # Two rows (1, 2) by two columns ("a", "b").
    data = {(1, "a"): 3, (1, "b"): 4, (2, "a"): 5, (2, "b"): 0}
    assert (tabulate(data) == """=========== o a b ----------- 1 3 4 2 5 0 -----------""")
    # With transpose=True the row and column labels swap places.
    assert (tabulate(data, transpose=True) == """=========== o 1 2 ----------- a 3 5 b 4 0 -----------""")
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    # The docstring doubles as the usage text given to OptionParser.
    # tabulate is imported here so the function does not rely on a
    # module-level import being present.
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # One table is printed per metric; `metrics` also names the folder that
    # holds the per-file number lists.
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for gff_file in gff_files:
            # Row label: base name of the GFF file up to its first dot.
            pf = op.basename(gff_file).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            # Read one integer per line; the handle is closed as soon as the
            # numbers are loaded.
            with open(numberfile) as fp:
                numbers = [int(line.strip()) for line in fp]
            for key, val in SummaryStats(numbers).todict().items():
                table[(pf, key)] = val

        print >> sys.stderr, tabulate(table)
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    # The docstring doubles as the usage text given to OptionParser.
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # One table is printed per metric; `metrics` also names the folder that
    # holds the per-file number lists.
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for gff_file in gff_files:
            # Row label: base name of the GFF file up to its first dot.
            pf = op.basename(gff_file).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            # Use a context manager so the number file is closed right after
            # reading instead of being left to the garbage collector.
            with open(numberfile) as fp:
                numbers = [int(line.strip()) for line in fp]
            # Avoid shadowing the builtin `sum`; store each statistic under
            # (file prefix, statistic name).
            stats = SummaryStats(numbers).todict()
            for key, val in stats.items():
                table[(pf, key)] = val

        print >> sys.stderr, tabulate(table)
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull the sequence for one interval out of the reference FASTA.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Collect the sequences of every gene, distinct exon and intron.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))

        # De-duplicate exon coordinates among the gene's level-2 children.
        exons = list(set((c.chrom, c.start, c.stop)
                         for c in g.children(gene.id, 2)
                         if c.featuretype == "exon"))
        for chrom, start, stop in exons:
            exonseqs.append(fetch(chrom, start, stop))

        # Introns are whatever range_interleave() returns for the exons.
        for chrom, start, stop in range_interleave(exons):
            intronseqs.append(fetch(chrom, start, stop))

    # Build the report: one row per feature type.
    r = {}
    feature_sets = (("Gene", geneseqs),
                    ("Exon", exonseqs),
                    ("Intron", intronseqs))
    for t, tseqs in feature_sets:
        tsummary = SummaryStats([len(seq) for seq in tseqs], dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0, target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print >> sys.stderr, tabulate(r)
def script(args): """ %prog script bfs_rfs libs `bfs_rfs` contains the joined results from Brian's `classifyMates`. We want to keep the RFS result (but not in the BFS result) to retain actual MP. Libs contain a list of lib iids, use comma to separate, e.g. "9,10,11". """ p = OptionParser(script.__doc__) opts, args = p.parse_args(args) if len(args) != 2: sys.exit(p.print_help()) fsfile, libs = args libs = [int(x) for x in libs.split(",")] fp = open(fsfile) not_found = ("limited", "exhausted") counts = defaultdict(int) pe, mp = 0, 0 both, noidea = 0, 0 total = 0 for i in libs: print "lib iid {0} allfragsunmated 1".format(i) for row in fp: frgiid, bfs, rfs = row.split() bfs = (bfs not in not_found) rfs = (rfs not in not_found) if bfs and (not rfs): pe += 1 if rfs and (not bfs): mp += 1 frgiid = int(frgiid) mateiid = frgiid + 1 print "frg iid {0} mateiid {1}".format(frgiid, mateiid) print "frg iid {0} mateiid {1}".format(mateiid, frgiid) if bfs and rfs: both += 1 if (not bfs) and (not rfs): noidea += 1 total += 1 assert pe + mp + both + noidea == total counts[("PE", "N")] = pe counts[("MP", "N")] = mp counts[("Both", "N")] = both counts[("No Idea", "N")] = noidea table = tabulate(counts) func = lambda a: a * 100. / total table = table.withNewColumn("Percentage", callback=func, columns=("N",), digits=2) print >> sys.stderr, table
def stats(args):
    """
    %prog stats agpfile

    Print out a report for length of gaps and components.
    """
    # The docstring doubles as the usage text given to OptionParser.
    from jcvi.utils.table import tabulate

    p = OptionParser(stats.__doc__)
    p.add_option("--warn", default=False, action="store_true",
                 help="Warnings on small component spans [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None, so `not ...` yields a non-zero exit
        # status on a usage error, matching the other commands.
        sys.exit(not p.print_help())

    agpfile, = args

    agp = AGP(agpfile)
    # Both lists hold (span, label) tuples so min()/max() rank by span first.
    gap_lengths = []
    component_lengths = []
    for a in agp:
        span = a.object_span
        if a.is_gap:
            label = a.gap_type
            gap_lengths.append((span, label))
        else:
            label = "{0}:{1}-{2}".format(a.component_id, a.component_beg,
                                         a.component_end)
            component_lengths.append((span, label))
            if opts.warn and span < 50:
                logging.error("component span too small ({0}):\n{1}".
                              format(span, a))

    table = dict()
    for label, lengths in zip(("Gaps", "Components"),
                              (gap_lengths, component_lengths)):

        if not lengths:
            # Nothing of this kind in the AGP: fill the row with placeholders.
            table[(label, "Min")] = table[(label, "Max")] \
                                  = table[(label, "Sum")] = "n.a."
            continue

        table[(label, "Min")] = "{0} ({1})".format(*min(lengths))
        table[(label, "Max")] = "{0} ({1})".format(*max(lengths))
        table[(label, "Sum")] = sum(x[0] for x in lengths)

    table = tabulate(table)
    print >> sys.stderr, table
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def interval_seq(chrom, start, stop):
        # Sequence of one (chrom, start, stop) interval from the reference.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences are gathered per feature type; they feed both the size
    # statistics and the GC computation below.
    seqs_by_type = {"Gene": [], "Exon": [], "Intron": []}
    for gene in g.features_of_type("gene"):
        seqs_by_type["Gene"].append(
            interval_seq(gene.chrom, gene.start, gene.stop))

        # Distinct exon coordinates among the gene's level-2 children.
        exon_coords = set()
        for child in g.children(gene.id, 2):
            if child.featuretype == "exon":
                exon_coords.add((child.chrom, child.start, child.stop))
        exon_coords = list(exon_coords)

        for chrom, start, stop in exon_coords:
            seqs_by_type["Exon"].append(interval_seq(chrom, start, stop))

        # Introns are whatever range_interleave() returns for the exons.
        for chrom, start, stop in range_interleave(exon_coords):
            seqs_by_type["Intron"].append(interval_seq(chrom, start, stop))

    # Report: one row per feature type, in a fixed order.
    r = {}
    for t in ("Gene", "Exon", "Intron"):
        tseqs = seqs_by_type[t]
        tsummary = SummaryStats([len(seq) for seq in tseqs], dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0, target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
def __str__(self):
    # Render the four counts as a 2x2 table followed by three percentage
    # lines (sensitivity, specificity, accuracy).
    # NOTE(review): the "Specificity" label prints the formula
    # TP / (TP + FP); confirm it matches how self.specificity is computed.
    from jcvi.utils.table import tabulate

    # Cells are keyed by (row label, column label).
    table = {
        ("Prediction-True", "Reality-True"): self.TP,
        ("Prediction-True", "Reality-False"): self.FP,
        ("Prediction-False", "Reality-True"): self.FN,
        ("Prediction-False", "Reality-False"): self.TN,
    }
    pieces = [str(tabulate(table))]

    # Each ratio is scaled to a percentage and shown with one decimal.
    ratio_lines = (
        ("\nSensitivity [TP / (TP + FN)]: {0:.1f} %\n", self.sensitivity),
        ("Specificity [TP / (TP + FP)]: {0:.1f} %\n", self.specificity),
        ("Accuracy [(TP + TN) / (TP + FP + FN + TN)]: {0:.1f} %",
         self.accuracy),
    )
    for template, ratio in ratio_lines:
        pieces.append(template.format(ratio * 100))

    return "".join(pieces)
def __str__(self):
    # Text form: a 2x2 table of the TP/FP/FN/TN counts, then sensitivity,
    # specificity and accuracy as percentages with one decimal.
    # NOTE(review): the "Specificity" label prints the formula
    # TP / (TP + FP); confirm it matches how self.specificity is computed.
    from jcvi.utils.table import tabulate

    cells = (
        ("Prediction-True", "Reality-True", self.TP),
        ("Prediction-True", "Reality-False", self.FP),
        ("Prediction-False", "Reality-True", self.FN),
        ("Prediction-False", "Reality-False", self.TN),
    )
    # tabulate() takes a dict keyed by (row label, column label).
    table = dict(((row, col), count) for row, col, count in cells)
    rendered = str(tabulate(table))

    sensitivity_line = "\nSensitivity [TP / (TP + FN)]: {0:.1f} %\n".format(
        self.sensitivity * 100)
    specificity_line = "Specificity [TP / (TP + FP)]: {0:.1f} %\n".format(
        self.specificity * 100)
    accuracy_line = \
        "Accuracy [(TP + TN) / (TP + FP + FN + TN)]: {0:.1f} %".format(
            self.accuracy * 100)

    return rendered + sensitivity_line + specificity_line + accuracy_line
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Cache file with one line per mRNA: id, summed exon length, group.
    # It is only rebuilt when need_update() says so.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                # Transcripts lacking the group-by attribute go into "all".
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1)
                            for c in g.children(fid, 1)
                            if c.featuretype == "exon")
                print >> fw, "\t".join((fid, str(tsize), conf_class))

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                       if c.featuretype == "mRNA"]
        if not transcripts:
            # A gene without mRNA children has no group and no transcript
            # sizes; skip it rather than failing on transcripts[0].
            logging.debug("Gene `{0}` has no mRNA, skipped.".format(fid))
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        # A gene is assigned to the group of its first transcript.
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report
    distinct_groups = set(conf_classes.values())
    # `group` (not `g`) so the GFF index above is not clobbered.
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if not num_genes:
            # Groups come from all transcripts but genes are grouped by their
            # first transcript only, so a group can end up with no gene.
            # Skip it to avoid dividing by zero below.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = cum_transcript_size * 1. / num_transcripts
        mean_exon_size = cum_exon_size * 1. / num_exons

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = \
            mean_num_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = \
            mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    print >> sys.stderr, tabulate(r)
def summary(args):
    """
    %prog summary txtfile fastafile

    The txtfile can be generated by: %prog mstmap --noheader --freq=0

    Tabulate on all possible combinations of genotypes and provide results
    in a nicely-formatted table. Give a fastafile for SNP rate (average
    # of SNPs per Kb).

    Only three-column file is supported:
    locus_id intra- genotype inter- genotype
    """
    # The docstring doubles as the usage text given to OptionParser.
    from jcvi.utils.cbook import thousands
    from jcvi.utils.table import tabulate

    p = OptionParser(summary.__doc__)
    p.add_option("--counts",
                 help="Print SNP counts in a txt file [default: %default]")
    p.add_option("--bed",
                 help="Print SNPs locations in a bed file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, fastafile = args

    snps = defaultdict(list)  # contig => list of loci
    combinations = defaultdict(int)
    intraSNPs = interSNPs = 0
    distinctSet = set()  # set of genes that show A-B pattern
    snpcounts, goodsnpcounts = defaultdict(int), defaultdict(int)

    bedfw = open(opts.bed, "w") if opts.bed else None
    try:
        # Context manager so the input handle is closed even when a row
        # fails validation.
        with open(txtfile) as fp:
            header = next(fp).split()  # Header
            # Columns 2 and 3 of the header name the two genotype columns.
            ref, alt = header[1:3]
            for row in fp:
                atoms = row.split()
                assert len(atoms) == 3, \
                    "Only three-column file is supported"
                locus, intra, inter = atoms
                # Locus ids look like `<contig>.<position>`.
                ctg, pos = locus.rsplit(".", 1)
                pos = int(pos)
                snps[ctg].append(pos)
                snpcounts[ctg] += 1

                if intra == 'X':
                    intraSNPs += 1
                if inter in ('B', 'X'):
                    interSNPs += 1
                if intra == 'A' and inter == 'B':
                    distinctSet.add(ctg)
                    goodsnpcounts[ctg] += 1

                # Tabulate all possible combinations
                intra = ref + "-" + intra
                inter = alt + "-" + inter
                combinations[(intra, inter)] += 1

                if bedfw:
                    # BED line: start is pos - 1, end is pos.
                    print >> bedfw, "\t".join(str(x) for x in
                                              (ctg, pos - 1, pos, locus))
    finally:
        # Release the bed handle on both the success and the error path.
        if bedfw:
            bedfw.close()

    if bedfw:
        logging.debug("SNP locations written to `{0}`.".format(opts.bed))

    nsites = sum(len(x) for x in snps.values())
    sizes = Sizes(fastafile)
    bpsize = sizes.totalsize
    # SNPs per Kb over the whole FASTA.
    snprate = lambda a: a * 1000. / bpsize
    m = "Dataset `{0}` contains {1} contigs ({2} bp).\n".\
        format(fastafile, len(sizes), thousands(bpsize))
    m += "A total of {0} SNPs within {1} contigs ({2} bp).\n".\
        format(nsites, len(snps),
               thousands(sum(sizes.mapping[x] for x in snps.keys())))
    m += "SNP rate: {0:.1f}/Kb, ".format(snprate(nsites))
    m += "IntraSNPs: {0} ({1:.1f}/Kb), InterSNPs: {2} ({3:.1f}/Kb)".\
        format(intraSNPs, snprate(intraSNPs), interSNPs, snprate(interSNPs))
    print >> sys.stderr, m
    print >> sys.stderr, tabulate(combinations)

    leg = "Legend: A - homozygous same, B - homozygous different, X - heterozygous"
    print >> sys.stderr, leg

    tag = (ref + "-A", alt + "-B")
    distinctSNPs = combinations[tag]
    tag = str(tag).replace("'", "")
    print >> sys.stderr, "A total of {0} disparate {1} SNPs in {2} contigs.".\
        format(distinctSNPs, tag, len(distinctSet))

    if not opts.counts:
        return

    snpcountsfile = opts.counts
    # Sanity-check the tallies before the output file is created.
    assert sum(snpcounts.values()) == nsites
    assert sum(goodsnpcounts.values()) == distinctSNPs

    with open(snpcountsfile, "w") as fw:
        header = "\t".join(("Contig", "#_SNPs", "#_AB_SNP"))
        print >> fw, header
        for ctg in sorted(snps.keys()):
            snpcount = snpcounts[ctg]
            goodsnpcount = goodsnpcounts[ctg]
            print >> fw, "\t".join(str(x) for x in
                                   (ctg, snpcount, goodsnpcount))

    logging.debug("SNP counts per contig is written to `{0}`.".
                  format(snpcountsfile))
def genestats(args):
    """
    %prog genestats gffile

    Print summary stats, including:
    - Number of genes
    - Number of single-exon genes
    - Number of multi-exon genes
    - Number of distinct exons
    - Number of genes with alternative transcript variants
    - Number of predicted transcripts
    - Mean number of distinct exons per gene
    - Mean number of transcripts per gene
    - Mean gene locus size (first to last exon)
    - Mean transcript size (UTR, CDS)
    - Mean exon size

    Stats modeled after barley genome paper Table 1.
    A physical, genetic and functional sequence assembly of the barley genome
    """
    # The docstring doubles as the usage text given to OptionParser.
    p = OptionParser(genestats.__doc__)
    p.add_option("--groupby", default="conf_class",
                 help="Print separate stats groupby")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gff_file, = args
    gb = opts.groupby
    g = make_index(gff_file)

    # Cache file with one line per mRNA: id, summed exon length, group.
    # It is only rebuilt when need_update() says so.
    tf = "transcript.sizes"
    if need_update(gff_file, tf):
        with open(tf, "w") as sizes_fw:
            for feat in g.features_of_type("mRNA"):
                fid = feat.id
                # Transcripts lacking the group-by attribute go into "all".
                conf_class = feat.attributes.get(gb, "all")
                tsize = sum((c.stop - c.start + 1)
                            for c in g.children(fid, 1)
                            if c.featuretype == "exon")
                print >> sizes_fw, "\t".join((fid, str(tsize), conf_class))

    tsizes = DictFile(tf, cast=int)
    conf_classes = DictFile(tf, valuepos=2)
    logging.debug("A total of {0} transcripts populated.".format(len(tsizes)))

    genes = []
    for feat in g.features_of_type("gene"):
        fid = feat.id
        transcripts = [c.id for c in g.children(fid, 1)
                       if c.featuretype == "mRNA"]
        if not transcripts:
            # A gene without mRNA children has no group and no transcript
            # sizes; skip it rather than failing on transcripts[0].
            logging.debug("Gene `{0}` has no mRNA, skipped.".format(fid))
            continue
        transcript_sizes = [tsizes[x] for x in transcripts]
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        # A gene is assigned to the group of its first transcript.
        conf_class = conf_classes[transcripts[0]]
        gs = GeneStats(feat, conf_class, transcript_sizes, exons)
        genes.append(gs)

    r = {}  # Report
    distinct_groups = set(conf_classes.values())
    # `group` (not `g`) so the GFF index above is not clobbered.
    for group in distinct_groups:
        num_genes = num_single_exon_genes = num_multi_exon_genes = 0
        num_genes_with_alts = num_transcripts = num_exons = 0
        max_transcripts = 0
        cum_locus_size = cum_transcript_size = cum_exon_size = 0
        for gs in genes:
            if gs.conf_class != group:
                continue
            num_genes += 1
            if gs.num_exons == 1:
                num_single_exon_genes += 1
            else:
                num_multi_exon_genes += 1
            num_exons += gs.num_exons
            if gs.num_transcripts > 1:
                num_genes_with_alts += 1
            if gs.num_transcripts > max_transcripts:
                max_transcripts = gs.num_transcripts
            num_transcripts += gs.num_transcripts
            cum_locus_size += gs.locus_size
            cum_transcript_size += gs.cum_transcript_size
            cum_exon_size += gs.cum_exon_size

        if not num_genes:
            # Groups come from all transcripts but genes are grouped by their
            # first transcript only, so a group can end up with no gene.
            # Skip it to avoid dividing by zero below.
            continue

        mean_num_exons = num_exons * 1. / num_genes
        mean_num_transcripts = num_transcripts * 1. / num_genes
        mean_locus_size = cum_locus_size * 1. / num_genes
        mean_transcript_size = cum_transcript_size * 1. / num_transcripts
        mean_exon_size = cum_exon_size * 1. / num_exons

        r[("Number of genes", group)] = num_genes
        r[("Number of single-exon genes", group)] = \
            percentage(num_single_exon_genes, num_genes, mode=1)
        r[("Number of multi-exon genes", group)] = \
            percentage(num_multi_exon_genes, num_genes, mode=1)
        r[("Number of distinct exons", group)] = num_exons
        r[("Number of genes with alternative transcript variants", group)] = \
            percentage(num_genes_with_alts, num_genes, mode=1)
        r[("Number of predicted transcripts", group)] = num_transcripts
        r[("Mean number of distinct exons per gene", group)] = mean_num_exons
        r[("Mean number of transcripts per gene", group)] = \
            mean_num_transcripts
        r[("Max number of transcripts per gene", group)] = max_transcripts
        r[("Mean gene locus size (first to last exon)", group)] = \
            mean_locus_size
        r[("Mean transcript size (UTR, CDS)", group)] = mean_transcript_size
        r[("Mean exon size", group)] = mean_exon_size

    fw = must_open(opts.outfile, "w")
    print >> fw, tabulate(r)
    fw.close()