def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq

    # All diagnostics go to stderr through the print function; the former
    # `print >> sys.stderr, ...` statement form is Python 2 only.
    print(
        "File `{0}` contains {1} chains.".format(chainfile, len(chain)),
        file=sys.stderr,
    )
    print(
        "ungapped={0} dt={1} dq={2}".format(
            human_size(ungapped), human_size(dt), human_size(dq)
        ),
        file=sys.stderr,
    )

    # fsummary() is unpacked as a 3-tuple; only its first element is used
    # here, as the denominator for the mapped percentage.
    oldreal, _, _ = fsummary([oldfasta, "--outfile=/dev/null"])
    print(
        "Old fasta (`{0}`) mapped: {1}".format(
            oldfasta, percentage(ungapped, oldreal)
        ),
        file=sys.stderr,
    )

    newreal, _, _ = fsummary([newfasta, "--outfile=/dev/null"])
    print(
        "New fasta (`{0}`) mapped: {1}".format(
            newfasta, percentage(ungapped, newreal)
        ),
        file=sys.stderr,
    )
def venn(args):
    """
    %prog venn *.benchmark

    Display benchmark results as Venn diagram.
    """
    from matplotlib_venn import venn2

    p = OptionParser(venn.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="9x9")

    if len(args) < 1:
        sys.exit(not p.print_help())

    bcs = args
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    pad = .02
    ystart = 1
    ywidth = 1. / len(bcs)
    tags = ("Bowers", "YGOB", "Schnable")
    # zip() stops at the shorter input, so at most len(tags) files are drawn.
    for bc, tag in zip(bcs, tags):
        # Each row carries four whitespace-separated fields:
        # prog, pcounts, tcounts, shared (the last three are integers).
        data = []
        with open(bc) as fp:  # close the benchmark file once it is parsed
            for row in fp:
                prog, pcounts, tcounts, shared = row.split()
                data.append((prog, int(pcounts), int(tcounts), int(shared)))

        # One row of panels per file, one panel per program in that file.
        xstart = 0
        xwidth = 1. / len(data)
        for prog, pcounts, tcounts, shared in data:
            a, b, c = pcounts - shared, tcounts - shared, shared
            ax = fig.add_axes([xstart + pad, ystart - ywidth + pad,
                               xwidth - 2 * pad, ywidth - 2 * pad])
            venn2(subsets=(a, b, c), set_labels=(prog, tag), ax=ax)
            message = "Sn={0} Pu={1}".format(
                percentage(shared, tcounts, precision=0, mode=-1),
                percentage(shared, pcounts, precision=0, mode=-1))
            print(message, file=sys.stderr)
            ax.text(.5, .92, latex(message), ha="center", va="center",
                    transform=ax.transAxes, color='b')
            ax.set_axis_off()
            xstart += xwidth
        ystart -= ywidth

    # Panel letters and titles are hard-coded for three rows.
    panel_labels(root, ((.04, .96, "A"), (.04, .96 - ywidth, "B"),
                        (.04, .96 - 2 * ywidth, "C")))
    panel_labels(root, ((.5, .98, "A. thaliana duplicates"),
                        (.5, .98 - ywidth, "14 Yeast genomes"),
                        (.5, .98 - 2 * ywidth, "4 Grass genomes")))
    normalize_axes(root)
    savefig("venn.pdf", dpi=opts.dpi)
def venn(args):
    """
    %prog venn *.benchmark

    Display benchmark results as Venn diagram.
    """
    from matplotlib_venn import venn2

    p = OptionParser(venn.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="9x9")

    if len(args) < 1:
        sys.exit(not p.print_help())

    bcs = args
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    pad = .02
    ystart = 1
    ywidth = 1. / len(bcs)
    tags = ("Bowers", "YGOB", "Schnable")
    # Files are paired with tags positionally; zip() ignores any extra files.
    for bc, tag in zip(bcs, tags):
        data = []
        # Context manager so the handle is released after parsing.
        with open(bc) as fp:
            for row in fp:
                # Four whitespace-separated fields per row.
                prog, pcounts, tcounts, shared = row.split()
                pcounts = int(pcounts)
                tcounts = int(tcounts)
                shared = int(shared)
                data.append((prog, pcounts, tcounts, shared))

        xstart = 0
        xwidth = 1. / len(data)
        for prog, pcounts, tcounts, shared in data:
            # Venn subsets: exclusive to first set, exclusive to second set,
            # and the intersection.
            a, b, c = pcounts - shared, tcounts - shared, shared
            ax = fig.add_axes([xstart + pad, ystart - ywidth + pad,
                               xwidth - 2 * pad, ywidth - 2 * pad])
            venn2(subsets=(a, b, c), set_labels=(prog, tag), ax=ax)
            message = "Sn={0} Pu={1}".format(
                percentage(shared, tcounts, precision=0, mode=-1),
                percentage(shared, pcounts, precision=0, mode=-1))
            # print function instead of the Python 2 `print >>` statement
            print(message, file=sys.stderr)
            ax.text(.5, .92, latex(message), ha="center", va="center",
                    transform=ax.transAxes, color='b')
            ax.set_axis_off()
            xstart += xwidth
        ystart -= ywidth

    panel_labels(root, ((.04, .96, "A"), (.04, .96 - ywidth, "B"),
                        (.04, .96 - 2 * ywidth, "C")))
    panel_labels(root, ((.5, .98, "A. thaliana duplicates"),
                        (.5, .98 - ywidth, "14 Yeast genomes"),
                        (.5, .98 - 2 * ywidth, "4 Grass genomes")))
    normalize_axes(root)
    savefig("venn.pdf", dpi=opts.dpi)
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fillfile,) = args
    scaffolds = 0
    gaps = []
    # Lines starting with ">" are counted as scaffolds; every other line is
    # parsed as one gap record. The file is closed once it has been read.
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    print(
        "Closed gaps: {0} size: {1} bp".format(
            percentage(len(closed), totalgaps), thousands(closedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print(
        "Remaining gaps: {0} size: {1} bp".format(
            percentage(len(notClosed), totalgaps), thousands(notClosedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
def print_stats(self):
    """Write AL50 and the coverage/span summaries to stderr, one per line."""
    # Read the four denominators first, in the same order as before.
    denominators = (
        ("Query coverage: {}", self.qrycovered),
        ("Reference coverage: {}", self.refcovered),
        ("Query span: {}", self.qryspan),
        ("Reference span: {}", self.refspan),
    )
    report = [
        "AL50 (>=50% of bases in alignment blocks >= this size): {}".format(
            self.AL50
        )
    ]
    for template, denominator in denominators:
        report.append(template.format(percentage(self.identicals, denominator)))
    print("\n".join(report), file=sys.stderr)
def filter(args):
    """
    %prog filter frgfile idsfile

    Removes the reads from frgfile that are indicated as duplicates in the
    clstrfile (generated by CD-HIT-454). `idsfile` includes a set of names to
    include in the filtered frgfile. See apps.cdhit.ids().
    """
    p = OptionParser(filter.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    frgfile, idsfile = args
    assert frgfile.endswith(".frg")

    # One allowed read name per line of idsfile.
    with open(idsfile) as fp:
        allowed = set(x.strip() for x in fp)
    logging.debug("A total of {0} allowed ids loaded.".format(len(allowed)))

    newfrgfile = frgfile.replace(".frg", ".filtered.frg")
    nfrags, discarded_frags = 0, 0
    nmates, discarded_mates = 0, 0
    # Both handles are closed (and the output flushed) when the block exits.
    with open(frgfile) as fp, open(newfrgfile, "w") as fw:
        for rec in iter_records(fp):
            if rec.type == "FRG":
                # Trailing "a"/"b" characters are stripped before the lookup.
                readname = rec.get_field("acc").rstrip("ab")
                nfrags += 1
                if readname not in allowed:
                    discarded_frags += 1
                    continue
            if rec.type == "LKG":
                readname = rec.get_field("frg").rstrip("ab")
                nmates += 1
                if readname not in allowed:
                    discarded_mates += 1
                    continue
            # Records of any other type are written through unchanged.
            print(rec, file=fw)

    # Print out a summary
    survived_frags = nfrags - discarded_frags
    survived_mates = nmates - discarded_mates
    print(
        "Survived fragments: {0}".format(percentage(survived_frags, nfrags)),
        file=sys.stderr,
    )
    print(
        "Survived mates: {0}".format(percentage(survived_mates, nmates)),
        file=sys.stderr,
    )
def fix(args):
    """
    %prog fix bedfile > newbedfile

    Fix non-standard bed files. One typical problem is start > end.
    """
    p = OptionParser(fix.__doc__)
    p.add_option("--minspan", default=0, type="int",
                 help="Enforce minimum span [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    minspan = opts.minspan
    nfixed = nfiltered = ntotal = 0
    # The input handle is closed on exit; the output from must_open() is
    # left alone, as before.
    with open(bedfile) as fp:
        fw = must_open(opts.outfile, "w")
        for row in fp:
            atoms = row.strip().split("\t")
            assert len(atoms) >= 3, "Must be at least 3 columns"
            seqid, start, end = atoms[:3]
            start, end = int(start), int(end)
            orientation = '+'
            if start > end:
                # Swap reversed coordinates and remember the strand flip.
                start, end = end, start
                orientation = '-'
                nfixed += 1

            atoms[1:3] = [str(start), str(end)]
            # NOTE(review): this writes the orientation into index 6 (the
            # seventh column) -- confirm that is the intended strand column.
            if len(atoms) > 6:
                atoms[6] = orientation
            line = "\t".join(atoms)
            b = BedLine(line)

            # nfiltered counts the lines that pass the span filter and are
            # written out.
            if b.span >= minspan:
                print(b, file=fw)
                nfiltered += 1

            ntotal += 1

    if nfixed:
        logging.debug("Total fixed: {0}".format(percentage(nfixed, ntotal)))
    if nfiltered:
        logging.debug("Total filtered: {0}".format(
            percentage(nfiltered, ntotal)))
def print_stats(self):
    """Report AL50 plus query/reference coverage and span on stderr."""
    qry_cov, ref_cov = self.qrycovered, self.refcovered
    qry_span, ref_span = self.qryspan, self.refspan

    al50_line = "AL50 (>=50% of bases in alignment blocks >= this size): {}"
    messages = [al50_line.format(self.AL50)]
    messages += [
        "Query coverage: {}".format(percentage(self.identicals, qry_cov)),
        "Reference coverage: {}".format(percentage(self.identicals, ref_cov)),
        "Query span: {}".format(percentage(self.identicals, qry_span)),
        "Reference span: {}".format(percentage(self.identicals, ref_span)),
    ]
    # A single write, with the five messages separated by newlines.
    print("\n".join(messages), file=sys.stderr)
def filter(args):
    """
    %prog filter bedfile

    Filter the bedfile to retain records between certain size range.
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=0, type="int",
                 help="Minimum feature length")
    # Help text previously said "Minimum" for this upper bound.
    p.add_option("--maxsize", default=1000000000, type="int",
                 help="Maximum feature length")
    p.add_option(
        "--minaccn", type="int",
        help="Minimum value of accn, useful to filter based on coverage")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    fp = must_open(bedfile)
    fw = must_open(opts.outfile, "w")
    minsize, maxsize = opts.minsize, opts.maxsize
    minaccn = opts.minaccn
    # Spans of every record seen, and of the records written out.
    total = []
    keep = []
    for row in fp:
        b = BedLine(row)
        span = b.span
        total.append(span)
        if not minsize <= span <= maxsize:
            continue
        # The accn filter only applies when --minaccn is given and non-zero.
        if minaccn and int(b.accn) < minaccn:
            continue
        print(b, file=fw)
        keep.append(span)

    logging.debug("Stats: {0} features kept.".format(
        percentage(len(keep), len(total))))
    logging.debug("Stats: {0} bases kept.".format(
        percentage(sum(keep), sum(total))))
def gc(seqs):
    """Return percentage() of G+C bases over all A/C/G/T bases in `seqs`.

    Counting is case-insensitive; any character other than A, C, G or T is
    left out of both totals.
    """
    gc_bases = acgt_bases = 0
    for seq in seqs:
        upper = seq.upper()
        strong = upper.count('G') + upper.count('C')
        weak = upper.count('A') + upper.count('T')
        gc_bases += strong
        acgt_bases += strong + weak
    return percentage(gc_bases, acgt_bases, precision=0, mode=-1)
def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Plot an ASCII histogram of `data` binned by int(log(value, base)).

    bins is a dictionary with key: log(x, base), value: counts. When
    `summary` is true, a "Unique" line built from len(data) and sum(data) is
    written to stderr first. The `ascii` argument is accepted but not used
    in this function.
    """
    from jcvi.utils.cbook import percentage

    if summary:
        unique = len(data)
        total = sum(data)

        # Print out a distribution (print function; the `print >>` statement
        # form is Python 2 only)
        print("Unique: {0}".format(percentage(unique, total)), file=sys.stderr)

    bins = defaultdict(int)
    for d in data:
        # NOTE(review): int() truncates, so a floating-point log that lands
        # just below an integer falls into the lower bin -- confirm this is
        # acceptable for bases other than 2.
        logd = int(log(d, base))
        bins[logd] += 1

    x, y = [], []
    for size, number in sorted(bins.items()):
        # Each bin is labelled with the pair (base**size, base**(size + 1)).
        lb, ub = base ** size, base ** (size + 1)
        x.append((lb, ub))
        y.append(number)

    asciiplot(x, y, title=title)
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    # Output prefix/directory: the FASTA name up to its first dot.
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Keep only the first record for each longname. Both handles are
        # closed on exit, including when a line fails to parse.
        with open(newbedfile, "w") as newbed, open(trfbed) as fp:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register three targets: the lobSTR reference index, index.tab and
    # index.info.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        # A None record ends the stream. Check it before counting, so it is
        # not tallied as a read (it used to inflate nreads by one).
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        # print function; the `print >> fw` statement form is Python 2 only
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".format(
        percentage(nduplicates, nreads)))
def mismatches(args):
    """
    %prog mismatches blastfile

    Print out histogram of mismatches of HSPs, usually for evaluating SNP
    level.
    """
    from jcvi.utils.cbook import percentage
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(mismatches.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (blastfile,) = args

    # One value per best hit: mismatches plus gaps.
    blast = Blast(blastfile)
    data = [hit.nmismatch + hit.ngaps for _, hit in blast.iter_best_hit()]

    # Hits with at least one mismatch or gap.
    npolymorphic = sum(1 for value in data if value != 0)
    title = "Polymorphic sites: {0}".format(
        percentage(npolymorphic, len(data)))
    stem_leaf_plot(data, 0, 20, 20, title=title)
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    # Sample keys that already have a .seg file: the basename up to its
    # first dot.
    computed = set(op.basename(x).split(".")[0] for x in glob_s3(store))

    # Generate a bunch of cn commands
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    with open(samples) as fp:  # closed once all commands are emitted
        for row in fp:
            # Each row must have exactly two comma-separated fields.
            samplekey, path = row.strip().split(",")
            ntotal += 1
            if samplekey in computed:
                nskipped += 1
                continue
            print(" ".join((cmd, samplekey, path)))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # None ends the stream; test it first so it is not counted as a
        # read (it used to inflate nreads by one).
        if rec is None:
            break
        nreads += 1
        if rec.seq.endswith(sf):
            # print function; the `print >> fw` statement is Python 2 only
            print(rec, file=fw)
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".format(
        sf, percentage(nselected, nreads)))
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    computed = [op.basename(x).split(".")[0] for x in glob_s3(store)]
    computed = set(computed)  # set for constant-time membership tests

    # Generate a bunch of cn commands
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    # Context manager releases the samples file when the loop ends.
    with open(samples) as fp:
        for row in fp:
            samplekey, path = row.strip().split(",")
            ntotal += 1
            if samplekey in computed:
                # A result for this sample key already exists; skip it.
                nskipped += 1
                continue
            # print function; the bare `print x` statement is Python 2 only
            print(" ".join((cmd, samplekey, path)))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # Stop at the None end marker before counting, so the denominator
        # holds real reads only (it was previously one too high).
        if rec is None:
            break
        nreads += 1
        if not rec.seq.endswith(sf):
            continue
        # print function; the `print >> fw` statement is Python 2 only
        print(rec, file=fw)
        nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".format(
        sf, percentage(nselected, nreads)))
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int", help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    # Number of lines already written for each query.
    seen = defaultdict(int)
    retained = total = 0
    # The input handle is closed on exit; the output is closed in `finally`
    # so it is flushed even if a line fails to parse.
    with open(m4file) as fp:
        fw = must_open(opts.outfile, "w")
        try:
            for row in fp:
                r = M4Line(row)
                total += 1
                # Progress message every 100000 input lines.
                if total % 100000 == 0:
                    logging.debug("Retained {0} lines".format(
                        percentage(retained, total)))

                # Keep the first `best` lines for each query, in file order.
                if seen.get(r.query, 0) < best:
                    fw.write(row)
                    seen[r.query] += 1
                    retained += 1
        finally:
            fw.close()
def gc(seqs):
    """Return percentage() of G/C bases among the A, C, G and T bases of `seqs`.

    Sequences are upper-cased before counting, so case does not matter.
    """
    uppercased = [s.upper() for s in seqs]
    # G plus C occurrences over all sequences.
    n_gc = sum(s.count("G") + s.count("C") for s in uppercased)
    # Only A, C, G and T count towards the denominator.
    n_acgt = sum(s.count(base) for s in uppercased for base in "ACGT")
    return percentage(n_gc, n_acgt, precision=0, mode=-1)
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
                 help="Distance mode between paired reads. ss is outer distance, "
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    # sort() returns the path of the file that is read below.
    sortedbedfile = sort([bedfile])
    valid = total = 0
    with open(sortedbedfile) as fp:  # closed once all pairs are processed
        # Compare each pair of lines that pairwise() yields.
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            # Both features are given "+" orientation for the distance call.
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, oo = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:
                # print function; the bare `print dist` is Python 2 only
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".format(
        percentage(valid, total)))
def mismatches(args):
    """
    %prog mismatches blastfile

    Print out histogram of mismatches of HSPs, usually for evaluating SNP
    level.
    """
    from jcvi.utils.cbook import percentage
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(mismatches.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    # One value per best hit: mismatches plus gaps. (The unused `matches`
    # counter was removed.)
    data = []
    b = Blast(blastfile)
    for query, bline in b.iter_best_hit():
        mm = bline.nmismatch + bline.ngaps
        data.append(mm)

    # Best hits with at least one mismatch or gap.
    nonzeros = [x for x in data if x != 0]
    title = "Polymorphic sites: {0}".format(
        percentage(len(nonzeros), len(data)))
    stem_leaf_plot(data, 0, 20, 20, title=title)
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        # None ends the stream. Counting happens after this check so the
        # end marker is no longer tallied as a read.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            # Later occurrences of a read name are dropped.
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".format(
        percentage(nduplicates, nreads)))
def fix(args):
    """
    %prog fix bedfile > newbedfile

    Fix non-standard bed files. One typical problem is start > end.
    """
    p = OptionParser(fix.__doc__)
    p.add_option("--minspan", default=0, type="int",
                 help="Enforce minimum span [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    minspan = opts.minspan
    nfixed = nfiltered = ntotal = 0
    # Context manager closes the input; the output comes from must_open()
    # and is left open, as before.
    with open(bedfile) as fp:
        fw = must_open(opts.outfile, "w")
        for row in fp:
            atoms = row.strip().split("\t")
            assert len(atoms) >= 3, "Must be at least 3 columns"
            seqid, start, end = atoms[:3]
            start, end = int(start), int(end)
            if start > end:
                # Reversed coordinates: swap them and record a "-" strand.
                start, end = end, start
                orientation = '-'
                nfixed += 1
            else:
                orientation = '+'

            atoms[1:3] = [str(start), str(end)]
            # NOTE(review): index 6 is the seventh column -- confirm this is
            # the column meant to hold the orientation.
            if len(atoms) > 6:
                atoms[6] = orientation
            b = BedLine("\t".join(atoms))

            # Only lines meeting the minimum span are written and counted.
            if b.span >= minspan:
                # print function; the `print >> fw` statement is Python 2 only
                print(b, file=fw)
                nfiltered += 1

            ntotal += 1

    if nfixed:
        logging.debug("Total fixed: {0}".format(percentage(nfixed, ntotal)))
    if nfiltered:
        logging.debug("Total filtered: {0}".format(percentage(nfiltered, ntotal)))
def stats(args):
    """
    %prog stats blocksfile

    Provide statistics for MCscan-style blocks. The count of homologs in each
    pivot gene is recorded.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(stats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blocksfile, = args
    # counts maps "number of non-'.' columns after the first" to the number
    # of lines with that many.
    counts = defaultdict(int)
    total = orthologous = 0
    with open(blocksfile) as fp:  # closed once the tallies are built
        for row in fp:
            atoms = row.rstrip().split("\t")
            hits = [x for x in atoms[1:] if x != '.']
            counts[len(hits)] += 1
            total += 1
            # A line counts as orthologous when its second column is filled.
            if atoms[1] != '.':
                orthologous += 1

    print("Total lines: {0}".format(total), file=sys.stderr)
    for i, n in sorted(counts.items()):
        print("Count {0}: {1}".format(i, percentage(n, total)), file=sys.stderr)

    print(file=sys.stderr)

    # Second table: the same counts over lines with at least one match.
    matches = sum(n for i, n in counts.items() if i != 0)
    print("Total lines with matches: {0}".format(
        percentage(matches, total)), file=sys.stderr)
    for i, n in sorted(counts.items()):
        if i == 0:
            continue

        print("Count {0}: {1}".format(i, percentage(n, matches)), file=sys.stderr)

    print(file=sys.stderr)
    print("Orthologous matches: {0}".format(
        percentage(orthologous, matches)), file=sys.stderr)
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fillfile, = args
    scaffolds = 0
    gaps = []
    # ">" lines are counted as scaffolds; all other lines are gap records.
    with open(fillfile) as fp:
        for row in fp:
            if row[0] == ">":
                scaffolds += 1
                continue
            fl = FillLine(row)
            gaps.append(fl)

    # Reporting uses the print function; the `print >> sys.stderr`
    # statement form is Python 2 only.
    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    # The unused `totalbp` local was removed.
    totalgaps = len(closed) + len(notClosed)

    print("Closed gaps: {0} size: {1} bp".format(
        percentage(len(closed), totalgaps), thousands(closedbp)),
        file=sys.stderr)
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print("Remaining gaps: {0} size: {1} bp".format(
        percentage(len(notClosed), totalgaps), thousands(notClosedbp)),
        file=sys.stderr)
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
def stats(args):
    """
    %prog stats blocksfile

    Provide statistics for MCscan-style blocks. The count of homologs in each
    pivot gene is recorded.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(stats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blocksfile, = args
    counts = defaultdict(int)
    total = orthologous = 0
    # Tally, for each tab-separated line, how many columns after the first
    # are not ".".
    with open(blocksfile) as fp:
        for row in fp:
            atoms = row.rstrip().split("\t")
            hits = [x for x in atoms[1:] if x != '.']
            counts[len(hits)] += 1
            total += 1
            if atoms[1] != '.':
                orthologous += 1

    # print function throughout; the `print >> sys.stderr` statement form is
    # Python 2 only. A bare print(file=...) emits the blank separator line.
    print("Total lines: {0}".format(total), file=sys.stderr)
    for i, n in sorted(counts.items()):
        print("Count {0}: {1}".format(i, percentage(n, total)),
              file=sys.stderr)

    print(file=sys.stderr)

    # Lines with at least one non-"." hit.
    matches = sum(n for i, n in counts.items() if i != 0)
    print("Total lines with matches: {0}".format(
        percentage(matches, total)), file=sys.stderr)
    for i, n in sorted(counts.items()):
        if i == 0:
            continue

        print("Count {0}: {1}".format(i, percentage(n, matches)),
              file=sys.stderr)

    print(file=sys.stderr)
    print("Orthologous matches: {0}".format(
        percentage(orthologous, matches)), file=sys.stderr)
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Write the first record seen for each longname. The context
        # managers close both files even if parsing fails part-way.
        with open(newbedfile, "w") as newbed, open(trfbed) as fp:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                # print function; the `print >> newbed` statement is
                # Python 2 only
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Each mm.add() call registers (dependencies, target, command).
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Both files are closed on exit, including on a parse error.
        with open(newbedfile, "w") as newbed, open(trfbed) as fp:
            for row in fp:
                s = STRLine(row)
                total += 1
                # One input line can yield several records; invalid ones
                # are skipped.
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # print function; the `print >> newbed` statement is
                    # Python 2 only
                    print(ns, file=newbed)
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # index.info is copied from the original bed file, not the rewritten one.
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # Exons with identical coordinates are collapsed through the set, so
        # each distinct (chrom, start, stop) is counted once per gene.
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are the intervals range_interleave() returns for the exons.
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (feature type, statistic name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum, precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # print function; the `print >> sys.stderr` statement is Python 2 only
    print(tabulate(r), file=sys.stderr)
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50  # percent of a sequence that must be masked to flag it
    for key, seq in f.iteritems():
        # Anything other than upper-case A/G/C/T counts as masked.
        masked = sum(1 for base in seq if base not in "AGCT")
        seqlen = len(seq)
        # Skip the ratio for zero-length sequences, which used to raise
        # ZeroDivisionError; an empty sequence is never "half masked".
        if seqlen and masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)),
        file=sys.stderr,
    )
def filter(args):
    """
    %prog filter bedfile

    Filter the bedfile to retain records between certain size range.
    """
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=0, type="int",
                 help="Minimum feature length")
    # This option is the upper bound; its help used to read "Minimum".
    p.add_option("--maxsize", default=1000000000, type="int",
                 help="Maximum feature length")
    p.add_option("--minaccn", type="int",
                 help="Minimum value of accn, useful to filter based on coverage")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    fp = must_open(bedfile)
    fw = must_open(opts.outfile, "w")
    minsize, maxsize = opts.minsize, opts.maxsize
    minaccn = opts.minaccn
    total = []  # span of every input record
    keep = []   # span of every record written out
    for row in fp:
        b = BedLine(row)
        span = b.span
        total.append(span)
        if not minsize <= span <= maxsize:
            continue
        # accn is compared as an integer, and only when --minaccn was given.
        if minaccn and int(b.accn) < minaccn:
            continue
        # print function; the `print >> fw` statement is Python 2 only
        print(b, file=fw)
        keep.append(span)

    logging.debug("Stats: {0} features kept.".format(
        percentage(len(keep), len(total))))
    logging.debug("Stats: {0} bases kept.".format(
        percentage(sum(keep), sum(total))))
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    # Names already present in the output store; those samples are skipped.
    computed = ls_s3(store)
    fp = open(samplesfile)
    skipped = total = 0
    # Leading words shared by every emitted command.
    prefix = "python -m jcvi.variation.str lobstr".split()
    for line in fp:
        total += 1
        fields = line.strip().split(",")
        sample, s3file = fields[:2]
        # The sample name must split into exactly two parts on "_".
        exec_id, sample_id = sample.split("_")
        bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

        gzfile = sample + ".{0}.vcf.gz".format("hg38")
        if gzfile in computed:
            skipped += 1
            continue

        arguments = [
            "hg38",
            "--input_bam_path",
            bamfile,
            "--output_path",
            store,
            "--sample_id",
            sample_id,
            "--workflow_execution_id",
            exec_id,
            "--lobstr_home",
            opts.lobstr_home,
            "--workdir",
            opts.workdir,
        ]
        print(opts.sep.join(prefix + arguments))
    fp.close()
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
def header(self):
    """Return a multi-line text summary of this object's read and library fields.

    The result has six lines and no trailing newline.
    """
    from jcvi.utils.cbook import percentage

    # The paired-read count is npairs * 2 reported against nreads.
    paired = percentage(self.npairs * 2, self.nreads)
    lines = [
        "Number of paired reads: {0}".format(paired),
        "Libraries: {0}".format(", ".join(self.libnames)),
        "LibraryStats: {0}".format(self.libstats),
        "r1: {0}".format(self.r1),
        "r2: {0}".format(self.r2),
        "libs: {0}".format(self.libs),
    ]
    return "\n".join(lines)
def export_table(self, r, mapname, total):
    """Store this object's summary values in `r` under (row label, mapname) keys.

    `total` is the denominator passed to percentage() for "Total bases".
    `r` is modified in place and nothing is returned.
    """
    def put(label, value):
        # Every entry shares the same second key component.
        r[label, mapname] = value

    put("Markers (unique)", self.num_markers)
    # Marker density is reported as 0 when total_bases is falsy, which also
    # avoids dividing by zero.
    if self.total_bases:
        put("Markers per Mb", self.num_markers * 1e6 / self.total_bases)
    else:
        put("Markers per Mb", 0)
    put("Scaffolds", self.num_scaffolds)
    put("N50 Scaffolds", self.num_n50_scaffolds)
    put("Total bases", percentage(self.total_bases, total, mode=1))
    put("Scaffolds with 1 marker", self.scaffold_1m)
    put("Scaffolds with 2 markers", self.scaffold_2m)
    put("Scaffolds with 3 markers", self.scaffold_3m)
    put("Scaffolds with >=4 markers", self.scaffold_4m)
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Records of every input file are scanned; non-singleton clusters with at
    # least --minsize reads are written to the output with a per-file prefix.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int", help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)  # per-file prefix prepended to record ids
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            _name, w, size, _seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the first word of the description before writing.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug(
            "Scanned {0} clusters with {1} reads ..".format(nclusters, nreads)
        )
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file holding only singletons (or nothing) has zero clusters;
        # report an average of 0 instead of raising ZeroDivisionError.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".format(
                cclusters, minsize, creads, avg, pf
            )
        )
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug(
        "Total assembled: {0}".format(percentage(totalassembled, totalreads))
    )
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # For each sample not yet present in the output store, one lobSTR command
    # line (joined by --sep) is printed to stdout.
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    computed = ls_s3(store)
    base_cmd = "python -m jcvi.variation.str lobstr".split()

    skipped = total = 0
    # Context manager closes the samples file even when a row fails to parse.
    with open(samplesfile) as fp:
        for row in fp:
            fields = row.strip()
            if not fields:
                # Skip blank lines; they cannot be unpacked into two fields.
                continue
            total += 1
            sample, s3file = fields.split(",")[:2]
            exec_id, sample_id = sample.split("_")
            bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

            gzfile = sample + ".{0}.vcf.gz".format("hg38")
            if gzfile in computed:
                skipped += 1
                continue

            cmd = base_cmd + [
                "hg38",
                "--input_bam_path",
                bamfile,
                "--output_path",
                store,
                "--sample_id",
                sample_id,
                "--workflow_execution_id",
                exec_id,
                "--lobstr_home",
                opts.lobstr_home,
                "--workdir",
                opts.workdir,
            ]
            # print() with a single argument is valid on Python 2 and 3 alike.
            print(opts.sep.join(cmd))

    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # A base counts as masked when it is not one of the upper-case letters
    # A, G, C, T. Both totals are reported on stderr.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50  # percent of masked bases above which a sequence is flagged
    for key, seq in f.iteritems():
        masked = sum(1 for base in seq if base not in "AGCT")
        seqlen = len(seq)
        # Zero-length records cannot exceed the cutoff and must not divide by 0.
        if seqlen and masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)
        ),
        file=sys.stderr,
    )
def validate(args):
    """
    %prog validate imputed.vcf withheld.vcf

    Validate imputation against withheld variants.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Genotypes from the withheld file are indexed by (seqid, pos); each site
    # of the imputed file is compared once, and mismatches go to stderr.
    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    imputed, withheld = args
    register = {}
    # `with` closes the withheld file once the truth table is loaded.
    with open(withheld) as fp:
        for row in fp:
            if row[0] == "#":
                continue
            v = VcfLine(row)
            register[(v.seqid, v.pos)] = v.genotype

    logging.debug("Imported {0} records from `{1}`".format(len(register), withheld))

    fp = must_open(imputed)
    hit = concordant = 0
    seen = set()
    for row in fp:
        if row[0] == "#":
            continue
        v = VcfLine(row)
        site = (v.seqid, v.pos)
        genotype = v.genotype
        # Only the first record seen at each site is evaluated.
        if site in seen:
            continue
        seen.add(site)
        if site not in register:
            continue
        truth = register[site]
        # Use a separate name so the `imputed` filename is not overwritten.
        call = genotype.split(":")[0]
        if "|" in call:
            # Phased call: sort the alleles and re-join with "/" before comparing.
            call = "/".join(sorted(call.split("|")))
        # probs = [float(x) for x in genotype.split(":")[-1].split(",")]
        # call = max(zip(probs, ["0/0", "0/1", "1/1"]))[-1]
        hit += 1
        if truth == call:
            concordant += 1
        else:
            print(row.strip(), "truth={0}".format(truth), file=sys.stderr)

    logging.debug("Total concordant: {0}".format(percentage(concordant, hit)))
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Singletons are counted but never written; other records are written
    # when their cluster size reaches --minsize.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int", help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)  # prefix that tags records with their file
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            _name, w, size, _seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug(
            "Scanned {0} clusters with {1} reads ..".format(nclusters, nreads)
        )
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # Guard the average: an input with only singletons has no clusters
        # and the plain division would raise ZeroDivisionError.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".format(
                cclusters, minsize, creads, avg, pf
            )
        )
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug(
        "Total assembled: {0}".format(percentage(totalassembled, totalreads))
    )
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull one interval out of the reference.
        return s.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Collect the sequences of every gene, its unique exons and the
    # intervals interleaved between those exons.
    geneseqs, exonseqs, intronseqs = [], [], []
    for gene in g.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))
        exons = list(
            set(
                (c.chrom, c.start, c.stop)
                for c in g.children(gene.id, 2)
                if c.featuretype == "exon"
            )
        )
        exonseqs.extend(fetch(chrom, start, stop) for chrom, start, stop in exons)
        introns = range_interleave(exons)
        intronseqs.extend(
            fetch(chrom, start, stop) for chrom, start, stop in introns
        )

    # Report: one column per feature type, keyed by (type, statistic).
    r = {}
    categories = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for t, tseqs in categories:
        tsummary = SummaryStats([len(x) for x in tseqs], dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(
            tsummary.sum, precision=0, target="Mb"
        )
        r[t, "% of genome"] = percentage(
            tsummary.sum, s.totalsize, precision=0, mode=-1
        )
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # For every non-header VCF row the base at `pos` in the genome is compared
    # with REF and, failing that, ALT; both match rates are logged.
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    # Show usage on a wrong argument count instead of a bare unpack error.
    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row[0] == '#':
            continue
        seqid, pos, _id, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        # Sites on sequences absent from the genome still count in `total`.
        if seqid not in genome:
            continue
        true_ref = genome[seqid][pos - 1]  # `pos` is used as 1-based here
        if total % 100000 == 0:
            print(total, "sites parsed", file=sys.stderr)
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Each VCF site is checked against the genome base at that position; the
    # share of sites matching REF and matching ALT is logged at the end.
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    # Same guard as the sibling commands: print help when the two positional
    # arguments are not supplied, rather than failing on tuple unpacking.
    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row[0] == "#":
            continue
        # `_id` avoids shadowing the builtin `id`; the column is unused.
        seqid, pos, _id, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        if seqid not in genome:
            continue
        true_ref = genome[seqid][pos - 1]
        if total % 100000 == 0:
            print(total, "sites parsed", file=sys.stderr)
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
def query_links(abed, bbed):
    """Report on stderr how many links of `abed` also occur in `bbed`.

    Links of `bbed` are considered both as given and in reversed form
    (endpoints swapped, orientations flipped). The links of `abed` that
    were not found are printed last.
    """
    flip = {"+": "-", "?": "-", "-": "+"}
    abedlinks = abed.links
    bbedlinks = bbed.links

    # Reverse complement bbedlinks
    bxbedlinks = bbedlinks[:]
    for (a, ai), (b, bi) in bbedlinks:
        ai_rc = flip[ai]
        bi_rc = flip[bi]
        bxbedlinks.append(((b, bi_rc), (a, ai_rc)))

    atotal = len(abedlinks)
    print("Total links in {0}: {1}".format(abed.filename, atotal), file=sys.stderr)

    aset = set(abedlinks)
    bxset = set(bxbedlinks)
    recovered = aset & bxset
    print("Recovered {0}".format(percentage(len(recovered), atotal)), file=sys.stderr)
    print(aset - bxset, file=sys.stderr)
def mitocompile(args):
    """
    %prog mitocompile *.vcf.gz

    Extract information about deletions in vcf file.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Prints a tab-separated table (header first) with one row per VCF record.
    # Depth is read from the `.depth` file named after each VCF.
    from jcvi.formats.vcf import VcfLine

    p = OptionParser(mitocompile.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    vcfs = args
    print("\t".join("vcf samplekey depth seqid pos alt svlen pe sr".split()))
    for i, vcf in enumerate(vcfs):
        if (i + 1) % 100 == 0:
            logging.debug("Process `{}` [{}]".format(vcf, percentage(i + 1, len(vcfs))))
        depthfile = vcf.replace(".sv.vcf.gz", ".depth")
        fp = must_open(depthfile)
        # next(fp) works on Python 2 and 3; file objects lost .next() in 3.
        _chrm, depth = next(fp).split()
        depth = int(float(depth))
        samplekey = op.basename(vcf).split("_")[0]

        fp = must_open(vcf)
        for row in fp:
            if row[0] == "#":
                continue
            v = VcfLine(row)
            # Split the INFO column by hand. The query-string parser used
            # before stopped treating ';' as a separator in Python 3.10+,
            # which left PE/SR missing from the dict.
            # NOTE(review): assumes v.info is the raw ';'-separated INFO string.
            info = dict(kv.split("=", 1) for kv in v.info.split(";") if "=" in kv)
            print(
                "\t".join(
                    str(x)
                    for x in (
                        vcf,
                        samplekey,
                        depth,
                        v.seqid,
                        v.pos,
                        v.alt,
                        info.get("SVLEN"),
                        info["PE"],
                        info["SR"],
                    )
                )
            )
def query_links(abed, bbed):
    """Report on stderr how many links of `abed` are recovered in `bbed`.

    `abed` and `bbed` must expose `.links`, a list of
    `((a, orientation), (b, orientation))` pairs; `abed.filename` is used in
    the message. A link counts as recovered when it appears in `bbed` as
    given or reversed (endpoints swapped, orientations flipped, with "?"
    mapped to "-"). The unrecovered links of `abed` are printed last.
    """
    flip = {"+": "-", "?": "-", "-": "+"}
    abedlinks = abed.links
    bbedlinks = bbed.links

    # Reverse complement bbedlinks
    bxbedlinks = bbedlinks[:]
    for (a, ai), (b, bi) in bbedlinks:
        ai = flip[ai]
        bi = flip[bi]
        bxbedlinks.append(((b, bi), (a, ai)))

    atotal = len(abedlinks)
    # print() with file= replaces the Python 2 only `print >> sys.stderr` form.
    print("Total links in {0}: {1}".format(abed.filename, atotal), file=sys.stderr)

    # Build each set once; both the intersection and the difference use them.
    aset = set(abedlinks)
    bxset = set(bxbedlinks)
    recovered = aset & bxset
    print("Recovered {0}".format(percentage(len(recovered), atotal)), file=sys.stderr)
    print(aset - bxset, file=sys.stderr)
def range_depth(ranges, size, verbose=True):
    """
    Overlay ranges on [start, end], and summarize the ploidy of the intervals.

    `ranges` is an iterable of (a, b) pairs and `size` is the total span
    length. Returns `(depthstore, depthdetails)`: `depthstore` maps a depth
    to the number of positions at that depth (its values sum to `size`), and
    `depthdetails` is a list of `(start, end, depth)` segments covering
    0..size. With `verbose`, the share of each depth is printed to stderr.
    Endpoints outside 0..size-1 trigger an AssertionError.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.utils.cbook import percentage

    # Make endpoints
    endpoints = []
    for a, b in ranges:
        endpoints.append((a, LEFT))
        endpoints.append((b, RIGHT))
    endpoints.sort()
    vstart, vend = min(endpoints)[0], max(endpoints)[0]

    assert 0 <= vstart < size
    assert 0 <= vend < size

    depth = 0
    depthstore = defaultdict(int)
    # Leading stretch before the first endpoint is at depth 0.
    depthstore[depth] += vstart
    depthdetails = [(0, vstart, depth)]

    # Walk consecutive endpoints; the segment between them carries the depth
    # reached after processing the left-hand endpoint.
    for (a, atag), (b, btag) in pairwise(endpoints):
        if atag == LEFT:
            depth += 1
        elif atag == RIGHT:
            depth -= 1
        depthstore[depth] += b - a
        depthdetails.append((a, b, depth))

    # The final endpoint must close the last open range.
    assert btag == RIGHT
    depth -= 1

    assert depth == 0
    depthstore[depth] += size - vend
    depthdetails.append((vend, size, depth))

    assert sum(depthstore.values()) == size
    if verbose:
        for level, count in sorted(depthstore.items()):
            # print() with file= replaces the Python 2 only `print >>` form.
            print(
                "Depth {0}: {1}".format(level, percentage(count, size)),
                file=sys.stderr,
            )

    return depthstore, depthdetails
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Features whose `accn` is in the ids file are written to the output;
    # with -v the selection is inverted.
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v", dest="inverse", default=False, action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = opts.inverse
    ostrip = opts.strip_names
    fw = must_open(opts.outfile, "w")

    ids = SetFile(idsfile)
    if ostrip:
        ids = set(gene_name(x) for x in ids)

    bed = Bed(bedfile)
    ntotal = nkeep = 0
    for b in bed:
        ntotal += 1
        keep = b.accn in ids
        if inverse:
            keep = not keep

        if keep:
            nkeep += 1
            # print() with file= replaces the Python 2 only `print >> fw` form.
            print(b, file=fw)

    fw.close()
    logging.debug("Stats: {0} features kept.".format(percentage(nkeep, ntotal)))
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    # The docstring above is handed to OptionParser, so implementation notes
    # are kept in comments. `args` is the list of command-line tokens.
    # Writes `<idsfile>.bed` with the (optionally padded) expected location of
    # each id, then counts via intersectBed how many do not overlap `gapsbed`.
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist", default=0, type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"
    total = 0
    # Both handles are closed on exit, including when a lookup fails midway.
    with open(idsfile) as fp, open(bedfile, "w") as fw:
        for row in fp:
            gene_id = row.strip()
            hit = d[gene_id]
            _tag, pos = get_tag(hit, None)
            seqid, start, end = pos
            # Pad by --bdist on both sides, never going below position 1.
            start, end = max(start - bdist, 1), end + bdist
            # The start column is written as start - 1.
            print("\t".join(str(x) for x in (seqid, start - 1, end, gene_id)), file=fw)
            total += 1

    # -v keeps the entries of -a that have no overlap in -b.
    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = int(popen(cmd).read())
    in_gaps = total - not_in_gaps
    print("Ids in gaps: {0}".format(percentage(in_gaps, total)), file=sys.stderr)