def main():
    # Command-line entry point. Expects three positional arguments
    # (chromosome name, sizes file, data directory), draws one Coverage
    # panel spanning the whole chromosome and saves it as `<chr>.<format>`.
    # The usage text is the module-level docstring.
    p = OptionParser(__doc__)
    p.add_option("--order",
                 help="The order to plot the tracks, comma-separated")
    opts, args, iopts = p.set_image_options()

    if len(args) != 3:
        sys.exit(not p.print_help())

    seqid, sizesfile, datadir = args
    track_order = opts.order
    # NOTE(review): --hlsuffix is not registered in this function; it is
    # presumably added by set_image_options() -- confirm.
    hlsuffix = opts.hlsuffix
    if track_order:
        track_order = track_order.split(",")
    sizes = Sizes(sizesfile)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    canvas = (.12, .35, .8, .35)
    seq_extent = (0, sizes.get_size(seqid))
    # Constructed for its effect on `fig`/`root`; the object itself is not
    # used afterwards.
    Coverage(fig, root, canvas, seqid, seq_extent, datadir,
             order=track_order, hlsuffix=hlsuffix)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(seqid + "." + iopts.format, dpi=iopts.dpi, iopts=iopts)
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # One AGP object is written per `*.ordering` file found under the
    # results directory; every contig of the FASTA that no ordering file
    # mentions is then appended as a singleton with unknown ("?") strand.
    p = OptionParser(agp.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    all_contigs = set(sizes.keys())

    anchored = set()
    for ofile in orderingfiles:
        ordering = ContigOrdering(ofile)
        anchored.update(rec.contig_name for rec in ordering)
        # Object name is the ordering file's basename up to the first dot
        object_name = op.basename(ofile).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    singletons = all_contigs - anchored
    logging.debug('Anchored: {}, Singletons: {}'.
                  format(len(anchored), len(singletons)))

    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    # The bin file is loaded as one long bit array. It is consumed in the
    # order of the sequences in the FASTA: each sequence owns the next
    # (seqlen - K + 1) bits, and the number of set bits in that slice is
    # reported as a tab-separated `name<TAB>count` line.
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    a = bitarray()
    # bitarray.fromfile() reads raw bytes, so the file must be opened in
    # binary mode; the context manager also guarantees the handle is closed.
    with open(binfile, "rb") as fp:
        a.fromfile(fp)

    f = Sizes(fastafile)
    tsize = 0  # running bit offset into `a`
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        # NOTE(review): ksize goes non-positive for sequences shorter than
        # K; whether the bin file skips such sequences is not visible here.
        ksize = seqlen - K + 1
        b = a[tsize: tsize + ksize]
        bcount = b.count()
        # fw.write() instead of the Python 2-only `print >> fw` statement
        fw.write("\t".join(str(x) for x in (name, bcount)) + "\n")
        tsize += ksize
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    # For every sequence in the FASTA two BED intervals are written to
    # `<prefix>.ext.bed`: the first `flank` bases (name suffix "L") and the
    # last `flank` bases (suffix "R"), both clipped to the sequence length.
    # The intervals are then extracted with fastaFromBed().
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta,) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    # `with` closes the BED file even if iteration fails; fw.write()
    # replaces the Python 2-only `print >> fw` statement.
    with open(extbed, "w") as fw:
        for bac, size in sizes.iter_sizes():
            left = (bac, 0, min(flank, size), bac + "L")
            right = (bac, max(size - flank, 0), size, bac + "R")
            for interval in (left, right):
                fw.write("\t".join(str(x) for x in interval) + "\n")

    fastaFromBed(extbed, goodfasta, name=True)
def longest(args):
    """
    %prog longest bedfile fastafile

    Select longest feature within overlapping piles.
    """
    # Features are merged into piles with mergeBed(); each pile's name is a
    # ";"-joined list of accessions. Per pile, the longest accession that is
    # shorter than --maxsize and at least --minsize is kept. Survivors are
    # passed through remove_isoforms(), written to `<prefix>.longest.ids`
    # and used to subset the BED into `<prefix>.longest.bed`.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--maxsize", default=20000, type="int",
                 help="Limit max size")
    p.add_option("--minsize", default=60, type="int",
                 help="Limit min size")
    p.add_option("--precedence", default="Medtr",
                 help="Accessions with prefix take precedence")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    maxsize = opts.maxsize
    minsize = opts.minsize
    prec = opts.precedence
    mergedbed = mergeBed(bedfile, nms=True)
    sizes = Sizes(fastafile).mapping
    bed = Bed(mergedbed)

    pf = bedfile.rsplit(".", 1)[0]
    ids = set()
    for b in bed:
        accns = b.accn.split(";")
        # Accessions carrying the precedence prefix shadow all others
        prec_accns = [x for x in accns if x.startswith(prec)]
        if prec_accns:
            accns = prec_accns
        # Accessions missing from the FASTA count as size 0
        accn_sizes = [(sizes.get(x, 0), x) for x in accns]
        accn_sizes = [(size, x) for size, x in accn_sizes if size < maxsize]
        if not accn_sizes:
            continue
        max_size, max_accn = max(accn_sizes)
        if max_size < minsize:
            continue
        ids.add(max_accn)

    newids = remove_isoforms(ids)
    logging.debug("Remove isoforms: before={0} after={1}".
                  format(len(ids), len(newids)))

    longestidsfile = pf + ".longest.ids"
    # `with` + fw.write() replaces open/`print >> fw`/close: the handle is
    # closed on error and the code no longer needs the Python 2 print
    # statement.
    with open(longestidsfile, "w") as fw:
        fw.write("\n".join(newids) + "\n")
    logging.debug("A total of {0} records written to `{1}`.".
                  format(len(newids), longestidsfile))

    longestbedfile = pf + ".longest.bed"
    some([bedfile, longestidsfile, "--outfile={0}".format(longestbedfile),
          "--no_strip_names"])
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # Arguments are the scaffold FASTA followed by one or more
    # (blast, sizes, bed) trios. One image named `<scaffoldID>.<format>` is
    # produced for every scaffold at least --cutoff long.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One leading FASTA plus a whole number of trios
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Single-record sizes file for this scaffold; Sizes.close(clean=True)
        # is called right after loading it.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Previously `subhighlights` was only bound when --highlights was
        # given, so running without it raised UnboundLocalError here.
        # NOTE(review): None is assumed to mean "no highlights" for
        # plot_one_scaffold -- confirm against its signature.
        subhighlights = None
        if hlbed is not None:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def __init__(self, filename, fastafile):
    # Initialise the parent from `filename`, then cache the sequence sizes
    # of `fastafile` both as a positional list and as the Sizes mapping.
    super(EvidenceFile, self).__init__(filename)
    sz = Sizes(fastafile)
    # Slot 0 is a placeholder because the tig-list starts at 1
    self.sizes = [None] + [(name, size) for name, size in sz.iter_sizes()]
    self.sz = sz.mapping
    self.scf = {}
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """
    Write an AGP record for every scaffold that `agpfile` does not use.

    `scaffolds` is loaded through Sizes; each name that never appears as a
    component_id in `agpfile` is emitted, in sorted order, as its own object
    with unknown ("?") orientation into `unplaced_agp`.
    """
    placed = set(rec.component_id for rec in AGP(agpfile))
    sizes = Sizes(scaffolds).mapping
    fwagp = must_open(unplaced_agp, "w")

    unplaced = [name for name in sorted(sizes.keys()) if name not in placed]
    for name in unplaced:
        order_to_agp(name, [(name, "?")], sizes, fwagp)

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def graph_to_agp(g, blastfile, subjectfasta, exclude=None, verbose=False):
    """
    Convert the paths of graph `g` into an AGP file `<blastfile>.agp`.

    Every path with more than one node becomes an object named
    `pmol_NNNN`. Sequences of `subjectfasta` that are in no such path and
    not listed in `exclude` are written as singleton objects in "+"
    orientation. The graph is also dumped to `graph.txt`.

    `exclude` is an optional iterable of sequence names to leave out of the
    singleton records; `verbose` prints each retained path to stdout.
    """
    from jcvi.formats.agp import order_to_agp

    # None replaces the former mutable `[]` default; a set gives constant
    # time membership tests in the singleton loop below.
    excluded = set(exclude) if exclude else set()

    logging.debug(str(g))
    g.write("graph.txt")

    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if verbose:
            # Parenthesised single-argument print works on Python 2 and 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    mapping = sizes.mapping
    scaffolded = set()
    nsingletons = nscaffolded = nexcluded = 0

    # Context manager closes the AGP file even if order_to_agp() raises
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            object_name = "pmol_{0:04d}".format(i)
            order_to_agp(object_name, ctgorder, mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                nscaffolded += 1
                continue
            if ctg in excluded:
                nexcluded += 1
                continue
            order_to_agp(ctg, [(ctg, "+")], mapping, fwagp)
            nsingletons += 1

    logging.debug("scaffolded={} excluded={} singletons={}".
                  format(nscaffolded, nexcluded, nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # Contigs longer than --maxsize or deeper than --maxcov are dropped;
    # contigs absent from `covfile` are plotted at coverage 0. The joint
    # plot is saved as `<covfile>.pdf`.
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=1000000, type="int",
                 help="Max contig size")
    # Help text used to be a copy-paste of the --maxsize description
    p.add_option("--maxcov", default=100, type="int",
                 help="Max contig coverage")
    p.add_option("--color", default='m', help="Color of the data points")
    p.add_option("--kind", default="scatter",
                 choices=("scatter", "reg", "resid", "kde", "hex"),
                 help="Kind of plot to draw")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    maxsize, maxcov = opts.maxsize, opts.maxcov

    data = []
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize or c > maxcov:
            continue
        data.append((size, c))

    # zip(*[]) cannot be unpacked into two names; report instead of crashing
    if not data:
        logging.error("No contigs left after --maxsize/--maxcov filtering")
        return

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(xlab, ylab, kind=opts.kind, data=df,
                  xlim=(0, maxsize), ylim=(0, maxcov),
                  stat_func=None, edgecolor="w", color=opts.color)

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # The reference and query FASTA paths are taken from the first line of
    # the delta file. The delta file is filtered by --pctid/--hitlen before
    # plotting. With --all, one plot is made per reference and renamed to
    # `<ref>.pdf`; otherwise all selected references share one plot.
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (deltafile,) = args
    # Read the header through a context manager so the handle is released
    # (the former `open(...).readline()` left it to the garbage collector).
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()

    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    # `filter` here is whatever the enclosing module binds to that name
    # (it is called with an argv-style list), not the builtin.
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
def bed(args):
    """
    %prog bed binfile fastafile

    Write bed files where the bases have at least certain depth.
    """
    # For every contig, the slice of the depth array belonging to it is
    # scanned for maximal runs of positions whose depth is >= --cutoff.
    # Each run becomes one BED line (0-based start, end, "na", integer mean
    # depth of the run).
    p = OptionParser(bed.__doc__)
    p.add_option("-o", dest="output", default="stdout",
                 help="Output file name [default: %default]")
    p.add_option("--cutoff", dest="cutoff", default=10, type="int",
                 help="Minimum read depth to report intervals [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    binfile, fastafile = args
    fw = must_open(opts.output, "w")
    cutoff = opts.cutoff
    assert cutoff >= 0, "Need non-negative cutoff"

    b = BinFile(binfile)
    ar = b.array

    # Only the per-contig offsets are needed from get_offsets()
    _, _, offsets = get_offsets(fastafile)
    s = Sizes(fastafile)

    def deep_enough(item):
        # `item` is an (index, depth) pair produced by enumerate()
        return item[1] >= cutoff

    for ctg, ctglen in s.iter_sizes():
        offset = offsets[ctg]
        subarray = ar[offset:offset + ctglen]
        for passed, run in groupby(enumerate(subarray), key=deep_enough):
            run = list(run)
            if not passed:
                continue

            # enumerate() indices are 0-based: the first index is already
            # the BED start, the last index + 1 is the (exclusive) BED end.
            start = run[0][0]
            end = run[-1][0] + 1

            mean_depth = int(sum([x[1] for x in run]) / len(run))
            name = "na"

            # fw.write() instead of the Python 2-only `print >> fw`
            fw.write("\t".join(str(x) for x in
                               (ctg, start, end, name, mean_depth)) + "\n")
def posmap(args):
    """
    %prog posmap frgscf.sorted scf.fasta scfID

    Perform QC on the selected scfID, generate multiple BED files for plotting.
    """
    # Each intermediate file is only regenerated when it does not exist yet:
    # `<scf>.fasta`, `<scf>.gaps.bed`, `<scf>.posmap`, `<scf>.bed`,
    # `<scf>.bedpe` / `<scf>.pairs.bed`. Finally coverage is computed for
    # the reads and for the pairs.
    p = OptionParser(posmap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # `not` makes the usage error exit with a non-zero status, like
        # every other command here (print_help() returns None, which
        # sys.exit() would otherwise treat as success).
        sys.exit(not p.print_help())

    frgscffile, fastafile, scf = args

    # fasta
    cmd = "faOneRecord {0} {1}".format(fastafile, scf)
    scffastafile = scf + ".fasta"
    if not op.exists(scffastafile):
        sh(cmd, outfile=scffastafile)

    # sizes
    sizesfile = scffastafile + ".sizes"
    sizes = Sizes(scffastafile).mapping
    scfsize = sizes[scf]
    logging.debug("`{0}` has length of {1}.".format(scf, scfsize))

    # gaps.bed
    gapsbedfile = scf + ".gaps.bed"
    if not op.exists(gapsbedfile):
        gaps([scffastafile, "--bed", "--mingap=100"])

    # reads frgscf posmap
    posmapfile = scf + ".posmap"
    if not op.exists(posmapfile):
        query([frgscffile, scf])

    # reads bed
    bedfile = scf + ".bed"
    if not op.exists(bedfile):
        bed([posmapfile])

    # reads bedpe
    bedpefile = scf + ".bedpe"
    pairsbedfile = scf + ".pairs.bed"
    if not (op.exists(bedpefile) and op.exists(pairsbedfile)):
        bed_to_bedpe(bedfile, bedpefile, pairsbedfile=pairsbedfile, ca=True)

    # base coverage -- the objects are built for their side effects only
    Coverage(bedfile, sizesfile)
    Coverage(pairsbedfile, sizesfile)
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # The delta file is filtered by --pctid/--hitlen first. With --all each
    # reference id is plotted on its own and the resulting file is renamed
    # to `<ref>.pdf`; otherwise all references go into a single plot.
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov

    filter_args = [
        deltafile,
        "--pctid={0}".format(opts.pctid),
        "--hitlen={0}".format(opts.hitlen),
    ]
    deltafile = filter(filter_args)

    if not opts.all:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
        return

    for ref in refs:
        pdffile = plot_some_queries([ref], qsizes, rsizes, deltafile, refcov)
        if pdffile:
            sh("mv {0} {1}.pdf".format(pdffile, ref))
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    # bedgraph/bigwig formats return the name of the produced file. The
    # "coverage" format instead writes `seqid<TAB>coverage` lines (one
    # decimal) to the chosen outfile and returns nothing.
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format",
        default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    outformat = opts.format
    if not opts.nosort:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])
    else:
        logging.debug("BAM sorting skipped")

    # Prefix drops the last two dot-separated suffixes of the BAM name
    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)

    if outformat in ("bedgraph", "bigwig"):
        bedgraphfile = pf + ".bedgraph"
        sh(cmd + " -bg", outfile=bedgraphfile)

        if outformat == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        sh("bedGraphToBigWig {0} {1} {2}".format(bedgraphfile, sizesfile,
                                                 bigwigfile))
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        fields = (seqid, "{0:.1f}".format(cov))
        print("\t".join(fields), file=fw)
    fw.close()
def flanks(args):
    """
    %prog flanks gaps.bed fastafile

    Create sequences flanking the gaps.
    """
    # For each gap two BED intervals are written to `<prefix>.ext.bed`:
    # up to --extend bases to the left (name + "L") and to the right
    # (name + "R"). A flank never reaches into the neighbouring gap on the
    # same sequence nor past the sequence end. Returns the BED file name
    # and the result of fastaFromBed().
    p = OptionParser(flanks.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, fastafile = args
    Ext = opts.extend
    sizes = Sizes(fastafile).mapping

    bed = Bed(gapsbed)
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"
    ngaps = len(bed)

    # Context manager closes the BED file even when a lookup below fails
    with open(extbed, "w") as fw:
        for i, b in enumerate(bed):
            seqid = b.seqid
            gapname = b.accn
            size = sizes[seqid]

            # Neighbours only count when they lie on the same sequence
            prev_b = bed[i - 1] if i > 0 else None
            next_b = bed[i + 1] if i + 1 < ngaps else None
            if prev_b and prev_b.seqid != seqid:
                prev_b = None
            if next_b and next_b.seqid != seqid:
                next_b = None

            # Left flank (coordinates 1-based until printed)
            start = prev_b.end + 1 if prev_b else 1
            start, end = max(start, b.start - Ext), b.start - 1
            print("\t".join(str(x) for x in
                            (b.seqid, start - 1, end, gapname + "L")),
                  file=fw)

            # Right flank
            end = next_b.start - 1 if next_b else size
            start, end = b.end + 1, min(end, b.end + Ext)
            print("\t".join(str(x) for x in
                            (b.seqid, start - 1, end, gapname + "R")),
                  file=fw)

    extfasta = fastaFromBed(extbed, fastafile, name=True)
    return extbed, extfasta
def coordQ(args):
    # Rewrites PSL query coordinates: when a query name has the form
    # `name-start-end`, the record is lifted from the sub-sequence back
    # onto `name` (whose full size is looked up in args.fs). Every parsed
    # record is printed to stdout.
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        # Keep only lines whose first character is a digit
        if re.match(r'\d+', line[0]) is None:
            continue
        p = PslLine(line)
        parts = p.qName.split("-")
        if len(parts) == 3:
            base_name, qosStart, qosEnd = parts[0], int(parts[1]), int(parts[2])
            p.qName = base_name
            assert qosEnd - qosStart + 1 == p.qSize
            cSize = sizes.get_size(p.qName)
            shift = qosStart - 1
            p.qStart += shift
            p.qEnd += shift
            # Block starts are offset differently on the minus strand
            if p.qstrand == "-":
                block_shift = cSize - qosEnd
            else:
                block_shift = shift
            p.qStarts = [q + block_shift for q in p.qStarts]
            p.qSize = cSize
        print(str(p))
def bed2chain(args):
    # Groups consecutive input lines sharing the same chain id (8th column)
    # and hands each group to print_chain() together with the target and
    # query sequence sizes. Columns read per line: tName, tStart, tEnd,
    # strand, qName, qStart, qEnd, chain id.
    from maize.formats.sizes import Sizes

    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)

    def flush(header, locs):
        # Emit one finished chain
        cid, tName, qName, srd = header
        print_chain(cid, tName, qName, srd,
                    tdic.get_size(tName), qdic.get_size(qName), locs)

    header = None  # (cid, tName, qName, srd) of the chain being collected
    locs = []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        block = [int(tStart), int(tEnd), int(qStart), int(qEnd)]

        if header is None:
            header, locs = (cid, tName, qName, srd), [block]
        elif header[0] == cid:
            assert (tName, qName, srd) == header[1:], \
                "inconsistent info in chain"
            locs.append(block)
        else:
            flush(header, locs)
            header, locs = (cid, tName, qName, srd), [block]

    # Nothing to emit for empty input; previously print_chain() was still
    # called with empty names in that case.
    if header is not None:
        flush(header, locs)
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs lenght. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    # Contigs longer than --maxsize or deeper than --maxcov are dropped;
    # the remaining (size, coverage) pairs are drawn as a KDE joint plot
    # saved to `<covfile>.pdf`.
    import numpy as np
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize", default=100000, type="int",
                 help="Max contig size")
    # Help text used to be a copy-paste of the --maxsize description
    p.add_option("--maxcov", default=100, type="int",
                 help="Max contig coverage")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    maxsize, maxcov = opts.maxsize, opts.maxcov

    data = []
    for ctg, size in s.iter_sizes():
        # Contigs without an entry in covfile count as zero coverage
        # instead of aborting the whole run with a KeyError.
        c = cov.get(ctg, 0)
        if size > maxsize or c > maxcov:
            continue
        data.append((size, c))

    # zip(*[]) cannot be unpacked into two names; report instead of crashing
    if not data:
        logging.error("No contigs left after --maxsize/--maxcov filtering")
        return

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))
    sns.jointplot(x, y, kind="kde")

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
def __init__(self, bedfile, sizesfile):
    # Sort the BED file, (re)build `<sorted bed>.coverage` with
    # genomeCoverageBed when need_update() says so, remember the sequence
    # sizes and initialise the parent class from the coverage file.
    bedfile = sort([bedfile])
    coveragefile = bedfile + ".coverage"
    if need_update(bedfile, coveragefile):
        cmd = "genomeCoverageBed -bg -i {0} -g {1}".format(bedfile, sizesfile)
        sh(cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    assert coveragefile.endswith(".coverage")
    super(Coverage, self).__init__(coveragefile)
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    # Reads `<prefix>.bed` (markers) and `<prefix>.chr.agp` (consensus),
    # where prefix is the input name without its last extension. Two
    # tables are written: one column per individual map, then
    # Anchored/Unplaced columns for the consensus.
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary
    r = {}
    maps = []

    fw = must_open(opts.outfile, "w")
    # fw.write() replaces the Python 2-only `print >> fw` statement
    fw.write("*** Summary for each individual map ***\n")
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
        maps.append(ms)
    fw.write(tabulate(r, sep=sep, align=align) + "\n")

    r = {}
    agp = AGP(chr_agp)
    fw.write("*** Summary for consensus map ***\n")
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                        ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    fw.write(tabulate(r, sep=sep, align=align) + "\n")
def location(args):
    """
    %prog location bedfile fastafile

    Given SNP locations, summarize the locations in the sequences. For example,
    find out if there are more 3`-SNPs than 5`-SNPs.
    """
    # Counts features starting within --dist of the sequence start
    # ("five prime") or of the sequence end ("three prime"), reports the
    # counts on stderr and draws a stem-and-leaf plot of the relative
    # positions (percent of sequence length).
    from jcvi.formats.bed import BedLine
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(location.__doc__)
    p.add_option(
        "--dist",
        default=100,
        type="int",
        help="Distance cutoff to call 5` and 3` [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    dist = opts.dist
    sizes = Sizes(fastafile).mapping

    fiveprime = threeprime = total = 0
    percentages = []
    # The bed file was previously opened and never closed
    with open(bedfile) as fp:
        for row in fp:
            b = BedLine(row)
            pos = b.start
            size = sizes[b.seqid]
            if pos < dist:
                fiveprime += 1
            if size - pos < dist:
                threeprime += 1
            total += 1
            percentages.append(100 * pos / size)

    m = "Five prime (within {0}bp of start codon): {1}\n".format(
        dist, fiveprime)
    m += "Three prime (within {0}bp of stop codon): {1}\n".format(
        dist, threeprime)
    m += "Total: {0}".format(total)
    print(m, file=sys.stderr)

    bins = 10
    title = "Locations within the gene [0=Five-prime, 100=Three-prime]"
    stem_leaf_plot(percentages, 0, 100, bins, title=title)
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    # For every sequence in the FASTA two BED intervals are written to
    # `<prefix>.ext.bed`: the first `flank` bases (name suffix "L") and the
    # last `flank` bases (suffix "R"), both clipped to the sequence length.
    # The intervals are then extracted with fastaFromBed().
    p = OptionParser(pasteprepare.__doc__)
    p.add_option(
        "--flank",
        default=5000,
        type="int",
        help="Get the seq of size on two ends",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (goodfasta,) = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    # Context manager closes the BED file even if iteration fails
    with open(extbed, "w") as fw:
        for bac, size in sizes.iter_sizes():
            left = (bac, 0, min(flank, size), bac + "L")
            right = (bac, max(size - flank, 0), size, bac + "R")
            for interval in (left, right):
                print("\t".join(str(x) for x in interval), file=fw)

    fastaFromBed(extbed, goodfasta, name=True)
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    # Each BLAST line is echoed to stdout followed by a tab and the name of
    # its overlap type, as classified by Overlap from the query and subject
    # sequence lengths.
    from jcvi.assembly.goldenpath import Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping

    # The BLAST file was previously opened and never closed
    with open(blastfile) as fp:
        for row in fp:
            b = BlastLine(row)
            asize = asizes[b.query]
            bsize = bsizes[b.subject]
            ov = Overlap(b, asize, bsize)
            # Parenthesised single-argument print works on Python 2 and 3
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # The ids of the models that were dropped are written, sorted, to
    # `removed.ids`; the retained models are written through
    # populate_children() to the chosen outfile.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Score per parent id, taken from its mRNA records
    gene_register = {}
    for g in gff:
        if g.type != "mRNA":
            continue
        aed = float(g.attributes["_AED"][0])
        gene_register[g.parent] = (1 - aed) * sizes[g.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)
    bestids = set()
    for group in piles:
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn)
            for x in group
        ]
        selected_chain, score = range_chain(ranges)
        bestids.update(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids
    # Context manager closes the file even if writing fails
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)

    populate_children(opts.outfile, bestids, gffile, "gene")
def agp(args):
    # Converts `assembly-order.txt` (first line skipped as a header) into
    # tab-separated lines on stdout: scaffold name, 0, scaffold size,
    # linkage group, 1000 and strand. Scaffold sizes are looked up in
    # `SCAFFOLD-SPLIT.fasta`. `args` is not used.
    strand_symbols = {'f': '+', 'r': '-', '?': '?'}

    # The input was previously opened and never closed
    with open("assembly-order.txt") as fp:
        next(fp)  # skip the header line
        sizes = Sizes("SCAFFOLD-SPLIT.fasta").mapping
        for row in fp:
            atoms = row.split()
            assert len(atoms) in (4, 5)
            if len(atoms) == 4:
                # Strand column is optional; missing means unknown
                atoms.append('?')
            scaf, tag, linkage, no, strand = atoms
            strand = strand_symbols[strand.lower()]
            scaf = "scaffold_" + scaf
            scaf_size = sizes[scaf]
            # Linkage letter 'a' -> LG01, 'b' -> LG02, ...
            linkage = "LG{0:02d}".format(ord(linkage.lower()) - ord('a') + 1)
            print("\t".join(str(x) for x in
                            (scaf, 0, scaf_size, linkage, 1000, strand)))
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # Candidate regions are merged, then each is widened to the nearest gap
    # on either side (only gaps on the same sequence count). Without a gap
    # on a side the region extends to the sequence start or end. Results
    # are printed as 0-based BED triples.
    p = OptionParser(closest.__doc__)
    p.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping

    ranges = []
    for feat in Bed(candidates):
        # With --om the real range is encoded in the accession
        region = range_parse(feat.accn) if opts.om else feat
        ranges.append([region.seqid, region.start, region.end])

    granges = [(g.seqid, g.start, g.end) for g in Bed(gapsbed)]
    ranges = range_merge(ranges)

    def on_same_seq(hit, seqid):
        # Discard a neighbouring gap that sits on a different sequence
        if hit is not None and hit[0] != seqid:
            return None
        return hit

    for r in ranges:
        left_gap = range_closest(granges, r)
        right_gap = range_closest(granges, r, left=False)
        seqid = r[0]
        left_gap = on_same_seq(left_gap, seqid)
        right_gap = on_same_seq(right_gap, seqid)

        if left_gap is None:
            mmin = 1
        else:
            mmin = left_gap[1]
        if right_gap is None:
            mmax = sizes[seqid]
        else:
            mmax = right_gap[2]

        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
def __init__(self, bedfile, sizesfile):
    # Make sure `<prefix>.sorted.bed` and `<prefix>.sorted.bed.coverage`
    # are up to date (per need_update()), remember the sequence sizes and
    # initialise the parent class from the coverage file.
    from jcvi.formats.bed import sort

    sortedbedfile = bedfile.rsplit(".", 1)[0] + ".sorted.bed"
    if need_update(bedfile, sortedbedfile):
        sort([bedfile])
    bedfile = sortedbedfile

    coveragefile = bedfile + ".coverage"
    if need_update(bedfile, coveragefile):
        cmd = "genomeCoverageBed -bg -i {0} -g {1}".format(bedfile, sizesfile)
        sh(cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    assert coveragefile.endswith(".coverage")
    super(Coverage, self).__init__(coveragefile)
def fasta(args):
    """
    %prog fasta map.out scaffolds.fasta

    Extract marker sequences based on map.
    """
    # The map is first converted to `<prefix>.bed`. Each marker accession
    # has the form `<scaffold>.<position>`; a window of --extend bases on
    # both sides of the position (clipped to the scaffold) is written to
    # `<prefix>.markers.bed` and extracted with fastaFromBed().
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--extend",
        default=1000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapout, sfasta = args
    Flank = opts.extend
    pf = mapout.split(".")[0]
    mapbed = pf + ".bed"
    bm = BinMap(mapout)
    bm.print_to_bed(mapbed)

    bed = Bed(mapbed, sorted=False)
    markersbed = pf + ".markers.bed"
    sizes = Sizes(sfasta).mapping
    # Context manager closes the BED file even if a lookup fails
    with open(markersbed, "w") as fw:
        for b in bed:
            accn = b.accn
            # Split on the last dot only so scaffold names that themselves
            # contain dots no longer break the unpacking.
            scf, pos = accn.rsplit(".", 1)
            pos = int(pos)
            start = max(0, pos - Flank)
            end = min(pos + Flank, sizes[scf])
            print("\t".join(str(x) for x in (scf, start, end, accn)), file=fw)

    fastaFromBed(markersbed, sfasta, name=True)
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file must be sorted.
    """
    # bedgraph/bigwig formats return the name of the produced file. The
    # "coverage" format instead prints `seqid<TAB>coverage` lines (one
    # decimal) to stdout and returns nothing.
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    outformat = opts.format
    # Prefix drops the last two dot-separated suffixes of the BAM name
    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)

    if outformat in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if outformat == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
            format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    for seqid, cov in gcf.iter_coverage_seqid():
        # Parenthesised single-argument print works on Python 2 and 3
        print("\t".join((seqid, "{0:.1f}".format(cov))))
def longest(args):
    """
    %prog longest pile.txt cds.fasta

    Pick the longest model per group. `pile.txt` can be generated by
    formats.bed.pile().
    """
    # Every input line is one group of "|"-separated model ids. The longest
    # model and a `model(size)` summary of the group are printed to stdout;
    # all other models of the group are written to `Problems.ids`.
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--samesize", default=False, action="store_true",
                 help="Only report where the group has same size "
                      "[default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pilefile, cdsfasta = args
    sizes = Sizes(cdsfasta).mapping

    # Both handles are closed on exit; the input was never closed before
    with open(pilefile) as fp, open("Problems.ids", "w") as fw:
        for row in fp:
            (models,) = row.split()
            all_models = models.split("|")
            all_lengths = [(x, sizes[x]) for x in all_models]
            max_model = max(all_lengths, key=lambda x: x[-1])[0]

            if opts.samesize:
                # Skip groups whose members differ in length
                if len(set(size for _, size in all_lengths)) != 1:
                    continue

            modelmsg = "|".join("{0}({1})".format(a, b)
                                for a, b in all_lengths)
            # print(...) / fw.write(...) replace the Python 2-only print
            # statements and behave the same on Python 2 and 3.
            print("\t".join((max_model, modelmsg)))

            problems = [x for x in all_models if x != max_model]
            fw.write("\n".join(problems) + "\n")
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    # Gaps closer than 2 * extend + 1 are merged first. For each merged gap
    # the --extend bases on the left (name + "L") and on the right
    # (name + "R") are written to `<merged prefix>.ext.bed`, clipped to the
    # sequence, and extracted with fastaFromBed().
    p = OptionParser(fill.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    gapdist = 2 * Ext + 1  # This is to prevent to replacement ranges intersect
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    # Context manager closes the BED file even if a lookup fails
    with open(extbed, "w") as fw:
        for b in bed:
            gapname = b.accn

            # Left flank, ends right before the gap
            start, end = max(0, b.start - Ext - 1), b.start - 1
            print("\t".join(str(x) for x in
                            (b.seqid, start, end, gapname + "L")), file=fw)

            # Right flank, starts right after the gap
            start, end = b.end, min(sizes[b.seqid], b.end + Ext)
            print("\t".join(str(x) for x in
                            (b.seqid, start, end, gapname + "R")), file=fw)

    fastaFromBed(extbed, badfasta, name=True)
def fasta(args):
    """
    %prog fasta bedfile scf.fasta pseudomolecules.fasta

    Use OM bed to scaffold and create pseudomolecules. bedfile can be generated
    by running jcvi.assembly.opticalmap bed --blockonly
    """
    # Each selected BED record places one scaffold (parsed from the
    # accession) on one chromosome (the BED seqid). A (chromosome,
    # scaffold) pair is only added once. The ordering is written to
    # `<prefix>.agp` and handed to build() to produce the FASTA.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.agp import OO, build

    p = OptionParser(fasta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, scffasta, pmolfasta = args
    pf = bedfile.rsplit(".", 1)[0]
    bed = Bed(bedfile)
    selected = select_bed(bed)
    oo = OO()
    seen = set()
    sizes = Sizes(scffasta).mapping
    agpfile = pf + ".agp"

    for b in selected:
        scf = range_parse(b.accn).seqid
        chrom = b.seqid  # avoid shadowing the builtin chr()
        cs = (chrom, scf)
        if cs not in seen:
            oo.add(chrom, scf, sizes[scf], b.strand)
            seen.add(cs)
        else:
            logging.debug("Seen {0}, ignored.".format(cs))

    # Context manager closes the AGP file even if writing fails
    with open(agpfile, "w") as agp:
        oo.write_AGP(agp, gaptype="contig")

    build([agpfile, scffasta, pmolfasta])
def eject(args):
    """
    %prog eject candidates.bed chr.fasta

    Eject scaffolds from assembly, using the range identified by closest().
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    parser = OptionParser(eject.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    candidates, chrfasta = args
    # Only the path of the sizes file is needed, as input to complementBed()
    complement = Bed(complementBed(candidates, Sizes(chrfasta).filename))

    # Label every complement interval with its own sequence id, a fixed score
    # and the forward strand before printing.
    for interval in complement:
        interval.accn, interval.score, interval.strand = \
            interval.seqid, 1000, "+"

    complement.print_to_file()
def ancestral(args):
    """
    %prog ancestral ancestral.txt assembly.fasta

    Karyotype evolution of pineapple. The figure is inspired by Amphioxus paper
    Figure 3 and Tetradon paper Figure 9.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Writes the figure to "pineapple-karyotype.<format>".
    p = OptionParser(ancestral.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x7")

    if len(args) != 2:
        sys.exit(not p.print_help())

    regionsfile, sizesfile = args
    regions = RegionsFile(regionsfile)
    sizes = Sizes(sizesfile).mapping
    # Keep only linkage groups. `.items()` (not the Python 2-only
    # `.iteritems()`) works on both Python 2 and 3.
    sizes = dict((k, v) for (k, v) in sizes.items() if k[:2] == "LG")
    maxsize = max(sizes.values())
    ratio = .5 / maxsize  # the longest LG spans half of the canvas height

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes((0, 0, 1, 1))

    from jcvi.graphics.base import set2
    # Re-order the first seven palette colours (needs at least 7 entries)
    colors = tuple(set2[j] for j in (2, 6, 1, 4, 3, 0, 5))

    # Upper panel is the evolution of segments
    # All segments belong to one of seven karyotypes 1 to 7
    karyotypes = regions.karyotypes
    xgap = 1. / (1 + len(karyotypes))
    ygap = .05
    mgap = xgap / 4.5
    gwidth = mgap * .75
    tip = .02
    coords = {}
    for i, k in enumerate(karyotypes):
        x = (i + 1) * xgap
        y = .9
        root.text(x, y + tip, "Anc" + k, ha="center")
        root.plot((x, x), (y, y - ygap), "k-", lw=2)
        y -= 2 * ygap
        # Four leaves a-d under each ancestor, joined pairwise into a tree
        coords['a'] = (x - 1.5 * mgap, y)
        coords['b'] = (x - .5 * mgap, y)
        coords['c'] = (x + .5 * mgap, y)
        coords['d'] = (x + 1.5 * mgap, y)
        coords['ab'] = join_nodes_vertical(root, coords, 'a', 'b', y + ygap / 2)
        coords['cd'] = join_nodes_vertical(root, coords, 'c', 'd', y + ygap / 2)
        coords['abcd'] = join_nodes_vertical(root, coords, 'ab', 'cd', y + ygap)
        for n in 'abcd':
            nx, ny = coords[n]
            root.text(nx, ny - tip, n, ha="center")
            coords[n] = (nx, ny - ygap / 2)

        kdata = regions.get_karyotype(k)
        for kd in kdata:
            group = kd.group
            gx, gy = coords[group]
            gsize = ratio * kd.span
            gy -= gsize
            patch = Rectangle((gx - gwidth / 2, gy), gwidth, gsize,
                              lw=0, color=colors[i])
            root.add_patch(patch)
            root.text(gx, gy + gsize / 2, kd.chromosome,
                      ha="center", va="center", color='w')
            # The next segment of this group stacks below the current one
            coords[group] = (gx, gy - tip)

    # Bottom panel shows the location of segments on chromosomes
    # TODO: redundant code, similar to graphics.chromosome
    ystart = .54
    chr_number = len(sizes)
    xstart, xend = xgap - 2 * mgap, 1 - xgap + 2 * mgap
    # With a single chromosome there is no spacing to compute; avoid the
    # division by zero that `chr_number - 1` would cause.
    xinterval = (xend - xstart - gwidth) / max(chr_number - 1, 1)
    chrpos = {}
    for idx, (seqid, clen) in enumerate(sorted(sizes.items())):
        chrom = get_number(seqid)
        xx = xstart + idx * xinterval + gwidth / 2
        chrpos[chrom] = xx
        root.text(xx, ystart + .01, chrom, ha="center")
        Chromosome(root, xx, ystart, ystart - clen * ratio, width=gwidth)

    # Start painting
    for r in regions:
        xx = chrpos[r.chromosome]
        yystart = ystart - r.start * ratio
        yyend = ystart - r.end * ratio
        patch = Rectangle((xx - gwidth / 2, yystart), gwidth, yyend - yystart,
                          color=colors[int(r.karyotype) - 1], lw=0)
        root.add_patch(patch)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    pf = "pineapple-karyotype"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """Shuffle the two bedfiles together and write `shuffled.bed`.

    For every sequence in `afbed`, the segments returned by
    range_interleave() are alternated with the `bfbed` records looked up by
    the accession of each `afbed` entry. Sequences never seen in `afbed` are
    appended whole. Returns the resulting Bed object.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    border = bfbed.order

    afbed.sort(key=afbed.nullkey)
    # Zero-padding width derived from the number of sequences
    width = int(math.log10(len(sizes))) + 1

    def make_accn(number):
        return "{0}{1:0{2}d}".format(prefix, number, width)

    counter = 0
    placed = set()
    merged = []
    for seqid, gaps in afbed.sub_beds():
        counter += 1
        seqsize = sizes[seqid]
        spans = [(x.seqid, x.start, x.end) for x in gaps]

        segments = []
        for crange in range_interleave(spans, sizes={seqid: seqsize},
                                       empty=True):
            if not crange:
                segments.append(None)
                continue
            # `seqid` is deliberately rebound here, as in the original
            seqid, start, end = crange
            fields = (seqid, start - 1, end)
            segments.append(BedLine("\t".join(str(x) for x in fields)))

        inserts = []
        for gap in gaps:
            _, patch = border[gap.accn]
            if gap.strand == "-":
                # Flip the patch; extra[1] is updated before the attribute
                flipped = "-" if patch.strand == "+" else "+"
                patch.extra[1] = flipped
                patch.strand = flipped
            inserts.append(patch)

        n_abeds = len(segments)
        n_bbeds = len(inserts)
        assert n_abeds - n_bbeds == 1, "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        interleaved = [x for x in roundrobin(segments, inserts) if x]
        if prefix:
            for entry in interleaved:
                entry.accn = make_accn(counter)

        merged.extend(interleaved)
        placed.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in placed:
            continue

        fields = (seqid, 0, size, make_accn(counter))
        entry = BedLine("\t".join(str(x) for x in fields))

        counter += 1
        if prefix:
            entry.accn = make_accn(counter)

        merged.append(entry)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file("shuffled.bed")

    return shuffledbed
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Outputs: "<mates prefix>.log" (decision trace) and
    # "<mates prefix>.candidate.bed" (one placement per accepted scaffold).
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    parser = OptionParser(bambus.__doc__)
    parser.add_option(
        "--prefix",
        default="scaffold",
        help="Prefix of the unplaced scaffolds",
    )
    parser.add_option(
        "--minlinks",
        default=3,
        type="int",
        help="Minimum number of links to place",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    log = open(pf + ".log", "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    bed = Bed(bedfile, sorted=False)
    unplaced = defaultdict(list)

    # Collect adjacent bed records that are mates of each other and link
    # exactly one placed sequence with one unplaced scaffold.
    for first, second in pairwise(bed):
        if first.accn not in mf:
            continue

        mate, _ = mf[first.accn]
        if mate != second.accn:
            continue

        first_unplaced = first.seqid.startswith(prefix)
        second_unplaced = second.seqid.startswith(prefix)
        if first_unplaced == second_unplaced:
            continue

        # Keep the placed read first, the unplaced one second
        if first_unplaced:
            first, second = second, first

        unplaced[second.seqid].append((first, second))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    accepted = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, links in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for anchor, read in links:
            astrand = anchor.strand
            aseqid = anchor.seqid
            _, lib = mf[anchor.accn]

            print(anchor, file=log)
            print(read, file=log)

            # Same strand on both mates means the scaffold has to be flipped
            flip = astrand == read.strand
            fbstrand = "-" if flip else "+"
            if flip:
                read.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            scflen = sizes.get_size(scf)
            assert astrand in ("+", "-")
            if astrand == "+":
                offset = anchor.start - read.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = anchor.end - read.start + scflen
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            upper = sizes.get_size(aseqid) - 1
            sstart = min(upper, max(0, sstart))
            sstop = min(upper, max(0, sstop))

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        # Determine placement by finding the interval with the most support
        rd = ranges_depth([x[:3] for x in ranges], sizes.mapping,
                          verbose=False)
        alldepths = [entry for depths in rd for entry in depths]
        print(alldepths, file=log)

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks), file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        if len(set(x[0] for x in candidates)) != 1:
            print("Multiple conflicting candidates found", file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])
        if mmin >= mmax:
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            print("(min, max) distance greater than library maxdist", file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, _sf, _sc, vote in ranges:
            if not range_overlap(arange, (sid, start, end)):
                continue
            if vote == "+":
                nplus += 1
            else:
                nminus += 1

        fbstrand = "+" if nplus >= nminus else "-"

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        accepted.append(BedLine("\t".join((str(x) for x in candidate))))
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(accepted)
    logging.debug("A total of {0} scaffolds can be placed.".format(len(candidatebed)))
    log.close()

    candidatebed.print_to_file(pf + ".candidate.bed", sorted=True)
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Writes "final.bed"; the intermediate files listed in `toclean` are
    # removed at the end.
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)
    seen = set()
    # Context-managed so the handle is released even if a row is malformed
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            # Rows with six or fewer columns carry no gap assignment
            if len(atoms) <= 6:
                continue
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            if gapid not in seen:
                seen.add(gapid)
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        # Intervals that are not a candidate gap are carried over unchanged
        if key not in gappositions:
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(str(x) for x in
                                   (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
                 help="Percentage identity cutoff [default: %default]")
    # Help text previously said "identity" for the coverage cutoff
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
                 help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive interval length. The former
            # `abs(b.qstart - b.qstop + 1)` undercounted every forward hit
            # (qstart < qstop) by two bases.
            this_covered += abs(b.qstart - b.qstop) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage))

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    print("Identity: {0} mismatches, {1} gaps, {2} alignlen".
          format(mismatches, gaps, alignlen), file=sys.stderr)
    total = len(sizes.keys())
    print("Total mapped: {0} ({1:.1f}% of {2})".
          format(mapped_count, mapped_count * 100. / total, total),
          file=sys.stderr)
    print("Total valid {0}: {1} ({2:.1f}% of {3})".
          format(cutoff_message, valid_count, valid_count * 100. / total, total),
          file=sys.stderr)
    print("Average id = {0:.2f}%".
          format(100 - (mismatches + gaps) * 100. / alignlen),
          file=sys.stderr)

    queries_combined = sum(sizes[x] for x in queries)
    print("Coverage: {0} covered, {1} total".
          format(covered, queries_combined), file=sys.stderr)
    print("Average coverage = {0:.2f}%".
          format(covered * 100. / queries_combined), file=sys.stderr)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            print(query_id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # The stray `fp = open(blastfile)` was never read or closed; Blast()
    # opens the file itself.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            print(b, file=fw)
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    """Shuffle the two bedfiles together and write `shuffled.bed`.

    afbed   -- Bed whose records are grouped per sequence via sub_beds()
    bfbed   -- Bed whose `order` maps an afbed accession to (index, record)
    bbfasta -- FASTA file handed to Sizes() for sequence lengths
    prefix  -- when truthy, every output record is renamed to
               "<prefix><zero-padded counter>"

    Returns the Bed object that was written to `shuffled.bed`.
    """
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    shuffled = "shuffled.bed"
    border = bfbed.order

    merged = []  # was named `all`, which shadowed the builtin
    afbed.sort(key=afbed.nullkey)
    totalids = len(sizes)
    # Number of decimal digits in `totalids`. Counting characters is exact
    # and yields 1 for an empty size table, where math.log10(0) used to
    # raise ValueError.
    pad = len(str(totalids))
    cj = 0
    seen = set()

    def accn(number):
        return "{0}{1:0{2}d}".format(prefix, number, pad)

    for seqid, aa in afbed.sub_beds():
        cj += 1
        abeds, bbeds = [], []
        size = sizes[seqid]
        ranges = [(x.seqid, x.start, x.end) for x in aa]
        cranges = range_interleave(ranges, sizes={seqid: size}, empty=True)
        for crange in cranges:
            if crange:
                seqid, start, end = crange
                bedline = "\t".join(str(x) for x in (seqid, start - 1, end))
                abeds.append(BedLine(bedline))
            else:
                # Placeholder keeps abeds/bbeds alternation aligned
                abeds.append(None)

        for a in aa:
            gapid = a.accn
            _, b = border[gapid]
            if a.strand == '-':
                # Flip the bfbed record; extra[1] mirrors the strand
                b.extra[1] = b.strand = ('-' if b.strand == '+' else '+')
            bbeds.append(b)

        n_abeds = len(abeds)
        n_bbeds = len(bbeds)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        beds = [x for x in roundrobin(abeds, bbeds) if x]
        if prefix:
            for b in beds:
                b.accn = accn(cj)

        merged.extend(beds)
        seen.add(seqid)

    # Singletons
    for seqid, size in sz.iter_sizes():
        if seqid in seen:
            continue

        bedline = "\t".join(str(x) for x in (seqid, 0, size, accn(cj)))
        b = BedLine(bedline)

        cj += 1
        if prefix:
            b.accn = accn(cj)

        merged.append(b)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(shuffled)

    return shuffledbed
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Outputs: "<mates prefix>.log" (decision trace) and
    # "<mates prefix>.candidate.bed" (one placement per accepted scaffold).
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"
    log = open(logfile, "w")

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    is_unplaced = lambda x: x.startswith(prefix)
    bed = Bed(bedfile, sorted=False)
    unplaced = defaultdict(list)

    # Collect adjacent bed records that are mates of each other and link
    # exactly one placed sequence with one unplaced scaffold.
    for a, b in pairwise(bed):
        aname, bname = a.accn, b.accn
        aseqid, bseqid = a.seqid, b.seqid

        if aname not in mf:
            continue

        pa, _ = mf[aname]
        if pa != bname:
            continue

        ia = is_unplaced(aseqid)
        ib = is_unplaced(bseqid)
        if ia == ib:
            continue

        # Keep the placed read first, the unplaced one second
        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []
    # For each unplaced scaffold, find most likely placement and orientation
    for scf, beds in sorted(unplaced.items()):
        print(file=log)
        ranges = []
        for a, b in beds:
            aname, astrand = a.accn, a.strand
            bstrand = b.strand
            aseqid = a.seqid
            _, lib = mf[aname]

            print(a, file=log)
            print(b, file=log)

            # Same strand on both mates means the scaffold has to be flipped
            flip_b = (astrand == bstrand)
            fbstrand = '-' if flip_b else '+'
            if flip_b:
                b.reverse_complement(sizes)

            lmin, lmax = lib.min, lib.max

            L = sizes.get_size(scf)
            assert astrand in ('+', '-')
            if astrand == '+':
                offset = a.start - b.end
                sstart, sstop = offset + lmin, offset + lmax
            else:
                offset = a.end - b.start + L
                sstart, sstop = offset - lmax, offset - lmin

            # Prevent out of range error
            size = sizes.get_size(aseqid)
            sstart = max(0, sstart)
            sstop = max(0, sstop)
            sstart = min(size - 1, sstart)
            sstop = min(size - 1, sstop)

            start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
            print("*" + "\t".join(str(x) for x in start_range), file=log)
            ranges.append(start_range)

        mranges = [x[:3] for x in ranges]
        # Determine placement by finding the interval with the most support
        rd = ranges_depth(mranges, sizes.mapping, verbose=False)
        alldepths = []
        for depth in rd:
            alldepths.extend(depth)
        print(alldepths, file=log)

        maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
        if maxdepth < minlinks:
            print("Insufficient links ({0} < {1})".format(maxdepth, minlinks),
                  file=log)
            continue

        candidates = [x for x in alldepths if x[-1] == maxdepth]
        nseqids = len(set(x[0] for x in candidates))
        if nseqids != 1:
            print("Multiple conflicting candidates found", file=log)
            continue

        seqid, mmin, mmax, depth = candidates[0]
        mmin, mmax = range_minmax([x[1:3] for x in candidates])
        # Same guard as the print-function variant of bambus() in this file:
        # reject a degenerate interval instead of emitting it as a bed line.
        if mmin >= mmax:
            print("Invalid (min, max) range", file=log)
            continue

        if (mmax - mmin) > maxdist:
            # This rejection used to reuse the "Multiple conflicting
            # candidates found" message, which mislabelled it in the log.
            print("(min, max) distance greater than library maxdist",
                  file=log)
            continue

        # Determine orientation by voting
        nplus, nminus = 0, 0
        arange = (seqid, mmin, mmax)
        for sid, start, end, sf, sc, fbstrand in ranges:
            brange = (sid, start, end)
            if range_overlap(arange, brange):
                if fbstrand == '+':
                    nplus += 1
                else:
                    nminus += 1

        # Ties go to the forward strand
        fbstrand = '+' if nplus >= nminus else '-'

        candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
        bedline = BedLine("\t".join((str(x) for x in candidate)))
        cbeds.append(bedline)
        print("Plus: {0}, Minus: {1}".format(nplus, nminus), file=log)
        print(candidate, file=log)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".
                  format(len(candidatebed)))
    log.close()

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Side effects: summary on stderr and in "<blastfile>.covfilter.log",
    # optional id list (--ids) and filtered BLAST output (--outfile).
    from jcvi.algorithms.supermap import supermap
    from jcvi.utils.range import range_union

    allowed_iterby = ("query", "query_sbjct")

    p = OptionParser(covfilter.__doc__)
    p.set_align(pctid=95, pctcov=50)
    p.add_option("--scov", default=False, action="store_true",
                 help="Subject coverage instead of query [default: %default]")
    p.add_option("--supermap", action="store_true",
                 help="Use supermap instead of union")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")
    p.add_option("--iterby", dest="iterby", default="query",
                 choices=allowed_iterby,
                 help="Choose how to iterate through BLAST [default: %default]")
    p.set_outfile(outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, fastafile = args
    pctid = opts.pctid
    pctcov = opts.pctcov
    union = not opts.supermap
    scov = opts.scov
    sz = Sizes(fastafile)
    sizes = sz.mapping
    iterby = opts.iterby
    qspair = iterby == "query_sbjct"

    if not union:
        querysupermap = blastfile + ".query.supermap"
        if not op.exists(querysupermap):
            supermap(blastfile, filter="query")

        blastfile = querysupermap

    assert op.exists(blastfile)

    def pct(part, whole):
        # Percentage that degrades to 0 instead of raising ZeroDivisionError
        return part * 100. / whole if whole else 0.

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(blastfile)
    iterator = blast.iter_hits_pair if qspair else blast.iter_hits

    covidstore = {}
    for query, blines in iterator():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0
        this_identity = 0

        ranges = []
        for b in blines:
            if scov:
                s, start, stop = b.subject, b.sstart, b.sstop
            else:
                s, start, stop = b.query, b.qstart, b.qstop
            # Set before the identity filter so it is defined even when every
            # hit of this group is filtered out
            cov_id = s

            if b.pctid < pctid:
                continue

            if start > stop:
                start, stop = stop, start
            this_covered += stop - start + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps
            ranges.append(("1", start, stop))

        if ranges:
            this_identity = 100. - (this_mismatches + this_gaps) * 100. / this_alignlen

        if union:
            this_covered = range_union(ranges)

        this_coverage = this_covered * 100. / sizes[cov_id]
        covidstore[query] = (this_identity, this_coverage)
        if this_identity >= pctid and this_coverage >= pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    if opts.list:
        if qspair:
            allpairs = defaultdict(list)
            for (q, s) in covidstore:
                allpairs[q].append((q, s))
                allpairs[s].append((q, s))

            for seqid, _ in sz.iter_sizes():
                if seqid not in allpairs:
                    print("\t".join((seqid, "na", "0", "0")))
                else:
                    for qs in allpairs[seqid]:
                        this_identity, this_coverage = covidstore[qs]
                        print("{0}\t{1:.1f}\t{2:.1f}".format("\t".join(qs), this_identity, this_coverage))
        else:
            for query, _ in sz.iter_sizes():
                this_identity, this_coverage = covidstore.get(query, (0, 0))
                print("{0}\t{1:.1f}\t{2:.1f}".format(query, this_identity, this_coverage))

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    m = "Identity: {0} mismatches, {1} gaps, {2} alignlen\n".\
        format(mismatches, gaps, alignlen)
    total = len(sizes.keys())
    m += "Total mapped: {0} ({1:.1f}% of {2})\n".\
        format(mapped_count, pct(mapped_count, total), total)
    m += "Total valid {0}: {1} ({2:.1f}% of {3})\n".\
        format(cutoff_message, valid_count, pct(valid_count, total), total)
    # `alignlen` is 0 when no hit passes --pctid; report 0.00% rather than
    # crash before the log file and outputs are written.
    average_id = 100 - pct(mismatches + gaps, alignlen) if alignlen else 0.
    m += "Average id = {0:.2f}%\n".format(average_id)

    queries_combined = sz.totalsize
    m += "Coverage: {0} covered, {1} total\n".\
        format(covered, queries_combined)
    m += "Average coverage = {0:.2f}%".\
        format(pct(covered, queries_combined))

    logfile = blastfile + ".covfilter.log"
    print(m, file=sys.stderr)
    with open(logfile, "w") as fw:
        print(m, file=fw)

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            print(query_id, file=fw)
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast:
        query = (b.query, b.subject) if qspair else b.query
        if query in valid:
            print(b, file=fw)
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    parser = OptionParser(main.__doc__)
    parser.add_option("--refids", help="Use subset of contigs in the ref")
    parser.add_option("--refcov", default=.01, type="float",
                      help="Minimum reference coverage [default: %default]")
    parser.add_option("--all", default=False, action="store_true",
                      help="Plot one pdf file per ref in refidsfile [default: %default]")
    parser.add_option("--color", default="similarity",
                      choices=("similarity", "direction", "none"),
                      help="Color the dots based on")
    parser.add_option("--nolayout", default=False, action="store_true",
                      help="Do not rearrange contigs")
    parser.set_align(pctid=0, hitlen=0)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    deltafile, = args
    # The first line of the delta file names the reference and query FASTA
    header = open(deltafile).readline()
    reffasta, queryfasta = header.split()

    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    if opts.refids:
        refs = SetFile(opts.refids)
    else:
        refs = set(rsizes.keys())

    refcov = opts.refcov
    # NOTE(review): `filter` is expected to be the module-level delta filter
    # command, not the builtin - confirm against the file's imports.
    deltafile = filter([
        deltafile,
        "--pctid={0}".format(opts.pctid),
        "--hitlen={0}".format(opts.hitlen),
    ])

    plot_options = dict(prefix=prefix, color=opts.color,
                        layout=not opts.nolayout)

    if not opts.all:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                          **plot_options)
        return

    # One plot per reference, renamed after the reference id
    for ref in refs:
        pdffile = plot_some_queries([ref], qsizes, rsizes, deltafile, refcov,
                                    **plot_options)
        if pdffile:
            sh("mv {0} {1}.pdf".format(pdffile, ref))
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Outputs: "<ctgfasta prefix>.phases", "<ctgfasta prefix>.new.agp" and
    # "final.fasta".
    from jcvi.formats.agp import bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        nscaffolds = 0
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
            else:
                nscaffolds += 1
        nsingletons = len(singletons)

        # `nscaffolds` now counts multi-contig scaffolds only. It used to be
        # len(scaffolds), which includes the singletons, so the phase 3
        # branch (one lone contig, no scaffold) could never be taken.
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print(msg, file=sys.stderr)
        print("\t".join((bname, str(phase))), file=fwphase)

        order_to_agp(bname, ctgorder, sizes, fwagp)

    fwagp.close()
    # The phases file was previously left open (and possibly unflushed)
    fwphase.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Outputs: "<ctgfasta prefix>.phases", "<ctgfasta prefix>.new.agp" and
    # "final.fasta".
    from jcvi.formats.agp import AGP, bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    fwphase = open(phasefile, "w")
    newagpfile = pf + ".new.agp"
    fwagp = open(newagpfile, "w")

    scaffoldbuckets = defaultdict(list)

    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    for bname, scaffolds in sorted(scaffoldbuckets.items()):
        ctgorder = []
        singletons = set()
        nscaffolds = 0
        for scaf in sorted(scaffolds):
            for node, orientation in scaf:
                ctgorder.append((node, orientation))
            if len(scaf) == 1:
                singletons.add(node)
            else:
                nscaffolds += 1
        nsingletons = len(singletons)

        # `nscaffolds` now counts multi-contig scaffolds only. It used to be
        # len(scaffolds), which includes the singletons, so the phase 3
        # branch (one lone contig, no scaffold) could never be taken.
        if nsingletons == 1 and nscaffolds == 0:
            phase = 3
        elif nsingletons == 0 and nscaffolds == 1:
            phase = 2
        else:
            phase = 1

        msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
            format(bname, nscaffolds, nsingletons, phase)
        print(msg, file=sys.stderr)
        print("\t".join((bname, str(phase))), file=fwphase)

        order_to_agp(bname, ctgorder, sizes, fwagp)

    fwagp.close()
    # The phases file was previously left open (and possibly unflushed)
    fwphase.close()
    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])
qbed, sbed = opts.qbed, opts.sbed proportional = opts.proportional if len(args) != 1: sys.exit(not p.print_help()) if qbed: qsizes = qsizes or sizes([qbed]) qbed = Bed(qbed) if sbed: ssizes = ssizes or sizes([sbed]) sbed = Bed(sbed) assert qsizes and ssizes, "You must specify at least one of --sizes of --bed" qsizes = Sizes(qsizes, select=opts.qselect) ssizes = Sizes(ssizes, select=opts.sselect) (blastfile, ) = args image_name = op.splitext(blastfile)[0] + "." + opts.format plt.rcParams["xtick.major.pad"] = 16 plt.rcParams["ytick.major.pad"] = 16 # Fix the width xsize, ysize = qsizes.totalsize, ssizes.totalsize # get highlight beds qh, sh = opts.qh, opts.sh qh = Bed(qh) if qh else None sh = Bed(sh) if sh else None
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    # The docstring is the CLI usage text; keep implementation notes here.
    # Outputs: "<ctgfasta prefix>.agp" and "scaffold.log".
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"
    fwlog = open(logfile, "w")

    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"
    fwagp = open(agpfile, "w")

    g = nx.MultiGraph()  # use this to get connected components

    # Context-managed so the links file handle is released after parsing
    with open(linksfile) as fp:
        for row in fp:
            c = LinkLine(row)
            distance = max(c.distance, 50)  # enforce a floor of 50
            g.add_edge(c.aseqid, c.bseqid,
                       orientation=c.orientation, distance=distance)

    def get_bname(sname, prefix=False):
        # Without --prefix every sequence lands in the single "chr0" bucket
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    scaffoldbuckets = defaultdict(list)
    seqnames = sorted(sizes.keys())

    # NOTE(review): `connected_component_subgraphs` was removed from
    # networkx in release 2.4 - confirm the networkx version behind
    # jcvi.algorithms.graph.nx.
    for h in nx.connected_component_subgraphs(g):
        partialorder = solve_component(h, sizes, fwlog)
        name = partialorder[0][0]
        bname = get_bname(name, prefix=opts.prefix)
        scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in seqnames:
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    for bname, ctgs in sorted(ctgbuckets.items()):
        scaffolds = scaffoldbuckets[bname]
        scaffolded = set()
        for scafID, scaf in enumerate(scaffolds):
            ctgorder = []
            for node, start, end, orientation in scaf:
                ctgorder.append((node, orientation))
                scaffolded.add(node)
            scaf = scafname.format(bname, scafID)
            order_to_agp(scaf, ctgorder, sizes, fwagp)
        singletons = sorted(ctgs - scaffolded)
        nscaffolds = len(scaffolds)
        nsingletons = len(singletons)

        msg = "{0}: Scaffolds={1} Singletons={2}".\
            format(bname, nscaffolds, nsingletons)
        print(msg, file=sys.stderr)

        for singleton in singletons:
            ctgorder = [(singleton, "+")]
            order_to_agp(singleton, ctgorder, sizes, fwagp)

    fwagp.close()
    # The solver log was previously left open
    fwlog.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def stack(args):
    """
    %prog stack fastafile

    Create landscape plots that show the amounts of genic sequences, and repetitive
    sequences along the chromosomes.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # Side effect: saves the figure as `<fastafile minus extension>.<format>`.
    p = OptionParser(stack.__doc__)
    p.add_option("--top", default=10, type="int",
                 help="Draw the first N chromosomes [default: %default]")
    p.add_option("--stacks",
                 default="Exons,Introns,DNA_transposons,Retrotransposons",
                 help="Features to plot in stackplot [default: %default]")
    p.add_option("--switch",
                 help="Change chr names based on two-column file [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    top = opts.top
    window, shift, subtract = check_window_options(opts)
    switch = opts.switch
    if switch:
        switch = DictFile(opts.switch)

    stacks = opts.stacks.split(",")
    bedfiles = get_beds(stacks)
    binfiles = get_binfiles(bedfiles, fastafile, shift, subtract=subtract)

    sizes = Sizes(fastafile)
    s = list(sizes.iter_sizes())[:top]
    maxl = max(x[1] for x in s)
    margin = .08
    inner = .02   # y distance between tracks

    pf = fastafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, maxl)

    # Per chromosome: one horizontal track each, stacked from the top down.
    # The extra "+ 1" slot is used by the legend row drawn below the tracks.
    yinterval = (1 - 2 * margin) / (top + 1)
    track_height = yinterval - inner
    xx = margin
    yy = 1 - margin
    for seqid, clen in s:
        yy -= yinterval
        xlen = clen / ratio

        # Short display label: "abc_12" becomes "A12"
        cc = seqid
        if "_" in seqid:
            ca, cb = seqid.split("_")
            cc = ca[0].upper() + cb

        if switch and cc in switch:
            cc = "\n".join((cc, "({0})".format(switch[cc])))

        root.add_patch(Rectangle((xx, yy), xlen, track_height, color=gray))
        ax = fig.add_axes([xx, yy, xlen, track_height])

        # Ceiling division. `//` keeps the bin count an integer under
        # Python 3 as well; plain `/` would produce a float there.
        nbins = clen // shift
        if clen % shift:
            nbins += 1

        stackplot(ax, binfiles, nbins, palette, seqid, window, shift)
        root.text(xx - .04, yy + .5 * track_height, cc,
                  ha="center", va="center")

        ax.set_xlim(0, nbins)
        ax.set_ylim(0, 1)
        ax.set_axis_off()

    # Legends: one colored square plus label per bed file, left to right
    yy -= yinterval
    xx = margin
    for bedfile, color in zip(bedfiles, palette):
        label = bedfile.rsplit(".", 1)[0].replace("_", " ")
        label = Registration.get(label, label)

        root.add_patch(Rectangle((xx, yy), inner, inner, color=color, lw=0))
        xx += 2 * inner
        root.text(xx, yy, label, size=13)
        # Advance by an estimate of the rendered label width
        xx += len(label) * .012 + inner

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def connect(args):
    """
    %prog connect assembly.fasta read_mapping.blast

    Connect contigs using long reads.
    """
    parser = OptionParser(connect.__doc__)
    parser.add_option(
        "--clip", default=2000, type="int", help="Only consider end of contigs"
    )
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, blastfile = args
    clip = opts.clip
    sizes = Sizes(fastafile).mapping

    def by_query(hit):
        return hit.query

    def by_qstart(hit):
        return hit.qstart

    def touches_contig_end(hit):
        # A hit is kept unless it lies strictly inside the contig interior,
        # i.e. starts after the left clip zone and stops before the right one.
        size = sizes[hit.subject]
        left_edge = min(size, clip)
        right_edge = max(0, size - clip)
        return hit.sstart <= left_edge or hit.sstop >= right_edge

    def arrow(hit):
        if hit.orientation == "+":
            return ">"
        return "<"

    end_hits = [hit for hit in Blast(blastfile) if touches_contig_end(hit)]
    end_hits.sort(key=by_query)

    graph = BiGraph()
    for query, group in groupby(end_hits, key=by_query):
        hits = sorted(group, key=by_qstart)
        # A read that maps to a single contig cannot link anything
        if len(set(hit.subject for hit in hits)) == 1:
            continue

        print("\n".join(str(hit) for hit in hits))
        for a, b in pairwise(hits):
            if a.subject == b.subject:
                continue

            alen = a.qstop - a.qstart + 1
            blen = b.qstop - b.qstart + 1
            overlap = range_intersect((a.qstart, a.qstop), (b.qstart, b.qstop))
            if overlap:
                ostart, ostop = overlap
                overlap = ostop - ostart + 1

            print(overlap, alen, blen)
            # Skip pairs whose overlap on the read exceeds half of either hit
            if overlap and (overlap > alen / 2 or overlap > blen / 2):
                print("Too much overlap ({0})".format(overlap))
                continue

            graph.add_edge(a.subject, b.subject, arrow(a), arrow(b))

    graph_to_agp(graph, blastfile, fastafile, verbose=False)
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # Side effect: writes `<agpfile minus extension>.chain`, one chain record
    # (header line, block size line, blank line) per non-gap AGP line.
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    p.add_option("--novalidate", default=False, action="store_true",
                 help="Do not validate AGP")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    agp = AGP(agpfile, validate=(not opts.novalidate))
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping

    chain = "chain"
    score = 1000
    tStrand = "+"
    chain_id = 0
    # The output is opened only after the inputs have loaded, and inside
    # `with`, so the handle is always closed even when a size lookup fails.
    with open(chainfile, "w") as fw:
        for a in agp:
            if a.is_gap:
                continue

            # Target (component) interval; the start is shifted down by one,
            # presumably converting the 1-based inclusive AGP coordinates to
            # the 0-based half-open convention of the chain format.
            tName = a.component_id
            tSize = componentsizes[tName]
            tStart = a.component_beg - 1
            tEnd = a.component_end

            # Query (object) interval; on the minus strand the coordinates
            # are mirrored against the object size before the start shift.
            qName = a.object
            qSize = objectsizes[qName]
            qStrand = "-" if a.orientation == "-" else "+"
            qStart = a.object_beg
            qEnd = a.object_end
            if qStrand == "-":
                qStart, qEnd = qSize - qEnd + 1, qSize - qStart + 1
            qStart -= 1

            chain_id += 1
            fields = (chain, score, tName, tSize, tStrand, tStart, tEnd,
                      qName, qSize, qStrand, qStart, qEnd, chain_id)
            # write() instead of the Python-2-only `print >>` statement;
            # the emitted bytes are unchanged.
            fw.write("\t".join(str(x) for x in fields) + "\n")
            fw.write(str(a.object_span) + "\n")
            fw.write("\n")

    logging.debug("File written to `{0}`.".format(chainfile))
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # Side effects: calls jcvi's blast `sort` on the BLAST file, writes
    # `graph.txt` in the current directory and `<blastfile>.agp`.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist

    # groupby() below needs hits of the same query to be adjacent
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Either every pair of hits on the query, or consecutive hits only
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print() behaves the same on Python 2 and 3
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".\
                  format(npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    mapping = sizes.mapping
    scaffolded = set()
    nsingletons = 0
    # `with` guarantees the AGP handle is closed even if writing fails
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, _ in ctgorder)
            order_to_agp("pmol_{0:04d}".format(i), ctgorder, mapping, fwagp)

        # Get the singletons as well
        for ctg, _ in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            order_to_agp(ctg, [(ctg, "+")], mapping, fwagp)
            nsingletons += 1
        logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))

    logging.debug("AGP file written to `{0}`.".format(agpfile))
def get_offsets(fastafile):
    """Load `Sizes(fastafile)` and return three of its attributes.

    Returns a tuple `(totalsize, mapping, cumsizes_mapping)`, read from the
    `Sizes` object in that order.
    """
    fasta_sizes = Sizes(fastafile)
    return (
        fasta_sizes.totalsize,
        fasta_sizes.mapping,
        fasta_sizes.cumsizes_mapping,
    )
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so implementation notes are kept in comments instead.
    #
    # Side effects: writes a file named `tour` in the current directory and
    # prints progress to stdout.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    # `with` closes the matrix file, which used to be left open
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            if x == 'X':  # column-header row
                continue
            M[int(x), int(y)] = int(z)

    # The tour file stays open for the whole optimisation because the GA
    # callback appends to it; `with` closes it even if the run raises.
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Record the tour of generation `gen`, labelled with its fitness
            # when one is available.
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                # Keep only the text before the first comma, minus any "("
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            for x in co:
                oo.append(contig_ids[x.contig_name])
            pf = op.basename(ofile).split(".")[0]
            # Single-argument print() behaves the same on Python 2 and 3
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate", score_evaluate_M,
                             tour_sizes=tour_sizes, tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                        callback=callbacki)
            print(" ".join((str(tour), str(tour.fitness))))
            # Only the first ordering file is processed
            break