def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Looks up the query and subject BED files for `lastfile`, creates `odir`
    (mkdir with overwrite=True) and changes the working directory into it,
    then writes an ANCHORS file there (query gene, subject gene, integer
    score for each BLAST record of `lastfile`) and symlinks the subject BED
    file next to it.

    `tourfile` is not used by this function. `p` and `opts` are handed to
    get_bed_filenames() unchanged.

    Returns a tuple `(anchorsfile, qbedfile, contig_to_beds)`: the anchors
    filename (a bare name, created inside `odir`), the absolute path of the
    query BED file and a dict built from `qbed.sub_beds()`.

    Side effect: the process is still chdir'ed into `odir` on return.
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)
    # `lastfile` is read *after* the chdir below, so it must be pinned to an
    # absolute path just like the BED files; otherwise a relative path would
    # be resolved against `odir`.
    lastfile = op.abspath(lastfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: named after the first two dot-separated fields of the
    # BLAST file's basename
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) \
        + ".anchors"
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            fw.write("\t".join((gene_name(b.query), gene_name(b.subject),
                                str(int(b.score)))) + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    fw = must_open(opts.outfile, "w")

    for seqid, feats in Bed(bedfile).sub_beds():
        # Pile up the read ends: how many features start / end at each
        # position of this sequence.
        pileups = (
            ("LE-{0}", Counter([x.start for x in feats])),
            ("RE-{0}", Counter([x.end for x in feats])),
        )
        # Keep only positions supported by at least `mindepth` read ends.
        lesions = sorted(
            (seqid, pos, template.format(pos), depth)
            for template, ends in pileups
            for pos, depth in ends.items()
            if depth >= mindepth
        )
        # One single-base BED interval per lesion; the depth goes in the name.
        for lesion_seqid, pos, label, depth in lesions:
            name = "{0}-r{1}".format(label, depth)
            fw.write("\t".join((lesion_seqid, str(pos - 1), str(pos), name))
                     + "\n")
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, seqid, window = args
    window = int(window)
    megabase = 1000000

    # Describe the requested 1-Mb window as a one-line BED file.
    bedfile = "sample.bed"
    region = (seqid, (window - 1) * megabase, window * megabase)
    bed = Bed()
    bed.add("\t".join(str(x) for x in region))
    bed.print_to_file(bedfile)

    # Names (4th column) of the mappings that intersect the window.
    idsfile = "query.ids"
    sh("intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(
        mapbed, bedfile), outfile=idsfile)

    # Subject: the window cut out of the MTR fasta; query: the assembly
    # records whose names were collected above.
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    sh("nucmer {0} {1}".format(sfasta, qfasta))

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, window))
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)

    # query accession -> list of subject accessions it is anchored to
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        for subject in pairs[q.accn]:
            # `sorder` maps an accession to an (index, record) pair
            si, s = sorder[subject]
            # Re-read both records for every pairing: the --switch swap below
            # must never see values left over from a previous pairing of the
            # same query.
            qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".
                    format(saccn, sseqid))
            # The name column carries the aligned position as `number:start`
            bedline = "\t".join(str(x) for x in
                                (qseqid, qstart - 1, qend,
                                 "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold fasta followed by any number of (blast, sizes, bed) trios
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Single-record sizes file describing just this scaffold
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Always bind `subhighlights`: without --highlights it used to be
        # undefined at the call below (NameError).
        subhighlights = None
        if hlbed is not None:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.fasta import Fasta
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."

    # Both outputs are closed (and therefore flushed) before the "written"
    # log messages below are emitted.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigfasta[ctg]
            ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

            fw_ids.write("{0}\n".format(ctg))
            fw.write("{0}\n".format(ctgline))
            fw.write("{0}\n".format(fill(ctgseq.seq)))

            for b in reads:
                read = b.accn
                strand = b.strand
                readseq = readfasta[read]
                rc = " [RC]" if strand == "-" else ""
                readlen = len(readseq)
                # Read range is given end-to-start for reverse-strand reads
                rstart, rend = 1, readlen
                if strand == "-":
                    rstart, rend = rend, rstart

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
                fw.write("{0}\n".format(readline))
                fw.write("{0}\n".format(fill(readseq.seq)))

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def write_lst(bedfile):
    """
    Write one `<seqid>.lst` file per sequence found in `bedfile`.

    The files go into a directory named after the part of the BED file's
    basename before the first dot. Each line of a .lst file is the feature's
    accession (spaces removed) immediately followed by its strand.

    Returns `(pf, stanza)`: the directory name and a list of
    `(seqid, lst_filename)` tuples in the order yielded by `sub_beds()`.
    """
    pf = op.basename(bedfile).split(".")[0]
    mkdir(pf)
    bed = Bed(bedfile)
    stanza = []
    for seqid, bs in bed.sub_beds():
        fname = op.join(pf, "{0}.lst".format(seqid))
        # `with` guarantees the handle is released even if a record fails
        with open(fname, "w") as fw:
            for b in bs:
                fw.write("{0}{1}".format(b.accn.replace(" ", ""), b.strand)
                         + "\n")
        stanza.append((seqid, fname))

    return pf, stanza
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(tips.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    pbed = Bed(pbedfile, sorted=False)
    cbed = Bed(cbedfile, sorted=False)

    # seqid -> complement features, in file order (a later run of the same
    # seqid replaces an earlier one)
    complements = dict()
    for seqid, run in groupby(cbed, key=lambda x: x.seqid):
        complements[seqid] = list(run)

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping
    tbeds = []

    for name, run in groupby(pbed, key=lambda x: x.accn):
        run = list(run)
        first, last = run[0], run[-1]
        # A tip is only needed on a side where the patcher stops short of
        # the sequence end.
        start_id = None if first.start == 1 else first.seqid
        end_id = last.seqid
        if last.end == sizes[end_id]:
            end_id = None
        sys.stderr.write(
            " ".join(str(x) for x in (name, start_id, end_id)) + "\n")

        if start_id:
            leading = complements[start_id][0]
            leading.accn = name
            tbeds.append(leading)

        backbone = (name, 0, bbsizes[name], name, 1000, "+")
        tbeds.append(BedLine("\t".join(str(x) for x in backbone)))

        if end_id:
            trailing = complements[end_id][-1]
            trailing.accn = name
            tbeds.append(trailing)

    tbed = Bed()
    tbed.extend(tbeds)
    tbed.print_to_file("tips.bed")
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a failing exit
        # status, as the other commands do (a bare sys.exit(None) is 0).
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # Not a component of the AGP: pass the feature through untouched
            newbed.append(b)
            continue

        i, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        # Clip the feature to the part of the component used in the object;
        # features entirely outside of it are dropped.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == '-':
            # Reverse-oriented component: coordinates are mirrored
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.sort(key=newbed.nullkey)
    newbed.print_to_file()
def merge(args):
    """
    %prog merge map1 map2 map3 ...

    Convert csv maps to bed format.

    Each input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    scaffold_759,81336,1,9.7
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(merge.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    markers = Bed()
    mapnames = set()
    for row in fp:
        # The map name is the current input filename up to its first dot
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            marker = CSVMapLine(row, mapname=mapname)
            if marker.cm < 0:
                logging.error("Ignore marker with negative genetic distance")
                sys.stderr.write(row.strip() + "\n")
                continue
            markers.append(BedLine(marker.bedline))
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    markers.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".
                  format(len(markers), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    bed = uniqbed
    # Consecutive features are grouped by the seqid parsed from their name
    key = lambda x: range_parse(x.accn).seqid

    bed_fn = pf + ".patchers.bed"
    with open(bed_fn, "w") as bed_fw:
        for k, sb in groupby(bed, key=key):
            sb = list(sb)
            chr, start, end, strand = merge_ranges(sb)

            bed_fw.write("\t".join(str(x) for x in
                         (chr, start, end, opts.object, 1000, strand)) + "\n")
def breakpoint(args):
    """
    %prog breakpoint blastfile bedfile

    Identify breakpoints where collinearity ends. `blastfile` contains mapping
    from markers (query) to scaffolds (subject). `bedfile` contains marker
    locations in the related species.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.formats.blast import bed
    from jcvi.utils.range import range_interleave

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--xdist", type="int", default=20,
                 help="xdist (in related genome) cutoff [default: %default]")
    p.add_option("--ydist", type="int", default=200000,
                 help="ydist (in current genome) cutoff [default: %default]")
    p.add_option("-n", type="int", default=5,
                 help="number of markers in a block [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, bedfile = args
    order = Bed(bedfile).order
    # Convert the BLAST hits to BED first, then walk them per scaffold
    bbed = Bed(bed([blastfile]))

    for scaffold, hits in bbed.sub_beds():
        blocks = get_blocks(scaffold, hits, order,
                            xdist=opts.xdist, ydist=opts.ydist, N=opts.n)
        # Each block contributes the y-extent it covers on this scaffold
        spans = []
        for block in blocks:
            ys = [y for _, y in block]
            spans.append((scaffold, min(ys), max(ys)))

        # The stretches in between the blocks are reported as breakpoints
        for seqid, start, end in range_interleave(spans):
            sys.stdout.write(
                "{0}\t{1}\t{2}".format(seqid, start - 1, end) + "\n")
def eject(args):
    """
    %prog eject candidates.bed chr.fasta

    Eject scaffolds from assembly, using the range identified by closest().
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(eject.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    candidates, chrfasta = args

    # Complement of the candidate ranges with respect to the fasta sizes
    cbedfile = complementBed(candidates, Sizes(chrfasta).filename)
    cbed = Bed(cbedfile)

    # Name every complement after its sequence, on the forward strand
    for feature in cbed:
        feature.accn = feature.seqid
        feature.score = 1000
        feature.strand = '+'

    cbed.print_to_file()
def mergebed(args):
    """
    %prog mergebed map1.bed map2.bed map3.bed ...

    Combine bed maps to bed format, adding the map name.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(mergebed.__doc__)
    p.add_option("-w", "--weightsfile", default="weights.txt",
                 help="Write weights to file")
    p.set_outfile("out.bed")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    maps = args
    outfile = opts.outfile
    fp = must_open(maps)
    markers = Bed()
    mapnames = set()
    for row in fp:
        # The map name is the current input filename up to its first dot
        mapname = fp.filename().split(".")[0]
        mapnames.add(mapname)
        try:
            marker = BedLine(row)
            # Prefix the marker name with its map and remember where it came
            # from as an extra `seqid:start` column
            marker.accn = "{0}-{1}".format(mapname, marker.accn)
            marker.extra = ["{0}:{1}".format(marker.seqid, marker.start)]
            markers.append(marker)
        except (IndexError, ValueError):  # header or mal-formed line
            continue

    markers.print_to_file(filename=outfile, sorted=True)
    logging.debug("A total of {0} markers written to `{1}`.".
                  format(len(markers), outfile))

    assert len(maps) == len(mapnames), "You have a collision in map names"
    write_weightsfile(mapnames, weightsfile=opts.weightsfile)
def bed(args):
    '''
    %prog bed gff_file [--options]

    Parses the start, stop locations of the selected features out of GFF and
    generate a bed file
    '''
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(bed.__doc__)
    p.add_option("--type", dest="type", default="gene",
                 help="Feature type to extract, use comma for multiple [default: %default]")
    p.add_option("--key", dest="key", default="ID",
                 help="Key in the attributes to extract [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args

    # The literal string "None" on the command line means "no key"
    key = None if opts.key == "None" else opts.key
    wanted = set(x.strip() for x in opts.type.split(","))

    features = Bed()
    for g in Gff(gffile, key=key):
        if g.type in wanted:
            features.append(g.bedline)

    features.sort(key=features.key)
    features.print_to_file(opts.outfile)
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    import string

    p = OptionParser(rename.__doc__)
    p.add_option("-a", dest="gene_increment", default=10, type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b", dest="gap_increment", default=1000, type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--spad0", default=4, type="int",
                 help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix", default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        # Gap records are mixed into the gene list so that, after sorting,
        # they sit between the genes they separate.
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    # Output names are derived from the prefix *before* the JGI dot is added
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # A gap is followed by a regular gene increment, so only the difference
    # is added when the gap itself is encountered.
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    # The ids file is closed before it is reported as written below
    with open(idsfile, "w") as fw:
        for chr, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            # Sequences whose name starts with C/c are treated as chromosomes
            isChr = chr[0].upper() == 'C'
            digits = "".join(x for x in chr if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Records without a +/- strand are the gap records
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue
                else:
                    idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                fw.write("\t".join((oldaccn, accn)) + "\n")
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions joined by ";" in the .b.bed file belong to one group
    overlaps = Grouper()
    for s in Bed(bbedfile):
        overlaps.join(*s.accn.split(";"))

    name_opts = dict(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)

    for chr, sbed in Bed(abedfile).sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        # Ranks of the accessions that already carry this chromosome's number
        ranks = []
        on_chr = set()
        for s in sbed:
            achr, arank = atg_name(s.accn)
            if achr == current_chr:
                ranks.append(arank)
                on_chr.add(s.accn)

        lranks = longest_increasing_subsequence(ranks)
        sys.stderr.write(" ".join(str(x) for x in (
            current_chr, len(sbed), "==>", len(ranks),
            "==>", len(lranks))) + "\n")

        # Names (default and "te" separator) whose ranks are on the
        # longest increasing subsequence
        granks = set(gene_name(current_chr, x, **name_opts)
                     for x in lranks)
        granks |= set(gene_name(current_chr, x, sep="te", **name_opts)
                      for x in lranks)

        tagstore = {}
        for s in sbed:
            achr, arank = atg_name(s.accn)
            accn = s.accn
            if accn in granks:
                tagstore[accn] = (accn, FRAME)
            elif accn in on_chr:
                tagstore[accn] = (accn, RETAIN)
            else:
                tagstore[accn] = (".", NEW)

        # Find cases where genes overlap
        for s in sbed:
            accn = s.accn
            # Within a group, the member whose tag comes first in PRIORITY
            # wins (ties broken by accession); unknown members count as NEW
            best = min(
                (PRIORITY.index(tagstore[x][-1] if x in tagstore else NEW), x)
                for x in overlaps[accn]
            )[-1]

            tag = tagstore[accn] if accn == best else (best, OVERLAP)
            sys.stdout.write("\t".join((str(s), "|".join(tag))) + "\n")
def plot(args):
    """
    %prog plot tagged.new.bed chr1

    Plot gene identifiers along a particular chromosome, often to illustrate the
    gene id assignment procedure.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.graphics.base import plt, savefig
    from jcvi.graphics.chromosome import ChromosomeMap

    p = OptionParser(plot.__doc__)
    p.add_option("--firstn", type="int", help="Only plot the first N genes")
    p.add_option("--ymax", type="int", help="Y-axis max value")
    p.add_option("--log", action="store_true",
                 help="Write plotting data [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="6x4")

    if len(args) != 2:
        sys.exit(not p.print_help())

    taggedbed, chr = args
    bed = Bed(taggedbed)
    beds = list(bed.sub_bed(chr))
    # (gene index along the chromosome, rank parsed from the identifier)
    old, new = [], []
    i = 0
    for b in beds:
        # First extra column is `accn|TAG`
        accn = b.extra[0]
        if "te" in accn:
            continue

        accn, tag = accn.split("|")
        if tag == "OVERLAP":
            continue

        c, r = atg_name(accn)
        if tag == "NEW":
            new.append((i, r))
        else:
            old.append((i, r))
        i += 1

    ngenes = i
    assert ngenes == len(new) + len(old)

    logging.debug("Imported {0} ranks on {1}.".format(ngenes, chr))
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    xstart, xend = .2, .8
    ystart, yend = .2, .8
    pad = .02

    ngenes = opts.firstn or ngenes
    ymax = opts.ymax or 500000

    title = "Assignment of Medtr identifiers"
    # NOTE(review): the "first N genes" wording is selected by --ymax, not by
    # --firstn; confirm whether `opts.firstn` was intended here.
    if opts.ymax:
        subtitle = "{0}, first {1} genes".format(chr, ngenes)
    else:
        subtitle = "{0}, {1} genes ({2} new)".format(chr, ngenes, len(new))

    chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0,
                            ymax, 5, title, subtitle)

    ax = chr_map.axes

    if opts.log:
        from jcvi.utils.table import write_csv
        header = ["x", "y"]
        write_csv(header, new, filename=chr + ".new")
        write_csv(header, old, filename=chr + ".old")

    # zip(*[]) cannot be unpacked into x, y: skip a series that has no
    # points instead of crashing.
    if new:
        x, y = zip(*new)
        ax.plot(x, y, "b,")
    if old:
        x, y = zip(*old)
        ax.plot(x, y, "r,")

    # Legends
    ymid = (ystart + yend) / 2
    y = ymid + pad
    root.plot([.2], [y], "r.", lw=2)
    root.text(.2 + pad, y, "Existing Medtr ids", va="center", size=10)
    y = ymid - pad
    root.plot([.2], [y], "b.", lw=2)
    root.text(.2 + pad, y, "Newly instantiated ids", va="center", size=10)

    ax.set_xlim(0, ngenes)
    ax.set_ylim(0, ymax)
    ax.set_axis_off()

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = chr + ".identifiers." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def mask(args):
    """
    %prog mask agpfile bedfile

    Mask given ranges in componets to gaps.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the name of the masked (and re-indexed) AGP file.
    p = OptionParser(mask.__doc__)
    p.add_option("--split", default=False, action="store_true",
                 help="Split object and create new names [default: %default]")
    p.add_option("--log", default=False, action="store_true",
                 help="Write verbose logs to .masklog file [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` yields a failing exit
        # status, as the other commands do (a bare sys.exit(None) is 0).
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    # agp lines to replace original ones, keyed by the component
    agp_fixes = defaultdict(list)

    newagpfile = agpfile.replace(".agp", ".masked.agp")
    logfile = bedfile.replace(".bed", ".masklog")
    fw = open(newagpfile, "w")
    fwlog = open(logfile, "w") if opts.log else None

    try:
        for component, intervals in bed.sub_beds():
            if fwlog:
                fwlog.write("\n".join(str(x) for x in intervals) + "\n")
            i, a = simple_agp[component]
            object = a.object
            component_span = a.component_span
            orientation = a.orientation
            if fwlog:
                fwlog.write("{0}\n".format(a))

            # NOTE(review): this only checks that component_beg is truthy
            # (component_end is the assertion *message*); confirm whether a
            # comparison was intended.
            assert a.component_beg, a.component_end
            arange = a.component_beg, a.component_end

            # Make sure `ivs` contain DISJOINT ranges, and located within
            # `arange`
            ivs = []
            for interval in intervals:
                iv = range_intersect(arange, (interval.start, interval.end))
                if iv is not None:
                    ivs.append(iv)

            # Sort the ends of `ivs` as well as the arange
            arange = a.component_beg - 1, a.component_end + 1
            endpoints = sorted(flatten(ivs + [arange]))
            # reverse if component on negative strand
            if orientation == '-':
                endpoints.reverse()

            sum_of_spans = 0
            # assign complements as sequence components: consecutive endpoint
            # pairs alternate between kept sequence (even index) and masked
            # gap (odd index)
            for idx, (beg, end) in enumerate(pairwise(endpoints)):
                if orientation == '-':
                    beg, end = end, beg
                if orientation not in ('+', '-'):
                    orientation = '+'

                # `//` keeps the integer part number on every Python version
                oid = object + "_{0}".format(idx // 2) if opts.split else object
                aline = [oid, 0, 0, 0]
                if idx % 2 == 0:
                    cspan = end - beg - 1
                    aline += ['D', component, beg + 1, end - 1, orientation]
                    is_gap = False
                else:
                    cspan = end - beg + 1
                    aline += ["N", cspan, "fragment", "yes"]
                    is_gap = True
                if cspan <= 0:
                    continue

                sum_of_spans += cspan

                aline = "\t".join(str(x) for x in aline)
                # With --split the gaps are dropped: each kept piece becomes
                # its own object
                if not (opts.split and is_gap):
                    agp_fixes[component].append(aline)

                if fwlog:
                    fwlog.write(aline + "\n")

            # Kept pieces plus gaps must add up to the original component
            assert component_span == sum_of_spans
            if fwlog:
                fwlog.write("\n")

        # Finally write the masked agp
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write("{0}\n".format(a))
    finally:
        fw.close()
        if fwlog:
            fwlog.close()

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def simple(args):
    """
    %prog simple anchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Write the block ends for each block in the anchorfile.
    GeneA1 GeneA2 GeneB1 GeneB2 +/- score

    Optional additional columns:
    orderA1 orderA2 orderB1 orderB2 sizeA sizeB size block_id

    With base coordinates (--coords):
    block_id seqidA startA endA bpSpanA GeneA1 GeneA2 geneSpanA
    block_id seqidB startB endB bpSpanB GeneB1 GeneB2 geneSpanB
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(simple.__doc__)
    p.add_option("--rich", default=False, action="store_true", \
                help="Output additional columns [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                help="Output columns with base coordinates [default: %default]")
    p.add_option("--bed", default=False, action="store_true",
                help="Generate BED file for the blocks")
    p.add_option("--noheader", default=False, action="store_true",
                help="Don't output header [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    additional = opts.rich
    coords = opts.coords
    header = not opts.noheader
    bed = opts.bed
    if bed:
        # The BED output needs base coordinates
        coords = True
        bbed = Bed()

    ac = AnchorFile(anchorfile)
    simplefile = anchorfile.rsplit(".", 1)[0] + ".simple"

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    pf = "-".join(anchorfile.split(".", 2)[:2])
    blocks = ac.blocks

    if coords:
        h = "Block|Chr|Start|End|Span|StartGene|EndGene|GeneSpan|Orientation"
    else:
        # NOTE(review): the rows below are written as `score, orientation`,
        # i.e. in the opposite order of these last two header labels.
        h = "StartGeneA|EndGeneA|StartGeneB|EndGeneB|Orientation|Score"
        if additional:
            h += "|StartOrderA|EndOrderA|StartOrderB|EndOrderB|"\
                  "SizeA|SizeB|Size|Block"

    # Counted explicitly: the loop index is unbound when there are no blocks
    nblocks = 0
    atotalbase = btotalbase = 0
    with open(simplefile, "w") as fws:
        if header:
            fws.write("\t".join(h.split("|")) + "\n")

        for i, block in enumerate(blocks):
            nblocks = i + 1
            a, b, scores = zip(*block)
            # Replace accessions by their (order index, record) pairs
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)

            astarti, aendi = min(ia), max(ia)
            bstarti, bendi = min(ib), max(ib)
            astart, aend = min(a)[1].accn, max(a)[1].accn
            bstart, bend = min(b)[1].accn, max(b)[1].accn

            sizeA = len(set(ia))
            sizeB = len(set(ib))
            size = len(block)

            # Orientation is the sign of the slope of a line fitted through
            # the (query index, subject index) pairs
            slope, intercept = np.polyfit(ia, ib, 1)
            orientation = "+" if slope >= 0 else '-'
            aspan = aendi - astarti + 1
            bspan = bendi - bstarti + 1
            score = int((aspan * bspan) ** .5)
            score = str(score)
            block_id = pf + "-block-{0}".format(i)

            if coords:
                aseqid, astartbase, aendbase = \
                        get_boundary_bases(astart, aend, qorder)
                bseqid, bstartbase, bendbase = \
                        get_boundary_bases(bstart, bend, sorder)
                abase = aendbase - astartbase + 1
                bbase = bendbase - bstartbase + 1
                atotalbase += abase
                btotalbase += bbase

                # Write dual lines
                aargs = [block_id, aseqid, astartbase, aendbase,
                         abase, astart, aend, aspan, "+"]
                bargs = [block_id, bseqid, bstartbase, bendbase,
                         bbase, bstart, bend, bspan, orientation]

                if bed:
                    bbed.append(BedLine("\t".join(str(x) for x in \
                               (bseqid, bstartbase - 1, bendbase,
                               "{}:{}-{}".format(aseqid, astartbase, aendbase),
                               size, orientation))))

                for fields in (aargs, bargs):
                    fws.write("\t".join(str(x) for x in fields) + "\n")
                continue

            fields = [astart, aend, bstart, bend, score, orientation]
            if additional:
                fields += [astarti, aendi, bstarti, bendi,
                           sizeA, sizeB, size, block_id]
            fws.write("\t".join(str(x) for x in fields) + "\n")

    logging.debug("A total of {0} blocks written to `{1}`.".format(nblocks,
                  simplefile))

    if coords:
        sys.stderr.write("Total block span in {0}: {1}".format(qbed.filename, \
                        human_size(atotalbase, precision=2)) + "\n")
        sys.stderr.write("Total block span in {0}: {1}".format(sbed.filename, \
                        human_size(btotalbase, precision=2)) + "\n")
        # The ratio is undefined when either side spans nothing (for
        # instance with no blocks at all); skip it rather than divide by 0.
        smaller = min(atotalbase, btotalbase)
        if smaller > 0:
            sys.stderr.write("Ratio: {0:.1f}x".format(\
                        max(atotalbase, btotalbase) * 1. / smaller) + "\n")

    if bed:
        bedfile = simplefile + ".bed"
        bbed.print_to_file(filename=bedfile, sorted=True)
        logging.debug("Bed file written to `{}`".format(bedfile))
def main():
    """
    %prog bedfile id_mappings

    Takes a bedfile that contains the coordinates of features to plot on the
    chromosomes, and `id_mappings` file that map the ids to certain class. Each
    class will get assigned a unique color. `id_mappings` file is optional (if
    omitted, will not paint the chromosome features, except the centromere).
    """
    # NOTE: the docstring above is also the command-line usage text.
    p = OptionParser(main.__doc__)
    p.add_option("--title", default="Medicago truncatula v3.5",
                 help="title of the image [default: `%default`]")
    p.add_option("--gauge", default=False, action="store_true",
                 help="draw a gauge with size label [default: %default]")
    p.add_option("--imagemap", default=False, action="store_true",
                 help="generate an HTML image map associated with the image [default: %default]")
    p.add_option("--winsize", default=50000, type="int",
                 help="if drawing an imagemap, specify the window size (bases) of each map element "
                      "[default: %default bp]")
    opts, args, iopts = set_image_options(p, figsize="6x6", dpi=300)

    if len(args) not in (1, 2):
        # Non-zero exit status on usage errors, as the other commands do
        sys.exit(not p.print_help())

    bedfile = args[0]
    mappingfile = None
    if len(args) == 2:
        mappingfile = args[1]

    winsize = opts.winsize
    imagemap = opts.imagemap
    w, h = iopts.w, iopts.h
    dpi = iopts.dpi

    prefix = bedfile.rsplit(".", 1)[0]
    figname = prefix + "." + opts.format
    if imagemap:
        imgmapfile = prefix + '.map'
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        with open(mappingfile) as fp:
            mappings = dict(x.split() for x in fp)
        classes = sorted(set(mappings.values()))
        logging.debug("A total of {0} classes found: {1}".format(len(classes),
            ','.join(classes)))
    else:
        mappings = {}
        classes = []
        logging.debug("No classes registered (no id_mappings given).")

    # One single-letter matplotlib color per class, in sorted class order
    mycolors = "wrgbymc"
    class_colors = dict(zip(classes, mycolors))

    bed = Bed(bedfile)
    chr_lens = {}
    centromeres = {}
    # Chromosome length is taken as the largest feature end on that seqid
    for b, blines in groupby(bed, key=(lambda x: x.seqid)):
        blines = list(blines)
        maxlen = max(x.end for x in blines)
        chr_lens[b] = maxlen

    # Record centromere positions and replace each accn by its class ('-' if none)
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = '-'

    chr_number = len(chr_lens)
    assert chr_number == len(centromeres)

    fig = plt.figure(1, (w, h))
    root = fig.add_axes([0, 0, 1, 1])

    r = .7  # width and height of the whole chromosome set
    xstart, ystart = .15, .85
    xinterval = r / chr_number
    xwidth = xinterval * .5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (chr, cent_position) in enumerate(sorted(centromeres.items())):
        clen = chr_lens[chr]
        xx = xstart + a * xinterval + .5 * xwidth
        yy = ystart - cent_position * ratio
        root.text(xx, ystart + .01, _(chr), ha="center")
        ChromosomeWithCentromere(root, xx, ystart, yy,
                ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = .75
    # color the regions
    for chr in sorted(chr_lens.keys()):
        excess = 0
        bac_list = []
        for b in bed.sub_bed(chr):
            idx = chr_idxs[chr]
            klass = b.accn
            start = b.start
            end = b.end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(Rectangle((xx, yystart), xwidth, yyend - yystart,
                fc=class_colors.get(klass, "w"), lw=0, alpha=alpha))

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a result
                #             of iterating over `winsize` regions of `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    tlx, tly, brx, bry = xx, (1 - ystart) + segment_start * ratio, \
                                  xx + xwidth, (1 - ystart) + segment_end * ratio
                    print('\t' + write_ImageMapLine(tlx, tly, brx, bry,
                          w, h, dpi, chr+":"+",".join(bac_list),
                          segment_start, segment_end), file=mapfh)

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window of this chromosome
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = xx, (1 - ystart) + segment_start * ratio, \
                          xx + xwidth, (1 - ystart) + segment_end * ratio
            print('\t' + write_ImageMapLine(tlx, tly, brx, bry,
                  w, h, dpi, chr+":"+",".join(bac_list),
                  segment_start, segment_end), file=mapfh)

    if imagemap:
        print('</map>', file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if opts.gauge:
        tip = .008  # the ticks on the gauge bar
        extra = .006  # the offset for the unit label
        xstart, ystart = .9, .85
        yy = ystart
        gauge = int(ceil(max_chr_len / 1e6))
        mb = ratio * 1e6
        yinterval = 2 * mb
        root.plot([xstart, xstart], [yy, yy - r], 'b-', lw=2)
        # A tick every 2 Mb; every 10 Mb gets a longer, labelled tick
        for x in range(0, gauge, 2):
            if x % 10:
                root.plot([xstart, xstart + tip], [yy, yy], "b-")
            else:
                root.plot([xstart - tip, xstart + tip], [yy, yy], 'b-', lw=2)
                root.text(xstart + tip + extra, yy, _(x), color="gray", va="center")
            yy -= yinterval
        root.text(xstart, yy - .03, _("Mb"), color="gray", va="center")

    # class legends, four in a row
    xstart = .1
    xinterval = .2
    xwidth = .04
    yy = .08
    for klass, cc in sorted(class_colors.items()):
        if klass == '-':
            continue
        root.add_patch(Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0, alpha=alpha))
        root.text(xstart + xwidth + .01, yy, _(klass), fontsize=9)
        xstart += xinterval

    root.text(.5, .95, opts.title, fontstyle="italic", ha="center", va="center")
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    plt.savefig(figname, dpi=dpi)
    logging.debug("Figure saved to `{0}` {1}".format(figname, iopts))
def draw_chromosomes(
    root,
    bedfile,
    sizes,
    iopts,
    mergedist,
    winsize,
    imagemap,
    mappingfile=None,
    gauge=False,
    legend=True,
    empty=False,
    title=None,
):
    """Paint chromosomes and their classified features onto `root`.

    Args:
        root: axes-like object receiving the text and patches.
        bedfile: BED file with the features to paint; its prefix names the
            optional image map file (`<prefix>.map`).
        sizes: if truthy, passed to `Sizes` to obtain chromosome lengths;
            otherwise lengths are the largest feature end per seqid.
        iopts: image options; `w`, `h` and `dpi` are used for the image map.
        mergedist: a feature of the same class as the previous one that starts
            within this distance of the previous end is drawn from that end.
        winsize: window size (bases) of each image map element.
        imagemap: when truthy, write an HTML image map alongside the drawing.
        mappingfile: optional tab-delimited file mapping feature ids to
            classes; without it every distinct accn is its own class.
        gauge: when truthy, draw a `Gauge` to the right of the chromosomes.
        legend: when truthy, draw the class legend.
        empty: optional label for an extra unfilled legend box.
        title: optional title text.
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]

    if imagemap:
        imgmapfile = prefix + ".map"
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        logging.debug(
            "A total of {0} classes found: {1}".format(len(classes), ",".join(classes))
        )
    else:
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        for b, blines in groupby(bed, key=(lambda x: x.seqid)):
            blines = list(blines)
            maxlen = max(x.end for x in blines)
            chr_lens[b] = maxlen

    # Record centromere positions and replace each accn by its class ("-" if none)
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(
            centromeres
        ), "chr_number = {}, centromeres = {}".format(chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (chr, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(chr)), ha="center")
        if centromeres:
            yy = ystart - centromeres[chr] * ratio
            ChromosomeWithCentromere(
                root, xx, ystart, yy, ystart - clen * ratio, width=xwidth
            )
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = 1
    # color the regions
    for chr in sorted(chr_lens.keys()):
        excess = 0
        bac_list = []
        prev_end, prev_klass = 0, None
        for b in bed.sub_bed(chr):
            idx = chr_idxs[chr]
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            # Close small gaps between consecutive features of the same class
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                )
            )
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a result
                #             of iterating over `winsize` regions of `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    tlx, tly, brx, bry = (
                        xx,
                        (1 - ystart) + segment_start * ratio,
                        xx + xwidth,
                        (1 - ystart) + segment_end * ratio,
                    )
                    print(
                        "\t"
                        + write_ImageMapLine(
                            tlx,
                            tly,
                            brx,
                            bry,
                            iopts.w,
                            iopts.h,
                            iopts.dpi,
                            chr + ":" + ",".join(bac_list),
                            segment_start,
                            segment_end,
                        ),
                        file=mapfh,
                    )

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window of this chromosome
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = (
                xx,
                (1 - ystart) + segment_start * ratio,
                xx + xwidth,
                (1 - ystart) + segment_end * ratio,
            )
            print(
                "\t"
                + write_ImageMapLine(
                    tlx,
                    tly,
                    brx,
                    bry,
                    iopts.w,
                    iopts.h,
                    iopts.dpi,
                    chr + ":" + ",".join(bac_list),
                    segment_start,
                    segment_end,
                ),
                file=mapfh,
            )

    if imagemap:
        print("</map>", file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    if "centromere" in class_colors:
        del class_colors["centromere"]

    # class legends, four in a row
    if legend:
        xstart = 0.1
        # Guard against an empty legend (e.g. only centromeres were present)
        xinterval = 0.8 / max(len(class_colors), 1)
        xwidth = 0.04
        yy = 0.08
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0, alpha=alpha)
            )
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

        if empty:
            root.add_patch(Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
            root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # NOTE: the docstring above is also the command-line usage text. It is a
    # raw string so the regex backslashes are not read as (invalid) escapes.
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric" \
                    + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature" \
                    + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n" \
            + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n" \
            + "Use if resolving ambiguities based on `overlap` length\n" \
            + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".\
            format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".\
                    format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".\
                    format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".\
            format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Accessions joined by ";" in the consolidated bed belong to one pile
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline, abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    # Context manager guarantees the file is flushed before the in-place sort
    with open(abedfile, "w") as afh:
        for accn in abedline:
            print(abedline[accn], file=afh)

    sort([abedfile, "-i"])
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    # NOTE: the docstring above is also the command-line usage text.
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile = args[0]

    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions sharing a ";"-joined name in the b-bed form one group
    g = Grouper()
    for feature in Bed(bbedfile):
        g.join(*feature.accn.split(";"))

    for chr, sbed in Bed(abedfile).sub_beds():
        current_chr = chr_number(chr)
        if not current_chr:
            continue

        # Collect ranks of the genes whose name places them on this chromosome
        ranks = []
        gg = set()
        for feature in sbed:
            name = feature.accn
            achr, arank = atg_name(name)
            if achr == current_chr:
                ranks.append(arank)
                gg.add(name)

        lranks = longest_increasing_subsequence(ranks)
        print(current_chr, len(sbed), "==>", len(ranks),
              "==>", len(lranks), file=sys.stderr)

        # Names (gene and "te" flavours) of the ranks kept in the LIS
        granks = set(gene_name(current_chr, x, prefix=opts.prefix,
                               pad0=opts.pad0, uc=opts.uc) for x in lranks)
        granks |= set(gene_name(current_chr, x, prefix=opts.prefix,
                                pad0=opts.pad0, sep="te", uc=opts.uc)
                      for x in lranks)

        tagstore = {}
        for feature in sbed:
            # Result unused; the call is kept as in the original code
            achr, arank = atg_name(feature.accn)
            name = feature.accn
            if name in granks:
                tagstore[name] = (name, FRAME)
            elif name in gg:
                tagstore[name] = (name, RETAIN)
            else:
                tagstore[name] = (".", NEW)

        # Find cases where genes overlap
        for feature in sbed:
            name = feature.accn
            # Best member of the group: lowest PRIORITY index, then name
            candidates = [
                (PRIORITY.index(tagstore[x][-1] if x in tagstore else NEW), x)
                for x in g[name]
            ]
            best = min(candidates)[-1]
            tag = tagstore[name] if name == best else (best, OVERLAP)

            print("\t".join((str(feature), "|".join(tag))))
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.
    """
    # NOTE: the docstring above is also the command-line usage text.
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option("--extended_stride", default=False, action="store_true",
                 help="Toggle extended strides for gene numbering")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"
    # The tag is whatever follows the last "|" of a "name|tag" string
    tagkey = lambda x: x.rsplit("|", 1)[-1]

    # Context manager closes the output even if allocation or the assert fails
    with open(outputbed, "w") as fw:
        for chr, sbed in bed.sub_beds():
            current_chr = chr_number(chr)
            if not current_chr:
                continue

            sbed = list(sbed)

            # Keep only NEW and FRAME entries, remembering their position
            ranks = []
            for i, s in enumerate(sbed):
                nametag = s.extra[0]
                tag = tagkey(nametag)

                if tag in (NEW, FRAME):
                    ranks.append((i, nametag))

            # Consecutive runs: NEW runs carry bed lines, other runs carry the
            # rank of their first and last name
            blocks = []
            for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
                names = list(names)
                if tag == NEW:
                    blocks.append((tag, [sbed[x[0]] for x in names]))
                else:
                    start, end = names[0][-1], names[-1][-1]
                    start, end = atg_name(start, retval="rank"), atg_name(end, retval="rank")
                    blocks.append((tag, [start, end]))

            id_table = {}  # old to new name conversion
            for i, (tag, info) in enumerate(blocks):
                if tag != NEW:
                    continue

                # Bounds come from the neighbouring runs; open-ended at the edges
                start_id = 0 if i == 0 else blocks[i - 1][1][-1]
                end_id = start_id + 10000 if i == len(blocks) - 1 \
                            else blocks[i + 1][1][0]

                r.allocate(info, chr, start_id, end_id, id_table,
                           extended_stride=opts.extended_stride)

            # Output new names
            for s in sbed:
                nametag = s.extra[0]
                name, tag = nametag.split("|")

                if tag == NEW:
                    assert name == '.'
                    name = id_table[s.accn]
                elif tag == OVERLAP:
                    if name in id_table:
                        name = id_table[name]

                s.extra[0] = "|".join((name, tag))
                print(s, file=fw)
def heatmap(args):
    """
    %prog heatmap fastafile chr1

    Combine stack plot with heatmap to show abundance of various tracks along
    given chromosome. Need to give multiple beds to --stacks and --heatmaps
    """
    # NOTE: the docstring above is also the command-line usage text.
    p = OptionParser(heatmap.__doc__)
    p.add_option("--stacks",
                 default="Exons,Introns,DNA_transposons,Retrotransposons",
                 help="Features to plot in stackplot [default: %default]")
    p.add_option("--heatmaps",
                 default="Copia,Gypsy,hAT,Helitron,Introns,Exons",
                 help="Features to plot in heatmaps [default: %default]")
    p.add_option("--meres", default=None,
                 help="Extra centromere / telomere features [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, chr = args
    window, shift, subtract = check_window_options(opts)

    stacks = opts.stacks.split(",")
    heatmaps = opts.heatmaps.split(",")
    stackbeds = get_beds(stacks)
    heatmapbeds = get_beds(heatmaps)
    stackbins = get_binfiles(stackbeds, fastafile, shift, subtract=subtract)
    heatmapbins = get_binfiles(heatmapbeds, fastafile, shift, subtract=subtract)

    margin = .06
    inner = .015
    clen = Sizes(fastafile).mapping[chr]

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge
    ratio = draw_gauge(root, margin, clen, rightmargin=4 * margin)
    yinterval = .3
    xx = margin
    yy = 1 - margin
    yy -= yinterval
    xlen = clen / ratio
    # Short label, e.g. "chr_1" -> "C1"
    cc = chr
    if "_" in chr:
        ca, cb = chr.split("_")
        cc = ca[0].upper() + cb

    root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray))
    ax = fig.add_axes([xx, yy, xlen, yinterval - inner])

    nbins = get_nbins(clen, shift)

    # Widen the window to ~1% of the chromosome, rounded down to a multiple of
    # `shift`. Floor division keeps this an integer under true division too.
    owindow = clen // 100
    if owindow > window:
        window = owindow // shift * shift

    stackplot(ax, stackbins, nbins, palette, chr, window, shift)
    ax.text(.1, .9, cc, va="top", zorder=100, transform=ax.transAxes,
            bbox=dict(boxstyle="round", fc="w", alpha=.5))

    # Legends
    xx += xlen + .01
    yspace = (yinterval - inner) / (len(stackbins) + 1)
    yy = 1 - margin - yinterval
    for s, color in zip(stacks, palette):
        s = s.replace("_", " ")
        s = Registration.get(s, s)

        yy += yspace
        root.add_patch(Rectangle((xx, yy), inner, inner, color=color, lw=0))
        root.text(xx + 1.5 * inner, yy, s, size=10)

    yh = .05  # Heatmap height
    # Heatmaps
    xx = margin
    yy = 1 - margin - yinterval - inner
    for s, binfile in zip(heatmaps, heatmapbins):
        s = s.replace("_", " ")
        s = Registration.get(s, s)

        yy -= yh
        m = stackarray(binfile, chr, window, shift)

        # Two identical rows give the 1-D track some height for imshow
        Y = np.array([m, m])
        root.imshow(Y, extent=(xx, xx + xlen, yy, yy + yh - inner),
                    interpolation="nearest", aspect="auto")
        root.text(xx + xlen + .01, yy, s, size=10)

    yy -= yh

    meres = opts.meres
    if meres:
        bed = Bed(meres)
        for b in bed:
            if b.seqid != chr:
                continue
            pos = (b.start + b.end) / 2
            cpos = pos / ratio
            xx = margin + cpos
            accn = b.accn.capitalize()
            root.add_patch(CirclePolygon((xx, yy), radius=.01, fc="m", ec="m"))
            root.text(xx + .014, yy, accn, va="center", color="m")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = chr + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def composite(args):
    """
    %prog composite fastafile chr1

    Combine line plots, feature bars and alt-bars, different data types
    specified in options. Inputs must be BED-formatted. Three types of viz are
    currently supported:

    --lines: traditional line plots, useful for plotting feature freq
    --bars: show where the extent of features are
    --altbars: similar to bars, yet in two alternating tracks, e.g. scaffolds
    """
    # NOTE: the docstring above is also the command-line usage text.
    from jcvi.graphics.chromosome import HorizontalChromosome

    p = OptionParser(composite.__doc__)
    p.add_option("--lines",
                 help="Features to plot in lineplot [default: %default]")
    p.add_option("--bars",
                 help="Features to plot in bars [default: %default]")
    p.add_option("--altbars",
                 help="Features to plot in alt-bars [default: %default]")
    p.add_option("--fatten", default=False, action="store_true",
                 help="Help visualize certain narrow features [default: %default]")
    p.add_option("--mode", default="span", choices=("span", "count", "score"),
                 help="Accumulate feature based on [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, chr = args
    window, shift, subtract = check_window_options(opts)
    fatten = opts.fatten

    # Each track option is a comma-separated list; absent options give no beds
    linebeds = get_beds(opts.lines.split(",")) if opts.lines else []
    barbeds = get_beds(opts.bars.split(",")) if opts.bars else []
    altbarbeds = get_beds(opts.altbars.split(",")) if opts.altbars else []

    linebins = get_binfiles(linebeds, fastafile, shift, mode=opts.mode)

    margin = .12
    clen = Sizes(fastafile).mapping[chr]
    nbins = get_nbins(clen, shift)

    plt.rcParams["xtick.major.size"] = 0
    plt.rcParams["ytick.major.size"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    root.text(.5, .95, chr, ha="center", color="darkslategray")

    xstart, xend = margin, 1 - margin
    xlen = xend - xstart
    ratio = xlen / clen

    # Line plots
    ax = fig.add_axes([xstart, .6, xlen, .3])
    lineplot(ax, linebins, nbins, chr, window, shift)

    def to_canvas(base):
        """Convert a base position into a canvas x coordinate."""
        return xstart + ratio * base

    # Bar plots
    yy = .5
    yinterval = .08
    r = .01
    fattend = .0025
    for bedname in barbeds:
        root.text(xend + .01, yy, bedname.split(".")[0], va="center")
        HorizontalChromosome(root, xstart, xend, yy, height=.02)
        for feature in Bed(bedname):
            start = to_canvas(feature.start)
            end = to_canvas(feature.end)
            span = end - start
            # Optionally widen very narrow features so they stay visible
            if fatten and span < fattend:
                span = fattend

            root.add_patch(Rectangle((start, yy - r), span, 2 * r,
                                     lw=0, fc="darkslategray"))

        yy -= yinterval

    # Alternative bar plots
    offset = r / 2
    for bedname in altbarbeds:
        root.text(xend + .01, yy, bedname.split(".")[0], va="center")
        for feature in Bed(bedname):
            start = to_canvas(feature.start)
            end = to_canvas(feature.end)
            span = end - start
            if span < .0001:
                continue
            # Flip sides so consecutive features alternate between two tracks
            offset = -offset
            root.add_patch(Rectangle((start, yy + offset), end - start, .003,
                                     lw=0, fc="darkslategray"))

        yy -= yinterval

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = chr + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """Group genes into families of proximal paralogues from BLAST hits.

    Args:
        blast_file: BLAST tabular file, parsed line by line with `BlastLine`.
        cds_file: FASTA file providing the sequence sizes.
        bed_file: BED file providing gene order and seqids.
        N: maximum distance (in gene order) between family members; replaced
            by 1e5 when `genefam` is set.
        P: minimum hit length as a percentage of the shorter sequence.
        is_self: if True, filter hits directly by distance in `bed.order`;
            otherwise group all passing hits first and then join neighbours
            in the bed that are homologous.
        evalue: maximum E-value for a hit to be used.
        strip_name: passed to `gene_name` when normalising hit names.
        ofile: output target handed to `must_open`.
        genefam: if True, drop the same-seqid requirement.

    Returns:
        List of families (each a sorted list of names) with >= 2 members.
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                qi, q = order[query]
                si, s = order[subject]

                if abs(qi - si) <= N and b.evalue <= evalue:
                    if genefam:
                        g.join(query, subject)
                    elif q.seqid == s.seqid:
                        g.join(query, subject)

    else:
        homologs = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue
                if b.evalue > evalue:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    # Stop at the start of the bed: a negative index would
                    # wrap around (or raise IndexError on a short bed)
                    if i - x < 0:
                        break
                    prev = bed[i - x]
                    if prev.seqid == atom.seqid and \
                            homologs.joined(prev.accn, atom.accn):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[prev.accn]
                        # Skip pairs whose lengths differ too much
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(prev.accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            print(",".join(members), file=fw)
            ngenes += len(group)
            nfamilies += 1
            families.append(members)

    # max() raises on an empty sequence, so report an empty family instead
    longest_family = max(families, key=len) if families else []

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print("Longest families (%d): %s" % (len(longest_family),
          ",".join(longest_family)), file=sys.stderr)

    return families
def install(args):
    """
    %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta

    Install patches into backbone, using sequences from alternative assembly.
    The patches sequences are generated via jcvi.assembly.patch.fill().

    The output is a bedfile that can be converted to AGP using
    jcvi.formats.agp.frombed().
    """
    # NOTE: the docstring above is also the command-line usage text.
    from jcvi.apps.align import blast
    from jcvi.formats.fasta import SeqIO

    p = OptionParser(install.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--maxsize", default=300000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    p.add_option("--prefix",
                 help="Prefix of the new object [default: %default]")
    p.add_option("--strict", default=False, action="store_true",
                 help="Only update if replacement has no gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    pbed, pfasta, bbfasta, altfasta = args
    maxsize = opts.maxsize  # Max DNA size to replace gap
    rclip = opts.rclip

    blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"])
    order = Bed(pbed).order
    beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip,
                                           maxsize=maxsize)

    beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True)
    afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True)

    def count_Ns(rec):
        """Number of gap characters (either case) in a record's sequence."""
        return rec.seq.count('n') + rec.seq.count('N')

    # Exclude the replacements that contain more Ns than before
    before_records = SeqIO.parse(beforefasta, "fasta")
    after_records = SeqIO.parse(afterfasta, "fasta")
    exclude = set()
    for before_rec, after_rec in zip(before_records, after_records):
        before_ns = count_Ns(before_rec)
        after_ns = count_Ns(after_rec)
        # Strict mode accepts only gap-free replacements; otherwise the
        # replacement must have fewer Ns than what it replaces
        if opts.strict:
            acceptable = after_ns == 0
        else:
            acceptable = after_ns < before_ns
        if not acceptable:
            exclude.add(before_rec.id)

    logging.debug("Ignore {0} updates because of decreasing quality."\
                    .format(len(exclude)))

    before_lines = Bed(beforebed, sorted=False)
    after_lines = Bed(afterbed, sorted=False)
    before_kept = [x for x in before_lines if x.accn not in exclude]
    after_kept = [x for x in after_lines if x.accn not in exclude]

    abedfile = "before.filtered.bed"
    bbedfile = "after.filtered.bed"
    afbed = Bed()
    afbed.extend(before_kept)
    bfbed = Bed()
    bfbed.extend(after_kept)

    afbed.print_to_file(abedfile)
    bfbed.print_to_file(bbedfile)

    shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    # NOTE: the docstring above is also the command-line usage text.
    import string

    p = OptionParser(rename.__doc__)
    p.add_option("-a", dest="gene_increment", default=10, type="int",
                 help="Increment for continuous genes [default: %default]")
    p.add_option("-b", dest="gap_increment", default=1000, type="int",
                 help="Increment for gaps [default: %default]")
    p.add_option("--pad0", default=6, type="int",
                 help="Pad gene identifiers with 0 [default: %default]")
    p.add_option("--spad0", default=4, type="int",
                 help="Pad gene identifiers on small scaffolds [default: %default]")
    p.add_option("--prefix", default="Bo",
                 help="Genome prefix [default: %default]")
    p.add_option("--jgi", default=False, action="store_true",
                 help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1" + \
                      " [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    genes = Bed(genebed)
    if gapbed:
        # Gap lines are merged into the gene list and sorted together below
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # A gap is followed by a gene increment, so subtract that part here
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    # Context manager ensures the ids file is flushed and closed before the
    # "written" messages below are logged
    with open(idsfile, "w") as fw:
        for chr, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            # Sequences with more than 1000 lines get the wider padding
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            isChr = chr[0].upper() == 'C'
            digits = "".join(x for x in chr if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Lines without a +/- strand are treated as gaps
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue
                else:
                    idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                print("\t".join((oldaccn, accn)), file=fw)
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (blastfile, anchorfile plus options).
    # Writes `<q>.pad.bed`, `<s>.pad.bed` and `<q>.<s>.logmp.txt`, then runs the
    # external `cluster` binary when the .cdt output is out of date.
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    block_id = 0
    for block in ac.iter_blocks(minsize=minsize):
        # zip() returns an iterator on Python 3, so materialize before slicing.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_id += 1

        qr = Range("0", minq, maxq, maxq - minq, block_id)
        sr = Range("0", mins, maxs, maxs - mins, block_id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    # One header row of subject PAD names, then one row per query PAD.
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        for i in range(m):
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (breakpoints.bed, gaps.bed, options).
    # Returns the name of the merged `<pf>.refined.bed` file; intermediate
    # files are removed before returning.
    p = OptionParser(refine.__doc__)
    p.add_option("--closest", default=False, action="store_true",
                 help="In case of no gaps, use closest [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args
    # Column count of the breakpoints file, taken from its first line; used
    # below to split intersectBed output into "breakpoint" and "gap" halves.
    with open(breakpointsbed) as fp:
        ncols = len(next(fp).split())
    logging.debug("File {0} contains {1} columns.".format(breakpointsbed, ncols))
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    with open(ingapsbed) as fp:
        data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    with open(nogapsbed, "w") as nogapsfw, \
            open(largestgapsbed, "w") as largestgapsfw:
        for b, gaps in groupby(data, key=lambda x: x[:ncols]):
            gaps = list(gaps)
            gap = gaps[0]
            # A single row whose last column (overlap size) is "0" means the
            # breakpoint region intersects no gap.
            if len(gaps) == 1 and gap[-1] == "0":
                assert gap[-3] == "."
                nogapsfw.write("\t".join(b) + "\n")
                continue

            # Keep the row with the largest overlap.
            gaps = [(int(x[-1]), x) for x in gaps]
            maxgap = max(gaps)[1]
            largestgapsfw.write("\t".join(maxgap) + "\n")

    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            # Floor division keeps the midpoint an integer coordinate on both
            # Python 2 and Python 3.
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (one MSTmap file plus options).
    # Writes `<map>.subsample.bed` and `<map>.subsample.matrix` (reused on later
    # runs when up to date) and saves a heatmap image. Returns None.
    import numpy as np
    from random import sample

    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample", default=500, type="int",
                 help="Subsample markers to speed up [default: %default]")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap, = args
    subsample = opts.subsample
    data = MSTMap(mstmap)
    # Take random subsample while keeping marker order
    if subsample < data.nmarkers:
        data = [data[x] for x in
                sorted(sample(range(len(data)), subsample))]

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"

    if need_update(mstmap, (markerbedfile, ldmatrix)):
        nmarkers = len(data)
        # The marker bed is read back below via Bed(markerbedfile), so it must
        # be fully written and closed here.
        with open(markerbedfile, "w") as fw:
            fw.write("\n".join(x.bedline for x in data) + "\n")
        logging.debug("Write marker set of size {0} to file `{1}`."
                      .format(nmarkers, markerbedfile))

        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            a = data[i]
            b = data[j]
            M[i, j] = calc_ldscore(a.genotype, b.genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1})."
                      .format(ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([.1, .1, .8, .8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    chr_labels = []
    ignore_size = 20

    for (seqid, beg, end) in bed.get_breaks():
        # Groups spanning fewer than `ignore_size` markers get no divider line
        # and no label.
        ignore = abs(end - beg) < ignore_size
        pos = (beg + end) / 2
        chr_labels.append((seqid, pos, ignore))
        if ignore:
            continue
        ax.plot((end, end), extent, "w-", lw=1)
        ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels
    for label, pos, ignore in chr_labels:
        # Map the marker index onto the [.1, .9] span occupied by the heatmap.
        pos = .1 + pos * .8 / xsize
        if not ignore:
            root.text(pos, .91, label,
                      ha="center", va="bottom", rotation=45, color="grey")
            root.text(.09, pos, label,
                      ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    # NOTE(review): the legend uses `default_cm` while the heatmap uses
    # `iopts.cmap`; confirm whether both should follow the --cmap option.
    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=default_cm)

    root.add_patch(Rectangle((.1, .1), .8, .8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    root.text(.5, .06, "Linkage Disequilibrium between {0} markers".format(m),
              ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample" + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def cut(args):
    """
    %prog cut agpfile bedfile

    Cut at the boundaries of the ranges in the bedfile. Use --shrink to control
    the exact boundaries where you cut.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # NOTE(review): no `--shrink` option is registered in this function; the
    # usage text may be stale.
    #
    # args: list of command-line tokens (agpfile, bedfile).
    # Returns the name of the re-indexed `*.cut.agp` file.
    p = OptionParser(cut.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, bedfile = args

    agp = AGP(agpfile)
    bed = Bed(bedfile)
    simple_agp = agp.order
    newagpfile = agpfile.replace(".agp", ".cut.agp")

    # component id => replacement AGP lines (one per piece after cutting)
    agp_fixes = defaultdict(list)
    for component, intervals in bed.sub_beds():
        _, aline_obj = simple_agp[component]
        obj = aline_obj.object
        component_span = aline_obj.component_span
        orientation = aline_obj.orientation

        # Kept as in the original: asserts component_beg is truthy and uses
        # component_end as the assertion message.
        assert aline_obj.component_beg, aline_obj.component_end

        cuts = set()
        for interval in intervals:
            start, end = interval.start, interval.end
            end -= 1

            assert start <= end

            cuts.add(start)
            cuts.add(end)

        # Always include both component ends so the pieces tile the component.
        cuts.add(0)
        cuts.add(component_span)
        cuts = sorted(cuts)

        sum_of_spans = 0
        for i, (a, b) in enumerate(pairwise(cuts)):
            oid = obj + "_{0}".format(i)
            aline = [oid, 0, 0, 0]
            cspan = b - a
            aline += ['D', component, a + 1, b, orientation]
            sum_of_spans += cspan

            aline = "\t".join(str(x) for x in aline)
            agp_fixes[component].append(aline)

        assert component_span == sum_of_spans

    # Finally write the masked agp
    with open(newagpfile, "w") as fw:
        for a in agp:
            if not a.is_gap and a.component_id in agp_fixes:
                fw.write("\n".join(agp_fixes[a.component_id]) + "\n")
            else:
                fw.write(str(a) + "\n")

    # Reindex
    idxagpfile = reindex([newagpfile])
    shutil.move(idxagpfile, newagpfile)

    return newagpfile
def shuffle_twobeds(afbed, bfbed, bbfasta, prefix=None):
    # Interleave the features of `bfbed` into the intervals left between the
    # features of `afbed`, one sequence at a time, then append every sequence
    # listed in the sizes of `bbfasta` that `afbed` did not cover. The result
    # is written to "shuffled.bed" and returned as a Bed object.
    #
    # When `prefix` is set, every emitted record is relabelled with
    # `<prefix><zero-padded running number>`.
    sz = Sizes(bbfasta)
    sizes = sz.mapping
    outfile = "shuffled.bed"
    border = bfbed.order

    merged = []
    afbed.sort(key=afbed.nullkey)
    # Width of the zero-padded counter is derived from the number of sequences.
    pad = int(math.log10(len(sizes))) + 1
    counter = 0
    visited = set()

    def make_accn(number):
        return "{0}{1:0{2}d}".format(prefix, number, pad)

    for seqid, afeats in afbed.sub_beds():
        counter += 1
        slots, inserts = [], []
        seqsize = sizes[seqid]
        occupied = [(f.seqid, f.start, f.end) for f in afeats]
        for crange in range_interleave(occupied, sizes={seqid: seqsize},
                                       empty=True):
            if not crange:
                # Empty interval: keep a placeholder so the alternation with
                # `inserts` stays aligned.
                slots.append(None)
                continue
            seqid, start, end = crange
            fields = (seqid, start - 1, end)
            slots.append(BedLine("\t".join(map(str, fields))))

        for afeat in afeats:
            _, bfeat = border[afeat.accn]
            if afeat.strand == '-':
                # Flip the strand of the inserted feature.
                flipped = '-' if bfeat.strand == '+' else '+'
                bfeat.extra[1] = flipped
                bfeat.strand = flipped
            inserts.append(bfeat)

        n_abeds, n_bbeds = len(slots), len(inserts)
        assert n_abeds - n_bbeds == 1, \
            "abeds: {0}, bbeds: {1}".format(n_abeds, n_bbeds)

        woven = [x for x in roundrobin(slots, inserts) if x]
        if prefix:
            label = None
            for rec in woven:
                label = make_accn(counter)
                rec.accn = label

        merged.extend(woven)
        visited.add(seqid)

    # Singletons
    for seqid, seqsize in sz.iter_sizes():
        if seqid in visited:
            continue

        fields = (seqid, 0, seqsize, make_accn(counter))
        rec = BedLine("\t".join(map(str, fields)))

        counter += 1
        if prefix:
            rec.accn = make_accn(counter)

        merged.append(rec)

    shuffledbed = Bed()
    shuffledbed.extend(merged)
    shuffledbed.print_to_file(outfile)

    return shuffledbed
def main():
    """
    %prog bedfile id_mappings

    Takes a bedfile that contains the coordinates of features to plot on the
    chromosomes, and `id_mappings` file that map the ids to certain class. Each
    class will get assigned a unique color. `id_mappings` file is optional (if
    omitted, will not paint the chromosome features, except the centromere).
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Takes no arguments (none are passed to set_image_options). Saves the
    # figure as `<bedfile prefix>.<format>` and, with --imagemap, also writes
    # `<bedfile prefix>.map`.
    p = OptionParser(main.__doc__)
    p.add_option("--title", default="Medicago truncatula v3.5",
                 help="title of the image [default: `%default`]")
    p.add_option("--gauge", default=False, action="store_true",
                 help="draw a gauge with size label [default: %default]")
    p.add_option("--imagemap", default=False, action="store_true",
                 help="generate an HTML image map associated with the image [default: %default]")
    p.add_option("--winsize", default=50000, type="int",
                 help="if drawing an imagemap, specify the window size (bases) of each map element "
                      "[default: %default bp]")
    p.add_option("--empty", help="Write legend for unpainted region")
    opts, args, iopts = p.set_image_options(figsize="6x6", dpi=300)

    if len(args) not in (1, 2):
        # `not None` is True, so a usage error exits with a non-zero status,
        # like the other commands in this file.
        sys.exit(not p.print_help())

    bedfile = args[0]
    mappingfile = None
    if len(args) == 2:
        mappingfile = args[1]

    winsize = opts.winsize
    imagemap = opts.imagemap
    w, h = iopts.w, iopts.h
    dpi = iopts.dpi

    prefix = bedfile.rsplit(".", 1)[0]
    figname = prefix + "." + opts.format
    if imagemap:
        imgmapfile = prefix + '.map'
        mapfh = open(imgmapfile, "w")
        mapfh.write('<map id="' + prefix + '">' + "\n")

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        logging.debug("A total of {0} classes found: {1}".format(len(classes),
                      ','.join(classes)))
    else:
        mappings = {}
        classes = []
        logging.debug("No classes registered (no id_mappings given).")

    mycolors = "rgbymc"
    # zip() truncates, so classes beyond the six colors get no entry and are
    # painted white through the .get(klass, "w") fallback below.
    class_colors = dict(zip(classes, mycolors))

    bed = Bed(bedfile)
    chr_lens = {}
    centromeres = {}
    for seqid, blines in groupby(bed, key=(lambda x: x.seqid)):
        blines = list(blines)
        chr_lens[seqid] = max(x.end for x in blines)

    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        # Replace each accession with its class, or '-' when unmapped.
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = '-'

    chr_number = len(chr_lens)
    if centromeres:
        assert chr_number == len(centromeres)

    fig = plt.figure(1, (w, h))
    root = fig.add_axes([0, 0, 1, 1])

    r = .7  # width and height of the whole chromosome set
    xstart, ystart = .15, .85
    xinterval = r / chr_number
    xwidth = xinterval * .5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    def write_area(xx, seqid, names, seg_start, seg_end):
        # Emit one image-map element covering [seg_start, seg_end] on `seqid`.
        tlx, tly = xx, (1 - ystart) + seg_start * ratio
        brx, bry = xx + xwidth, (1 - ystart) + seg_end * ratio
        mapfh.write('\t' + write_ImageMapLine(tlx, tly, brx, bry,
                    w, h, dpi, seqid + ":" + ",".join(names),
                    seg_start, seg_end) + "\n")

    # first the chromosomes
    for a, (seqid, clen) in enumerate(sorted(chr_lens.items())):
        xx = xstart + a * xinterval + .5 * xwidth
        root.text(xx, ystart + .01, seqid, ha="center")
        if centromeres:
            yy = ystart - centromeres[seqid] * ratio
            ChromosomeWithCentromere(root, xx, ystart, yy,
                                     ystart - clen * ratio, width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = .75
    # color the regions
    for seqid in sorted(chr_lens.keys()):
        excess = 0
        bac_list = []
        for b in bed.sub_bed(seqid):
            idx = chr_idxs[seqid]
            klass = b.accn
            start = b.start
            end = b.end
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(Rectangle((xx, yystart), xwidth, yyend - yystart,
                           fc=class_colors.get(klass, "w"), lw=0, alpha=alpha))

            if imagemap:
                # `segment` : size of current BAC being investigated + `excess`
                # `excess`  : left-over bases from the previous BAC, as a
                #             result of iterating over `winsize` regions of
                #             `segment`
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    write_area(xx, seqid, bac_list, segment_start, segment_end)

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # Flush the trailing partial window of this chromosome; `b`, `end` and
        # `xx` still hold the values of its last feature.
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            write_area(xx, seqid, bac_list, segment_start, segment_end)

    if imagemap:
        mapfh.write('</map>' + "\n")
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if opts.gauge:
        xstart, ystart = .9, .85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    # class legends, four in a row
    xstart = .1
    xinterval = .2
    xwidth = .04
    yy = .08
    for klass, cc in sorted(class_colors.items()):
        if klass == '-':
            continue
        root.add_patch(Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0,
                       alpha=alpha))
        root.text(xstart + xwidth + .01, yy, klass, fontsize=10)
        xstart += xinterval

    empty = opts.empty
    if empty:
        root.add_patch(Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
        root.text(xstart + xwidth + .01, yy, empty, fontsize=10)

    root.text(.5, .95, opts.title, fontstyle="italic", ha="center", va="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig(figname, dpi=dpi, iopts=iopts)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (one anchors file plus options).
    # Writes one BED record per anchored (query, subject) pair to --outfile.
    # Raises ValueError when the aligned sequence id contains no number.
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements",
    )
    p.add_option(
        "--scale", type="float", help="Scale the aligned map distance by factor"
    )
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue
        for partner in pairs[q.accn]:
            si, s = sorder[partner]
            # Choose the roles afresh for every pair. Swapping shared q-side
            # variables in place would leak the previous partner's values into
            # the next record when a query has several anchors.
            ref, aligned = (s, q) if switch else (q, s)
            aligned_seqid = aligned.seqid
            aligned_start = aligned.start
            if scale:
                aligned_start /= scale
            try:
                newsseqid = get_number(aligned_seqid)
            except ValueError:
                raise ValueError(
                    "`{0}` is on `{1}` with no number to extract".format(
                        aligned.accn, aligned_seqid
                    )
                )
            bedline = "\t".join(
                str(x)
                for x in (
                    ref.seqid,
                    ref.start - 1,
                    ref.end,
                    "{0}:{1}".format(newsseqid, aligned_start),
                )
            )
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (tagged.bed, blacklist.ids,
    # big_gaps.bed plus options). Writes `<tagged prefix>.new.bed`.
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option("--extended_stride", default=False, action="store_true",
                 help="Toggle extended strides for gene numbering")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"

    def tagkey(nametag):
        # The tag is the part after the last "|" of a "name|tag" string.
        return nametag.rsplit("|", 1)[-1]

    with open(outputbed, "w") as fw:
        for seqid, sbed in bed.sub_beds():
            current_chr = chr_number(seqid)
            if not current_chr:
                continue

            sbed = list(sbed)

            # (index in sbed, nametag) for every NEW or FRAME record
            ranks = []
            for i, s in enumerate(sbed):
                nametag = s.extra[0]
                tag = tagkey(nametag)

                if tag in (NEW, FRAME):
                    ranks.append((i, nametag))

            # Consecutive records with the same tag are collapsed: NEW blocks
            # keep their bed records, other blocks keep [first rank, last rank].
            blocks = []
            for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
                names = list(names)
                if tag == NEW:
                    blocks.append((tag, [sbed[x[0]] for x in names]))
                else:
                    start, end = names[0][-1], names[-1][-1]
                    start, end = atg_name(start, retval="rank"), \
                        atg_name(end, retval="rank")
                    blocks.append((tag, [start, end]))

            id_table = {}  # old to new name conversion
            for i, (tag, info) in enumerate(blocks):
                if tag != NEW:
                    continue

                # Bounds come from the flanking blocks; an open end gets a
                # 10000-wide window.
                start_id = 0 if i == 0 else blocks[i - 1][1][-1]
                end_id = start_id + 10000 if i == len(blocks) - 1 \
                    else blocks[i + 1][1][0]

                r.allocate(info, seqid, start_id, end_id, id_table,
                           extended_stride=opts.extended_stride)

            # Output new names
            for s in sbed:
                nametag = s.extra[0]
                name, tag = nametag.split("|")

                if tag == NEW:
                    assert name == '.'
                    name = id_table[s.accn]
                elif tag == OVERLAP:
                    if name in id_table:
                        name = id_table[name]

                s.extra[0] = "|".join((name, tag))
                fw.write(str(s) + "\n")
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (four positionals plus options).
    # Writes extra.bed, extend.bed, extend.bed.ids and paste.bed in the current
    # directory, then merges extra.bed and extracts its sequences.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]

    def key(x):
        # True for genes whose coverage reaches the cutoff.
        return coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are closed on exit, so they are complete on disk before
    # the post-processing below.
    with open(extrabed, "w") as fw, open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set([gene_name(x.accn) for x in beds]))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): no bounds check on the neighbours; the
                    # first/last record of the bed would index out of range or
                    # wrap around. Confirm how Bed handles indexing.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if distance_to_left <= distance_to_right and \
                            distance_to_left > 0:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        fwp.write("\t".join(str(x) for x in
                                  (seqid, leftb.start, rightb.end, gb.accn))
                                  + "\n")

                    fwe.write(str(leftb) + "\n")
                    fwe.write(str(gb) + "\n")
                    fwe.write(str(rightb) + "\n")
                    fwe.write("L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label) + "\n")
                    fw_ids.write(str(gb.accn) + "\n")
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                fw.write("\t".join(str(x) for x in
                         (seqid, rmin - 1, rmax, name)) + "\n")

    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # NOTE: the docstring above doubles as the command-line usage text. It is
    # a raw string because it contains regex backslashes (\d, \., \S); the
    # text itself is unchanged.
    #
    # args: list of command-line tokens (new.bed, old.bed plus options).
    # Writes `<new prefix>.annotated.bed` and sorts it in place.
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric"
                      + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature"
                      + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n"
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n"
                     + "Use if resolving ambiguities based on `overlap` length\n"
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    # NOTE(review): default=True combined with store_true means this flag can
    # never be switched off from the command line.
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".
                    format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".
                            format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".
                          format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".
                    format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        # Accessions joined by ";" in one consolidated record form one group.
        g.join(*accn.split(";"))

    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for seqid, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(seqid, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    with open(abedfile, "w") as afh:
        for accn in abedline:
            afh.write(str(abedline[accn]) + "\n")

    sort([abedfile, "-i"])
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # args: list of command-line tokens (four positionals).
    # Writes `final.bed` in the current directory and removes the intermediate
    # files it created. Returns None.
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid

    gap_to_scf = defaultdict(list)
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            # Register each gap once; `gap_to_scf` doubles as the seen-set.
            if gapid not in gap_to_scf:
                gi, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    beds = []
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        if key not in gappositions:
            beds.append(b)
            continue

        # This record is one of the candidate gaps: replace it with the
        # scaffolds assigned to that gap.
        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            beds.append(BedLine("\t".join(str(x) for x in
                        (scf, 0, size, sid, 1000, strand))))

    finalbed = Bed()
    finalbed.extend(beds)
    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def __init__(self, fig, root, datafile, bedfile, layoutfile,
             switch=None, tree=None, extra_features=None,
             chr_label=True, loc_label=True, pad=.05, vpad=.015,
             scalebar=False):
    """
    Draw one Region per column of the block file, shade the gene pairs along
    every layout edge, and optionally add a scalebar and tree panels.

    Sets `self.layout` (the parsed layout), `self.gg` (a dict keyed by
    (column index, gene name), taken from each Region's `gg`) and `self.rr`
    (the list of Region objects).
    """
    w, h = fig.get_figwidth(), fig.get_figheight()
    bed = Bed(bedfile)
    order = bed.order
    bf = BlockFile(datafile)
    self.layout = lo = Layout(layoutfile)
    switch = DictFile(switch, delimiter="\t") if switch else None
    if extra_features:
        extra_features = Bed(extra_features)

    ncols = bf.ncols
    exts = []
    extras = []
    for col in range(ncols):
        ext = bf.get_extent(col, order)
        exts.append(ext)
        if not extra_features:
            continue
        first, last, si, ei, chr, orientation, span = ext
        start, end = first.start, last.end  # start, end coordinates
        ef = list(extra_features.extract(chr, start, end))

        # Pruning removes minor features with < 0.1% of the region
        threshold = span / 1000
        ef_pruned = [x for x in ef if x.span >= threshold]
        print("Extracted {0} features "
              "({1} after pruning)".format(len(ef), len(ef_pruned)),
              file=sys.stderr)
        extras.append(ef_pruned)

    # The widest region determines the scale.
    maxspan = max(exts, key=lambda x: x[-1])[-1]
    scale = maxspan / .65

    self.gg = gg = {}
    self.rr = []
    ymids = []
    for col in range(ncols):
        ef = extras[col] if extras else None
        region = Region(root, exts[col], lo[col], bed, scale, switch,
                        chr_label=chr_label, loc_label=loc_label,
                        vpad=vpad, extra_features=ef)
        self.rr.append(region)
        # Use tid and accn to store gene positions
        gg.update(dict(((col, k), v) for k, v in region.gg.items()))
        ymids.append(region.y)

    def arc_mid(i, j, samearc):
        # Vertical midpoint of the shade: offset from track i for same-track
        # arcs, otherwise halfway between the two tracks.
        if samearc == "above":
            return ymids[i] + 2 * pad
        if samearc == "below":
            return ymids[i] - 2 * pad
        return (ymids[i] + ymids[j]) / 2

    for i, j, samearc in lo.edges:
        # Plain pairs first, highlighted pairs on top (zorder=2).
        for ga, gb, h in bf.iter_pairs(i, j):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = arc_mid(i, j, samearc)
            Shade(root, a, b, ymid, fc="gainsboro", lw=0, alpha=1)

        for ga, gb, h in bf.iter_pairs(i, j, highlight=True):
            a, b = gg[(i, ga)], gg[(j, gb)]
            ymid = arc_mid(i, j, samearc)
            Shade(root, a, b, ymid, alpha=1, highlight=h, zorder=2)

    if scalebar:
        print("Build scalebar (scale={})".format(scale), file=sys.stderr)
        # Find the best length of the scalebar
        ar = [1, 2, 5]
        candidates = [mag * x for mag in (1000, 10000, 100000) for x in ar]
        # Find the one that's close to an optimal canvas size
        dist, candidate = min((abs(x / scale - .12), x) for x in candidates)
        dist = candidate / scale
        x, y, yp = .2, .96, .005
        a, b = x - dist / 2, x + dist / 2
        lsg = "lightslategrey"
        # Two end ticks, then the horizontal bar.
        root.plot([a, a], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([b, b], [y - yp, y + yp], "-", lw=2, color=lsg)
        root.plot([a, b], [y, y], "-", lw=2, color=lsg)
        root.text(x, y + .02, human_size(candidate, precision=0),
                  ha="center", va="center")

    if tree:
        from jcvi.graphics.tree import draw_tree, read_trees

        trees = read_trees(tree)
        ntrees = len(trees)
        logging.debug("A total of {0} trees imported.".format(ntrees))
        xiv = 1. / ntrees
        yiv = .3
        xstart = 0
        ystart = min(ymids) - .4
        for entry in trees:
            ax = fig.add_axes([xstart, ystart, xiv, yiv])
            label, outgroup, color, tx = entry
            draw_tree(ax, tx, outgroup=outgroup, rmargin=.4, leaffont=11,
                      treecolor=color, supportcolor=color, leafcolor=color)
            xstart += xiv
            RoundLabel(ax, .5, .3, label, fill=True, fc="lavender",
                       color=color)
def depth(args):
    """
    %prog depth *.regions.bed.gz

    Plot the mosdepth regions BED file. We recommend to generate this BED file
    by (please adjust the --by parameter to your required resolution):

    $ mosdepth --no-per-base --use-median --fast-mode --by 1000000 sample.wgs
    sample.bam

    Use --chrinfo to specify a colormap between seqid, desired color, and
    optionally a new name. For example:

    chr01A, #c51b7d, 1A
    chr01B, #4d9221, 1B
    ...

    Only seqids that are in the colormap will be plotted, in the order that's
    given in the file. When --chrinfo is not set, every seqid will be drawn in
    black.

    Can take multiple BED files as input and then plot all of them in a
    composite figure.
    """
    p = OptionParser(depth.__doc__)
    p.add_option(
        "--chrinfo",
        help="Comma-separated mappings between seqid, color, new_name",
    )
    p.add_option(
        "--titleinfo",
        help="Comma-separated titles mappings between filename, title",
    )
    p.add_option("--maxdepth", default=100, type="int",
                 help="Maximum depth to show")
    p.add_option("--logscale", default=False, action="store_true",
                 help="Use log-scale on depth")
    opts, args, iopts = p.set_image_options(args, style="dark", figsize="14x4")

    if len(args) < 1:
        sys.exit(not p.print_help())

    bedfiles = args
    chrinfo = ChrInfoFile(opts.chrinfo) if opts.chrinfo else {}
    titleinfo = TitleInfoFile(opts.titleinfo) if opts.titleinfo else {}

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Panels are stacked top to bottom, each taking an equal share of height
    npanels = len(bedfiles)
    yinterval = 1.0 / npanels
    ypos = 1 - yinterval
    for bedfile in bedfiles:
        pf = op.basename(bedfile).split(".", 1)[0]
        bed = Bed(bedfile)

        # A single panel draws straight onto the root axes
        if npanels == 1:
            panel_root = root
        else:
            panel_root = fig.add_axes([0, ypos, 1, yinterval])
        panel_ax = fig.add_axes(
            [0.1, ypos + 0.2 * yinterval, 0.8, 0.65 * yinterval])

        # Separator under every panel except the bottom one
        if ypos > 0.001:
            root.plot((0, 1), (ypos, ypos), "-", lw=2, color="lightslategray")

        # Default title: file prefix up to the first underscore
        title = titleinfo.get(bedfile, pf.split("_", 1)[0])
        subtitle = None
        if isinstance(title, TitleInfoLine):
            subtitle = title.subtitle
            title = title.title

        draw_depth(
            panel_root,
            panel_ax,
            bed,
            chrinfo=chrinfo,
            ylim=opts.maxdepth,
            logscale=opts.logscale,
            title=title,
            subtitle=subtitle,
        )
        ypos -= yinterval

    normalize_axes(root)

    # With one panel, `pf` still holds that file's prefix from the loop
    if npanels > 1:
        pf = op.commonprefix(bedfiles)
    pf = pf or "depth"
    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    bed = Bed(bedfile, sorted=False)
    # unplaced seqid => list of (a, b) pairs where b sits on that seqid
    unplaced = defaultdict(list)
    for a, b in pairwise(bed):
        if a.accn not in mf:
            continue

        pa, _ = mf[a.accn]
        if pa != b.accn:
            continue

        ia = a.seqid.startswith(prefix)
        ib = b.seqid.startswith(prefix)
        # Keep only pairs with exactly one end on a prefixed (unplaced) seqid
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []

    # The context manager closes the log even if a scaffold raises midway
    with open(logfile, "w") as log:

        def note(item=""):
            # Portable equivalent of the Python 2 `print >> log, item`
            log.write(str(item) + "\n")

        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            note()
            ranges = []
            for a, b in pairs:
                astrand, bstrand = a.strand, b.strand
                aseqid = a.seqid
                pa, lib = mf[a.accn]

                note(a)
                note(b)

                flip_b = (astrand == bstrand)
                fbstrand = '-' if flip_b else '+'
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                L = sizes.get_size(scf)
                assert astrand in ('+', '-')
                if astrand == '+':
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + L
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                note("*" + "\t".join(str(x) for x in start_range))
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for chunk in rd:
                alldepths.extend(chunk)
            note(alldepths)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                note("Insufficient links ({0} < {1})".format(maxdepth,
                                                              minlinks))
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            msg = "Multiple conflicting candidates found"
            if nseqids != 1:
                note(msg)
                continue

            seqid, mmin, mmax, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])
            if (mmax - mmin) > maxdist:
                note(msg)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, sf, sc, fbstrand in ranges:
                brange = (sid, start, end)
                if range_overlap(arange, brange):
                    if fbstrand == '+':
                        nplus += 1
                    else:
                        nminus += 1

            fbstrand = '+' if nplus >= nminus else '-'

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            bedline = BedLine("\t".join((str(x) for x in candidate)))
            cbeds.append(bedline)
            note("Plus: {0}, Minus: {1}".format(nplus, nminus))
            note(candidate)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".
                  format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def bambus(args):
    """
    %prog bambus bambus.bed bambus.mates total.fasta

    Insert unplaced scaffolds based on mates.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.formats.bed import BedLine
    from jcvi.formats.posmap import MatesFile

    p = OptionParser(bambus.__doc__)
    p.add_option("--prefix", default="scaffold",
                 help="Prefix of the unplaced scaffolds [default: %default]")
    p.add_option("--minlinks", default=3, type="int",
                 help="Minimum number of links to place [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, matesfile, fastafile = args
    pf = matesfile.rsplit(".", 1)[0]
    logfile = pf + ".log"

    mf = MatesFile(matesfile)
    maxdist = max(x.max for x in mf.libraries.values())
    logging.debug("Max separation: {0}".format(maxdist))

    prefix = opts.prefix
    minlinks = opts.minlinks

    bed = Bed(bedfile, sorted=False)
    # unplaced seqid => list of (a, b) pairs where b sits on that seqid
    unplaced = defaultdict(list)
    for a, b in pairwise(bed):
        if a.accn not in mf:
            continue

        pa, _ = mf[a.accn]
        if pa != b.accn:
            continue

        ia = a.seqid.startswith(prefix)
        ib = b.seqid.startswith(prefix)
        # Keep only pairs with exactly one end on a prefixed (unplaced) seqid
        if ia == ib:
            continue

        if ia:
            a, b = b, a

        unplaced[b.seqid].append((a, b))

    sizes = Sizes(fastafile)
    candidatebed = Bed()
    cbeds = []

    # The context manager closes the log even if a scaffold raises midway
    with open(logfile, "w") as log:

        def note(item=""):
            # Portable equivalent of the Python 2 `print >> log, item`
            log.write(str(item) + "\n")

        # For each unplaced scaffold, find most likely placement and orientation
        for scf, pairs in sorted(unplaced.items()):
            note()
            ranges = []
            for a, b in pairs:
                astrand, bstrand = a.strand, b.strand
                aseqid = a.seqid
                pa, lib = mf[a.accn]

                note(a)
                note(b)

                flip_b = (astrand == bstrand)
                fbstrand = '-' if flip_b else '+'
                if flip_b:
                    b.reverse_complement(sizes)

                lmin, lmax = lib.min, lib.max

                L = sizes.get_size(scf)
                assert astrand in ('+', '-')
                if astrand == '+':
                    offset = a.start - b.end
                    sstart, sstop = offset + lmin, offset + lmax
                else:
                    offset = a.end - b.start + L
                    sstart, sstop = offset - lmax, offset - lmin

                # Prevent out of range error
                size = sizes.get_size(aseqid)
                sstart = min(size - 1, max(0, sstart))
                sstop = min(size - 1, max(0, sstop))

                start_range = (aseqid, sstart, sstop, scf, 1, fbstrand)
                note("*" + "\t".join(str(x) for x in start_range))
                ranges.append(start_range)

            mranges = [x[:3] for x in ranges]
            # Determine placement by finding the interval with the most support
            rd = ranges_depth(mranges, sizes.mapping, verbose=False)
            alldepths = []
            for chunk in rd:
                alldepths.extend(chunk)
            note(alldepths)

            maxdepth = max(alldepths, key=lambda x: x[-1])[-1]
            if maxdepth < minlinks:
                note("Insufficient links ({0} < {1})".format(maxdepth,
                                                              minlinks))
                continue

            candidates = [x for x in alldepths if x[-1] == maxdepth]
            nseqids = len(set(x[0] for x in candidates))
            if nseqids != 1:
                msg = "Multiple conflicting candidates found"
                note(msg)
                continue

            seqid, mmin, mmax, depth = candidates[0]
            mmin, mmax = range_minmax([x[1:3] for x in candidates])

            if mmin >= mmax:
                msg = "Invalid (min, max) range"
                note(msg)
                continue

            if (mmax - mmin) > maxdist:
                msg = "(min, max) distance greater than library maxdist"
                note(msg)
                continue

            # Determine orientation by voting
            nplus, nminus = 0, 0
            arange = (seqid, mmin, mmax)
            for sid, start, end, sf, sc, fbstrand in ranges:
                brange = (sid, start, end)
                if range_overlap(arange, brange):
                    if fbstrand == '+':
                        nplus += 1
                    else:
                        nminus += 1

            fbstrand = '+' if nplus >= nminus else '-'

            candidate = (seqid, mmin, mmax, scf, depth, fbstrand)
            bedline = BedLine("\t".join((str(x) for x in candidate)))
            cbeds.append(bedline)
            note("Plus: {0}, Minus: {1}".format(nplus, nminus))
            note(candidate)

    candidatebed.extend(cbeds)
    logging.debug("A total of {0} scaffolds can be placed.".
                  format(len(candidatebed)))

    candidatebedfile = pf + ".candidate.bed"
    candidatebed.print_to_file(candidatebedfile, sorted=True)
def insert(args):
    """
    %prog insert candidates.bed gaps.bed chrs.fasta unplaced.fasta

    Insert scaffolds into assembly.
    """
    from jcvi.formats.agp import mask, bed
    from jcvi.formats.sizes import agp

    p = OptionParser(insert.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    candidates, gapsbed, chrfasta, unplacedfasta = args
    refinedbed = refine([candidates, gapsbed])
    sizes = Sizes(unplacedfasta).mapping
    cbed = Bed(candidates)
    corder = cbed.order
    gbed = Bed(gapsbed)
    gorder = gbed.order

    gpbed = Bed()
    gappositions = {}  # (chr, start, end) => gapid
    gap_to_scf = defaultdict(list)  # gapid => [(scaffold, strand), ...]

    # Close the refined BED as soon as it has been read
    with open(refinedbed) as fp:
        for row in fp:
            atoms = row.split()
            if len(atoms) <= 6:
                continue
            unplaced = atoms[3]
            strand = atoms[5]
            gapid = atoms[9]
            # Register each gap the first time it shows up
            if gapid not in gap_to_scf:
                _, gb = gorder[gapid]
                gpbed.append(gb)
                gappositions[(gb.seqid, gb.start, gb.end)] = gapid
            gap_to_scf[gapid].append((unplaced, strand))

    gpbedfile = "candidate.gaps.bed"
    gpbed.print_to_file(gpbedfile, sorted=True)

    agpfile = agp([chrfasta])
    maskedagpfile = mask([agpfile, gpbedfile])
    maskedbedfile = maskedagpfile.rsplit(".", 1)[0] + ".bed"
    bed([maskedagpfile, "--outfile={0}".format(maskedbedfile)])

    mbed = Bed(maskedbedfile)
    finalbed = Bed()
    for b in mbed:
        sid = b.seqid
        key = (sid, b.start, b.end)
        # Lines that are not a selected gap are copied through unchanged
        if key not in gappositions:
            finalbed.add("{0}\n".format(b))
            continue

        gapid = gappositions[key]
        scfs = gap_to_scf[gapid]

        # For scaffolds placed in the same gap, sort according to positions
        scfs.sort(key=lambda x: corder[x[0]][1].start + corder[x[0]][1].end)
        for scf, strand in scfs:
            size = sizes[scf]
            finalbed.add("\t".join(
                str(x) for x in (scf, 0, size, sid, 1000, strand)))

    finalbedfile = "final.bed"
    finalbed.print_to_file(finalbedfile)

    # Clean-up
    toclean = [gpbedfile, agpfile, maskedagpfile, maskedbedfile]
    FileShredder(toclean)
def extract(args):
    """
    Print the sequences of the requested locations from a FASTA database.

    Reads these attributes from `args`:
        db: path to a FASTA file, or a name resolved to
            "$genome/data/<db>/10_genome.fna".
        loc: a BED file, a file of sequence ids (with `list`), or a
            comma-separated string of "seqid" / "seqid:beg-end" locations.
        list: treat the `loc` file as one sequence id per line.
        padding: pad out-of-range parts of a location with "N".
        tsv: print tab-separated "seqid, beg, end, seq" rows instead of FASTA.

    Sequence ids missing from the database are skipped.
    """
    import re
    from jcvi.formats.bed import Bed

    if op.isfile(args.db):
        db = Fasta(args.db)
    else:
        f_db = "%s/data/%s/10_genome.fna" % (os.environ["genome"], args.db)
        assert op.isfile(f_db), "cannot find %s" % args.db
        db = Fasta(f_db)

    # Raw strings keep the regex escapes (\w, \d, ...) away from Python's
    # own string-escape handling; the compiled patterns are unchanged.
    reg1 = re.compile(r"^([\w\-]+)\:([\d,]+)(\-|\.{1,2})([\d,]+)$")
    reg2 = re.compile(r"^([\w\-]+)$")

    def add_whole(target, sid):
        # Queue the full length of `sid`; ids absent from db are ignored
        if sid in db:
            target.add("%s\t%d\t%d\n" % (sid, 0, len(db[sid])))

    bed = Bed()
    if op.isfile(args.loc):
        if args.list:
            fho = must_open(args.loc, 'r')
            for line in fho:
                add_whole(bed, line.strip())
        else:
            bed = Bed(args.loc, sorted=False)
    else:
        for loc in args.loc.split(","):
            res = reg1.match(loc)
            if res:
                sid, beg, end = res.group(1), res.group(2), res.group(4)
                beg = int(beg.replace(",", ""))
                end = int(end.replace(",", ""))
                bed.add("%s\t%d\t%d\n" % (sid, beg - 1, end))
                continue

            res = reg2.match(loc)
            if res:
                add_whole(bed, res.group(1))
            else:
                logging.error("%s: unknown locstr => skipped" % loc)

    for b in bed:
        sid, beg, end = b.seqid, b.start, b.end
        oid = sid if args.list else f"{sid}-{beg}-{end}"
        if b.accn:
            oid = b.accn
        if sid not in db:
            print("%s not in db => skipped" % sid)
            continue

        seqlen = len(db[sid])
        # Requested size, measured before clipping to the sequence bounds
        size = end - beg + 1
        bp_pad = 0
        if beg < 1:
            bp_pad += 1 - beg
            beg = 1
        if beg > seqlen:
            bp_pad = 1
            beg = seqlen
        if end > seqlen:
            bp_pad += end - seqlen
            end = seqlen

        seq = db[sid][beg - 1:end].seq
        if args.padding:
            if bp_pad > 0:
                if end - beg + 1 < 30:
                    # Under 30 in-range bases: emit Ns only
                    seq = "N" * size
                else:
                    seq += "N" * bp_pad
            assert len(seq) == size, "error in seq size: %s:%d-%d %d" % (
                sid, beg, end, bp_pad)

        if args.tsv:
            print("\t".join([sid, str(beg), str(end), seq]))
        else:
            rcd = SeqRecord(Seq(seq), id=oid, description='')
            SeqIO.write([rcd], sys.stdout, 'fasta')
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    p = OptionParser(ancestral.__doc__)
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    # We focus on the following chromosome pairs
    target_pairs = {
        (1, 1), (1, 6), (1, 8), (1, 13),
        (2, 4),
        (3, 12), (3, 14),
        (5, 6), (5, 8),
        (7, 9), (7, 11),
        (9, 10),
        (10, 11),
    }
    # Chromosome numbers that are allowed to name an output region
    label_numbers = (1, 2, 3, 5, 7, 10)

    def get_target(achr, bchr):
        # Give up only when neither seqid carries the "chr" tag
        if not ("chr" in achr or "chr" in bchr):
            return None
        first, second = get_number(achr), get_number(bchr)
        if first > second:
            first, second = second, first
        pair = (first, second)
        return pair if pair in target_pairs else None

    def build_bedline(astart, aend, target_pair):
        # Name the region after the first labelling number in the pair
        labels = [str(x) for x in target_pair if x in label_numbers]
        fields = (astart.seqid, astart.start, aend.end, labels[0])
        return "\t".join(str(x) for x in fields)

    # Iterate through the blocks, store any regions that has hits to one of the
    # target_pairs
    ac = AnchorFile(anchorsfile)
    outbed = Bed()
    for block in ac.blocks:
        qgenes, sgenes, _ = zip(*block)
        qhits = [qorder[x] for x in qgenes]
        shits = [sorder[x] for x in sgenes]
        astart, aend = min(qhits)[1], max(qhits)[1]
        bstart, bend = min(shits)[1], max(shits)[1]
        # Now convert to BED lines with new accn
        target = get_target(astart.seqid, bstart.seqid)
        if target is not None:
            outbed.add(build_bedline(astart, aend, target))
            outbed.add(build_bedline(bstart, bend, target))

    outbed.print_to_file(sorted=True)
def composite(args):
    """
    %prog composite fastafile chr1

    Combine line plots, feature bars and alt-bars, different data types
    specified in options. Inputs must be BED-formatted. Three types of viz are
    currently supported:

    --lines: traditional line plots, useful for plotting feature freq
    --bars: show where the extent of features are
    --altbars: similar to bars, yet in two alternating tracks, e.g. scaffolds
    """
    from jcvi.graphics.chromosome import HorizontalChromosome

    p = OptionParser(composite.__doc__)
    p.add_option("--lines",
                 help="Features to plot in lineplot [default: %default]")
    p.add_option("--bars",
                 help="Features to plot in bars [default: %default]")
    p.add_option("--altbars",
                 help="Features to plot in alt-bars [default: %default]")
    p.add_option("--fatten", default=False, action="store_true",
                 help="Help visualize certain narrow features [default: %default]")
    p.add_option("--mode", default="span",
                 choices=("span", "count", "score"),
                 help="Accumulate feature based on [default: %default]")
    add_window_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, seqid = args
    window, shift, _ = check_window_options(opts)
    fatten = opts.fatten

    # Each option holds a comma-separated list of BED files
    linebeds = get_beds(opts.lines.split(",")) if opts.lines else []
    barbeds = get_beds(opts.bars.split(",")) if opts.bars else []
    altbarbeds = get_beds(opts.altbars.split(",")) if opts.altbars else []

    linebins = get_binfiles(linebeds, fastafile, shift, mode=opts.mode)

    margin = .12
    clen = Sizes(fastafile).mapping[seqid]
    nbins = get_nbins(clen, shift)

    plt.rcParams["xtick.major.size"] = 0
    plt.rcParams["ytick.major.size"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    root.text(.5, .95, seqid, ha="center", color="darkslategray")

    xstart, xend = margin, 1 - margin
    xlen = xend - xstart
    ratio = xlen / clen

    def xs(pos):
        # Map a sequence coordinate onto the canvas x-axis
        return xstart + ratio * pos

    # Line plots
    ax = fig.add_axes([xstart, .6, xlen, .3])
    lineplot(ax, linebins, nbins, seqid, window, shift)

    # Bar plots
    yy = .5
    yinterval = .08
    r = .01
    fattend = .0025
    for barfile in barbeds:
        root.text(xend + .01, yy, barfile.split(".")[0], va="center")
        HorizontalChromosome(root, xstart, xend, yy, height=.02)
        for b in Bed(barfile):
            start, end = xs(b.start), xs(b.end)
            span = end - start
            # Widen features that would otherwise be too thin to see
            if fatten and span < fattend:
                span = fattend
            patch = Rectangle((start, yy - r), span, 2 * r,
                              lw=0, fc="darkslategray")
            root.add_patch(patch)
        yy -= yinterval

    # Alternative bar plots
    offset = r / 2
    for barfile in altbarbeds:
        root.text(xend + .01, yy, barfile.split(".")[0], va="center")
        for b in Bed(barfile):
            start, end = xs(b.start), xs(b.end)
            if end - start < .0001:
                continue
            # Flip sides so consecutive drawn features alternate tracks
            offset = -offset
            patch = Rectangle((start, yy + offset), end - start, .003,
                              lw=0, fc="darkslategray")
            root.add_patch(patch)
        yy -= yinterval

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = seqid + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def split(args):
    """
    %prog split split.bed evidences.bed predictor1.gff predictor2.gff fastafile

    Split MAKER models by checking against predictors (such as AUGUSTUS and
    FGENESH). For each region covered by a working model. Find out the
    combination of predictors that gives the best accuracy against evidences
    (such as PASA).

    `split.bed` can be generated by pulling out subset from a list of ids
    $ python -m jcvi.formats.base join split.ids working.bed
        --column=0,3 --noheader | cut -f2-7 > split.bed
    """
    from jcvi.formats.bed import Bed

    p = OptionParser(split.__doc__)
    p.add_option(
        "--key",
        default="Name",
        help="Key in the attributes to extract predictor.gff",
    )
    p.add_option(
        "--parents",
        default="match",
        help="list of features to extract, use comma to separate (e.g.'gene,mRNA')",
    )
    p.add_option(
        "--children",
        default="match_part",
        help="list of features to extract, use comma to separate (e.g."
        "'five_prime_UTR,CDS,three_prime_UTR')",
    )
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    split_bed, evidences_bed, p1_gff, p2_gff, fastafile = args
    parents, children, key = opts.parents, opts.children, opts.key

    bed = Bed(split_bed)

    # Predictor models per working-model accession, one table per predictor
    s1 = get_splits(split_bed, p1_gff, parents, key)
    s2 = get_splits(split_bed, p2_gff, parents, key)

    def evaluate(region, gff):
        # Accuracy record of one predictor inside `region`
        return get_accuracy(region, gff, evidences_bed, fastafile,
                            children, key)

    for b in bed:
        query = "{0}:{1}-{2}".format(b.seqid, b.start, b.end)
        b1 = evaluate(query, p1_gff)
        b2 = evaluate(query, p2_gff)
        accn = b.accn
        c1 = "|".join(s1[accn])
        c2 = "|".join(s2[accn])
        ac1, ac2 = b1.accuracy, b2.accuracy
        # Ties go to the first predictor
        winner = p1_gff if ac1 >= ac2 else p2_gff
        tag = winner.split(".")[0]
        row = (accn, tag, "{0:.3f}".format(ac1), "{0:.3f}".format(ac2),
               c1, c2)
        print("\t".join(row))