def main(blastfile, p, opts):
    """
    Run `batch_query` over the filtered BLAST hits and store the results.

    Results go to a freshly created `synteny` table in the SQLite database
    named by `opts.sqlite`, or, when that option is unset, to `opts.outfile`.
    """
    sqlite = opts.sqlite
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)
    filtered_blast = read_blast(blastfile, qorder, sorder,
                                is_self=is_self, ostrip=opts.strip_names)
    all_data = [(b.qi, b.si) for b in filtered_blast]

    conn = c = fw = None
    if sqlite:
        conn = sqlite3.connect(sqlite)
        c = conn.cursor()
        c.execute("drop table if exists synteny")
        c.execute("create table synteny (query text, anchor text, "
                  "gray varchar(1), score integer, dr integer, "
                  "orientation varchar(1), qnote text, snote text)")
    else:
        fw = must_open(opts.outfile, "w")

    try:
        batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=False)
        if qbed.filename == sbed.filename:
            logging.debug("Self comparisons, mirror ignored")
        else:
            batch_query(qbed, sbed, all_data, opts, fw=fw, c=c, transpose=True)

        if sqlite:
            c.execute("create index q on synteny (query)")
            conn.commit()
    finally:
        # Release the cursor AND the connection (the connection used to be
        # left open), and do so even when a query raises.
        if sqlite:
            c.close()
            conn.close()
        else:
            fw.close()
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale

    # Map every query gene to the list of subject genes it is anchored to.
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue

        # NOTE(review): the nesting was reconstructed from a flattened source;
        # as laid out here only the last partner of a gene reaches the output.
        # Confirm against the original indentation.
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn

        if switch:
            qseqid, sseqid = sseqid, qseqid
            qstart, sstart = sstart, qstart
            qend, send = send, qend
            qaccn, saccn = saccn, qaccn
        if scale:
            sstart /= scale

        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            # Call form of `raise`: valid on Python 2 and 3 alike, whereas the
            # old `raise ValueError, "..."` is a SyntaxError on Python 3.
            raise ValueError("`{0}` is on `{1}` with no number to extract".
                             format(saccn, sseqid))

        bedline = "\t".join(str(x) for x in
                            (qseqid, qstart - 1, qend,
                             "{0}:{1}".format(newsseqid, sstart)))
        bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def movieframe(args):
    """
    %prog movieframe tour test.clm contigs.ref.anchors

    Draw heatmap and synteny in the same plot.
    """
    p = OptionParser(movieframe.__doc__)
    p.add_option("--label", help="Figure title")
    p.set_beds()
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(
        args,
        figsize="16x8",
        style="white",
        cmap="coolwarm",
        format="png",
        dpi=120,
    )

    if len(args) != 3:
        sys.exit(not p.print_help())

    tour, clmfile, anchorsfile = args
    tour = tour.split(",")

    # Fall back to a format-derived file name, and to its stem as the label.
    image_name = opts.outfile
    if not image_name:
        image_name = "movieframe." + iopts.format
    label = opts.label
    if not label:
        label = op.basename(image_name).rsplit(".", 1)[0]

    clm = CLMFile(clmfile)
    totalbins, bins, breaks = make_bins(tour, clm.tig_to_size)
    M = read_clm(clm, totalbins, bins)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])         # whole canvas
    ax1 = fig.add_axes([.05, .1, .4, .8])     # heatmap
    ax2 = fig.add_axes([.55, .1, .4, .8])     # dot plot
    ax2_root = fig.add_axes([.5, 0, .5, 1])   # dot plot canvas

    # Left axis: heatmap
    plot_heatmap(ax1, M, breaks, iopts)

    # Right axis: synteny
    qbed, sbed, qorder, sorder, is_self = check_beds(
        anchorsfile, p, opts, sorted=False
    )
    dotplot(anchorsfile, qbed, sbed, fig, ax2_root, ax2, sep=False, title="")

    centered = dict(ha="center", va="center")
    root.text(.5, .98, clm.name, color="g", **centered)
    root.text(.5, .95, label, color="darkslategray", **centered)
    normalize_axes(root)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
def offdiag(args):
    """
    %prog offdiag diploid.napus.1x1.lifted.anchors

    Find gene pairs that are off diagonal. "Off diagonal" are the pairs that are
    not on the orthologous chromosomes. For example, napus chrA01 and brapa A01.
    """
    p = OptionParser(offdiag.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)

    pf = "-".join(anchorsfile.split(".")[:2])
    header = "Block-id|Napus|Diploid|Napus-chr|Diploid-chr|RBH?".split("|")

    # `with` closes the anchors file; it was previously left open.
    with open(anchorsfile) as fp:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\t".join(header))
        i = -1  # index of the current block; each '#' line starts a new one
        for row in fp:
            if row[0] == '#':
                i += 1
                continue

            q, s, score = row.split()
            # A score ending in 'L' is reported as a non-RBH pair.
            rbh = 'no' if score[-1] == 'L' else 'yes'
            qi, qq = qorder[q]
            si, ss = sorder[s]
            oqseqid = qseqid = qq.seqid
            osseqid = sseqid = ss.seqid

            # Reduce both seqids to a comparable three-character chromosome tag.
            sseqid = sseqid.split("_")[0][-3:]
            if qseqid[0] == 'A':
                qseqid = qseqid[-3:]  # A09 => A09
            elif qseqid[0] == 'C':
                qseqid = 'C0' + qseqid[-1]  # C9 => C09
            else:
                continue

            # Skip pairs on matching chromosomes and subjects ending in 'nn'.
            if qseqid == sseqid or sseqid[-2:] == 'nn':
                continue

            block_id = pf + "-block-{0}".format(i)
            print("\t".join((block_id, q, s, oqseqid, osseqid, rbh)))
def mergechrom(args):
    """
    %prog mergechrom a.b.anchors

    merge synteny blocks on the same chromosome
    """
    p = OptionParser(mergechrom.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    newanchorfile = anchorfile.rsplit(".", 1)[0] + ".mergechrom.anchors"

    qchrom_dic = dict((b.accn, b.seqid) for b in qbed)
    schrom_dic = dict((b.accn, b.seqid) for b in sbed)

    # Group block indices by the chromosome pair of each block's first anchor.
    block_dic = dict()
    blocks = af.blocks
    for i, block in enumerate(blocks):
        q, s, score = block[0]
        qchrom, schrom = qchrom_dic[q], schrom_dic[s]
        k = "%s_%s" % (qchrom, schrom)
        block_dic.setdefault(k, []).append(i)

    # The output is opened only after grouping succeeded, so a lookup failure
    # above no longer leaves a truncated file; `with` closes it on any error.
    with open(newanchorfile, "w") as fw:
        for k, idxs in block_dic.items():
            print("#" * 3, file=fw)
            for i in idxs:
                for q, s, score in blocks[i]:
                    print("\t".join((q, s, str(score))), file=fw)

    print("%d blocks merged to %d" % (len(blocks), len(block_dic)))
def collinear(args):
    """
    %prog collinear a.b.anchors

    Reduce synteny blocks to strictly collinear, use dynamic programming in a
    procedure similar to DAGchainer.
    """
    p = OptionParser(collinear.__doc__)
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    af = AnchorFile(anchorfile)
    stem = anchorfile.rsplit(".", 1)[0]
    fw = open(stem + ".collinear.anchors", "w")

    for members in af.blocks:
        print("#" * 3, file=fw)

        # Translate gene names into positions in the two orderings.
        indexed = [
            [qorder[qname][0], sorder[sname][0], int(long(raw_score))]
            for qname, sname, raw_score in members
        ]

        # Keep the collinear subset and map positions back to gene names.
        for qpos, spos, value in get_collinear(indexed):
            fields = (qbed[qpos].accn, sbed[spos].accn, str(value))
            print("\t".join(fields), file=fw)

    fw.close()
def dotplot_main(args):
    """
    Parse the dot plot command line, load the BED files and anchors, and save
    the rendered figure. Usage text comes from the module docstring.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option("--synteny", default=False, action="store_true",
            help="Run a fast synteny scan and display blocks [default: %default]")
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option("--vmin", dest="vmin", type="float", default=0,
            help="Minimum value in the colormap [default: %default]")
    p.add_option("--vmax", dest="vmax", type="float", default=2,
            help="Maximum value in the colormap [default: %default]")
    p.add_option("--genomenames", type="string", default=None,
            help="genome names for labeling axes in the form of qname_sname, "
            "eg. \"Vitis vinifera_Oryza sativa\"")
    p.add_option("--nmax", dest="sample_number", type="int", default=10000,
            help="Maximum number of data points to plot [default: %default]")
    p.add_option("--minfont", type="int", default=4,
            help="Do not render labels with size smaller than")
    p.add_option("--colormap",
            help="Two column file, block id to color mapping [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
            help="Do not sort the seqids along the axes")
    p.add_option("--nosep", default=False, action="store_true",
            help="Do not add contig lines")
    p.add_option("--nostdpf", default=False, action="store_true",
            help="Do not standardize contig names")
    p.add_option("--skipempty", default=False, action="store_true",
            help="Skip seqids that do not have matches")
    p.add_option("--title", help="Title of the dot plot")
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="8x8",
                                            style="dark", dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    palette = opts.colormap
    if palette:
        palette = Palette(palette)

    anchorfile, = args
    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Ks input gets a default colormap label unless one was supplied.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                                                     sorted=(not opts.nosort))

    if opts.skipempty:
        # Keep only the seqids that take part in at least one anchor pair.
        ac = AnchorFile(anchorfile)
        if is_self:
            qseqids = sseqids = set()  # one shared set for a self comparison
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([.1, .1, .8, .8])  # the dot plot

    # Pass the resolved `cmaptext`, not the raw option: the Ks default label
    # computed above used to be silently discarded.
    dotplot(anchorfile, qbed, sbed, fig, root, ax,
            vmin=opts.vmin, vmax=opts.vmax, is_self=is_self,
            synteny=opts.synteny, cmap_text=cmaptext, cmap=iopts.cmap,
            genomenames=opts.genomenames, sample_number=opts.sample_number,
            minfont=opts.minfont, palette=palette, sep=(not opts.nosep),
            title=opts.title, stdpf=(not opts.nostdpf))

    image_name = opts.outfile or \
            (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST given: work from an empty placeholder file that is
        # removed again at the end.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

    blocks = []
    genetrack = {}
    proxytrack = {}
    # `with` closes the blocks file; it was previously left open.
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Collapse consecutive rows that share the same second column.
    data = []
    for key, rows in groupby(blocks, key=lambda x: x[-1]):
        rows = list(rows)
        data.append((key, rows))

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # Only interior runs of '.' have a flanking gene on both sides.
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        # Proxy region: span of both flanking genes, padded by `bdist`.
        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.seqid, b.start, b.end)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.seqid, best_b.start, best_b.end)
            proxytrack[query] = hsp
            tags[query] = tag
    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            proxy = proxytrack[query]
            tag = "NS"
            best_b = bb[0]
            for b in bb:
                hsp = (b.subject, b.sstart, b.sstop)
                if range_overlap(proxy, hsp):
                    tag = "S"
                    best_b = b
                    break

            hsp = (best_b.subject, best_b.sstart, best_b.sstop)
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        # NOTE(review): nesting reconstructed from a flattened source; the tag
        # is assumed to be prefixed only for genes that have a proxy region.
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\t".join((b.seqid, accn, target_region)))

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option(
        "--cutoff", default=.3, type="float",
        help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    # Map each partition member to the index of its partition.
    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape

    pvalue_cutoff = 1e-30
    # Separate name: this threshold is unrelated to the clustering `cutoff`.
    score_cutoff = -log(pvalue_cutoff)

    significant = []
    # `range` instead of `xrange`, which does not exist on Python 3.
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < score_cutoff:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".
                  format(len(significant), pvalue_cutoff))

    return significant
def blastfilter_main(blast_file, p, opts):
    """
    Filter the hits of `blast_file` and write the survivors to
    `<blast_file>.filtered`.

    Hits are visited by descending score. Self hits, hits whose genes are
    missing from the BED orderings, and repeated gene pairs are dropped. The
    optional C-score (`opts.cscore`) and local-dup (`opts.tandem_Nmax`)
    filters are then applied.
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # Count non-comment lines; `with` closes the handle, which used to leak.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" %
                  (blast_file, total_lines))
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        if query not in qorder:
            # Report the first 100 misses, then a single suppression notice.
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(query,
                                                        qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(subject,
                                                        sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # Hits are sorted by score, so the first hit seen for a pair is kept.
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
                                                    len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." %
                      tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                                  flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                                  flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
            if opts.tandems_only else None

        if is_self:
            # Self comparison: merge both tandem groupings into one.
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            # sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                                             qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." %
                      (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
help="genome names for labeling axes in the form of qname_sname, " \ "eg. \"Vitis vinifera_Oryza sativa\"") p.add_option("--nmax", dest="sample_number", type="int", default=10000, help="Maximum number of data points to plot [default: %default]") p.add_option("--ignore", type="float", default=.005, help="Do not render labels for chr less than portion of genome [default: %default]") p.add_option("--palette", help="Two column file, block id to color mapping [default: %default]") opts, args, iopts = p.set_image_options(sys.argv[1:], figsize="8x8", dpi=90) if len(args) != 1: sys.exit(not p.print_help()) synteny = opts.synteny vmin, vmax = opts.vmin, opts.vmax cmap_text = opts.cmap genomenames = opts.genomenames sample_number = opts.sample_number palette = opts.palette if palette: palette = Palette(palette) anchorfile, = args qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts) image_name = op.splitext(anchorfile)[0] + "." + opts.format dotplot_main(anchorfile, qbed, sbed, image_name, iopts, vmin=0, vmax=1, is_self=is_self, synteny=synteny, cmap_text=cmap_text, \ genomenames=genomenames, sample_number=sample_number, ignore=opts.ignore, palette=palette)
def main(args):
    """
    Screen the blocks of an anchors file with `quota mapping` and write the
    selected block ids to `<prefix>.<qa>x<qb>`. With --screen, a new anchors
    file restricted to those ids is generated as well.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option("--quota", default="1:1",
            help="`quota mapping` procedure -- screen blocks to constrain mapping"
                 " (useful for orthology), "
                 "put in the format like (#subgenomes expected for genome X):"
                 "(#subgenomes expected for genome Y) "
                 "[default: %default]")
    p.add_option("--Nm", dest="Nmax", type="int", default=10,
            help="distance cutoff to tolerate two blocks that are "
                 "slightly overlapping (cutoff for `quota mapping`) "
                 "[default: %default units (gene or bp dist)]")

    supported_solvers = ("SCIP", "GLPK")
    p.add_option("--self", dest="self_match",
            action="store_true", default=False,
            help="you might turn this on when screening paralogous blocks, "
                 "esp. if you have reduced mirrored blocks into non-redundant set")
    p.add_option("--solver", default="SCIP", choices=supported_solvers,
            help="use MIP solver [default: %default]")
    p.add_option("--verbose", action="store_true",
            default=False, help="show verbose solver output")

    p.add_option("--screen", default=False, action="store_true",
            help="generate new anchors file [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    qa_file, = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # sanity check for the quota
    if opts.quota:
        try:
            qa, qb = opts.quota.split(":")
            qa, qb = int(qa), int(qb)
        except ValueError:
            # Both a wrong number of ':' fields and non-integer fields raise
            # ValueError; anything else is no longer swallowed by a bare except.
            sys.stderr.write(
                "quota string should be the form x:x (2:4, 1:3, etc.)\n")
            sys.exit(1)

        if opts.self_match and qa != qb:
            # Call form of `raise`, valid on both Python 2 and 3.
            raise Exception("when comparing genome to itself, "
                            "quota must be the same number "
                            "(like 1:1, 2:2) you have %s" % opts.quota)
        quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(clusters, quota, work_dir=work_dir,
                            Nmax=opts.Nmax, self_match=self_match,
                            solver=opts.solver, verbose=opts.verbose)

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    # One comma-separated line, as `print >> fw` used to produce.
    fw.write(",".join(str(x) for x in selected_ids) + "\n")
    fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
def blastfilter_main(blast_file, p, opts):
    """
    Filter the hits of `blast_file` and write the survivors to
    `<blast_file>.filtered`.

    Hits are visited by descending score. Self hits, hits whose genes are
    missing from the BED orderings, and repeated gene pairs are dropped. The
    optional exclusion (`opts.exclude`), C-score (`opts.cscore`) and local-dup
    (`opts.tandem_Nmax`) filters are then applied in that order.
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore
    exclude = opts.exclude

    # Count non-comment lines; `with` closes the handle, which used to leak.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != "#")
    logging.debug(
        "Load BLAST file `{}` (total {} lines)".format(blast_file, total_lines)
    )
    bl = Blast(blast_file)
    blasts = sorted(list(bl), key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        if query not in qorder:
            # Report the first 100 misses, then a single suppression notice.
            if nwarnings < 100:
                logging.warning("{} not in {}".format(query, qbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue
        if subject not in sorder:
            if nwarnings < 100:
                logging.warning("{} not in {}".format(subject, sbed.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        # Hits are sorted by score, so the first hit seen for a pair is kept.
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = [str(k) for k in key]

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if exclude:
        before_filter = len(filtered_blasts)
        logging.debug("running excluded pairs (--exclude `{}`) ..".format(exclude))
        filtered_blasts = list(filter_exclude(filtered_blasts, exclude=exclude))
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    if tandem_Nmax:
        logging.debug(
            "running the local dups filter (tandem_Nmax={}) ..".format(tandem_Nmax)
        )

        qtandems = tandem_grouper(filtered_blasts, flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(filtered_blasts, flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = (
            open(op.splitext(opts.qbed)[0] + ".localdups", "w")
            if opts.tandems_only
            else None
        )

        if is_self:
            # Self comparison: merge both tandem groupings into one.
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
            sdups_fh = (
                open(op.splitext(opts.sbed)[0] + ".localdups", "w")
                if opts.tandems_only
                else None
            )
            sdups_to_mother = write_localdups(standems, sbed, sdups_fh)

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

            # just want to use this script as a tandem finder.
            # sys.exit()

        before_filter = len(filtered_blasts)
        filtered_blasts = list(
            filter_tandem(filtered_blasts, qdups_to_mother, sdups_to_mother)
        )
        logging.debug(
            "after filter ({}->{}) ..".format(before_filter, len(filtered_blasts))
        )

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
def pad(args):
    """
    %prog pad blastfile cdtfile --qbed q.pad.bed --sbed s.pad.bed

    Test and reconstruct candidate PADs.
    """
    from jcvi.formats.cdt import CDT

    p = OptionParser(pad.__doc__)
    p.set_beds()
    p.add_option("--cutoff", default=.3, type="float",
                 help="The clustering cutoff to call similar [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cutoff = opts.cutoff
    blastfile, cdtfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    cdt = CDT(cdtfile)
    qparts = list(cdt.iter_partitions(cutoff=cutoff))
    sparts = list(cdt.iter_partitions(cutoff=cutoff, gtr=False))

    # Map each partition member to the index of its partition.
    qid, sid = {}, {}
    for i, part in enumerate(qparts):
        qid.update(dict((x, i) for x in part))
    for i, part in enumerate(sparts):
        sid.update(dict((x, i) for x in part))

    # Without writing files, conversion from PAD to merged PAD is done in memory
    for q in qbed:
        q.seqid = qid[q.seqid]
    for s in sbed:
        s.seqid = sid[s.seqid]

    qnames = range(len(qparts))
    snames = range(len(sparts))

    logmp = make_arrays(blastfile, qbed, sbed, qnames, snames)
    m, n = logmp.shape

    pvalue_cutoff = 1e-30
    # Kept apart from the clustering `cutoff` above, which it used to overwrite.
    min_score = -log(pvalue_cutoff)

    significant = []
    # `xrange` is Python 2 only; `range` works on both major versions.
    for i in range(m):
        for j in range(n):
            score = logmp[i, j]
            if score < min_score:
                continue
            significant.append((qparts[i], sparts[j], score))

    for a, b, score in significant:
        print("|".join(a), "|".join(b), score)

    logging.debug("Collected {0} PAR comparisons significant at (P < {1}).".
                  format(len(significant), pvalue_cutoff))

    return significant
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments and form PAD. This is the method described in Tang et
    al. (2010) PNAS paper. The anchorfile defines a list of synteny blocks,
    based on which the genome on one or both axis can be chopped up into pieces
    and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option("--minsize", default=10, type="int",
                 help="Only segment using blocks >= size [default: %default]")
    p.add_option("--path", default="~/scratch/bin",
                 help="Path to the CLUSTER 3.0 binary [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    # `block_id` replaces the old local `id`, which shadowed the builtin.
    block_id = 0
    for block in ac.iter_blocks(minsize=minsize):
        # zip() is a non-subscriptable iterator on Python 3; materialise it.
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        block_id += 1

        # One range per block and axis, covering its span in gene order.
        qr = Range("0", minq, maxq, maxq - minq, block_id)
        sr = Range("0", mins, maxs, maxs - mins, block_id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    # `with` closes the matrix file even if writing a row fails.
    with open(matrixfile, "w") as fw:
        header = ["o"] + spadnames
        print("\t".join(header), file=fw)
        for i in range(m):  # `xrange` does not exist on Python 3
            row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
            print("\t".join(row), file=fw)

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile = args[0]
    switch, scale = opts.switch, opts.scale

    # Query gene -> names of the subject genes anchored to it.
    pairs = defaultdict(list)
    for a, b, block_id in AnchorFile(anchorsfile).iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid = q.seqid
        qstart = q.start
        qend = q.end
        qaccn = q.accn
        if qaccn not in pairs:
            continue

        # NOTE(review): nesting reconstructed from a flattened source; with
        # this layout only the last partner's coordinates are carried forward.
        for partner in pairs[qaccn]:
            si, s = sorder[partner]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn

        if switch:
            # Exchange the roles of the query and subject coordinates.
            (qseqid, qstart, qend, qaccn,
             sseqid, sstart, send, saccn) = (sseqid, sstart, send, saccn,
                                             qseqid, qstart, qend, qaccn)
        if scale:
            sstart /= scale

        try:
            newsseqid = get_number(sseqid)
        except ValueError:
            message = "`{0}` is on `{1}` with no number to extract".format(
                saccn, sseqid)
            raise ValueError(message)

        name = "{0}:{1}".format(newsseqid, sstart)
        fields = (qseqid, qstart - 1, qend, name)
        bd.add("\t".join(str(x) for x in fields))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def dotplot_main(args):
    """
    Parse the dot plot command line, load the BED files and anchors, and save
    the rendered figure. Usage text comes from the module docstring.
    """
    p = OptionParser(__doc__)
    p.set_beds()
    p.add_option(
        "--synteny",
        default=False,
        action="store_true",
        help="Run a fast synteny scan and display blocks",
    )
    p.add_option("--cmaptext", help="Draw colormap box on the bottom-left corner")
    p.add_option(
        "--vmin",
        dest="vmin",
        type="float",
        default=0,
        help="Minimum value in the colormap",
    )
    p.add_option(
        "--vmax",
        dest="vmax",
        type="float",
        default=2,
        help="Maximum value in the colormap",
    )
    p.add_option(
        "--nmax",
        dest="sample_number",
        type="int",
        default=10000,
        help="Maximum number of data points to plot",
    )
    p.add_option(
        "--minfont",
        type="int",
        default=4,
        help="Do not render labels with size smaller than",
    )
    p.add_option("--colormap", help="Two column file, block id to color mapping")
    p.add_option(
        "--colororientation",
        action="store_true",
        default=False,
        help="Color the blocks based on orientation, similar to mummerplot",
    )
    p.add_option(
        "--nosort",
        default=False,
        action="store_true",
        help="Do not sort the seqids along the axes",
    )
    p.add_option("--nosep", default=False, action="store_true",
                 help="Do not add contig lines")
    p.add_option("--title", help="Title of the dot plot")
    p.set_dotplot_opts()
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(args, figsize="9x9", style="dark",
                                            dpi=90, cmap="copper")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (anchorfile, ) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts,
                                                     sorted=(not opts.nosort))

    # An explicit colormap file takes precedence over orientation colouring.
    palette = opts.colormap
    if palette:
        palette = Palette(palettefile=palette)
    elif opts.colororientation:
        palette = Palette.from_block_orientation(anchorfile, qbed, sbed)

    cmaptext = opts.cmaptext
    if anchorfile.endswith(".ks"):
        from jcvi.apps.ks import KsFile

        logging.debug("Anchors contain Ks values")
        # Ks input gets a default colormap label unless one was supplied.
        cmaptext = cmaptext or "*Ks* values"
        anchorksfile = anchorfile + ".anchors"
        if need_update(anchorfile, anchorksfile):
            ksfile = KsFile(anchorfile)
            ksfile.print_to_anchors(anchorksfile)
        anchorfile = anchorksfile

    if opts.skipempty:
        # Keep only the seqids that take part in at least one anchor pair.
        ac = AnchorFile(anchorfile)
        if is_self:
            qseqids = sseqids = set()  # one shared set for a self comparison
        else:
            qseqids, sseqids = set(), set()

        for pair in ac.iter_pairs():
            q, s = pair[:2]
            qi, q = qorder[q]
            si, s = sorder[s]
            qseqids.add(q.seqid)
            sseqids.add(s.seqid)

        if is_self:
            qbed = sbed = subset_bed(qbed, qseqids)
        else:
            qbed = subset_bed(qbed, qseqids)
            sbed = subset_bed(sbed, sseqids)

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the dot plot

    dotplot(
        anchorfile,
        qbed,
        sbed,
        fig,
        root,
        ax,
        vmin=opts.vmin,
        vmax=opts.vmax,
        is_self=is_self,
        synteny=opts.synteny,
        # Resolved label, not the raw option: the Ks default computed above
        # used to be silently discarded.
        cmap_text=cmaptext,
        cmap=iopts.cmap,
        genomenames=opts.genomenames,
        sample_number=opts.sample_number,
        minfont=opts.minfont,
        palette=palette,
        sep=(not opts.nosep),
        sepcolor=set1[int(opts.theme)],
        title=opts.title,
        stdpf=(not opts.nostdpf),
        chpf=(not opts.nochpf),
    )

    image_name = opts.outfile or (op.splitext(anchorfile)[0] + "." + opts.format)
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added there.
    #
    # Expects three positional arguments (ploidy, anchorsfile, blastfile).
    # Writes `<blast prefix>.cscore` via cscore() and `<blast prefix>.weights`
    # with one tab-separated `a  b  weight` row per retained pair.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true",
                 help="Disable RBH hits [default: %default]")
    p.add_option("--pctid", default=0, type="int",
                 help="Percent id cutoff for RBH hits [default: %default]")
    p.add_option("--cscore", default=90, type="int",
                 help="C-score cutoff for RBH hits [default: %default]")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first column of each ploidy-file line to its line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers close both handles even if a row fails to parse.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs absent from the anchors file must pass the RBH filters.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # This severely penalizes RBH against synteny. Floor division
                # keeps the weight an integer on Python 2 and Python 3 alike.
                c //= 10

            # fw.write instead of the Python 2-only `print >>` statement.
            fw.write("\t".join((a, b, str(c))) + "\n")
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def omgprepare(args):
    """
    %prog omgprepare ploidy anchorsfile blastfile

    Prepare to run Sankoff's OMG algorithm to get orthologs.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added there.
    #
    # Expects three positional arguments (ploidy, anchorsfile, blastfile).
    # Writes `<blast prefix>.cscore` via cscore() and `<blast prefix>.weights`
    # with one tab-separated `a  b  weight` row per retained pair.
    from jcvi.formats.blast import cscore
    from jcvi.formats.base import DictFile

    p = OptionParser(omgprepare.__doc__)
    p.add_option("--norbh", action="store_true", help="Disable RBH hits")
    p.add_option(
        "--pctid", default=0, type="int", help="Percent id cutoff for RBH hits"
    )
    p.add_option("--cscore", default=90, type="int", help="C-score cutoff for RBH hits")
    p.set_stripnames()
    p.set_beds()

    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ploidy, anchorfile, blastfile = args
    norbh = opts.norbh
    pctid = opts.pctid
    cs = opts.cscore
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)

    # Map the first column of each ploidy-file line to its line index.
    with open(ploidy) as fp:
        genomeidx = dict((x.split()[0], i) for i, x in enumerate(fp))

    ploidy = DictFile(ploidy)

    geneinfo(qbed, qorder, genomeidx, ploidy)
    geneinfo(sbed, sorder, genomeidx, ploidy)

    pf = blastfile.rsplit(".", 1)[0]
    cscorefile = pf + ".cscore"
    cscore([blastfile, "-o", cscorefile, "--cutoff=0", "--pct"])
    ac = AnchorFile(anchorfile)
    pairs = set((a, b) for a, b, i in ac.iter_pairs())
    logging.debug("Imported {0} pairs from `{1}`.".format(len(pairs), anchorfile))

    weightsfile = pf + ".weights"
    npairs = 0
    # Context managers close both handles even if a row fails to parse.
    with open(cscorefile) as fp, open(weightsfile, "w") as fw:
        for row in fp:
            a, b, c, pct = row.split()
            c, pct = float(c), float(pct)
            c = int(c * 100)
            if (a, b) not in pairs:
                # Pairs absent from the anchors file must pass the RBH filters.
                if norbh:
                    continue
                if c < cs:
                    continue
                if pct < pctid:
                    continue
                # This severely penalizes RBH against synteny. `//=` keeps the
                # weight an integer; `/=` is true division on Python 3 and
                # would emit values such as "9.5" after the int() cast above.
                c //= 10

            print("\t".join((a, b, str(c))), file=fw)
            npairs += 1

    logging.debug("Write {0} pairs to `{1}`.".format(npairs, weightsfile))
def ancestral(args):
    """
    %prog ancestral vplanifoliaA.vplanifoliaA.anchors > vplanifoliaA_blocks.bed

    Paint 14 chromosomes following alpha WGD.
    """
    # The docstring above is also the usage text given to OptionParser, so
    # it is left untouched; explanatory notes are written as comments.
    parser = OptionParser(ancestral.__doc__)
    parser.set_beds()

    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    anchorsfile = args[0]
    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, parser, opts)

    # Chromosome-number pairs of interest, each stored as (smaller, larger).
    wanted_pairs = {
        (1, 1),
        (1, 6),
        (1, 8),
        (1, 13),
        (2, 4),
        (3, 12),
        (3, 14),
        (5, 6),
        (5, 8),
        (7, 9),
        (7, 11),
        (9, 10),
        (10, 11),
    }

    def get_target(achr, bchr):
        # Bail out only when neither seqid mentions "chr".
        if not ("chr" in achr or "chr" in bchr):
            return None
        num_a, num_b = get_number(achr), get_number(bchr)
        # Normalise the order so the lookup matches how pairs are stored.
        ordered = (num_b, num_a) if num_a > num_b else (num_a, num_b)
        return ordered if ordered in wanted_pairs else None

    def build_bedline(first, last, pair):
        # Label the region with the first pair member found in this fixed
        # list of chromosome numbers.
        labels = [str(n) for n in pair if n in (1, 2, 3, 5, 7, 10)]
        label = labels[0]
        fields = (first.seqid, first.start, last.end, label)
        return "\t".join(str(f) for f in fields)

    # Walk every block; blocks that land on a wanted chromosome pair
    # contribute one BED line for the query span and one for the subject span.
    anchors = AnchorFile(anchorsfile)
    all_blocks = anchors.blocks
    outbed = Bed()
    for block in all_blocks:
        q_names, s_names, _scores = zip(*block)
        q_hits = [qorder[name] for name in q_names]
        s_hits = [sorder[name] for name in s_names]

        # Each hit is indexed at [1] for the feature; min/max give the
        # extreme entries of the block on each side.
        q_first, q_last = min(q_hits)[1], max(q_hits)[1]
        s_first, s_last = min(s_hits)[1], max(s_hits)[1]

        pair = get_target(q_first.seqid, s_first.seqid)
        if pair is None:
            continue

        outbed.add(build_bedline(q_first, q_last, pair))
        outbed.add(build_bedline(s_first, s_last, pair))

    outbed.print_to_file(sorted=True)
def main(args):
    """
    Command-line entry point: run `quota mapping` screening on an anchors file.

    Args:
        args: list of command-line arguments. Exactly one positional
            argument (the anchors file) must remain after option parsing;
            otherwise the help text is printed and the process exits.

    Raises:
        Exception: when ``--self`` is given and the two quota numbers differ.

    Side effects:
        Exits with status 1 when ``--quota`` is not of the form ``x:y`` with
        integer x and y. Writes the selected block ids, comma separated, to
        ``<anchors prefix>.<qa>x<qb>``; with ``--screen`` it also calls
        ``screen`` to produce ``<anchors prefix>.<qa>x<qb>.anchors``.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option(
        "--quota",
        default="1:1",
        help=(
            "`quota mapping` procedure -- screen blocks to constrain mapping"
            " (useful for orthology), "
            "put in the format like (#subgenomes expected for genome X):"
            "(#subgenomes expected for genome Y) "
            "[default: %default]"
        ),
    )
    p.add_option(
        "--Nm",
        dest="Nmax",
        type="int",
        default=10,
        help=(
            "distance cutoff to tolerate two blocks that are "
            "slightly overlapping (cutoff for `quota mapping`) "
            "[default: %default units (gene or bp dist)]"
        ),
    )

    supported_solvers = ("SCIP", "GLPK")
    p.add_option(
        "--self",
        dest="self_match",
        action="store_true",
        default=False,
        help=(
            "you might turn this on when screening paralogous blocks, "
            "esp. if you have reduced mirrored blocks into non-redundant set"
        ),
    )
    p.add_option(
        "--solver",
        default="SCIP",
        choices=supported_solvers,
        help="use MIP solver [default: %default]",
    )
    p.set_verbose(help="Show verbose solver output")

    p.add_option(
        "--screen",
        default=False,
        action="store_true",
        help="generate new anchors file [default: %default]",
    )

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (qa_file,) = args
    qbed, sbed, qorder, sorder, is_self = check_beds(qa_file, p, opts)

    # Sanity check for the quota. An empty value is routed through the same
    # error path instead of leaving qa/qb/quota unbound further down.
    try:
        qa, qb = (opts.quota or "").split(":")
        qa, qb = int(qa), int(qb)
    except ValueError:
        # ValueError covers both a wrong number of fields and non-integers;
        # a bare except would also swallow SystemExit/KeyboardInterrupt.
        sys.stderr.write("quota string should be the form x:x (2:4, 1:3, etc.)\n")
        sys.exit(1)

    if opts.self_match and qa != qb:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise Exception(
            "when comparing genome to itself, "
            "quota must be the same number "
            "(like 1:1, 2:2) you have %s" % opts.quota
        )
    quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(
        clusters,
        quota,
        work_dir=work_dir,
        Nmax=opts.Nmax,
        self_match=self_match,
        solver=opts.solver,
        verbose=opts.verbose,
    )

    logging.debug("Selected {0} blocks.".format(len(selected_ids)))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{0}x{1}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    try:
        # fw.write instead of the Python 2-only `print >>` statement.
        fw.write(",".join(str(x) for x in selected_ids) + "\n")
    finally:
        fw.close()
    logging.debug("Screened blocks ids written to `{0}`.".format(outfile))

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        # Forward the BED files only when both were given explicitly.
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)