def anchor(args):
    """
    %prog anchor map.bed markers.blast > anchored.bed

    Anchor scaffolds based on map.
    """
    # The docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.blast import bed

    p = OptionParser(anchor.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mapbed, blastfile = args
    # bed() yields a BED input for the marker hits; `.order` is indexed by
    # record name and gives (index, record) pairs.
    bedfile = bed([blastfile])
    markers = Bed(bedfile).order

    for b in Bed(mapbed, sorted=False):
        m = b.accn
        if m not in markers:
            # Markers without a hit are dropped from the output.
            continue

        _, mb = markers[m]
        # Replace the marker name by the location of its hit.
        b.accn = "{0}:{1}-{2}".format(mb.seqid, mb.start, mb.end)
        print(b)
def paste(args):
    """
    %prog paste flanks.bed flanks_vs_assembly.blast backbone.fasta

    Paste in good sequences in the final assembly.
    """
    from jcvi.formats.bed import uniq

    parser = OptionParser(paste.__doc__)
    parser.add_option(
        "--maxsize",
        default=300000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    parser.add_option("--prefix", help="Prefix of the new object")
    parser.set_rclip(rclip=1)
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    pbed, blastfile, bbfasta = args
    flank_order = Bed(pbed).order

    # --maxsize is the max DNA size allowed to replace a gap.
    first_bed, second_bed = blast_to_twobeds(
        blastfile,
        flank_order,
        log=True,
        rclip=opts.rclip,
        maxsize=opts.maxsize,
        flipbeds=True,
    )
    # Only the first of the two returned BED files goes through uniq().
    first_bed = uniq([first_bed])

    shuffle_twobeds(Bed(first_bed), Bed(second_bed), bbfasta, prefix=opts.prefix)
def check_beds(hintfile, p, opts):
    """
    Load the query and subject BED files for a pairwise comparison.

    When --qbed and --sbed are not both set, their names are guessed from the
    first two dot-separated fields of the basename of `hintfile`
    (`q.s.<rest>` -> `q.bed` and `s.bed` in the same directory) and stored
    back on `opts`. If the basename has fewer than two fields, the help is
    printed and the process exits.

    Returns a tuple (qbed, sbed, qorder, sorder, is_self); `is_self` is True
    when both options name the same file.
    """
    wd, hintfile = op.split(hintfile)
    if not (opts.qbed and opts.sbed):
        try:
            q, s = hintfile.split(".", 2)[:2]
        except ValueError:
            # Only the unpacking can fail here; a bare `except:` would also
            # have swallowed KeyboardInterrupt and SystemExit.
            print("Options --qbed and --sbed are required", file=sys.stderr)
            sys.exit(not p.print_help())

        opts.qbed = op.join(wd, q + ".bed")
        opts.sbed = op.join(wd, s + ".bed")
        logging.debug(
            "Assuming --qbed={0} --sbed={1}".format(opts.qbed, opts.sbed)
        )

    qbed_file, sbed_file = opts.qbed, opts.sbed
    # is this a self-self blast?
    is_self = qbed_file == sbed_file
    if is_self:
        logging.debug("Looks like self-self comparison.")

    qbed = Bed(opts.qbed)
    sbed = Bed(opts.sbed)
    qorder = qbed.order
    sorder = sbed.order

    return qbed, sbed, qorder, sorder, is_self
def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from q/s genome will
    be included. However one of --qchrs and --schrs must be specified.
    Otherwise the script will do nothing.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--qchrs", default=None,
                 help="query chrs to extract, comma sep [default: %default]")
    p.add_option("--schrs", default=None,
                 help="subject chrs to extract, comma sep [default: %default]")
    p.add_option("--convert", default=False, action="store_true",
                 help="convert accns to chr_rank [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    # Explicit check rather than `assert`, which is stripped under -O.
    if not (qchrs or schrs):
        sys.exit(not p.print_help())
    convert = opts.convert

    # Each BED file is parsed once and reused for both seqids and order.
    qbed = Bed(qbedfile)
    sbed = Bed(sbedfile)

    # The output name is <blastfile>.[<qchrs>.][<schrs>.]blast
    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(qbed.seqids)
    if schrs:
        schrs = set(schrs.split(","))
        if qbedfile != sbedfile or qchrs != schrs:
            # sorted() keeps the file name stable; set iteration order is not.
            outfile += ",".join(sorted(schrs)) + "."
    else:
        schrs = set(sbed.seqids)
    outfile += "blast"

    qo = qbed.order
    so = sbed.order

    fw = must_open(outfile, "w")
    try:
        for b in Blast(blastfile):
            qi, qrec = qo[b.query]
            si, srec = so[b.subject]
            if qrec.seqid not in qchrs or srec.seqid not in schrs:
                continue
            if convert:
                # Rename to <seqid>_<zero-padded index in the BED order>.
                b.query = qrec.seqid + "_" + "{0:05d}".format(qi)
                b.subject = srec.seqid + "_" + "{0:05d}".format(si)
            print(b, file=fw)
    finally:
        fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    # NOTE(review): --backbone is parsed but nothing below reads it; the only
    # consumer was a helper lambda that was never called.
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    def accn_seqid(rec):
        # Records are grouped by the seqid parsed out of their name.
        return range_parse(rec.accn).seqid

    bed_fn = pf + ".patchers.bed"
    with open(bed_fn, "w") as bed_fw:
        for _, sb in groupby(uniqbed, key=accn_seqid):
            seqid, start, end, strand = merge_ranges(list(sb))
            fields = (seqid, start, end, opts.object, 1000, strand)
            print("\t".join(str(x) for x in fields), file=bed_fw)
def geneinfo(args):
    """
    %prog geneinfo pineapple.20141004.bed liftover.bed pineapple.20150413.bed \
                   note.txt interproscan.txt

    Build gene info table from various sources. The three beds contain
    information on the original scaffolds, linkage groups, and final selected
    loci (after removal of TEs and split loci). The final two text files contain
    AHRD and domain data.
    """
    p = OptionParser(geneinfo.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    scfbed, liftoverbed, lgbed, note, ipr = args
    note = DictFile(note, delimiter="\t")
    scfbed = Bed(scfbed)
    lgorder = Bed(lgbed).order
    liftover = Bed(liftoverbed).order
    header = ("Accession Scaffold-position LG-position "
              "Description Interpro-domain Interpro-description "
              "GO-term KEGG".split())
    ipr = read_interpro(ipr)

    fw_clean = must_open("master.txt", "w")
    fw_removed = must_open("master-removed.txt", "w")
    try:
        for fw in (fw_clean, fw_removed):
            print("\t".join(header), file=fw)

        for b in scfbed:
            accession = b.accn
            scaffold_position = b.tag
            if accession in liftover:
                lg_position = liftover[accession][-1].tag
            else:
                lg_position = "split"
            # Genes missing from the final BED go to the "removed" table.
            fw = fw_clean if accession in lgorder else fw_removed
            description = note[accession]
            interpro = interpro_description = go = kegg = ""
            if accession in ipr:
                interpro, interpro_description, go, kegg = ipr[accession]
            print(
                "\t".join((
                    accession,
                    scaffold_position,
                    lg_position,
                    description,
                    interpro,
                    interpro_description,
                    go,
                    kegg,
                )),
                file=fw,
            )
    finally:
        # Close both tables; closing only the last-used handle could leave
        # the other one unflushed.
        fw_clean.close()
        fw_removed.close()
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. blastfile defines the matchings
    between the evidences vs scaffolds. Then the evidence sizes, and evidence
    bed to plot dot plots.

    This script will plot a dot in the dot plot in the corresponding location
    the plots are one contig/scaffold per plot.
    """
    from jcvi.graphics.base import set_image_options
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option("--cutoff", type="int", default=1000000,
                 help="Plot scaffolds with size larger than [default: %default]")
    p.add_option("--highlights",
                 help="A set of regions in BED format to highlight [default: %default]")
    opts, args, iopts = set_image_options(p, args, figsize="14x8", dpi=150)

    # One scaffold FASTA followed by one or more (blast, sizes, bed) trios.
    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(3, args[1:]))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    hlbed = Bed(highlights) if highlights else None

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                      thousands(scafsize)))

        # Single-entry sizes file describing just this scaffold.
        tmpname = scaffoldID + ".sizes"
        with open(tmpname, "w") as tmp:
            tmp.write("{0}\t{1}".format(scaffoldID, scafsize))

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        # Always bind a value: without --highlights this name used to be
        # undefined and the call below raised NameError. An empty list is
        # what the --highlights path yields when nothing matches.
        if hlbed is not None:
            subhighlights = list(hlbed.sub_bed(scaffoldID))
        else:
            subhighlights = []

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(scaffoldID, tmpsizes, None, trios, imagename, iopts,
                          highlights=subhighlights)
def tips(args):
    """
    %prog tips patchers.bed complements.bed original.fasta backbone.fasta

    Append telomeric sequences based on patchers and complements.
    """
    parser = OptionParser(tips.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 4:
        sys.exit(not parser.print_help())

    pbedfile, cbedfile, sizesfile, bbfasta = args

    patchers = Bed(pbedfile, sorted=False)
    complement_bed = Bed(cbedfile, sorted=False)

    # Complement records bucketed by seqid; a later run with the same seqid
    # overwrites an earlier one.
    complements = {}
    for seqid, group in groupby(complement_bed, key=lambda x: x.seqid):
        complements[seqid] = list(group)

    sizes = Sizes(sizesfile).mapping
    bbsizes = Sizes(bbfasta).mapping

    tbeds = []
    for name, group in groupby(patchers, key=lambda x: x.accn):
        members = list(group)
        first, last = members[0], members[-1]

        # An end needs no tip when the patcher already reaches position 1 /
        # the full length of its sequence.
        start_id = None if first.start == 1 else first.seqid
        end_id = None if last.end == sizes[last.seqid] else last.seqid
        print(name, start_id, end_id, file=sys.stderr)

        if start_id:
            head = complements[start_id][0]
            head.accn = name
            tbeds.append(head)

        backbone = (name, 0, bbsizes[name], name, 1000, "+")
        tbeds.append(BedLine("\t".join(str(x) for x in backbone)))

        if end_id:
            tail = complements[end_id][-1]
            tail.accn = name
            tbeds.append(tail)

    tbed = Bed()
    tbed.extend(tbeds)
    tbed.print_to_file("tips.bed")
def liftover(args):
    """
    %prog liftover agpfile bedfile

    Given coordinates in components, convert to the coordinates in chromosomes.
    """
    p = OptionParser(liftover.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Prepend prefix to accn names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # `not None` is True, so a usage error exits non-zero like the other
        # commands; sys.exit(p.print_help()) exited with status 0.
        sys.exit(not p.print_help())

    agpfile, bedfile = args
    agp = AGP(agpfile).order
    bed = Bed(bedfile)
    newbed = Bed()
    for b in bed:
        component = b.seqid
        if component not in agp:
            # Not a known component: keep the record unchanged.
            newbed.append(b)
            continue

        _, a = agp[component]

        assert a.component_beg < a.component_end
        arange = a.component_beg, a.component_end
        assert b.start < b.end
        brange = b.start, b.end

        # Keep only the part of the feature inside the used component span.
        st = range_intersect(arange, brange)
        if not st:
            continue
        start, end = st
        assert start <= end

        if a.orientation == "-":
            # Reverse-oriented component: coordinates are mirrored.
            d = a.object_end + a.component_beg
            s, t = d - end, d - start
        else:
            d = a.object_beg - a.component_beg
            s, t = d + start, d + end

        name = b.accn.replace(" ", "_")
        if opts.prefix:
            name = component + "_" + name
        bline = "\t".join(str(x) for x in (a.object, s - 1, t, name))
        newbed.append(BedLine(bline))

    newbed.print_to_file(sorted=True)
def check_beds(hintfile, p, opts, sorted=True):
    """
    Load the query and subject BED files for a pairwise comparison.

    The file names come from get_bed_filenames(); the BED objects are built
    from `opts.qbed` / `opts.sbed`, with `sorted` forwarded to Bed().

    Returns a tuple (qbed, sbed, qorder, sorder, is_self); `is_self` is True
    when both file names are equal.
    """
    qbed_file, sbed_file = get_bed_filenames(hintfile, p, opts)

    # Identical names mean a genome is being compared with itself.
    is_self = qbed_file == sbed_file
    if is_self:
        logging.debug("Looks like self-self comparison.")

    qbed, sbed = (Bed(path, sorted=sorted) for path in (opts.qbed, opts.sbed))

    return qbed, sbed, qbed.order, sbed.order, is_self
def breakpoint(args):
    """
    %prog breakpoint blastfile bedfile

    Identify breakpoints where collinearity ends. `blastfile` contains mapping
    from markers (query) to scaffolds (subject). `bedfile` contains marker
    locations in the related species.
    """
    from jcvi.formats.blast import bed
    from jcvi.utils.range import range_interleave

    p = OptionParser(breakpoint.__doc__)
    p.add_option("--xdist", type="int", default=20,
                 help="xdist (in related genome) cutoff [default: %default]")
    p.add_option("--ydist", type="int", default=200000,
                 help="ydist (in current genome) cutoff [default: %default]")
    p.add_option("-n", type="int", default=5,
                 help="number of markers in a block [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, bedfile = args

    order = Bed(bedfile).order
    blastbedfile = bed([blastfile])
    bbed = Bed(blastbedfile)

    for scaffold, bs in bbed.sub_beds():
        blocks = get_blocks(scaffold, bs, order,
                            xdist=opts.xdist, ydist=opts.ydist, N=opts.n)

        # Each block is a sequence of (x, y) pairs; only its y-extent matters.
        sblocks = []
        for block in blocks:
            _, yy = zip(*block)
            sblocks.append((scaffold, min(yy), max(yy)))

        # range_interleave() supplies the intervals reported as breakpoints;
        # start is shifted by one on output.
        for ch, start, end in range_interleave(sblocks):
            print("{0}\t{1}\t{2}".format(ch, start - 1, end))
def synfind(args):
    """
    %prog synfind all.last *.bed

    Prepare input for SynFind.
    """
    p = OptionParser(synfind.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]

    # Copy the LAST hits, dropping self-hits (query == subject). Both handles
    # are context-managed so they are closed even on a parse error.
    filteredlast = lastfile + ".filtered"
    with open(lastfile) as fp, open(filteredlast, "w") as fw:
        for row in fp:
            b = BlastLine(row)
            if b.query == b.subject:
                continue
            print(b, file=fw)
    logging.debug("Filtered LAST file written to `{0}`".format(filteredlast))

    # Concatenate all BED files, prefixing seqids with A, B, C, ... by
    # position on the command line.
    allbed = "all.bed"
    with open(allbed, "w") as fw:
        for i, bedfile in enumerate(bedfiles):
            prefix = chr(ord("A") + i)
            for b in Bed(bedfile):
                b.seqid = prefix + b.seqid
                print(b, file=fw)
    logging.debug("Bed file written to `{0}`".format(allbed))
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Creates `odir` (overwriting), changes the working directory into it,
    writes an anchors file derived from `lastfile` there and symlinks the
    subject BED file next to it.

    Returns (anchorsfile, qbedfile, contig_to_beds): the anchors file name
    (relative to `odir`), the absolute query BED path, and a dict built from
    the query BED's sub_beds().
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    # Resolve every input path before chdir; a relative `lastfile` would
    # otherwise be looked up inside `odir` below.
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)
    lastfile = op.abspath(lastfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: named after the first two dot-separated fields of the
    # LAST file's basename.
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            print(
                "\t".join(
                    (gene_name(b.query), gene_name(b.subject), str(int(b.score)))
                ),
                file=fw,
            )

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    # NOTE(review): the handle is left open as in the original, in case
    # must_open() can hand back a shared stream - confirm.
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        # Count how many features start / end at each coordinate.
        left_ends = Counter(x.start for x in feats)
        right_ends = Counter(x.end for x in feats)

        selected = []
        for pos, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, pos, "LE-{0}".format(pos), count))
        for pos, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, pos, "RE-{0}".format(pos), count))
        selected.sort()

        # One single-base BED record per selected position.
        for chrom, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            print("\t".join((chrom, str(pos - 1), str(pos), label)), file=fw)
def rename(args):
    """
    %prog rename map markers.blast > renamed.map

    Rename markers according to the new mapping locations.
    """
    from jcvi.formats.blast import bed

    p = OptionParser(rename.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mstmap, blastfile = args
    bedfile = bed([blastfile])
    markers = Bed(bedfile).order

    data = MSTMap(mstmap)
    # Insert the two location columns right after the first header column.
    header = data.header
    header = [header[0]] + ["seqid", "start"] + header[1:]
    print("\t".join(header))

    for b in data:
        m, geno = b.id, b.genotype
        if m not in markers:
            # Markers without a hit are left out of the renamed map.
            continue

        _, mb = markers[m]
        fields = (m, mb.seqid, mb.start, "\t".join(list(geno)))
        print("\t".join(str(x) for x in fields))
def prepare(bedfile):
    """
    Remove prepended tags in gene names.

    Names in a record are separated by ";". A leading `liftOver:`, `GMAP:`
    or bare `:` tag is stripped from each name; names already seen are
    logged and skipped.

    Two files are written, named after `bedfile` with its extension replaced:
    `.a.bed` receives one record per retained name, `.b.bed` receives one
    record per input line with the retained names re-joined by ";".
    """
    pf = bedfile.rsplit(".", 1)[0]
    abedfile = pf + ".a.bed"
    bbedfile = pf + ".b.bed"

    bed = Bed(bedfile)
    seen = set()
    with open(abedfile, "w") as fwa, open(bbedfile, "w") as fwb:
        for b in bed:
            new_accns = []
            for accn in b.accn.split(";"):
                if ":" in accn:
                    method, a = accn.split(":", 1)
                    if method in ("liftOver", "GMAP", ""):
                        accn = a
                if accn in seen:
                    logging.error(
                        "Duplicate id {0} found. Ignored.".format(accn)
                    )
                    continue

                new_accns.append(accn)
                # The same record object is reused with each single name.
                b.accn = accn
                print(b, file=fwa)
                seen.add(accn)

            b.accn = ";".join(new_accns)
            print(b, file=fwb)
def condense(args):
    """
    %prog condense OM.bed

    Merge split alignments in OM bed.
    """
    from itertools import groupby
    from jcvi.assembly.patch import merge_ranges

    parser = OptionParser(condense.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]

    def interval(rec):
        # Consecutive records sharing the same interval form one group.
        return rec.seqid, rec.start, rec.end

    for _, group in groupby(Bed(bedfile, sorted=False), key=interval):
        members = list(group)
        head = members[0]
        seqid, start, end, _strand = merge_ranges(members)
        # The first record of the group carries the merged range as its name.
        head.accn = "{0}:{1}-{2}".format(seqid, start, end)
        print(head)
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq

    parser = OptionParser(gaps.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    ombed, fastafile = args
    ombed = uniq([ombed])

    def om_range(rec):
        # Range taken from the record's own coordinates.
        return rec.seqid, rec.start, rec.end, "+"

    def accn_range(rec):
        # Range parsed out of the record's name.
        parsed = range_parse(rec.accn)
        return parsed.seqid, parsed.start, parsed.end, "+"

    for left, right in pairwise(Bed(ombed)):
        om_left, om_right = om_range(left), om_range(right)
        ch_left, ch_right = accn_range(left), accn_range(right)

        om_dist, _ = range_distance(om_left, om_right, distmode="ee")
        ch_dist, _ = range_distance(ch_left, ch_right, distmode="ee")

        # Report the pair only when at least one distance is positive.
        if om_dist > 0 or ch_dist > 0:
            print(left)
            print(right)
            print(om_dist, ch_dist)
def fuse(args):
    """
    %prog fuse *.bed *.anchors

    Fuse gene orders based on anchors file.
    """
    from jcvi.algorithms.graph import BiGraph

    parser = OptionParser(fuse.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    # Inputs are told apart purely by file extension.
    bedfiles = []
    anchorfiles = []
    for path in args:
        if path.endswith(".bed"):
            bedfiles.append(path)
        if path.endswith(".anchors"):
            anchorfiles.append(path)

    # TODO: Use Markov clustering to sparsify the edges
    families = Grouper()
    for anchorfile in anchorfiles:
        for a, b, _block_id in AnchorFile(anchorfile).iter_pairs():
            families.join(a, b)

    allowed = set(families.keys())
    logging.debug("Total families: {}, Gene members: {}".format(
        len(families), len(allowed)))

    # TODO: Use C++ implementation of BiGraph() when available
    # For now just serialize this to the disk
    for bedfile in bedfiles:
        print_edges(Bed(bedfile, include=allowed), families)
def comparebed(args):
    """
    %prog comparebed AP.chr.bed infer.bed

    Compare the scaffold links indicated in two bed files.
    """
    parser = OptionParser(comparebed.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    first, second = (Bed(path) for path in args)
    # Query in both directions.
    query_links(first, second)
    query_links(second, first)
def layout(args):
    """
    %prog layout omgfile taxa

    Build column formatted gene lists after omgparse(). Use species list
    separated by comma in place of taxa, e.g. "BR,BO,AN,CN"
    """
    p = OptionParser(layout.__doc__)
    p.add_option("--sort",
                 help="Sort layout file based on bedfile [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    omgfile, taxa = args
    listfile = omgfile.rsplit(".", 1)[0] + ".list"
    taxa = taxa.split(",")
    ntaxa = len(taxa)

    data = []
    with open(omgfile) as fp, open(listfile, "w") as fw:
        for line in fp:
            # Each line: comma-separated genes, then their taxon indices.
            genes, idxs = line.split()
            row = ["."] * ntaxa
            ixs = [int(x) for x in idxs.split(",")]
            for gene, idx in zip(genes.split(","), ixs):
                row[idx] = gene
            txs = ",".join(taxa[x] for x in ixs)
            print("\t".join(("\t".join(row), txs)), file=fw)
            data.append(row)

    # Per-taxon gene counts, read column-wise from the rows. This avoids
    # indexing zip(*data), which is an iterator on Python 3, and also copes
    # with an empty input.
    ngenes = []
    for i, tx in enumerate(taxa):
        genes = {row[i].strip("|") for row in data if row[i] != "."}
        ngenes.append((len(genes), tx))

    details = ", ".join("{0} {1}".format(a, b) for a, b in ngenes)
    total = sum(a for a, b in ngenes)
    s = "A list of {0} orthologous families that collectively".format(
        len(data))
    s += " contain a total of {0} genes ({1})".format(total, details)
    print(s, file=sys.stderr)

    # Sort the list file in place on the column after the taxa columns.
    lastcolumn = ntaxa + 1
    cmd = "sort -k{0},{0} {1} -o {1}".format(lastcolumn, listfile)
    sh(cmd)

    logging.debug("List file written to `{0}`.".format(listfile))
    sort = opts.sort
    if sort:
        thread = Bed(sort)
        sort_layout(thread, listfile)
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    p = OptionParser(chimera.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    bed = Bed(bedfile)
    selected = select_bed(bed)
    mapped = defaultdict(set)  # scaffold => chr
    chimerabed = "chimera.bed"
    for b in selected:
        scf = range_parse(b.accn).seqid
        mapped[scf].add(b.seqid)

    nchimera = 0
    with open(chimerabed, "w") as fw:
        for s, chrs in sorted(mapped.items()):
            if len(chrs) == 1:
                continue

            print("=" * 80, file=sys.stderr)
            print("{0} mapped to multiple locations: {1}".format(
                s, ",".join(sorted(chrs))), file=sys.stderr)

            # Collect the ranges (parsed from record names) on this scaffold.
            ranges = []
            for b in selected:
                rr = range_parse(b.accn)
                if rr.seqid == s:
                    print(b, file=sys.stderr)
                    ranges.append(rr)

            # Identify breakpoints
            ranges.sort(key=lambda x: (x.seqid, x.start, x.end))
            for a, b in pairwise(ranges):
                seqid = a.seqid
                if seqid != b.seqid:
                    continue

                # Junction between consecutive ranges, low coordinate first.
                start, end = a.end, b.start
                if start > end:
                    start, end = end, start

                chimeraline = "\t".join(str(x) for x in (seqid, start, end))
                print(chimeraline, file=fw)
                print(chimeraline, file=sys.stderr)
                nchimera += 1

    logging.debug("A total of {0} junctions written to `{1}`.".format(
        nchimera, chimerabed))
def check_beds(p, opts):
    """
    Load the query and subject BED files named by --qbed and --sbed.

    Prints the help and exits when either option is missing.

    Returns a tuple (qbed, sbed, qorder, sorder, is_self); `is_self` is True
    when both options name the same file.
    """
    if not (opts.qbed and opts.sbed):
        print("Options --qbed and --sbed are required", file=sys.stderr)
        sys.exit(not p.print_help())

    qbed_file, sbed_file = opts.qbed, opts.sbed
    # is this a self-self blast?
    is_self = qbed_file == sbed_file
    if is_self:
        logging.debug("Looks like self-self comparison.")

    qbed = Bed(opts.qbed)
    sbed = Bed(opts.sbed)
    qorder = qbed.order
    sorder = sbed.order

    return qbed, sbed, qorder, sorder, is_self
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    # Both outputs are context-managed; they were previously never closed.
    with open(idsfile, "w") as fw_ids, open(contigfile, "w") as fw:
        for ctg, reads in bed.sub_beds():
            ctgseq = contigfasta[ctg]
            ctgline = "##{0} {1} {2} bases, {3}".format(
                ctg, len(reads), len(ctgseq), checksum)

            print(ctg, file=fw_ids)
            print(ctgline, file=fw)
            print(fill(ctgseq.seq), file=fw)

            for b in reads:
                read = b.accn
                strand = b.strand
                readseq = readfasta[read]
                rc = " [RC]" if strand == "-" else ""
                readlen = len(readseq)
                # The read range is written end-first for minus-strand reads.
                rstart, rend = 1, readlen
                if strand == "-":
                    rstart, rend = rend, rstart

                readrange = "{{{0} {1}}}".format(rstart, rend)
                conrange = "<{0} {1}>".format(b.start, b.end)
                readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                    read, rc, readlen, checksum, readrange, conrange)
                print(readline, file=fw)
                print(fill(readseq.seq), file=fw)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    parser = OptionParser(closest.__doc__)
    parser.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping

    # With --om the region is parsed from each record's name instead of its
    # own coordinates.
    ranges = []
    for rec in Bed(candidates):
        region = range_parse(rec.accn) if opts.om else rec
        ranges.append([region.seqid, region.start, region.end])

    granges = [(g.seqid, g.start, g.end) for g in Bed(gapsbed)]

    for region in range_merge(ranges):
        left_gap = range_closest(granges, region)
        right_gap = range_closest(granges, region, left=False)
        seqid = region[0]

        # A gap on a different sequence does not count as a flank.
        if left_gap is not None and left_gap[0] != seqid:
            left_gap = None
        if right_gap is not None and right_gap[0] != seqid:
            right_gap = None

        # Without a flanking gap, extend to the sequence start / end.
        if left_gap is None:
            mmin = 1
        else:
            mmin = left_gap[1]
        if right_gap is None:
            mmax = sizes[seqid]
        else:
            mmax = right_gap[2]

        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed, BedLine
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        if q.accn not in pairs:
            continue

        for saccn in pairs[q.accn]:
            _, s = sorder[saccn]

            # Take fresh values from `q` and `s` for every pair. Swapping
            # variables shared across pairs would make a second pair of the
            # same query start from already-swapped values under --switch.
            # NOTE(review): assumes one output record per anchor pair; the
            # flattened source does not show the nesting - confirm.
            qseqid, qstart, qend = q.seqid, q.start, q.end
            sseqid, sstart = s.seqid, s.start

            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend = s.end

            if scale:
                sstart /= scale

            bedline = "\t".join(
                str(x) for x in (qseqid, qstart - 1, qend,
                                 "{0}:{1}".format(get_number(sseqid), sstart)))
            bd.append(BedLine(bedline))

    bd.print_to_file(filename=opts.outfile, sorted=True)
def bed_store(bedfile):
    """
    Index the names found in a merged BED file by target position.

    The file is first passed through mergeBed(); each resulting record's name
    field is split on ",".

    Returns (reads, reads_r): `reads` maps each name to its "seqid:start"
    target, `reads_r` maps each target to the list of names seen there.
    """
    merged = mergeBed(bedfile, s=True, nms=True, sorted=True)

    reads = {}
    reads_r = defaultdict(list)
    for rec in Bed(merged):
        target = "{0}:{1}".format(rec.seqid, rec.start)
        names = rec.accn.split(",")
        reads.update(dict.fromkeys(names, target))
        reads_r[target].extend(names)

    return reads, reads_r
def make_gff(bed, prefix, fw):
    """
    Write one tab-separated line per record of the BED file `bed` to `fw`.

    Each line holds the seqid (with `prefix` prepended), the record name,
    the start and the end. The number of records written is logged together
    with `fw.name`.
    """
    nfeats = 0
    for nfeats, rec in enumerate(Bed(bed), 1):
        fields = (prefix + rec.seqid, rec.accn, rec.start, rec.end)
        print("\t".join(str(x) for x in fields), file=fw)

    logging.debug(
        "A total of {0} features converted to `{1}`".format(nfeats, fw.name)
    )
def split(args):
    """
    %prog split split.bed evidences.bed predictor1.gff predictor2.gff fastafile

    Split MAKER models by checking against predictors (such as AUGUSTUS and
    FGENESH). For each region covered by a working model. Find out the
    combination of predictors that gives the best accuracy against evidences
    (such as PASA).

    `split.bed` can be generated by pulling out subset from a list of ids
    $ python -m jcvi.formats.base join split.ids working.bed
        --column=0,3 --noheader | cut -f2-7 > split.bed
    """
    from jcvi.formats.bed import Bed

    p = OptionParser(split.__doc__)
    p.add_option("--key", default="Name",
                 help="Key in the attributes to extract predictor.gff [default: %default]")
    p.add_option("--parents", default="match",
                 help="list of features to extract, use comma to separate (e.g."
                 "'gene,mRNA') [default: %default]")
    p.add_option("--children", default="match_part",
                 help="list of features to extract, use comma to separate (e.g."
                 "'five_prime_UTR,CDS,three_prime_UTR') [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    split_bed, evidences_bed, p1_gff, p2_gff, fastafile = args
    parents = opts.parents
    children = opts.children
    key = opts.key

    bed = Bed(split_bed)

    s1 = get_splits(split_bed, p1_gff, parents, key)
    s2 = get_splits(split_bed, p2_gff, parents, key)

    for b in bed:
        query = "{0}:{1}-{2}".format(b.seqid, b.start, b.end)
        b1 = get_accuracy(query, p1_gff, evidences_bed, fastafile, children, key)
        b2 = get_accuracy(query, p2_gff, evidences_bed, fastafile, children, key)
        accn = b.accn
        c1 = "|".join(s1[accn])
        c2 = "|".join(s2[accn])
        ac1 = b1.accuracy
        ac2 = b2.accuracy
        # Ties go to the first predictor; the tag is the GFF file name up to
        # its first dot.
        tag = p1_gff if ac1 >= ac2 else p2_gff
        tag = tag.split(".")[0]

        ac1 = "{0:.3f}".format(ac1)
        ac2 = "{0:.3f}".format(ac2)

        print("\t".join((accn, tag, ac1, ac2, c1, c2)))
def estimate(args):
    """
    %prog estimate gaps.bed all.spans.bed all.mates

    Estimate gap sizes based on mate positions and library insert sizes.
    """
    from collections import defaultdict
    from jcvi.formats.bed import intersectBed_wao
    from jcvi.formats.posmap import MatesFile

    parser = OptionParser(estimate.__doc__)
    parser.add_option("--minlinks", default=3, type="int",
                      help="Minimum number of links to place [default: %default]")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    gapsbed, spansbed, matesfile = args
    mf = MatesFile(matesfile)
    order = Bed(gapsbed).order

    gap2mate = defaultdict(set)
    mate2gap = defaultdict(set)

    for gap, span in intersectBed_wao(gapsbed, spansbed):
        # Only gaps whose span is exactly 100 are considered.
        if gap.span != 100:
            continue

        gapname = gap.accn
        if span is None:
            # No overlapping span: record the gap with an empty mate set.
            gap2mate[gapname] = set()
            continue

        matename = span.accn
        gap2mate[gapname].add(matename)
        mate2gap[matename].add(gapname)

    omgapsbed = "gaps.linkage.bed"
    fw = open(omgapsbed, "w")
    for gapname, mates in sorted(gap2mate.items()):
        _, gaprec = order[gapname]
        nmates = len(mates)
        if nmates >= opts.minlinks:
            print(gapname, mates)
        else:
            # Gaps with too few links go to the linkage file.
            print("{0}\t{1}".format(gaprec, nmates), file=fw)

    fw.close()