def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    from jcvi.formats.bed import uniq

    parser = OptionParser(gaps.__doc__)
    opts, args = parser.parse_args(args)

    # Exactly two positional arguments are required; otherwise show the help
    # and exit with the negated return value of print_help().
    if len(args) != 2:
        sys.exit(not parser.print_help())

    ombed, fastafile = args  # fastafile is accepted but not read here
    bed = Bed(uniq([ombed]))

    def forward(feature):
        # 4-tuple (seqid, start, end, strand) with the strand fixed to "+".
        return (feature.seqid, feature.start, feature.end, "+")

    # Walk consecutive pairs of bed lines and compare their spacing in both
    # coordinate systems: the bed line itself and the range stored in `accn`.
    for a, b in pairwise(bed):
        om_a = forward(a)
        om_b = forward(b)
        parsed_a = range_parse(a.accn)
        parsed_b = range_parse(b.accn)
        ch_a = forward(parsed_a)
        ch_b = forward(parsed_b)

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Only report pairs where at least one of the distances is positive.
        if om_dist <= 0 and ch_dist <= 0:
            continue

        print(a)
        print(b)
        print(om_dist, ch_dist)
def chimera(args):
    """
    %prog chimera bedfile

    Scan the bed file to break scaffolds that multi-maps.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    #
    # Side effects: writes one "seqid<TAB>start<TAB>end" line per junction to
    # `chimera.bed` in the current directory and echoes a report to stderr.
    p = OptionParser(chimera.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile,) = args
    bed = Bed(bedfile)
    selected = select_bed(bed)
    mapped = defaultdict(set)  # scaffold => chr
    hits = defaultdict(list)  # scaffold => [(bed line, parsed accn range)]
    chimerabed = "chimera.bed"

    # Single pass: remember, per scaffold, which chromosomes it maps to and
    # the bed lines / parsed ranges involved, so that the reporting loop does
    # not have to re-scan and re-parse every selected line per scaffold.
    for b in selected:
        rr = range_parse(b.accn)
        mapped[rr.seqid].add(b.seqid)
        hits[rr.seqid].append((b, rr))

    nchimera = 0
    # `with` guarantees the output file is closed even if reporting fails.
    # Plain .write() calls replace the Python 2 `print >> stream` statements,
    # which raise TypeError on Python 3; the emitted text is unchanged.
    with open(chimerabed, "w") as fw:
        for s, chrs in sorted(mapped.items()):
            if len(chrs) == 1:
                continue

            sys.stderr.write("=" * 80 + "\n")
            sys.stderr.write(
                "{0} mapped to multiple locations: {1}\n".format(
                    s, ",".join(sorted(chrs))
                )
            )
            ranges = []
            for b, rr in hits[s]:
                sys.stderr.write(str(b) + "\n")
                ranges.append(rr)

            # Identify breakpoints
            ranges.sort(key=lambda x: (x.seqid, x.start, x.end))
            for a, b in pairwise(ranges):
                seqid = a.seqid
                if seqid != b.seqid:
                    continue

                # The junction spans from the end of one range to the start
                # of the next; swap so that start <= end.
                start, end = a.end, b.start
                if start > end:
                    start, end = end, start

                chimeraline = "\t".join(str(x) for x in (seqid, start, end))
                fw.write(chimeraline + "\n")
                sys.stderr.write(chimeraline + "\n")
                nchimera += 1

    logging.debug(
        "A total of {0} junctions written to `{1}`.".format(nchimera, chimerabed)
    )
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    #
    # Side effects: runs `intersectBed`, and writes `<prefix>.merged.bed` and
    # `<prefix>.patchers.bed`, where <prefix> is the uniq'd backbone bed name
    # up to its first ".".
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    # --backbone is still accepted for command-line compatibility, although
    # its value is not consulted anywhere in this function.
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    def source_seqid(bedline):
        # Group key: seqid of the range encoded in the accn column.
        return range_parse(bedline.accn).seqid

    bed_fn = pf + ".patchers.bed"
    # NOTE(review): groupby only merges *consecutive* lines sharing a key;
    # this relies on uniqbed already being ordered suitably - confirm.
    # `with` closes the file even on error, and .write() replaces the
    # Python 2 `print >> fh` statement, which raises TypeError on Python 3.
    with open(bed_fn, "w") as bed_fw:
        for k, sb in groupby(uniqbed, key=source_seqid):
            sb = list(sb)
            chr, start, end, strand = merge_ranges(sb)
            fields = (chr, start, end, opts.object, 1000, strand)
            bed_fw.write("\t".join(str(x) for x in fields) + "\n")
def patcher(args):
    """
    %prog patcher backbone.bed other.bed

    Given optical map alignment, prepare the patchers. Use --backbone to suggest
    which assembly is the major one, and the patchers will be extracted from
    another assembly.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    #
    # Outputs: `<prefix>.merged.bed` (backbone lines plus the non-overlapping
    # lines of the other bed) and `<prefix>.patchers.bed` (one merged line per
    # group), where <prefix> is the uniq'd backbone bed name up to its first
    # ".".
    from jcvi.formats.bed import uniq

    p = OptionParser(patcher.__doc__)
    p.add_option("--backbone", default="OM",
                 help="Prefix of the backbone assembly [default: %default]")
    p.add_option("--object", default="object",
                 help="New object name [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    backbonebed, otherbed = args
    backbonebed = uniq([backbonebed])
    otherbed = uniq([otherbed])

    pf = backbonebed.split(".")[0]

    # Make a uniq bed keeping backbone at redundant intervals
    cmd = "intersectBed -v -wa"
    cmd += " -a {0} -b {1}".format(otherbed, backbonebed)
    outfile = otherbed.rsplit(".", 1)[0] + ".not." + backbonebed
    sh(cmd, outfile=outfile)

    uniqbed = Bed()
    uniqbedfile = pf + ".merged.bed"
    uniqbed.extend(Bed(backbonebed))
    uniqbed.extend(Bed(outfile))
    uniqbed.print_to_file(uniqbedfile, sorted=True)

    # Condense adjacent intervals, allow some chaining
    bed = uniqbed
    key = lambda x: range_parse(x.accn).seqid
    bed_fn = pf + ".patchers.bed"
    # The context manager closes the output even if merging fails part-way.
    # An explicit write() is used instead of the Python 2 `print >> fh` form,
    # which raises TypeError on Python 3; the bytes written are the same.
    with open(bed_fn, "w") as bed_fw:
        for k, sb in groupby(bed, key=key):
            sb = list(sb)
            chr, start, end, strand = merge_ranges(sb)
            line = "\t".join(
                str(x) for x in (chr, start, end, opts.object, 1000, strand)
            )
            bed_fw.write(line + "\n")
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ombed, fastafile = args  # fastafile is accepted but not read here
    ombed = uniq([ombed])
    bed = Bed(ombed)

    # Compare every pair of consecutive bed lines in two coordinate systems:
    # the bed line's own interval (om_*) and the range stored in accn (ch_*).
    for a, b in pairwise(bed):
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Skip pairs where neither distance is positive.
        if om_dist <= 0 and ch_dist <= 0:
            continue

        # Single-argument print(...) calls produce the same text on Python 2
        # and Python 3; the bare `print x` statements were a SyntaxError on 3.
        print(a)
        print(b)
        print("{0} {1}".format(om_dist, ch_dist))
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    parser = OptionParser(closest.__doc__)
    parser.add_option(
        "--om",
        default=False,
        action="store_true",
        help="The bedfile is OM blocks",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping

    # With --om the interval is taken from the range encoded in the accn
    # column; otherwise the bed line's own coordinates are used.
    intervals = []
    for feature in Bed(candidates):
        source = range_parse(feature.accn) if opts.om else feature
        intervals.append([source.seqid, source.start, source.end])

    gap_ranges = [(g.seqid, g.start, g.end) for g in Bed(gapsbed)]

    for region in range_merge(intervals):
        seqid = region[0]
        left_gap = range_closest(gap_ranges, region)
        right_gap = range_closest(gap_ranges, region, left=False)

        # A neighbouring gap only counts when it sits on the same sequence.
        if not (left_gap is not None and left_gap[0] == seqid):
            left_gap = None
        if not (right_gap is not None and right_gap[0] == seqid):
            right_gap = None

        # Without a flanking gap, fall back to position 1 on the left and to
        # the sequence size on the right.
        if left_gap is None:
            mmin = 1
        else:
            mmin = left_gap[1]
        if right_gap is None:
            mmax = sizes[seqid]
        else:
            mmax = right_gap[2]

        fields = (seqid, mmin - 1, mmax)
        print("\t".join(str(x) for x in fields))
def merge_ranges(beds):
    """
    Collapse the ranges encoded in the accn field of `beds` into one span.

    Returns a tuple (seqid, start, end, strand): start is the smallest start,
    end the largest end, and strand is "-" only when lines with strand "-"
    strictly outnumber the others. When the parsed ranges do not share
    exactly one seqid, an error is logged and None is returned.
    """
    accns = [bedline.accn for bedline in beds]
    parsed = [range_parse(accn) for accn in accns]

    seqids = {r.seqid for r in parsed}
    if len(seqids) != 1:
        logging.error("Multiple seqid found in pocket. Aborted.")
        return None

    (seqid,) = seqids
    lowest = min(r.start for r in parsed)
    highest = max(r.end for r in parsed)

    # Majority vote on orientation; ties resolve to "+".
    minus = len([bedline for bedline in beds if bedline.strand == "-"])
    plus = len(beds) - minus
    if minus > plus:
        strand = "-"
    else:
        strand = "+"

    return seqid, lowest, highest, strand
def merge_ranges(beds):
    """
    Merge the accn ranges of a group of bed lines.

    Each line's accn is parsed with range_parse. If the parsed ranges share a
    single seqid, the result is (seqid, min start, max end, strand), where
    strand is '-' when '-' lines are the strict majority and '+' otherwise.
    If they do not share a single seqid, an error is logged and None is
    returned.
    """
    accns = [x.accn for x in beds]
    ranges = []
    for accn in accns:
        ranges.append(range_parse(accn))

    seqids = set(r.seqid for r in ranges)
    if len(seqids) == 1:
        seqid = list(seqids)[0]
        start = min(r.start for r in ranges)
        end = max(r.end for r in ranges)

        # Count the '-' lines; everything else is treated as '+'.
        n_minus = 0
        for x in beds:
            if x.strand == '-':
                n_minus += 1
        n_plus = len(beds) - n_minus
        strand = '+' if n_minus <= n_plus else '-'
        return seqid, start, end, strand

    logging.error("Multiple seqid found in pocket. Aborted.")
    return
def closest(args):
    """
    %prog closest candidates.bed gaps.bed fastafile

    Identify the nearest gaps flanking suggested regions.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    p = OptionParser(closest.__doc__)
    p.add_option("--om", default=False, action="store_true",
                 help="The bedfile is OM blocks [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    candidates, gapsbed, fastafile = args
    sizes = Sizes(fastafile).mapping
    bed = Bed(candidates)

    # With --om the interval comes from the range encoded in the accn column;
    # otherwise the bed line's own coordinates are used.
    ranges = []
    for b in bed:
        r = range_parse(b.accn) if opts.om else b
        ranges.append([r.seqid, r.start, r.end])

    gapsbed = Bed(gapsbed)
    granges = [(x.seqid, x.start, x.end) for x in gapsbed]

    ranges = range_merge(ranges)
    for r in ranges:
        a = range_closest(granges, r)
        b = range_closest(granges, r, left=False)
        seqid = r[0]

        # A neighbouring gap only counts when it sits on the same sequence.
        if a is not None and a[0] != seqid:
            a = None
        if b is not None and b[0] != seqid:
            b = None

        # Without a flanking gap, fall back to position 1 on the left and to
        # the sequence size on the right.
        mmin = 1 if a is None else a[1]
        mmax = sizes[seqid] if b is None else b[2]

        # Function-call form: the Python 2 print statement used here before
        # is a SyntaxError on Python 3; output is identical on both.
        print("\t".join(str(x) for x in (seqid, mmin - 1, mmax)))
def fasta(args):
    """
    %prog fasta bedfile scf.fasta pseudomolecules.fasta

    Use OM bed to scaffold and create pseudomolecules. bedfile can be generated
    by running jcvi.assembly.opticalmap bed --blockonly
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is kept as-is; implementation notes live in comments instead.
    #
    # Side effects: writes `<bedfile minus extension>.agp`, then passes it to
    # build() together with the two FASTA arguments.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.agp import OO, build

    p = OptionParser(fasta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, scffasta, pmolfasta = args
    pf = bedfile.rsplit(".", 1)[0]
    bed = Bed(bedfile)
    selected = select_bed(bed)
    oo = OO()
    seen = set()
    sizes = Sizes(scffasta).mapping
    agpfile = pf + ".agp"

    # Register each (chromosome, scaffold) pairing once, in bed order; later
    # occurrences of the same pairing are only logged.
    for b in selected:
        scf = range_parse(b.accn).seqid
        chrom = b.seqid
        cs = (chrom, scf)
        if cs not in seen:
            oo.add(chrom, scf, sizes[scf], b.strand)
            seen.add(cs)
        else:
            logging.debug("Seen {0}, ignored.".format(cs))

    # Open the AGP file only once there is something to write, and let the
    # context manager close it even if write_AGP raises. It must be closed
    # before build() is given the file name.
    with open(agpfile, "w") as agp:
        oo.write_AGP(agp, gaptype="contig")

    build([agpfile, scffasta, pmolfasta])
def test_range_parse(input, expected):
    """Check that range_parse(input) compares equal to expected."""
    from jcvi.utils.range import range_parse

    parsed = range_parse(input)
    assert parsed == expected