def uniq(args): """ %prog uniq bedfile > newbedfile Remove overlapping features with higher scores. """ p = OptionParser(uniq.__doc__) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bedfile, = args uniqbedfile = bedfile.split(".")[0] + ".uniq.bed" bed = Bed(bedfile) if not need_update(bedfile, uniqbedfile): return uniqbedfile ranges = [Range(x.seqid, x.start, x.end, float(x.score), i) \ for i, x in enumerate(bed)] selected, score = range_chain(ranges) selected = [bed[x.id] for x in selected] newbed = Bed() newbed.extend(selected) newbed.print_to_file(uniqbedfile, sorted=True) logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed))) return uniqbedfile
def uniq(args): """ %prog uniq bedfile Remove overlapping features with higher scores. """ p = OptionParser(uniq.__doc__) p.add_option("--slen", default=False, action="store_true", help="Use sequence length as score [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) bedfile, = args uniqbedfile = bedfile.split(".")[0] + ".uniq.bed" bed = Bed(bedfile) if opts.slen: ranges = [Range(x.seqid, x.start, x.end, x.end - x.start, i) \ for i, x in enumerate(bed)] else: ranges = [Range(x.seqid, x.start, x.end, float(x.score), i) \ for i, x in enumerate(bed)] selected, score = range_chain(ranges) selected = [x.id for x in selected] selected_ids = set(selected) selected = [bed[x] for x in selected] notselected = [x for i, x in enumerate(bed) if i not in selected_ids] newbed = Bed() newbed.extend(selected) newbed.print_to_file(uniqbedfile, sorted=True) if notselected: leftoverfile = bedfile.split(".")[0] + ".leftover.bed" leftoverbed = Bed() leftoverbed.extend(notselected) leftoverbed.print_to_file(leftoverfile, sorted=True) logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed))) return uniqbedfile
def select_bed(bed):
    """
    Return a non-overlapping set of ranges, choosing high-scoring blocks over
    low-scoring alignments when there are conflicts.
    """
    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i)
              for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [bed[x.id] for x in selected]

    return selected

def get_piles(allgenes):
    """
    Before running uniq, we need to compute all the piles. The piles are sets
    of redundant features we want to get rid of. Input is a list of GffLine
    features; output is a generator yielding lists of features, one list per
    distinct "pile".
    """
    from jcvi.utils.range import Range, range_piles

    ranges = [Range(a.seqid, a.start, a.end, 0, i)
              for i, a in enumerate(allgenes)]
    for pile in range_piles(ranges):
        yield [allgenes[x] for x in pile]

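# A small sketch of the pile computation, assuming range_piles groups
# overlapping ranges into components separated by zero-coverage gaps and
# yields the id fields of each component. Coordinates are made up.
def _demo_range_piles():
    from jcvi.utils.range import Range, range_piles

    ranges = [
        Range("chr1", 100, 200, 0, 0),
        Range("chr1", 180, 250, 0, 1),  # overlaps the first => same pile
        Range("chr1", 400, 500, 0, 2),  # isolated => a pile by itself
    ]
    piles = [sorted(pile) for pile in range_piles(ranges)]
    assert piles == [[0, 1], [2]]
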
def get_range(q, s, t, i, order, block_pairs, clip=10):
    """
    Compute the reference-side Range for synteny block i, scored by the
    number of best pairs, with the ends clipped to reduce boundary overlaps.
    """
    pairs = get_best_pair(q, s, t)
    score = len(pairs)
    block_pairs[i].update(pairs)

    q = [order[x][0] for x in q]
    q.sort()
    qmin = q[0]
    qmax = q[-1]
    if qmax - qmin >= 2 * clip:
        # Integer trim (// keeps coordinates integral under Python 3)
        qmin += clip // 2
        qmax -= clip // 2

    return Range("0", qmin, qmax, score=score, id=i)

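# Worked example of the clipping above: with the default clip=10, a block
# spanning reference positions 100..180 (span 80 >= 2 * clip) is trimmed by
# clip // 2 = 5 on each side, to 105..175. Presumably this keeps blocks that
# merely brush against each other at the ends from conflicting during
# range_chain; blocks narrower than 2 * clip are left untouched.
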
def uniq(args): """ %prog uniq gffile cdsfasta Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping 'piles' are processed, one by one. Here, we use a different algorithm, that retains the best non-overlapping subset witin each pile, rather than single best model. Scoring function is also different, rather than based on score or span, we optimize for the subset that show the best combined score. Score is defined by: score = (1 - AED) * length """ p = OptionParser(uniq.__doc__) p.set_outfile() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) gffile, cdsfasta = args gff = Gff(gffile) sizes = Sizes(cdsfasta).mapping gene_register = {} for g in gff: if g.type != "mRNA": continue aed = float(g.attributes["_AED"][0]) gene_register[g.parent] = (1 - aed) * sizes[g.accn] allgenes = import_feats(gffile) g = get_piles(allgenes) bestids = set() for group in g: ranges = [Range(x.seqid, x.start, x.end, \ gene_register[x.accn], x.accn) for x in group] selected_chain, score = range_chain(ranges) bestids |= set(x.id for x in selected_chain) removed = set(x.accn for x in allgenes) - bestids fw = open("removed.ids", "w") print >> fw, "\n".join(sorted(removed)) fw.close() populate_children(opts.outfile, bestids, gffile, "gene")
def BlastOrCoordsLine(filename, filter="ref", dialect="blast", clip=0):
    """
    Generator yielding Range objects from a BLAST tabular or nucmer coords
    file, keyed on either the reference or the query sequence.
    """
    allowed_filters = ("ref", "query")
    REF, QUERY = range(len(allowed_filters))
    allowed_dialects = ("blast", "coords")
    BLAST, COORDS = range(len(allowed_dialects))

    assert filter in allowed_filters
    filter = allowed_filters.index(filter)
    assert dialect in allowed_dialects
    dialect = allowed_dialects.index(dialect)

    fp = open(filename)
    for i, row in enumerate(fp):
        if row[0] == '#':
            continue
        if dialect == BLAST:
            b = BlastLine(row)
            if filter == QUERY:
                query, start, end = b.query, b.qstart, b.qstop
            else:
                query, start, end = b.subject, b.sstart, b.sstop
        else:
            try:
                b = CoordsLine(row)
            except AssertionError:
                continue

            if filter == QUERY:
                query, start, end = b.query, b.start2, b.end2
            else:
                query, start, end = b.ref, b.start1, b.end1

        if start > end:
            start, end = end, start

        if clip:
            # clip cannot be more than 5% of the range
            r = end - start + 1
            cc = min(.05 * r, clip)
            start = start + cc
            end = end - cc

        yield Range(query, start, end, b.score, i)

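# A minimal usage sketch for the generator above ("alignments.blast" is a
# placeholder file name): collect reference-side Ranges from a BLAST tabular
# file, then chain them to keep the best non-overlapping alignments.
def _demo_blast_ranges():
    from jcvi.utils.range import range_chain

    ranges = list(BlastOrCoordsLine("alignments.blast",
                                    filter="ref", dialect="blast"))
    selected, score = range_chain(ranges)
    return selected
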
def cluster(args):
    """
    %prog cluster blastfile anchorfile --qbed qbedfile --sbed sbedfile

    Cluster the segments to form PADs. This is the method described in the
    Tang et al. (2010) PNAS paper. The anchorfile defines a list of synteny
    blocks, based on which the genome on one or both axes can be chopped up
    into pieces and clustered.
    """
    from jcvi.utils.range import Range

    p = OptionParser(cluster.__doc__)
    p.set_beds()
    p.add_option(
        "--minsize", default=10, type="int",
        help="Only segment using blocks >= size"
    )
    p.add_option(
        "--path", default="~/scratch/bin",
        help="Path to the CLUSTER 3.0 binary"
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, anchorfile = args
    qbed, sbed, qorder, sorder, is_self = check_beds(blastfile, p, opts)

    minsize = opts.minsize
    ac = AnchorFile(anchorfile)
    qranges, sranges = [], []
    qextra = [x[1:] for x in qbed.get_breaks()]
    sextra = [x[1:] for x in sbed.get_breaks()]

    id = 0
    for block in ac.iter_blocks(minsize=minsize):
        q, s = list(zip(*block))[:2]
        q = [qorder[x][0] for x in q]
        s = [sorder[x][0] for x in s]
        minq, maxq = min(q), max(q)
        mins, maxs = min(s), max(s)
        id += 1
        qr = Range("0", minq, maxq, maxq - minq, id)
        sr = Range("0", mins, maxs, maxs - mins, id)
        qranges.append(qr)
        sranges.append(sr)

    qpads = list(get_segments(qranges, qextra))
    spads = list(get_segments(sranges, sextra))

    suffix = ".pad.bed"
    qpf = opts.qbed.split(".")[0]
    spf = opts.sbed.split(".")[0]
    qpadfile = qpf + suffix
    spadfile = spf + suffix
    qnpads, qpadnames = write_PAD_bed(qpadfile, qpf, qpads, qbed)
    snpads, spadnames = write_PAD_bed(spadfile, spf, spads, sbed)

    qpadbed, spadbed = Bed(qpadfile), Bed(spadfile)

    logmp = make_arrays(blastfile, qpadbed, spadbed, qpadnames, spadnames)
    m, n = logmp.shape

    matrixfile = ".".join((qpf, spf, "logmp.txt"))
    fw = open(matrixfile, "w")
    header = ["o"] + spadnames
    print("\t".join(header), file=fw)
    for i in range(m):
        row = [qpadnames[i]] + ["{0:.1f}".format(x) for x in logmp[i, :]]
        print("\t".join(row), file=fw)

    fw.close()

    # Run CLUSTER 3.0 (Pearson correlation, average linkage)
    cmd = op.join(opts.path, "cluster")
    cmd += " -g 2 -e 2 -m a -f {0}".format(matrixfile)
    pf = matrixfile.rsplit(".", 1)[0]
    cdtfile = pf + ".cdt"
    if need_update(matrixfile, cdtfile):
        sh(cmd)

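# Hypothetical invocation of the PAD clustering above (file names are
# placeholders, and the module path assumes this action is registered in
# jcvi.compara.pad); --path must point at a directory containing the
# CLUSTER 3.0 `cluster` executable:
#
#   python -m jcvi.compara.pad cluster grape.peach.blast grape.peach.anchors \
#       --qbed=grape.bed --sbed=peach.bed --minsize=10 --path=~/bin
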
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to the MIC genome.
    """
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                      (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the deletions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in
                  (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that have sequencing gaps on their flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find the best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                  (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()

def mcscan(args):
    """
    %prog mcscan bedfile anchorfile

    Stack synteny blocks on a reference bed, MCSCAN style. The first column
    in the output is the reference order, given in the bedfile. Each
    subsequent column is a separate 'track'.
    """
    from jcvi.utils.range import Range, range_chain

    p = OptionParser(mcscan.__doc__)
    p.add_option("--iter", default=100, type="int",
                 help="Max number of chains to output [default: %default]")
    p.add_option("--ascii", default=False, action="store_true",
                 help="Output symbols rather than gene names [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, anchorfile = args
    ascii = opts.ascii
    bed = Bed(bedfile)
    order = bed.order

    ac = AnchorFile(anchorfile)
    ranges = []
    block_pairs = {}
    for i, (q, s) in enumerate(ac.iter_blocks()):
        if q[0] not in order:
            q, s = s, q

        pairs = dict(zip(q, s))
        block_pairs[i] = pairs

        # order maps accn => (position, object); chain on the positions
        q = [order[x][0] for x in q]
        q.sort()
        ranges.append(Range("0", q[0], q[-1], score=len(q), id=i))

    tracks = []
    print("Chain started: {0} blocks".format(len(ranges)), file=sys.stderr)
    iteration = 0
    while ranges:
        if iteration >= opts.iter:
            break

        selected, score = range_chain(ranges)
        tracks.append(selected)
        selected = set(x.id for x in selected)
        ranges = [x for x in ranges if x.id not in selected]
        msg = "Chain {0}: score={1}".format(iteration, score)
        if ranges:
            msg += " {0} blocks remained..".format(len(ranges))
        else:
            msg += " done!"

        print(msg, file=sys.stderr)
        iteration += 1

    for b in bed:
        id = b.accn
        atoms = []
        for track in tracks:
            track_ids = [x.id for x in track]
            for tid in track_ids:
                pairs = block_pairs[tid]
                anchor = pairs.get(id, ".")
                if anchor != ".":
                    break
            if ascii and anchor != ".":
                anchor = "x"
            atoms.append(anchor)

        sep = "" if ascii else "\t"
        print("\t".join((id, sep.join(atoms))))

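# A compact sketch of the multi-track chaining loop above (toy blocks, made
# up): each pass extracts the best-scoring set of mutually non-overlapping
# blocks as one track, removes them, and re-chains the remainder.
def _demo_multitrack_chain():
    from jcvi.utils.range import Range, range_chain

    ranges = [
        Range("0", 0, 10, score=11, id=0),
        Range("0", 5, 20, score=16, id=1),   # conflicts with both others
        Range("0", 12, 30, score=19, id=2),
    ]
    tracks = []
    while ranges:
        selected, score = range_chain(ranges)
        tracks.append(sorted(x.id for x in selected))
        chosen = set(x.id for x in selected)
        ranges = [x for x in ranges if x.id not in chosen]
    # Blocks 0 and 2 (combined score 30) form track 1; block 1 forms track 2
    assert tracks == [[0, 2], [1]]
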
import pytest

from jcvi.utils.range import Range, range_closest


@pytest.mark.parametrize(
    "input,expected",
    [("chr1:1000-1", Range(seqid="chr1", start=1, end=1000, score=0, id=0))],
)
def test_range_parse(input, expected):
    from jcvi.utils.range import range_parse

    assert range_parse(input) == expected


@pytest.mark.parametrize(
    "a,b,expected",
    [((30, 45), (55, 65), None), ((48, 65), (45, 55), [48, 55])],
)
def test_range_intersect(a, b, expected):
    from jcvi.utils.range import range_intersect

    assert range_intersect(a, b) == expected


@pytest.mark.parametrize(
    "a,b,ratio,expected",
    [
        (("1", 30, 45), ("1", 41, 55), False, 5),
        (("1", 21, 45), ("1", 41, 75), True, 0.2),
        (("1", 30, 45), ("1", 15, 55), False, 16),
        (("1", 30, 45), ("1", 15, 55), True, 1.0),
        (("1", 30, 45), ("1", 57, 68), False, 0),
    ],
)
def test_range_overlap(a, b, ratio, expected):
    from jcvi.utils.range import range_overlap

    assert range_overlap(a, b, ratio) == expected