def bed_store(bedfile, sorted=False):
    """Index the names found in a merged BED file by merged interval.

    The input is run through ``mergeBed(bedfile, s=True, nms=True,
    sorted=sorted)`` and the result is parsed with ``Bed``.  Every merged
    feature carries a comma-separated ``accn`` field; each name in it is
    associated with the feature's ``"seqid:start"`` label.

    Args:
        bedfile: BED file handed to ``mergeBed``.
        sorted: Forwarded unchanged to ``mergeBed``.  The name shadows the
            builtin inside this function but is part of the public
            signature, so it is kept.

    Returns:
        A pair ``(reads, reads_r)``: ``reads`` maps each name to its target
        label, ``reads_r`` is a ``defaultdict(list)`` mapping each target
        label to the names that fall on it.
    """
    merged_path = mergeBed(bedfile, s=True, nms=True, sorted=sorted)

    name_to_target = {}
    target_to_names = defaultdict(list)
    for feature in Bed(merged_path):
        label = "{0}:{1}".format(feature.seqid, feature.start)
        names = feature.accn.split(",")
        # A name seen in several merged features keeps the last label.
        name_to_target.update((name, label) for name in names)
        target_to_names[label].extend(names)

    return name_to_target, target_to_names
def bed_store(bedfile):
    """Index the names found in a merged BED file by merged interval.

    The input is run through ``mergeBed(bedfile, s=True, nms=True,
    sorted=True)`` and the result is parsed with ``Bed``.  Every merged
    feature carries a comma-separated ``accn`` field; each name in it is
    associated with the feature's ``"seqid:start"`` label.

    Args:
        bedfile: BED file handed to ``mergeBed``.

    Returns:
        A pair ``(reads, reads_r)``: ``reads`` maps each name to its target
        label, ``reads_r`` is a ``defaultdict(list)`` mapping each target
        label to the names that fall on it.
    """
    merged_path = mergeBed(bedfile, s=True, nms=True, sorted=True)

    forward = {}
    reverse = defaultdict(list)
    for record in Bed(merged_path):
        label = "{0}:{1}".format(record.seqid, record.start)
        members = record.accn.split(",")
        reverse[label].extend(members)
        # A name seen in several merged features keeps the last label.
        forward.update(dict.fromkeys(members, label))

    return forward, reverse
def insertionpairs(args):
    """
    %prog insertionpairs endpoints.bed

    Pair up the candidate endpoints. A candidate exision point would contain
    both left-end (LE) and right-end (RE) within a given distance.

    -----------|   |------------
        -------|   |--------
      ---------|   |----------
            (RE)   (LE)
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so its wording is left untouched.
    #
    # args: list of command-line tokens; exactly one positional argument
    # (the endpoints BED file) must remain after option parsing, otherwise
    # the help text is printed and the process exits.
    parser = OptionParser(insertionpairs.__doc__)
    parser.add_option(
        "--extend",
        default=10,
        type="int",
        help="Allow insertion sites to match up within distance",
    )
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bedfile = args[0]
    merged = Bed(mergeBed(bedfile, d=opts.extend, nms=True))
    out = must_open(opts.outfile, "w")

    def neg_reads(endpoint):
        # Negated so that an ascending sort puts the most reads first.
        return -endpoint.reads

    for b in merged:
        endpoints = [EndPoint(name) for name in b.accn.split(",")]

        def ranked(side):
            # Endpoints of one side, best supported first.
            chosen = [e for e in endpoints if e.leftright == side]
            chosen.sort(key=neg_reads)
            return chosen

        right_ends = ranked("RE")
        left_ends = ranked("LE")
        # A site is only reported when both kinds of endpoint are present.
        if not right_ends or not left_ends:
            continue

        best_re, best_le = right_ends[0], left_ends[0]
        pos_re, pos_le = best_re.position, best_le.position

        # The smaller of the two positions becomes the start (minus one).
        if pos_le < pos_re:
            low, high = pos_le, pos_re
        else:
            low, high = pos_re, pos_le
        b.start, b.end = low - 1, high

        b.accn = "{0}|{1}".format(best_re.label, best_le.label)
        b.score = pos_le - pos_re - 1
        print(b, file=out)
def insertionpairs(args):
    """
    %prog insertionpairs endpoints.bed

    Pair up the candidate endpoints. A candidate exision point would contain
    both left-end (LE) and right-end (RE) within a given distance.

    -----------|   |------------
        -------|   |--------
      ---------|   |----------
            (RE)   (LE)
    """
    # The docstring above doubles as the usage text handed to OptionParser,
    # so its wording is left untouched.
    #
    # args: list of command-line tokens; exactly one positional argument
    # (the endpoints BED file) must remain after option parsing, otherwise
    # the help text is printed and the process exits.
    p = OptionParser(insertionpairs.__doc__)
    p.add_option("--extend", default=10, type="int",
                 help="Allow insertion sites to match up within distance")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mergedbedfile = mergeBed(bedfile, d=opts.extend, nms=True)
    bed = Bed(mergedbedfile)
    # NOTE(review): must_open may hand back a shared stream such as stdout
    # (not visible from here), so the handle is deliberately not closed.
    fw = must_open(opts.outfile, "w")

    def support(endpoint):
        # Negated so that an ascending sort puts the most reads first.
        return -endpoint.reads

    for b in bed:
        ends = [EndPoint(x) for x in b.accn.split(",")]
        REs = sorted((x for x in ends if x.leftright == "RE"), key=support)
        LEs = sorted((x for x in ends if x.leftright == "LE"), key=support)
        # A site is only reported when both kinds of endpoint are present.
        if not (REs and LEs):
            continue

        mRE, mLE = REs[0], LEs[0]
        pRE, pLE = mRE.position, mLE.position
        # The smaller of the two positions becomes the start (minus one).
        if pLE < pRE:
            b.start, b.end = pLE - 1, pRE
        else:
            b.start, b.end = pRE - 1, pLE

        b.accn = "{0}|{1}".format(mRE.label, mLE.label)
        b.score = pLE - pRE - 1
        # Was ``print >> fw, b``: a Python 2 only statement that raises
        # TypeError once print is a function.
        print(b, file=fw)
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly two positional arguments
    # (gaps BED file, FASTA of the assembly) must remain after option
    # parsing, otherwise the help text is printed and the process exits.
    #
    # Side effects: writes "<merged gaps prefix>.ext.bed" holding one left
    # ("L") and one right ("R") flanking interval per merged gap, then calls
    # fastaFromBed on it.
    p = OptionParser(fill.__doc__)
    p.add_option(
        "--extend",
        default=2000,
        type="int",
        help="Extend seq flanking the gaps",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    # Merging with this distance prevents two replacement ranges from
    # intersecting.
    gapdist = 2 * Ext + 1
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    # ``with`` guarantees the handle is closed (and flushed) even when the
    # loop raises, e.g. on a seqid missing from ``sizes``.
    with open(extbed, "w") as fw:
        for b in bed:
            gapname = b.accn

            # Left flank: up to Ext bases before the gap, clamped at 0.
            start, end = max(0, b.start - Ext - 1), b.start - 1
            print(b.seqid, start, end, gapname + "L", sep="\t", file=fw)

            # Right flank: up to Ext bases after the gap, clamped at the
            # sequence length.
            start, end = b.end, min(sizes[b.seqid], b.end + Ext)
            print(b.seqid, start, end, gapname + "R", sep="\t", file=fw)

    fastaFromBed(extbed, badfasta, name=True)
def fill(args):
    """
    %prog fill gaps.bed bad.fasta

    Perform gap filling of one assembly (bad) using sequences from another.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly two positional arguments
    # (gaps BED file, FASTA of the assembly) must remain after option
    # parsing, otherwise the help text is printed and the process exits.
    #
    # Side effects: writes "<merged gaps prefix>.ext.bed" holding one left
    # ("L") and one right ("R") flanking interval per merged gap, then calls
    # fastaFromBed on it.
    p = OptionParser(fill.__doc__)
    p.add_option("--extend", default=2000, type="int",
                 help="Extend seq flanking the gaps [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gapsbed, badfasta = args
    Ext = opts.extend

    # Merging with this distance prevents two replacement ranges from
    # intersecting.
    gapdist = 2 * Ext + 1
    gapsbed = mergeBed(gapsbed, d=gapdist, nms=True)

    bed = Bed(gapsbed)
    sizes = Sizes(badfasta).mapping
    pf = gapsbed.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    def flank_row(seqid, start, end, label):
        # One tab-separated BED row for a flanking interval.
        return "\t".join(str(x) for x in (seqid, start, end, label))

    # ``with`` guarantees the handle is closed even when the loop raises,
    # e.g. on a seqid missing from ``sizes``.
    with open(extbed, "w") as fw:
        for b in bed:
            gapname = b.accn

            # The ``print >> fw, ...`` statements were Python 2 only and
            # raise TypeError once print is a function.

            # Left flank: up to Ext bases before the gap, clamped at 0.
            start, end = max(0, b.start - Ext - 1), b.start - 1
            print(flank_row(b.seqid, start, end, gapname + "L"), file=fw)

            # Right flank: up to Ext bases after the gap, clamped at the
            # sequence length.
            start, end = b.end, min(sizes[b.seqid], b.end + Ext)
            print(flank_row(b.seqid, start, end, gapname + "R"), file=fw)

    fastaFromBed(extbed, badfasta, name=True)
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes. For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly four positional arguments
    # must remain after option parsing, otherwise the help text is printed
    # and the process exits.
    #
    # Files written in the working directory:
    #   extra.bed       - flank-extended range of every multi-gene block
    #   extend.bed      - left neighbour / gene / right neighbour plus a
    #                     distance summary for every singleton
    #   extend.bed.ids  - accession of every singleton
    #   paste.bed       - neighbour-to-neighbour span of singletons whose
    #                     span satisfies 0 < span <= --maxsize
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # ``newbed`` is accepted for the command-line contract but not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    # Only genes that have a coverage entry take part in the grouping.
    bed = [x for x in obed if x.accn in coverage]

    def is_covered(feature):
        return coverage[feature.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0

    # All four outputs share one ``with`` so each is closed on every path.
    # Previously the paste.bed and .ids handles were never closed.
    with open(extrabed, "w") as fw, \
            open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            # Consecutive genes on one sequence are split into runs that
            # meet the cutoff and runs that do not; only the latter matter.
            for good, beds in groupby(list(chrbed), key=is_covered):
                if good:
                    continue

                beds = list(beds)
                blocksize = len({gene_name(x.accn) for x in beds})

                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): neighbours are taken by raw index, so
                    # gi == 0 wraps to the last feature and the last gene
                    # raises IndexError - confirm whether that can occur.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr, rightr, cur = leftb.range, rightb.range, gb.range

                    distance_to_left, _ = range_distance(leftr, cur)
                    distance_to_right, _ = range_distance(cur, rightr)
                    span, _ = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        print(seqid, leftb.start, rightb.end, gb.accn,
                              sep="\t", file=fwp)

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print(
                        "L:{0} R:{1} [{2}]".format(
                            distance_to_left, distance_to_right, label
                        ),
                        file=fwe,
                    )
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print(seqid, rmin - 1, rmax, name, sep="\t", file=fw)

    # extra.bed is closed by now, so the merge sees complete data.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes. For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly four positional arguments
    # must remain after option parsing, otherwise the help text is printed
    # and the process exits.
    #
    # Files written in the working directory:
    #   extra.bed       - flank-extended range of every multi-gene block
    #   extend.bed      - left neighbour / gene / right neighbour plus a
    #                     distance summary for every singleton
    #   extend.bed.ids  - accession of every singleton
    #   paste.bed       - neighbour-to-neighbour span of singletons whose
    #                     span satisfies 0 < span <= --maxsize
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # ``newbed`` is accepted for the command-line contract but not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    # Only genes that have a coverage entry take part in the grouping.
    bed = [x for x in obed if x.accn in coverage]

    def is_covered(feature):
        return coverage[feature.accn] >= cutoff

    def tabbed(*fields):
        # Tab-separated row built from arbitrary values.
        return "\t".join(str(x) for x in fields)

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0

    # All four outputs share one ``with`` so each is closed on every path.
    # Previously the paste.bed and .ids handles were never closed.
    # All ``print >> fh, ...`` statements (Python 2 only, TypeError once
    # print is a function) are replaced by ``print(..., file=fh)``.
    with open(extrabed, "w") as fw, \
            open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            # Consecutive genes on one sequence are split into runs that
            # meet the cutoff and runs that do not; only the latter matter.
            for good, beds in groupby(list(chrbed), key=is_covered):
                if good:
                    continue

                beds = list(beds)
                blocksize = len({gene_name(x.accn) for x in beds})

                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): neighbours are taken by raw index, so
                    # gi == 0 wraps to the last feature and the last gene
                    # raises IndexError - confirm whether that can occur.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr, rightr, cur = leftb.range, rightb.range, gb.range

                    distance_to_left, _ = range_distance(leftr, cur)
                    distance_to_right, _ = range_distance(cur, rightr)
                    span, _ = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        print(tabbed(seqid, leftb.start, rightb.end, gb.accn),
                              file=fwp)

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print("L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label), file=fwe)
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print(tabbed(seqid, rmin - 1, rmax, name), file=fw)

    # extra.bed is closed by now, so the merge sees complete data.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly three positional arguments
    # must remain after option parsing, otherwise the help text is printed
    # and the process exits.  The sample prefix of each file is the part of
    # its basename before the first "-".
    #
    # Outputs: "<P1>-<P2>-<F1>.bed" (combined, rebuilt only when stale),
    # "novel.bed" (merged sites not seen in all three samples), a PDF plot,
    # and, with --diversity=breakpoint, "Breakpoint-offset-histogram.csv".
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            for b in Bed(filename):
                # Prefix each accession with its sample so the origin
                # survives the merge below.
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    # ``with`` closes novel.bed on every path; it used to stay open on the
    # "variant" path.
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_counts = Counter(x.split("-")[0] for x in accns)
            # Sites not present in all three samples go to novel.bed.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            # Offsets are only defined against a single P1 reference.
            if len(P1_accns) != 1:
                continue

            ref = neworder[P1_accns[0]][-1]
            F1_beds = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_beds)
            bp_diff.extend(x.end - ref.end for x in F1_beds)

    print("A total of {0} sites show consistent deletions across samples.".
          format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".
              format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for x, h in zip(left, height):
            print("{0:>9} variants: {1}".format(x, h), file=sys.stderr)
            plt.text(x, h + 5, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        # Fold negative and positive offsets together for plotting.
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # Label only the first 21 bars.  ``left`` and ``height`` are tuples
        # and are sliced before zipping, because a Python 3 zip object
        # cannot be subscripted (``zip(...)[:21]`` raised TypeError).
        for x, h in zip(left[:21], height[:21]):
            plt.text(x, h + 50, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")

        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum(v for i, v in bp_diff.items() if -20 <= i <= 20)
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)
def variation(args):
    """
    %prog variation P1.bed P2.bed F1.bed

    Associate IES in parents and progeny.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    #
    # args: list of command-line tokens; exactly three positional arguments
    # must remain after option parsing, otherwise the help text is printed
    # and the process exits.  The sample prefix of each file is the part of
    # its basename before the first "-".
    #
    # Outputs: "<P1>-<P2>-<F1>.bed" (combined, rebuilt only when stale),
    # "novel.bed" (merged sites not seen in all three samples), a PDF plot,
    # and, with --diversity=breakpoint, "Breakpoint-offset-histogram.csv".
    #
    # Every ``print >> fh, ...`` statement (Python 2 only, TypeError once
    # print is a function) has been replaced by ``print(..., file=fh)``.
    p = OptionParser(variation.__doc__)
    p.add_option("--diversity", choices=("breakpoint", "variant"),
                 default="variant", help="Plot diversity")
    opts, args, iopts = p.set_image_options(args, figsize="6x6")

    if len(args) != 3:
        sys.exit(not p.print_help())

    pfs = [op.basename(x).split('-')[0] for x in args]
    P1, P2, F1 = pfs
    newbedfile = "-".join(pfs) + ".bed"
    if need_update(args, newbedfile):
        newbed = Bed()
        for pf, filename in zip(pfs, args):
            for b in Bed(filename):
                # Prefix each accession with its sample so the origin
                # survives the merge below.
                b.accn = "-".join((pf, b.accn))
                b.score = None
                newbed.append(b)
        newbed.print_to_file(newbedfile, sorted=True)

    neworder = Bed(newbedfile).order
    mergedbedfile = mergeBed(newbedfile, nms=True)
    bed = Bed(mergedbedfile)
    valid = 0
    total_counts = Counter()
    F1_counts = []
    bp_diff = []
    novelbedfile = "novel.bed"
    # ``with`` closes novel.bed on every path; it used to stay open on the
    # "variant" path.
    with open(novelbedfile, "w") as fw:
        for b in bed:
            accns = b.accn.split(',')
            pfs_counts = Counter(x.split("-")[0] for x in accns)
            # Sites not present in all three samples go to novel.bed.
            if len(pfs_counts) != 3:
                print(b, file=fw)
                continue

            valid += 1
            total_counts += pfs_counts
            F1_counts.append(pfs_counts[F1])

            # Collect breakpoint positions between P1 and F1
            P1_accns = [x for x in accns if x.split("-")[0] == P1]
            F1_accns = [x for x in accns if x.split("-")[0] == F1]
            # Offsets are only defined against a single P1 reference.
            if len(P1_accns) != 1:
                continue

            ref = neworder[P1_accns[0]][-1]
            F1_beds = [neworder[x][-1] for x in F1_accns]
            bp_diff.extend(x.start - ref.start for x in F1_beds)
            bp_diff.extend(x.end - ref.end for x in F1_beds)

    print("A total of {0} sites show consistent deletions across samples.".
          format(percentage(valid, len(bed))), file=sys.stderr)
    for pf, count in total_counts.items():
        print("{0:>9}: {1:.2f} deletions/site".
              format(pf, count * 1. / valid), file=sys.stderr)

    F1_counts = Counter(F1_counts)

    # Plot the IES variant number diversity
    from jcvi.graphics.base import plt, savefig, set_ticklabels_helvetica

    plt.figure(1, (iopts.w, iopts.h))
    if opts.diversity == "variant":
        left, height = zip(*sorted(F1_counts.items()))
        for x, h in zip(left, height):
            print("{0:>9} variants: {1}".format(x, h), file=sys.stderr)
            plt.text(x, h + 5, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Identified number of IES per site")
        plt.ylabel("Counts")
        plt.title("IES variation in progeny pool")
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".counts.pdf")

    # Plot the IES breakpoint position diversity
    else:
        bp_diff = Counter(bp_diff)
        # Fold negative and positive offsets together for plotting.
        bp_diff_abs = Counter()
        for k, v in bp_diff.items():
            bp_diff_abs[abs(k)] += v
        plt.figure(1, (iopts.w, iopts.h))
        left, height = zip(*sorted(bp_diff_abs.items()))
        # Label only the first 21 bars.  ``left`` and ``height`` are tuples
        # and are sliced before zipping, because a Python 3 zip object
        # cannot be subscripted (``zip(...)[:21]`` raised TypeError).
        for x, h in zip(left[:21], height[:21]):
            plt.text(x, h + 50, str(h), color="darkslategray", size=8,
                     ha="center", va="bottom", rotation=90)

        plt.bar(left, height, align="center")
        plt.xlabel("Progeny breakpoint relative to SB210")
        plt.ylabel("Counts")
        plt.xlim(-.5, 20.5)
        ax = plt.gca()
        set_ticklabels_helvetica(ax)
        savefig(F1 + ".breaks.pdf")

        # Serialize the data to a file
        with open("Breakpoint-offset-histogram.csv", "w") as fw:
            for k, v in sorted(bp_diff.items()):
                print("{0},{1}".format(k, v), file=fw)

        total = sum(height)
        zeros = bp_diff[0]
        within_20 = sum(v for i, v in bp_diff.items() if -20 <= i <= 20)
        print("No deviation: {0}".format(percentage(zeros, total)),
              file=sys.stderr)
        print(" Within 20bp: {0}".format(percentage(within_20, total)),
              file=sys.stderr)