def activate(self, tourfile=None, minsize=10000, backuptour=True):
    """Prepare the active contig set, orientations and the starting tour.

    Two modes are supported:

    - "hotstart": when ``tourfile`` exists, the active contig list and the
      orientations are recovered from the last tour recorded in the file.
    - "de novo": otherwise, contigs are filtered by link density (low-density
      outliers among the smaller contigs are dropped iteratively) and then by
      minimum size; orientations are derived via ``flip_all`` from the
      pairwise strandedness matrix.

    Returns the starting tour as an ``array.array`` of contig indices; also
    sets ``self.active``, ``self.tour`` and (in hotstart mode) ``self.signs``.
    """
    if tourfile and not op.exists(tourfile):
        logging.debug("Tourfile `{}` not found".format(tourfile))
        tourfile = None

    if tourfile:
        logging.debug("Importing tourfile `{}`".format(tourfile))
        names, orientations = iter_last_tour(tourfile, self)
        self.active = set(names)
        index_of = self.tig_to_idx
        tour = [index_of[name] for name in names]
        # Store per-contig signs ordered by contig index
        ordered = sorted([(idx, FF[o]) for (idx, o) in zip(tour, orientations)])
        _, signs = zip(*ordered)
        self.signs = np.array(signs, dtype=int)
        if backuptour:
            backup(tourfile)
        tour = array.array('i', tour)
    else:
        self.report_active()
        # Repeatedly shed link-density outliers until the partition is stable
        while True:
            logdensities = self.calculate_densities()
            lb, ub = outlier_cutoff(logdensities.values())
            logging.debug("Log10(link_densities) ~ [{}, {}]".format(lb, ub))
            outliers = set(
                tig for tig, density in logdensities.items()
                if (density < lb and self.tig_to_size[tig] < minsize * 10)
            )
            if not outliers:
                break
            self.active -= outliers
            self.report_active()

        logging.debug("Remove contigs with size < {}".format(minsize))
        self.active = set(tig for tig in self.active
                          if self.tig_to_size[tig] >= minsize)
        tour = range(self.N)  # Use starting (random) order otherwise
        tour = array.array('i', tour)

        # Determine orientations
        self.flip_all(tour)

    self.report_active()
    self.tour = tour

    return tour
def activate(self, tourfile=None, minsize=10000, backuptour=True):
    """
    Select contigs in the current partition. This is the setup phase of the
    algorithm, and supports two modes:

    - "de novo": This is useful at the start of a new run where no tours
      available. We select the strong contigs that have significant number
      of links to other contigs in the partition. We build a histogram of
      link density (# links per bp) and remove the contigs that appear as
      outliers. The orientations are derived from the matrix decomposition
      of the pairwise strandedness matrix O.

    - "hotstart": This is useful when there was a past run, with a given
      tourfile. In this case, the active contig list and orientations are
      derived from the last tour in the file.
    """
    # A missing tourfile silently demotes a hotstart to a de novo run
    if tourfile and (not op.exists(tourfile)):
        logging.debug("Tourfile `{}` not found".format(tourfile))
        tourfile = None

    if tourfile:
        # --- hotstart: recover tour and orientations from the last tour ---
        logging.debug("Importing tourfile `{}`".format(tourfile))
        tour, tour_o = iter_last_tour(tourfile, self)
        self.active = set(tour)
        tig_to_idx = self.tig_to_idx
        # Translate contig names to integer indices
        tour = [tig_to_idx[x] for x in tour]
        # Sort so that signs line up with contig index order
        signs = sorted([(x, FF[o]) for (x, o) in zip(tour, tour_o)])
        _, signs = zip(*signs)
        self.signs = np.array(signs, dtype=int)
        if backuptour:
            backup(tourfile)
        tour = array.array('i', tour)
    else:
        # --- de novo: filter contigs by link density, then by size ---
        self.report_active()
        while True:
            logdensities = self.calculate_densities()
            lb, ub = outlier_cutoff(logdensities.values())
            logging.debug("Log10(link_densities) ~ [{}, {}]"
                          .format(lb, ub))
            # Only small contigs (< 10x minsize) are eligible for removal
            remove = set(x for x, d in logdensities.items()
                         if (d < lb and self.tig_to_size[x] < minsize * 10))
            if remove:
                self.active -= remove
                self.report_active()
            else:
                break
        logging.debug("Remove contigs with size < {}".format(minsize))
        self.active = set(x for x in self.active
                          if self.tig_to_size[x] >= minsize)
        tour = range(self.N)  # Use starting (random) order otherwise
        tour = array.array('i', tour)

        # Determine orientations
        self.flip_all(tour)

    self.report_active()
    self.tour = tour

    return tour
def prune_tour(self, tour, cpus):
    """Iteratively drop contigs whose removal is a score-delta outlier.

    Each contig is tentatively deleted from the tour and the change in
    score is evaluated in parallel; contigs whose log10(delta) falls below
    the lower outlier cutoff are removed. Repeats until no outliers remain.

    Args:
        tour: an ``array.array`` of ints (contig indices); tour here must
            be an array of ints.
        cpus: number of worker processes for the parallel evaluation.

    Returns:
        The pruned tour as an ``array.array`` of ints. Also updates
        ``self.active``, ``self.tour`` and re-derives orientations.
    """
    while True:
        tour_score, = self.evaluate_tour_M(tour)
        logging.debug("Starting score: {}".format(tour_score))
        active_sizes = self.active_sizes
        M = self.M
        args = []
        for i, t in enumerate(tour):
            stour = tour[:i] + tour[i + 1:]
            args.append((t, stour, tour_score, active_sizes, M))

        # Parallel run. The context manager ensures worker processes are
        # reclaimed each iteration (previously one Pool leaked per loop).
        with Pool(processes=cpus) as p:
            results = list(p.imap(prune_tour_worker, args))
        assert len(tour) == len(results), \
            "Array size mismatch, tour({}) != results({})"\
            .format(len(tour), len(results))

        # Identify outliers
        active_contigs = self.active_contigs
        _, log10deltas = zip(*results)
        lb, ub = outlier_cutoff(log10deltas)
        logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))

        remove = set(active_contigs[x] for (x, d) in results if d < lb)
        self.active -= remove
        self.report_active()

        # Rebuild the tour without the removed contigs, re-indexed
        tig_to_idx = self.tig_to_idx
        tour = [active_contigs[x] for x in tour]
        tour = array.array('i', [tig_to_idx[x] for x in tour
                                 if x not in remove])
        if not remove:
            break

    self.tour = tour
    self.flip_all(tour)

    return tour
def prune_tour(self, tour, cpus):
    """
    Test deleting each contig and check the delta_score; tour here must
    be an array of ints.
    """
    while True:
        tour_score, = self.evaluate_tour_M(tour)
        logging.debug("Starting score: {}".format(tour_score))
        sizes = self.active_sizes
        matrix = self.M
        # One task per contig: score the tour with that contig left out
        tasks = [(t, tour[:i] + tour[i + 1:], tour_score, sizes, matrix)
                 for i, t in enumerate(tour)]

        # Parallel run
        pool = Pool(processes=cpus)
        results = list(pool.imap(prune_tour_worker, tasks))
        assert len(tour) == len(results), \
            "Array size mismatch, tour({}) != results({})"\
            .format(len(tour), len(results))

        # Identify outliers
        contigs = self.active_contigs
        idx, log10deltas = zip(*results)
        lb, ub = outlier_cutoff(log10deltas)
        logging.debug("Log10(delta_score) ~ [{}, {}]".format(lb, ub))

        remove = set(contigs[x] for (x, d) in results if d < lb)
        self.active -= remove
        self.report_active()

        # Rebuild the tour without the removed contigs, re-indexed
        index_of = self.tig_to_idx
        names = [contigs[x] for x in tour]
        tour = array.array('i', [index_of[name] for name in names
                                 if name not in remove])
        if not remove:
            break

    self.tour = tour
    self.flip_all(tour)

    return tour
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the CLI usage string (it is fed
    # to OptionParser below), so its exact text is runtime behavior.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # Accept a BAM; convert it to a 4-column BED via bamToBed if stale
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    # Ensure a BED sorted (uniquely) by read name; each later stage is
    # skipped when its output is newer than its input (need_update)
    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Gaps between a read's alignments (range_interleave) are the
        # candidate deletions; requires the BED grouped by read name
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in \
                    (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        # Keep only deletions supported by at least --mindepth reads
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in \
                (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile,
            "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        # Only the upper bound matters: drop regions with outlier-high depth
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(
                lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        # Emit a left and a right flank (<= 100 bp each) per deletion,
        # mutating b in place and printing it twice
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        # Deletions whose flanks intersect assembly gaps are excluded ("-v")
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score comes from the "-rN" read-count suffix of each accn
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i) \
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                      format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in \
                (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the CLI usage string (fed to
    # OptionParser), so its exact text is runtime behavior.
    # Fix: this copy used Python 2 `print >> fw, ...` statements, which are
    # syntax errors under Python 3; converted to `print(..., file=fw)` to
    # match the rest of the code.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # Accept a BAM; convert it to a 4-column BED via bamToBed if stale
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    # Ensure a BED sorted (uniquely) by read name; each later stage is
    # skipped when its output is newer than its input (need_update)
    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Gaps between a read's alignments (range_interleave) are the
        # candidate deletions; requires the BED grouped by read name
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print("\t".join(str(x) for x in
                                (seqid, start - 1, end, accn + '-d')), file=fw)
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        # Keep only deletions supported by at least --mindepth reads
        for (seqid, start, end), count in registry.items():
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name)), file=fw)
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        # Only the upper bound matters: drop regions with outlier-high depth
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print(b, file=fw)
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        # Emit a left and a right flank (<= 100 bp each) per deletion,
        # mutating b in place and printing it twice
        for b in bed:
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print(b, file=fw)
            b.start, b.end = max(start, end - flank + 1), end
            print(b, file=fw)
        fw.close()

        # Deletions whose flanks intersect assembly gaps are excluded ("-v")
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score comes from the "-rN" read-count suffix of each accn
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print("\t".join(str(x) for x in
                            (seqid, start - 1, end, ies_name, span)), file=fw)
            ies_id += 1
        fw.close()