def test_intervals_5():
    """
    A case where there is nowhere 3-spanning coverage

    Ten reads (x, x + 1) for x in 0..9 plus a single read (0, 10) are
    checked against the window (0, 0, 10): with k=2 the expected result
    is the ten unit intervals, with k=3 it is empty.
    """
    refWindow = (0, 0, 10)
    # `range` instead of the Python 2-only `xrange`, so the test also
    # runs under Python 3; the values produced are identical.
    unitIntervals = [(x, x + 1) for x in range(10)]
    reads = unitIntervals + [(0, 10)]
    start = np.array([readStart for readStart, _ in reads])
    end = np.array([readEnd for _, readEnd in reads])

    assert_equals(unitIntervals,
                  kSpannedIntervals(refWindow, 2, start, end))
    assert_equals([],
                  kSpannedIntervals(refWindow, 3, start, end))
def test_intervals_5():
    """
    A case where there is nowhere 3-spanning coverage

    The reads are ten unit-length reads (x, x + 1) for x in 0..9 and one
    read (0, 10), queried over the window (0, 0, 10).  For k=2 the
    expected answer is the ten unit intervals; for k=3 it is empty.
    """
    refWindow = (0, 0, 10)
    # `xrange` only exists on Python 2; `range` yields the same values
    # and works on both major versions.
    expectedAtK2 = [(x, x + 1) for x in range(10)]
    reads = list(expectedAtK2)
    reads.append((0, 10))
    start = np.array([read[0] for read in reads])
    end = np.array([read[1] for read in reads])

    assert_equals(expectedAtK2,
                  kSpannedIntervals(refWindow, 2, start, end))
    assert_equals([],
                  kSpannedIntervals(refWindow, 3, start, end))
def test_intervals_3():
    """
    Intervals covering the middle of the window -- "dromedary"

    Ten identical reads with start 3 and end 7 inside the window
    (0, 0, 10); with k=3 the expected result is the single
    interval (3, 7).
    """
    window = (0, 0, 10)
    numReads = 10
    readStarts = np.array([3] * numReads, dtype=int)
    readEnds = np.array([7] * numReads, dtype=int)

    expected = [(3, 7)]
    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals(expected, observed)
def test_intervals_2():
    """
    Intervals not touching the window

    Five reads (0, 1) and five reads (10, 15) are checked against the
    window (0, 1, 10); with k=3 the expected result is empty.
    """
    window = (0, 1, 10)
    perGroup = 5
    readStarts = np.array([0] * perGroup + [10] * perGroup, dtype=int)
    readEnds = np.array([1] * perGroup + [15] * perGroup, dtype=int)

    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals([], observed)
def test_intervals_1():
    """
    Intervals all covering the window

    Ten identical reads with start 100 and end 110; with k=3 the
    expected result is the single interval (100, 110).
    """
    # NOTE(review): the window end is 1010 while every read ends at 110;
    # possibly a typo for 110 -- left untouched, confirm with the author.
    window = (0, 100, 1010)
    numReads = 10
    readStarts = np.array([100] * numReads, dtype=int)
    readEnds = np.array([110] * numReads, dtype=int)

    expected = [(100, 110)]
    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals(expected, observed)
def test_intervals_underflow():
    """
    Regression test for a case that once gave the wrong results due to
    an underflow.

    The coordinates are deliberately unsigned (np.uint32): read i has
    start i and end i + 10 for i in 0..9, queried over the window
    (0, 5, 10) with k=3.
    """
    window = (0, 5, 10)
    readStarts = np.arange(10, dtype=np.uint32)
    readEnds = readStarts + 10

    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals([(5, 10)], observed)
def test_intervals_4():
    """
    Two intervals at the fringes, with a hole in the middle --- "camel"

    Five reads (103, 105) and five reads (107, 109) inside the window
    (0, 100, 110); with k=3 both read groups are expected back as
    separate intervals.
    """
    window = (0, 100, 110)
    perHump = 5
    readStarts = np.array([103] * perHump + [107] * perHump, dtype=int)
    readEnds = np.array([105] * perHump + [109] * perHump, dtype=int)

    expected = [(103, 105), (107, 109)]
    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals(expected, observed)
def test_intervals_4():
    """
    Two intervals at the fringes, with a hole in the middle --- "camel"

    The window is (0, 100, 110); five reads have start 103 / end 105
    and five have start 107 / end 109.  With k=3 the expected result is
    [(103, 105), (107, 109)].
    """
    window = (0, 100, 110)
    groupSize = 5
    leftStarts, rightStarts = [103] * groupSize, [107] * groupSize
    leftEnds, rightEnds = [105] * groupSize, [109] * groupSize
    readStarts = np.array(leftStarts + rightStarts, dtype=int)
    readEnds = np.array(leftEnds + rightEnds, dtype=int)

    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals([(103, 105), (107, 109)], observed)
def test_intervals_3():
    """
    Intervals covering the middle of the window -- "dromedary"

    The window is (0, 0, 10) and all ten reads have start 3 and end 7;
    with k=3 the expected result is [(3, 7)].
    """
    window = (0, 0, 10)
    depth = 10
    readStarts = np.array([3] * depth, dtype=int)
    readEnds = np.array([7] * depth, dtype=int)

    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals([(3, 7)], observed)
def test_intervals_2():
    """
    Intervals not touching the window

    The window is (0, 1, 10); five reads have start 0 / end 1 and five
    have start 10 / end 15.  With k=3 no interval is expected.
    """
    window = (0, 1, 10)
    groupSize = 5
    readStarts = np.array([0] * groupSize + [10] * groupSize, dtype=int)
    readEnds = np.array([1] * groupSize + [15] * groupSize, dtype=int)

    expected = []
    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals(expected, observed)
def test_intervals_1():
    """
    Intervals all covering the window

    All ten reads have start 100 and end 110; with k=3 the expected
    result is [(100, 110)].
    """
    # NOTE(review): window end 1010 looks like it may be a typo for 110
    # (the reads end at 110); kept as-is, confirm with the author.
    window = (0, 100, 1010)
    depth = 10
    readStarts = np.array([100] * depth, dtype=int)
    readEnds = np.array([110] * depth, dtype=int)

    observed = kSpannedIntervals(window, 3, readStarts, readEnds)
    assert_equals([(100, 110)], observed)
def consensus(alnReader, refWindow, referenceTable, alns):
    """
    Call the consensus for `refWindow` by stitching together consensus
    calls made on pieces of an enlarged copy of the window.

    The enlarged window is partitioned into K-spanned intervals (computed
    from the `tStart`/`tEnd` of `alns`) and the holes between them.  Every
    piece starts out as `Consensus.nAsConsensus`; a K-spanned piece for
    which at least K alignments survive `q.utils.filterAlns` is replaced
    by `q.utils.consensusForAlignments`.  The joined result is aligned
    back to the reference and clipped to `refWindow`.

    Reads the module-level names `overlap`, `K` and `quiverConfig`.

    Returns a `Consensus` built from `refWindow` and the clipped
    sequence and confidence values.
    """
    # Enlarge the window (by `overlap`, presumably on both sides --
    # see enlargedReferenceWindow) and pull out its reference sequence.
    refName = alnReader.referenceInfo(refWindow.refId).FullName
    contigSeq = referenceTable[refName].sequence
    eWindow = enlargedReferenceWindow(refWindow, len(contigSeq), overlap)
    refSeqInEnlargedWindow = contigSeq[eWindow.start:eWindow.end]

    # Find the K-spanned intervals in the enlarged window, plus the holes
    # between them, so that the pieces tile the enlarged window.
    readStarts = [a.tStart for a in alns]
    readEnds = [a.tEnd for a in alns]
    coveredIntervals = w.kSpannedIntervals(eWindow, K, readStarts, readEnds)
    uncovered = w.holes(eWindow, coveredIntervals)

    # Call a consensus for each piece, in coordinate order.
    subConsensi = []
    for interval in sorted(coveredIntervals + uncovered):
        subWin = subWindow(eWindow, interval)
        pieceStart, pieceEnd = interval
        pieceRefSeq = refSeqInEnlargedWindow[pieceStart - eWindow.start:
                                             pieceEnd - eWindow.start]
        # Placeholder result, kept unless real evidence is found below.
        pieceCss = Consensus.nAsConsensus(subWin, pieceRefSeq)
        if interval in coveredIntervals:
            pieceAlns = readsInWindow(alnReader, subWin,
                                      depthLimit=100,
                                      minMapQV=quiverConfig.minMapQV,
                                      strategy="longest")
            clipped = [aln.clippedTo(*interval) for aln in pieceAlns]
            usable = q.utils.filterAlns(subWin, clipped, quiverConfig)
            if len(usable) >= K:
                pieceCss = q.utils.consensusForAlignments(
                    subWin, pieceRefSeq, usable, quiverConfig)
        subConsensi.append(pieceCss)

    # Join the per-piece consensus objects.
    joined = join(subConsensi)

    # Align the joined consensus back to the enlarged reference and clip
    # it to the coordinates of the requested window.
    alignment = cc.Align(refSeqInEnlargedWindow, joined.sequence)
    targetPositions = cc.TargetToQueryPositions(alignment)
    clipStart = targetPositions[refWindow.start - eWindow.start]
    clipEnd = targetPositions[refWindow.end - eWindow.start]

    return Consensus(refWindow,
                     joined.sequence[clipStart:clipEnd],
                     joined.confidence[clipStart:clipEnd])
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile: alignment source handed to `U.readsInWindow`.
      refWindow: (id, start, end) triple naming the reference window.
      referenceContig: reference sequence, sliced here by the interval
        start/end coordinates.
      depthLimit: depth limit forwarded to `U.readsInWindow` when
        fetching the reads of each interval.
      arrowConfig: configuration object; `minMapQV`, `minPoaCoverage`,
        `noEvidenceConsensus` and `polishDiploid` are read from it.

    Returns:
      `(css, variants)`: the joined consensus of all intervals and the
      list of variants accumulated over them.

    Also reads the module-level `options` object.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` instead of `np.int`: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24, and always meant builtin int.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        # Same slice as intRefSeq; the second name is kept because the
        # variant-calling call below refers to it.
        windowRefSeq = intRefSeq
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        # Only alignments spanning the whole interval count towards the
        # coverage requirement.
        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments receives this list via `alnsUsed`;
            # it is only requested when effective coverage is reported.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            effectiveSiteCoverage = \
                U.coverageInWindow(subWin, alnsUsed) \
                if options.reportEffectiveCoverage else None

            variants_, newPureCss = U.variantsFromConsensus(
                subWin, windowRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None,
                diploid=arrowConfig.polishDiploid)

            # Annotate?
            if options.annotateGFF:
                annotateVariants(variants_, clippedAlns)

            variants += variants_

            # The nascent consensus sequence might contain ambiguous
            # bases, these need to be removed as software in the wild
            # cannot deal with such characters and we only use IUPAC for
            # *internal* bookkeeping.
            if arrowConfig.polishDiploid:
                css.sequence = newPureCss
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile: alignment source handed to `U.readsInWindow`.
      refWindow: (id, start, end) triple naming the reference window.
      referenceContig: reference sequence, sliced here by the interval
        start/end coordinates.
      depthLimit: depth limit forwarded to `U.readsInWindow` when
        fetching the reads of each interval.
      arrowConfig: configuration object; `minMapQV`, `minPoaCoverage`
        and `noEvidenceConsensus` are read from it.

    Returns:
      `(css, variants)`: the joined consensus of all intervals and the
      list of filtered variants accumulated over them.

    Also reads the module-level `options` object and, depending on
    `options.dumpEvidence`, saves evidence under
    `options.evidenceDirectory`.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` instead of `np.int`: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24, and always meant builtin int.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        # Same slice as intRefSeq; the second name is kept because the
        # variant-calling call below refers to it.
        windowRefSeq = intRefSeq
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        # Only alignments spanning the whole interval count towards the
        # coverage requirement.
        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments receives this list via `alnsUsed`;
            # it is only requested when effective coverage is reported.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            effectiveSiteCoverage = \
                U.coverageInWindow(subWin, alnsUsed) \
                if options.reportEffectiveCoverage else None

            variants_ = U.variantsFromConsensus(
                subWin, windowRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  "all" and "outliers" always qualify; "variants"
            # only once the accumulated variant list is non-empty.  The
            # grouping is spelled out because `and` binds tighter than
            # `or`.
            maybeDumpEvidence = (
                (options.dumpEvidence == "all") or
                (options.dumpEvidence == "outliers") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))

            if maybeDumpEvidence:
                refId, refStart, refEnd = subWin
                refName = reference.idToName(refId)
                windowDirectory = os.path.join(
                    options.evidenceDirectory,
                    refName,
                    "%d-%d" % (refStart, refEnd))
                ev = ArrowEvidence.fromConsensus(css)
                if options.dumpEvidence != "outliers":
                    ev.save(windowDirectory)
                elif np.max(ev.delta) > 20:
                    # Mathematically I don't think we should be seeing
                    # deltas > 6 in magnitude, but let's just restrict
                    # attention to truly bonkers outliers.
                    ev.save(windowDirectory)
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile: alignment source handed to `U.readsInWindow`.
      refWindow: (id, start, end) triple naming the reference window.
      referenceContig: reference sequence, sliced here by the interval
        start/end coordinates.
      depthLimit: depth limit forwarded to `U.readsInWindow` when
        fetching the reads of each interval.
      arrowConfig: configuration object; `minMapQV`, `minPoaCoverage`
        and `noEvidenceConsensus` are read from it.

    Returns:
      `(css, variants)`: the joined consensus of all intervals and the
      list of filtered variants accumulated over them.

    Also reads the module-level `options` object and, depending on
    `options.dumpEvidence`, saves evidence under
    `options.evidenceDirectory`.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` is gone from NumPy >= 1.24 (deprecated since 1.20);
        # it was only ever an alias of the builtin `int` used here.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        # Same slice as intRefSeq; the second name is kept because the
        # variant-calling call below refers to it.
        windowRefSeq = intRefSeq
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        # Only alignments spanning the whole interval count towards the
        # coverage requirement.
        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments receives this list via `alnsUsed`;
            # it is only requested when effective coverage is reported.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            effectiveSiteCoverage = \
                U.coverageInWindow(subWin, alnsUsed) \
                if options.reportEffectiveCoverage else None

            variants_ = U.variantsFromConsensus(
                subWin, windowRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  "all" and "outliers" always qualify; "variants"
            # only once the accumulated variant list is non-empty.  The
            # grouping is spelled out because `and` binds tighter than
            # `or`.
            maybeDumpEvidence = (
                (options.dumpEvidence == "all") or
                (options.dumpEvidence == "outliers") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))

            if maybeDumpEvidence:
                refId, refStart, refEnd = subWin
                refName = reference.idToName(refId)
                windowDirectory = os.path.join(
                    options.evidenceDirectory,
                    refName,
                    "%d-%d" % (refStart, refEnd))
                ev = ArrowEvidence.fromConsensus(css)
                if options.dumpEvidence != "outliers":
                    ev.save(windowDirectory)
                elif np.max(ev.delta) > 20:
                    # Mathematically I don't think we should be seeing
                    # deltas > 6 in magnitude, but let's just restrict
                    # attention to truly bonkers outliers.
                    ev.save(windowDirectory)
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile: alignment source handed to `U.readsInWindow`.
      refWindow: (id, start, end) triple naming the reference window.
      referenceContig: reference sequence, sliced here by the interval
        start/end coordinates.
      depthLimit: depth limit forwarded to `U.readsInWindow` when
        fetching the reads of each interval.
      arrowConfig: configuration object; `minMapQV`, `minPoaCoverage`
        and `noEvidenceConsensus` are read from it.

    Returns:
      `(css, variants)`: the joined consensus of all intervals and the
      list of filtered variants accumulated over them.

    Also reads the module-level `options` object.  Evidence dumping is
    not implemented here: a qualifying `options.dumpEvidence` setting
    only produces a log message.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="longest",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` instead of `np.int`: the alias was deprecated in
        # NumPy 1.20 and removed in 1.24, and always meant builtin int.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        # Same slice as intRefSeq; the second name is kept because the
        # variant-calling call below refers to it.
        windowRefSeq = intRefSeq
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="longest",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        # Only alignments spanning the whole interval count towards the
        # coverage requirement.
        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig)

            siteCoverage = U.coverageInWindow(subWin, alns)

            variants_ = U.variantsFromConsensus(
                subWin, windowRefSeq, css.sequence, css.confidence,
                siteCoverage, options.aligner, ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  "all" always qualifies; "variants" only once the
            # accumulated variant list is non-empty.  The grouping is
            # spelled out because `and` binds tighter than `or`.
            shouldDumpEvidence = (
                (options.dumpEvidence == "all") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))
            if shouldDumpEvidence:
                # Nothing is written; the request is only acknowledged.
                logging.info("Arrow does not yet support --dumpEvidence")
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a window of the
    genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Parameters:
      alnFile: alignment source handed to `U.readsInWindow`.
      refWindow: (id, start, end) triple naming the reference window.
      referenceContig: reference sequence, sliced here by the interval
        start/end coordinates.
      depthLimit: depth limit forwarded to `U.readsInWindow` when
        fetching the reads of each interval.
      arrowConfig: configuration object; `minMapQV`, `minPoaCoverage`,
        `noEvidenceConsensus` and `polishDiploid` are read from it.

    Returns:
      `(css, variants)`: the joined consensus of all intervals and the
      list of variants accumulated over them.

    Also reads the module-level `options` object.
    """
    winId, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus (kSpannedIntervals is called with minLength=10)
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # `np.int` is gone from NumPy >= 1.24 (deprecated since 1.20);
        # it was only ever an alias of the builtin `int` used here.
        starts = np.fromiter((hit.tStart for hit in alnHits), int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), int)
        intervals = kSpannedIntervals(refWindow,
                                      arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        # Same slice as intRefSeq; the second name is kept because the
        # variant-calling call below refers to it.
        windowRefSeq = intRefSeq
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        # Only alignments spanning the whole interval count towards the
        # coverage requirement.
        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))

        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments receives this list via `alnsUsed`;
            # it is only requested when effective coverage is reported.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin,
                                           intRefSeq,
                                           clippedAlns,
                                           arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            effectiveSiteCoverage = \
                U.coverageInWindow(subWin, alnsUsed) \
                if options.reportEffectiveCoverage else None

            variants_, newPureCss = U.variantsFromConsensus(
                subWin, windowRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None,
                diploid=arrowConfig.polishDiploid)

            # Annotate?
            if options.annotateGFF:
                annotateVariants(variants_, clippedAlns)

            variants += variants_

            # The nascent consensus sequence might contain ambiguous
            # bases, these need to be removed as software in the wild
            # cannot deal with such characters and we only use IUPAC for
            # *internal* bookkeeping.
            if arrowConfig.polishDiploid:
                css.sequence = newPureCss
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants