def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    The POA consensus of the reads spanning the whole window serves as the
    draft.  All reads are then lifted into draft coordinates, loaded into a
    MultiMolecularIntegrator and refined with `refineConsensus`.

    Returns an ArrowConsensus.  A no-call consensus (built from
    `arrowConfig.noEvidenceConsensus`) is returned instead when the POA
    cannot be generated, when refinement does not converge, or when
    refinement / confidence computation raises.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr)
                   for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(poaCss, cc.IntegratorConfig())
    for (mr, snr) in mappedReads:
        # TODO (dalexander, lhepler): check for success to compute coverage accurately
        ai.AddRead(mr, snr)

    # Iterate until convergence.  Non-convergence is tested explicitly
    # instead of with `assert`, which is stripped under `python -O`.
    try:
        _, converged = refineConsensus(ai, arrowConfig)
        if converged:
            arrowCss = str(ai)
            if arrowConfig.computeConfidence:
                confidence = consensusConfidence(ai)
            else:
                confidence = np.zeros(shape=len(arrowCss), dtype=int)
            return ArrowConsensus(refWindow, arrowCss, confidence, ai)
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
    except Exception:
        tb = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, tb))

    return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                          refWindow, refSequence)
def onChunk(self, workChunk):
    """
    Compute the consensus and variants for one work chunk.

    Returns `(window, (consensus, variants))` where `window` is
    `workChunk.window`.  When the chunk has no coverage, the consensus is a
    no-call consensus and the variant list is empty.
    """
    window = workChunk.window
    refId, refStart, refEnd = window
    windowRefSeq = reference.sequenceInWindow(window)

    # Quick cutout for no-coverage case
    if not workChunk.hasCoverage:
        emptyCss = ArrowConsensus.noCallConsensus(
            self.arrowConfig.noEvidenceConsensus, window, windowRefSeq)
        return (window, (emptyCss, []))

    # General case
    #
    # We call consensus on the enlarged window and then map back to the
    # reference and clip the consensus at the implied bounds.  This seems
    # to be more reliable than cutting the consensus bluntly.
    enlarged = reference.enlargedReferenceWindow(
        window, options.referenceChunkOverlap)
    _, eStart, eEnd = enlarged
    contigSeq = reference.byName[refId].sequence
    enlargedRefSeq = contigSeq[eStart:eEnd]

    # Get the consensus for the enlarged window.
    wideCss, wideVariants = consensusAndVariantsForWindow(
        self._inAlnFile, enlarged, contigSeq,
        options.coverage, self.arrowConfig)

    # Restrict the consensus and variants to the reference window.
    alignment = cc.Align(enlargedRefSeq, wideCss.sequence)
    refToCss = cc.TargetToQueryPositions(alignment)
    kept = slice(refToCss[refStart - eStart], refToCss[refEnd - eStart])

    clippedSequence = wideCss.sequence[kept]
    clippedQv = wideCss.confidence[kept]
    keptVariants = [v for v in wideVariants
                    if refStart <= v.refStart < refEnd]

    return (window,
            (Consensus(window, clippedSequence, clippedQv), keptVariants))
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Returns `(css, variants)`: the joined consensus of all subintervals and
    the list of filtered variants accumulated over them.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="longest",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` is what the removed `np.int` alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), dtype=int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), dtype=int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="longest",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, arrowConfig)

            # Coverage is tabulated from the unclipped, unfiltered reads.
            siteCoverage = U.coverageInWindow(subWin, alns)

            variants_ = U.variantsFromConsensus(subWin, intRefSeq,
                                                css.sequence, css.confidence,
                                                siteCoverage, options.aligner,
                                                ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  Only a notice is logged here; no evidence is written.
            shouldDumpEvidence = (
                (options.dumpEvidence == "all") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))
            if shouldDumpEvidence:
                logging.info("Arrow does not yet support --dumpEvidence")
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Returns `(css, variants)`: the joined consensus of all subintervals and
    the list of variants accumulated over them.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` is what the removed `np.int` alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), dtype=int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), dtype=int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments fills `alnsUsed` (when not None) with
            # the alignments it actually used.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            if options.reportEffectiveCoverage:
                effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
            else:
                effectiveSiteCoverage = None

            variants_, newPureCss = U.variantsFromConsensus(
                subWin, intRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None,
                diploid=arrowConfig.polishDiploid)

            # Annotate?
            if options.annotateGFF:
                annotateVariants(variants_, clippedAlns)

            variants += variants_

            # The nascent consensus sequence might contain ambiguous bases,
            # these need to be removed as software in the wild cannot deal
            # with such characters and we only use IUPAC for *internal*
            # bookkeeping.
            if arrowConfig.polishDiploid:
                css.sequence = newPureCss
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig,
                           draft=None, polish=True, alnsUsed=None):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Returns an ArrowConsensus object.

    Requires that clipping has already been done.

    If `draft` is provided, it will serve as the starting point for
    polishing.  If not, the POA will be used to generate a draft starting
    point.

    If `polish` is False, the arrow polishing procedure will not be
    used, and the draft consensus will be returned.

    `alnsUsed` is an output parameter; if not None, it should be an
    empty list on entry; on return from this function, the list will
    contain the alns objects that were actually used to compute the
    consensus (those not filtered out).

    A no-call consensus (built from `arrowConfig.noEvidenceConsensus`) is
    returned when the POA cannot be generated, when fewer than
    `arrowConfig.minPoaCoverage` reads are accepted by the integrator, or
    when polishing does not converge.
    """
    _, refStart, refEnd = refWindow

    if alnsUsed is not None:
        assert alnsUsed == []

    if draft is None:
        # Compute the POA consensus, which is our initial guess, and
        # should typically be > 99.5% accurate
        fwdSequences = [a.read(orientation="genomic", aligned=False)
                        for a in alns
                        if a.spansReferenceRange(refStart, refEnd)]
        assert len(fwdSequences) >= arrowConfig.minPoaCoverage

        try:
            p = cc.PoaConsensus.FindConsensus(
                fwdSequences[:arrowConfig.maxPoaCoverage])
        except Exception:
            # `except Exception` rather than a bare except, so that
            # KeyboardInterrupt / SystemExit still propagate.
            logging.info("%s: POA could not be generated" % (refWindow,))
            return ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, refWindow, refSequence)
        draft = p.Sequence

    ga = cc.Align(refSequence, draft)

    # Extract reads into ConsensusCore2-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(
        draft, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for i, mr in enumerate(mappedReads):
        # Skip reads whose template span or length is under 2.
        if (mr.TemplateEnd <= mr.TemplateStart or
                mr.TemplateEnd - mr.TemplateStart < 2 or
                mr.Length() < 2):
            continue
        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            # Build the draft segment in read orientation for the log line.
            tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
            continue
        if ai.AddRead(mr) == cc.State_VALID:
            coverage += 1
            if alnsUsed is not None:
                # mappedReads is index-aligned with alns.
                alnsUsed.append(alns[i])

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow,))
        return ArrowConsensus.noCallConsensus(
            arrowConfig.noEvidenceConsensus, refWindow, refSequence)

    if not polish:
        confidence = np.zeros(len(draft), dtype=int)
        return ArrowConsensus(refWindow, draft, confidence, ai)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig)
    if converged:
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow, arrowCss, confidence, ai)
    else:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
        return ArrowConsensus.noCallConsensus(
            arrowConfig.noEvidenceConsensus, refWindow, refSequence)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    The POA consensus of the reads spanning the whole window serves as the
    draft.  Reads are lifted into draft coordinates, filtered by span and
    by `sufficientlyAccurate`, loaded into a MultiMolecularIntegrator and
    refined with `refineConsensus`.

    Returns an ArrowConsensus.  A no-call consensus (built from
    `arrowConfig.noEvidenceConsensus`) is returned when the POA cannot be
    generated, when fewer than `arrowConfig.minPoaCoverage` reads are
    accepted by the integrator, or when refinement does not converge.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.info("%s: POA could not be generated" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(
        poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for mr in mappedReads:
        # Skip reads whose template span or length is under 2.
        if (mr.TemplateEnd <= mr.TemplateStart or
                mr.TemplateEnd - mr.TemplateStart < 2 or
                mr.Length() < 2):
            continue
        if not sufficientlyAccurate(mr, poaCss, arrowConfig.minAccuracy):
            # Build the POA segment in read orientation for the log line.
            tpl = poaCss[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug("%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')" % (refWindow, mr.Name, tpl, mr.Seq))
            continue
        if ai.AddRead(mr) == cc.State_VALID:
            coverage += 1

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig)
    if converged:
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow, arrowCss, confidence, ai)
    else:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    The POA consensus of the reads spanning the whole window serves as the
    draft.  Reads (with their SNR) are lifted into draft coordinates,
    loaded into a MultiMolecularIntegrator and refined with
    `refineConsensus`.

    Returns an ArrowConsensus.  A no-call consensus (built from
    `arrowConfig.noEvidenceConsensus`) is returned when the POA cannot be
    generated, when fewer than `arrowConfig.minPoaCoverage` reads are
    added successfully, when refinement does not converge, or when
    refinement / confidence computation raises.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.info("%s: POA could not be generated" % (refWindow, ))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    poaCss = p.Sequence
    ga = cc.Align(refSequence, poaCss)

    # Extract reads into ConsensusCore2-compatible objects, and map them into
    # the coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr)
                   for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(
        poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for (mr, snr) in mappedReads:
        # Skip reads whose template span or length is under 2.
        if (mr.TemplateEnd <= mr.TemplateStart or
                mr.TemplateEnd - mr.TemplateStart < 2 or
                mr.Length() < 2):
            continue
        if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS:
            coverage += 1

    # TODO(lhepler, dalexander): propagate coverage around somehow

    # Both failure conditions below are tested explicitly instead of with
    # `assert`, which is stripped under `python -O`.
    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Insufficient coverage (%d) to call consensus (%d)"
                     % (refWindow, coverage, arrowConfig.minPoaCoverage))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)

    # Iterate until convergence
    try:
        _, converged = refineConsensus(ai, arrowConfig)
        if converged:
            arrowCss = str(ai)
            if arrowConfig.computeConfidence:
                confidence = consensusConfidence(ai)
            else:
                confidence = np.zeros(shape=len(arrowCss), dtype=int)
            return ArrowConsensus(refWindow, arrowCss, confidence, ai)
        logging.info("%s: Arrow did not converge to MLE" % (refWindow,))
    except Exception:
        tb = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, tb))

    return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                          refWindow, refSequence)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig,
                           draft=None, polish=True, alnsUsed=None):
    """
    Call consensus on a single interval, without splitting it further.

    Clipping must already have been applied to `alns`.

    `draft`, when given, is the starting point for polishing; otherwise a
    POA consensus is computed and used as the draft.

    With `polish` set to False the arrow polishing step is skipped and the
    draft itself is returned as the consensus.

    `alnsUsed` is an output parameter: pass None to ignore it, or an empty
    list which, on return, holds the alignments that were actually used to
    compute the consensus (those not filtered out).

    Returns an ArrowConsensus object.
    """
    _, refStart, refEnd = refWindow

    def noCall():
        # The "no evidence" consensus used on every failure path.
        return ArrowConsensus.noCallConsensus(
            arrowConfig.noEvidenceConsensus, refWindow, refSequence)

    def currentCall():
        # Snapshot the integrator's template and its confidence values.
        sequence = str(ai)
        if arrowConfig.computeConfidence:
            qvs = consensusConfidence(ai)
        else:
            qvs = np.zeros(shape=len(sequence), dtype=int)
        return sequence, qvs

    if alnsUsed is not None:
        assert alnsUsed == []

    if draft is None:
        # The POA consensus is our initial guess, and should typically be
        # > 99.5% accurate.
        spanningReads = [
            a.read(orientation="genomic", aligned=False)
            for a in alns
            if a.spansReferenceRange(refStart, refEnd)
        ]
        assert len(spanningReads) >= arrowConfig.minPoaCoverage
        try:
            poa = poaConsensus(spanningReads, arrowConfig)
        except Exception:
            logging.info("%s: POA could not be generated" % (refWindow, ))
            return noCall()
        draft = poa.Sequence

    refToDraft = cc.Align(refSequence, draft)

    # Turn the alignments into ConsensusCore2-compatible reads, then move
    # them into draft coordinates.
    extracted = [arrowConfig.extractMappedRead(aln, refStart)
                 for aln in alns]
    draftPositions = cc.TargetToQueryPositions(refToDraft)
    liftedReads = [lifted(draftPositions, read) for read in extracted]

    # Feed the usable reads to the integrator.
    ai = cc.Integrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for aln, mr in zip(alns, liftedReads):
        if (mr.TemplateEnd <= mr.TemplateStart
                or mr.TemplateEnd - mr.TemplateStart < 2
                or mr.Length() < 2):
            continue

        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            if mr.Strand == cc.StrandType_FORWARD:
                pass
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(tpl)
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug(
                "%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')"
                % (refWindow, mr.Name, tpl, mr.Seq))
            continue

        if ai.AddRead(mr) != cc.State_VALID:
            continue
        coverage += 1
        if alnsUsed is not None:
            alnsUsed.append(aln)

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow, ))
        return noCall()

    if not polish:
        return ArrowConsensus(refWindow, draft,
                              np.zeros(len(draft), dtype=int), ai)

    # Haploid polishing; a failure to converge is a no-call.
    _, converged = refineConsensus(ai, arrowConfig, polishDiploid=False)
    if not converged:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow, ))
        return noCall()
    arrowCss, confidence = currentCall()

    if arrowConfig.polishDiploid:
        # Additional rounds of diploid polishing; on non-convergence the
        # haploid sequence and confidence computed above are kept.
        _, converged = refineConsensus(ai, arrowConfig, polishDiploid=True)
        if converged:
            arrowCss, confidence = currentCall()
        else:
            logging.info(
                "%s: Arrow (diploid) did not converge to optimal solution" %
                (refWindow, ))

    return ArrowConsensus(refWindow, arrowCss, confidence, ai)
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Returns `(css, variants)`: the joined consensus of all subintervals and
    the list of filtered variants accumulated over them.  Depending on
    `options.dumpEvidence`, Arrow evidence may also be saved under
    `options.evidenceDirectory`.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` is what the removed `np.int` alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), dtype=int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), dtype=int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments fills `alnsUsed` (when not None) with
            # the alignments it actually used.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            if options.reportEffectiveCoverage:
                effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
            else:
                effectiveSiteCoverage = None

            variants_ = U.variantsFromConsensus(
                subWin, intRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  Note that `variants` is the running total for the
            # whole window, not only this subinterval.
            maybeDumpEvidence = (
                (options.dumpEvidence == "all") or
                (options.dumpEvidence == "outliers") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))
            if maybeDumpEvidence:
                refId, refStart, refEnd = subWin
                refName = reference.idToName(refId)
                windowDirectory = os.path.join(
                    options.evidenceDirectory,
                    refName,
                    "%d-%d" % (refStart, refEnd))
                ev = ArrowEvidence.fromConsensus(css)
                if options.dumpEvidence != "outliers":
                    ev.save(windowDirectory)
                elif (np.max(ev.delta) > 20):
                    # Mathematically I don't think we should be seeing
                    # deltas > 6 in magnitude, but let's just restrict
                    # attention to truly bonkers outliers.
                    ev.save(windowDirectory)
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a cmp.h5.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Returns `(css, variants)`: the joined consensus of all subintervals and
    the list of filtered variants accumulated over them.  Depending on
    `options.dumpEvidence`, Arrow evidence may also be saved under
    `options.evidenceDirectory`.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` is what the removed `np.int` alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), dtype=int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), dtype=int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments fills `alnsUsed` (when not None) with
            # the alignments it actually used.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            if options.reportEffectiveCoverage:
                effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
            else:
                effectiveSiteCoverage = None

            variants_ = U.variantsFromConsensus(
                subWin, intRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None)

            filteredVars = filterVariants(options.minCoverage,
                                          options.minConfidence,
                                          variants_)
            # Annotate?
            if options.annotateGFF:
                annotateVariants(filteredVars, clippedAlns)

            variants += filteredVars

            # Dump?  Note that `variants` is the running total for the
            # whole window, not only this subinterval.
            maybeDumpEvidence = (
                (options.dumpEvidence == "all") or
                (options.dumpEvidence == "outliers") or
                ((options.dumpEvidence == "variants") and
                 (len(variants) > 0)))
            if maybeDumpEvidence:
                refId, refStart, refEnd = subWin
                refName = reference.idToName(refId)
                windowDirectory = os.path.join(
                    options.evidenceDirectory,
                    refName,
                    "%d-%d" % (refStart, refEnd))
                ev = ArrowEvidence.fromConsensus(css)
                if options.dumpEvidence != "outliers":
                    ev.save(windowDirectory)
                elif (np.max(ev.delta) > 20):
                    # Mathematically I don't think we should be seeing
                    # deltas > 6 in magnitude, but let's just restrict
                    # attention to truly bonkers outliers.
                    ev.save(windowDirectory)
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants
def consensusAndVariantsForWindow(alnFile, refWindow, referenceContig,
                                  depthLimit, arrowConfig):
    """
    High-level routine for calling the consensus for a
    window of the genome given a BAM file.

    Identifies the coverage contours of the window in order to
    identify subintervals where a good consensus can be called.
    Creates the desired "no evidence consensus" where there is
    inadequate coverage.

    Returns `(css, variants)`: the joined consensus of all subintervals and
    the list of variants accumulated over them.
    """
    _, winStart, winEnd = refWindow
    logging.info("Arrow operating on %s" %
                 reference.windowToString(refWindow))

    if options.fancyChunking:
        # 1) identify the intervals with adequate coverage for arrow
        #    consensus; restrict to intervals of length > 10
        alnHits = U.readsInWindow(alnFile, refWindow,
                                  depthLimit=20000,
                                  minMapQV=arrowConfig.minMapQV,
                                  strategy="long-and-strand-balanced",
                                  stratum=options.readStratum,
                                  barcode=options.barcode)
        # Builtin `int` is what the removed `np.int` alias referred to.
        starts = np.fromiter((hit.tStart for hit in alnHits), dtype=int)
        ends = np.fromiter((hit.tEnd for hit in alnHits), dtype=int)
        intervals = kSpannedIntervals(refWindow, arrowConfig.minPoaCoverage,
                                      starts, ends, minLength=10)
        coverageGaps = holes(refWindow, intervals)
        allIntervals = sorted(intervals + coverageGaps)
        if len(allIntervals) > 1:
            logging.info("Usable coverage in %s: %r" %
                         (reference.windowToString(refWindow), intervals))
    else:
        allIntervals = [(winStart, winEnd)]

    # 2) pull out the reads we will use for each interval
    # 3) call consensusForAlignments on the interval
    subConsensi = []
    variants = []

    for interval in allIntervals:
        intStart, intEnd = interval
        intRefSeq = referenceContig[intStart:intEnd]
        subWin = subWindow(refWindow, interval)

        alns = U.readsInWindow(alnFile, subWin,
                               depthLimit=depthLimit,
                               minMapQV=arrowConfig.minMapQV,
                               strategy="long-and-strand-balanced",
                               stratum=options.readStratum,
                               barcode=options.barcode)
        clippedAlns_ = [aln.clippedTo(*interval) for aln in alns]
        clippedAlns = U.filterAlns(subWin, clippedAlns_, arrowConfig)

        numSpanning = sum(1 for a in clippedAlns
                          if a.spansReferenceRange(*interval))
        if numSpanning >= arrowConfig.minPoaCoverage:
            logging.debug("%s: Reads being used: %s" %
                          (reference.windowToString(subWin),
                           " ".join([str(hit.readName) for hit in alns])))

            # consensusForAlignments fills `alnsUsed` (when not None) with
            # the alignments it actually used.
            alnsUsed = [] if options.reportEffectiveCoverage else None
            css = U.consensusForAlignments(subWin, intRefSeq,
                                           clippedAlns, arrowConfig,
                                           alnsUsed=alnsUsed)

            # Tabulate the coverage implied by these alignments, as
            # well as the post-filtering ("effective") coverage
            siteCoverage = U.coverageInWindow(subWin, alns)
            if options.reportEffectiveCoverage:
                effectiveSiteCoverage = U.coverageInWindow(subWin, alnsUsed)
            else:
                effectiveSiteCoverage = None

            variants_, newPureCss = U.variantsFromConsensus(
                subWin, intRefSeq, css.sequence, css.confidence,
                siteCoverage, effectiveSiteCoverage,
                options.aligner, ai=None,
                diploid=arrowConfig.polishDiploid)

            # Annotate?
            if options.annotateGFF:
                annotateVariants(variants_, clippedAlns)

            variants += variants_

            # The nascent consensus sequence might contain ambiguous bases,
            # these need to be removed as software in the wild cannot deal
            # with such characters and we only use IUPAC for *internal*
            # bookkeeping.
            if arrowConfig.polishDiploid:
                css.sequence = newPureCss
        else:
            css = ArrowConsensus.noCallConsensus(
                arrowConfig.noEvidenceConsensus, subWin, intRefSeq)
        subConsensi.append(css)

    # 4) glue the subwindow consensus objects together to form the
    #    full window consensus
    css = join(subConsensi)

    # 5) Return
    return css, variants