def extractMappedRead(self, aln, windowStart):
    """
    Build a ``cc.MappedRead`` from a clipped alignment.

    The alignment's reference start/end are shifted by `windowStart` so
    that they are expressed in template space (which starts at 0).  The
    read carries its native-orientation sequence, its "Ipd" and
    "PulseWidth" base features, its HQ-region SNR and its chemistry.

    CmpH5 alignments are rejected through ``die``; the alignment must
    have a positive reference span.
    """
    if isinstance(aln, CmpH5Alignment):
        die("Arrow does not support CmpH5 files!")

    assert aln.referenceSpan > 0

    def baseFeature(featureName):
        # A feature the reader does not provide is replaced by one zero per
        # read base; a present feature is saturated into the uint8 range.
        if not aln.reader.hasBaseFeature(featureName):
            return np.zeros((aln.readLength,), dtype=np.uint8)
        unclipped = aln.baseFeature(featureName,
                                    aligned=False,
                                    orientation="native")
        return unclipped.clip(0, 255).astype(np.uint8)

    readName = aln.readName
    readChemistry = aln.sequencingChemistry
    if aln.isReverseStrand:
        strand = cc.StrandType_REVERSE
    else:
        strand = cc.StrandType_FORWARD

    read = cc.Read(readName,
                   aln.read(aligned=False, orientation="native"),
                   cc.Uint8Vector(baseFeature("Ipd").tolist()),
                   cc.Uint8Vector(baseFeature("PulseWidth").tolist()),
                   cc.SNR(aln.hqRegionSnr),
                   readChemistry)

    templateStart = int(aln.referenceStart - windowStart)
    templateEnd = int(aln.referenceEnd - windowStart)
    return cc.MappedRead(read, strand, templateStart, templateEnd)
def uniqueSingleBaseMutations(templateSequence, positions=None):
    """
    Return an iterator over all single-base mutations of a
    templateSequence that result in unique mutated sequences.

    `positions` is an iterable of template indices to mutate.  When it is
    None (the default) every position of `templateSequence` is visited.
    An explicitly empty `positions` yields nothing.
    """
    allBases = "ACGT"
    # Test against None explicitly: `positions or ...` would treat an empty
    # list as "all positions" and would raise on multi-element numpy arrays.
    if positions is None:
        positions = xrange(0, len(templateSequence))
    for tplStart in positions:
        tplBase = templateSequence[tplStart]
        prevTplBase = templateSequence[tplStart - 1] if (tplStart > 0) else None
        # snvs
        for subsBase in allBases:
            if subsBase != tplBase:
                yield cc.Mutation(cc.MutationType_SUBSTITUTION,
                                  tplStart,
                                  subsBase)
        # Insertions---only allowing insertions that are not cognate
        # with the previous base.
        for insBase in allBases:
            if insBase != prevTplBase:
                yield cc.Mutation(cc.MutationType_INSERTION,
                                  tplStart,
                                  insBase)
        # Deletion--only allowed if refBase does not match previous tpl base
        if tplBase != prevTplBase:
            yield cc.Mutation(cc.MutationType_DELETION, tplStart)
def configure(options, alnFile):
    """
    Validate the inputs for an Arrow run and build its configuration.

    Checks, in order:
      - `alnFile.readType` must be "standard", otherwise
        ``U.IncompatibleDataException`` is raised;
      - a warning is logged when `options.diploid` is set;
      - model parameters are loaded from `options.parametersFile` if given;
      - the chemistries in use (either the `options.parametersSpec`
        override, or those reported by `alnFile.sequencingChemistry` when
        the spec is "auto") must all be supported by the loaded models;
      - every read group whose chemistry is not "P6-C4" or "S/P1-C1/beta"
        must provide both the "Ipd" and "PulseWidth" base features.

    Failures other than the read-type check are reported through ``die``.

    Returns an ``M.ArrowConfig`` populated from `options`.
    """
    if alnFile.readType != "standard":
        raise U.IncompatibleDataException(
            "The Arrow algorithm requires a BAM file containing standard (non-CCS) reads."
        )

    if options.diploid:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("Diploid analysis not yet supported under Arrow model.")

    # load parameters from file
    if options.parametersFile:
        logging.info("Loading model parameters from: ({0})".format(
            options.parametersFile))
        if not cc.LoadModels(options.parametersFile):
            die("Arrow: unable to load parameters from: ({0})".format(
                options.parametersFile))

    # test available chemistries
    supp = set(cc.SupportedChemistries())
    logging.info("Found consensus models for: ({0})".format(", ".join(
        sorted(supp))))

    used = set()
    if options.parametersSpec != "auto":
        logging.info("Overriding model selection with: ({0})".format(
            options.parametersSpec))
        if not cc.OverrideModel(options.parametersSpec):
            die("Arrow: unable to override model with: ({0})".format(
                options.parametersSpec))
        used.add(options.parametersSpec)
    else:
        used.update(alnFile.sequencingChemistry)

    unsupp = used - supp
    if unsupp:
        die("Arrow: unsupported chemistries found: ({0})".format(", ".join(
            sorted(unsupp))))

    # All arrow models require PW except P6 and the first S/P1-C1
    featureExemptChemistries = ("P6-C4", "S/P1-C1/beta")
    for readGroup in alnFile.readGroupTable:
        if readGroup["SequencingChemistry"] in featureExemptChemistries:
            continue
        if ("Ipd" not in readGroup["BaseFeatures"] or
                "PulseWidth" not in readGroup["BaseFeatures"]):
            die("Arrow model requires missing base feature: IPD or PulseWidth"
                )

    logging.info("Using consensus models for: ({0})".format(", ".join(
        sorted(used))))

    return M.ArrowConfig(minMapQV=options.minMapQV,
                         noEvidenceConsensus=options.noEvidenceConsensusCall,
                         computeConfidence=(not options.fastMode),
                         minReadScore=options.minReadScore,
                         minHqRegionSnr=options.minHqRegionSnr,
                         minZScore=options.minZScore,
                         minAccuracy=options.minAccuracy)
def refineConsensus(ai, arrowConfig):
    """
    Polish the template held by the integrator `ai` in place.

    The polishing run is configured from `arrowConfig` (maxIterations,
    mutationSeparation, mutationNeighborhood).

    Return (consensus, didConverge) :: (str, bool), where the consensus
    is ``str(ai)`` after polishing.
    """
    polishConfig = cc.PolishConfig(arrowConfig.maxIterations,
                                   arrowConfig.mutationSeparation,
                                   arrowConfig.mutationNeighborhood)
    # cc.Polish reports three values; only the convergence flag is needed.
    didConverge, _, _ = cc.Polish(ai, polishConfig)
    consensus = str(ai)
    return consensus, didConverge
def variantsFromConsensus(refWindow,
                          refSequenceInWindow,
                          cssSequenceInWindow,
                          cssQvInWindow=None,
                          siteCoverage=None,
                          aligner="affine",
                          ai=None):
    """
    Compare the consensus and the reference in this window, returning
    a list of variants.  Uses the integrator to identify heterozygous
    variants.

    `cssQvInWindow` and `siteCoverage` must be supplied together or not
    at all.  When `ai` is given, heterozygous sites are encoded into the
    consensus as IUPAC codes before it is aligned to the reference; when
    QVs are supplied as well, the QV at each such site is set to 40.

    NOTE(review): `aligner` is accepted but not used by this function.
    """
    assert (cssQvInWindow is None) == (siteCoverage is None)  # Both or none

    if ai is not None:
        #
        # Hunting diploid variants:
        # 1. find confident heterozygous sites;
        # 2. build a "diploid consensus" using IUPAC encoding
        #    for het sites; mark cssQv accordingly
        # 3. align diploid consensus to reference
        # 4. extract and decorate variants
        #
        assert str(ai) == cssSequenceInWindow
        iupacMutations = []  # List of (Mutation, confidence)
        for pos in xrange(0, ai.Length()):
            ds = cc.IsSiteHeterozygous(scoresForPosition(ai, pos), 40)
            if ds:
                # Index 0 stands for "no mutation"
                muts = [None] + list(
                    allSingleBaseMutations(cssSequenceInWindow,
                                           positions=[pos]))
                mut0 = muts[ds.Allele0]
                mut1 = muts[ds.Allele1]
                cssBase = cssSequenceInWindow[pos]
                packedMut = packMuts(cssBase, mut0, mut1)
                iupacMutations.append((packedMut, 40))

        # Create diploidCss by applying mutations, meanwhile updating the
        # confidence vector accordingly.
        diploidCss = cc.ApplyMutations([pair[0] for pair in iupacMutations],
                                       cssSequenceInWindow)

        # The QV vector is optional: without this guard a missing QV vector
        # raised TypeError on the slice assignment / len() below.
        if cssQvInWindow is not None:
            diploidQv = list(cssQvInWindow)
            # NOTE(review): runningLengthDiff is never updated, so QV indices
            # following a length-changing mutation may be shifted -- confirm
            # whether it should accumulate each mutation's length change.
            runningLengthDiff = 0
            for (mut, conf) in iupacMutations:
                start = mut.Start() + runningLengthDiff
                end = mut.End() + runningLengthDiff
                diploidQv[start:end] = [conf]
            assert len(diploidCss) == len(diploidQv)
        else:
            diploidQv = None

        cssSequenceInWindow = diploidCss
        cssQvInWindow = diploidQv

    return variantsFromAlignment(refWindow, refSequenceInWindow,
                                 cssSequenceInWindow, cssQvInWindow,
                                 siteCoverage)
def refineConsensus(ai, arrowConfig, polishDiploid=False):
    """
    Polish the template held by the integrator `ai` in place.

    The polishing run is configured from `arrowConfig` (maxIterations,
    mutationSeparation, mutationNeighborhood) together with the
    `polishDiploid` flag.  When `arrowConfig.maskRadius` is truthy, a
    first polishing pass is run, intervals are masked on `ai` using
    maskRadius / maskErrorRate, and only then the final pass is run.

    Return (consensus, didConverge) :: (str, bool), taken from the final
    polishing pass.
    """
    polishConfig = cc.PolishConfig(arrowConfig.maxIterations,
                                   arrowConfig.mutationSeparation,
                                   arrowConfig.mutationNeighborhood,
                                   polishDiploid)

    if arrowConfig.maskRadius:
        # Result of the pre-masking pass is intentionally discarded
        cc.Polish(ai, polishConfig)
        ai.MaskIntervals(arrowConfig.maskRadius, arrowConfig.maskErrorRate)

    finalPass = cc.Polish(ai, polishConfig)
    consensus = str(ai)
    return consensus, finalPass.hasConverged
def onChunk(self, workChunk):
    """
    Compute the consensus and variants for one work chunk.

    Returns ``(referenceWindow, (consensus, variants))``.  A chunk without
    coverage yields a no-call consensus and an empty variant list.
    Otherwise consensus is called on an enlarged window and then
    restricted to the chunk's own reference window.
    """
    window = workChunk.window
    refId, refStart, refEnd = window
    windowRefSeq = reference.sequenceInWindow(window)

    # Quick cutout for no-coverage case
    if not workChunk.hasCoverage:
        emptyCss = ArrowConsensus.noCallConsensus(
            self.arrowConfig.noEvidenceConsensus, window, windowRefSeq)
        return (window, (emptyCss, []))

    # General case.
    # We call consensus on the enlarged window and then map back to the
    # reference and clip the consensus at the implied bounds.  This seems
    # to be more reliable than cutting the consensus bluntly.
    enlargedWindow = reference.enlargedReferenceWindow(
        window, options.referenceChunkOverlap)
    _, eStart, eEnd = enlargedWindow
    contigSeq = reference.byName[refId].sequence
    enlargedRefSeq = contigSeq[eStart:eEnd]

    # Consensus and variants for the enlarged window
    wideCss, wideVariants = consensusAndVariantsForWindow(
        self._inAlnFile, enlargedWindow, contigSeq, options.coverage,
        self.arrowConfig)

    # Translate the chunk's reference bounds into consensus coordinates
    alignment = cc.Align(enlargedRefSeq, wideCss.sequence)
    refToCss = cc.TargetToQueryPositions(alignment)
    cssStart = refToCss[refStart - eStart]
    cssEnd = refToCss[refEnd - eStart]

    clippedSequence = wideCss.sequence[cssStart:cssEnd]
    clippedQv = wideCss.confidence[cssStart:cssEnd]
    keptVariants = [v for v in wideVariants
                    if refStart <= v.refStart < refEnd]

    clippedCss = Consensus(window, clippedSequence, clippedQv)
    return (window, (clippedCss, keptVariants))
def extractMappedRead(aln, windowStart):
    """
    Build a ``(cc.MappedRead, cc.SNR)`` pair from a clipped alignment.

    The alignment's reference start/end are shifted by `windowStart` so
    that they are expressed in template space (which starts at 0).  The
    read's features come from ``ArrowConfig.extractFeatures``; the SNR is
    built from the alignment's HQ-region SNR.
    """
    assert aln.referenceSpan > 0

    readName = aln.readName
    readChemistry = aln.sequencingChemistry
    if aln.isReverseStrand:
        strand = cc.StrandEnum_REVERSE
    else:
        strand = cc.StrandEnum_FORWARD

    read = cc.Read(readName, ArrowConfig.extractFeatures(aln), readChemistry)
    templateStart = int(aln.referenceStart - windowStart)
    templateEnd = int(aln.referenceEnd - windowStart)
    mapped = cc.MappedRead(read, strand, templateStart, templateEnd)
    return (mapped, cc.SNR(aln.hqRegionSnr))
def consensusConfidence(ai, positions=None):
    """
    Returns an array of QV values reflecting the consensus confidence
    at each position specified.  If the `positions` argument is
    omitted, confidence values are returned for all positions in the
    consensus (str(ai)).

    QVs are clipped to the range [0, 93] and returned as a uint8 array.
    `positions`, when given, is an iterable of integer indices into the
    consensus; the result then holds one QV per requested index, in the
    order requested.
    """
    qvs = np.array(np.clip(cc.ConsensusQVs(ai), 0, 93), dtype=np.uint8)
    if positions is None:
        return qvs
    # Previously `positions` was silently ignored; honour it as documented.
    return qvs[np.asarray(list(positions), dtype=int)]
def allSingleBaseMutations(templateSequence, positions=None):
    """
    Same as ``uniqueSingleBaseMutations``, but no filtering as to
    whether the mutated sequences are unique.

    `positions` is an iterable of template indices to mutate.  When it is
    None (the default) every position of `templateSequence` is visited.
    An explicitly empty `positions` yields nothing.

    For each position this yields, in order: the three substitutions to
    a different base, the four insertions, and one single-base deletion.
    """
    allBases = "ACGT"
    # Test against None explicitly: `positions or ...` would treat an empty
    # list as "all positions" and would raise on multi-element numpy arrays.
    if positions is None:
        positions = xrange(0, len(templateSequence))
    for tplStart in positions:
        tplBase = templateSequence[tplStart]
        # snvs
        for subsBase in allBases:
            if subsBase != tplBase:
                yield cc.Mutation_Substitution(tplStart, subsBase)
        # Insertions
        for insBase in allBases:
            yield cc.Mutation_Insertion(tplStart, insBase)
        # Deletion
        yield cc.Mutation_Deletion(tplStart, 1)
def poaConsensus(fwdSequences, arrowConfig):
    """
    Compute a POA consensus from forward-oriented read sequences.

    The sequences are ordered by how close their length is to the median
    length, and at most `arrowConfig.maxPoaCoverage` of them are used.
    The minimum coverage passed to the POA is 1 when fewer than 5
    sequences are used, and ``(cov + 1) // 2 - 1`` otherwise.

    Returns whatever ``cc.PoaConsensus.FindConsensus`` returns.
    """
    seqLens = [len(seq) for seq in fwdSequences]
    median = np.median(seqLens)
    ordSeqs = sorted(fwdSequences, key=lambda seq: abs(len(seq) - median))
    ordSeqs = ordSeqs[:arrowConfig.maxPoaCoverage]
    cov = len(ordSeqs)
    # Floor division keeps minCov an integer regardless of whether true
    # division is in effect; for ints it equals the old Python 2 `/`.
    minCov = 1 if cov < 5 else ((cov + 1) // 2 - 1)
    poaConfig = cc.DefaultPoaConfig(cc.AlignMode_GLOBAL)
    return cc.PoaConsensus.FindConsensus(ordSeqs, poaConfig, minCov)
def lifted(queryPositions, mappedRead):
    """
    Return a copy of `mappedRead` whose TemplateStart / TemplateEnd have
    been translated through the position table `queryPositions`.

    The input read is not modified.
    """
    # Look both endpoints up before copying, so a bad index fails early
    liftedStart = queryPositions[mappedRead.TemplateStart]
    liftedEnd = queryPositions[mappedRead.TemplateEnd]

    liftedRead = cc.MappedRead(mappedRead)
    liftedRead.TemplateStart = liftedStart
    liftedRead.TemplateEnd = liftedEnd
    return liftedRead
def ScoreCas9Site(seq):
    """
    Score `seq` against every entry of GUIDES and report the best one.

    Each guide is extended with "NGG" and aligned to `seq` using the
    module-level `cfg`.  The accuracy of a guide is the number of matches
    plus the number of 'N' characters in the aligned query, divided by
    the query length.

    Returns (key, accuracy) for the highest-accuracy guide; ties keep the
    guide encountered first.  Returns (None, None) if GUIDES is empty.
    """
    bestKey, bestAcc = None, None
    for guideKey, guideRna in GUIDES.iteritems():
        query = guideRna + "NGG"
        alignment = cc.Align(seq, query, cfg)
        # 'N' positions are counted as if they had matched
        wildcardCount = len([b for b in alignment.Query() if b == 'N'])
        accuracy = (alignment.Matches() + wildcardCount) / float(len(query))
        if bestAcc is not None and not (accuracy > bestAcc):
            continue
        bestKey, bestAcc = guideKey, accuracy
    return (bestKey, bestAcc)
def packMuts(cssBase, mut1, mut2):
    """
    Merge two mutations at the same site into one IUPAC-coded mutation.

    Both mutations are expected to share Start, End and LengthDiff.  The
    no-op mutation is coded as None, in which case `cssBase` stands in
    for that allele's bases.

    Example1: (_, Subs A, Subs T) -> Subs W
    Example2: (_, Ins A, Ins T)   -> Ins W
    Example3: (A, None, Subs T)   -> Subs W
    """
    # Position and type are taken from whichever mutation is present
    template = mut1 or mut2
    packedStart = template.Start()
    packedType = template.Type()

    if mut1:
        allele1 = mut1.Bases()
    else:
        allele1 = cssBase
    if mut2:
        allele2 = mut2.Bases()
    else:
        allele2 = cssBase

    return cc.Mutation(packedType, packedStart, packIUPAC((allele1, allele2)))
def sufficientlyAccurate(mappedRead, poaCss, minAccuracy):
    """
    Decide whether `mappedRead` agrees well enough with the POA consensus.

    The read is aligned against the slice of `poaCss` it is mapped to
    (reverse-complemented for reverse-strand reads).  Accuracy is
    ``1 - min(nErrors, tlen) / tlen``, where nErrors counts the
    non-'M' entries of the alignment transcript and tlen is the length
    of the template slice.

    Returns True when `minAccuracy` <= 0 (filter disabled) or when the
    accuracy is at least `minAccuracy`.  Returns False for a read whose
    strand is neither forward nor reverse, and for a read whose template
    slice is empty.
    """
    if minAccuracy <= 0.0:
        return True
    s, e = mappedRead.TemplateStart, mappedRead.TemplateEnd
    tpl = poaCss[s:e]
    if not tpl:
        # An empty template slice used to end in a ZeroDivisionError below;
        # a read covering no template cannot be judged accurate.
        return False
    if mappedRead.Strand == cc.StrandType_FORWARD:
        pass
    elif mappedRead.Strand == cc.StrandType_REVERSE:
        tpl = reverseComplement(tpl)
    else:
        return False
    aln = cc.AlignLinear(tpl, mappedRead.Seq)
    nErrors = sum(1 for t in aln.Transcript() if t != 'M')
    tlen = len(tpl)
    # Cap the error count so the accuracy never drops below zero
    acc = 1.0 - 1.0 * min(nErrors, tlen) / tlen
    return acc >= minAccuracy
from resources.genomes import decodeGenome

# NOTE(review): MIN_ACC is not referenced in the visible part of this file;
# presumably a minimum accuracy cutoff applied further down -- confirm.
MIN_ACC = 0.8

# Require at least three command-line arguments after the program name;
# otherwise print an error and the usage line, then exit.
# NOTE(review): a bare `raise SystemExit` exits with status 0 even though
# this is an error path -- confirm that is intended.
if len(sys.argv) < 4:
    print "ERROR:\tExpected at least 3 arguments but got {0}".format(
        len(sys.argv) - 1)
    print "Usage:\tloadingDiagnostic OUTPUT_PREFIX HG19.FASTA ALIGN_BAM [ALIGN_BAM ..]"
    raise SystemExit

# Positional arguments.
# NOTE(review): the usage line names three kinds of argument
# (OUTPUT_PREFIX, HG19.FASTA, ALIGN_BAM...), but four slots are read here
# (prefix, genome name, FASTA, then the inputs) -- confirm which is right.
outputPrefix = sys.argv[1]
genomeName = sys.argv[2]
indexedFasta = sys.argv[3]
inputFiles = sys.argv[4:]

# Alignment configuration shared by every cc.Align call in ScoreCas9Site
cfg = cc.AlignConfig(cc.AlignParams.Default(), 1)

# Named query sequences; ScoreCas9Site appends "NGG" to each one before
# aligning (presumably Cas9 guide RNAs keyed by target -- confirm).
GUIDES = {
    "FMR1": "AGAGGCCGAACTGGGATAAC",
    "FMR1_201": "CGCGCGTCTGTCTTTCGACC",
    "HTT": "AGCGGGCCCAAACTCACGGT",
    "HTT_SQ1": "CTTATTAACAGCAGAGAACT"
}


# NOTE(review): this definition appears to be cut off in this view (the loop
# body ends right after the alignment is computed); reproduced as-is.
def ScoreCas9Site(seq):
    maxKey = None
    maxAcc = None
    for key, rna in GUIDES.iteritems():
        query = rna + "NGG"
        aln = cc.Align(seq, query, cfg)
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Testable!

    Clipping has already been done!

    A POA consensus of the reads spanning the window serves as the
    initial template; all reads are then mapped onto it, loaded into an
    integrator and the template is polished.

    Returns an ArrowConsensus.  A no-call consensus (built according to
    `arrowConfig.noEvidenceConsensus`) is returned when the POA cannot be
    generated, when too few reads could be added, when polishing does not
    converge, or when polishing raises.

    Raises AssertionError if fewer than `arrowConfig.minPoaCoverage`
    reads span the window.
    """
    _, refStart, refEnd = refWindow

    # Compute the POA consensus, which is our initial guess, and
    # should typically be > 99.5% accurate
    fwdSequences = [a.read(orientation="genomic", aligned=False)
                    for a in alns
                    if a.spansReferenceRange(refStart, refEnd)]
    assert len(fwdSequences) >= arrowConfig.minPoaCoverage

    try:
        p = cc.PoaConsensus.FindConsensus(
            fwdSequences[:arrowConfig.maxPoaCoverage])
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        logging.info("%s: POA could not be generated" % (refWindow, ))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
    ga = cc.Align(refSequence, p.Sequence)
    poaCss = p.Sequence

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    mappedReads = [arrowConfig.extractMappedRead(aln, refStart)
                   for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [(lifted(queryPositions, mr), snr)
                   for (mr, snr) in mappedReads]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.MultiMolecularIntegrator(
        poaCss, cc.IntegratorConfig(arrowConfig.minZScore))
    coverage = 0
    for (mr, snr) in mappedReads:
        # Skip reads that cover fewer than two template or read bases
        if (mr.TemplateEnd <= mr.TemplateStart or
                mr.TemplateEnd - mr.TemplateStart < 2 or
                mr.Length() < 2):
            continue
        if ai.AddRead(mr, snr) == cc.AddReadResult_SUCCESS:
            coverage += 1

    # TODO(lhepler, dalexander): propagate coverage around somehow

    # Iterate until convergence
    try:
        # Explicit raises instead of asserts: asserts are stripped under
        # `python -O`, which would silently disable these checks.
        if coverage < arrowConfig.minPoaCoverage:
            raise RuntimeError(
                "Insufficient coverage (%d) to call consensus (%d)"
                % (coverage, arrowConfig.minPoaCoverage))
        _, converged = refineConsensus(ai, arrowConfig)
        if not converged:
            raise RuntimeError("Arrow did not converge to MLE")
        arrowCss = str(ai)
        if arrowConfig.computeConfidence:
            confidence = consensusConfidence(ai)
        else:
            confidence = np.zeros(shape=len(arrowCss), dtype=int)
        return ArrowConsensus(refWindow, arrowCss, confidence, ai)
    except Exception:
        # Any failure during polishing degrades to a logged no-call
        failure = ''.join(format_exception(*sys.exc_info()))
        logging.info("%s: %s" % (refWindow, failure))
        return ArrowConsensus.noCallConsensus(arrowConfig.noEvidenceConsensus,
                                              refWindow, refSequence)
def variantsFromAlignment(refWindow, refSeq, cssSeq,
                          cssQV=None, refCoverage=None):
    """
    Extract the variants implied by a pairwise alignment of cssSeq to
    refSeq reference.  If cssQV, refCoverage are provided, they will
    be used to decorate the variants with those attributes.

    Arguments:
      - cssQV: QV array, same length as css
      - refCoverage: coverage array, same length as reference window

    This is trickier than in the haploid case.  We have to break out
    diploid variants as single bases, in order to avoid implying
    phase.
    """
    refId, refStart, refEnd = refWindow

    aln = cc.AlignAffineIupac(refSeq, cssSeq)
    alnTarget = aln.Target()
    alnQuery = aln.Query()

    assert (cssQV is None) == (refCoverage is None)  # Both or none
    assert len(refSeq) == refEnd - refStart
    assert cssQV is None or len(cssSeq) == len(cssQV)
    assert refCoverage is None or len(refSeq) == len(refCoverage)

    # Columns where either side is "N" get the pseudo-operation "N"
    transcript = []
    for (op, tBase, qBase) in zip(aln.Transcript(), alnTarget, alnQuery):
        transcript.append("N" if (qBase == "N" or tBase == "N") else op)

    # Walk the alignment, grouping consecutive columns with the same
    # operation into runs.  A heterozygous query base always opens a new run.
    # NOTE(review): a run is only emitted when the *next* run starts, so a
    # variant run reaching the end of the alignment is never emitted --
    # confirm whether that is intended.
    variants = []
    runStart = -1
    runStartRefPos = None
    runOp = None
    refPos = refStart
    for col, (op, tBase, qBase) in enumerate(
            zip(transcript, alnTarget, alnQuery)):
        if op != runOp or isHeterozygote(qBase):
            if runStart >= 0 and runOp not in "MN":
                # Package up the run and dump a variant
                refAllele = alnTarget[runStart:col].replace("-", "")
                readAllele = alnQuery[runStart:col].replace("-", "")
                if isHeterozygote(readAllele):
                    allele1, allele2 = unpackIUPAC(readAllele)
                    variants.append(Variant(refId, runStartRefPos, refPos,
                                            refAllele, allele1, allele2))
                else:
                    variants.append(Variant(refId, runStartRefPos, refPos,
                                            refAllele, readAllele))
            runStart, runStartRefPos, runOp = col, refPos, op
        if tBase != "-":
            refPos += 1

    if cssQV is None:
        return variants

    # This might be better handled within the loop above, just keeping
    # track of Qpos, Tpos
    cssPosition = cc.TargetToQueryPositions(aln)
    lastCoverageIdx = len(refCoverage) - 1
    lastQvIdx = len(cssQV) - 1
    for v in variants:
        # HACK ALERT: we are not really handling the confidence or
        # coverage for variants at last position of the window
        # correctly here.
        offset = v.refStart - refStart
        v.coverage = refCoverage[min(offset, lastCoverageIdx)]
        v.confidence = cssQV[min(cssPosition[offset], lastQvIdx)]

    return variants
def consensusForAlignments(refWindow, refSequence, alns, arrowConfig,
                           draft=None, polish=True, alnsUsed=None):
    """
    Call consensus on this interval---without subdividing the interval
    further.

    Returns an ArrowConsensus object.

    Requires that clipping has already been done.

    If `draft` is provided, it will serve as the starting point for
    polishing.  If not, the POA will be used to generate a draft
    starting point.

    If `polish` is False, the arrow polishing procedure will not be
    used, and the draft consensus will be returned.

    `alnsUsed` is an output parameter; if not None, it should be an
    empty list on entry; on return from this function, the list will
    contain the alns objects that were actually used to compute the
    consensus (those not filtered out).
    """
    _, refStart, refEnd = refWindow

    if alnsUsed is not None:
        assert alnsUsed == []

    def noCall():
        # Fallback result used whenever no consensus can be called
        return ArrowConsensus.noCallConsensus(
            arrowConfig.noEvidenceConsensus, refWindow, refSequence)

    if draft is None:
        # Compute the POA consensus, which is our initial guess, and
        # should typically be > 99.5% accurate
        fwdSequences = [a.read(orientation="genomic", aligned=False)
                        for a in alns
                        if a.spansReferenceRange(refStart, refEnd)]
        assert len(fwdSequences) >= arrowConfig.minPoaCoverage

        try:
            p = poaConsensus(fwdSequences, arrowConfig)
        except Exception:
            logging.info("%s: POA could not be generated" % (refWindow, ))
            return noCall()
        draft = p.Sequence

    ga = cc.Align(refSequence, draft)

    # Extract reads into ConsensusCore2-compatible objects, and map them into the
    # coordinates relative to the POA consensus
    extracted = [arrowConfig.extractMappedRead(aln, refStart)
                 for aln in alns]
    queryPositions = cc.TargetToQueryPositions(ga)
    mappedReads = [lifted(queryPositions, mr) for mr in extracted]

    # Load the mapped reads into the mutation scorer, and iterate
    # until convergence.
    ai = cc.Integrator(draft, cc.IntegratorConfig(arrowConfig.minZScore))

    def snapshot():
        # Current consensus of the integrator together with its confidence
        css = str(ai)
        if arrowConfig.computeConfidence:
            qv = consensusConfidence(ai)
        else:
            qv = np.zeros(shape=len(css), dtype=int)
        return css, qv

    coverage = 0
    for i, mr in enumerate(mappedReads):
        # Skip reads covering fewer than two template or read bases
        if mr.TemplateEnd - mr.TemplateStart < 2 or mr.Length() < 2:
            continue

        if not sufficientlyAccurate(mr, draft, arrowConfig.minAccuracy):
            # Rebuild the template the read was compared to, for the log
            if mr.Strand == cc.StrandType_FORWARD:
                tpl = draft[mr.TemplateStart:mr.TemplateEnd]
            elif mr.Strand == cc.StrandType_REVERSE:
                tpl = reverseComplement(draft[mr.TemplateStart:mr.TemplateEnd])
            else:
                tpl = "INACTIVE/UNMAPPED"
            logging.debug(
                "%s: skipping read '%s' due to insufficient accuracy, (poa, read): ('%s', '%s')"
                % (refWindow, mr.Name, tpl, mr.Seq))
            continue

        if ai.AddRead(mr) != cc.State_VALID:
            continue
        coverage += 1
        if alnsUsed is not None:
            alnsUsed.append(alns[i])

    if coverage < arrowConfig.minPoaCoverage:
        logging.info("%s: Inadequate coverage to call consensus" %
                     (refWindow, ))
        return noCall()

    if not polish:
        # Draft is returned unpolished, with an all-zero confidence vector
        return ArrowConsensus(refWindow, draft,
                              np.zeros(len(draft), dtype=int), ai)

    # Iterate until convergence
    _, converged = refineConsensus(ai, arrowConfig, polishDiploid=False)
    if not converged:
        logging.info("%s: Arrow did not converge to MLE" % (refWindow, ))
        return noCall()
    arrowCss, confidence = snapshot()

    if arrowConfig.polishDiploid:
        # additional rounds of diploid polishing; on failure the haploid
        # consensus and confidence computed above are kept
        _, converged = refineConsensus(ai, arrowConfig, polishDiploid=True)
        if converged:
            arrowCss, confidence = snapshot()
        else:
            logging.info(
                "%s: Arrow (diploid) did not converge to optimal solution" %
                (refWindow, ))

    return ArrowConsensus(refWindow, arrowCss, confidence, ai)