def _computePositionTraditionalControl(self, caseObservations, controlObservations, methylFractionFlag, identifyFlag, testProcedure=_tTest):
    """Summarize the observed ipds at one template position/strand, using a case-control analysis.

    The ipd arrays are read from ``['data']['ipd']`` of both observation
    records; ``'strand'`` and ``'tpl'`` are taken from ``caseObservations``.
    The returned dict holds the reference ids, the (flipped) strand, the
    template position, the cognate base, coverage figures, mean / median /
    std of each sample, the trimmed-mean ipd ratio, and the statistic,
    p-value and ``round(-10 * log10(pvalue))`` score obtained from
    ``testProcedure(caseData, controlData)``.

    When ``methylFractionFlag`` is set, the floored p-value is below
    ``self.options.pvalue`` and ``identifyFlag`` is not set, the FRAC /
    FRAClow / FRACup entries are added: a bootstrap estimate if both
    coverages exceed ``self.options.methylMinCov``, NaN otherwise.
    """
    caseIpds = caseObservations['data']['ipd']
    controlIpds = controlObservations['data']['ipd']

    res = {}
    res['refId'] = self.refId

    # FASTA header name
    res['refName'] = self.refName

    # The reported strand is the complement of the observed one.
    strand = 1 - caseObservations['strand']
    tpl = caseObservations['tpl']
    res['strand'] = strand
    res['tpl'] = tpl
    res['base'] = self.cognateBaseFunc(tpl, strand)

    # need a coverage annotation
    nCase = caseIpds.size
    nControl = controlIpds.size
    res['coverage'] = int(round((nCase + nControl) / 2.0))
    res['caseCoverage'] = nCase
    res['controlCoverage'] = nControl

    # Plain summary statistics of each sample
    res['caseMean'] = caseIpds.mean().item()
    res['caseMedian'] = np.median(caseIpds).item()
    res['caseStd'] = np.std(caseIpds).item()
    res['controlMean'] = controlIpds.mean().item()
    res['controlMedian'] = np.median(controlIpds).item()
    res['controlStd'] = np.std(controlIpds).item()

    # Ratio of trimmed means; falls back to 1.0 when the control mean is
    # too close to zero to divide by.
    trimLimits = (0.001, 0.03)
    trimmedControlMean = mstats.trimmed_mean(controlIpds, trimLimits).item()
    if abs(trimmedControlMean) > 1e-3:
        trimmedCaseMean = mstats.trimmed_mean(caseIpds, trimLimits).item()
        res['ipdRatio'] = trimmedCaseMean / trimmedControlMean
    else:
        res['ipdRatio'] = 1.0

    outcome = testProcedure(caseIpds, controlIpds)
    res['testStatistic'] = outcome['testStatistic']
    res['pvalue'] = outcome['pvalue']

    # Floor the p-value so that log10 never sees zero.
    flooredPvalue = max(sys.float_info.min, res['pvalue'])
    res['score'] = round(-10.0 * math.log10(flooredPvalue))

    # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
    if not (methylFractionFlag and flooredPvalue < self.options.pvalue and not identifyFlag):
        return res

    if res['controlCoverage'] > self.options.methylMinCov and res['caseCoverage'] > self.options.methylMinCov:
        # Instantiate mixture estimation methods:
        mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                           self.ipdModel.gbmModel.pre,
                                           res,
                                           self.options.methylMinCov)
        estimate = mixture.detectionMixModelBootstrap(res['controlMean'], caseIpds)
        res[FRAC] = estimate[0]
        res[FRAClow] = estimate[1]
        res[FRACup] = estimate[2]
    else:
        # Not enough coverage on one of the samples for a fraction estimate.
        res[FRAC] = np.nan
        res[FRACup] = np.nan
        res[FRAClow] = np.nan

    return res
def _computePositionSyntheticControl(self, caseObservations, capValue, methylFractionFlag, identifyFlag, modelPrediction=None):
    """Summarize the observed ipds at one template position/strand, using the synthetic ipd model.

    Args:
        caseObservations: record providing ``['data']['ipd']`` (the ipd
            array), ``['strand']`` and ``['tpl']``.
        capValue: global cap value; the effective cap is never below it.
        methylFractionFlag: when true, and the floored p-value is below
            ``self.options.pvalue`` and ``identifyFlag`` is false, the
            FRAC / FRAClow / FRACup entries are added to the result.
        identifyFlag: when true, no fraction estimate is made here.
        modelPrediction: predicted ipd for this position; when None it is
            obtained from ``self.meanIpdFunc(tpl, strand)``.

    Returns:
        dict with the keys 'refId', 'refName', 'strand', 'tpl', 'coverage',
        'modelPrediction', 'base', 'rawData' (the uncapped ipds), 'tMean',
        'tErr', 'ipdRatio', 'modification', 'tStatistic', 'pvalue', 'score',
        and, under the condition above, FRAC / FRAClow / FRACup.
    """
    # Compute stats on the observed ipds
    d = caseObservations['data']['ipd']

    res = dict()

    # ref00000x name
    res['refId'] = self.refId

    # FASTA header name
    res['refName'] = self.refName

    # NOTE -- this is where the strand flipping occurs -- make sure to reproduce this in the all calling methods
    strand = res['strand'] = 1 - caseObservations['strand']
    tpl = res['tpl'] = caseObservations['tpl']

    res['coverage'] = d.size

    # Compute the predicted IPD from the model
    # NOTE! The ipd model is in the observed read strand
    if modelPrediction is None:
        modelPrediction = self.meanIpdFunc(tpl, strand).item()

    res['modelPrediction'] = modelPrediction

    res['base'] = self.cognateBaseFunc(tpl, strand)

    # Store in case of methylated fraction estimation:
    res['rawData'] = d

    # Try a hybrid capping approach -- cap at the higher of
    #  - 4x the model prediction
    #  - 90th percentile of the local data (at low coverage we pick a lower
    #    percentile to ensure we trim the highest datapoint)
    #  - global cap value
    if d.size == 1:
        # The low-coverage formula divides by (size - 1); with a single
        # observation use the 0th percentile, i.e. that observation itself.
        percentile = 0.0
    else:
        percentile = min(90, (1.0 - 1.0 / (d.size - 1)) * 100)
    localPercentile = np.percentile(d, percentile)
    capValue = max(capValue, 4.0 * modelPrediction, localPercentile)

    # Cap into a new array: res['rawData'] keeps the uncapped values, while
    # everything below (including the fraction estimator) sees capped ipds.
    d = np.minimum(d, capValue)

    # Trimmed stats
    res['tMean'] = d.mean().item()
    res['tErr'] = np.std(d).item() / sqrt(d.size)

    res['ipdRatio'] = res['tMean'] / res['modelPrediction']

    # Don't know the modification yet
    res["modification"] = "."

    # use ttest-based pvalue
    res['tStatistic'] = self.computeObservationTstatistic(res)
    res['pvalue'] = self.computeObservationPValueTTest(res)

    # Floor the p-value so that log10 never sees zero.
    pvalue = max(sys.float_info.min, res['pvalue'])
    res['score'] = round(-10.0 * math.log10(pvalue))

    # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
    if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
        if res['coverage'] > self.options.methylMinCov:
            # NOTE(review): this re-queries the model, so a caller-supplied
            # modelPrediction is not used for the fraction estimate --
            # confirm that this is intended.
            modelPrediction = self.meanIpdFunc(tpl, strand).item()

            # Instantiate mixture estimation methods:
            mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                               self.ipdModel.gbmModel.pre,
                                               res,
                                               self.options.methylMinCov)
            x = mixture.detectionMixModelBootstrap(modelPrediction, d)

            res[FRAC] = x[0]
            res[FRAClow] = x[1]
            res[FRACup] = x[2]
        else:
            # Coverage too low for a fraction estimate.
            res[FRAC] = np.nan
            res[FRACup] = np.nan
            res[FRAClow] = np.nan

    return res
def _computePositionTraditionalControl(self, caseObservations, controlObservations, capValue, controlCapValue, methylFractionFlag, identifyFlag, testProcedure=_tTest):
    """Summarize the observed ipds at one template position/strand, using a case-control analysis.

    Args:
        caseObservations: record providing ``['data']['ipd']`` (the case ipd
            array), ``['strand']`` and ``['tpl']``.
        controlObservations: record providing ``['data']['ipd']`` (the
            control ipd array).
        capValue: global cap for the case ipds.
        controlCapValue: global cap for the control ipds.
        methylFractionFlag: when true, and the floored p-value is below
            ``self.options.pvalue`` and ``identifyFlag`` is false, the
            FRAC / FRAClow / FRACup entries are added to the result.
        identifyFlag: when true, no fraction estimate is made here.
        testProcedure: callable taking (caseData, controlData) and returning
            a mapping with 'testStatistic' and 'pvalue'.

    Returns:
        dict with the keys 'refId', 'refName', 'strand', 'tpl', 'base',
        'coverage', 'caseCoverage', 'controlCoverage', 'caseMean',
        'caseMedian', 'caseStd', 'controlMean', 'controlMedian',
        'controlStd', 'ipdRatio', 'testStatistic', 'pvalue', 'score', and,
        under the condition above, FRAC / FRAClow / FRACup.  All statistics
        are computed on the capped ipds.
    """

    def capIpds(ipds, globalCap):
        # Cap at the highest of: the global cap, 4x the sample median, and a
        # local percentile (90th, lower at low coverage).
        if ipds.size == 1:
            # The low-coverage formula divides by (size - 1); with a single
            # observation use the 0th percentile, i.e. that observation.
            percentile = 0.0
        else:
            percentile = min(90, (1.0 - 1.0 / (ipds.size - 1)) * 100)
        localPercentile = np.percentile(ipds, percentile)
        cap = max(globalCap, 4.0 * np.median(ipds).item(), localPercentile)
        return np.minimum(ipds, cap)

    # Compute stats on the observed ipds
    caseData = caseObservations['data']['ipd']
    controlData = controlObservations['data']['ipd']

    # cap both the native and control data, more or less as it is done in computePositionSyntheticControl:
    caseData = capIpds(caseData, capValue)
    controlData = capIpds(controlData, controlCapValue)

    res = dict()
    res['refId'] = self.refId

    # FASTA header name
    res['refName'] = self.refName

    # The reported strand is the complement of the observed one.
    strand = res['strand'] = 1 - caseObservations['strand']
    tpl = res['tpl'] = caseObservations['tpl']
    res['base'] = self.cognateBaseFunc(tpl, strand)

    # need a coverage annotation
    res['coverage'] = int(round((caseData.size + controlData.size) / 2.0))

    res['caseCoverage'] = caseData.size
    res['controlCoverage'] = controlData.size

    res['caseMean'] = caseData.mean().item()
    res['caseMedian'] = np.median(caseData).item()
    res['caseStd'] = np.std(caseData).item()

    res['controlMean'] = controlData.mean().item()
    res['controlMedian'] = np.median(controlData).item()
    res['controlStd'] = np.std(controlData).item()

    # Ratio of trimmed means; falls back to 1.0 when the control mean is
    # too close to zero to divide by.
    trim = (0.001, 0.03)
    ctrlMean = mstats.trimmed_mean(controlData, trim).item()
    if abs(ctrlMean) > 1e-3:
        res['ipdRatio'] = (mstats.trimmed_mean(caseData, trim).item() / ctrlMean)
    else:
        res['ipdRatio'] = 1.0

    testResults = testProcedure(caseData, controlData)
    res['testStatistic'] = testResults['testStatistic']
    res['pvalue'] = testResults['pvalue']

    # Floor the p-value so that log10 never sees zero.
    pvalue = max(sys.float_info.min, res['pvalue'])
    res['score'] = round(-10.0 * math.log10(pvalue))

    # If the methylFractionFlag is set, then estimate fraction using just modelPrediction in the detection case.
    if methylFractionFlag and pvalue < self.options.pvalue and not identifyFlag:
        if res['controlCoverage'] > self.options.methylMinCov and res['caseCoverage'] > self.options.methylMinCov:
            # Instantiate mixture estimation methods:
            mixture = MixtureEstimationMethods(self.ipdModel.gbmModel.post,
                                               self.ipdModel.gbmModel.pre,
                                               res,
                                               self.options.methylMinCov)
            x = mixture.detectionMixModelBootstrap(res['controlMean'], caseData)
            res[FRAC] = x[0]
            res[FRAClow] = x[1]
            res[FRACup] = x[2]
        else:
            # Not enough coverage on one of the samples for a fraction estimate.
            res[FRAC] = np.nan
            res[FRACup] = np.nan
            res[FRAClow] = np.nan

    return res
def scoreMods(self, modCalls):
    """
    For each modification in the best scoring configuration, score a config excluding the current mod against the winning config
    use this value as the Qmod for the deleted modification

    Args:
        modCalls: mapping of template position -> modified-base code.  Each
            code is written into the working sequence and is also used as a
            key into ``canonicalBaseMap`` and ``modNames``.

    Returns:
        dict mapping each position to a dict with 'modification', 'QMod',
        'LLR' and 'Mask'; FRAC / FRAClow / FRACup are added when
        ``self.methylFractionFlag`` is set and ``self.rawKinetics`` has an
        entry for the position whose "coverage" exceeds ``self.methylMinCov``.
    """
    qvModCalls = dict()

    # NOTE(review): the 'c' type code and fromstring() exist only in the
    # Python 2 array module; left untouched because the scoring helpers
    # receive this array -- revisit together with them when porting.
    modSeq = a.array('c')
    modSeq.fromstring(self.sequence)

    # Apply the found modifications to the raw sequence
    for (pos, call) in modCalls.items():
        modSeq[pos] = call

    for (pos, call) in modCalls.items():
        regionStart = pos - self.post
        regionEnd = pos + self.pre

        # Whether a methylated-fraction estimate is wanted for this position.
        # Evaluated once per position; assumes the scoring helpers called
        # below do not modify self.rawKinetics -- TODO confirm.
        estimateFraction = bool(
            self.methylFractionFlag
            and pos in self.rawKinetics
            and self.rawKinetics[pos]["coverage"] > self.methylMinCov)

        # Score the modified template at all positions affected by this mod
        modScore = self.scoreRegion(regionStart, regionEnd, modSeq)
        modScores = self.getRegionScores(regionStart, regionEnd, modSeq)

        if estimateFraction:
            modifiedMeanVectors = self.getContextMeans(regionStart, regionEnd, modSeq)

        # Switch back to the unmodified base and re-score
        modSeq[pos] = canonicalBaseMap[call]
        noModScore = self.scoreRegion(regionStart, regionEnd, modSeq)
        noModScores = self.getRegionScores(regionStart, regionEnd, modSeq)

        if estimateFraction:
            unModifiedMeanVectors = self.getContextMeans(regionStart, regionEnd, modSeq)

        # Put back the modified base
        modSeq[pos] = call

        # Compute score difference
        llr = modScore - noModScore

        # Convert from LLR to phred-scaled probability of modification
        qModScore = 10 * llr * log10e + 10 * log1p(exp(-llr)) * log10e

        # Figure out which secondary peaks were likely generated by this modification
        # What is the posterior that the peak was generated by this mod?
        maskPos = self.findMaskPositions(pos, modScores, noModScores)

        # FIXME: the identificationQv score is currently too low for many Ca5C sites

        if estimateFraction:
            # Instantiate mixture estimation methods:
            mixture = MixtureEstimationMethods(self.gbmModel.post,
                                               self.gbmModel.pre,
                                               self.rawKinetics,
                                               self.methylMinCov)

            # Use modifiedMeanVectors and unmodifiedMeanVectors to calculate mixing proportion, and 95% CI limits.
            methylFracEst, methylFracLow, methylFracUpp = mixture.estimateMethylatedFractions(
                pos, unModifiedMeanVectors, modifiedMeanVectors,
                ModificationPeakMask[modNames[call]])

            qvModCalls[pos] = {
                'modification': modNames[call],
                'QMod': qModScore,
                'LLR': llr,
                'Mask': maskPos,
                FRAC: methylFracEst,
                FRAClow: methylFracLow,
                FRACup: methylFracUpp,
            }
        else:
            # Store the full results
            qvModCalls[pos] = {
                'modification': modNames[call],
                'QMod': qModScore,
                'LLR': llr,
                'Mask': maskPos,
            }

    return qvModCalls
def scoreMods(self, modCalls):
    """
    For each modification in the best scoring configuration, score a config excluding the current mod against the winning config
    use this value as the Qmod for the deleted modification

    Args:
        modCalls: mapping of template position -> modified-base code.  Each
            code is written into the working sequence and is also used as a
            key into ``canonicalBaseMap`` and ``modNames``.

    Returns:
        dict mapping each position to a dict with 'modification', 'QMod',
        'LLR' and 'Mask'; FRAC / FRAClow / FRACup are added when
        ``self.methylFractionFlag`` is set and ``self.rawKinetics`` has an
        entry for the position whose "coverage" exceeds ``self.methylMinCov``.
    """

    def fractionWanted(position):
        # True when a methylated-fraction estimate should be made here:
        # the feature is on, raw kinetics exist for the position, and its
        # coverage is above the configured minimum.
        return (self.methylFractionFlag
                and position in self.rawKinetics
                and self.rawKinetics[position]["coverage"] > self.methylMinCov)

    qvModCalls = dict()

    # NOTE(review): the 'c' type code and fromstring() exist only in the
    # Python 2 array module; left untouched because the scoring helpers
    # receive this array -- revisit together with them when porting.
    modSeq = a.array('c')
    modSeq.fromstring(self.sequence)

    # Apply the found modifications to the raw sequence
    for (pos, call) in modCalls.items():
        modSeq[pos] = call

    for (pos, call) in modCalls.items():

        # Score the modified template at all positions affected by this mod
        modScore = self.scoreRegion(pos - self.post, pos + self.pre, modSeq)
        modScores = self.getRegionScores(pos - self.post, pos + self.pre, modSeq)

        if fractionWanted(pos):
            modifiedMeanVectors = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)

        # Switch back to the unmodified base and re-score
        modSeq[pos] = canonicalBaseMap[call]
        noModScore = self.scoreRegion(pos - self.post, pos + self.pre, modSeq)
        noModScores = self.getRegionScores(pos - self.post, pos + self.pre, modSeq)

        if fractionWanted(pos):
            unModifiedMeanVectors = self.getContextMeans(pos - self.post, pos + self.pre, modSeq)

        # Put back the modified base
        modSeq[pos] = call

        # Compute score difference
        llr = modScore - noModScore

        # Convert from LLR to phred-scaled probability of modification
        qModScore = 10 * llr * log10e + 10 * log1p(exp(-llr)) * log10e

        # Figure out which secondary peaks were likely generated by this modification
        # What is the posterior that the peak was generated by this mod?
        maskPos = self.findMaskPositions(pos, modScores, noModScores)

        # FIXME: the identificationQv score is currently too low for many Ca5C sites

        # Store the full results; the fraction entries are added below
        # only when an estimate is made.
        entry = {'modification': modNames[call],
                 'QMod': qModScore,
                 'LLR': llr,
                 'Mask': maskPos}

        if fractionWanted(pos):
            # Instantiate mixture estimation methods:
            mixture = MixtureEstimationMethods(self.gbmModel.post, self.gbmModel.pre, self.rawKinetics, self.methylMinCov)

            # Use modifiedMeanVectors and unmodifiedMeanVectors to calculate mixing proportion, and 95% CI limits.
            methylFracEst, methylFracLow, methylFracUpp = mixture.estimateMethylatedFractions(
                pos, unModifiedMeanVectors, modifiedMeanVectors, ModificationPeakMask[modNames[call]])

            entry[FRAC] = methylFracEst
            entry[FRAClow] = methylFracLow
            entry[FRACup] = methylFracUpp

        qvModCalls[pos] = entry

    return qvModCalls