def onChunk(self, referenceWindow):
    """Compute the per-site results for one reference window.

    Parameters
    ----------
    referenceWindow : tuple
        ``(reference, start, end)``.  ``start``/``end`` bound the template
        positions this call reports on; ``end`` is clipped to
        ``self.ipdModel.refLength(reference)``.

    Returns
    -------
    list
        Per-site result records sorted by their ``'tpl'`` entry.  When
        ``self.options.identify`` is set, only sites with
        ``start <= tpl < end`` are returned and sites matched by a decoded
        modification carry the modification's extra keys; sites hit by a
        modification's peak mask carry ``'offTargetPeak': True``.

    Raises
    ------
    Exception
        Whatever ``self._decodePositiveControl`` raises is re-raised after
        the traceback has been printed.
    """
    # Set up the object for a new window.
    self._prepForReferenceWindow(referenceWindow)

    # start and end delimit the part of the reference we are responsible for
    # reporting. Data may be pulled from a wider window for the decoder.
    (reference, start, end) = referenceWindow

    # Trim end coordinate to length of current template
    end = min(end, self.ipdModel.refLength(reference))

    if not self.options.identify:
        result = self._summarizeReferenceRegion(
            (start, end), self.options.methylFraction, self.options.identify)

        if self.options.useLDA and self.controlCmpH5 is None:
            # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
            lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, result)
            # Keep the enricher's return value, exactly as the identify branch
            # below does (it used to be bound to an unused name and dropped).
            result = lda.callEnricherFunction(result)

        result.sort(key=lambda x: x['tpl'])
        return result

    # Identification mode: summarize a slightly expanded window, decode the
    # modifications, then weave the calls back into the per-site results.
    # NOTE(review): padStart can be negative near the start of the reference;
    # assumes _summarizeReferenceRegion tolerates that -- confirm.
    padStart = start - self.pad
    padEnd = end + self.pad
    perSiteResults = self._summarizeReferenceRegion(
        (padStart, padEnd), self.options.methylFraction, self.options.identify)

    if self.options.useLDA:
        # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
        # BasicLdaEnricher is an alternative that does not use the positive
        # control model.
        lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, perSiteResults)
        perSiteResults = lda.callEnricherFunction(perSiteResults)

    try:
        # Other modes of 'extra analysis' (e.g. multi-site m5C detection via
        # self._multiSiteDetection) would be selected here.
        mods = self._decodePositiveControl(perSiteResults, (start, end))
    except Exception:
        # Report the failure and propagate it. Dropping into an interactive
        # debugger here would block on stdin, and continuing would only fail
        # later because `mods` was never bound.
        traceback.print_exc()
        raise

    finalCalls = []

    # Weave together results, one strand at a time.
    for strand in (0, 1):
        strandSign = 1 if strand == 0 else -1

        siteDict = dict((x['tpl'], x) for x in perSiteResults
                        if start <= x['tpl'] < end and x['strand'] == strand)
        modDict = dict((x['tpl'], x) for x in mods
                       if start <= x['tpl'] < end and x['strand'] == strand)

        for mod in modDict.values():
            tpl = mod['tpl']

            # Only convert to a positive control call if we actually have
            # enough coverage on the cognate base, i.e. a site record exists.
            if tpl not in siteDict:
                continue
            site = siteDict[tpl]

            # Methylated-fraction estimates overwrite whatever the site has.
            if self.options.methylFraction and FRAC in mod:
                site[FRAC] = mod[FRAC]
                site[FRAClow] = mod[FRAClow]
                site[FRACup] = mod[FRACup]

            # Copy every key of the mod record that the site does not have yet.
            for key, value in mod.items():
                site.setdefault(key, value)

            if 'Mask' in mod:
                # The decoder should supply the off-target peak mask.
                # The append is done in place on purpose: the same list may
                # just have been copied onto the site record above.
                mask = mod['Mask']
                mask.append(0)  # make sure we always mask the cognate position
            else:
                # If the decoder doesn't supply a mask - use a hard-coded version
                # FIXME - this branch is deprecated
                mask = ModificationPeakMask[mod['modification']]

            # Mask out neighbor peaks that may have been caused by this mod.
            for offset in mask:
                shadowPos = tpl + strandSign * offset
                if shadowPos in siteDict:
                    siteDict[shadowPos]['offTargetPeak'] = True

        finalCalls.extend(siteDict.values())

    # Sort by template position
    finalCalls.sort(key=lambda x: x['tpl'])
    return finalCalls
def onChunk(self, referenceWindow):
    """Compute the results for one reference window (or one molecule).

    Parameters
    ----------
    referenceWindow : tuple
        ``(reference, smId, start, end)`` when ``self.options.smBaseMod`` is
        set, otherwise ``(reference, start, end)``.  ``start``/``end`` bound
        the template positions this call reports on.

    Returns
    -------
    list or other
        With ``self.options.identify``: per-site records with
        ``start <= tpl < end`` (``end`` clipped to the reference length),
        sorted by ``'tpl'``, annotated with ``'modificationScore'``,
        ``'modification'`` and ``'offTargetPeak'`` where applicable.
        Otherwise: the summary of the target bounds, sorted by ``'tpl'``
        unless ``self.options.smBaseMod`` is set, in which case the value of
        ``self._summarizeMolecule`` is returned unsorted.

    Side effects: sets ``self.meanIpdFunc``, ``self.cognateBaseFunc``,
    ``self.refId`` and ``self.sequence`` for the current reference.
    """
    # Number of extra template positions summarized on each side of the
    # window in identification mode.
    decodePad = 8

    # start and end delimit the part of the reference we are responsible for
    # reporting. Data may be pulled from a wider window for the decoder.
    if self.options.smBaseMod:
        (reference, smId, start, end) = referenceWindow
    else:
        (reference, start, end) = referenceWindow

    # NOTE(review): captured BEFORE `end` is trimmed below, so the
    # non-identify path summarizes the untrimmed window -- confirm intended.
    targetBounds = (start, end)

    # Trim end coordinate to length of current template
    end = min(end, self.ipdModel.refLength(reference))

    # Each chunk is from a single reference -- fire up meanIpd func on the
    # current reference
    self.meanIpdFunc = self.ipdModel.predictIpdFunc(reference)

    # Get the cognate base at a given position
    self.cognateBaseFunc = self.ipdModel.cognateBaseFunc(reference)

    self.refId = reference
    self.sequence = self.ipdModel.getReferenceWindow(self.refId, 0, start, end)

    if not self.options.identify:
        if self.options.smBaseMod:
            result = self._summarizeMolecule(
                smId, targetBounds, self.options.methylFraction, self.options.identify)
        else:
            result = self._summarizeReferenceRegion(
                targetBounds, self.options.methylFraction, self.options.identify)

        if self.options.useLDA and self.controlCmpH5 is None:
            # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
            lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, result)
            # Keep the enricher's return value, exactly as the identify branch
            # below does (it used to be bound to an unused name and dropped).
            result = lda.callEnricherFunction(result)

        # Per-molecule summaries are returned as produced; only the
        # per-reference summary is sorted by template position.
        if not self.options.smBaseMod:
            result.sort(key=lambda x: x['tpl'])
        return result

    # Identification mode: summarize a slightly expanded window, decode the
    # modifications, then weave the calls back into the per-site results.
    padStart = start - decodePad
    padEnd = end + decodePad
    perSiteResults = self._summarizeReferenceRegion(
        (padStart, padEnd), self.options.methylFraction, self.options.identify)

    if self.options.useLDA:
        # FIXME: add on a column "Ca5C" containing LDA score for each C-residue site
        # BasicLdaEnricher is an alternative that does not use the positive
        # control model.
        lda = PositiveControlEnricher(self.ipdModel.gbmModel, self.sequence, perSiteResults)
        perSiteResults = lda.callEnricherFunction(perSiteResults)

    mods = self._decodePositiveControl(perSiteResults, (start, end))

    finalCalls = []

    # Weave together results, one strand at a time.
    for strand in (0, 1):
        strandSign = 1 if strand == 0 else -1

        siteDict = dict((x['tpl'], x) for x in perSiteResults
                        if start <= x['tpl'] < end and x['strand'] == strand)
        modDict = dict((x['tpl'], x) for x in mods
                       if start <= x['tpl'] < end and x['strand'] == strand)

        for mod in modDict.values():
            tpl = mod['tpl']

            # Only convert to a positive control call if we actually have
            # enough coverage on the cognate base, i.e. a site record exists.
            if tpl not in siteDict:
                continue
            site = siteDict[tpl]

            # Copy mod identification data
            site['modificationScore'] = mod['QMod']
            site['modification'] = mod['modification']

            if self.options.methylFraction and FRAC in mod:
                site[FRAC] = mod[FRAC]
                site[FRAClow] = mod[FRAClow]
                site[FRACup] = mod[FRACup]

            if 'Mask' in mod:
                # The decoder should supply the off-target peak mask. Work on
                # a copy so the decoder's list is not grown in place; offset 0
                # makes sure we always mask the cognate position.
                mask = list(mod['Mask']) + [0]
            else:
                # If the decoder doesn't supply a mask - use a hard-coded version
                # FIXME - this branch is deprecated
                mask = ModificationPeakMask[mod['modification']]

            # Mask out neighbor peaks that may have been caused by this mod.
            for offset in mask:
                shadowPos = tpl + strandSign * offset
                if shadowPos in siteDict:
                    siteDict[shadowPos]['offTargetPeak'] = True

        finalCalls.extend(siteDict.values())

    # Sort by template position
    finalCalls.sort(key=lambda x: x['tpl'])
    return finalCalls