def saveRead(self, readToSave):
    '''
    Write a read to the output file and count it in the per-choice statistics.
    @param readToSave - the read that should be added to the output; None is ignored
    '''
    if readToSave is None:
        return

    #save the read
    self.outputFile.write(readToSave)

    choice = MergeImprove.getTag(readToSave, MergeImprove.CHOICE_TYPE_TAG)
    parent = MergeImprove.getTag(readToSave, MergeImprove.PARENT_OF_ORIGIN_TAG)

    #TODO: remove this print error
    if parent is None or choice is None:
        print('ERROR:' + str(readToSave))
    elif choice in self.statistics and parent < len(self.statistics[choice]):
        #count the read under its own choice type and under the running total
        self.statistics[choice][parent] += 1
        self.statistics['tot'][parent] += 1
    else:
        print('Poorly structured tag:')
        print(self.statistics)
        print(readToSave)

    if MergeImprove.verbosity:
        #parent is used as a list index above (and may be None), so it has to be
        #converted explicitly before it can be concatenated into the message
        logger.info('Saving from ' + str(parent) + ': ' + str(readToSave))
def saveRead(self, readToSave):
    '''
    Append a read to the output file and update the statistics tables.
    @param readToSave - the read to write out; when it is None nothing is written or counted
    '''
    if readToSave is None:
        return

    #save the read
    self.outputFile.write(readToSave)

    choice = MergeImprove.getTag(readToSave, MergeImprove.CHOICE_TYPE_TAG)
    parent = MergeImprove.getTag(readToSave, MergeImprove.PARENT_OF_ORIGIN_TAG)
    tagsPresent = parent is not None and choice is not None

    #TODO: remove this print error
    if not tagsPresent:
        print('ERROR:' + str(readToSave))
    elif choice in self.statistics and parent < len(self.statistics[choice]):
        #one count for the specific choice type, one for the overall total
        for key in (choice, 'tot'):
            self.statistics[key][parent] += 1
    else:
        print('Poorly structured tag:')
        print(self.statistics)
        print(readToSave)

    if MergeImprove.verbosity:
        #the parent tag doubles as a list index (or is None when missing), so
        #str() is required here to avoid a TypeError on concatenation
        logger.info('Saving from ' + str(parent) + ': ' + str(readToSave))
def saveRandomPair(self, possiblePairs):
    '''
    Save one randomly chosen pair, or every pair, from a list of candidates.
    A single candidate is saved as-is; with several candidates the choice tag is set to
    'R' (one random pair kept) or 'K' (all pairs kept) depending on self.isRandomFilter.
    @param possiblePairs - a list of possible pairs that could be saved, each indexed as pair[0] and pair[1]
    '''
    if not possiblePairs:
        #nothing to choose from; previously this raised an IndexError
        return

    if len(possiblePairs) == 1:
        #only one candidate, so save it without touching its choice tag
        self.saveRead(possiblePairs[0][0])
        self.saveRead(possiblePairs[0][1])
    elif self.isRandomFilter:
        #pick a random pair
        chosen = random.choice(possiblePairs)

        #random choice
        MergeImprove.setTag(chosen[0], MergeImprove.CHOICE_TYPE_TAG, 'R')
        MergeImprove.setTag(chosen[1], MergeImprove.CHOICE_TYPE_TAG, 'R')

        #save the pair
        self.saveRead(chosen[0])
        self.saveRead(chosen[1])
    else:
        #keep every candidate
        for pair in possiblePairs:
            MergeImprove.setTag(pair[0], MergeImprove.CHOICE_TYPE_TAG, 'K')
            MergeImprove.setTag(pair[1], MergeImprove.CHOICE_TYPE_TAG, 'K')
            self.saveRead(pair[0])
            self.saveRead(pair[1])
def saveRandomPair(self, possiblePairs):
    '''
    Save pairs out of a list of equally good candidates.
    With more than one candidate, either a single random pair is saved with choice tag 'R'
    (self.isRandomFilter set) or all of them are saved with choice tag 'K'.
    A lone candidate is saved with its tags untouched.
    @param possiblePairs - a list of possible pairs that could be saved; an empty list saves nothing
    '''
    numPairs = len(possiblePairs)
    if numPairs == 0:
        #guard against indexing into an empty list below
        return

    if numPairs > 1:
        if self.isRandomFilter:
            #pick a random pair
            rv = random.randint(0, numPairs - 1)
            selected = [possiblePairs[rv]]
            choiceType = 'R'
        else:
            selected = possiblePairs
            choiceType = 'K'

        for pair in selected:
            MergeImprove.setTag(pair[0], MergeImprove.CHOICE_TYPE_TAG, choiceType)
            MergeImprove.setTag(pair[1], MergeImprove.CHOICE_TYPE_TAG, choiceType)

            #save the pair
            self.saveRead(pair[0])
            self.saveRead(pair[1])
    else:
        #save the pair
        self.saveRead(possiblePairs[0][0])
        self.saveRead(possiblePairs[0][1])
def saveRandomSingle(self, possibleSingles):
    '''
    Save one randomly chosen read, or every read, from a list of candidates.
    A single candidate is saved as-is; with several candidates the choice tag is set to
    'R' (one random read kept) or 'K' (all reads kept) depending on self.isRandomFilter.
    @param possibleSingles - a list of possible singles that could be saved
    '''
    if not possibleSingles:
        #nothing to choose from; previously this raised an IndexError
        return

    if len(possibleSingles) == 1:
        #only one candidate, so save it without touching its choice tag
        self.saveRead(possibleSingles[0])
    elif self.isRandomFilter:
        #pick a random single and save it
        chosen = random.choice(possibleSingles)
        MergeImprove.setTag(chosen, MergeImprove.CHOICE_TYPE_TAG, 'R')
        self.saveRead(chosen)
    else:
        #keep every candidate
        for single in possibleSingles:
            MergeImprove.setTag(single, MergeImprove.CHOICE_TYPE_TAG, 'K')
            self.saveRead(single)
def saveRandomSingle(self, possibleSingles):
    '''
    Save reads out of a list of equally good unpaired candidates.
    With more than one candidate, either a single random read is saved with choice tag 'R'
    (self.isRandomFilter set) or all of them are saved with choice tag 'K'.
    A lone candidate is saved with its tags untouched.
    @param possibleSingles - a list of possible singles that could be saved; an empty list saves nothing
    '''
    numSingles = len(possibleSingles)
    if numSingles == 0:
        #guard against indexing into an empty list below
        return

    if numSingles > 1:
        if self.isRandomFilter:
            #pick a random single and save it
            rv = random.randint(0, numSingles - 1)
            selected = [possibleSingles[rv]]
            choiceType = 'R'
        else:
            selected = possibleSingles
            choiceType = 'K'

        for single in selected:
            MergeImprove.setTag(single, MergeImprove.CHOICE_TYPE_TAG, choiceType)
            self.saveRead(single)
    else:
        self.saveRead(possibleSingles[0])
def __init__(self, sharedDict, resultsQueue, outputFilename, isRandomFilter, outHeader, workerID, keepAll, numInputs):
    '''
    Set up a worker: logger, shared state, statistics counters and temporary output filenames.
    @param sharedDict - a dictionary full of multiprocessing.Arrays
    @param resultsQueue - the shared queue for storing statistics on the alignments from each worker
    @param outputFilename - the overall output filename where the final merge will be, worker won't use this unless a single process scenario
    @param isRandomFilter - boolean determining whether we random or union at the end
    @param outHeader - the header to use in any output files
    @param workerID - integer ID, 0 means master
    @param keepAll - stored on the worker as self.keepAll
    @param numInputs - every statistics list is given 2**numInputs counters
    '''
    global logger
    logger = MergeImprove.getLogger()

    #this first
    multiprocessing.Process.__init__(self)

    #save the inputs from the init
    self.sharedDict = sharedDict
    self.resultsQueue = resultsQueue
    self.baseOutputFN = outputFilename
    self.outHeader = outHeader
    self.isRandomFilter = isRandomFilter
    self.workerID = workerID
    self.keepAll = keepAll

    #always get these stats: one list of counters per choice type plus the 'tot' total,
    #each list holding 2**numInputs zeroed slots
    counterCount = 2**numInputs
    choiceTypes = ('K', 'U', 'Q', 'P', 'R', 'tot')
    self.statistics = dict((ct, [0] * counterCount) for ct in choiceTypes)

    #101 counters, indices 0 through 100
    self.percentageChoice = [0] * 101

    #both temporary files share the same per-worker prefix
    workerPrefix = outputFilename + '.tmp' + str(workerID) + '.bam'
    self.pileupTempFN = workerPrefix + '.pileup_tmp.bam'
    self.outputFN = workerPrefix + '.pileup_complete.bam'
def __init__(self, sharedDict, resultsQueue, outputFilename, isRandomFilter, outHeader, workerID, keepAll, numInputs):
    '''
    Initialise the worker process and its bookkeeping.
    @param sharedDict - a dictionary full of multiprocessing.Arrays
    @param resultsQueue - the shared queue for storing statistics on the alignments from each worker
    @param outputFilename - the overall output filename where the final merge will be, worker won't use this unless a single process scenario
    @param isRandomFilter - boolean determining whether we random or union at the end
    @param outHeader - the header to use in any output files
    @param workerID - integer ID, 0 means master
    @param keepAll - kept on the instance as self.keepAll
    @param numInputs - sizes each statistics list to 2**numInputs entries
    '''
    global logger
    logger = MergeImprove.getLogger()

    #this first
    multiprocessing.Process.__init__(self)

    #my vars
    self.sharedDict = sharedDict

    #always get these stats; each choice type (and the 'tot' total) maps to a
    #zero-filled list of counters
    maxPO = 2**numInputs
    self.statistics = {}
    for choiceType in ['K', 'U', 'Q', 'P', 'R', 'tot']:
        self.statistics[choiceType] = [0] * maxPO

    self.percentageChoice = [0 for _ in range(101)]

    #save the inputs from the init
    self.resultsQueue = resultsQueue
    self.baseOutputFN = outputFilename
    self.outHeader = outHeader
    self.isRandomFilter = isRandomFilter

    #per-worker temporary filenames derived from the overall output filename
    workerTag = str(workerID)
    self.pileupTempFN = ''.join([outputFilename, '.tmp', workerTag, '.bam', '.pileup_tmp.bam'])
    self.outputFN = ''.join([outputFilename, '.tmp', workerTag, '.bam', '.pileup_complete.bam'])

    self.workerID = workerID
    self.keepAll = keepAll
def handlePostPileupMerge(self, reads):
    '''
    Compare a group of alignments by average pileup and save the best one(s).
    Paired reads are scored per pair; when there are no pairs, unpaired reads are scored
    separately for each end. Ties are passed on to saveRandomPair / saveRandomSingle.
    @param reads - a set of reads with the same name to be compared using pileup
    '''
    pairs, singles = MergeImprove.pairReads(reads, MergeImprove.PILEUP_HI_TAG)

    if len(pairs) != 0:
        scoreTotal = 0
        topScore = -1
        topPairs = []

        for pair in pairs:
            tot1, bases1 = self.calcPileupStats(pair[0])
            tot2, bases2 = self.calcPileupStats(pair[1])

            score = float(tot1 + tot2) / (bases1 + bases2)
            scoreTotal += score

            if score > topScore:
                #strictly better: start a new list of winners
                topScore = score
                topPairs = [pair]
            elif score == topScore:
                #tie with the current best
                topPairs.append(pair)

        #stats
        if len(topPairs) == 1:
            winner = topPairs[0]
            MergeImprove.setTag(winner[0], MergeImprove.CHOICE_TYPE_TAG, 'P')
            MergeImprove.setTag(winner[1], MergeImprove.CHOICE_TYPE_TAG, 'P')

            #save pileup stats
            #NOTE(review): this bookkeeping is assumed to apply only to a unique winner - confirm
            if topScore == 0:
                bucket = 0
            else:
                bucket = int(100 * topScore / scoreTotal)
            self.percentageChoice[bucket] += 2

        #save one of the best pileup pairs
        self.saveRandomPair(topPairs)
        return

    #do this over singles, tracking each end separately
    topScore = {}
    topReads = {}
    scoreTotal = {False: 0, True: 0}

    for read in singles:
        isFirst = MergeImprove.isFlagSet(read.flag, MergeImprove.FIRST_SEGMENT_FLAG)

        #if there's nothing yet for this end, set its best as -1 so it gets overwritten below
        if isFirst not in topScore:
            topScore[isFirst] = -1
            topReads[isFirst] = []

        #get the pileup calculation
        tot, bases = self.calcPileupStats(read)
        score = float(tot) / bases
        scoreTotal[isFirst] += score

        #if it's better, keep it
        if score > topScore[isFirst]:
            topScore[isFirst] = score
            topReads[isFirst] = [read]
        elif score == topScore[isFirst]:
            topReads[isFirst].append(read)

    #save the best from each end
    for end in topReads:
        winners = topReads[end]

        if len(winners) == 1:
            MergeImprove.setTag(winners[0], MergeImprove.CHOICE_TYPE_TAG, 'P')

            #NOTE(review): this bookkeeping is assumed to apply only to a unique winner - confirm
            if topScore[end] == 0:
                bucket = 0
            else:
                bucket = int(100 * topScore[end] / scoreTotal[end])
            self.percentageChoice[bucket] += 1

        self.saveRandomSingle(winners)
def handlePostPileupMerge(self, reads):
    '''
    Decide which of several same-named alignments to keep, using average pileup as the score.
    If any pairs are found the decision is made over pairs; otherwise it is made over the
    unpaired reads, independently for each end.
    @param reads - a set of reads with the same name to be compared using pileup
    '''
    grouped = MergeImprove.pairReads(reads, MergeImprove.PILEUP_HI_TAG)
    [pairs, singles] = grouped

    if len(pairs) == 0:
        #do this over singles
        highest = {}
        leaders = {}
        pileupSum = {False: 0, True: 0}

        for read in singles:
            isFirst = MergeImprove.isFlagSet(read.flag, MergeImprove.FIRST_SEGMENT_FLAG)

            #a fresh end starts at -1 so that the first read always replaces it
            highest.setdefault(isFirst, -1)
            leaders.setdefault(isFirst, [])

            #get the pileup calculation
            [tot, bases] = self.calcPileupStats(read)
            avgPileup = float(tot) / bases
            pileupSum[isFirst] += avgPileup

            #a strictly better read clears the leaders, then equal reads join them
            if avgPileup > highest[isFirst]:
                highest[isFirst] = avgPileup
                leaders[isFirst] = []
            if avgPileup == highest[isFirst]:
                leaders[isFirst].append(read)

        #save the best from each end
        for end in leaders:
            endLeaders = leaders[end]
            if len(endLeaders) == 1:
                MergeImprove.setTag(endLeaders[0], MergeImprove.CHOICE_TYPE_TAG, 'P')

                #NOTE(review): stats are assumed to be recorded only for a unique leader - confirm
                slot = 0 if highest[end] == 0 else int(100 * highest[end] / pileupSum[end])
                self.percentageChoice[slot] += 1

            self.saveRandomSingle(endLeaders)
    else:
        pileupSum = 0
        highest = -1
        leaders = []

        for pair in pairs:
            [totA, basesA] = self.calcPileupStats(pair[0])
            [totB, basesB] = self.calcPileupStats(pair[1])

            avgPileup = float(totA + totB) / (basesA + basesB)
            pileupSum += avgPileup

            #a strictly better pair clears the leaders, then equal pairs join them
            if avgPileup > highest:
                highest = avgPileup
                leaders = []
            if avgPileup == highest:
                leaders.append(pair)

        #stats
        if len(leaders) == 1:
            for mate in (leaders[0][0], leaders[0][1]):
                MergeImprove.setTag(mate, MergeImprove.CHOICE_TYPE_TAG, 'P')

            #save pileup stats
            #NOTE(review): stats are assumed to be recorded only for a unique leader - confirm
            slot = 0 if highest == 0 else int(100 * highest / pileupSum)
            self.percentageChoice[slot] += 2

        #save one of the best pileup pairs
        self.saveRandomPair(leaders)