def removeOutliers(self, binFile, outlierFile, outputFile):
    """Remove sequences specified as outliers in the provided file.

    The outlier file is read as a tab-separated table whose first line is
    a header (skipped). In every remaining line the first column is a bin
    id and the second column is a sequence id. Sequence ids whose bin id
    equals the id derived from `binFile` are removed from the bin, and the
    resulting bin is written to `outputFile`.

    Parameters
    ----------
    binFile : str
        FASTA file of the bin to modify; its id is obtained with
        binIdFromFilename().
    outlierFile : str
        Tab-separated file listing outlier sequences (bin id, sequence id,
        ...).
    outputFile : str
        Path the modified bin is written to with writeFasta().
    """
    binSeqs = readFasta(binFile)
    binIdToModify = binIdFromFilename(binFile)

    # get sequences to remove
    checkFileExists(outlierFile)
    seqsToRemove = []
    # the context manager guarantees the handle is closed, even on error
    with open(outlierFile) as fin:
        next(fin, None)  # skip header line (no-op for an empty file)
        for line in fin:
            # strip the line terminator so a sequence id sitting in the
            # last column does not carry a trailing newline
            lineSplit = line.rstrip('\r\n').split('\t')
            if lineSplit[0] == binIdToModify:
                seqsToRemove.append(lineSplit[1])

    # remove sequences from bin
    if seqsToRemove:
        self.__removeSeqs(binSeqs, seqsToRemove)

    # save modified bin
    writeFasta(binSeqs, outputFile)
def run(self, coverageFile, outFile, bTabTable):
    """Write a community profile table computed from a coverage file.

    The coverage file is read as a tab-separated table whose first line is
    a header (skipped). For every remaining line, column 1 (0-based) is the
    bin id, column 2 the sequence length, and from column 3 onwards the
    fields come in groups of three: the first field of a group is a BAM id
    and the third is the number of reads mapped to the sequence (the
    second field is not used here).

    For each bin and BAM id the table reports:
      * mapped reads
      * % mapped reads: reads mapped to the bin over all reads for the BAM
      * % binned populations: mapped-read fraction divided by bin size,
        normalised over all bins except DefaultValues.UNBINNED
      * % community: the previous value scaled by (1 - unbinned fraction)

    Parameters
    ----------
    coverageFile : str
        Path to the tab-separated coverage file.
    outFile : str
        Passed to reassignStdOut()/restoreStdOut(); presumably the file
        the table is redirected to -- confirm against those helpers.
    bTabTable : bool
        If true, print a tab-separated table; otherwise print a
        prettytable-formatted table.
    """
    checkFileExists(coverageFile)

    # get number of reads mapped to each bin
    self.logger.info('Determining number of reads mapped to each bin.')

    readsMappedToBin = {}
    binSize = {}
    totalMappedReads = {}
    # the context manager guarantees the handle is closed, even on error
    with open(coverageFile) as fin:
        next(fin, None)  # skip header line (no-op for an empty file)
        for line in fin:
            lineSplit = line.split('\t')

            # seqId = lineSplit[0]
            binId = lineSplit[1]
            seqLen = int(lineSplit[2])

            binSize[binId] = binSize.get(binId, 0) + seqLen

            if binId not in readsMappedToBin:
                readsMappedToBin[binId] = {}

            # fields come in triplets; int() tolerates the trailing
            # newline left on the last field of the line
            for i in range(3, len(lineSplit), 3):
                bamId = lineSplit[i]
                mappedReads = int(lineSplit[i + 2])

                totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads

    # calculate percentage of mapped reads to binned populations
    perMappedReads = {}
    normBinCoverage = {}
    sumNormBinCoverage = {}
    for binId, bamIds in readsMappedToBin.items():
        perMappedReads[binId] = {}
        normBinCoverage[binId] = {}

        for bamId in bamIds:
            total = totalMappedReads[bamId]
            # a BAM file without any mapped reads contributes 0 instead
            # of raising ZeroDivisionError
            perMR = float(readsMappedToBin[binId][bamId]) / total if total else 0.0
            perMappedReads[binId][bamId] = perMR

            # unbinned sequences are excluded from the normalisation
            if binId == DefaultValues.UNBINNED:
                continue

            normCoverage = perMR / binSize[binId]
            normBinCoverage[binId][bamId] = normCoverage
            sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage

    for binId, bamIds in normBinCoverage.items():
        for bamId in bamIds:
            if sumNormBinCoverage[bamId] != 0:
                normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
            else:
                normBinCoverage[binId][bamId] = 0

    # write community profile
    oldStdOut = reassignStdOut(outFile)
    try:
        sortedBinIds = sorted(readsMappedToBin)
        # every bin is expected to list the same BAM ids, so the first bin
        # provides the column set; no data rows means no BAM columns
        if sortedBinIds:
            sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]])
        else:
            sortedBamIds = []

        header = ['Bin Id', 'Bin size (Mbp)']
        for bamId in sortedBamIds:
            header += [bamId + ': mapped reads']
            header += [bamId + ': % mapped reads']
            header += [bamId + ': % binned populations']
            header += [bamId + ': % community']

        if bTabTable:
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sortedBinIds:
            row = [binId]
            row += [float(binSize[binId]) / 1e6]

            for bamId in sortedBamIds:
                row += [readsMappedToBin[binId][bamId]]
                row += [perMappedReads[binId][bamId] * 100.0]

                if DefaultValues.UNBINNED in perMappedReads:
                    unbinnedPercentage = perMappedReads[DefaultValues.UNBINNED][bamId]
                else:
                    unbinnedPercentage = 0

                if binId == DefaultValues.UNBINNED:
                    row += ['NA']
                    row += [unbinnedPercentage * 100.0]
                else:
                    row += [normBinCoverage[binId][bamId] * 100.0]
                    row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string())
    finally:
        # always undo the stdout reassignment, even if writing the table
        # raised
        restoreStdOut(outFile, oldStdOut)
def run(self, coverageFile, outFile, bTabTable):
    """Write a community profile table computed from a coverage file.

    The coverage file is read as a tab-separated table whose first line is
    a header (skipped). For every remaining line, column 1 (0-based) is the
    bin id, column 2 the sequence length, and from column 3 onwards the
    fields come in groups of three: the first field of a group is a BAM id
    and the third is the number of reads mapped to the sequence (the
    second field is not used here).

    For each bin and BAM id the table reports:
      * mapped reads
      * % mapped reads: reads mapped to the bin over all reads for the BAM
      * % binned populations: mapped-read fraction divided by bin size,
        normalised over all bins except DefaultValues.UNBINNED
      * % community: the previous value scaled by (1 - unbinned fraction)

    Parameters
    ----------
    coverageFile : str
        Path to the tab-separated coverage file.
    outFile : str
        Passed to reassignStdOut()/restoreStdOut(); presumably the file
        the table is redirected to -- confirm against those helpers.
    bTabTable : bool
        If true, print a tab-separated table; otherwise print a
        prettytable-formatted table.
    """
    checkFileExists(coverageFile)

    # get number of reads mapped to each bin
    self.logger.info(' Determining number of reads mapped to each bin.')
    self.logger.info('')

    readsMappedToBin = {}
    binSize = {}
    totalMappedReads = {}
    # the context manager guarantees the handle is closed, even on error
    with open(coverageFile) as fin:
        next(fin, None)  # skip header line (no-op for an empty file)
        for line in fin:
            lineSplit = line.split('\t')

            # seqId = lineSplit[0]
            binId = lineSplit[1]
            seqLen = int(lineSplit[2])

            binSize[binId] = binSize.get(binId, 0) + seqLen

            if binId not in readsMappedToBin:
                readsMappedToBin[binId] = {}

            # fields come in triplets; int() tolerates the trailing
            # newline left on the last field of the line
            for i in range(3, len(lineSplit), 3):
                bamId = lineSplit[i]
                mappedReads = int(lineSplit[i + 2])

                totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads

    # calculate percentage of mapped reads to binned populations
    perMappedReads = {}
    normBinCoverage = {}
    sumNormBinCoverage = {}
    for binId, bamIds in readsMappedToBin.items():
        perMappedReads[binId] = {}
        normBinCoverage[binId] = {}

        for bamId in bamIds:
            total = totalMappedReads[bamId]
            # a BAM file without any mapped reads contributes 0 instead
            # of raising ZeroDivisionError
            perMR = float(readsMappedToBin[binId][bamId]) / total if total else 0.0
            perMappedReads[binId][bamId] = perMR

            # unbinned sequences are excluded from the normalisation
            if binId == DefaultValues.UNBINNED:
                continue

            normCoverage = perMR / binSize[binId]
            normBinCoverage[binId][bamId] = normCoverage
            sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage

    for binId, bamIds in normBinCoverage.items():
        for bamId in bamIds:
            # if no reads mapped to any binned sequence for this BAM file
            # the sum is zero; report 0 instead of dividing by zero
            if sumNormBinCoverage[bamId] != 0:
                normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
            else:
                normBinCoverage[binId][bamId] = 0

    # write community profile
    oldStdOut = reassignStdOut(outFile)
    try:
        sortedBinIds = sorted(readsMappedToBin)
        # every bin is expected to list the same BAM ids, so the first bin
        # provides the column set; no data rows means no BAM columns
        if sortedBinIds:
            sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]])
        else:
            sortedBamIds = []

        header = ['Bin Id', 'Bin size (Mbp)']
        for bamId in sortedBamIds:
            header += [bamId + ': mapped reads']
            header += [bamId + ': % mapped reads']
            header += [bamId + ': % binned populations']
            header += [bamId + ': % community']

        if bTabTable:
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sortedBinIds:
            row = [binId]
            row += [float(binSize[binId]) / 1e6]

            for bamId in sortedBamIds:
                row += [readsMappedToBin[binId][bamId]]
                row += [perMappedReads[binId][bamId] * 100.0]

                if DefaultValues.UNBINNED in perMappedReads:
                    unbinnedPercentage = perMappedReads[DefaultValues.UNBINNED][bamId]
                else:
                    unbinnedPercentage = 0

                if binId == DefaultValues.UNBINNED:
                    row += ['NA']
                    row += [unbinnedPercentage * 100.0]
                else:
                    row += [normBinCoverage[binId][bamId] * 100.0]
                    row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string())
    finally:
        # always undo the stdout reassignment, even if writing the table
        # raised
        restoreStdOut(outFile, oldStdOut)