def reportFullMSA(self, outDir, outFile):
    """Create MSA with all reference and bin alignments.

    The concatenated bin alignment stored under <outDir>/storage/tree is
    written first, followed by every reference sequence. A reference
    sequence that has duplicate ids recorded is written again under each
    of those ids. All output goes to outFile through the redirected
    standard output.
    """
    binAlignmentFile = os.path.join(outDir, 'storage', 'tree',
                                    DefaultValues.PPLACER_CONCAT_SEQ_OUT)

    oldStdOut = reassignStdOut(outFile)
    try:
        # write bin alignments to file; the context manager guarantees the
        # input handle is closed
        with open(binAlignmentFile) as fin:
            for line in fin:
                print(line.rstrip())

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(
            os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL,
                         DefaultValues.GENOME_TREE_FASTA))
        for seqId, seq in seqs.items():
            print('>' + seqId)
            print(seq)

            if seqId in duplicateNodes:
                # duplicates are emitted with the sequence of their
                # representative
                for dupSeqId in duplicateNodes[seqId]:
                    print('>' + dupSeqId)
                    print(seq)
    finally:
        # restore stdout even if writing failed part-way through
        restoreStdOut(outFile, oldStdOut)
def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile):
    """Print the number of unique and multi-copy marker hits and the taxonomy of each bin.

    binIdToTaxonomy maps bin ids to taxonomy strings; resultsParser supplies
    the per-bin models and hit results. When bTabTable is true the table is
    written tab-separated in bin id order, otherwise as a pretty table sorted
    by the unique marker count (descending). Output goes to outFile through
    the redirected standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        # dict views cannot be indexed on Python 3, so materialise the keys
        arbitraryBinId = list(binIdToTaxonomy.keys())[0]
        markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
        header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy']

        if bTabTable:
            pTable = None
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.align['Taxonomy'] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sorted(binIdToTaxonomy.keys()):
            uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

            row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
    finally:
        # restore stdout even if building the table failed
        restoreStdOut(outFile, oldStdOut)
def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile):
    """Print the number of unique and multi-copy marker hits and the taxonomy of each bin.

    binIdToTaxonomy maps bin ids to taxonomy strings; resultsParser supplies
    the per-bin models and hit results. When bTabTable is true the table is
    written tab-separated in bin id order, otherwise as a pretty table sorted
    by the unique marker count (descending). Output goes to outFile through
    the redirected standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        # indexing keys() directly only works on Python 2; build a list first
        arbitraryBinId = list(binIdToTaxonomy.keys())[0]
        markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
        header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy']

        if bTabTable:
            pTable = None
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.align['Taxonomy'] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sorted(binIdToTaxonomy.keys()):
            uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

            row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
    finally:
        # restore stdout even if building the table failed
        restoreStdOut(outFile, oldStdOut)
def reportNewickTree(self, outDir, outFile, leafLabels=None):
    """Write the tree stored under <outDir>/storage/tree to outFile in Newick format.

    Internal node labels are reduced to their first three '|'-separated
    fields (empty second/third fields are dropped), leaves with recorded
    duplicate sequences are expanded into zero-length sibling leaves, and,
    when leafLabels == 'taxonomy', the taxonomy string of each known leaf is
    appended to its label.
    """
    # read duplicate nodes
    duplicateSeqs = self.__readDuplicateSeqs()

    # read tree
    treeFile = os.path.join(outDir, 'storage', 'tree',
                            DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    # clean up internal node labels
    for node in tree.internal_nodes():
        if node.label:
            labelSplit = node.label.split('|')

            # keep the first field and append the second and third only when
            # non-empty; slicing tolerates labels with fewer than three
            # fields instead of raising IndexError
            label = labelSplit[0]
            for field in labelSplit[1:3]:
                if field != '':
                    label += '|' + field

            node.label = label

    # insert duplicate nodes into tree
    for leaf in tree.leaf_nodes():
        duplicates = duplicateSeqs.get(leaf.taxon.label, None)
        if duplicates is not None:
            # replace the leaf by a new internal node that carries the
            # original edge length and holds the leaf plus its duplicates
            newParent = leaf.parent_node.new_child(
                edge_length=leaf.edge_length)
            curLeaf = leaf.parent_node.remove_child(leaf)
            newParent.new_child(taxon=curLeaf.taxon, edge_length=0)
            for d in duplicates:
                newParent.new_child(taxon=dendropy.Taxon(label=d),
                                    edge_length=0)

    # append taxonomy to leaf nodes
    if leafLabels == 'taxonomy':
        # read taxonomy string for each IMG genome
        taxonomy = {}
        taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR,
                                    DefaultValues.GENOME_TREE_TAXONOMY)
        with open(taxonomyFile) as fin:
            for line in fin:
                lineSplit = line.split('\t')
                taxonomy[lineSplit[0]] = lineSplit[1].rstrip()

        # append taxonomy to leaf labels
        for leaf in tree.leaf_nodes():
            taxaStr = taxonomy.get(leaf.taxon.label, None)
            if taxaStr:
                leaf.taxon.label += '|' + taxaStr

    # write out tree
    oldStdOut = reassignStdOut(outFile)
    try:
        print(tree.as_string(schema='newick', suppress_rooting=True))
    finally:
        # restore stdout even if serialising the tree failed
        restoreStdOut(outFile, oldStdOut)
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
    """Print a summary of every bin in self.results in the requested output format.

    Output formats 1, 2, 3 and 9 are rendered as a pretty table unless
    bTabTable is set; every other format is always written tab-separated.
    When coverageFile is given, bin coverage profiles are loaded from it and
    passed on to the per-bin summaries. Output goes to outFile through the
    redirected standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        coverageBinProfiles = None
        if coverageFile:
            coverage = Coverage(1)
            coverageBinProfiles = coverage.binProfiles(coverageFile)

        prettyTableFormats = [1, 2, 3, 9]

        # the header is derived from the marker sets of an arbitrary bin
        header = self.__getHeader(
            outputFormat,
            binIdToBinMarkerSets[list(binIdToBinMarkerSets.keys())[0]],
            coverageBinProfiles, bTabTable)

        if bTabTable or outputFormat not in prettyTableFormats:
            bTabTable = True
            pTable = None

            if header is not None:
                print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        seqsReported = 0
        for binId in sorted(self.results.keys()):
            seqsReported += self.results[binId].printSummary(
                outputFormat, aai, binIdToBinMarkerSets[binId],
                bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)

        if outputFormat in [6, 7] and seqsReported == 0:
            print('[No marker genes satisfied the reporting criteria.]')

        if not bTabTable:
            if outputFormat in [1, 2]:
                print(pTable.get_string(sortby='Completeness',
                                        reversesort=True))
            else:
                # only print if there are rows; render the table once
                tableStr = pTable.get_string(print_empty=False)
                if tableStr:
                    print(tableStr)
    finally:
        # restore stdout even if reporting failed
        restoreStdOut(outFile, oldStdOut)
def reportNewickTree(self, outDir, outFile, leafLabels=None):
    """Write the tree stored under <outDir>/storage/tree to outFile in Newick format.

    Internal node labels are reduced to their first three '|'-separated
    fields (empty second/third fields are dropped), leaves with recorded
    duplicate sequences are expanded into zero-length sibling leaves, and,
    when leafLabels == 'taxonomy', the taxonomy string of each known leaf is
    appended to its label.
    """
    # read duplicate nodes
    duplicateSeqs = self.__readDuplicateSeqs()

    # read tree
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

    # clean up internal node labels
    for node in tree.internal_nodes():
        if node.label:
            labelSplit = node.label.split('|')

            # keep the first field and append the second and third only when
            # non-empty; slicing tolerates labels with fewer than three
            # fields instead of raising IndexError
            label = labelSplit[0]
            for field in labelSplit[1:3]:
                if field != '':
                    label += '|' + field

            node.label = label

    # insert duplicate nodes into tree
    for leaf in tree.leaf_nodes():
        duplicates = duplicateSeqs.get(leaf.taxon.label, None)
        if duplicates is not None:
            # replace the leaf by a new internal node that carries the
            # original edge length and holds the leaf plus its duplicates
            newParent = leaf.parent_node.new_child(edge_length=leaf.edge_length)
            curLeaf = leaf.parent_node.remove_child(leaf)
            newParent.new_child(taxon=curLeaf.taxon, edge_length=0)
            for d in duplicates:
                newParent.new_child(taxon=Taxon(label=d), edge_length=0)

    # append taxonomy to leaf nodes
    if leafLabels == 'taxonomy':
        # read taxonomy string for each IMG genome
        taxonomy = {}
        taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR, 'genome_tree.taxonomy.tsv')
        with open(taxonomyFile) as fin:
            for line in fin:
                lineSplit = line.split('\t')
                taxonomy[lineSplit[0]] = lineSplit[1].rstrip()

        # append taxonomy to leaf labels
        for leaf in tree.leaf_nodes():
            taxaStr = taxonomy.get(leaf.taxon.label, None)
            if taxaStr:
                leaf.taxon.label += '|' + taxaStr

    # write out tree
    oldStdOut = reassignStdOut(outFile)
    try:
        print(tree.as_string(schema='newick', suppress_rooting=True))
    finally:
        # restore stdout even if serialising the tree failed
        restoreStdOut(outFile, oldStdOut)
def joinTables(self, options):
    """Join tables command.

    Reads every tab-separated table in options.tables (first column is the
    bin id, first line is the header) and writes one merged table to
    options.file with a row per bin id. Cells a bin has no value for are
    left empty. Rows are written in sorted bin id order so the output is
    reproducible between runs.
    """
    self.logger.info(
        '[CheckM - join_tables] Joining tables containing bin information.'
    )

    # read all tables
    headers = {}
    rows = defaultdict(dict)
    binIds = set()
    for f in options.tables:
        with open(f) as fin:
            headers[f] = [x.strip()
                          for x in fin.readline().split('\t')][1:]

            for line in fin:
                if not line.strip():
                    # ignore blank lines (e.g. a trailing empty line)
                    continue

                lineSplit = [x.strip() for x in line.split('\t')]

                binId = lineSplit[0]
                binIds.add(binId)

                # zip stops at the shorter sequence, so a short row leaves
                # its missing cells unset instead of raising IndexError
                for header, value in zip(headers[f], lineSplit[1:]):
                    rows[binId][header] = value

    # write merge table
    oldStdOut = reassignStdOut(options.file)
    try:
        # column order follows the order of the input tables
        columns = [header for f in options.tables for header in headers[f]]
        print('\t'.join(['Bin Id'] + columns))

        for binId in sorted(binIds):
            values = [rows[binId].get(header, '') for header in columns]
            print('\t'.join([binId] + values))
    finally:
        # restore stdout even if writing failed
        restoreStdOut(options.file, oldStdOut)

    if options.file:
        self.logger.info('Joined table written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
    """Print a summary of every bin in self.results in the requested output format.

    Output formats 1, 2, 3 and 9 are rendered as a pretty table unless
    bTabTable is set; every other format is always written tab-separated.
    When coverageFile is given, bin coverage profiles are loaded from it and
    passed on to the per-bin summaries. Output goes to outFile through the
    redirected standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        coverageBinProfiles = None
        if coverageFile:
            coverage = Coverage(1)
            coverageBinProfiles = coverage.binProfiles(coverageFile)

        prettyTableFormats = [1, 2, 3, 9]

        # the header is derived from the marker sets of an arbitrary bin;
        # dict views cannot be indexed on Python 3, so build a list first
        arbitraryBinId = list(binIdToBinMarkerSets.keys())[0]
        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[arbitraryBinId], coverageBinProfiles, bTabTable)

        if bTabTable or outputFormat not in prettyTableFormats:
            bTabTable = True
            pTable = None

            if header is not None:
                print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        seqsReported = 0
        for binId in sorted(self.results.keys()):
            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)

        if outputFormat in [6, 7] and seqsReported == 0:
            print('[No marker genes satisfied the reporting criteria.]')

        if not bTabTable:
            if outputFormat in [1, 2]:
                print(pTable.get_string(sortby='Completeness', reversesort=True))
            else:
                # only print if there are rows; render the table once
                tableStr = pTable.get_string(print_empty=False)
                if tableStr:
                    print(tableStr)
    finally:
        # restore stdout even if reporting failed
        restoreStdOut(outFile, oldStdOut)
def reportFullMSA(self, outDir, outFile):
    """Create MSA with all reference and bin alignments.

    The concatenated bin alignment stored under <outDir>/storage/tree is
    written first, followed by every reference sequence. A reference
    sequence that has duplicate ids recorded is written again under each
    of those ids. All output goes to outFile through the redirected
    standard output.
    """
    binAlignmentFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)

    oldStdOut = reassignStdOut(outFile)
    try:
        # write bin alignments to file; the context manager guarantees the
        # input handle is closed
        with open(binAlignmentFile) as fin:
            for line in fin:
                print(line.rstrip())

        # read duplicate seqs
        duplicateNodes = self.__readDuplicateSeqs()

        # write reference alignments to file
        seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE, 'genome_tree.concatenated.derep.fasta'))
        # items() exists on both Python 2 and 3, unlike iteritems()
        for seqId, seq in seqs.items():
            print('>' + seqId)
            print(seq)

            if seqId in duplicateNodes:
                # duplicates are emitted with the sequence of their
                # representative
                for dupSeqId in duplicateNodes[seqId]:
                    print('>' + dupSeqId)
                    print(seq)
    finally:
        # restore stdout even if writing failed part-way through
        restoreStdOut(outFile, oldStdOut)
def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
    """Print a summary of every bin in self.results in the requested output format.

    Output formats 1, 2 and 3 are rendered as a pretty table unless
    bTabTable is set; every other format is always written tab-separated.
    When coverageFile is given, bin coverage profiles are loaded from it and
    passed on to the per-bin summaries. Output goes to outFile through the
    redirected standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        coverageBinProfiles = None
        if coverageFile:
            coverage = Coverage(1)
            coverageBinProfiles = coverage.binProfiles(coverageFile)

        prettyTableFormats = [1, 2, 3]

        # the header is derived from the marker sets of an arbitrary bin;
        # dict views cannot be indexed on Python 3, so build a list first
        arbitraryBinId = list(binIdToBinMarkerSets.keys())[0]
        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[arbitraryBinId], coverageBinProfiles)

        if bTabTable or outputFormat not in prettyTableFormats:
            bTabTable = True
            pTable = None

            if header is not None:
                print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sorted(self.results.keys()):
            self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)

        if not bTabTable:
            if outputFormat in [1, 2]:
                print(pTable.get_string(sortby='Completeness', reversesort=True))
            else:
                print(pTable.get_string())
    finally:
        # restore stdout even if reporting failed
        restoreStdOut(outFile, oldStdOut)
def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
    """Calculate coverage of sequences for each BAM file.

    Sequences are assigned to bins from the FASTA files in binFiles, each
    BAM file is processed in turn, and one row per sequence is written to
    outFile with the bin id, sequence length and, for every BAM file, its
    id, coverage and number of mapped reads. Exits with status 1 when a BAM
    file has no accompanying '.bai' index.
    """
    # determine bin assignment of each sequence
    self.logger.info(' Determining bin assignment of each sequence.')

    seqIdToBinId = {}
    seqIdToSeqLen = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)

        seqs = readFasta(binFile)
        # items() exists on both Python 2 and 3, unlike iteritems()
        for seqId, seq in seqs.items():
            seqIdToBinId[seqId] = binId
            seqIdToSeqLen[seqId] = len(seq)

    # process each fasta file
    self.logger.info(" Processing %d file(s) with %d threads.\n" %
                     (len(bamFiles), self.totalThreads))

    # make sure all BAM files are indexed (an index requires a sorted file)
    self.numFiles = len(bamFiles)
    for bamFile in bamFiles:
        if not os.path.exists(bamFile + '.bai'):
            self.logger.error(
                ' [Error] BAM file is either unsorted or not indexed: ' +
                bamFile + '\n')
            # signal failure to the caller; a bare exit() reports success
            sys.exit(1)

    # calculate coverage of each BAM file
    coverageInfo = {}
    numFilesStarted = 0
    for bamFile in bamFiles:
        numFilesStarted += 1
        self.logger.info(
            ' Processing %s (%d of %d):' %
            (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

        coverageInfo[bamFile] = mp.Manager().dict()
        coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads,
                                                  minAlignPer,
                                                  maxEditDistPer, minQC,
                                                  coverageInfo[bamFile])

    # the id of a BAM file does not depend on the sequence being written,
    # so resolve it once per file rather than once per row
    bamIds = [(bamFile, binIdFromFilename(bamFile)) for bamFile in bamFiles]

    # redirect output
    self.logger.info(' Writing coverage information to file.')
    oldStdOut = reassignStdOut(outFile)
    try:
        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
        print(header)

        # get length of all seqs
        for seqIds in coverageInfo.values():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.items():
            rowStr = seqId + '\t' + seqIdToBinId.get(
                seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile, bamId in bamIds:
                if seqId in coverageInfo[bamFile]:
                    rowStr += '\t%s\t%f\t%d' % (
                        bamId, coverageInfo[bamFile][seqId].coverage,
                        coverageInfo[bamFile][seqId].mappedReads)
                else:
                    # sequence absent from this BAM file: report zeros
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)
    finally:
        # restore stdout even if writing failed
        restoreStdOut(outFile, oldStdOut)
def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
    """Calculate coverage of sequences for each BAM file.

    Sequences are assigned to bins from the FASTA files in binFiles, each
    BAM file is processed in turn, and one row per sequence is written to
    outFile with the bin id, sequence length and, for every BAM file, its
    id, coverage and number of mapped reads. Exits with status 1 when a BAM
    file has no accompanying '.bai' index.
    """
    # determine bin assignment of each sequence
    self.logger.info(' Determining bin assignment of each sequence.')

    seqIdToBinId = {}
    seqIdToSeqLen = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)

        seqs = readFasta(binFile)
        # items() exists on both Python 2 and 3, unlike iteritems()
        for seqId, seq in seqs.items():
            seqIdToBinId[seqId] = binId
            seqIdToSeqLen[seqId] = len(seq)

    # process each fasta file
    self.logger.info(" Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

    # make sure all BAM files are indexed (an index requires a sorted file)
    self.numFiles = len(bamFiles)
    for bamFile in bamFiles:
        if not os.path.exists(bamFile + '.bai'):
            self.logger.error(' [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
            sys.exit(1)

    # calculate coverage of each BAM file
    coverageInfo = {}
    numFilesStarted = 0
    for bamFile in bamFiles:
        numFilesStarted += 1
        self.logger.info(' Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

        coverageInfo[bamFile] = mp.Manager().dict()
        coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

    # the id of a BAM file does not depend on the sequence being written,
    # so resolve it once per file rather than once per row
    bamIds = [(bamFile, binIdFromFilename(bamFile)) for bamFile in bamFiles]

    # redirect output
    self.logger.info(' Writing coverage information to file.')
    oldStdOut = reassignStdOut(outFile)
    try:
        header = 'Sequence Id\tBin Id\tSequence length (bp)'
        header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
        print(header)

        # get length of all seqs
        for seqIds in coverageInfo.values():
            for seqId in seqIds.keys():
                seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

        # write coverage stats for all scaffolds to file
        for seqId, seqLen in seqIdToSeqLen.items():
            rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
            for bamFile, bamId in bamIds:
                if seqId in coverageInfo[bamFile]:
                    rowStr += '\t%s\t%f\t%d' % (bamId, coverageInfo[bamFile][seqId].coverage, coverageInfo[bamFile][seqId].mappedReads)
                else:
                    # sequence absent from this BAM file: report zeros
                    rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

            print(rowStr)
    finally:
        # restore stdout even if writing failed
        restoreStdOut(outFile, oldStdOut)
def __printFullTable(self, binIdToUID, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile):
    """Print the extended per-bin table: marker hits, placement, genome and lineage statistics.

    One row is produced per bin id in binIdToTaxonomy. With bTabTable the
    rows are written tab-separated in bin id order; otherwise they are
    collected in a pretty table that is printed sorted by the unique marker
    count (descending). Output goes to outFile through the redirected
    standard output.
    """
    # redirect output
    savedStdOut = reassignStdOut(outFile)

    # any bin will do for reporting the number of marker models
    firstBinId = list(binIdToTaxonomy)[0]
    markerCountLabel = '# unique markers (of %d)' % len(
        resultsParser.models[firstBinId])

    columns = [
        'Bin Id', markerCountLabel, '# multi-copy',
        'Insertion branch UID', 'Taxonomy (contained)',
        'Taxonomy (sister lineage)',
        'GC', 'Genome size (Mbp)', 'Gene count', 'Coding density',
        'Translation table',
        '# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std',
        'Lineage: genome size (Mbp) mean', 'Lineage: genome size (Mbp) std',
        'Lineage: gene count mean', 'Lineage: gene count std',
    ]

    table = None
    if bTabTable:
        print('\t'.join(columns))
    else:
        table = prettytable.PrettyTable(columns)

        # two decimals by default, with per-column overrides
        table.float_format = '.2'
        for column, fmt in (('GC', '.1'),
                            ('Lineage: GC mean', '.1'),
                            ('Lineage: GC std', '.1'),
                            ('Lineage: gene count mean', '.0'),
                            ('Lineage: gene count std', '.0')):
            table.float_format[column] = fmt

        # centred by default, text columns left-aligned
        table.align = 'c'
        for column in (columns[0], 'Insertion branch UID',
                       'Taxonomy (contained)', 'Taxonomy (sister lineage)'):
            table.align[column] = 'l'

        table.hrules = prettytable.FRAME
        table.vrules = prettytable.NONE

    for binId in sorted(binIdToTaxonomy):
        uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

        # strip every taxon of the contained lineage from the sister lineage
        sisterLineage = binIdToSisterTaxonomy[binId]
        for taxon in binIdToTaxonomy[binId].split(';'):
            sisterLineage = sisterLineage.replace(taxon + ';', '')

        if not sisterLineage:
            sisterLineage = 'unresolved'
        elif sisterLineage.endswith(';'):
            sisterLineage = sisterLineage[:-1]

        stats = binStats[binId]
        lineage = binIdToLineageStatistics[binId]
        record = [
            binId, uniqueHits, multiCopyHits,
            binIdToUID[binId], binIdToTaxonomy[binId], sisterLineage,
            stats['GC'] * 100,
            float(stats['Genome size']) / 1e6,
            stats['# predicted genes'],
            stats['Coding density'],
            stats['Translation table'],
            lineage['# genomes'],
            lineage['gc mean'],
            lineage['gc std'],
            lineage['genome size mean'],
            lineage['genome size std'],
            lineage['gene count mean'],
            lineage['gene count std'],
        ]

        if bTabTable:
            print('\t'.join(str(value) for value in record))
        else:
            table.add_row(record)

    if not bTabTable:
        print(table.get_string(sortby=markerCountLabel, reversesort=True))

    # restore stdout
    restoreStdOut(outFile, savedStdOut)
def __printFullTable(self, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile):
    """Print the extended per-bin table: marker hits, taxonomy, genome and lineage statistics.

    One row is produced per bin id in binIdToTaxonomy. With bTabTable the
    rows are written tab-separated in bin id order; otherwise they are
    collected in a pretty table that is printed sorted by the unique marker
    count (descending). Output goes to outFile through the redirected
    standard output.
    """
    # redirect output
    oldStdOut = reassignStdOut(outFile)
    try:
        # dict views cannot be indexed on Python 3, so materialise the keys
        arbitraryBinId = list(binIdToTaxonomy.keys())[0]
        markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
        header = ['Bin Id', markerCountLabel, "# multi-copy"]
        header += ['Taxonomy (contained)', 'Taxonomy (sister lineage)']
        header += ['GC', 'Genome size (Mbp)', 'Gene count', 'Coding density', 'Translation table']
        header += ['# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std']
        header += ['Lineage: genome size (Mbp) mean', 'Lineage: genome size (Mbp) std']
        header += ['Lineage: gene count mean', 'Lineage: gene count std']

        if bTabTable:
            pTable = None
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.float_format['GC'] = '.1'
            pTable.float_format['Lineage: GC mean'] = '.1'
            pTable.float_format['Lineage: GC std'] = '.1'
            pTable.float_format['Lineage: gene count mean'] = '.0'
            pTable.float_format['Lineage: gene count std'] = '.0'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.align['Taxonomy (contained)'] = 'l'
            pTable.align['Taxonomy (sister lineage)'] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sorted(binIdToTaxonomy.keys()):
            uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

            # strip every taxon of the contained lineage from the sister
            # lineage so only the differing part is reported
            truncSisterLineage = binIdToSisterTaxonomy[binId]
            for taxa in binIdToTaxonomy[binId].split(';'):
                truncSisterLineage = truncSisterLineage.replace(taxa + ';', '')

            if len(truncSisterLineage) == 0:
                truncSisterLineage = 'unresolved'
            elif truncSisterLineage[-1] == ';':
                truncSisterLineage = truncSisterLineage[0:-1]

            row = [binId, uniqueHits, multiCopyHits]
            row += [binIdToTaxonomy[binId], truncSisterLineage]
            row += [binStats[binId]['GC'] * 100]
            row += [float(binStats[binId]['Genome size']) / 1e6]
            row += [binStats[binId]['# predicted genes']]
            row += [binStats[binId]['Coding density']]
            row += [binStats[binId]['Translation table']]
            row += [binIdToLineageStatistics[binId]['# genomes']]
            row += [binIdToLineageStatistics[binId]['gc mean']]
            row += [binIdToLineageStatistics[binId]['gc std']]
            row += [binIdToLineageStatistics[binId]['genome size mean']]
            row += [binIdToLineageStatistics[binId]['genome size std']]
            row += [binIdToLineageStatistics[binId]['gene count mean']]
            row += [binIdToLineageStatistics[binId]['gene count std']]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
    finally:
        # restore stdout even if building the table failed
        restoreStdOut(outFile, oldStdOut)
def run(self, coverageFile, outFile, bTabTable):
    """Write a community profile computed from a tab-separated coverage file.

    Each data line of coverageFile holds a sequence id, a bin id, the
    sequence length and then repeating (bam id, coverage, mapped reads)
    triples. For every bin and BAM file the table reports the mapped reads,
    the percentage of mapped reads, the percentage of the binned
    populations (mapped-read share normalised by bin size) and that
    percentage scaled to the whole community. The table is written
    tab-separated when bTabTable is true, otherwise as a pretty table.
    """
    checkFileExists(coverageFile)

    # get number of reads mapped to each bin
    self.logger.info('Determining number of reads mapped to each bin.')

    readsMappedToBin = {}
    binSize = {}
    totalMappedReads = {}
    with open(coverageFile) as fin:
        next(fin, None)  # skip the header line
        for line in fin:
            lineSplit = line.split('\t')

            # lineSplit[0] is the sequence id, which is not needed here
            binId = lineSplit[1]
            seqLen = int(lineSplit[2])

            binSize[binId] = binSize.get(binId, 0) + seqLen

            binReads = readsMappedToBin.setdefault(binId, {})

            # remaining columns are (bam id, coverage, mapped reads) triples
            for i in range(3, len(lineSplit), 3):
                bamId = lineSplit[i]
                mappedReads = int(lineSplit[i + 2])

                totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
                binReads[bamId] = binReads.get(bamId, 0) + mappedReads

    # calculate percentage of mapped reads to binned populations
    perMappedReads = {}
    normBinCoverage = {}
    sumNormBinCoverage = {}
    for binId, bamIds in readsMappedToBin.items():
        perMappedReads[binId] = {}
        normBinCoverage[binId] = {}

        for bamId in bamIds:
            # a BAM file without any mapped reads contributes 0 rather
            # than causing a division by zero
            total = totalMappedReads[bamId]
            if total != 0:
                perMR = float(readsMappedToBin[binId][bamId]) / total
            else:
                perMR = 0.0
            perMappedReads[binId][bamId] = perMR

            if binId == DefaultValues.UNBINNED:
                continue

            normCoverage = perMR / binSize[binId]
            normBinCoverage[binId][bamId] = normCoverage
            sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage

    for binId, bamIds in normBinCoverage.items():
        for bamId in bamIds:
            if sumNormBinCoverage[bamId] != 0:
                normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
            else:
                normBinCoverage[binId][bamId] = 0

    # write community profile
    oldStdOut = reassignStdOut(outFile)
    try:
        sortedBinIds = sorted(readsMappedToBin.keys())
        # the BAM ids are taken from the first bin; with no data lines there
        # are no bins, so only the two fixed columns are written
        if sortedBinIds:
            sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]].keys())
        else:
            sortedBamIds = []

        header = ['Bin Id', 'Bin size (Mbp)']
        for bamId in sortedBamIds:
            header += [bamId + ': mapped reads']
            header += [bamId + ': % mapped reads']
            header += [bamId + ': % binned populations']
            header += [bamId + ': % community']

        if bTabTable:
            print('\t'.join(header))
        else:
            pTable = prettytable.PrettyTable(header)
            pTable.float_format = '.2'
            pTable.align = 'c'
            pTable.align[header[0]] = 'l'
            pTable.hrules = prettytable.FRAME
            pTable.vrules = prettytable.NONE

        for binId in sortedBinIds:
            row = [binId]
            row += [float(binSize[binId]) / 1e6]

            for bamId in sortedBamIds:
                row += [readsMappedToBin[binId][bamId]]
                row += [perMappedReads[binId][bamId] * 100.0]

                if DefaultValues.UNBINNED in perMappedReads:
                    unbinnedPercentage = perMappedReads[DefaultValues.UNBINNED][bamId]
                else:
                    unbinnedPercentage = 0

                if binId == DefaultValues.UNBINNED:
                    row += ['NA']
                    row += [unbinnedPercentage * 100.0]
                else:
                    row += [normBinCoverage[binId][bamId] * 100.0]
                    # scale the binned share by the fraction of reads that
                    # mapped to binned sequences
                    row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]

            if bTabTable:
                print('\t'.join(map(str, row)))
            else:
                pTable.add_row(row)

        if not bTabTable:
            print(pTable.get_string())
    finally:
        # restore stdout even if writing failed
        restoreStdOut(outFile, oldStdOut)