Пример #1
0
    def reportFullMSA(self, outDir, outFile):
        """Create MSA with all reference and bin alignments.

        Prints every line of the bin alignment file found under
        ``<outDir>/storage/tree``, then each reference sequence returned by
        ``readFasta`` as a FASTA record. A reference sequence that has
        duplicates recorded is followed by one record per duplicate id
        carrying the same sequence.

        Args:
            outDir: Directory containing the ``storage/tree`` results.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``
                (presumably the destination path -- confirm in those helpers).
        """
        binAlignmentFile = os.path.join(outDir, 'storage', 'tree',
                                        DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        refAlignmentFile = os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL,
                                        DefaultValues.GENOME_TREE_FASTA)

        oldStdOut = reassignStdOut(outFile)
        try:
            # write bin alignments to file; 'with' closes the handle promptly
            with open(binAlignmentFile) as fin:
                for line in fin:
                    print(line.rstrip())

            # read duplicate seqs
            duplicateNodes = self.__readDuplicateSeqs()

            # write reference alignments to file
            seqs = readFasta(refAlignmentFile)
            for seqId, seq in seqs.items():
                print('>' + seqId)
                print(seq)

                if seqId in duplicateNodes:
                    for dupSeqId in duplicateNodes[seqId]:
                        print('>' + dupSeqId)
                        print(seq)
        finally:
            # always undo the redirection, even if reading or printing fails
            restoreStdOut(outFile, oldStdOut)
Пример #2
0
    def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile):
        """Print one row per bin with its marker hit counts and taxonomy.

        Args:
            binIdToTaxonomy: Mapping of bin id to taxonomy string; must hold
                at least one bin (an empty mapping raises IndexError).
            resultsParser: Object exposing ``models[binId]`` (its length is
                shown in the header) and ``results[binId].countUniqueHits()``.
            bTabTable: If true, print tab-separated rows in bin id order;
                otherwise print a PrettyTable sorted by unique marker count,
                descending.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            # dict views cannot be indexed on Python 3, so materialise a list
            arbitraryBinId = list(binIdToTaxonomy.keys())[0]
            markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
            header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy']

            if bTabTable:
                pTable = None
                print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.align['Taxonomy'] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sorted(binIdToTaxonomy.keys()):
                uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

                row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]]

                if bTabTable:
                    print('\t'.join(map(str, row)))
                else:
                    pTable.add_row(row)

            if not bTabTable:
                print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
        finally:
            # restore stdout even when a lookup above raises
            restoreStdOut(outFile, oldStdOut)
Пример #3
0
    def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile):
        """Write a compact summary table: bin id, marker hit counts, taxonomy.

        Args:
            binIdToTaxonomy: Mapping of bin id to taxonomy string. Must not be
                empty, since one of its keys is used to size the header.
            resultsParser: Provides ``models[binId]`` and
                ``results[binId].countUniqueHits()`` (which returns the
                unique and multi-copy hit counts).
            bTabTable: True for tab-separated output, False for a PrettyTable
                sorted by the unique marker column in descending order.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            # list() is needed because Python 3 key views are not subscriptable
            arbitraryBinId = list(binIdToTaxonomy.keys())[0]
            markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
            header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy']

            pTable = None
            if bTabTable:
                print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.align['Taxonomy'] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sorted(binIdToTaxonomy.keys()):
                uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]]

                if bTabTable:
                    print('\t'.join(map(str, row)))
                else:
                    pTable.add_row(row)

            if not bTabTable:
                print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
        finally:
            # restore stdout on both the success and the failure path
            restoreStdOut(outFile, oldStdOut)
Пример #4
0
    def reportNewickTree(self, outDir, outFile, leafLabels=None):
        """Print the tree stored under ``outDir`` as a Newick string.

        Before printing, internal node labels are trimmed to their non-empty
        leading fields, duplicate sequences are re-inserted next to their
        representative leaf, and (optionally) taxonomy strings are appended
        to leaf labels.

        Args:
            outDir: Directory containing ``storage/tree`` with the tree file.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            leafLabels: When equal to ``'taxonomy'``, leaf labels found in the
                genome tree taxonomy file get ``'|' + taxonomy`` appended.
        """
        # read duplicate nodes
        duplicateSeqs = self.__readDuplicateSeqs()

        # read tree
        treeFile = os.path.join(outDir, 'storage', 'tree',
                                DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        # clean up internal node labels: keep the first '|' field plus the
        # second and third fields when non-empty. Slicing (rather than
        # indexing) tolerates labels with fewer than three fields.
        for node in tree.internal_nodes():
            if node.label:
                labelSplit = node.label.split('|')
                keptFields = [labelSplit[0]]
                keptFields += [field for field in labelSplit[1:3] if field != '']
                node.label = '|'.join(keptFields)

        # insert duplicate nodes into tree: the leaf is replaced by a new
        # internal node carrying the leaf's edge length, under which the
        # original taxon and each duplicate hang with zero-length edges
        for leaf in tree.leaf_nodes():
            duplicates = duplicateSeqs.get(leaf.taxon.label, None)
            if duplicates is not None:
                newParent = leaf.parent_node.new_child(
                    edge_length=leaf.edge_length)
                curLeaf = leaf.parent_node.remove_child(leaf)
                newParent.new_child(taxon=curLeaf.taxon, edge_length=0)
                for d in duplicates:
                    newParent.new_child(taxon=dendropy.Taxon(label=d),
                                        edge_length=0)

        # append taxonomy to leaf nodes
        if leafLabels == 'taxonomy':
            # read taxonomy string for each IMG genome
            taxonomy = {}
            taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR,
                                        DefaultValues.GENOME_TREE_TAXONOMY)
            with open(taxonomyFile) as fin:
                for line in fin:
                    lineSplit = line.split('\t')
                    taxonomy[lineSplit[0]] = lineSplit[1].rstrip()

            # append taxonomy to leaf labels
            for leaf in tree.leaf_nodes():
                taxaStr = taxonomy.get(leaf.taxon.label, None)
                if taxaStr:
                    leaf.taxon.label += '|' + taxaStr

        # write out tree
        oldStdOut = reassignStdOut(outFile)
        try:
            print(tree.as_string(schema='newick', suppress_rooting=True))
        finally:
            restoreStdOut(outFile, oldStdOut)
Пример #5
0
    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets,
                     bIndividualMarkers, coverageFile, bTabTable, outFile,
                     anaFolder):
        """Print a summary of the results for every bin in ``self.results``.

        Output is a PrettyTable only when ``bTabTable`` is false and
        ``outputFormat`` is one of 1, 2, 3 or 9; otherwise rows are
        tab-separated. Row content is produced by each result's own
        ``printSummary``.

        Args:
            outputFormat: Integer format code.
            aai: Forwarded unchanged to each result's ``printSummary``.
            binIdToBinMarkerSets: Mapping of bin id to its marker sets; must
                be non-empty because its first value is used for the header.
            bIndividualMarkers: Forwarded to each result's ``printSummary``.
            coverageFile: Optional coverage file; when truthy, bin coverage
                profiles are loaded from it via ``Coverage``.
            bTabTable: Request tab-separated output.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            anaFolder: Forwarded to each result's ``printSummary``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            coverageBinProfiles = None
            if coverageFile:
                coverage = Coverage(1)
                coverageBinProfiles = coverage.binProfiles(coverageFile)

            prettyTableFormats = [1, 2, 3, 9]

            header = self.__getHeader(
                outputFormat,
                binIdToBinMarkerSets[list(binIdToBinMarkerSets.keys())[0]],
                coverageBinProfiles, bTabTable)
            if bTabTable or outputFormat not in prettyTableFormats:
                # formats without a pretty-table layout fall back to tabs
                bTabTable = True
                pTable = None

                if header is not None:
                    print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            seqsReported = 0
            for binId in sorted(self.results.keys()):
                seqsReported += self.results[binId].printSummary(
                    outputFormat, aai, binIdToBinMarkerSets[binId],
                    bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)

            if outputFormat in [6, 7] and seqsReported == 0:
                print('[No marker genes satisfied the reporting criteria.]')

            if not bTabTable:
                if outputFormat in [1, 2]:
                    print(
                        pTable.get_string(sortby='Completeness',
                                          reversesort=True))
                else:
                    # only print if there are rows; render the table once
                    tableStr = pTable.get_string(print_empty=False)
                    if tableStr:
                        print(tableStr)
        finally:
            # restore stdout even if a bin fails to print
            restoreStdOut(outFile, oldStdOut)
Пример #6
0
    def reportNewickTree(self, outDir, outFile, leafLabels=None):
        """Print the tree stored under ``outDir`` as a Newick string.

        Internal node labels are reduced to their non-empty leading fields,
        duplicate sequences are re-inserted beside their representative leaf,
        and taxonomy strings are optionally appended to leaf labels.

        Args:
            outDir: Directory containing ``storage/tree`` with the tree file.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            leafLabels: When equal to ``'taxonomy'``, leaf labels listed in
                ``genome_tree.taxonomy.tsv`` get ``'|' + taxonomy`` appended.
        """
        # read duplicate nodes
        duplicateSeqs = self.__readDuplicateSeqs()

        # read tree
        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # clean up internal node labels: first '|' field plus the second and
        # third when non-empty; slicing avoids an IndexError on short labels
        for node in tree.internal_nodes():
            if node.label:
                labelSplit = node.label.split('|')
                keptFields = [labelSplit[0]]
                keptFields += [field for field in labelSplit[1:3] if field != '']
                node.label = '|'.join(keptFields)

        # insert duplicate nodes into tree: a new internal node takes over the
        # leaf's edge length; the leaf's taxon and its duplicates are attached
        # below it with zero-length edges
        for leaf in tree.leaf_nodes():
            duplicates = duplicateSeqs.get(leaf.taxon.label, None)
            if duplicates is not None:
                newParent = leaf.parent_node.new_child(edge_length=leaf.edge_length)
                curLeaf = leaf.parent_node.remove_child(leaf)
                newParent.new_child(taxon=curLeaf.taxon, edge_length=0)
                for d in duplicates:
                    newParent.new_child(taxon=Taxon(label=d), edge_length=0)

        # append taxonomy to leaf nodes
        if leafLabels == 'taxonomy':
            # read taxonomy string for each IMG genome
            taxonomy = {}
            with open(os.path.join(DefaultValues.GENOME_TREE_DIR, 'genome_tree.taxonomy.tsv')) as fin:
                for line in fin:
                    lineSplit = line.split('\t')
                    taxonomy[lineSplit[0]] = lineSplit[1].rstrip()

            # append taxonomy to leaf labels
            for leaf in tree.leaf_nodes():
                taxaStr = taxonomy.get(leaf.taxon.label, None)
                if taxaStr:
                    leaf.taxon.label += '|' + taxaStr

        # write out tree
        oldStdOut = reassignStdOut(outFile)
        try:
            print(tree.as_string(schema='newick', suppress_rooting=True))
        finally:
            restoreStdOut(outFile, oldStdOut)
Пример #7
0
    def joinTables(self, options):
        """Join tables command.

        Reads each tab-separated table in ``options.tables`` (first column is
        the bin id, first line is the header) and prints one merged table
        keyed by bin id. Cells a bin has no value for are left empty.

        Note: values are stored per column name, so if two tables share a
        column name the table read last wins for both columns.

        Args:
            options: Object with ``tables`` (iterable of file paths) and
                ``file`` (forwarded to ``reassignStdOut``/``restoreStdOut``).
        """

        self.logger.info(
            '[CheckM - join_tables] Joining tables containing bin information.'
        )

        # read all tables
        headers = {}
        rows = defaultdict(dict)
        binIds = set()
        for f in options.tables:
            with open(f) as fin:
                headers[f] = [x.strip()
                              for x in fin.readline().split('\t')][1:]

                for line in fin:
                    # a blank line would otherwise register an empty bin id
                    if not line.strip():
                        continue

                    lineSplit = [x.strip() for x in line.split('\t')]

                    binId = lineSplit[0]
                    binIds.add(binId)

                    # zip stops at the shorter sequence, so a row with fewer
                    # fields than the header leaves the trailing cells unset
                    # instead of raising IndexError
                    for header, value in zip(headers[f], lineSplit[1:]):
                        rows[binId][header] = value

        # write merge table
        oldStdOut = reassignStdOut(options.file)
        try:
            print('Bin Id' + ''.join('\t' + '\t'.join(headers[f])
                                     for f in options.tables))

            # sorted so the output order is reproducible between runs
            for binId in sorted(binIds):
                binRow = rows[binId]
                cells = [
                    binRow.get(header, '') for f in options.tables
                    for header in headers[f]
                ]
                print(binId + ''.join('\t' + cell for cell in cells))
        finally:
            restoreStdOut(options.file, oldStdOut)

        if options.file:
            self.logger.info('Joined table written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Пример #8
0
    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
        """Print a summary of the results for every bin in ``self.results``.

        A PrettyTable is used only when ``bTabTable`` is false and
        ``outputFormat`` is one of 1, 2, 3 or 9; every other combination is
        printed tab-separated.

        Args:
            outputFormat: Integer format code.
            aai: Forwarded unchanged to each result's ``printSummary``.
            binIdToBinMarkerSets: Non-empty mapping of bin id to marker sets;
                its first value is used to build the header.
            bIndividualMarkers: Forwarded to each result's ``printSummary``.
            coverageFile: Optional coverage file; when truthy, bin coverage
                profiles are loaded from it via ``Coverage``.
            bTabTable: Request tab-separated output.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            anaFolder: Forwarded to each result's ``printSummary``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            coverageBinProfiles = None
            if coverageFile:
                coverage = Coverage(1)
                coverageBinProfiles = coverage.binProfiles(coverageFile)

            prettyTableFormats = [1, 2, 3, 9]

            # list() keeps the lookup valid on Python 3, where keys() is a view
            firstBinId = list(binIdToBinMarkerSets.keys())[0]
            header = self.__getHeader(outputFormat, binIdToBinMarkerSets[firstBinId], coverageBinProfiles, bTabTable)
            if bTabTable or outputFormat not in prettyTableFormats:
                bTabTable = True
                pTable = None

                if header is not None:
                    print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            seqsReported = 0
            for binId in sorted(self.results.keys()):
                seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)

            if outputFormat in [6, 7] and seqsReported == 0:
                print('[No marker genes satisfied the reporting criteria.]')

            if not bTabTable:
                if outputFormat in [1, 2]:
                    print(pTable.get_string(sortby='Completeness', reversesort=True))
                else:
                    # only print if there are rows; render the table once
                    tableStr = pTable.get_string(print_empty=False)
                    if tableStr:
                        print(tableStr)
        finally:
            # restore stdout even if a bin fails to print
            restoreStdOut(outFile, oldStdOut)
Пример #9
0
    def reportFullMSA(self, outDir, outFile):
        """Create MSA with all reference and bin alignments.

        Prints the bin alignment file from ``<outDir>/storage/tree`` line by
        line, then each reference sequence as a FASTA record. A reference
        sequence with recorded duplicates is followed by one record per
        duplicate id with the same sequence.

        Args:
            outDir: Directory containing the ``storage/tree`` results.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        binAlignmentFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)
        refAlignmentFile = os.path.join(DefaultValues.PPLACER_REF_PACKAGE, 'genome_tree.concatenated.derep.fasta')

        oldStdOut = reassignStdOut(outFile)
        try:
            # write bin alignments to file
            with open(binAlignmentFile) as fin:
                for line in fin:
                    print(line.rstrip())

            # read duplicate seqs
            duplicateNodes = self.__readDuplicateSeqs()

            # write reference alignments to file; items() exists on both
            # Python 2 and 3, unlike iteritems()
            seqs = readFasta(refAlignmentFile)
            for seqId, seq in seqs.items():
                print('>' + seqId)
                print(seq)

                if seqId in duplicateNodes:
                    for dupSeqId in duplicateNodes[seqId]:
                        print('>' + dupSeqId)
                        print(seq)
        finally:
            # always undo the redirection, even if reading or printing fails
            restoreStdOut(outFile, oldStdOut)
Пример #10
0
    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile):
        """Print a summary of the results for every bin in ``self.results``.

        A PrettyTable is used only when ``bTabTable`` is false and
        ``outputFormat`` is 1, 2 or 3; otherwise output is tab-separated.

        Args:
            outputFormat: Integer format code.
            aai: Forwarded unchanged to each result's ``printSummary``.
            binIdToBinMarkerSets: Non-empty mapping of bin id to marker sets;
                its first value is used to build the header.
            bIndividualMarkers: Forwarded to each result's ``printSummary``.
            coverageFile: Optional coverage file; when truthy, bin coverage
                profiles are loaded from it via ``Coverage``.
            bTabTable: Request tab-separated output.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            coverageBinProfiles = None
            if coverageFile:
                coverage = Coverage(1)
                coverageBinProfiles = coverage.binProfiles(coverageFile)

            prettyTableFormats = [1, 2, 3]

            # list() keeps the lookup valid on Python 3, where keys() is a view
            firstBinId = list(binIdToBinMarkerSets.keys())[0]
            header = self.__getHeader(outputFormat, binIdToBinMarkerSets[firstBinId], coverageBinProfiles)
            if bTabTable or outputFormat not in prettyTableFormats:
                bTabTable = True
                pTable = None

                if header is not None:
                    print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sorted(self.results.keys()):
                self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable)

            if not bTabTable:
                if outputFormat in [1, 2]:
                    print(pTable.get_string(sortby='Completeness', reversesort=True))
                else:
                    print(pTable.get_string())
        finally:
            # restore stdout even if a bin fails to print
            restoreStdOut(outFile, oldStdOut)
Пример #11
0
    def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer,
            maxEditDistPer, minQC):
        """Calculate coverage of sequences for each BAM file.

        Prints one tab-separated row per sequence: sequence id, bin id (or
        ``DefaultValues.UNBINNED``), sequence length, then for every BAM file
        its id, the coverage and the number of mapped reads (zeros when the
        sequence is absent from that BAM file's results).

        Args:
            binFiles: FASTA files, one per bin.
            bamFiles: BAM files; each needs a ``.bai`` file next to it.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            bAllReads, minAlignPer, maxEditDistPer, minQC: Forwarded
                unchanged to the per-BAM processing step.

        Raises:
            SystemExit: With status 1 if a BAM file has no ``.bai`` index.
        """

        # determine bin assignment of each sequence
        self.logger.info('  Determining bin assignment of each sequence.')

        seqIdToBinId = {}
        seqIdToSeqLen = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            seqs = readFasta(binFile)
            for seqId, seq in seqs.items():
                seqIdToBinId[seqId] = binId
                seqIdToSeqLen[seqId] = len(seq)

        # process each fasta file
        self.logger.info("  Processing %d file(s) with %d threads.\n" %
                         (len(bamFiles), self.totalThreads))

        # make sure all BAM files are sorted
        self.numFiles = len(bamFiles)
        for bamFile in bamFiles:
            if not os.path.exists(bamFile + '.bai'):
                self.logger.error(
                    '  [Error] BAM file is either unsorted or not indexed: ' +
                    bamFile + '\n')
                # non-zero status so callers and shells see the failure
                sys.exit(1)

        # calculate coverage of each BAM file
        coverageInfo = {}
        numFilesStarted = 0
        for bamFile in bamFiles:
            numFilesStarted += 1
            self.logger.info(
                '  Processing %s (%d of %d):' %
                (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

            coverageInfo[bamFile] = mp.Manager().dict()
            coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads,
                                                      minAlignPer,
                                                      maxEditDistPer, minQC,
                                                      coverageInfo[bamFile])

        # redirect output
        self.logger.info('  Writing coverage information to file.')
        oldStdOut = reassignStdOut(outFile)
        try:
            # one (Bam Id, Coverage, Mapped reads) column triple per BAM file
            header = 'Sequence Id\tBin Id\tSequence length (bp)'
            header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
            print(header)

            # get length of all seqs
            for seqIds in coverageInfo.values():
                for seqId in seqIds.keys():
                    seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

            # the id of a BAM file is the same for every sequence row
            bamIds = [(bamFile, binIdFromFilename(bamFile))
                      for bamFile in bamFiles]

            # write coverage stats for all scaffolds to file
            for seqId, seqLen in seqIdToSeqLen.items():
                rowStr = seqId + '\t' + seqIdToBinId.get(
                    seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
                for bamFile, bamId in bamIds:
                    if seqId in coverageInfo[bamFile]:
                        rowStr += '\t%s\t%f\t%d' % (
                            bamId, coverageInfo[bamFile][seqId].coverage,
                            coverageInfo[bamFile][seqId].mappedReads)
                    else:
                        rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

                print(rowStr)
        finally:
            # restore stdout even if writing fails part-way
            restoreStdOut(outFile, oldStdOut)
Пример #12
0
    def run(self, binFiles, bamFiles, outFile, bAllReads, minAlignPer, maxEditDistPer, minQC):
        """Calculate coverage of sequences for each BAM file.

        Prints one tab-separated row per sequence: sequence id, bin id (or
        ``DefaultValues.UNBINNED``), sequence length, then for every BAM file
        its id, the coverage and the number of mapped reads (zeros when the
        sequence is absent from that BAM file's results).

        Args:
            binFiles: FASTA files, one per bin.
            bamFiles: BAM files; each needs a ``.bai`` file next to it.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            bAllReads, minAlignPer, maxEditDistPer, minQC: Forwarded
                unchanged to the per-BAM processing step.

        Raises:
            SystemExit: With status 1 if a BAM file has no ``.bai`` index.
        """

        # determine bin assignment of each sequence
        self.logger.info('  Determining bin assignment of each sequence.')

        seqIdToBinId = {}
        seqIdToSeqLen = {}
        for binFile in binFiles:
            binId = binIdFromFilename(binFile)

            # items() works on Python 2 and 3; iteritems() is Python 2 only
            for seqId, seq in readFasta(binFile).items():
                seqIdToBinId[seqId] = binId
                seqIdToSeqLen[seqId] = len(seq)

        # process each fasta file
        self.logger.info("  Processing %d file(s) with %d threads.\n" % (len(bamFiles), self.totalThreads))

        # make sure all BAM files are sorted
        self.numFiles = len(bamFiles)
        for bamFile in bamFiles:
            if not os.path.exists(bamFile + '.bai'):
                self.logger.error('  [Error] BAM file is either unsorted or not indexed: ' + bamFile + '\n')
                sys.exit(1)

        # calculate coverage of each BAM file
        coverageInfo = {}
        numFilesStarted = 0
        for bamFile in bamFiles:
            numFilesStarted += 1
            self.logger.info('  Processing %s (%d of %d):' % (ntpath.basename(bamFile), numFilesStarted, len(bamFiles)))

            coverageInfo[bamFile] = mp.Manager().dict()
            coverageInfo[bamFile] = self.__processBam(bamFile, bAllReads, minAlignPer, maxEditDistPer, minQC, coverageInfo[bamFile])

        # redirect output
        self.logger.info('  Writing coverage information to file.')
        oldStdOut = reassignStdOut(outFile)
        try:
            # one (Bam Id, Coverage, Mapped reads) column triple per BAM file
            header = 'Sequence Id\tBin Id\tSequence length (bp)'
            header += '\tBam Id\tCoverage\tMapped reads' * len(bamFiles)
            print(header)

            # get length of all seqs
            for seqIds in coverageInfo.values():
                for seqId in seqIds.keys():
                    seqIdToSeqLen[seqId] = seqIds[seqId].seqLen

            # the id of a BAM file is the same for every sequence row
            bamIds = [(bamFile, binIdFromFilename(bamFile)) for bamFile in bamFiles]

            # write coverage stats for all scaffolds to file
            for seqId, seqLen in seqIdToSeqLen.items():
                rowStr = seqId + '\t' + seqIdToBinId.get(seqId, DefaultValues.UNBINNED) + '\t' + str(seqLen)
                for bamFile, bamId in bamIds:
                    if seqId in coverageInfo[bamFile]:
                        rowStr += '\t%s\t%f\t%d' % (bamId, coverageInfo[bamFile][seqId].coverage, coverageInfo[bamFile][seqId].mappedReads)
                    else:
                        rowStr += '\t%s\t%f\t%d' % (bamId, 0, 0)

                print(rowStr)
        finally:
            # restore stdout even if writing fails part-way
            restoreStdOut(outFile, oldStdOut)
Пример #13
0
    def __printFullTable(self, binIdToUID, binIdToTaxonomy,
                         binIdToSisterTaxonomy, binIdToLineageStatistics,
                         resultsParser, binStats, bTabTable, outFile):
        """Print the extended per-bin table with lineage and genome statistics.

        Args:
            binIdToUID: Mapping of bin id to insertion branch UID.
            binIdToTaxonomy: Non-empty mapping of bin id to a ';'-separated
                taxonomy string; its keys define the bins reported.
            binIdToSisterTaxonomy: Mapping of bin id to the sister lineage's
                taxonomy string.
            binIdToLineageStatistics: Mapping of bin id to a dict with keys
                '# genomes', 'gc mean', 'gc std', 'genome size mean',
                'genome size std', 'gene count mean', 'gene count std'.
            resultsParser: Provides ``models[binId]`` and
                ``results[binId].countUniqueHits()``.
            binStats: Mapping of bin id to a dict with keys 'GC',
                'Genome size', '# predicted genes', 'Coding density' and
                'Translation table'.
            bTabTable: True for tab-separated output, False for a PrettyTable
                sorted by the unique marker column in descending order.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            arbitraryBinId = list(binIdToTaxonomy.keys())[0]
            markerCountLabel = '# unique markers (of %d)' % len(
                resultsParser.models[arbitraryBinId])
            header = ['Bin Id', markerCountLabel, "# multi-copy"]
            header += [
                'Insertion branch UID', 'Taxonomy (contained)',
                'Taxonomy (sister lineage)'
            ]
            header += [
                'GC', 'Genome size (Mbp)', 'Gene count', 'Coding density',
                'Translation table'
            ]
            header += [
                '# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std'
            ]
            header += [
                'Lineage: genome size (Mbp) mean',
                'Lineage: genome size (Mbp) std'
            ]
            header += ['Lineage: gene count mean', 'Lineage: gene count std']

            if bTabTable:
                pTable = None
                print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.float_format['GC'] = '.1'
                pTable.float_format['Lineage: GC mean'] = '.1'
                pTable.float_format['Lineage: GC std'] = '.1'
                pTable.float_format['Lineage: gene count mean'] = '.0'
                pTable.float_format['Lineage: gene count std'] = '.0'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.align['Insertion branch UID'] = 'l'
                pTable.align['Taxonomy (contained)'] = 'l'
                pTable.align['Taxonomy (sister lineage)'] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sorted(binIdToTaxonomy.keys()):
                uniqueHits, multiCopyHits = resultsParser.results[
                    binId].countUniqueHits()

                # drop from the sister lineage every rank it shares with the
                # bin's own taxonomy, leaving only the distinguishing part
                truncSisterLineage = binIdToSisterTaxonomy[binId]
                for taxa in binIdToTaxonomy[binId].split(';'):
                    truncSisterLineage = truncSisterLineage.replace(
                        taxa + ';', '')

                if len(truncSisterLineage) == 0:
                    truncSisterLineage = 'unresolved'
                elif truncSisterLineage[-1] == ';':
                    truncSisterLineage = truncSisterLineage[0:-1]

                stats = binStats[binId]
                row = [binId, uniqueHits, multiCopyHits]
                row += [
                    binIdToUID[binId], binIdToTaxonomy[binId],
                    truncSisterLineage
                ]
                row += [stats['GC'] * 100]
                row += [float(stats['Genome size']) / 1e6]
                row += [stats['# predicted genes']]
                row += [stats['Coding density']]
                row += [stats['Translation table']]

                lineageStats = binIdToLineageStatistics[binId]
                row += [lineageStats['# genomes']]
                row += [lineageStats['gc mean']]
                row += [lineageStats['gc std']]
                row += [lineageStats['genome size mean']]
                row += [lineageStats['genome size std']]
                row += [lineageStats['gene count mean']]
                row += [lineageStats['gene count std']]

                if bTabTable:
                    print('\t'.join(map(str, row)))
                else:
                    pTable.add_row(row)

            if not bTabTable:
                print(pTable.get_string(sortby=markerCountLabel,
                                        reversesort=True))
        finally:
            # restore stdout even if a per-bin lookup raises
            restoreStdOut(outFile, oldStdOut)
Пример #14
0
    def __printFullTable(self, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile):
        """Print the extended per-bin table with lineage and genome statistics.

        Args:
            binIdToTaxonomy: Non-empty mapping of bin id to a ';'-separated
                taxonomy string; its keys define the bins reported.
            binIdToSisterTaxonomy: Mapping of bin id to the sister lineage's
                taxonomy string.
            binIdToLineageStatistics: Mapping of bin id to a dict with keys
                '# genomes', 'gc mean', 'gc std', 'genome size mean',
                'genome size std', 'gene count mean', 'gene count std'.
            resultsParser: Provides ``models[binId]`` and
                ``results[binId].countUniqueHits()``.
            binStats: Mapping of bin id to a dict with keys 'GC',
                'Genome size', '# predicted genes', 'Coding density' and
                'Translation table'.
            bTabTable: True for tab-separated output, False for a PrettyTable
                sorted by the unique marker column in descending order.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
        """
        # redirect output
        oldStdOut = reassignStdOut(outFile)
        try:
            # list() is needed because Python 3 key views are not subscriptable
            arbitraryBinId = list(binIdToTaxonomy.keys())[0]
            markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
            header = ['Bin Id', markerCountLabel, "# multi-copy"]
            header += ['Taxonomy (contained)', 'Taxonomy (sister lineage)']
            header += ['GC', 'Genome size (Mbp)', 'Gene count', 'Coding density', 'Translation table']
            header += ['# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std']
            header += ['Lineage: genome size (Mbp) mean', 'Lineage: genome size (Mbp) std']
            header += ['Lineage: gene count mean', 'Lineage: gene count std']

            if bTabTable:
                pTable = None
                print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.float_format['GC'] = '.1'
                pTable.float_format['Lineage: GC mean'] = '.1'
                pTable.float_format['Lineage: GC std'] = '.1'
                pTable.float_format['Lineage: gene count mean'] = '.0'
                pTable.float_format['Lineage: gene count std'] = '.0'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.align['Taxonomy (contained)'] = 'l'
                pTable.align['Taxonomy (sister lineage)'] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sorted(binIdToTaxonomy.keys()):
                uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()

                # drop from the sister lineage every rank it shares with the
                # bin's own taxonomy, leaving only the distinguishing part
                truncSisterLineage = binIdToSisterTaxonomy[binId]
                for taxa in binIdToTaxonomy[binId].split(';'):
                    truncSisterLineage = truncSisterLineage.replace(taxa + ';', '')

                if len(truncSisterLineage) == 0:
                    truncSisterLineage = 'unresolved'
                elif truncSisterLineage[-1] == ';':
                    truncSisterLineage = truncSisterLineage[0:-1]

                stats = binStats[binId]
                row = [binId, uniqueHits, multiCopyHits]
                row += [binIdToTaxonomy[binId], truncSisterLineage]
                row += [stats['GC'] * 100]
                row += [float(stats['Genome size']) / 1e6]
                row += [stats['# predicted genes']]
                row += [stats['Coding density']]
                row += [stats['Translation table']]

                lineageStats = binIdToLineageStatistics[binId]
                row += [lineageStats['# genomes']]
                row += [lineageStats['gc mean']]
                row += [lineageStats['gc std']]
                row += [lineageStats['genome size mean']]
                row += [lineageStats['genome size std']]
                row += [lineageStats['gene count mean']]
                row += [lineageStats['gene count std']]

                if bTabTable:
                    print('\t'.join(map(str, row)))
                else:
                    pTable.add_row(row)

            if not bTabTable:
                print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
        finally:
            # restore stdout even if a per-bin lookup raises
            restoreStdOut(outFile, oldStdOut)
Пример #15
0
    def run(self, coverageFile, outFile, bTabTable):
        """Print a community profile computed from a coverage table.

        The coverage file is tab-separated with a header line. Each data line
        holds a sequence id, bin id and sequence length, followed by repeating
        (bam id, coverage, mapped reads) triples. For every bin and BAM id the
        output reports the mapped read count, the percentage of that BAM's
        mapped reads, the share among binned populations (read fraction
        divided by bin size, normalised over all binned bins) and that share
        scaled by the fraction of reads that are not unbinned.

        Args:
            coverageFile: Path of the coverage table to read.
            outFile: Forwarded to ``reassignStdOut``/``restoreStdOut``.
            bTabTable: True for tab-separated output, False for a PrettyTable.
        """
        checkFileExists(coverageFile)

        # get number of reads mapped to each bin
        self.logger.info('Determining number of reads mapped to each bin.')

        readsMappedToBin = {}
        binSize = {}
        totalMappedReads = {}
        with open(coverageFile) as fin:
            next(fin, None)  # skip the header line
            for line in fin:
                lineSplit = line.split('\t')

                # seqId = lineSplit[0]
                binId = lineSplit[1]

                seqLen = int(lineSplit[2])
                binSize[binId] = binSize.get(binId, 0) + seqLen

                binReads = readsMappedToBin.setdefault(binId, {})

                # fields from index 3 on come in (bam id, coverage, mapped
                # reads) triples; only the id and the read count are used
                for i in range(3, len(lineSplit), 3):
                    bamId = lineSplit[i]
                    mappedReads = int(lineSplit[i + 2])

                    totalMappedReads[bamId] = totalMappedReads.get(
                        bamId, 0) + mappedReads
                    binReads[bamId] = binReads.get(bamId, 0) + mappedReads

        # calculate percentage of mapped reads to binned populations
        perMappedReads = {}
        normBinCoverage = {}
        sumNormBinCoverage = {}
        for binId, bamIds in readsMappedToBin.items():
            perMappedReads[binId] = {}
            normBinCoverage[binId] = {}

            for bamId in bamIds:
                # a BAM file without any mapped reads would otherwise raise
                # ZeroDivisionError; report it as 0% instead
                totalReads = totalMappedReads[bamId]
                if totalReads:
                    perMR = float(bamIds[bamId]) / totalReads
                else:
                    perMR = 0.0
                perMappedReads[binId][bamId] = perMR

                if binId == DefaultValues.UNBINNED:
                    continue

                normCoverage = perMR / binSize[binId]
                normBinCoverage[binId][bamId] = normCoverage
                sumNormBinCoverage[bamId] = sumNormBinCoverage.get(
                    bamId, 0) + normCoverage

        for binId, bamIds in normBinCoverage.items():
            for bamId in bamIds:
                if sumNormBinCoverage[bamId] != 0:
                    normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
                else:
                    normBinCoverage[binId][bamId] = 0

        # write community profile
        oldStdOut = reassignStdOut(outFile)
        try:
            sortedBinIds = sorted(readsMappedToBin.keys())
            # BAM ids are taken from the first bin; with no data rows there
            # are none, and only the two fixed header columns are printed
            if sortedBinIds:
                sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]].keys())
            else:
                sortedBamIds = []

            header = ['Bin Id', 'Bin size (Mbp)']
            for bamId in sortedBamIds:
                header += [bamId + ': mapped reads']
                header += [bamId + ': % mapped reads']
                header += [bamId + ': % binned populations']
                header += [bamId + ': % community']

            pTable = None
            if bTabTable:
                print('\t'.join(header))
            else:
                pTable = prettytable.PrettyTable(header)
                pTable.float_format = '.2'
                pTable.align = 'c'
                pTable.align[header[0]] = 'l'
                pTable.hrules = prettytable.FRAME
                pTable.vrules = prettytable.NONE

            for binId in sortedBinIds:
                row = [binId]
                row += [float(binSize[binId]) / 1e6]

                for bamId in sortedBamIds:
                    row += [readsMappedToBin[binId][bamId]]
                    row += [perMappedReads[binId][bamId] * 100.0]

                    if DefaultValues.UNBINNED in perMappedReads:
                        unbinnedPercentage = perMappedReads[
                            DefaultValues.UNBINNED][bamId]
                    else:
                        unbinnedPercentage = 0

                    if binId == DefaultValues.UNBINNED:
                        row += ['NA']
                        row += [unbinnedPercentage * 100.0]
                    else:
                        row += [normBinCoverage[binId][bamId] * 100.0]
                        row += [
                            normBinCoverage[binId][bamId] * 100.0 *
                            (1.0 - unbinnedPercentage)
                        ]

                if bTabTable:
                    print('\t'.join(map(str, row)))
                else:
                    pTable.add_row(row)

            if not bTabTable:
                print(pTable.get_string())
        finally:
            # restore stdout even if a lookup above raises
            restoreStdOut(outFile, oldStdOut)