Exemplo n.º 1
0
    def __getUniversalMarkerGenes(self, phyloUbiquityThreshold,
                                  phyloSingleCopyThreshold, outputGeneDir):
        """Identify marker genes common to the Archaea and Bacteria lineages.

        For each lineage, the genome IDs reported by IMG are collected and a
        marker gene set is built from them. Lineages without any genomes are
        skipped. The universal set is the intersection of the per-lineage
        sets; its string form is written to 'phylo_marker_set.txt' inside
        outputGeneDir.

        Args:
            phyloUbiquityThreshold: ubiquity threshold forwarded to
                MarkerSetBuilder.buildMarkerGenes.
            phyloSingleCopyThreshold: single-copy threshold forwarded to
                MarkerSetBuilder.buildMarkerGenes.
            outputGeneDir: directory receiving 'phylo_marker_set.txt'.

        Returns:
            Tuple (allTrustedGenomeIds, universalMarkerGenes): the genome IDs
            of every processed lineage, and the marker genes shared by all
            processed lineages (an empty set if every lineage was skipped).
        """
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        markerSetBuilder = MarkerSetBuilder()

        metadata = img.genomeMetadata()

        allTrustedGenomeIds = set()
        phyloMarkerGenes = {}
        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage
            print('\nIdentifying all ' + lineage + ' genomes.')
            trustedGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata)
            print('  Trusted genomes in lineage: ' +
                  str(len(trustedGenomeIds)))
            if len(trustedGenomeIds) < 1:
                print(
                    '  Skipping lineage due to insufficient number of genomes.'
                )
                continue

            allTrustedGenomeIds.update(trustedGenomeIds)

            print('  Building marker set.')
            markerGenes = markerSetBuilder.buildMarkerGenes(
                trustedGenomeIds, phyloUbiquityThreshold,
                phyloSingleCopyThreshold)
            phyloMarkerGenes[lineage] = markerGenes

        # universal marker genes: intersect the per-lineage sets
        # NOTE: the first lineage's set is narrowed in place
        universalMarkerGenes = None
        for markerGenes in phyloMarkerGenes.values():
            if universalMarkerGenes is None:
                universalMarkerGenes = markerGenes
            else:
                universalMarkerGenes.intersection_update(markerGenes)

        # every lineage was skipped: report an empty set instead of failing
        # on len(None) below
        if universalMarkerGenes is None:
            universalMarkerGenes = set()

        # the context manager closes the file even if the write fails
        with open(os.path.join(outputGeneDir, 'phylo_marker_set.txt'),
                  'w') as fout:
            fout.write(str(universalMarkerGenes))

        print('')
        print('  Universal marker genes: ' + str(len(universalMarkerGenes)))

        return allTrustedGenomeIds, universalMarkerGenes
    def __getUniversalMarkerGenes(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, outputGeneDir):
        """Identify marker genes common to the Archaea and Bacteria lineages.

        A marker gene set is built for each lineage that has at least one
        genome in IMG. The universal set is the intersection of these
        per-lineage sets and is written, in string form, to
        'phylo_marker_set.txt' inside outputGeneDir.

        Args:
            phyloUbiquityThreshold: ubiquity threshold forwarded to
                MarkerSetBuilder.buildMarkerGenes.
            phyloSingleCopyThreshold: single-copy threshold forwarded to
                MarkerSetBuilder.buildMarkerGenes.
            outputGeneDir: directory receiving 'phylo_marker_set.txt'.

        Returns:
            Tuple (allTrustedGenomeIds, universalMarkerGenes): the genome IDs
            of every processed lineage, and the marker genes shared by all
            processed lineages (an empty set if every lineage was skipped).
        """
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        markerSetBuilder = MarkerSetBuilder()

        metadata = img.genomeMetadata()

        allTrustedGenomeIds = set()
        phyloMarkerGenes = {}
        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage
            # (single-argument print(...) behaves the same on Python 2 and 3)
            print('\nIdentifying all ' + lineage + ' genomes.')
            trustedGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata)
            print('  Trusted genomes in lineage: ' + str(len(trustedGenomeIds)))
            if len(trustedGenomeIds) < 1:
                print('  Skipping lineage due to insufficient number of genomes.')
                continue

            allTrustedGenomeIds.update(trustedGenomeIds)

            print('  Building marker set.')
            markerGenes = markerSetBuilder.buildMarkerGenes(trustedGenomeIds, phyloUbiquityThreshold, phyloSingleCopyThreshold)
            phyloMarkerGenes[lineage] = markerGenes

        # universal marker genes: intersect the per-lineage sets
        # NOTE: the first lineage's set is narrowed in place
        universalMarkerGenes = None
        for markerGenes in phyloMarkerGenes.values():
            if universalMarkerGenes is None:
                universalMarkerGenes = markerGenes
            else:
                universalMarkerGenes.intersection_update(markerGenes)

        # every lineage was skipped: report an empty set instead of failing
        # on len(None) below
        if universalMarkerGenes is None:
            universalMarkerGenes = set()

        # the context manager closes the file even if the write fails
        with open(os.path.join(outputGeneDir, 'phylo_marker_set.txt'), 'w') as fout:
            fout.write(str(universalMarkerGenes))

        print('')
        print('  Universal marker genes: ' + str(len(universalMarkerGenes)))

        return allTrustedGenomeIds, universalMarkerGenes
Exemplo n.º 3
0
 def __init__(self):
     """Create the IMG accessor, record the Pfam HMM path and create the marker set builder."""
     # fixed locations of the IMG metadata table and the TIGRFAM-to-Pfam table
     imgMetadataFile = '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv'
     tigrfamToPfamFile = '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv'

     self.img = IMG(imgMetadataFile, tigrfamToPfamFile)
     self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
     self.markerSetBuilder = MarkerSetBuilder()
Exemplo n.º 4
0
class DecorateTree(object):
    """Label the internal nodes of a tree and tabulate per-node statistics.

    For every internal node of the input tree a marker set is calculated from
    the genomes below the node, summary statistics are written to a
    tab-separated table, and the node is relabelled as
    'UID<n>|<taxonomy>|<bootstrap>'.
    """

    def __init__(self):
        """Create the IMG accessor and marker set builder from fixed database paths."""
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
        """Return (mean, std) of a metadata field over the given genomes.

        An 'IMG_' prefix in a genome label is removed before the metadata
        lookup. Values equal to 'NA' are left out of the calculation.
        """
        values = []
        for genomeId in genomeIds:
            value = metadata[genomeId.replace('IMG_', '')][category]
            if value != 'NA':
                values.append(value)

        return mean(values), std(values)

    def __calculateMarkerSet(self, genomeLabels, ubiquityThreshold=0.97, singleCopyThreshold=0.97):
        """Calculate marker set for a set of genomes."""

        # get genome IDs from genome labels
        genomeIds = set(genomeLabel.replace('IMG_', '') for genomeLabel in genomeLabels)

        markerSet = self.markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
        """Map Pfam ids (e.g. 'PF00001') to versioned accessions (e.g. 'PF00001.5').

        The accessions are read from the 'ACC' records of the HMM file at
        self.pfamHMMs. The img argument is unused and kept only so existing
        callers keep working.
        """
        pfamIdToPfamAcc = {}
        with open(self.pfamHMMs) as hmmFile:
            for line in hmmFile:
                # only accept records whose tag is exactly 'ACC'; other lines
                # (e.g. names or descriptions) may merely contain that text
                fields = line.split()
                if len(fields) > 1 and fields[0] == 'ACC':
                    acc = fields[1]
                    pfamIdToPfamAcc[acc.split('.')[0]] = acc

        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut, numThreads):
        """Decorate the input tree and write statistics for its internal nodes.

        Args:
            taxaTreeFile: newick tree whose internal node labels supply taxonomy.
            derepFile: whitespace-separated file; the first field of a line is
                a taxon and the remaining fields are its duplicate taxa.
            inputTreeFile: newick tree to decorate; it is overwritten with the
                relabelled tree.
            metadataOut: path of the tab-separated statistics table to write.
            numThreads: number of worker processes to start.
        """
        # read genome metadata
        print('  Reading metadata.')
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print('  Read list of taxa with duplicate sequences.')
        duplicateTaxa = {}
        with open(derepFile) as derepIn:
            for line in derepIn:
                lineSplit = line.rstrip().split()
                if len(lineSplit) > 1:
                    duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # report number of genomes; the metadata loaded above is reused
        # instead of being read a second time
        print('  Building gene count table.')
        print('    # trusted genomes = ' + str(len(metadata)))

        # calculate statistics for each internal node using multiple processes
        print('  Calculating statistics for each internal node.')
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads):
        """Distribute the internal nodes over worker processes and collect the results.

        numThreads worker processes run __processInternalNode and a single
        writer process runs __reportStatistics.
        """

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # queue every internal node together with a 1-based unique id
        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        # one end-of-work sentinel per worker
        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [mp.Process(target=self.__processInternalNode, args=(taxaTree, duplicateTaxa, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__reportStatistics, args=(metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the writer to finish
        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn, queueOut):
        """Process internal nodes from queueIn until a (None, None) sentinel arrives.

        For each node, the labels of its leaves (plus any duplicate taxa) are
        gathered, the taxonomy label of their MRCA in taxaTree is looked up,
        a marker set is calculated, and the result is put on queueOut.
        """

        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId is None:
                break

            # find corresponding internal node in taxa tree
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    labels.extend(duplicateTaxa[leaf.taxon.label])

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels=labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique Id while retaining bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap, node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Write statistics for each processed node and relabel the input tree.

        Results are read from writerQueue until an entry with a None unique id
        arrives. One table row is written to metadataOut per result, and the
        relabelled tree is finally written back to inputTreeFile.
        """

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())

        # the context manager closes the table even if a row fails to be written
        with open(metadataOut, 'w') as fout:
            fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
            fout.write('\tGC mean\tGC std')
            fout.write('\tGenome size mean\tGenome size std')
            fout.write('\tGene count mean\tGene count std')
            fout.write('\tMarker set')
            fout.write('\n')

            while True:
                uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(block=True, timeout=None)
                if uniqueId is None:
                    break

                # progress line, rewritten in place via the carriage return
                numProcessedNodes += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (numProcessedNodes, numInternalNodes, float(numProcessedNodes) * 100 / numInternalNodes)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' + taxaStr + '\t' + bootstrap)

                # GC mean and std are scaled by 100 before being written
                m, s = self.__meanStd(metadata, labels, 'GC %')
                fout.write('\t' + str(m * 100) + '\t' + str(s * 100))

                m, s = self.__meanStd(metadata, labels, 'genome size')
                fout.write('\t' + str(m) + '\t' + str(s))

                m, s = self.__meanStd(metadata, labels, 'gene count')
                fout.write('\t' + str(m) + '\t' + str(s))

                # change model names to accession numbers, and make
                # sure there is an HMM model for each PFAM
                mungedMarkerSets = []
                for geneSet in markerSet:
                    mungedSet = set()
                    for geneId in geneSet:
                        if 'pfam' in geneId:
                            pfamId = geneId.replace('pfam', 'PF')
                            if pfamId in pfamIdToPfamAcc:
                                mungedSet.add(pfamIdToPfamAcc[pfamId])
                        else:
                            mungedSet.add(geneId)
                    mungedMarkerSets.append(mungedSet)

                fout.write('\t' + str(mungedMarkerSets))

                fout.write('\n')

                # relabel the matching node in this process's copy of the tree
                node = inputTree.find_node(filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
                node.label = nodeLabel

            sys.stdout.write('\n')

        inputTree.write_to_path(inputTreeFile, schema='newick', suppress_rooting=True, unquoted_underscores=True)
Exemplo n.º 5
0
 def __init__(self):
     """Set up the IMG accessor, the Pfam HMM file path and the marker set builder."""
     # the two tables handed to IMG, in the order its constructor receives them
     imgTables = ('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

     self.img = IMG(*imgTables)
     self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
     self.markerSetBuilder = MarkerSetBuilder()
Exemplo n.º 6
0
class DecorateTree(object):
    """Label the internal nodes of a tree and tabulate per-node statistics.

    For every internal node of the input tree a marker set is calculated from
    the genomes below the node, summary statistics are written to a
    tab-separated table, and the node is relabelled as
    'UID<n>|<taxonomy>|<bootstrap>'.
    """

    def __init__(self):
        """Create the IMG accessor and marker set builder from fixed paths."""
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                       '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
        """Return (mean, std) of a metadata field over the given genomes.

        An 'IMG_' prefix in a genome label is removed before the metadata
        lookup. Values equal to 'NA' are left out of the calculation.
        """
        values = []
        for genomeId in genomeIds:
            value = metadata[genomeId.replace('IMG_', '')][category]
            if value != 'NA':
                values.append(value)

        return mean(values), std(values)

    def __calculateMarkerSet(self,
                             genomeLabels,
                             ubiquityThreshold=0.97,
                             singleCopyThreshold=0.97):
        """Calculate marker set for a set of genomes."""

        # get genome IDs from genome labels
        genomeIds = set(
            genomeLabel.replace('IMG_', '') for genomeLabel in genomeLabels)

        markerSet = self.markerSetBuilder.buildMarkerSet(
            genomeIds, ubiquityThreshold, singleCopyThreshold)

        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
        """Map Pfam ids ('PF00001') to versioned accessions ('PF00001.5').

        The accessions are read from the 'ACC' records of the HMM file at
        self.pfamHMMs. The img argument is unused and kept only so existing
        callers keep working.
        """
        pfamIdToPfamAcc = {}
        with open(self.pfamHMMs) as hmmFile:
            for line in hmmFile:
                # only accept records whose tag is exactly 'ACC'; other lines
                # (e.g. names or descriptions) may merely contain that text
                fields = line.split()
                if len(fields) > 1 and fields[0] == 'ACC':
                    acc = fields[1]
                    pfamIdToPfamAcc[acc.split('.')[0]] = acc

        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut,
                 numThreads):
        """Decorate the input tree and write statistics for its internal nodes.

        Args:
            taxaTreeFile: newick tree whose internal node labels supply
                taxonomy.
            derepFile: whitespace-separated file; the first field of a line
                is a taxon and the remaining fields are its duplicate taxa.
            inputTreeFile: newick tree to decorate; it is overwritten with
                the relabelled tree.
            metadataOut: path of the tab-separated statistics table to write.
            numThreads: number of worker processes to start.
        """
        # read genome metadata
        print('  Reading metadata.')
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print('  Read list of taxa with duplicate sequences.')
        duplicateTaxa = {}
        with open(derepFile) as derepIn:
            for line in derepIn:
                lineSplit = line.rstrip().split()
                if len(lineSplit) > 1:
                    duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # report number of genomes; the metadata loaded above is reused
        # instead of being read a second time
        print('  Building gene count table.')
        print('    # trusted genomes = ' + str(len(metadata)))

        # calculate statistics for each internal node using multiple processes
        print('  Calculating statistics for each internal node.')
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile,
                                      duplicateTaxa, metadata, metadataOut,
                                      numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile,
                                 duplicateTaxa, metadata, metadataOut,
                                 numThreads):
        """Distribute the internal nodes over workers and collect the results.

        numThreads worker processes run __processInternalNode and a single
        writer process runs __reportStatistics.
        """

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile,
                                               schema='newick',
                                               as_rooted=True,
                                               preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile,
                                                schema='newick',
                                                as_rooted=True,
                                                preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # queue every internal node together with a 1-based unique id
        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        # one end-of-work sentinel per worker
        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [
            mp.Process(target=self.__processInternalNode,
                       args=(taxaTree, duplicateTaxa, workerQueue,
                             writerQueue)) for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__reportStatistics,
                               args=(metadata, metadataOut, inputTree,
                                     inputTreeFile, pfamIdToPfamAcc,
                                     writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the writer to finish
        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn,
                              queueOut):
        """Process internal nodes from queueIn until a sentinel arrives.

        For each node, the labels of its leaves (plus any duplicate taxa) are
        gathered, the taxonomy label of their MRCA in taxaTree is looked up,
        a marker set is calculated, and the result is put on queueOut. A
        (None, None) entry ends the loop.
        """

        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId is None:
                break

            # find corresponding internal node in taxa tree
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    labels.extend(duplicateTaxa[leaf.taxon.label])

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels=labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique Id while retaining bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap,
                          node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree,
                           inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Write statistics for each processed node and relabel the tree.

        Results are read from writerQueue until an entry with a None unique
        id arrives. One table row is written to metadataOut per result, and
        the relabelled tree is finally written back to inputTreeFile.
        """

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())

        # the context manager closes the table even if a row fails
        with open(metadataOut, 'w') as fout:
            fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
            fout.write('\tGC mean\tGC std')
            fout.write('\tGenome size mean\tGenome size std')
            fout.write('\tGene count mean\tGene count std')
            fout.write('\tMarker set')
            fout.write('\n')

            while True:
                uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(
                    block=True, timeout=None)
                if uniqueId is None:
                    break

                # progress line, rewritten in place via the carriage return
                numProcessedNodes += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (
                    numProcessedNodes, numInternalNodes,
                    float(numProcessedNodes) * 100 / numInternalNodes)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) +
                           '\t' + taxaStr + '\t' + bootstrap)

                # GC mean and std are scaled by 100 before being written
                m, s = self.__meanStd(metadata, labels, 'GC %')
                fout.write('\t' + str(m * 100) + '\t' + str(s * 100))

                m, s = self.__meanStd(metadata, labels, 'genome size')
                fout.write('\t' + str(m) + '\t' + str(s))

                m, s = self.__meanStd(metadata, labels, 'gene count')
                fout.write('\t' + str(m) + '\t' + str(s))

                # change model names to accession numbers, and make
                # sure there is an HMM model for each PFAM
                mungedMarkerSets = []
                for geneSet in markerSet:
                    mungedSet = set()
                    for geneId in geneSet:
                        if 'pfam' in geneId:
                            pfamId = geneId.replace('pfam', 'PF')
                            if pfamId in pfamIdToPfamAcc:
                                mungedSet.add(pfamIdToPfamAcc[pfamId])
                        else:
                            mungedSet.add(geneId)
                    mungedMarkerSets.append(mungedSet)

                fout.write('\t' + str(mungedMarkerSets))

                fout.write('\n')

                # relabel the matching node in this process's copy of the tree
                node = inputTree.find_node(
                    filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
                node.label = nodeLabel

            sys.stdout.write('\n')

        inputTree.write_to_path(inputTreeFile,
                                schema='newick',
                                suppress_rooting=True,
                                unquoted_underscores=True)