Example #1
    def __getUniversalMarkerGenes(self, phyloUbiquityThreshold,
                                  phyloSingleCopyThreshold, outputGeneDir):
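        """Identify marker genes common to both Archaea and Bacteria and write their intersection to 'phylo_marker_set.txt' in outputGeneDir."""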
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        markerSetBuilder = MarkerSetBuilder()

        metadata = img.genomeMetadata()

        allTrustedGenomeIds = set()
        phyloMarkerGenes = {}
        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage
            print('\nIdentifying all ' + lineage + ' genomes.')
            trustedGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata)
            print('  Trusted genomes in lineage: ' +
                  str(len(trustedGenomeIds)))
            if len(trustedGenomeIds) < 1:
                print(
                    '  Skipping lineage due to insufficient number of genomes.'
                )
                continue

            allTrustedGenomeIds.update(trustedGenomeIds)

            print('  Building marker set.')
            markerGenes = markerSetBuilder.buildMarkerGenes(
                trustedGenomeIds, phyloUbiquityThreshold,
                phyloSingleCopyThreshold)
            phyloMarkerGenes[lineage] = markerGenes

            #print lineage
            #print len(markerGenes)
            #print 'pfam01379: ', ('pfam01379' in markerGenes)
            #print '--------------------'

        # universal marker genes
        universalMarkerGenes = None
        for markerGenes in list(phyloMarkerGenes.values()):
            if universalMarkerGenes is None:
                universalMarkerGenes = markerGenes
            else:
                universalMarkerGenes.intersection_update(markerGenes)

        fout = open(os.path.join(outputGeneDir, 'phylo_marker_set.txt'), 'w')
        fout.write(str(universalMarkerGenes))
        fout.close()

        print('')
        print('  Universal marker genes: ' + str(len(universalMarkerGenes)))

        return allTrustedGenomeIds, universalMarkerGenes
Example #2
    def run(self, outputDir, ubiquityThreshold, singleCopyThreshold,
            minGenomes, colocatedDistThreshold, colocatedGenomeThreshold,
            threads):
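        """Process each lineage (plus a universal set) in parallel using worker processes and consolidate results with a single writer process."""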
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # determine lineages to process
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        metadata = img.genomeMetadata()
        lineages = img.lineagesSorted(metadata)
        lineages.append('Universal')

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(img)

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for lineage in lineages:
            workerQueue.put(lineage)

        for _ in range(threads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(target=self.__workerThread,
                       args=(ubiquityThreshold, singleCopyThreshold,
                             minGenomes, colocatedDistThreshold,
                             colocatedGenomeThreshold, metadata, workerQueue,
                             writerQueue)) for _ in range(threads)
        ]
        writeProc = mp.Process(
            target=self.__writerThread,
            args=(pfamIdToPfamAcc, ubiquityThreshold, singleCopyThreshold,
                  colocatedDistThreshold, colocatedGenomeThreshold, outputDir,
                  len(lineages), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None))
        writeProc.join()
Example #5
class DecorateTree(object):
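    """Decorate internal nodes of a genome tree with genome statistics and lineage-specific marker sets."""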
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
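        """Calculate the mean and standard deviation of a metadata field over a set of genomes, ignoring 'NA' values."""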
        values = []
        for genomeId in genomeIds:
            genomeId = genomeId.replace('IMG_', '')
            v = metadata[genomeId][category]
            if v != 'NA':
                values.append(v)

        return mean(values), std(values)

    def __calculateMarkerSet(self, genomeLabels, ubiquityThreshold=0.97, singleCopyThreshold=0.97):
        """Calculate marker set for a set of genomes."""

        # get genome IDs from genome labels
        genomeIds = set()
        for genomeLabel in genomeLabels:
            genomeIds.add(genomeLabel.replace('IMG_', ''))

        markerSet = self.markerSetBuilder.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
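        """Map Pfam identifiers to their versioned accessions by parsing the Pfam HMM file."""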
        pfamIdToPfamAcc = {}
        for line in open(self.pfamHMMs):
            if 'ACC' in line:
                acc = line.split()[1].strip()
                pfamId = acc.split('.')[0]

                pfamIdToPfamAcc[pfamId] = acc

        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut, numThreads):
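        """Calculate statistics for each internal node of the input tree and write them to the metadata file."""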
        # read genome metadata
        print('  Reading metadata.')
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print('  Reading list of taxa with duplicate sequences.')
        duplicateTaxa = {}
        for line in open(derepFile):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # determine number of trusted genomes
        print('  Determining number of trusted genomes.')
        genomeIds = metadata.keys()
        print('    # trusted genomes = ' + str(len(genomeIds)))

        # calculate statistics for each internal node using multiple threads
        print('  Calculating statistics for each internal node.')
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile, duplicateTaxa, metadata, metadataOut, numThreads):
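        """Calculate statistics for each internal node using multiple threads."""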

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [mp.Process(target=self.__processInternalNode, args=(taxaTree, duplicateTaxa, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__reportStatistics, args=(metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn, queueOut):
        """Run each marker gene in a separate thread."""

        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId is None:
                break

            # find corresponding internal node in taxa tree
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        labels.append(genomeId)

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels=labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique ID while retaining its bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap, node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree, inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Store statistics for internal node."""

        fout = open(metadataOut, 'w')
        fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
        fout.write('\tGC mean\tGC std')
        fout.write('\tGenome size mean\tGenome size std')
        fout.write('\tGene count mean\tGene count std')
        fout.write('\tMarker set')
        fout.write('\n')

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())
        while True:
            uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(block=True, timeout=None)
            if uniqueId is None:
                break

            numProcessedNodes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (numProcessedNodes, numInternalNodes, float(numProcessedNodes) * 100 / numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' + taxaStr + '\t' + bootstrap)

            m, s = self.__meanStd(metadata, labels, 'GC %')
            fout.write('\t' + str(m * 100) + '\t' + str(s * 100))

            m, s = self.__meanStd(metadata, labels, 'genome size')
            fout.write('\t' + str(m) + '\t' + str(s))

            m, s = self.__meanStd(metadata, labels, 'gene count')
            fout.write('\t' + str(m) + '\t' + str(s))

            # change model names to accession numbers, and make
            # sure there is an HMM model for each PFAM
            mungedMarkerSets = []
            for geneSet in markerSet:
                s = set()
                for geneId in geneSet:
                    if 'pfam' in geneId:
                        pfamId = geneId.replace('pfam', 'PF')
                        if pfamId in pfamIdToPfamAcc:
                            s.add(pfamIdToPfamAcc[pfamId])
                    else:
                        s.add(geneId)
                mungedMarkerSets.append(s)

            fout.write('\t' + str(mungedMarkerSets))

            fout.write('\n')

            node = inputTree.find_node(filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
            node.label = nodeLabel

        sys.stdout.write('\n')

        fout.close()

        inputTree.write_to_path(inputTreeFile, schema='newick', suppress_rooting=True, unquoted_underscores=True)
class PlotScaffoldLenVsMarkers(object):
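    """Plot scaffold length against the density of marker genes on each scaffold of draft archaeal genomes."""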
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        

    def run(self):
        # get all draft genomes consisting of a user-specified minimum number of scaffolds
        print('')
        metadata = self.img.genomeMetadata()
        print('  Total genomes: %d' % len(metadata))
        
        arGenome = set()
        for genomeId in metadata:
            if metadata[genomeId]['taxonomy'][0] == 'Archaea':
                arGenome.add(genomeId)
                
        draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
        print('  Number of draft genomes: %d' % len(draftGenomeIds))
        
        minScaffolds = 20
        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)
        print('  Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest)))

        print('')
        print('  Pre-computing genome information for calculating marker sets:')
        genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)
        
        print('  Calculating genome sequence lengths.')
        genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)
        
        print('  Determining domain-specific marker sets.')
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()
        bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
        arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
        print('    There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers)))
        
        print('  Determining percentage of markers on each scaffold.')
        totalMarkers = 0
        totalSequenceLen = 0
        markersOnShortScaffolds = 0
        totalShortScaffoldLen = 0
        
        scaffoldLen = {}
        percentageMarkers = defaultdict(float)
        for genomeId, markerIds in genomeFamilyScaffolds.items():
            domain = metadata[genomeId]['taxonomy'][0]
            markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers
            for markerId in markerGenes:
                if markerId.startswith('PF'):
                    markerId = markerId.replace('PF', 'pfam')
                    markerId = markerId[0:markerId.rfind('.')]
                if markerId in markerIds:
                    for scaffoldId in markerIds[markerId]:
                        scaffoldLen[scaffoldId] = genomeSeqLens[genomeId][scaffoldId]
                        percentageMarkers[scaffoldId] += 1.0/len(markerGenes)
                        
                        totalMarkers += 1
                        totalSequenceLen += genomeSeqLens[genomeId][scaffoldId]
                        
                        if genomeSeqLens[genomeId][scaffoldId] < 10000:
                            markersOnShortScaffolds += 1
                            totalShortScaffoldLen += genomeSeqLens[genomeId][scaffoldId]
       
        print('Markers on short scaffolds: %d over %d bp (%f markers per base)' % (markersOnShortScaffolds, totalShortScaffoldLen, float(markersOnShortScaffolds)/totalShortScaffoldLen))
        print('Total markers on scaffolds: %d over %d bp (%f markers per base)' % (totalMarkers, totalSequenceLen, float(totalMarkers)/totalSequenceLen))
                        
        print('  Creating plot.')
        plotLens = []
        plotPerMarkers = []
        for scaffoldId in percentageMarkers:
            plotLens.append(scaffoldLen[scaffoldId])
            plotPerMarkers.append(percentageMarkers[scaffoldId]/scaffoldLen[scaffoldId] * 1e6)
            
        scatterPlot = ScatterPlot()
        scatterPlot.plot(plotLens, plotPerMarkers)     
        scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')
Example #7
    def run(self, geneTreeDir, treeExtension, consistencyThreshold, minTaxaForAverage, outputFile, outputDir):
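        """Assess the taxonomic consistency of each gene tree at the domain, phylum and class ranks and copy trees meeting the consistency threshold to the output directory."""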
        # make sure output directory is empty
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        files = os.listdir(outputDir)
        for f in files:
            if os.path.isfile(os.path.join(outputDir, f)):
                os.remove(os.path.join(outputDir, f))

        # get TIGRFam info
        descDict = {}
        files = os.listdir('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO')
        for f in files:
            shortDesc = longDesc = ''
            for line in open('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO/' + f):
                lineSplit = line.split('  ')
                if lineSplit[0] == 'AC':
                    acc = lineSplit[1].strip()
                elif lineSplit[0] == 'DE':
                    shortDesc = lineSplit[1].strip()
                elif lineSplit[0] == 'CC':
                    longDesc = lineSplit[1].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get PFam info
        for line in open('/srv/db/pfam/27/Pfam-A.clans.tsv'):
            lineSplit = line.split('\t')
            acc = lineSplit[0]
            shortDesc = lineSplit[3]
            longDesc = lineSplit[4].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get IMG taxonomy
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        metadata = img.genomeMetadata()
        genomeIdToTaxonomy = {}
        for genomeId, m in metadata.items():
            genomeIdToTaxonomy[genomeId] = m['taxonomy']

        # perform analysis for each tree
        treeFiles = os.listdir(geneTreeDir)
        allResults = {}
        allTaxa = [set([]), set([]), set([])]
        taxaCounts = {}
        avgConsistency = {}
        for treeFile in treeFiles:
            if not treeFile.endswith(treeExtension):
                continue

            print(treeFile)
            tree = dendropy.Tree.get_from_path(os.path.join(geneTreeDir, treeFile), schema='newick', as_rooted=True, preserve_underscores=True)

            domainConsistency = {}
            phylaConsistency = {}
            classConsistency = {}
            consistencyDict = [domainConsistency, phylaConsistency, classConsistency]

            # get abundance of taxa at different taxonomic ranks
            totals = [{}, {}, {}]
            leaves = tree.leaf_nodes()
            print('  Number of leaves: ' + str(len(leaves)))
            totalValidLeaves = 0

            for leaf in leaves:
                genomeId = self.__genomeId(leaf.taxon.label)

                if genomeId not in metadata:
                    print('[Error] Genome is missing metadata: ' + genomeId)
                    sys.exit()

                totalValidLeaves += 1
                taxonomy = genomeIdToTaxonomy[genomeId]
                for r in range(0, 3):
                    totals[r][taxonomy[r]] = totals[r].get(taxonomy[r], 0) + 1
                    consistencyDict[r][taxonomy[r]] = 0
                    allTaxa[r].add(taxonomy[r])

            taxaCounts[treeFile] = [totalValidLeaves, totals[0].get('Bacteria', 0), totals[0].get('Archaea', 0)]

            # find highest consistency nodes (congruent descendant taxa / (total taxa + incongruent descendant taxa))
            internalNodes = tree.internal_nodes()
            for node in internalNodes:
                leaves = node.leaf_nodes()

                for r in range(0, 3):
                    leafCounts = {}
                    for leaf in leaves:
                        genomeId = self.__genomeId(leaf.taxon.label)
                        taxonomy = genomeIdToTaxonomy[genomeId]
                        leafCounts[taxonomy[r]] = leafCounts.get(taxonomy[r], 0) + 1

                    # calculate consistency for node
                    for taxa in consistencyDict[r]:
                        totalTaxaCount = totals[r][taxa]
                        if totalTaxaCount <= 1 or taxa == 'unclassified':
                            consistencyDict[r][taxa] = 'N/A'
                            continue

                        taxaCount = leafCounts.get(taxa, 0)
                        incongruentTaxa = len(leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

                        # consider clan in other direction since the trees are unrooted
                        taxaCount = totalTaxaCount - leafCounts.get(taxa, 0)
                        incongruentTaxa = totalValidLeaves - len(leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount + incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

            # write results
            consistencyDir = os.path.join(outputDir, 'consistency')
            if not os.path.exists(consistencyDir):
                os.makedirs(consistencyDir)
            fout = open(os.path.join(consistencyDir, treeFile + '.results.tsv'), 'w')
            fout.write('Tree')
            for r in range(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    fout.write('\t' + taxa)
            fout.write('\n')

            fout.write(treeFile)
            for r in range(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    if consistencyDict[r][taxa] != 'N/A':
                        fout.write('\t%.2f' % (consistencyDict[r][taxa]*100))
                    else:
                        fout.write('\tN/A')
            fout.close()

            # calculate average consistency at each taxonomic rank
            average = []
            for r in range(0, 3):
                sumConsistency = []
                for taxa in consistencyDict[r]:
                    if totals[r][taxa] > minTaxaForAverage and consistencyDict[r][taxa] != 'N/A':
                        sumConsistency.append(consistencyDict[r][taxa])

                if len(sumConsistency) > 0:
                    average.append(sum(sumConsistency) / len(sumConsistency))
                else:
                    average.append(0)
            avgConsistency[treeFile] = average
            allResults[treeFile] = consistencyDict

            print('  Average consistency: ' + str(average) + ', mean = %.2f' % (sum(average)/len(average)))
            print('')

        # print out combined results
        fout = open(outputFile, 'w')
        fout.write('Tree\tShort Desc.\tLong Desc.\tAlignment Length\t# Taxa\t# Bacteria\t# Archaea\tAvg. Consistency\tAvg. Domain Consistency\tAvg. Phylum Consistency\tAvg. Class Consistency')
        for r in range(0, 3):
            for t in sorted(allTaxa[r]):
                fout.write('\t' + t)
        fout.write('\n')

        filteredGeneTrees = 0
        retainedGeneTrees = 0
        for treeFile in sorted(allResults.keys()):
            consistencyDict = allResults[treeFile]
            treeId = treeFile[0:treeFile.find('.')].replace('pfam', 'PF')

            fout.write(treeId + '\t' + descDict[treeId][0] + '\t' + descDict[treeId][1])

            # Taxa count
            fout.write('\t' + str(taxaCounts[treeFile][0]) + '\t' + str(taxaCounts[treeFile][1]) + '\t' + str(taxaCounts[treeFile][2]))

            avgCon = 0
            for r in range(0, 3):
                avgCon += avgConsistency[treeFile][r]
            avgCon /= 3
            fout.write('\t' + str(avgCon))
            
            if avgCon >= consistencyThreshold:
                retainedGeneTrees += 1
                os.system('cp ' + os.path.join(geneTreeDir, treeFile) + ' ' + os.path.join(outputDir, treeFile))
            else:
                filteredGeneTrees += 1
                print('Filtered %s with an average consistency of %.4f.' % (treeFile, avgCon))

            for r in range(0, 3):
                fout.write('\t' + str(avgConsistency[treeFile][r]))

            for r in range(0, 3):
                for t in sorted(allTaxa[r]):
                    if t in consistencyDict[r]:
                        if consistencyDict[r][t] != 'N/A':
                            fout.write('\t%.2f' % (consistencyDict[r][t]*100))
                        else:
                            fout.write('\tN/A')
                    else:
                        fout.write('\tN/A')
            fout.write('\n')
        fout.close()

        print('Retained gene trees: ' + str(retainedGeneTrees))
        print('Filtered gene trees: ' + str(filteredGeneTrees))
Example #8
    def run(self, geneTreeDir, acceptPer, extension, outputDir):
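        """Filter gene trees whose paralogous genes are not conspecific and copy retained trees to the output directory."""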
        # make sure output directory is empty
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        files = os.listdir(outputDir)
        for f in files:
            os.remove(os.path.join(outputDir, f))

        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        metadata = img.genomeMetadata()

        files = os.listdir(geneTreeDir)
        print('Identifying gene trees with only conspecific paralogous genes:')
        filteredGeneTrees = 0
        retainedGeneTrees = 0
        for f in files:
            if not f.endswith(extension):
                continue

            geneId = f[0:f.find('.')]
            print('  Testing gene tree: ' + geneId)

            tree = dendropy.Tree.get_from_path(os.path.join(geneTreeDir, f),
                                               schema='newick',
                                               as_rooted=False,
                                               preserve_underscores=True)

            taxa = tree.leaf_nodes()
            numTaxa = len(taxa)
            print('  Genes in tree: ' + str(numTaxa))

            # root tree with archaeal genomes
            rerootTree = RerootTree()
            rerootTree.reroot(tree)

            # get species name of each taxa
            leafNodeToSpeciesName = {}
            for t in taxa:
                genomeId = t.taxon.label.split('|')[0]
                genus = metadata[genomeId]['taxonomy'][5]
                sp = metadata[genomeId]['taxonomy'][6].lower()

                leafNodeToSpeciesName[t.taxon.label] = genus + ' ' + sp

            # find all paralogous genes
            print('  Finding paralogous genes.')

            paralogs = defaultdict(set)
            for i in range(0, len(taxa)):
                genomeId = taxa[i].taxon.label.split('|')[0]
                for j in range(i + 1, len(taxa)):
                    # genes from the same genome are paralogs, but we filter out
                    # those that are identical (distance of 0 on the tree) to
                    # speed up computation and because these clearly do not
                    # adversely affect phylogenetic inference
                    if genomeId == taxa[j].taxon.label.split(
                            '|')[0] and self.__patristicDist(
                                tree, taxa[i], taxa[j]) > 0:
                        paralogs[genomeId].add(taxa[i].taxon.label)
                        paralogs[genomeId].add(taxa[j].taxon.label)

            print('    Paralogous genes: ' + str(len(paralogs)))

            # check if paralogous genes are conspecific
            print('  Determining if paralogous genes are conspecific.')
            nonConspecificGenomes = []
            for genomeId, taxaLabels in paralogs.items():
                lcaNode = tree.mrca(taxon_labels=taxaLabels)

                children = lcaNode.leaf_nodes()
                species = set()
                for child in children:
                    childGenomeId = child.taxon.label.split('|')[0]

                    genus = metadata[childGenomeId]['taxonomy'][5]
                    sp = metadata[childGenomeId]['taxonomy'][6].lower()
                    if sp != '' and sp != 'unclassified' and genus != 'unclassified':
                        species.add(genus + ' ' + sp)

                if len(species) > 1:
                    nonConspecificGenomes.append(genomeId)

            if len(nonConspecificGenomes) > acceptPer * numTaxa:
                filteredGeneTrees += 1
                print('  Tree is not conspecific for the following genomes: ' +
                      str(nonConspecificGenomes))
            else:
                retainedGeneTrees += 1

                if len(nonConspecificGenomes) > 0:
                    print(
                        '  An acceptable number of genomes are not conspecific: '
                        + str(nonConspecificGenomes))
                else:
                    print('  Tree is conspecific.')

                os.system('cp ' + os.path.join(geneTreeDir, f) + ' ' +
                          os.path.join(outputDir, f))

            print('')

        print('Filtered gene trees: ' + str(filteredGeneTrees))
        print('Retained gene trees: ' + str(retainedGeneTrees))
Example #9
class Simulation(object):
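    """Simulate incomplete, contaminated draft genomes from fixed-length contigs and evaluate completeness and contamination estimates for different marker sets."""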
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=[testGenomeId])
            #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)

            print('# marker genes: ', len(binMarkerSets.getMarkerGenes()))
            print('# genes in table: ', len(geneDistTable[testGenomeId]))

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            print(completeness, contamination)

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            print('genomeSize', genomeSize)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for _ in range(0, numReplicates):
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, percentComp, percentCont, contigLen)
                            print(contigLen, trueComp, trueCont, len(startPartialGenomeContigs))

                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes

                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen)
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, contigLen)
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w')
        summaryOut = open('/tmp/simulation.summary.testing.tsv', 'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')

        # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wt')
        fout = gzip.open('/tmp/simulation.testing.tsv.gz', 'wt')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes * testsPerGenome, float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (mean(trueComps), mean(trueConts)))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')

                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont))
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')

        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
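        """Evaluate marker set performance on draft genomes in the reference genome tree, processing each test genome in parallel."""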
        print('\n  Reading reference genome tree.')
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_full.refpkg', 'genome_tree.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        print('    Number of taxa in tree: %d' % (len(tree.leaf_nodes())))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes for testing
        print('')
        metadata = self.img.genomeMetadata()
        print('  Total genomes: %d' % len(metadata))

        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print('  Number of draft genomes: %d' % len(genomeIdsToTest))

        print('')
        print('  Pre-computing genome information for calculating marker sets:')
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print('    readLineageSpecificGenesToRemove: %.2f' % (end - start))


        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print('    globalGeneCountTable: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print('    precomputeGenomeSeqLens: %.2f' % (end - start))

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print('    precomputeGenomeFamilyPositions: %.2f' % (end - start))

        print('')
        print('  Evaluating %d test genomes.' % len(genomeIdsToTest))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target=self.__workerThread, args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None))
        writeProc.join()
class SimulationScaffolds(object):
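    """Simulation variant that samples whole scaffolds, rather than fixed-length contigs, to create test genomes with known completeness and contamination."""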
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        
        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __seqLens(self, seqs):
        """Calculate lengths of seqs."""
        genomeSize = 0
        seqLens = {}
        for seqId, seq in seqs.items():
            seqLens[seqId] = len(seq)
            genomeSize += len(seq)
    
        return seqLens, genomeSize
    
    def __workerThread(self, tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break
                        
            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0)
                
            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():     
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True) 
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets 
            testSeqs = readFasta(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)
            
            
            for contigLen in self.contigLens: 
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)
                        
                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)
                        
                        trueComps = []
                        trueConts = []
                        
                        numDescendants = {}
            
                        for _ in range(0, numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove 
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)
    
                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(sorted(genomeIdsToTest - {testGenomeId}), 1)[0]
                            contSeqs = readFasta(os.path.join(self.img.genomeDir, contGenomeId, contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(contSeqs) 
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(1 - percentCont, contSeqLens, contGenomeSize) 
                            
                            contSampledSeqIds = set(contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)
              
                            for ms in binMarkerSets.markerSetIter():  
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)
                                
                            for ms in refinedBinMarkerSet.markerSetIter():  
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), testGenomeId, retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(ms.getMarkerGenes(), contGenomeId, contSampledSeqIds, containedMarkerGenes)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)
                                
                                completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)
                                
                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put((testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts))
            
    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""
        
        summaryOut = open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv', 'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')
        
        fout = gzip.open('/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz', 'wt')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')
        
        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, numTestGenomes*testsPerGenome, float(itemsProcessed)*100/(numTestGenomes*testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) 
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')
                
                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' % (contigLen, percentComp, percentCont)) 
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' + str(numDescendants[markerSetId]))
                fout.write('\t%.3f\t%.3f' % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaContSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')
            
        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, minScaffolds, numThreads):
        random.seed(0)
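        # fix the random seed so genome subsampling is reproducible between runs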

        print '\n  Reading reference genome tree.'
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        
        print '    Number of taxa in tree: %d' % (len(tree.leaf_nodes()))
        
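        # collect the IMG genome ids of all leaves in the reference tree (leaf labels carry an 'IMG_' prefix)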
        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specified minimum number of scaffolds
        print ''
        metadata = self.img.genomeMetadata()
        print '  Total genomes: %d' % len(metadata)
        
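        # draft genomes are those in the tree whose IMG status is not 'Finished'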
        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, 'status', 'Finished')
        print '  Number of draft genomes: %d' % len(draftGenomeIds)
        
        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)
                
        
        print '  Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest))

        print ''
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print '    readLineageSpecificGenesToRemove: %.2f' % (end - start)
        
        print '  Pre-computing genome information for calculating marker sets:'
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print '    precomputeGenomeFamilyScaffolds: %.2f' % (end - start)
        
        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print '    globalGeneCountTable: %.2f' % (end - start)
        
        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print '    precomputeGenomeSeqLens: %.2f' % (end - start)
        
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print '    precomputeGenomeFamilyPositions: %.2f' % (end - start)
                     
        print ''    
        print '  Evaluating %d test genomes.' % len(genomeIdsToTest)
            
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in list(genomeIdsToTest):
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, genomeIdsToTest, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target = self.__writerThread, args = (len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None))
        writeProc.join()
Example #11
0
class SimCompareDiffPlot(object):
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                       '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    def run(self):
        # count number of times the lineage-specific marker set results outperform
        # the domain-specific marker set for varying differences between the two sets
        numBars = 15

        lineageCountsComp = [0] * numBars
        domainCountsComp = [0] * numBars

        lineageCountsCont = [0] * numBars
        domainCountsCont = [0] * numBars

        totalCountsComp = 0
        totalCountsCont = 0

        domCompBest = 0
        lineageCompBest = 0
        domContBest = 0
        lineageContBest = 0

        metadata = self.img.genomeMetadata()
        domCompTaxon = defaultdict(int)
        lineageCompTaxon = defaultdict(int)

        for line in open('./simulations/briefSummaryOut.tsv'):
            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            taxonomy = metadata[genomeId]['taxonomy']
            phylum = taxonomy[1]
            domCompMS, lineageCompMS, lineageCompRMS, domContMS, lineageContMS, lineageContRMS = [
                float(x) for x in lineSplit[1:]
            ]

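            # only count cases where the lineage- and domain-specific completeness estimates differ by more than 5 percentage points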
            diff = abs(abs(lineageCompMS) - abs(domCompMS))
            if diff > 5:
                intDiff = int(diff)
                if intDiff >= numBars:
                    intDiff = (numBars - 1)

                if abs(domCompMS) < abs(lineageCompMS):
                    domainCountsComp[intDiff] += 1
                    domCompBest += 1
                    domCompTaxon[phylum] += 1
                else:
                    lineageCountsComp[intDiff] += 1
                    lineageCompBest += 1
                    lineageCompTaxon[phylum] += 1

                totalCountsComp += 1

            diff = abs(abs(lineageContMS) - abs(domContMS))
            if diff > 5:
                intDiff = int(diff)
                if intDiff >= numBars:
                    intDiff = (numBars - 1)

                if abs(domContMS) < abs(lineageContMS):
                    domainCountsCont[intDiff] += 1
                    domContBest += 1
                else:
                    lineageCountsCont[intDiff] += 1
                    lineageContBest += 1

                totalCountsCont += 1

        print('%% times lineage comp better than domain: %.2f' %
              (float(lineageCompBest) * 100 / (domCompBest + lineageCompBest)))
        print('%% times lineage cont better than domain: %.2f' %
              (float(lineageContBest) * 100 / (domContBest + lineageContBest)))

        print('')
        print('Taxonomy breakdown (dom best, lineage best):')
        taxa = set(domCompTaxon.keys()).union(lineageCompTaxon.keys())
        for t in taxa:
            print('%s\t%.2f\t%.2f' %
                  (t, domCompTaxon[t] * 100.0 / domCompBest,
                   lineageCompTaxon[t] * 100.0 / lineageCompBest))

        # normalize counts
        for i in range(0, numBars):
            lineageCountsComp[i] = float(
                lineageCountsComp[i]) * 100 / totalCountsComp
            domainCountsComp[i] = float(
                domainCountsComp[i]) * 100 / totalCountsComp

            if domainCountsComp[i] > lineageCountsComp[i]:
                print('Domain beats lineage (comp): %d%% (%f, %f)' %
                      (i + 1, domainCountsComp[i], lineageCountsComp[i]))

            lineageCountsCont[i] = float(
                lineageCountsCont[i]) * 100 / totalCountsCont
            domainCountsCont[i] = float(
                domainCountsCont[i]) * 100 / totalCountsCont

            if domainCountsCont[i] > lineageCountsCont[i]:
                print('Domain beats lineage (cont): %d%% (%f, %f)' %
                      (i + 1, domainCountsCont[i], lineageCountsCont[i]))

        stackedBarPlot = StackedBarPlot()
        stackedBarPlot.plot(lineageCountsComp, domainCountsComp,
                            lineageCountsCont, domainCountsCont)
        stackedBarPlot.savePlot('./experiments/simCompareDiffPlot.svg')
Example #12
0
class PlotScaffoldLenVsMarkers(object):
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        

    def run(self):
        # get all draft genomes consisting of a user-specified minimum number of scaffolds
        print ''
        metadata = self.img.genomeMetadata()
        print '  Total genomes: %d' % len(metadata)
        
        arGenome = set()
        for genomeId in metadata:
            if metadata[genomeId]['taxonomy'][0] == 'Archaea':
                arGenome.add(genomeId)
                
        draftGenomeIds = arGenome - self.img.filterGenomeIds(arGenome, metadata, 'status', 'Finished')
        print '  Number of draft genomes: %d' % len(draftGenomeIds)
        
        minScaffolds = 20
        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)
        print '  Number of draft genomes with >= %d scaffolds: %d' % (minScaffolds, len(genomeIdsToTest))

        print ''
        print '  Pre-computing genome information for calculating marker sets:'
        genomeFamilyScaffolds = self.img.precomputeGenomeFamilyScaffolds(genomeIdsToTest)
        
        print '  Calculating genome sequence lengths.'
        genomeSeqLens = self.img.precomputeGenomeSeqLens(genomeIdsToTest)
        
        print '  Determining domain-specific marker sets.'
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()
        bacMarkers = taxonMarkerSets['domain']['Bacteria'].getMarkerGenes()
        arMarkers = taxonMarkerSets['domain']['Archaea'].getMarkerGenes()
        print '    There are %d bacterial markers and %d archaeal markers.' % (len(bacMarkers), len(arMarkers))
        
        print '  Determining percentage of markers on each scaffold.'
        totalMarkers = 0
        totalSequenceLen = 0
        markersOnShortScaffolds = 0
        totalShortScaffoldLen = 0
        
        scaffoldLen = {}
        percentageMarkers = defaultdict(float)
        for genomeId, markerIds in genomeFamilyScaffolds.iteritems():
            domain = metadata[genomeId]['taxonomy'][0]
            markerGenes = bacMarkers if domain == 'Bacteria' else arMarkers
            for markerId in markerGenes:
                if markerId.startswith('PF'):
                    markerId = markerId.replace('PF', 'pfam')
                    markerId = markerId[0:markerId.rfind('.')]
                if markerId in markerIds:
                    for scaffoldId in markerIds[markerId]:
                        scaffoldLen[scaffoldId] = genomeSeqLens[genomeId][scaffoldId]
                        percentageMarkers[scaffoldId] += 1.0/len(markerGenes)
                        
                        totalMarkers += 1
                        totalSequenceLen += genomeSeqLens[genomeId][scaffoldId]
                        
                        if genomeSeqLens[genomeId][scaffoldId] < 10000:
                            markersOnShortScaffolds += 1
                            totalShortScaffoldLen += genomeSeqLens[genomeId][scaffoldId]
       
        print 'Markers on short scaffolds: %d over %d bp (%f markers per base)' % (markersOnShortScaffolds, totalShortScaffoldLen, float(markersOnShortScaffolds)/totalShortScaffoldLen)
        print 'Total markers on scaffolds: %d over %d bp (%f markers per base)' % (totalMarkers, totalSequenceLen, float(totalMarkers)/totalSequenceLen)
                        
        print '  Create plot.'
        plotLens = []
        plotPerMarkers = []
        for scaffoldId in percentageMarkers:
            plotLens.append(scaffoldLen[scaffoldId])
            plotPerMarkers.append(percentageMarkers[scaffoldId]/scaffoldLen[scaffoldId] * 1e6)
            
        scatterPlot = ScatterPlot()
        scatterPlot.plot(plotLens, plotPerMarkers)     
        scatterPlot.savePlot('./experiments/plotScaffoldLenVsMarkers.png')
Example #13
0
class SimComparePlots(object):
    def __init__(self):
        
        self.plotPrefix = './simulations/simulation.draft.w_refinement_50'
        self.simCompareFile = './simulations/simCompare.draft.w_refinement_50.full.tsv'
        self.simCompareMarkerSetOut = './simulations/simCompare.draft.marker_set_table.w_refinement_50.tsv'
        self.simCompareConditionOut = './simulations/simCompare.draft.condition_table.w_refinement_50.tsv'
        self.simCompareTaxonomyTableOut = './simulations/simCompare.draft.taxonomy_table.w_refinement_50.tsv'
        self.simCompareRefinementTableOut = './simulations/simCompare.draft.refinment_table.w_refinement_50.tsv'
               
        #self.plotPrefix = './simulations/simulation.scaffolds.draft.w_refinement_50'
        #self.simCompareFile = './simulations/simCompare.scaffolds.draft.w_refinement_50.full.tsv'
        #self.simCompareMarkerSetOut = './simulations/simCompare.scaffolds.draft.marker_set_table.w_refinement_50.tsv'
        #self.simCompareConditionOut = './simulations/simCompare.scaffolds.draft.condition_table.w_refinement_50.tsv'
        #self.simCompareTaxonomyTableOut = './simulations/simCompare.scaffolds.draft.taxonomy_table.w_refinement_50.tsv'
        #self.simCompareRefinementTableOut = './simulations/simCompare.scaffolds.draft.refinment_table.w_refinement_50.tsv'
        
        #self.plotPrefix = './simulations/simulation.random_scaffolds.w_refinement_50'
        #self.simCompareFile = './simulations/simCompare.random_scaffolds.w_refinement_50.full.tsv'
        #self.simCompareMarkerSetOut = './simulations/simCompare.random_scaffolds.marker_set_table.w_refinement_50.tsv'
        #self.simCompareConditionOut = './simulations/simCompare.random_scaffolds.condition_table.w_refinement_50.tsv'
        #self.simCompareTaxonomyTableOut = './simulations/simCompare.random_scaffolds.taxonomy_table.w_refinement_50.tsv'
        #self.simCompareRefinementTableOut = './simulations/simCompare.random_scaffolds.refinment_table.w_refinement_50.tsv'
        
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        
        self.compsToConsider = [0.5, 0.7, 0.8, 0.9] #[0.5, 0.7, 0.8, 0.9]
        self.contsToConsider = [0.05, 0.1, 0.15] #[0.05, 0.1, 0.15]
        
        self.dpi = 1200
  
    def __readResults(self, filename):
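        """Read per-simulation completeness and contamination estimates from a simCompare results file."""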
        results = defaultdict(dict)
        genomeIds = set()
        with open(filename) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')
                
                simId = lineSplit[0]
                genomeId = simId.split('-')[0]
                genomeIds.add(genomeId)
                
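                # columns 6-19 hold comma-separated IM/MS (and RMS) estimates for the best, domain and sim marker sets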
                bestCompIM = [float(x) for x in lineSplit[6].split(',')]
                bestContIM = [float(x) for x in lineSplit[7].split(',')]
                
                bestCompMS = [float(x) for x in lineSplit[8].split(',')]
                bestContMS = [float(x) for x in lineSplit[9].split(',')]
                                
                domCompIM = [float(x) for x in lineSplit[10].split(',')]
                domContIM = [float(x) for x in lineSplit[11].split(',')]
                
                domCompMS = [float(x) for x in lineSplit[12].split(',')]
                domContMS = [float(x) for x in lineSplit[13].split(',')]
                
                simCompIM = [float(x) for x in lineSplit[14].split(',')]
                simContIM = [float(x) for x in lineSplit[15].split(',')]
                
                simCompMS = [float(x) for x in lineSplit[16].split(',')]
                simContMS = [float(x) for x in lineSplit[17].split(',')]
                
                simCompRMS = [float(x) for x in lineSplit[18].split(',')]
                simContRMS = [float(x) for x in lineSplit[19].split(',')]
                
                results[simId] = [bestCompIM, bestContIM, bestCompMS, bestContMS, domCompIM, domContIM, domCompMS, domContMS, simCompIM, simContIM, simCompMS, simContMS, simCompRMS, simContRMS]
                
        print('    Number of test genomes: ' + str(len(genomeIds)))
        
        return results
    
    def markerSets(self, results):
        # summarize results from IM vs MS
        print('  Tabulating results for domain-level marker genes vs marker sets.')
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))

        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            compDataDict[expCondStr]['IM'] += results[simId][4]
            compDataDict[expCondStr]['MS'] += results[simId][6]

            contDataDict[expCondStr]['IM'] += results[simId][5]
            contDataDict[expCondStr]['MS'] += results[simId][7]
                
        print('  There are %d unique genomes.' % len(genomeIds))
              
        sys.stdout.write('\n')
        
        print('    There are %d experimental conditions.' % (len(compDataDict)))
                
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for seqLen in [20000]: 
                    for msStr in ['MS', 'IM']:
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                                       
        print('MS:\t%.2f\t%.2f' % (mean(abs(array(compData[0::2]))), mean(abs(array(contData[0::2])))))
        print('IM:\t%.2f\t%.2f' % (mean(abs(array(compData[1::2]))), mean(abs(array(contData[1::2])))))   
            
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.markerSets.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 2, dpi = self.dpi)
        
        # print table of results 
        tableOut = open(self.simCompareMarkerSetOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tIM comp (5kb)\tMS comp (5kb)\tIM cont (5kb)\tMS cont (5kb)\tIM comp (20kb)\tMS comp (20kb)\tIM cont (20kb)\tMS cont (20kb)\tIM comp (50kb)\tMS comp (50kb)\tIM cont (50kb)\tMS cont (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                     
                    meanCompIM = mean(abs(array(compDataDict[expCondStr]['IM'])))
                    stdCompIM = std(abs(array(compDataDict[expCondStr]['IM'])))
                    meanContIM = mean(abs(array(contDataDict[expCondStr]['IM'])))
                    stdContIM = std(abs(array(contDataDict[expCondStr]['IM'])))
                    
                    avgComp[seqLen]['IM'] += compDataDict[expCondStr]['IM']
                    avgCont[seqLen]['IM'] += contDataDict[expCondStr]['IM']
                    
                    meanCompMS = mean(abs(array(compDataDict[expCondStr]['MS'])))
                    stdCompMS = std(abs(array(compDataDict[expCondStr]['MS'])))
                    meanContMS = mean(abs(array(contDataDict[expCondStr]['MS'])))
                    stdContMS = std(abs(array(contDataDict[expCondStr]['MS'])))
                    
                    avgComp[seqLen]['MS'] += compDataDict[expCondStr]['MS']
                    avgCont[seqLen]['MS'] += contDataDict[expCondStr]['MS']
                    
                    tableOut.write('\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f' % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompIM = mean(abs(array(avgComp[seqLen]['IM'])))
            stdCompIM = std(abs(array(avgComp[seqLen]['IM'])))
            meanContIM = mean(abs(array(avgCont[seqLen]['IM'])))
            stdContIM = std(abs(array(avgCont[seqLen]['IM'])))
            
            meanCompMS = mean(abs(array(avgComp[seqLen]['MS'])))
            stdCompMS = std(abs(array(avgComp[seqLen]['MS'])))
            meanContMS = mean(abs(array(avgCont[seqLen]['MS'])))
            stdContMS = std(abs(array(avgCont[seqLen]['MS'])))
            
            tableOut.write('\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f' % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                        
        tableOut.write('\n')     
                
        tableOut.close()
    
    def conditionsPlot(self, results):
        # summarize results for each experimental condition  
        print('  Tabulating results for each experimental condition using marker sets.')
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        compOutliers = defaultdict(list)
        contOutliers = defaultdict(list)
        
        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            compDataDict[expCondStr]['best'] += results[simId][2]
            compDataDict[expCondStr]['domain'] += results[simId][6]
            compDataDict[expCondStr]['selected'] += results[simId][10]
            
            for dComp in results[simId][2]:
                compOutliers[expCondStr] += [[dComp, genomeId]]
            
            contDataDict[expCondStr]['best'] += results[simId][3]
            contDataDict[expCondStr]['domain'] += results[simId][7]
            contDataDict[expCondStr]['selected'] += results[simId][11]
            
            for dCont in results[simId][3]:
                contOutliers[expCondStr] += [[dCont, genomeId]]
                
        print('  There are %d unique genomes.' % len(genomeIds))
              
        sys.stdout.write('\n')
        
        print('    There are %d experimental conditions.' % (len(compDataDict)))
                
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        
        foutComp = open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w')
        foutCont = open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w')
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]: 
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                    
                # report completeness outliers
                foutComp.write(expCondStr)

                compOutliers[expCondStr].sort()
                
                dComps = array([r[0] for r in compOutliers[expCondStr]])
                perc1 = scoreatpercentile(dComps, 1)
                perc99 = scoreatpercentile(dComps, 99)
                print(expCondStr, perc1, perc99)
                
                foutComp.write('\t%.2f\t%.2f' % (perc1, perc99))
                
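                # report genomes whose completeness error falls outside the 1st-99th percentile range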
                outliers = []
                for item in compOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutComp.write('\t' + genomeId + ': ' + str(count))
                foutComp.write('\n')
                
                # report contamination outliers
                foutCont.write(expCondStr)

                contOutliers[expCondStr].sort()
                
                dConts = array([r[0] for r in contOutliers[expCondStr]])
                perc1 = scoreatpercentile(dConts, 1)
                perc99 = scoreatpercentile(dConts, 99)
                
                foutCont.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in contOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutCont.write('\t' + genomeId + ': ' + str(count))
                foutCont.write('\n')
                
        foutComp.close()
        foutCont.close()
                               
        print('best:\t%.2f\t%.2f' % (mean(abs(array(compData[0::3]))), mean(abs(array(contData[0::3])))))
        print('selected:\t%.2f\t%.2f' % (mean(abs(array(compData[1::3]))), mean(abs(array(contData[1::3])))))   
        print('domain:\t%.2f\t%.2f' % (mean(abs(array(compData[2::3]))), mean(abs(array(contData[2::3])))))   

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.conditions.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
        
        # print table of results 
        tableOut = open(self.simCompareConditionOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tdomain comp (5kb)\tselected comp (5kb)\tbest comp (5kb)\tdomain cont (5kb)\tselected cont (5kb)\tbest cont (5kb)\tdomain comp (20kb)\tselected comp (20kb)\tbest comp (20kb)\tdomain cont (20kb)\tselected cont (20kb)\tbest cont (20kb)\tdomain comp (50kb)\tselected comp (50kb)\tbest comp (50kb)\tdomain cont (50kb)\tselected cont (50kb)\tbest cont (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                   
                    meanCompD = mean(abs(array(compDataDict[expCondStr]['domain'])))
                    stdCompD = std(abs(array(compDataDict[expCondStr]['domain'])))
                    meanContD = mean(abs(array(contDataDict[expCondStr]['domain'])))
                    stdContD = std(abs(array(contDataDict[expCondStr]['domain'])))
                    
                    avgComp[seqLen]['domain'] += compDataDict[expCondStr]['domain']
                    avgCont[seqLen]['domain'] += contDataDict[expCondStr]['domain']
                    
                    meanCompS = mean(abs(array(compDataDict[expCondStr]['selected'])))
                    stdCompS = std(abs(array(compDataDict[expCondStr]['selected'])))
                    meanContS = mean(abs(array(contDataDict[expCondStr]['selected'])))
                    stdContS = std(abs(array(contDataDict[expCondStr]['selected'])))
                    
                    avgComp[seqLen]['selected'] += compDataDict[expCondStr]['selected']
                    avgCont[seqLen]['selected'] += contDataDict[expCondStr]['selected']
                    
                    meanCompB = mean(abs(array(compDataDict[expCondStr]['best'])))
                    stdCompB = std(abs(array(compDataDict[expCondStr]['best'])))
                    meanContB = mean(abs(array(contDataDict[expCondStr]['best'])))
                    stdContB = std(abs(array(contDataDict[expCondStr]['best'])))
                    
                    avgComp[seqLen]['best'] += compDataDict[expCondStr]['best']
                    avgCont[seqLen]['best'] += contDataDict[expCondStr]['best']
                    
                    tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompD = mean(abs(array(avgComp[seqLen]['domain'])))
            stdCompD = std(abs(array(avgComp[seqLen]['domain'])))
            meanContD = mean(abs(array(avgCont[seqLen]['domain'])))
            stdContD = std(abs(array(avgCont[seqLen]['domain'])))
            
            meanCompS = mean(abs(array(avgComp[seqLen]['selected'])))
            stdCompS = std(abs(array(avgComp[seqLen]['selected'])))
            meanContS = mean(abs(array(avgCont[seqLen]['selected'])))
            stdContS = std(abs(array(avgCont[seqLen]['selected'])))
            
            meanCompB = mean(abs(array(avgComp[seqLen]['best'])))
            stdCompB = std(abs(array(avgComp[seqLen]['best'])))
            meanContB = mean(abs(array(avgCont[seqLen]['best'])))
            stdContB = std(abs(array(avgCont[seqLen]['best'])))
            
            tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                        
        tableOut.write('\n')     
                
        tableOut.close()
        
    def taxonomicPlots(self, results):
        # summarize results for different taxonomic groups  
        print('  Tabulating results for taxonomic groups.')
        
        metadata = self.img.genomeMetadata()
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
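        # tabulate results for the first three taxonomic ranks (domain, phylum and class)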
        ranksToProcess = 3
        taxaByRank = [set() for _ in range(0, ranksToProcess)]
        
        overallComp = []
        overallCont = []
                
        genomeInTaxon = defaultdict(set)
        testCases = 0
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            
            if seqLen != '20000':
                continue
            
            if str(float(comp)) in ['0.5', '0.7', '0.8', '0.9'] and str(float(cont)) in ['0.05', '0.10', '0.1', '0.15']:
                print(comp, cont)
                taxonomy = metadata[genomeId]['taxonomy']
                
                testCases += 1
                
                comps.add(float(comp))
                conts.add(float(cont))
                seqLens.add(int(seqLen))
                
                overallComp += results[simId][10]
                overallCont += results[simId][11]
                
                for r in range(0, ranksToProcess):
                    taxon = taxonomy[r]
                    
                    if r == 0 and taxon == 'unclassified':
                        print('*****************************Unclassified at domain-level*****************')
                        continue
                    
                    if taxon == 'unclassified':
                        continue
                    
                    taxon = rankPrefixes[r] + taxon
                    
                    taxaByRank[r].add(taxon)
                                                    
                    compDataDict[taxon]['best'] += results[simId][2]
                    compDataDict[taxon]['domain'] += results[simId][6]
                    compDataDict[taxon]['selected'] += results[simId][10]
                    
                    contDataDict[taxon]['best'] += results[simId][3]
                    contDataDict[taxon]['domain'] += results[simId][7]
                    contDataDict[taxon]['selected'] += results[simId][11]
                    
                    genomeInTaxon[taxon].add(genomeId)
            
        sys.stdout.write('\n')
        
        print('Test cases', testCases)
        
        print('')        
        print('Creating plots for:')
        print('  comps = ', comps)
        print('  conts = ', conts)
        
        print('')
        print('    There are %d taxa.' % (len(compDataDict)))
        
        print('')
        print('  Overall bias:')
        print('    Selected comp: %.2f' % mean(overallComp))
        print('    Selected cont: %.2f' % mean(overallCont))
        
        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)
                
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            for msStr in ['best', 'selected', 'domain']:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 10: # skip groups with only a few genomes
                    continue
                
                rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][msStr])
                contData.append(contDataDict[taxon][msStr])        
                
        for i, rowLabel in enumerate(rowLabels):
            print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))            
                  
        # print taxonomic table of results organized by class
        taxonomyTableOut = open(self.simCompareTaxonomyTableOut, 'w')
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2: # skip groups with only a few genomes
                continue
                
            taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
            for msStr in ['domain', 'selected']:                
                meanTaxonComp = mean(abs(array(compDataDict[taxon][msStr])))
                stdTaxonComp = std(abs(array(compDataDict[taxon][msStr])))
                meanTaxonCont = mean(abs(array(contDataDict[taxon][msStr])))
                stdTaxonCont = std(abs(array(contDataDict[taxon][msStr])))
                
                taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))
            taxonomyTableOut.write('\n')
        taxonomyTableOut.close()
        
        # create box plot
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix +  '.taxonomy.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', None, 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
    
    
    def refinementPlots(self, results):
        # summarize results for different CheckM refinements 
        print('  Tabulating results for different refinements.')
        
        metadata = self.img.genomeMetadata()
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        ranksToProcess = 3
        taxaByRank = [set() for _ in range(0, ranksToProcess)]
        
        overallCompIM = []
        overallContIM = [] 
        
        overallCompMS = []
        overallContMS = [] 
        
        overallCompRMS = []
        overallContRMS = [] 
        
        genomeInTaxon = defaultdict(set)
        
        testCases = 0
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            taxonomy = metadata[genomeId]['taxonomy']
            
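            # restrict to simulations with at least 70% completeness and at most 10% contamination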
            if float(comp) < 0.7 or float(cont) > 0.1:
                continue
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            overallCompIM.append(results[simId][8])
            overallContIM.append(results[simId][9])
            
            overallCompMS.append(results[simId][10])
            overallContMS.append(results[simId][11])
            
            overallCompRMS.append(results[simId][12])
            overallContRMS.append(results[simId][13])
            
            for r in range(0, ranksToProcess):
                taxon = taxonomy[r]
                
                if taxon == 'unclassified':
                    continue
                
                taxaByRank[r].add(taxon)
                
                compDataDict[taxon]['IM'] += results[simId][8]
                compDataDict[taxon]['MS'] += results[simId][10]
                compDataDict[taxon]['RMS'] += results[simId][12]
                
                contDataDict[taxon]['IM'] += results[simId][9]
                contDataDict[taxon]['MS'] += results[simId][11]
                contDataDict[taxon]['RMS'] += results[simId][13]
                                
                genomeInTaxon[taxon].add(genomeId)
            
        sys.stdout.write('\n')
        
        print('Creating plots for:')
        print('  comps = ', comps)
        print('  conts = ', conts)
        
        print('')
        print('    There are %d taxa.' % (len(compDataDict)))
        print('')
        print('Percentage change MS-IM comp: %.4f' % ((mean(abs(array(overallCompMS))) - mean(abs(array(overallCompIM)))) * 100 / mean(abs(array(overallCompIM)))))
        print('Percentage change MS-IM cont: %.4f' % ((mean(abs(array(overallContMS))) - mean(abs(array(overallContIM)))) * 100 / mean(abs(array(overallContIM)))))
        print('')
        print('Percentage change RMS-MS comp: %.4f' % ((mean(abs(array(overallCompRMS))) - mean(abs(array(overallCompMS)))) * 100 / mean(abs(array(overallCompIM)))))
        print('Percentage change RMS-MS cont: %.4f' % ((mean(abs(array(overallContRMS))) - mean(abs(array(overallContMS)))) * 100 / mean(abs(array(overallContIM)))))
        
        print('')
        
        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)
             
        # print table of results organized by class
        refinmentTableOut = open(self.simCompareRefinementTableOut, 'w')
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2: # skip groups with only a few genomes
                continue
                
            refinmentTableOut.write(taxon + '\t' + str(numGenomes))
            for refineStr in ['IM', 'MS']:               
                meanTaxonComp = mean(abs(array(compDataDict[taxon][refineStr])))
                stdTaxonComp = std(abs(array(compDataDict[taxon][refineStr])))
                meanTaxonCont = mean(abs(array(contDataDict[taxon][refineStr])))
                stdTaxonCont = std(abs(array(contDataDict[taxon][refineStr])))
                
                refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))
            
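            # meanTaxonComp/meanTaxonCont still hold the 'MS' values from the last loop iteration,
            # so this is the percentage reduction in error when moving from IM to MS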
            perCompChange = (mean(abs(array(compDataDict[taxon]['IM']))) - meanTaxonComp) * 100 / mean(abs(array(compDataDict[taxon]['IM'])))
            perContChange = (mean(abs(array(contDataDict[taxon]['IM']))) - meanTaxonCont) * 100 / mean(abs(array(contDataDict[taxon]['IM'])))
            refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))
        refinmentTableOut.close()
       
        # plot data
        print('  Plotting results.')
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            for refineStr in ['RMS', 'MS', 'IM']:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 10: # skip groups with only a few genomes
                    continue

                rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][refineStr])
                contData.append(contDataDict[taxon][refineStr])       
                
        for i, rowLabel in enumerate(rowLabels):
            print(rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i])))))
            
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.refinements.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', None, 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
    def run(self):
        # read simulation results
        print('  Reading simulation results.')
        results = self.__readResults(self.simCompareFile)
        
        print('\n')         
        #self.markerSets(results)
                   
        print('\n')         
        #self.conditionsPlot(results)
        
        #print '\n'
        self.taxonomicPlots(results)
        
        print('\n')
Example #14
0
class SimCompareDiffPlot(object):
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        
    def run(self):
        # count number of times the lineage-specific marker set results outperform
        # the domain-specific marker set for varying differences between the two sets
        numBars = 15
        
        lineageCountsComp = [0]*numBars
        domainCountsComp = [0]*numBars
        
        lineageCountsCont = [0]*numBars
        domainCountsCont = [0]*numBars
        
        totalCountsComp = 0
        totalCountsCont = 0
        
        domCompBest = 0
        lineageCompBest = 0
        domContBest = 0
        lineageContBest = 0
        
        metadata = self.img.genomeMetadata()
        domCompTaxon = defaultdict(int)
        lineageCompTaxon = defaultdict(int)
        
        for line in open('./simulations/briefSummaryOut.tsv'):
            lineSplit = line.split('\t')
            genomeId = lineSplit[0]
            taxonomy = metadata[genomeId]['taxonomy']
            phylum = taxonomy[1]
            domCompMS, lineageCompMS, lineageCompRMS, domContMS, lineageContMS, lineageContRMS = [float(x) for x in lineSplit[1:]]
            
            diff = abs(abs(lineageCompMS) - abs(domCompMS))
            if diff > 5:
                intDiff = int(diff)
                if intDiff >= numBars:
                    intDiff = (numBars-1)
                    
                if abs(domCompMS) < abs(lineageCompMS):
                    domainCountsComp[intDiff] += 1
                    domCompBest += 1
                    domCompTaxon[phylum] += 1
                else:
                    lineageCountsComp[intDiff] += 1
                    lineageCompBest += 1
                    lineageCompTaxon[phylum] += 1
                    
                totalCountsComp += 1
                
            diff = abs(abs(lineageContMS) - abs(domContMS))
            if diff > 5:
                intDiff = int(diff)
                if intDiff >= numBars:
                    intDiff = (numBars-1)
                    
                if abs(domContMS) < abs(lineageContMS):
                    domainCountsCont[intDiff] += 1
                    domContBest += 1
                else:
                    lineageCountsCont[intDiff] += 1
                    lineageContBest += 1
                    
                totalCountsCont += 1
                
        print '%% times lineage comp better than domain: %.2f' % (float(lineageCompBest)*100/(domCompBest + lineageCompBest))
        print '%% times lineage cont better than domain: %.2f' % (float(lineageContBest)*100/(domContBest + lineageContBest))
        
        print ''
        print 'Taxonomy breakdown (dom best, lineage best):'
        taxa = set(domCompTaxon.keys()).union(lineageCompTaxon.keys())
        for t in taxa:
            print '%s\t%.2f\t%.2f' % (t, domCompTaxon[t]*100.0/domCompBest, lineageCompTaxon[t]*100.0/lineageCompBest)
                
        # normalize counts
        for i in xrange(0, numBars):
            lineageCountsComp[i] = float(lineageCountsComp[i])*100 / totalCountsComp
            domainCountsComp[i] = float(domainCountsComp[i])*100 / totalCountsComp
            
            if domainCountsComp[i] > lineageCountsComp[i]:
                print 'Domain beats lineage (comp): %d%% (%f, %f)' % (i+1, domainCountsComp[i], lineageCountsComp[i])
            
            lineageCountsCont[i] = float(lineageCountsCont[i])*100 / totalCountsCont
            domainCountsCont[i] = float(domainCountsCont[i])*100 / totalCountsCont
            
            if domainCountsCont[i] > lineageCountsCont[i]:
                print 'Domain beats lineage (cont): %d%% (%f, %f)' % (i+1, domainCountsCont[i], lineageCountsCont[i])
         
        stackedBarPlot = StackedBarPlot()
        stackedBarPlot.plot(lineageCountsComp, domainCountsComp, lineageCountsCont, domainCountsCont)     
        stackedBarPlot.savePlot('./experiments/simCompareDiffPlot.svg')
Example #15
0
    def __init__(self):
        img = IMG("/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv")
        self.metadata = img.genomeMetadata()
Example #16
0
class Simulation(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG(
            "/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv"
        )

        self.contigLens = [1000, 2000, 5000, 10000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId is None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label("IMG_" + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
                tree,
                testNode.parent_node,
                ubiquityThreshold,
                singleCopyThreshold,
                bMarkerSet=True,
                genomeIdsToRemove=[testGenomeId],
            )
            #!!!binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildDomainMarkerSet(tree, testNode.parent_node, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = [testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable(
                [testGenomeId], binMarkerSets.getMarkerGenes(), spacingBetweenContigs=0
            )

            print "# marker genes: ", len(binMarkerSets.getMarkerGenes())
            print "# genes in table: ", len(geneDistTable[testGenomeId])

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            print(completeness, contamination)

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + ".fna"))
            print "genomeSize", genomeSize

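            # simulate every (contig length, completeness, contamination) condition numReplicates times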
            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for _ in range(0, numReplicates):
                            trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(
                                genomeSize, percentComp, percentCont, contigLen
                            )
                            print(contigLen, trueComp, trueCont, len(startPartialGenomeContigs))

                            trueComps.append(trueComp)
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes

                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(
                                    ms.getMarkerGenes(),
                                    geneDistTable[testGenomeId],
                                    startPartialGenomeContigs,
                                    contigLen,
                                )
                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=True
                                )
                                deltaComp[ms.lineageStr].append(completeness - trueComp)
                                deltaCont[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=False
                                )
                                deltaCompSet[ms.lineageStr].append(completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(contamination - trueCont)

                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(
                                    ms.getMarkerGenes(),
                                    geneDistTable[testGenomeId],
                                    startPartialGenomeContigs,
                                    contigLen,
                                )
                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=True
                                )
                                deltaCompRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes, bIndividualMarkers=False
                                )
                                deltaCompSetRefined[ms.lineageStr].append(completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(contamination - trueCont)

                        taxonomy = ";".join(metadata[testGenomeId]["taxonomy"])
                        queueOut.put(
                            (
                                testGenomeId,
                                contigLen,
                                percentComp,
                                percentCont,
                                taxonomy,
                                numDescendants,
                                unmodifiedComp,
                                unmodifiedCont,
                                trueComps,
                                trueConts,
                                deltaComp,
                                deltaCont,
                                deltaCompSet,
                                deltaContSet,
                                deltaCompRefined,
                                deltaContRefined,
                                deltaCompSetRefined,
                                deltaContSetRefined,
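                                # trueComps/trueConts are repeated so the tuple matches the 20 fields unpacked in __writerThread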
                                trueComps,
                                trueConts,
                            )
                        )

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        # summaryOut = open('/tmp/simulation.draft.summary.w_refinement_50.tsv', 'w')
        summaryOut = open("/tmp/simulation.summary.testing.tsv", "w")
        summaryOut.write("Genome Id\tContig len\t% comp\t% cont")
        summaryOut.write("\tTaxonomy\tMarker set\t# descendants")
        summaryOut.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont")
        summaryOut.write("\tIM comp\tIM comp std\tIM cont\tIM cont std")
        summaryOut.write("\tMS comp\tMS comp std\tMS cont\tMS cont std")
        summaryOut.write("\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std")
        summaryOut.write("\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n")

        # fout = gzip.open('/tmp/simulation.draft.w_refinement_50.tsv.gz', 'wb')
        # text mode so the str headers/rows written below work under Python 3
        fout = gzip.open("/tmp/simulation.testing.tsv.gz", "wt")
        fout.write("Genome Id\tContig len\t% comp\t% cont")
        fout.write("\tTaxonomy\tMarker set\t# descendants")
        fout.write("\tUnmodified comp\tUnmodified cont\tTrue comp\tTrue cont")
        fout.write("\tIM comp\tIM cont")
        fout.write("\tMS comp\tMS cont")
        fout.write("\tRIM comp\tRIM cont")
        fout.write("\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n")

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, trueComps, trueConts, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(
                block=True, timeout=None
            )
            if testGenomeId is None:
                break

            itemsProcessed += 1
            statusStr = "    Finished processing %d of %d (%.2f%%) test cases." % (
                itemsProcessed,
                numTestGenomes * testsPerGenome,
                float(itemsProcessed) * 100 / (numTestGenomes * testsPerGenome),
            )
            sys.stdout.write("%s\r" % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont))
                summaryOut.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId]))
                summaryOut.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                summaryOut.write("\t%.3f\t%.3f" % (mean(trueComps), std(trueConts)))
                summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaComp[markerSetId])), std(abs(deltaComp[markerSetId]))))
                summaryOut.write("\t%.3f\t%.3f" % (mean(abs(deltaCont[markerSetId])), std(abs(deltaCont[markerSetId]))))
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaCompSet[markerSetId])), std(abs(deltaCompSet[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaContSet[markerSetId])), std(abs(deltaContSet[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaCompRefined[markerSetId])), std(abs(deltaCompRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f" % (mean(abs(deltaContRefined[markerSetId])), std(abs(deltaContRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f"
                    % (mean(abs(deltaCompSetRefined[markerSetId])), std(abs(deltaCompSetRefined[markerSetId])))
                )
                summaryOut.write(
                    "\t%.3f\t%.3f"
                    % (mean(abs(deltaContSetRefined[markerSetId])), std(abs(deltaContSetRefined[markerSetId])))
                )
                summaryOut.write("\n")

                fout.write(testGenomeId + "\t%d\t%.2f\t%.2f" % (contigLen, percentComp, percentCont))
                fout.write("\t" + taxonomy + "\t" + markerSetId + "\t" + str(numDescendants[markerSetId]))
                fout.write("\t%.3f\t%.3f" % (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write("\t%s" % ",".join(map(str, trueComps)))
                fout.write("\t%s" % ",".join(map(str, trueConts)))
                fout.write("\t%s" % ",".join(map(str, deltaComp[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCont[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompSet[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContSet[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, deltaContSetRefined[markerSetId])))
                fout.write("\t%s" % ",".join(map(str, trueComps)))
                fout.write("\t%s" % ",".join(map(str, trueConts)))
                fout.write("\n")

        summaryOut.close()
        fout.close()

        sys.stdout.write("\n")

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates, numThreads):
        print "\n  Reading reference genome tree."
        treeFile = os.path.join("/srv", "db", "checkm", "genome_tree", "genome_tree_full.refpkg", "genome_tree.tre")
        tree = dendropy.Tree.get_from_path(treeFile, schema="newick", as_rooted=True, preserve_underscores=True)

        print "    Number of taxa in tree: %d" % (len(tree.leaf_nodes()))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace("IMG_", ""))

        # get all draft genomes for testing
        print ""
        metadata = self.img.genomeMetadata()
        print "  Total genomes: %d" % len(metadata)

        genomeIdsToTest = genomesInTree - self.img.filterGenomeIds(genomesInTree, metadata, "status", "Finished")
        print "  Number of draft genomes: %d" % len(genomeIdsToTest)

        print ""
        print "  Pre-computing genome information for calculating marker sets:"
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print "    readLineageSpecificGenesToRemove: %.2f" % (end - start)

        start = time.time()
        # self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print "    globalGeneCountTable: %.2f" % (end - start)

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print "    precomputeGenomeSeqLens: %.2f" % (end - start)

        start = time.time()
        # self.markerSetBuilder.precomputeGenomeFamilyPositions(metadata.keys(), 0)
        end = time.time()
        print "    precomputeGenomeFamilyPositions: %.2f" % (end - start)

        print ""
        print "  Evaluating %d test genomes." % len(genomeIdsToTest)
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in genomeIdsToTest:
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(
                target=self.__workerThread,
                args=(tree, metadata, ubiquityThreshold, singleCopyThreshold, numReplicates, workerQueue, writerQueue),
            )
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread, args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put(
            (
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                None,
            )
        )
        writeProc.join()
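
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the run() method above
# follows the queue pattern used throughout these examples -- work items are
# queued, one None sentinel per worker shuts the workers down, and a single
# writer process owns all output until it receives its own sentinel. All names
# below are hypothetical and the squaring is a stand-in for genomeCheck() etc.
import multiprocessing as mp

def _worker(queueIn, queueOut):
    """Consume items until a None sentinel is seen."""
    while True:
        item = queueIn.get(block=True, timeout=None)
        if item is None:
            break
        queueOut.put((item, item * item))  # placeholder computation

def _writer(queueOut):
    """Single process that owns all output, so writes never interleave."""
    while True:
        result = queueOut.get(block=True, timeout=None)
        if result is None:
            break
        print('%d -> %d' % result)

if __name__ == '__main__':
    numThreads = 2
    queueIn, queueOut = mp.Queue(), mp.Queue()
    for item in range(8):
        queueIn.put(item)
    for _ in range(numThreads):
        queueIn.put(None)                       # one sentinel per worker
    workers = [mp.Process(target=_worker, args=(queueIn, queueOut)) for _ in range(numThreads)]
    writer = mp.Process(target=_writer, args=(queueOut,))
    writer.start()
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    queueOut.put(None)                          # stop the writer only after all workers finish
    writer.join()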
示例#17
0
class CreateSmallTree(object):
    def __init__(self, outputDir):
        self.__checkForFastTree()

        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.tree = os.path.join(outputDir, 'genome_tree.final.tre')

        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.metadata = self.img.genomeMetadata()

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""

        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except:
            print "Unexpected error!", sys.exc_info()[0]
            raise

        if exit_status != 0:
            print "[Error] FastTree is not on the system path"
            sys.exit()

    def __nearlyIdentical(self, string1, string2, max_diff_perc=0.08):
        max_diff = int(max_diff_perc * len(string1))

        n_diff = 0
        for c1, c2 in itertools.izip(string1, string2):
            if c1 != c2:
                n_diff += 1
                if n_diff >= max_diff:
                    return False

        return True

    def __nearlyIdenticalGenomes(self, seqs, outputDir):
        identical = []
        numTaxa = 0

        nearlyIdenticalFile = os.path.join(outputDir, 'nearly_identical.tsv')
        if os.path.exists(nearlyIdenticalFile):
            for line in open(nearlyIdenticalFile):
                lineSplit = line.split('\t')
                s = set()
                for genomeId in lineSplit:
                    numTaxa += 1
                    s.add(genomeId.strip())
                identical.append(s)
        else:
            seqIds = seqs.keys()

            processed = set()
            for i in xrange(0, len(seqIds)):
                print '  %d of %d' % (i, len(seqIds))
                seqIdI = seqIds[i]
                seqI = seqs[seqIdI]

                if seqIdI in processed:
                    continue

                processed.add(seqIdI)

                numTaxa += 1

                s = set()
                s.add(seqIdI)
                for j in xrange(i + 1, len(seqIds)):
                    seqIdJ = seqIds[j]
                    seqJ = seqs[seqIdJ]

                    if seqIdJ in processed:
                        continue

                    if self.__nearlyIdentical(seqI, seqJ):
                        s.add(seqIdJ)
                        processed.add(seqIdJ)

                identical.append(s)
                print '    set size: %d' % len(s)
                if len(s) > 1:
                    for genomeId in s:
                        genomeId = genomeId.replace('IMG_', '')
                        print genomeId, self.metadata[genomeId]['taxonomy']

            fout = open(nearlyIdenticalFile, 'w')
            for s in identical:
                fout.write('\t'.join(list(s)) + '\n')
            fout.close()

        print '  Number of taxa: %d' % numTaxa
        print '  Number of dereplicated taxa: %d' % len(identical)

        return identical

    def run(self, outputDir):
        # make sure output directory exists
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

        # remove similar taxa
        print 'Filtering out highly similar taxa in order to reduce size of tree:'
        seqs = readFasta(self.derepConcatenatedAlignFile)

        nearlyIdentical = self.__nearlyIdenticalGenomes(seqs, outputDir)

        reducedSeqs = {}
        for s in nearlyIdentical:
            rndGenome = random.choice(tuple(s))
            reducedSeqs[rndGenome] = seqs[rndGenome]

        # write out reduced alignment
        reducedAlignmentFile = os.path.join(outputDir, "genome_tree.fasta")
        writeFasta(reducedSeqs, reducedAlignmentFile)

        # prune tree to retained taxa
        print ''
        print 'Pruning tree:'
        tree = dendropy.Tree.get_from_path(self.tree, schema='newick', as_rooted=False, preserve_underscores=True)

        for seqId in reducedSeqs:
            node = tree.find_node_with_taxon_label(seqId)
            if not node:
                print 'Missing taxa: %s' % seqId

        tree.retain_taxa_with_labels(reducedSeqs.keys())

        outputTree = os.path.join(outputDir, 'genome_tree.tre')
        tree.write_to_path(outputTree, schema='newick', suppress_rooting=True, unquoted_underscores=True)

        for t in tree.internal_nodes():
            t.label = None

        for t in tree.leaf_nodes():
            if t.taxon.label not in reducedSeqs:
                print 'missing in sequence file: %s' % t.taxon.label

        outputTreeWithoutLabels = os.path.join(outputDir, 'genome_tree.small.no_internal_labels.tre')
        tree.write_to_path(outputTreeWithoutLabels, schema='newick', suppress_rooting=True, unquoted_underscores=True)
        print '  Pruned tree written to: %s' % outputTree

        # calculate model parameters for pruned tree
        print ''
        print 'Determining model parameters for new tree.'
        outputTreeLog = os.path.join(outputDir, 'genome_tree.log')
        fastTreeOutput = os.path.join(outputDir, 'genome_tree.no_internal_labels.fasttree.tre')
        # os.system('FastTreeMP -nome -mllen -intree %s -log %s < %s > %s' % (outputTreeWithoutLabels, outputTreeLog, reducedAlignmentFile, fastTreeOutput))

        # calculate reference package for pruned tree
        print ''
        print 'Creating reference package.'
        os.system('taxit create -l %s -P %s --aln-fasta %s --tree-stats %s --tree-file %s' % ('genome_tree_reduced', os.path.join(outputDir, 'genome_tree_reduced.refpkg'), reducedAlignmentFile, outputTreeLog, outputTree))
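
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): __nearlyIdentical()
# above treats two aligned sequences as interchangeable when they differ at
# fewer than max_diff_perc of their columns, exiting as soon as the difference
# budget is spent. A standalone, hypothetical version of that check:
def nearly_identical(seq1, seq2, max_diff_perc=0.08):
    """Return True if seq1 and seq2 differ at < max_diff_perc of their columns."""
    max_diff = int(max_diff_perc * len(seq1))
    n_diff = 0
    for c1, c2 in zip(seq1, seq2):
        if c1 != c2:
            n_diff += 1
            if n_diff >= max_diff:
                return False  # early exit once the difference budget is exhausted
    return True

# e.g. two 100-column sequences differing at 3 columns pass an 8% threshold:
#   nearly_identical('A' * 97 + 'CGT', 'A' * 100)  ->  True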
示例#18
0
    def run(
        self, geneTreeDir, alignmentDir, extension, outputAlignFile, outputTree, outputTaxonomy, bSupportValues=False
    ):
        # read gene trees
        print "Reading gene trees."
        geneIds = set()
        files = os.listdir(geneTreeDir)
        for f in files:
            if f.endswith(".tre"):
                geneId = f[0 : f.find(".")]
                geneIds.add(geneId)

        # write out genome tree taxonomy
        print "Reading trusted genomes."
        img = IMG("/srv/whitlam/bio/db/checkm/img/img_metadata.tsv", "/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv")
        genomeIds = img.genomeMetadata().keys()
        self.__taxonomy(img, genomeIds, outputTaxonomy)

        print "  There are %d trusted genomes." % (len(genomeIds))

        # get genes in genomes
        print "Reading all PFAM and TIGRFAM hits in trusted genomes."
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # read alignment files
        print "Reading alignment files."
        alignments = {}
        genomeIds = set()
        files = os.listdir(alignmentDir)
        for f in files:
            geneId = f[0 : f.find(".")]
            if f.endswith(extension) and geneId in geneIds:
                seqs = readFasta(os.path.join(alignmentDir, f))

                imgGeneId = geneId
                if imgGeneId.startswith("PF"):
                    imgGeneId = imgGeneId.replace("PF", "pfam")
                seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

                genomeIds.update(set(seqs.keys()))
                alignments[geneId] = seqs

        # create concatenated alignment
        print "Concatenating alignments:"
        concatenatedSeqs = {}
        totalAlignLen = 0
        for geneId in sorted(alignments.keys()):
            seqs = alignments[geneId]
            alignLen = len(seqs[seqs.keys()[0]])
            print "  " + str(geneId) + "," + str(alignLen)
            totalAlignLen += alignLen
            for genomeId in genomeIds:
                if genomeId in seqs:
                    # append alignment
                    concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + seqs[genomeId]
                else:
                    # missing gene
                    concatenatedSeqs["IMG_" + genomeId] = concatenatedSeqs.get("IMG_" + genomeId, "") + "-" * alignLen

        print "  Total alignment length: " + str(totalAlignLen)

        # save concatenated alignment
        writeFasta(concatenatedSeqs, outputAlignFile)

        # infer genome tree
        print "Inferring genome tree."
        outputLog = outputTree[0 : outputTree.rfind(".")] + ".log"

        supportStr = " "
        if not bSupportValues:
            supportStr = " -nosupport "

        cmd = "FastTreeMP" + supportStr + "-wag -gamma -log " + outputLog + " " + outputAlignFile + " > " + outputTree
        os.system(cmd)
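
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the concatenation loop
# above appends each gene alignment to every genome's growing sequence and pads
# with '-' when a genome lacks the gene, so all concatenated sequences keep the
# same length. A hypothetical standalone version with toy data:
def concatenate_alignments(alignments):
    """alignments: {geneId: {genomeId: alignedSeq}} -> {genomeId: concatenated seq}."""
    genomeIds = set()
    for seqs in alignments.values():
        genomeIds.update(seqs.keys())

    concatenated = dict((genomeId, '') for genomeId in genomeIds)
    for geneId in sorted(alignments):
        seqs = alignments[geneId]
        alignLen = len(next(iter(seqs.values())))  # every seq in one gene alignment shares this length
        for genomeId in genomeIds:
            concatenated[genomeId] += seqs.get(genomeId, '-' * alignLen)
    return concatenated

# toy example: genome g2 lacks gene2 and is gap-padded over those columns
#   concatenate_alignments({'gene1': {'g1': 'MKV', 'g2': 'MRV'},
#                           'gene2': {'g1': 'AC-D'}})
#   ->  {'g1': 'MKVAC-D', 'g2': 'MRV----'}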
示例#19
0
    def run(self, geneTreeDir, acceptPer, extension, outputDir):
        # make sure output directory is empty
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        files = os.listdir(outputDir)
        for f in files:
            os.remove(os.path.join(outputDir, f))

        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        metadata = img.genomeMetadata()

        files = os.listdir(geneTreeDir)
        print 'Identifying gene trees with only conspecific paralogous genes:'
        filteredGeneTrees = 0
        retainedGeneTrees = 0
        for f in files:
            if not f.endswith(extension):
                continue

            geneId = f[0:f.find('.')]
            print '  Testing gene tree: ' + geneId

            tree = dendropy.Tree.get_from_path(os.path.join(geneTreeDir, f), schema='newick', as_rooted=False, preserve_underscores=True)

            taxa = tree.leaf_nodes()
            numTaxa = len(taxa)
            print '  Genes in tree: ' + str(numTaxa)

            # root tree with archaeal genomes
            rerootTree = RerootTree()
            rerootTree.reroot(tree)
            
            # get species name of each taxa
            leafNodeToSpeciesName = {}
            for t in taxa:
                genomeId = t.taxon.label.split('|')[0]
                genus = metadata[genomeId]['taxonomy'][5]
                sp = metadata[genomeId]['taxonomy'][6].lower()

                leafNodeToSpeciesName[t.taxon.label] = genus + ' ' + sp
                
            # find all paralogous genes
            print '  Finding paralogous genes.'

            paralogs = defaultdict(set)
            for i in xrange(0, len(taxa)):
                genomeId = taxa[i].taxon.label.split('|')[0]
                for j in xrange(i+1, len(taxa)):
                    # genes from the same genome are paralogs, but we filter out
                    # those that are identical (distance of 0 on the tree) to
                    # speed up computation and because these clearly do not
                    # adversely affect phylogenetic inference
                    if genomeId == taxa[j].taxon.label.split('|')[0] and self.__patristicDist(tree, taxa[i], taxa[j]) > 0:
                        paralogs[genomeId].add(taxa[i].taxon.label)
                        paralogs[genomeId].add(taxa[j].taxon.label)
                        
            print '    Paralogous genes: ' + str(len(paralogs))

            # check if paralogous genes are conspecific
            print '  Determining if paralogous genes are conspecific.'
            nonConspecificGenomes = []
            for genomeId, taxaLabels in paralogs.iteritems():
                lcaNode = tree.mrca(taxon_labels = taxaLabels)

                children = lcaNode.leaf_nodes()
                species = set()
                for child in children:
                    childGenomeId = child.taxon.label.split('|')[0]

                    genus = metadata[childGenomeId]['taxonomy'][5]
                    sp = metadata[childGenomeId]['taxonomy'][6].lower()
                    if sp != '' and sp != 'unclassified' and genus != 'unclassified':
                        species.add(genus + ' ' + sp)

                if len(species) > 1:
                    nonConspecificGenomes.append(genomeId)

            if len(nonConspecificGenomes) > acceptPer*numTaxa:
                filteredGeneTrees += 1
                print '  Tree is not conspecific for the following genomes: ' + str(nonConspecificGenomes)
            else:
                retainedGeneTrees += 1

                if len(nonConspecificGenomes) > 1:
                    print '  An acceptable number of genomes are not conspecific: ' + str(nonConspecificGenomes)
                else:
                    print '  Tree is conspecific.'

                os.system('cp ' + os.path.join(geneTreeDir, f) + ' ' + os.path.join(outputDir, f))

            print ''

        print 'Filtered gene trees: ' + str(filteredGeneTrees)
        print 'Retained gene trees: ' + str(retainedGeneTrees)
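
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the conspecific test
# above asks whether every classified leaf beneath the paralogs' most recent
# common ancestor belongs to a single species; if several species are spanned,
# the gene tree is flagged for that genome. Stripped of the tree handling, the
# species bookkeeping reduces to this hypothetical helper:
def is_conspecific(mrca_leaf_taxonomies):
    """mrca_leaf_taxonomies: iterable of (genus, species) pairs for all leaves under the MRCA."""
    species = set()
    for genus, sp in mrca_leaf_taxonomies:
        sp = sp.lower()
        if sp not in ('', 'unclassified') and genus != 'unclassified':
            species.add(genus + ' ' + sp)
    return len(species) <= 1

# paralogs whose MRCA spans two species are not conspecific:
#   is_conspecific([('Escherichia', 'coli'), ('Escherichia', 'fergusonii')])  ->  False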
示例#20
0
class DecorateTree(object):
    def __init__(self):
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                       '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
        self.markerSetBuilder = MarkerSetBuilder()

    def __meanStd(self, metadata, genomeIds, category):
        values = []
        for genomeId in genomeIds:
            genomeId = genomeId.replace('IMG_', '')
            v = metadata[genomeId][category]
            if v != 'NA':
                values.append(v)

        return mean(values), std(values)

    def __calculateMarkerSet(self,
                             genomeLabels,
                             ubiquityThreshold=0.97,
                             singleCopyThreshold=0.97):
        """Calculate marker set for a set of genomes."""

        # get genome IDs from genome labels
        genomeIds = set()
        for genomeLabel in genomeLabels:
            genomeIds.add(genomeLabel.replace('IMG_', ''))

        markerSet = self.markerSetBuilder.buildMarkerSet(
            genomeIds, ubiquityThreshold, singleCopyThreshold)

        return markerSet.markerSet

    def __pfamIdToPfamAcc(self, img):
        pfamIdToPfamAcc = {}
        for line in open(self.pfamHMMs):
            if 'ACC' in line:
                acc = line.split()[1].strip()
                pfamId = acc.split('.')[0]

                pfamIdToPfamAcc[pfamId] = acc

        return pfamIdToPfamAcc

    def decorate(self, taxaTreeFile, derepFile, inputTreeFile, metadataOut,
                 numThreads):
        # read genome metadata
        print('  Reading metadata.')
        metadata = self.img.genomeMetadata()

        # read list of taxa with duplicate sequences
        print('  Reading list of taxa with duplicate sequences.')
        duplicateTaxa = {}
        for line in open(derepFile):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

        # build gene count table
        print('  Building gene count table.')
        genomeIds = self.img.genomeMetadata().keys()
        print('    # trusted genomes = ' + str(len(genomeIds)))

        # calculate statistics for each internal node using multiple threads
        print('  Calculating statistics for each internal node.')
        self.__internalNodeStatistics(taxaTreeFile, inputTreeFile,
                                      duplicateTaxa, metadata, metadataOut,
                                      numThreads)

    def __internalNodeStatistics(self, taxaTreeFile, inputTreeFile,
                                 duplicateTaxa, metadata, metadataOut,
                                 numThreads):

        # determine HMM model accession numbers
        pfamIdToPfamAcc = self.__pfamIdToPfamAcc(self.img)

        taxaTree = dendropy.Tree.get_from_path(taxaTreeFile,
                                               schema='newick',
                                               as_rooted=True,
                                               preserve_underscores=True)
        inputTree = dendropy.Tree.get_from_path(inputTreeFile,
                                                schema='newick',
                                                as_rooted=True,
                                                preserve_underscores=True)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        uniqueId = 0
        for node in inputTree.internal_nodes():
            uniqueId += 1
            workerQueue.put((uniqueId, node))

        for _ in range(numThreads):
            workerQueue.put((None, None))

        calcProc = [
            mp.Process(target=self.__processInternalNode,
                       args=(taxaTree, duplicateTaxa, workerQueue,
                             writerQueue)) for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__reportStatistics,
                               args=(metadata, metadataOut, inputTree,
                                     inputTreeFile, pfamIdToPfamAcc,
                                     writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None))
        writeProc.join()

    def __processInternalNode(self, taxaTree, duplicateTaxa, queueIn,
                              queueOut):
        """Run each marker gene in a separate thread."""

        while True:
            uniqueId, node = queueIn.get(block=True, timeout=None)
            if uniqueId is None:
                break

            # find corresponding internal node in taxa tree
            labels = []
            for leaf in node.leaf_nodes():
                labels.append(leaf.taxon.label)
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        labels.append(genomeId)

            # check if there is a taxonomic label
            mrca = taxaTree.mrca(taxon_labels=labels)
            taxaStr = ''
            if mrca.label:
                taxaStr = mrca.label.replace(' ', '')

            # give node a unique Id while retaining the bootstrap value
            bootstrap = ''
            if node.label:
                bootstrap = node.label
            nodeLabel = 'UID' + str(uniqueId) + '|' + taxaStr + '|' + bootstrap

            # calculate marker set
            markerSet = self.__calculateMarkerSet(labels)

            queueOut.put((uniqueId, labels, markerSet, taxaStr, bootstrap,
                          node.oid, nodeLabel))

    def __reportStatistics(self, metadata, metadataOut, inputTree,
                           inputTreeFile, pfamIdToPfamAcc, writerQueue):
        """Store statistics for internal node."""

        fout = open(metadataOut, 'w')
        fout.write('UID\t# genomes\tTaxonomy\tBootstrap')
        fout.write('\tGC mean\tGC std')
        fout.write('\tGenome size mean\tGenome size std')
        fout.write('\tGene count mean\tGene count std')
        fout.write('\tMarker set')
        fout.write('\n')

        numProcessedNodes = 0
        numInternalNodes = len(inputTree.internal_nodes())
        while True:
            uniqueId, labels, markerSet, taxaStr, bootstrap, nodeID, nodeLabel = writerQueue.get(
                block=True, timeout=None)
            if uniqueId is None:
                break

            numProcessedNodes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (
                numProcessedNodes, numInternalNodes,
                float(numProcessedNodes) * 100 / numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            fout.write('UID' + str(uniqueId) + '\t' + str(len(labels)) + '\t' +
                       taxaStr + '\t' + bootstrap)

            m, s = self.__meanStd(metadata, labels, 'GC %')
            fout.write('\t' + str(m * 100) + '\t' + str(s * 100))

            m, s = self.__meanStd(metadata, labels, 'genome size')
            fout.write('\t' + str(m) + '\t' + str(s))

            m, s = self.__meanStd(metadata, labels, 'gene count')
            fout.write('\t' + str(m) + '\t' + str(s))

            # change model names to accession numbers, and make
            # sure there is an HMM model for each PFAM
            mungedMarkerSets = []
            for geneSet in markerSet:
                s = set()
                for geneId in geneSet:
                    if 'pfam' in geneId:
                        pfamId = geneId.replace('pfam', 'PF')
                        if pfamId in pfamIdToPfamAcc:
                            s.add(pfamIdToPfamAcc[pfamId])
                    else:
                        s.add(geneId)
                mungedMarkerSets.append(s)

            fout.write('\t' + str(mungedMarkerSets))

            fout.write('\n')

            node = inputTree.find_node(
                filter_fn=lambda n: hasattr(n, 'oid') and n.oid == nodeID)
            node.label = nodeLabel

        sys.stdout.write('\n')

        fout.close()

        inputTree.write_to_path(inputTreeFile,
                                schema='newick',
                                suppress_rooting=True,
                                unquoted_underscores=True)
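
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): __pfamIdToPfamAcc()
# above maps a versionless Pfam identifier (e.g. PF01379) to the versioned
# accession recorded on the ACC lines of a Pfam-A HMM flat file. The same
# mapping built from an in-memory list of lines (version numbers are made up):
def pfam_id_to_acc(hmm_lines):
    """hmm_lines: iterable of lines in Pfam-A.hmm format."""
    mapping = {}
    for line in hmm_lines:
        if line.startswith('ACC'):
            acc = line.split()[1].strip()       # e.g. 'PF00001.99'
            mapping[acc.split('.')[0]] = acc    # key on the versionless id
    return mapping

#   pfam_id_to_acc(['ACC   PF00001.99', 'ACC   PF01379.99'])
#   ->  {'PF00001': 'PF00001.99', 'PF01379': 'PF01379.99'}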
示例#21
0
    def __init__(self):
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        self.metadata = img.genomeMetadata()
示例#22
0
    def run(self,
            geneTreeDir,
            alignmentDir,
            extension,
            outputAlignFile,
            outputTree,
            outputTaxonomy,
            bSupportValues=False):
        # read gene trees
        print 'Reading gene trees.'
        geneIds = set()
        files = os.listdir(geneTreeDir)
        for f in files:
            if f.endswith('.tre'):
                geneId = f[0:f.find('.')]
                geneIds.add(geneId)

        # write out genome tree taxonomy
        print 'Reading trusted genomes.'
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        genomeIds = img.genomeMetadata().keys()
        self.__taxonomy(img, genomeIds, outputTaxonomy)

        print '  There are %d trusted genomes.' % (len(genomeIds))

        # get genes in genomes
        print 'Reading all PFAM and TIGRFAM hits in trusted genomes.'
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # read alignment files
        print 'Reading alignment files.'
        alignments = {}
        genomeIds = set()
        files = os.listdir(alignmentDir)
        for f in files:
            geneId = f[0:f.find('.')]
            if f.endswith(extension) and geneId in geneIds:
                seqs = readFasta(os.path.join(alignmentDir, f))

                imgGeneId = geneId
                if imgGeneId.startswith('PF'):
                    imgGeneId = imgGeneId.replace('PF', 'pfam')
                seqs = self.__filterParalogs(seqs, imgGeneId, genesInGenomes)

                genomeIds.update(set(seqs.keys()))
                alignments[geneId] = seqs

        # create concatenated alignment
        print 'Concatenating alignments:'
        concatenatedSeqs = {}
        totalAlignLen = 0
        for geneId in sorted(alignments.keys()):
            seqs = alignments[geneId]
            alignLen = len(seqs[seqs.keys()[0]])
            print '  ' + str(geneId) + ',' + str(alignLen)
            totalAlignLen += alignLen
            for genomeId in genomeIds:
                if genomeId in seqs:
                    # append alignment
                    concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get(
                        'IMG_' + genomeId, '') + seqs[genomeId]
                else:
                    # missing gene
                    concatenatedSeqs['IMG_' + genomeId] = concatenatedSeqs.get(
                        'IMG_' + genomeId, '') + '-' * alignLen

        print '  Total alignment length: ' + str(totalAlignLen)

        # save concatenated alignment
        writeFasta(concatenatedSeqs, outputAlignFile)

        # infer genome tree
        print 'Inferring genome tree.'
        outputLog = outputTree[0:outputTree.rfind('.')] + '.log'

        supportStr = ' '
        if not bSupportValues:
            supportStr = ' -nosupport '

        cmd = 'FastTreeMP' + supportStr + '-wag -gamma -log ' + outputLog + ' ' + outputAlignFile + ' > ' + outputTree
        os.system(cmd)
示例#23
0
class SimulationScaffolds(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                       '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

        self.contigLens = [5000, 20000, 50000]
        self.percentComps = [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]
        self.percentConts = [0.0, 0.05, 0.1, 0.15, 0.2]

    def __seqLens(self, seqs):
        """Calculate lengths of seqs."""
        genomeSize = 0
        seqLens = {}
        for seqId, seq in seqs.iteritems():
            seqLens[seqId] = len(seq)
            genomeSize += len(seq)

        return seqLens, genomeSize

    def __workerThread(self, tree, metadata, genomeIdsToTest,
                       ubiquityThreshold, singleCopyThreshold, numReplicates,
                       queueIn, queueOut):
        """Process each data item in parallel."""

        while True:
            testGenomeId = queueIn.get(block=True, timeout=None)
            if testGenomeId == None:
                break

            # build marker sets for evaluating test genome
            testNode = tree.find_node_with_taxon_label('IMG_' + testGenomeId)
            binMarkerSets, refinedBinMarkerSet = self.markerSetBuilder.buildBinMarkerSet(
                tree,
                testNode.parent_node,
                ubiquityThreshold,
                singleCopyThreshold,
                bMarkerSet=True,
                genomeIdsToRemove=[testGenomeId])

            # determine distribution of all marker genes within the test genome
            geneDistTable = self.img.geneDistTable(
                [testGenomeId],
                binMarkerSets.getMarkerGenes(),
                spacingBetweenContigs=0)

            # estimate completeness of unmodified genome
            unmodifiedComp = {}
            unmodifiedCont = {}
            for ms in binMarkerSets.markerSetIter():
                hits = {}
                for mg in ms.getMarkerGenes():
                    if mg in geneDistTable[testGenomeId]:
                        hits[mg] = geneDistTable[testGenomeId][mg]
                completeness, contamination = ms.genomeCheck(
                    hits, bIndividualMarkers=True)
                unmodifiedComp[ms.lineageStr] = completeness
                unmodifiedCont[ms.lineageStr] = contamination

            # estimate completion and contamination of genome after subsampling using both the domain and lineage-specific marker sets
            testSeqs = readFasta(
                os.path.join(self.img.genomeDir, testGenomeId,
                             testGenomeId + '.fna'))
            testSeqLens, genomeSize = self.__seqLens(testSeqs)

            for contigLen in self.contigLens:
                for percentComp in self.percentComps:
                    for percentCont in self.percentConts:
                        deltaComp = defaultdict(list)
                        deltaCont = defaultdict(list)
                        deltaCompSet = defaultdict(list)
                        deltaContSet = defaultdict(list)

                        deltaCompRefined = defaultdict(list)
                        deltaContRefined = defaultdict(list)
                        deltaCompSetRefined = defaultdict(list)
                        deltaContSetRefined = defaultdict(list)

                        trueComps = []
                        trueConts = []

                        numDescendants = {}

                        for i in xrange(0, numReplicates):
                            # generate test genome with a specific level of completeness, by randomly sampling scaffolds to remove
                            # (this will sample >= the desired level of completeness)
                            retainedTestSeqs, trueComp = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                                percentComp, testSeqLens, genomeSize)
                            trueComps.append(trueComp)

                            # select a random genome to use as a source of contamination
                            contGenomeId = random.sample(
                                genomeIdsToTest - set([testGenomeId]), 1)[0]
                            contSeqs = readFasta(
                                os.path.join(self.img.genomeDir, contGenomeId,
                                             contGenomeId + '.fna'))
                            contSeqLens, contGenomeSize = self.__seqLens(
                                contSeqs)
                            seqsToRetain, trueRetainedPer = self.markerSetBuilder.sampleGenomeScaffoldsWithoutReplacement(
                                1 - percentCont, contSeqLens, contGenomeSize)

                            contSampledSeqIds = set(
                                contSeqs.keys()).difference(seqsToRetain)
                            trueCont = 100.0 - trueRetainedPer
                            trueConts.append(trueCont)

                            for ms in binMarkerSets.markerSetIter():
                                numDescendants[ms.lineageStr] = ms.numGenomes
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), testGenomeId,
                                    retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), contGenomeId,
                                    contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=True)
                                deltaComp[ms.lineageStr].append(completeness -
                                                                trueComp)
                                deltaCont[ms.lineageStr].append(contamination -
                                                                trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=False)
                                deltaCompSet[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContSet[ms.lineageStr].append(
                                    contamination - trueCont)

                            for ms in refinedBinMarkerSet.markerSetIter():
                                containedMarkerGenes = defaultdict(list)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), testGenomeId,
                                    retainedTestSeqs, containedMarkerGenes)
                                self.markerSetBuilder.markerGenesOnScaffolds(
                                    ms.getMarkerGenes(), contGenomeId,
                                    contSampledSeqIds, containedMarkerGenes)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=True)
                                deltaCompRefined[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContRefined[ms.lineageStr].append(
                                    contamination - trueCont)

                                completeness, contamination = ms.genomeCheck(
                                    containedMarkerGenes,
                                    bIndividualMarkers=False)
                                deltaCompSetRefined[ms.lineageStr].append(
                                    completeness - trueComp)
                                deltaContSetRefined[ms.lineageStr].append(
                                    contamination - trueCont)

                        taxonomy = ';'.join(metadata[testGenomeId]['taxonomy'])
                        queueOut.put(
                            (testGenomeId, contigLen, percentComp, percentCont,
                             taxonomy, numDescendants, unmodifiedComp,
                             unmodifiedCont, deltaComp, deltaCont,
                             deltaCompSet, deltaContSet, deltaCompRefined,
                             deltaContRefined, deltaCompSetRefined,
                             deltaContSetRefined, trueComps, trueConts))

    def __writerThread(self, numTestGenomes, writerQueue):
        """Store or write results of worker threads in a single thread."""

        summaryOut = open(
            '/tmp/simulation.random_scaffolds.w_refinement_50.draft.summary.tsv',
            'w')
        summaryOut.write('Genome Id\tContig len\t% comp\t% cont')
        summaryOut.write('\tTaxonomy\tMarker set\t# descendants')
        summaryOut.write('\tUnmodified comp\tUnmodified cont')
        summaryOut.write('\tIM comp\tIM comp std\tIM cont\tIM cont std')
        summaryOut.write('\tMS comp\tMS comp std\tMS cont\tMS cont std')
        summaryOut.write('\tRIM comp\tRIM comp std\tRIM cont\tRIM cont std')
        summaryOut.write('\tRMS comp\tRMS comp std\tRMS cont\tRMS cont std\n')

        fout = gzip.open(
            '/tmp/simulation.random_scaffolds.w_refinement_50.draft.tsv.gz',
            'wb')
        fout.write('Genome Id\tContig len\t% comp\t% cont')
        fout.write('\tTaxonomy\tMarker set\t# descendants')
        fout.write('\tUnmodified comp\tUnmodified cont')
        fout.write('\tIM comp\tIM cont')
        fout.write('\tMS comp\tMS cont')
        fout.write('\tRIM comp\tRIM cont')
        fout.write('\tRMS comp\tRMS cont\tTrue Comp\tTrue Cont\n')

        testsPerGenome = len(self.contigLens) * len(self.percentComps) * len(
            self.percentConts)

        itemsProcessed = 0
        while True:
            testGenomeId, contigLen, percentComp, percentCont, taxonomy, numDescendants, unmodifiedComp, unmodifiedCont, deltaComp, deltaCont, deltaCompSet, deltaContSet, deltaCompRefined, deltaContRefined, deltaCompSetRefined, deltaContSetRefined, trueComps, trueConts = writerQueue.get(
                block=True, timeout=None)
            if testGenomeId == None:
                break

            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (
                itemsProcessed, numTestGenomes * testsPerGenome,
                float(itemsProcessed) * 100 /
                (numTestGenomes * testsPerGenome))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            for markerSetId in unmodifiedComp:
                summaryOut.write(testGenomeId + '\t%d\t%.2f\t%.2f' %
                                 (contigLen, percentComp, percentCont))
                summaryOut.write('\t' + taxonomy + '\t' + markerSetId + '\t' +
                                 str(numDescendants[markerSetId]))
                summaryOut.write(
                    '\t%.3f\t%.3f' %
                    (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaComp[markerSetId])),
                                  std(abs(deltaComp[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaCont[markerSetId])),
                                  std(abs(deltaCont[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaCompSet[markerSetId])),
                                  std(abs(deltaCompSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaContSet[markerSetId])),
                                  std(abs(deltaContSet[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaCompRefined[markerSetId])),
                                  std(abs(deltaCompRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaContRefined[markerSetId])),
                                  std(abs(deltaContRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaCompSetRefined[markerSetId])),
                                  std(abs(deltaCompSetRefined[markerSetId]))))
                summaryOut.write('\t%.3f\t%.3f' %
                                 (mean(abs(deltaContSetRefined[markerSetId])),
                                  std(abs(deltaContSetRefined[markerSetId]))))
                summaryOut.write('\n')

                fout.write(testGenomeId + '\t%d\t%.2f\t%.2f' %
                           (contigLen, percentComp, percentCont))
                fout.write('\t' + taxonomy + '\t' + markerSetId + '\t' +
                           str(numDescendants[markerSetId]))
                fout.write(
                    '\t%.3f\t%.3f' %
                    (unmodifiedComp[markerSetId], unmodifiedCont[markerSetId]))
                fout.write('\t%s' % ','.join(map(str, deltaComp[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, deltaCont[markerSetId])))
                fout.write('\t%s' %
                           ','.join(map(str, deltaCompSet[markerSetId])))
                fout.write('\t%s' %
                           ','.join(map(str, deltaContSet[markerSetId])))
                fout.write('\t%s' %
                           ','.join(map(str, deltaCompRefined[markerSetId])))
                fout.write('\t%s' %
                           ','.join(map(str, deltaContRefined[markerSetId])))
                fout.write(
                    '\t%s' %
                    ','.join(map(str, deltaCompSetRefined[markerSetId])))
                fout.write(
                    '\t%s' %
                    ','.join(map(str, deltaContSetRefined[markerSetId])))
                fout.write('\t%s' % ','.join(map(str, trueComps)))
                fout.write('\t%s' % ','.join(map(str, trueConts)))
                fout.write('\n')

        summaryOut.close()
        fout.close()

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numReplicates,
            minScaffolds, numThreads):
        random.seed(0)

        print '\n  Reading reference genome tree.'
        treeFile = os.path.join('/srv', 'db', 'checkm', 'genome_tree',
                                'genome_tree_prok.refpkg',
                                'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           as_rooted=True,
                                           preserve_underscores=True)

        print '    Number of taxa in tree: %d' % (len(tree.leaf_nodes()))

        genomesInTree = set()
        for leaf in tree.leaf_iter():
            genomesInTree.add(leaf.taxon.label.replace('IMG_', ''))

        # get all draft genomes consisting of a user-specific minimum number of scaffolds
        print ''
        metadata = self.img.genomeMetadata()
        print '  Total genomes: %d' % len(metadata)

        draftGenomeIds = genomesInTree - self.img.filterGenomeIds(
            genomesInTree, metadata, 'status', 'Finished')
        print '  Number of draft genomes: %d' % len(draftGenomeIds)

        genomeIdsToTest = set()
        for genomeId in draftGenomeIds:
            if metadata[genomeId]['scaffold count'] >= minScaffolds:
                genomeIdsToTest.add(genomeId)

        print '  Number of draft genomes with >= %d scaffolds: %d' % (
            minScaffolds, len(genomeIdsToTest))

        print ''
        start = time.time()
        self.markerSetBuilder.readLineageSpecificGenesToRemove()
        end = time.time()
        print '    readLineageSpecificGenesToRemove: %.2f' % (end - start)

        print '  Pre-computing genome information for calculating marker sets:'
        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyScaffolds(metadata.keys())
        end = time.time()
        print '    precomputeGenomeFamilyScaffolds: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(
            metadata.keys())
        end = time.time()
        print '    globalGeneCountTable: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeSeqLens(metadata.keys())
        end = time.time()
        print '    precomputeGenomeSeqLens: %.2f' % (end - start)

        start = time.time()
        self.markerSetBuilder.precomputeGenomeFamilyPositions(
            metadata.keys(), 0)
        end = time.time()
        print '    precomputeGenomeFamilyPositions: %.2f' % (end - start)

        print ''
        print '  Evaluating %d test genomes.' % len(genomeIdsToTest)

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for testGenomeId in list(genomeIdsToTest):
            workerQueue.put(testGenomeId)

        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [
            mp.Process(target=self.__workerThread,
                       args=(tree, metadata, genomeIdsToTest,
                             ubiquityThreshold, singleCopyThreshold,
                             numReplicates, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__writerThread,
                               args=(len(genomeIdsToTest), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        writerQueue.put((None, None, None, None, None, None, None, None, None,
                         None, None, None, None, None, None, None, None, None))
        writeProc.join()
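
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original code):
# sampleGenomeScaffoldsWithoutReplacement() is not shown in this snippet; the
# simulation above only relies on it returning (i) the ids of the retained
# scaffolds and (ii) the completeness actually achieved, which can exceed the
# requested fraction because whole scaffolds are kept. One plausible,
# hypothetical way to do that:
import random

def sample_scaffolds(targetFrac, seqLens, genomeSize):
    """Randomly retain whole scaffolds until >= targetFrac of genomeSize is covered."""
    retained = set()
    retainedLen = 0
    for seqId in random.sample(list(seqLens), len(seqLens)):  # random order, no replacement
        if retainedLen >= targetFrac * genomeSize:
            break
        retained.add(seqId)
        retainedLen += seqLens[seqId]
    truePercent = 100.0 * retainedLen / genomeSize  # percentage actually retained, matching trueRetainedPer above
    return retained, truePercent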
示例#24
0
    def run(self, geneTreeDir, treeExtension, consistencyThreshold,
            minTaxaForAverage, outputFile, outputDir):
        # make sure output directory is empty
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        files = os.listdir(outputDir)
        for f in files:
            if os.path.isfile(os.path.join(outputDir, f)):
                os.remove(os.path.join(outputDir, f))

        # get TIGRFam info
        descDict = {}
        files = os.listdir('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO')
        for f in files:
            shortDesc = longDesc = ''
            for line in open('/srv/db/tigrfam/13.0/TIGRFAMs_13.0_INFO/' + f):
                lineSplit = line.split('  ')
                if lineSplit[0] == 'AC':
                    acc = lineSplit[1].strip()
                elif lineSplit[0] == 'DE':
                    shortDesc = lineSplit[1].strip()
                elif lineSplit[0] == 'CC':
                    longDesc = lineSplit[1].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get PFam info
        for line in open('/srv/db/pfam/27/Pfam-A.clans.tsv'):
            lineSplit = line.split('\t')
            acc = lineSplit[0]
            shortDesc = lineSplit[3]
            longDesc = lineSplit[4].strip()

            descDict[acc] = [shortDesc, longDesc]

        # get IMG taxonomy
        img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv',
                  '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        metadata = img.genomeMetadata()
        genomeIdToTaxonomy = {}
        for genomeId, m in metadata.iteritems():
            genomeIdToTaxonomy[genomeId] = m['taxonomy']

        # perform analysis for each tree
        treeFiles = os.listdir(geneTreeDir)
        allResults = {}
        allTaxa = [set([]), set([]), set([])]
        taxaCounts = {}
        avgConsistency = {}
        for treeFile in treeFiles:
            if not treeFile.endswith(treeExtension):
                continue

            print treeFile
            tree = dendropy.Tree.get_from_path(os.path.join(
                geneTreeDir, treeFile),
                                               schema='newick',
                                               as_rooted=True,
                                               preserve_underscores=True)

            domainConsistency = {}
            phylaConsistency = {}
            classConsistency = {}
            consistencyDict = [
                domainConsistency, phylaConsistency, classConsistency
            ]

            # get abundance of taxa at different taxonomic ranks
            totals = [{}, {}, {}]
            leaves = tree.leaf_nodes()
            print '  Number of leaves: ' + str(len(leaves))
            totalValidLeaves = 0

            for leaf in leaves:
                genomeId = self.__genomeId(leaf.taxon.label)

                if genomeId not in metadata:
                    print '[Error] Genome is missing metadata: ' + genomeId
                    sys.exit()

                totalValidLeaves += 1
                taxonomy = genomeIdToTaxonomy[genomeId]
                for r in xrange(0, 3):
                    totals[r][taxonomy[r]] = totals[r].get(taxonomy[r], 0) + 1
                    consistencyDict[r][taxonomy[r]] = 0
                    allTaxa[r].add(taxonomy[r])

            taxaCounts[treeFile] = [
                totalValidLeaves, totals[0].get('Bacteria', 0),
                totals[0].get('Archaea', 0)
            ]

            # find highest consistency nodes (congruent descendant taxa / (total taxa + incongruent descendant taxa))
            internalNodes = tree.internal_nodes()
            for node in internalNodes:
                leaves = node.leaf_nodes()

                for r in xrange(0, 3):
                    leafCounts = {}
                    for leaf in leaves:
                        genomeId = self.__genomeId(leaf.taxon.label)
                        taxonomy = genomeIdToTaxonomy[genomeId]
                        leafCounts[taxonomy[r]] = leafCounts.get(
                            taxonomy[r], 0) + 1

                    # calculate consistency for node
                    for taxa in consistencyDict[r]:
                        totalTaxaCount = totals[r][taxa]
                        if totalTaxaCount <= 1 or taxa == 'unclassified':
                            consistencyDict[r][taxa] = 'N/A'
                            continue

                        taxaCount = leafCounts.get(taxa, 0)
                        incongruentTaxa = len(leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount +
                                                incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

                        # consider the clade in the other direction since the trees are unrooted
                        taxaCount = totalTaxaCount - leafCounts.get(taxa, 0)
                        incongruentTaxa = totalValidLeaves - len(
                            leaves) - taxaCount
                        c = float(taxaCount) / (totalTaxaCount +
                                                incongruentTaxa)
                        if c > consistencyDict[r][taxa]:
                            consistencyDict[r][taxa] = c

            # write results
            consistencyDir = os.path.join(outputDir, 'consistency')
            if not os.path.exists(consistencyDir):
                os.makedirs(consistencyDir)
            fout = open(
                os.path.join(consistencyDir, treeFile + '.results.tsv'), 'w')
            fout.write('Tree')
            for r in xrange(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    fout.write('\t' + taxa)
            fout.write('\n')

            fout.write(treeFile)
            for r in xrange(0, 3):
                for taxa in sorted(consistencyDict[r].keys()):
                    if consistencyDict[r][taxa] != 'N/A':
                        fout.write('\t%.2f' % (consistencyDict[r][taxa] * 100))
                    else:
                        fout.write('\tN/A')
            fout.close()

            # calculate average consistency at each taxonomic rank
            average = []
            for r in xrange(0, 3):
                sumConsistency = []
                for taxa in consistencyDict[r]:
                    if totals[r][taxa] > minTaxaForAverage and consistencyDict[
                            r][taxa] != 'N/A':
                        sumConsistency.append(consistencyDict[r][taxa])

                if len(sumConsistency) > 0:
                    average.append(sum(sumConsistency) / len(sumConsistency))
                else:
                    average.append(0)
            avgConsistency[treeFile] = average
            allResults[treeFile] = consistencyDict

            print '  Average consistency: ' + str(
                average) + ', mean = %.2f' % (sum(average) / len(average))
            print ''

        # print out combined results
        fout = open(outputFile, 'w')
        fout.write(
            'Tree\tShort Desc.\tLong Desc.\tAlignment Length\t# Taxa\t# Bacteria\t# Archaea\tAvg. Consistency\tAvg. Domain Consistency\tAvg. Phylum Consistency\tAvg. Class Consistency'
        )
        for r in xrange(0, 3):
            for t in sorted(allTaxa[r]):
                fout.write('\t' + t)
        fout.write('\n')

        filteredGeneTrees = 0
        retainedGeneTrees = 0
        for treeFile in sorted(allResults.keys()):
            consistencyDict = allResults[treeFile]
            treeId = treeFile[0:treeFile.find('.')].replace('pfam', 'PF')

            fout.write(treeId + '\t' + descDict[treeId][0] + '\t' +
                       descDict[treeId][1])

            # Taxa count
            fout.write('\t' + str(taxaCounts[treeFile][0]) + '\t' +
                       str(taxaCounts[treeFile][1]) + '\t' +
                       str(taxaCounts[treeFile][2]))

            avgCon = 0
            for r in xrange(0, 3):
                avgCon += avgConsistency[treeFile][r]
            avgCon /= 3
            fout.write('\t' + str(avgCon))

            if avgCon >= consistencyThreshold:
                retainedGeneTrees += 1
                os.system('cp ' + os.path.join(geneTreeDir, treeFile) + ' ' +
                          os.path.join(outputDir, treeFile))
            else:
                filteredGeneTrees += 1
                print 'Filtered %s with an average consistency of %.4f.' % (
                    treeFile, avgCon)

            for r in xrange(0, 3):
                fout.write('\t' + str(avgConsistency[treeFile][r]))

            for r in xrange(0, 3):
                for t in sorted(allTaxa[r]):
                    if t in consistencyDict[r]:
                        if consistencyDict[r][t] != 'N/A':
                            fout.write('\t%.2f' %
                                       (consistencyDict[r][t] * 100))
                        else:
                            fout.write('\tN/A')
                    else:
                        fout.write('\tN/A')
            fout.write('\n')
        fout.close()

        print 'Retained gene trees: ' + str(retainedGeneTrees)
        print 'Filtered gene trees: ' + str(filteredGeneTrees)
class IdentifyGeneLossAndDuplication(object):
    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')

    def run(self, ubiquityThreshold, minGenomes):
        # Pre-compute gene count table
        print 'Computing gene count table.'
        start = time.time()
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
        end = time.time()
        print '    globalGeneCountTable: %.2f' % (end - start)

        # read selected node for defining marker set
        print 'Reading node defining marker set for each internal node.'
        selectedMarkerNode = {}
        for line in open('/srv/whitlam/bio/db/checkm/selected_marker_sets.tsv'):
            lineSplit = line.split('\t')
            selectedMarkerNode[lineSplit[0].strip()] = lineSplit[1].strip()
            
        # read duplicate taxa
        print 'Reading list of identical taxa in genome tree.'
        duplicateTaxa = {}
        for line in open('/srv/whitlam/bio/db/checkm/genome_tree/genome_tree.derep.txt'):
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]
        
        # read in node metadata
        print 'Reading node metadata.'
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        
        # read genome tree
        print 'Reading in genome tree.'
                
        treeFile = '/srv/whitlam/bio/db/checkm/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre'
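        # note: the as_rooted keyword follows the dendropy 3.x API; dendropy 4
        # appears to replace it with rooting='force-rooted'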
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
        
        # determine lineage-specific gene loss and duplication (relative to potential marker genes used by a node)
        print 'Determining lineage-specific gene loss and duplication'
        
        fout = open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv', 'w')
        
        processed = 0
        numInternalNodes = len(tree.internal_nodes())
        for node in tree.internal_nodes():
            processed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) internal nodes.' % (processed, numInternalNodes, float(processed)*100/numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            nodeId = node.label.split('|')[0]
            
            missingGenes = []
            duplicateGenes = []
            
            nodeStats = uniqueIdToLineageStatistics[nodeId]
            if nodeStats['# genomes'] >= minGenomes:               
                # get marker genes defined for current node along with all parental nodes    
                markerGenes = set() 
                parentNode = node
                while parentNode != None:                     
                    parentNodeId = parentNode.label.split('|')[0]
                
                    stats = uniqueIdToLineageStatistics[parentNodeId]
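                    # the 'marker set' column stores a Python literal as a string;
                    # eval() reconstructs the structure expected by MarkerSet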
                    markerSet = MarkerSet(parentNodeId, stats['taxonomy'], stats['# genomes'], eval(stats['marker set']))
                    markerGenes = markerGenes.union(markerSet.getMarkerGenes())
                
                    parentNode = parentNode.parent_node
                
                # silly hack since PFAM ids are inconsistent between the PFAM data and IMG data
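                # e.g. an accession such as 'PF00123.14' (illustrative) becomes 'pfam00123'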
                revisedMarkerGeneIds = set()
                for mg in markerGenes:
                    if mg.startswith('PF'):
                        revisedMarkerGeneIds.add(mg[0:mg.rfind('.')].replace('PF', 'pfam'))
                    else:
                        revisedMarkerGeneIds.add(mg)
                
                # get all genomes below the internal node (including genomes removed as duplicates)
                genomeIds = []
                for leaf in node.leaf_nodes():
                    genomeIds.append(leaf.taxon.label.replace('IMG_', ''))
                    if leaf.taxon.label in duplicateTaxa:
                        for genomeId in duplicateTaxa[leaf.taxon.label]:
                            genomeIds.append(genomeId.replace('IMG_', ''))
                            
                
                missingGenes = self.markerSetBuilder.missingGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
                duplicateGenes = self.markerSetBuilder.duplicateGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
                
            fout.write('%s\t%s\t%s\n' % (nodeId, str(missingGenes), str(duplicateGenes)))
            
        sys.stdout.write('\n')
            
        fout.close()
Example #26
0
class SimComparePlots(object):
    def __init__(self):
        
        self.plotPrefix = './simulations/simulation.draft.w_refinement_50'
        self.simCompareFile = './simulations/simCompare.draft.w_refinement_50.full.tsv'
        self.simCompareMarkerSetOut = './simulations/simCompare.draft.marker_set_table.w_refinement_50.tsv'
        self.simCompareConditionOut = './simulations/simCompare.draft.condition_table.w_refinement_50.tsv'
        self.simCompareTaxonomyTableOut = './simulations/simCompare.draft.taxonomy_table.w_refinement_50.tsv'
        self.simCompareRefinementTableOut = './simulations/simCompare.draft.refinment_table.w_refinement_50.tsv'
               
        #self.plotPrefix = './simulations/simulation.scaffolds.draft.w_refinement_50'
        #self.simCompareFile = './simulations/simCompare.scaffolds.draft.w_refinement_50.full.tsv'
        #self.simCompareMarkerSetOut = './simulations/simCompare.scaffolds.draft.marker_set_table.w_refinement_50.tsv'
        #self.simCompareConditionOut = './simulations/simCompare.scaffolds.draft.condition_table.w_refinement_50.tsv'
        #self.simCompareTaxonomyTableOut = './simulations/simCompare.scaffolds.draft.taxonomy_table.w_refinement_50.tsv'
        #self.simCompareRefinementTableOut = './simulations/simCompare.scaffolds.draft.refinment_table.w_refinement_50.tsv'
        
        #self.plotPrefix = './simulations/simulation.random_scaffolds.w_refinement_50'
        #self.simCompareFile = './simulations/simCompare.random_scaffolds.w_refinement_50.full.tsv'
        #self.simCompareMarkerSetOut = './simulations/simCompare.random_scaffolds.marker_set_table.w_refinement_50.tsv'
        #self.simCompareConditionOut = './simulations/simCompare.random_scaffolds.condition_table.w_refinement_50.tsv'
        #self.simCompareTaxonomyTableOut = './simulations/simCompare.random_scaffolds.taxonomy_table.w_refinement_50.tsv'
        #self.simCompareRefinementTableOut = './simulations/simCompare.random_scaffolds.refinment_table.w_refinement_50.tsv'
        
        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
        
        self.compsToConsider = [0.5, 0.7, 0.8, 0.9] #[0.5, 0.7, 0.8, 0.9]
        self.contsToConsider = [0.05, 0.1, 0.15] #[0.05, 0.1, 0.15]
        
        self.dpi = 1200
  
    def __readResults(self, filename):
        results = defaultdict(dict)
        genomeIds = set()
        with open(filename) as f:
            f.readline()
            for line in f:
                lineSplit = line.split('\t')
                
                simId = lineSplit[0]
                genomeId = simId.split('-')[0]
                genomeIds.add(genomeId)
                
                bestCompIM = [float(x) for x in lineSplit[6].split(',')]
                bestContIM = [float(x) for x in lineSplit[7].split(',')]
                
                bestCompMS = [float(x) for x in lineSplit[8].split(',')]
                bestContMS = [float(x) for x in lineSplit[9].split(',')]
                                
                domCompIM = [float(x) for x in lineSplit[10].split(',')]
                domContIM = [float(x) for x in lineSplit[11].split(',')]
                
                domCompMS = [float(x) for x in lineSplit[12].split(',')]
                domContMS = [float(x) for x in lineSplit[13].split(',')]
                
                simCompIM = [float(x) for x in lineSplit[14].split(',')]
                simContIM = [float(x) for x in lineSplit[15].split(',')]
                
                simCompMS = [float(x) for x in lineSplit[16].split(',')]
                simContMS = [float(x) for x in lineSplit[17].split(',')]
                
                simCompRMS = [float(x) for x in lineSplit[18].split(',')]
                simContRMS = [float(x) for x in lineSplit[19].split(',')]
                
                results[simId] = [bestCompIM, bestContIM, bestCompMS, bestContMS, domCompIM, domContIM, domCompMS, domContMS, simCompIM, simContIM, simCompMS, simContMS, simCompRMS, simContRMS]
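                # index layout: 0-3 = bestCompIM, bestContIM, bestCompMS, bestContMS;
                # 4-7 = the corresponding dom* values; 8-11 = the sim* values;
                # 12-13 = simCompRMS, simContRMS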
                
        print '    Number of test genomes: ' + str(len(genomeIds))
        
        return results
    
    def markerSets(self, results):
        # summarize results from IM vs MS
        print '  Tabulating results for domain-level marker genes vs marker sets.'
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))

        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
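            # experimental condition key, e.g. '0.7-0.05-20000'
            # (completeness-contamination-fragment length)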
            
            compDataDict[expCondStr]['IM'] += results[simId][4]
            compDataDict[expCondStr]['MS'] += results[simId][6]

            contDataDict[expCondStr]['IM'] += results[simId][5]
            contDataDict[expCondStr]['MS'] += results[simId][7]
                
        print '  There are %d unique genomes.' % len(genomeIds)
              
        sys.stdout.write('\n')
        
        print '    There are %d experimental conditions.' % (len(compDataDict))
                
        # plot data
        print '  Plotting results.'
        compData = []
        contData = []
        rowLabels = []
        
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for seqLen in [20000]: 
                    for msStr in ['MS', 'IM']:
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                                       
        print 'MS:\t%.2f\t%.2f' % (mean(abs(array(compData[0::2]))), mean(abs(array(contData[0::2]))))
        print 'IM:\t%.2f\t%.2f' % (mean(abs(array(compData[1::2]))), mean(abs(array(contData[1::2]))))   
            
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.markerSets.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 2, dpi = self.dpi)
        
        # print table of results 
        tableOut = open(self.simCompareMarkerSetOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tIM (5kb)\t\tMS (5kb)\t\tIM (20kb)\t\tMS (20kb)\t\tIM (50kb)\t\tMS (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                     
                    meanCompIM = mean(abs(array(compDataDict[expCondStr]['IM'])))
                    stdCompIM = std(abs(array(compDataDict[expCondStr]['IM'])))
                    meanContIM = mean(abs(array(contDataDict[expCondStr]['IM'])))
                    stdContIM = std(abs(array(contDataDict[expCondStr]['IM'])))
                    
                    avgComp[seqLen]['IM'] += compDataDict[expCondStr]['IM']
                    avgCont[seqLen]['IM'] += contDataDict[expCondStr]['IM']
                    
                    meanCompMS = mean(abs(array(compDataDict[expCondStr]['MS'])))
                    stdCompMS = std(abs(array(compDataDict[expCondStr]['MS'])))
                    meanContMS = mean(abs(array(contDataDict[expCondStr]['MS'])))
                    stdContMS = std(abs(array(contDataDict[expCondStr]['MS'])))
                    
                    avgComp[seqLen]['MS'] += compDataDict[expCondStr]['MS']
                    avgCont[seqLen]['MS'] += contDataDict[expCondStr]['MS']
                    
                    tableOut.write('\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f' % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompIM = mean(abs(array(avgComp[seqLen]['IM'])))
            stdCompIM = std(abs(array(avgComp[seqLen]['IM'])))
            meanContIM = mean(abs(array(avgCont[seqLen]['IM'])))
            stdContIM = std(abs(array(avgCont[seqLen]['IM'])))
            
            meanCompMS = mean(abs(array(avgComp[seqLen]['MS'])))
            stdCompMS = std(abs(array(avgComp[seqLen]['MS'])))
            meanContMS = mean(abs(array(avgCont[seqLen]['MS'])))
            stdContMS = std(abs(array(avgCont[seqLen]['MS'])))
            
            tableOut.write('\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f\t%.1f+/-%.2f' % (meanCompIM, stdCompIM, meanCompMS, stdCompMS, meanContIM, stdContIM, meanContMS, stdContMS))
                        
        tableOut.write('\n')     
                
        tableOut.close()
    
    def conditionsPlot(self, results):
        # summarize results for each experimental condition  
        print '  Tabulating results for each experimental condition using marker sets.'
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        compOutliers = defaultdict(list)
        contOutliers = defaultdict(list)
        
        genomeIds = set()
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            genomeIds.add(genomeId)
            expCondStr = str(float(comp)) + '-' + str(float(cont)) + '-' + str(int(seqLen))
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            compDataDict[expCondStr]['best'] += results[simId][2]
            compDataDict[expCondStr]['domain'] += results[simId][6]
            compDataDict[expCondStr]['selected'] += results[simId][10]
            
            for dComp in results[simId][2]:
                compOutliers[expCondStr] += [[dComp, genomeId]]
            
            contDataDict[expCondStr]['best'] += results[simId][3]
            contDataDict[expCondStr]['domain'] += results[simId][7]
            contDataDict[expCondStr]['selected'] += results[simId][11]
            
            for dCont in results[simId][3]:
                contOutliers[expCondStr] += [[dCont, genomeId]]
                
        print '  There are %d unique genomes.' % len(genomeIds)
              
        sys.stdout.write('\n')
        
        print '    There are %d experimental conditions.' % (len(compDataDict))
                
        # plot data
        print '  Plotting results.'
        compData = []
        contData = []
        rowLabels = []
        
        foutComp = open('./simulations/simulation.scaffolds.draft.comp_outliers.domain.tsv', 'w')
        foutCont = open('./simulations/simulation.scaffolds.draft.cont_outliers.domain.tsv', 'w')
        for comp in self.compsToConsider:
            for cont in self.contsToConsider:
                for msStr in ['best', 'selected', 'domain']:
                    for seqLen in [20000]: 
                        rowLabels.append(msStr +': %d%%, %d%%' % (comp*100, cont*100))
                        
                        expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                        compData.append(compDataDict[expCondStr][msStr])
                        contData.append(contDataDict[expCondStr][msStr])  
                    
                # report completeness outliers
                foutComp.write(expCondStr)

                compOutliers[expCondStr].sort()
                
                dComps = array([r[0] for r in compOutliers[expCondStr]])
                perc1 = scoreatpercentile(dComps, 1)
                perc99 = scoreatpercentile(dComps, 99)
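                # genomes whose delta completeness falls outside the 1st-99th
                # percentile band are reported as outliers below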
                print expCondStr, perc1, perc99
                
                foutComp.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in compOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutComp.write('\t' + genomeId + ': ' + str(count))
                foutComp.write('\n')
                
                # report contamination outliers
                foutCont.write(expCondStr)

                contOutliers[expCondStr].sort()
                
                dConts = array([r[0] for r in contOutliers[expCondStr]])
                perc1 = scoreatpercentile(dConts, 1)
                perc99 = scoreatpercentile(dConts, 99)
                
                foutCont.write('\t%.2f\t%.2f' % (perc1, perc99))
                
                outliers = []
                for item in contOutliers[expCondStr]:
                    if item[0] < perc1 or item[0] > perc99:
                        outliers.append(item[1])
                        
                outlierCount = Counter(outliers)
                for genomeId, count in outlierCount.most_common():
                    foutCont.write('\t' + genomeId + ': ' + str(count))
                foutCont.write('\n')
                
        foutComp.close()
        foutCont.close()
                               
        print 'best:\t%.2f\t%.2f' % (mean(abs(array(compData[0::3]))), mean(abs(array(contData[0::3]))))
        print 'selected:\t%.2f\t%.2f' % (mean(abs(array(compData[1::3]))), mean(abs(array(contData[1::3]))))   
        print 'domain:\t%.2f\t%.2f' % (mean(abs(array(compData[2::3]))), mean(abs(array(contData[2::3]))))   

        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.conditions.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', 'Simulation Conditions', 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
        
        # print table of results 
        tableOut = open(self.simCompareConditionOut, 'w')
        tableOut.write('Comp. (%)\tCont. (%)\tbest (5kb)\t\tselected (5kb)\t\tdomain (5kb)\t\tbest (20kb)\t\tselected (20kb)\t\tdomain (20kb)\t\tbest (50kb)\t\tselected (50kb)\t\tdomain (50kb)\n')
        
        avgComp = defaultdict(lambda : defaultdict(list))
        avgCont = defaultdict(lambda : defaultdict(list))
        for comp in [0.5, 0.7, 0.8, 0.9, 0.95, 1.0]:
            for cont in [0.0, 0.05, 0.1, 0.15, 0.2]:
                
                tableOut.write('%d\t%d' % (comp*100, cont*100))
                
                for seqLen in [5000, 20000, 50000]:
                    expCondStr = str(comp) + '-' + str(cont) + '-' + str(seqLen)
                   
                    meanCompD = mean(abs(array(compDataDict[expCondStr]['domain'])))
                    stdCompD = std(abs(array(compDataDict[expCondStr]['domain'])))
                    meanContD = mean(abs(array(contDataDict[expCondStr]['domain'])))
                    stdContD = std(abs(array(contDataDict[expCondStr]['domain'])))
                    
                    avgComp[seqLen]['domain'] += compDataDict[expCondStr]['domain']
                    avgCont[seqLen]['domain'] += contDataDict[expCondStr]['domain']
                    
                    meanCompS = mean(abs(array(compDataDict[expCondStr]['selected'])))
                    stdCompS = std(abs(array(compDataDict[expCondStr]['selected'])))
                    meanContS = mean(abs(array(contDataDict[expCondStr]['selected'])))
                    stdContS = std(abs(array(contDataDict[expCondStr]['selected'])))
                    
                    avgComp[seqLen]['selected'] += compDataDict[expCondStr]['selected']
                    avgCont[seqLen]['selected'] += contDataDict[expCondStr]['selected']
                    
                    meanCompB = mean(abs(array(compDataDict[expCondStr]['best'])))
                    stdCompB = std(abs(array(compDataDict[expCondStr]['best'])))
                    meanContB = mean(abs(array(contDataDict[expCondStr]['best'])))
                    stdContB = std(abs(array(contDataDict[expCondStr]['best'])))
                    
                    avgComp[seqLen]['best'] += compDataDict[expCondStr]['best']
                    avgCont[seqLen]['best'] += contDataDict[expCondStr]['best']
                    
                    tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                tableOut.write('\n')
                
        tableOut.write('\tAverage:')
        for seqLen in [5000, 20000, 50000]: 
            meanCompD = mean(abs(array(avgComp[seqLen]['domain'])))
            stdCompD = std(abs(array(avgComp[seqLen]['domain'])))
            meanContD = mean(abs(array(avgCont[seqLen]['domain'])))
            stdContD = std(abs(array(avgCont[seqLen]['domain'])))
            
            meanCompS = mean(abs(array(avgComp[seqLen]['selected'])))
            stdCompS = std(abs(array(avgComp[seqLen]['selected'])))
            meanContS = mean(abs(array(avgCont[seqLen]['selected'])))
            stdContS = std(abs(array(avgCont[seqLen]['selected'])))
            
            meanCompB = mean(abs(array(avgComp[seqLen]['best'])))
            stdCompB = std(abs(array(avgComp[seqLen]['best'])))
            meanContB = mean(abs(array(avgCont[seqLen]['best'])))
            stdContB = std(abs(array(avgCont[seqLen]['best'])))
            
            tableOut.write('\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f' % (meanCompD, meanCompS, meanCompB, meanContD, meanContS, meanContB))
                        
        tableOut.write('\n')     
                
        tableOut.close()
        
    def taxonomicPlots(self, results):
        # summarize results for different taxonomic groups  
        print '  Tabulating results for taxonomic groups.'
        
        metadata = self.img.genomeMetadata()
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        ranksToProcess = 3
        taxaByRank = [set() for _ in xrange(0, ranksToProcess)]
        
        overallComp = []
        overallCont = []
                
        genomeInTaxon = defaultdict(set)
        testCases = 0
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            
            if seqLen != '20000':
                continue
            
            if str(float(comp)) in ['0.5', '0.7', '0.8', '0.9'] and str(float(cont)) in ['0.05', '0.10', '0.1', '0.15']:
                print comp, cont
                taxonomy = metadata[genomeId]['taxonomy']
                
                testCases += 1
                
                comps.add(float(comp))
                conts.add(float(cont))
                seqLens.add(int(seqLen))
                
                overallComp += results[simId][10]
                overallCont += results[simId][11]
                
                for r in xrange(0, ranksToProcess):
                    taxon = taxonomy[r]
                    
                    if r == 0 and taxon == 'unclassified':
                        print '*****************************Unclassified at domain-level*****************'
                        continue
                    
                    if taxon == 'unclassified':
                        continue
                    
                    taxon = rankPrefixes[r] + taxon
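                    # rankPrefixes (defined elsewhere) presumably prepends a rank-specific
                    # prefix so identical names at different ranks remain distinct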
                    
                    taxaByRank[r].add(taxon)
                                                    
                    compDataDict[taxon]['best'] += results[simId][2]
                    compDataDict[taxon]['domain'] += results[simId][6]
                    compDataDict[taxon]['selected'] += results[simId][10]
                    
                    contDataDict[taxon]['best'] += results[simId][3]
                    contDataDict[taxon]['domain'] += results[simId][7]
                    contDataDict[taxon]['selected'] += results[simId][11]
                    
                    genomeInTaxon[taxon].add(genomeId)
            
        sys.stdout.write('\n')
        
        print 'Test cases', testCases
        
        print ''        
        print 'Creating plots for:'
        print '  comps = ', comps
        print '  conts = ', conts
        
        print ''
        print '    There are %d taxa.' % (len(compDataDict))
        
        print ''
        print '  Overall bias:'
        print '    Selected comp: %.2f' % mean(overallComp)
        print '    Selected cont: %.2f' % mean(overallCont)
        
        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)
                
        # plot data
        print '  Plotting results.'
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            for msStr in ['best', 'selected', 'domain']:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 10: # skip groups with only a few genomes
                    continue
                
                rowLabels.append(msStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][msStr])
                contData.append(contDataDict[taxon][msStr])        
                
        for i, rowLabel in enumerate(rowLabels):
            print rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i]))))            
                  
        # print taxonomic table of results organized by class
        taxonomyTableOut = open(self.simCompareTaxonomyTableOut, 'w')
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2: # skip groups with a single genome
                continue
                
            taxonomyTableOut.write(taxon + '\t' + str(numGenomes))
            for msStr in ['domain', 'selected']:                
                meanTaxonComp = mean(abs(array(compDataDict[taxon][msStr])))
                stdTaxonComp = std(abs(array(compDataDict[taxon][msStr])))
                meanTaxonCont = mean(abs(array(contDataDict[taxon][msStr])))
                stdTaxonCont = std(abs(array(contDataDict[taxon][msStr])))
                
                taxonomyTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))
            taxonomyTableOut.write('\n')
        taxonomyTableOut.close()
        
        # create box plot
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix +  '.taxonomy.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', None, 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
    
    
    def refinementPlots(self, results):
        # summarize results for different CheckM refinements 
        print '  Tabulating results for different refinements.'
        
        metadata = self.img.genomeMetadata()
        
        itemsProcessed = 0      
        compDataDict = defaultdict(lambda : defaultdict(list))
        contDataDict = defaultdict(lambda : defaultdict(list))
        comps = set()
        conts = set()
        seqLens = set()
        
        ranksToProcess = 3
        taxaByRank = [set() for _ in xrange(0, ranksToProcess)]
        
        overallCompIM = []
        overallContIM = [] 
        
        overallCompMS = []
        overallContMS = [] 
        
        overallCompRMS = []
        overallContRMS = [] 
        
        genomeInTaxon = defaultdict(set)
        
        testCases = 0
        for simId in results:
            itemsProcessed += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) test cases.' % (itemsProcessed, len(results), float(itemsProcessed)*100/len(results))
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()
            
            genomeId, seqLen, comp, cont = simId.split('-')
            taxonomy = metadata[genomeId]['taxonomy']
            
            if float(comp) < 0.7 or float(cont) > 0.1:
                continue
            
            comps.add(float(comp))
            conts.add(float(cont))
            seqLens.add(int(seqLen))
            
            overallCompIM.append(results[simId][8])
            overallContIM.append(results[simId][9])
            
            overallCompMS.append(results[simId][10])
            overallContMS.append(results[simId][11])
            
            overallCompRMS.append(results[simId][12])
            overallContRMS.append(results[simId][13])
            
            for r in xrange(0, ranksToProcess):
                taxon = taxonomy[r]
                
                if taxon == 'unclassified':
                    continue
                
                taxaByRank[r].add(taxon)
                
                compDataDict[taxon]['IM'] += results[simId][8]
                compDataDict[taxon]['MS'] += results[simId][10]
                compDataDict[taxon]['RMS'] += results[simId][12]
                
                contDataDict[taxon]['IM'] += results[simId][9]
                contDataDict[taxon]['MS'] += results[simId][11]
                contDataDict[taxon]['RMS'] += results[simId][13]
                                
                genomeInTaxon[taxon].add(genomeId)
            
        sys.stdout.write('\n')
        
        print 'Creating plots for:'
        print '  comps = ', comps
        print '  conts = ', conts
        
        print ''
        print '    There are %d taxa.' % (len(compDataDict))
        print ''
        print 'Percentage change MS-IM comp: %.4f' % ((mean(abs(array(overallCompMS))) - mean(abs(array(overallCompIM)))) * 100 / mean(abs(array(overallCompIM))))
        print 'Percentage change MS-IM cont: %.4f' % ((mean(abs(array(overallContMS))) - mean(abs(array(overallContIM)))) * 100 / mean(abs(array(overallContIM))))
        print ''
        print 'Percentage change RMS-MS comp: %.4f' % ((mean(abs(array(overallCompRMS))) - mean(abs(array(overallCompMS)))) * 100 / mean(abs(array(overallCompIM))))
        print 'Percentage change RMS-MS cont: %.4f' % ((mean(abs(array(overallContRMS))) - mean(abs(array(overallContMS)))) * 100 / mean(abs(array(overallContIM))))
        
        print ''
        
        # get list of ordered taxa by rank
        orderedTaxa = []
        for taxa in taxaByRank:
            orderedTaxa += sorted(taxa)
             
        # print table of results organized by class
        refinmentTableOut = open(self.simCompareRefinementTableOut, 'w')
        for taxon in orderedTaxa:
            numGenomes = len(genomeInTaxon[taxon])
            if numGenomes < 2: # skip groups with a single genome
                continue
                
            refinmentTableOut.write(taxon + '\t' + str(numGenomes))
            for refineStr in ['IM', 'MS']:               
                meanTaxonComp = mean(abs(array(compDataDict[taxon][refineStr])))
                stdTaxonComp = std(abs(array(compDataDict[taxon][refineStr])))
                meanTaxonCont = mean(abs(array(contDataDict[taxon][refineStr])))
                stdTaxonCont = std(abs(array(contDataDict[taxon][refineStr])))
                
                refinmentTableOut.write('\t%.1f +/- %.2f\t%.1f +/- %.2f' % (meanTaxonComp, stdTaxonComp, meanTaxonCont, stdTaxonCont))
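            # meanTaxonComp/meanTaxonCont still hold the 'MS' values from the last
            # loop iteration, so the percentage change below is MS relative to IM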
            
            perCompChange = (mean(abs(array(compDataDict[taxon]['IM']))) - meanTaxonComp) * 100 / mean(abs(array(compDataDict[taxon]['IM'])))
            perContChange = (mean(abs(array(contDataDict[taxon]['IM']))) - meanTaxonCont) * 100 / mean(abs(array(contDataDict[taxon]['IM'])))
            refinmentTableOut.write('\t%.2f\t%.2f\n' % (perCompChange, perContChange))
        refinmentTableOut.close()
       
        # plot data
        print '  Plotting results.'
        compData = []
        contData = []
        rowLabels = []
        for taxon in orderedTaxa:
            for refineStr in ['RMS', 'MS', 'IM']:
                numGenomes = len(genomeInTaxon[taxon])
                if numGenomes < 10: # skip groups with only a few genomes
                    continue

                rowLabels.append(refineStr + ': ' + taxon + ' (' + str(numGenomes) + ')')
                compData.append(compDataDict[taxon][refineStr])
                contData.append(contDataDict[taxon][refineStr])       
                
        for i, rowLabel in enumerate(rowLabels):
            print rowLabel + '\t%.2f\t%.2f' % (mean(abs(array(compData[i]))), mean(abs(array(contData[i]))))
            
        boxPlot = BoxPlot()
        plotFilename = self.plotPrefix + '.refinements.png'
        boxPlot.plot(plotFilename, compData, contData, rowLabels, 
                        r'$\Delta$' + ' % Completion', None, 
                        r'$\Delta$' + ' % Contamination', None,
                        rowsPerCategory = 3, dpi = self.dpi)
        
    def run(self):
        # read simulation results
        print '  Reading simulation results.'
        results = self.__readResults(self.simCompareFile)
        
        print '\n'         
        #self.markerSets(results)
                   
        print '\n'         
        #self.conditionsPlot(results)
        
        #print '\n'
        self.taxonomicPlots(results)
        
        print '\n'