示例#1
0
class Simulation(object):
    """Choose, per internal tree node, the best-performing lineage marker set.

    For every internal node (except nodes without a parent) partial genomes
    are simulated from the child lineage holding the fewest genomes, each
    candidate marker set is used to estimate completeness and contamination,
    and the marker set with the smallest mean absolute error is reported.
    Work is spread over worker processes; a single writer process stores the
    results in a tab-separated file.
    """

    def __init__(self):
        self.markerSetBuilder = MarkerSetBuilder()
        self.img = IMG()

        # contig length handed to sampleGenome() and containedMarkerGenes();
        # presumably in base pairs -- TODO confirm against MarkerSetBuilder
        self.simContigLen = 10000

    def __childGenomeIds(self, internalNode):
        """Return one set of genome ids for each child of internalNode.

        Ids are the leaf taxon labels with 'IMG_' removed. Labels listed in
        markerSetBuilder.duplicateSeqs for a leaf are added to the same set.
        """
        duplicateSeqs = self.markerSetBuilder.duplicateSeqs

        leaves = []
        for child in internalNode.child_nodes():
            genomeIds = set()
            for leaf in child.leaf_nodes():
                label = leaf.taxon.label
                genomeIds.add(label.replace('IMG_', ''))

                for dup in duplicateSeqs.get(label, []):
                    genomeIds.add(dup.replace('IMG_', ''))

            leaves.append(genomeIds)

        return leaves

    def __selectMarkerSet(self, tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut):
        """Select marker set for parent edge of specified internal node.

        Exactly one tuple is put on queueOut:
          ('NA', -1, -1, -1, -1, -1) when the smallest child lineage holds
            fewer than 5 genomes, or
          (internalNode, bestUID, dCompBest, dCompStdBest, dContBest,
            dContStdBest) otherwise, where bestUID is the text before the
            first '|' of the winning marker set's lineageStr and the other
            values are the mean and standard deviation of the absolute
            completeness / contamination errors.

        tree, ubiquityThreshold and singleCopyThreshold are forwarded to
        buildBinMarkerSet(); metadata is accepted but not used here.
        """

        # get genomes descendant from each child of the specified internal node
        leaves = self.__childGenomeIds(internalNode)

        # make sure each set of leaves contains at least a minimum number of genomes
        # (min() returns the first smallest set, same as sorted(..., key=len)[0])
        smallestLineage = min(leaves, key=len)
        if len(smallestLineage) < 5:
            queueOut.put(('NA', -1, -1, -1, -1, -1))
            return

        # calculate marker genes with all genomes in lineage with the fewest genomes removed
        binMarkerGenes, _ = self.markerSetBuilder.buildBinMarkerSet(tree, internalNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = False, genomeIdsToRemove = smallestLineage)

        # evaluate accuracy of completeness and contamination estimations on
        # different partial genomes from lineage with fewest genomes;
        # sample from a sorted list so the draw does not depend on set
        # iteration order (random.sample() on a set is also rejected by
        # newer Python versions)
        testGenomeIds = random.sample(sorted(smallestLineage), min(len(smallestLineage), 100))

        deltaComp = defaultdict(list)
        deltaCont = defaultdict(list)

        repsPerGenome = 100
        for testGenomeId in testGenomeIds:
            geneDistTable = self.img.geneDistTable([testGenomeId], binMarkerGenes.getMarkerGenes(), spacingBetweenContigs=0)
            genomeSize = readFastaBases(os.path.join(self.img.genomeDir, testGenomeId, testGenomeId + '.fna'))

            for _ in range(repsPerGenome):
                testComp = random.uniform(0.5, 1.0)
                testCont = random.uniform(0, 0.2)
                trueComp, trueCont, startPartialGenomeContigs = self.markerSetBuilder.sampleGenome(genomeSize, testComp, testCont, self.simContigLen)

                for ms in binMarkerGenes.markerSetIter():
                    containedMarkerGenes = self.markerSetBuilder.containedMarkerGenes(ms.getMarkerGenes(), geneDistTable[testGenomeId], startPartialGenomeContigs, self.simContigLen)
                    completeness, contamination = ms.genomeCheck(containedMarkerGenes, bIndividualMarkers=True)
                    if completeness == 0.0:
                        # debugging aid kept from the original: dump the
                        # inputs and terminate the current process
                        print(ms.getMarkerGenes())
                        print(geneDistTable[testGenomeId])
                        print(startPartialGenomeContigs)
                        print(genomeSize)
                        print('*****************' + testGenomeId)
                        sys.exit()
                    deltaComp[ms.lineageStr].append(completeness - trueComp)
                    deltaCont[ms.lineageStr].append(contamination - trueCont)

        # determine lineage-specific marker set with best average performance
        curBest = 1000
        bestUID = None
        dCompBest = 0
        dContBest = 0
        # initialised up front so the final put() cannot hit an unbound name
        # when no marker set was evaluated
        dCompStdBest = 0
        dContStdBest = 0

        for lineageStr in deltaComp:
            absComp = abs(array(deltaComp[lineageStr]))
            absCont = abs(array(deltaCont[lineageStr]))
            dComp, dCont = mean(absComp), mean(absCont)

            if (dComp + dCont) < curBest:
                dCompBest = dComp
                dContBest = dCont
                dCompStdBest = std(absComp)
                dContStdBest = std(absCont)
                bestUID = lineageStr.split('|')[0]
                curBest = dComp + dCont

        queueOut.put((internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

    def __workerThread(self, tree, metadata, ubiquityThreshold, singleCopyThreshold, queueIn, queueOut):
        """Process each data item in parallel.

        Internal nodes are taken from queueIn until a None sentinel is
        received; each node is handed to __selectMarkerSet(), which reports
        its result on queueOut.
        """

        while True:
            internalNode = queueIn.get(block=True, timeout=None)
            if internalNode is None:
                break

            self.__selectMarkerSet(tree, internalNode, metadata, ubiquityThreshold, singleCopyThreshold, queueOut)

    def __writerThread(self, numInternalNodes, writerQueue, outputFile='/tmp/simInferBestMarkerSet.tsv'):
        """Store or write results of worker threads in a single thread.

        Result tuples are read from writerQueue until one whose first element
        is None arrives. Every other tuple advances the progress line written
        to stdout; tuples whose first element is 'NA' are counted but not
        written to the file.

        numInternalNodes: total used for the progress percentage.
        writerQueue: queue of 6-tuples (internalNode, bestUID, dCompBest,
            dCompStdBest, dContBest, dContStdBest).
        outputFile: path of the tab-separated results file.
        """

        # the with-block closes the file even if a malformed item raises
        with open(outputFile, 'w') as fout:
            fout.write('Internal node ID\tMarker set ID\tmean % delta comp\tstd % delta comp\tmean % delta cont\tstd % delta cont\n')

            itemsProcessed = 0
            while True:
                internalNode, bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest = writerQueue.get(block=True, timeout=None)
                if internalNode is None:
                    break

                itemsProcessed += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) internal branches.' % (itemsProcessed, numInternalNodes, float(itemsProcessed)*100/(numInternalNodes))
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                if internalNode != 'NA':
                    fout.write(internalNode.label + '\t%s\t%.2f\t%.2f\t%.2f\t%.2f\n' % (bestUID, dCompBest, dCompStdBest, dContBest, dContStdBest))

        sys.stdout.write('\n')

    def run(self, ubiquityThreshold, singleCopyThreshold, numThreads):
        """Evaluate every internal node that has a parent, using numThreads workers.

        The random module is seeded with 0 first. The reference tree is read
        from a path relative to the directory of sys.argv[0]. Results are
        written by a separate writer process (see __writerThread).
        """
        random.seed(0)

        print('\n  Calculating global gene count table.')
        metadata = self.img.genomeMetadata()
        self.markerSetBuilder.globalGeneCountTable = self.img.geneCountTable(metadata.keys())

        print('\n  Reading reference genome tree.')
        treeFile = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'genome_tree', 'genome_tree_prok.refpkg', 'genome_tree.final.tre')
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        internalNodes = tree.internal_nodes()
        print('  Evaluating %d internal nodes.' % len(internalNodes))
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        # nodes without a parent have no parent edge to select a marker set for
        nodesToEvaluate = [node for node in internalNodes if node.parent_node is not None]
        for internalNode in nodesToEvaluate:
            workerQueue.put(internalNode)

        # one stop sentinel per worker
        for _ in range(numThreads):
            workerQueue.put(None)

        workerProc = [mp.Process(target = self.__workerThread, args = (tree, metadata, ubiquityThreshold, singleCopyThreshold, workerQueue, writerQueue)) for _ in range(numThreads)]
        # progress total is the number of nodes actually queued
        writeProc = mp.Process(target = self.__writerThread, args = (len(nodesToEvaluate), writerQueue))

        writeProc.start()

        for p in workerProc:
            p.start()

        for p in workerProc:
            p.join()

        # all workers are done; tell the writer to finish
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()