示例#1
0
 def run(self, outputFile):
     """Write the gene count table of valid IMG prokaryotic genomes to a file.

     Genomes reported by ``img.genomesWithMissingData`` are removed from the
     prokaryotic genome set before the gene count table is built. Summary
     numbers for the 'pfam00318' entry of the table are printed, and the
     string representation of the whole table is written to ``outputFile``.

     Args:
         outputFile: path of the file to (over)write with ``str(countTable)``.
     """
     img = IMG()

     # Single-argument print(...) behaves identically on Python 2 and 3.
     print('Identifying all IMG prokaryotic genomes with valid data.')
     metadata = img.genomeMetadata()
     genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
     genomeMissingData = img.genomesWithMissingData(genomeIds)
     genomeIds -= genomeMissingData

     print('  Identified %d valid genomes.' % (len(genomeIds)))

     print('Calculating gene copy number for each genome.')
     countTable = img.geneCountTable(genomeIds)

     # items() is available on both Python 2 and 3 mappings (iteritems() is
     # Python 2 only); only the values are needed here.
     # NOTE(review): presumably a mapping of genome id -> copy number - confirm.
     counts = [count for _, count in countTable['pfam00318'].items()]

     print(len(genomeIds))
     print(len(counts))
     # NOTE(review): mean is defined outside this block (presumably numpy.mean).
     print(mean(counts))

     # The context manager closes the file even if the write fails.
     with open(outputFile, 'w') as fout:
         fout.write(str(countTable))

     print('Gene count dictionary to: ' + outputFile)
示例#2
0
    def run(self, outputFile):
        """Write the gene count table of valid IMG prokaryotic genomes to a file.

        Genomes reported by ``img.genomesWithMissingData`` are removed from
        the prokaryotic genome set before the gene count table is built.
        Summary numbers for the 'pfam00318' entry of the table are printed,
        and the string representation of the whole table is written to
        ``outputFile``.

        Args:
            outputFile: path of the file to (over)write with
                ``str(countTable)``.
        """
        img = IMG()

        print('Identifying all IMG prokaryotic genomes with valid data.')
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print('  Identified %d valid genomes.' % (len(genomeIds)))

        print('Calculating gene copy number for each genome.')
        countTable = img.geneCountTable(genomeIds)

        # dict.iteritems() does not exist on Python 3; items() works on both
        # major versions. Only the values are needed here.
        # NOTE(review): presumably a mapping of genome id -> copy number -
        # confirm against IMG.geneCountTable.
        counts = [count for _, count in countTable['pfam00318'].items()]

        print(len(genomeIds))
        print(len(counts))
        # NOTE(review): mean is defined outside this block (presumably
        # numpy.mean).
        print(mean(counts))

        # The context manager closes the file even if the write fails.
        with open(outputFile, 'w') as fout:
            fout.write(str(countTable))

        print('Gene count dictionary to: ' + outputFile)
示例#3
0
class MarkerSetStability(object):
    """Measure how much a lineage's marker set changes under genome subsampling.

    For every lineage returned by ``IMG.lineagesByCriteria`` the marker set
    of all trusted genomes is compared with marker sets computed from random
    subsets containing 50%, 55%, ..., 100% of the genomes. Lineages are
    processed by worker processes and the results are written to a
    tab-separated file by a dedicated writer process.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group.

        Worker loop: lineages are read from ``queueIn`` until a ``None``
        sentinel is received. For each lineage one tuple
        ``(lineage, # genomes, # marker genes, changeMarkerSetSize)`` is put
        on ``queueOut``, where ``changeMarkerSetSize`` maps the subsample
        percentage to ``[mean % change, std % change]`` over 10 replicates.
        The mapping is empty when the lineage has fewer than ``minGenomes``
        genomes or when no marker genes were identified for it.

        Args:
            metadata: genome metadata passed through to
                ``IMG.genomeIdsByTaxonomy``.
            ubiquityThreshold: multiplied by the number of genomes before
                being handed to ``MarkerSet.markerGenes`` (presumably a
                fraction in [0, 1] - confirm against callers).
            singleCopyThreshold: handled like ``ubiquityThreshold``.
            minGenomes: minimum number of genomes required to analyse a
                lineage.
            queueIn: queue of lineage names terminated by ``None``.
            queueOut: queue receiving one result tuple per lineage.
        """
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
                markerGenes = markerGenes - tigrToRemove

            # The percent change is relative to the size of the full marker
            # set, so it is undefined (division by zero) when that set is
            # empty; such lineages are reported without subsampling results.
            if markerGenes:
                # random.sample() requires a sequence on Python >= 3.11, and
                # sampling from a sorted list (rather than in set iteration
                # order) keeps the seeded sampling reproducible.
                orderedGenomeIds = sorted(genomeIds)
                numMarkerGenes = len(markerGenes)

                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(
                        float(selectPer) / 100 * len(orderedGenomeIds))
                    perChange = []
                    for _ in range(0, 10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(
                            orderedGenomeIds, numGenomesToSelect)
                        geneCountTable = self.img.geneCountTable(
                            subsetGenomeIds)
                        subsetMarkerGenes = self.markerset.markerGenes(
                            subsetGenomeIds, geneCountTable,
                            ubiquityThreshold * numGenomesToSelect,
                            singleCopyThreshold * numGenomesToSelect)
                        tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                            subsetMarkerGenes)
                        subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                        # markers gained or lost, as a percentage of the
                        # full marker set
                        numChanged = len(
                            markerGenes.symmetric_difference(
                                subsetMarkerGenes))
                        perChange.append(
                            float(numChanged) * 100.0 / numMarkerGenes)

                    changeMarkerSetSize[selectPer] = [
                        mean(perChange), std(perChange)
                    ]

            queueOut.put((lineage, len(genomeIds), len(markerGenes),
                          changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file.

        Writer loop: result tuples produced by ``__processLineage`` are read
        from ``writerQueue`` until a tuple whose first element is ``None`` is
        received. One row per subsample percentage is written to
        ``outputFile`` and a progress line is refreshed on stdout.

        Args:
            outputFile: path of the tab-separated file to (over)write.
            totalLineages: number of lineages expected; used only for the
                progress message.
            writerQueue: queue of 4-tuples terminated by
                ``(None, None, None, None)``.
        """
        # The context manager closes the file even if a row fails to format.
        with open(outputFile, 'w') as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n'
            )

            numProcessedLineages = 0
            while True:
                lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(
                    block=True, timeout=None)
                if lineage is None:
                    break

                numProcessedLineages += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                    numProcessedLineages, totalLineages,
                    float(numProcessedLineages) * 100 / totalLineages)
                # '\r' rewrites the same console line on every update
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                for selectPer in sorted(changeMarkerSetSize):
                    meanChange, stdChange = changeMarkerSetSize[selectPer][:2]
                    fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                               (lineage, numGenomes, numMarkerGenes,
                                selectPer, meanChange, stdChange))

            sys.stdout.write('\n')

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups.

        Args:
            outputFile: path of the tab-separated results file.
            ubiquityThreshold: forwarded to the worker processes.
            singleCopyThreshold: forwarded to the worker processes.
            minGenomes: minimum number of genomes a lineage must contain;
                forwarded to ``IMG.lineagesByCriteria`` and to the workers.
            mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
            numThreads: number of worker processes to start.
        """

        print('  Calculating stability of marker sets:')

        # fixed seed so that the random subsampling is repeatable
        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        # NOTE: for debugging, `lineages` can be overridden here with an
        # explicit list of lineage strings, e.g.
        # ['Bacteria', 'Bacteria;Proteobacteria'].

        for lineage in lineages:
            workerQueue.put(lineage)

        # one termination sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the writer to finish
        writerQueue.put((None, None, None, None))
        writeProc.join()
示例#4
0
class GenomeTreeWorkflow(object):
    """Workflow that infers a genome tree from marker genes.

    ``run`` chains the following steps: identify marker genes, infer gene
    trees, test the gene trees for paralogs and for taxonomic consistency,
    gather the HMMs of the retained genes, infer the genome tree, and
    finally rewrite IMG identifiers in the tree as ACE identifiers.
    All results are written below the output directory given to the
    constructor.
    """

    def __init__(self, outputDir):
        """Set up output paths, thresholds and output directories.

        The process exits with a non-zero status if ``outputDir`` already
        exists or if ``hmmfetch`` / ``FastTree`` cannot be executed.

        Args:
            outputDir: directory to create; it must not exist yet.
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print('[Error] Output directory already exists: ' + outputDir)
            # an error condition must not report success to the shell
            sys.exit(1)
        else:
            os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir,
                                                   'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir,
                                           'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir,
                                            'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir,
                                               'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir,
                                                  'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir,
                                              'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        os.makedirs(self.hmmDir)
        os.makedirs(self.alignmentDir)
        os.makedirs(self.geneTreeDir)
        os.makedirs(self.conspecificGeneTreeDir)
        os.makedirs(self.finalGeneTreeDir)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path.

        Exits with status 1 when ``hmmfetch -h`` returns a non-zero status.
        """

        try:
            exit_status = os.system('hmmfetch -h > /dev/null')
        except Exception:
            print('Unexpected error! %s' % sys.exc_info()[0])
            raise

        if exit_status != 0:
            print('[Error] hmmfetch is not on the system path')
            sys.exit(1)

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path.

        Exits with status 1 when running ``FastTree`` without arguments
        returns a non-zero status.
        """

        try:
            exit_status = os.system('FastTree 2> /dev/null')
        except Exception:
            print('Unexpected error! %s' % sys.exc_info()[0])
            raise

        if exit_status != 0:
            print('[Error] FastTree is not on the system path')
            sys.exit(1)

    def __genesInGenomes(self, genomeIds):
        """Map each genome to the gene ids annotated with each marker id.

        Reads the Pfam and TIGRFAM annotation tables of every genome below
        ``IMG.genomeDir``. The marker id is taken from tab-separated column
        8 (Pfam table) or column 6 (TIGRFAM table); the gene id is column 0.

        Args:
            genomeIds: iterable of genome ids.

        Returns:
            dict mapping genome id -> defaultdict(set) mapping marker id ->
            set of gene ids.
        """
        genesInGenomes = {}
        for genomeId in genomeIds:
            markerIdToGeneIds = defaultdict(set)

            # (annotation file extension, column holding the marker id)
            annotationTables = ((IMG.pfamExtension, 8),
                                (IMG.tigrExtension, 6))
            for extension, markerColumn in annotationTables:
                tablePath = os.path.join(IMG.genomeDir, genomeId,
                                         genomeId + extension)
                with open(tablePath) as fin:
                    for line in fin:
                        lineSplit = line.split('\t')
                        markerIdToGeneIds[lineSplit[markerColumn]].add(
                            lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Extract one HMM file per marker gene using ``hmmfetch``.

        Pfam models are fetched by name from the Pfam-A model file, so the
        file is first scanned to map accessions (e.g. ``pfam00318``) to
        model names. Any other marker id is fetched from the TIGRFAM
        directory.

        Args:
            universalMarkerGenes: iterable of marker ids.
            outputModelDir: directory receiving ``<model>.hmm`` files; Pfam
                models are named ``PFxxxxx.hmm``.
        """
        markerIdToName = {}
        with open('/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm') as fin:
            for line in fin:
                # Match the field tag at the start of the line only; a
                # substring test would also match free text such as a
                # description that happens to contain 'NAME' or 'ACC'.
                if line.startswith('NAME'):
                    name = line.split()[1].rstrip()
                elif line.startswith('ACC'):
                    acc = line.split()[1].rstrip()
                    markerId = acc.replace('PF', 'pfam')
                    # drop the version suffix, e.g. 'pfam00318.15'
                    markerId = markerId[0:markerId.rfind('.')]
                    markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                os.system('hmmfetch  /srv/whitlam/bio/db/pfam/27/Pfam-A.hmm ' +
                          markerIdToName[markerId] + ' > ' +
                          os.path.join(outputModelDir,
                                       markerId.replace('pfam', 'PF') +
                                       '.hmm'))
            else:
                os.system('hmmfetch  /srv/whitlam/bio/db/tigrfam/13.0/' +
                          markerId + '.HMM ' + markerId + ' > ' +
                          os.path.join(outputModelDir, markerId + '.hmm'))

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes,
                       numThreads, outputGeneDir, outputModelDir):
        """Align all marker genes in parallel worker processes.

        Args:
            genomeIds: genomes whose genes are aligned.
            markerGenes: marker ids; one alignment is produced per marker.
            genesInGenomes: result of ``__genesInGenomes``.
            numThreads: number of worker processes.
            outputGeneDir: directory receiving sequence/alignment files.
            outputModelDir: directory containing the marker HMM files.
        """

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for markerId in markerGenes:
            workerQueue.put(markerId)

        # one termination sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__runHmmAlign,
                       args=(genomeIds, genesInGenomes, outputGeneDir,
                             outputModelDir, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__reportThreads,
                               args=(len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the reporter to finish
        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir,
                      outputModelDir, queueIn, queueOut):
        """Worker loop: write, align and mask the sequences of each marker.

        Marker ids are read from ``queueIn`` until ``None`` is received. For
        each marker the gene sequences of all genomes are written to
        ``<model>.faa``, aligned to ``<model>.aln.faa`` and masked to
        ``<model>.aln.masked.faa`` in ``outputGeneDir``. The model name is
        put on ``queueOut`` when a marker is finished.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' +
                                     genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG
                            # metadata isn't always perfectly in sync with
                            # the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'),
                        markerSeqFile,
                        alignedFile,
                        trim=False,
                        outputFormat='Pfam')
            self.__maskAlignment(
                alignedFile,
                os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Report alignment progress on stdout.

        Reads finished marker ids from ``writerQueue`` until ``None`` is
        received and refreshes a single progress line after each one.

        Args:
            numGenes: total number of marker genes being processed.
            writerQueue: queue of finished marker ids terminated by ``None``.
        """

        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId is None:
                break

            numProcessedGenes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) marker genes.' % (
                numProcessedGenes, numGenes,
                float(numProcessedGenes) * 100 / numGenes)
            # '\r' rewrites the same console line on every update
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format.

        Only the alignment columns flagged with ``x`` in the ``#=GC RF``
        annotation line are kept. Sequences are upper-cased and ``.`` is
        converted to ``-`` before masking.

        Args:
            inputFile: alignment with one line per sequence and a
                ``#=GC RF`` line.
            outputFile: FASTA file to (over)write.

        Raises:
            ValueError: if the alignment contains sequences but no
                ``#=GC RF`` line (previously an obscure unbound-variable
                error).
        """
        # read STOCKHOLM alignment
        mask = None
        seqs = {}
        with open(inputFile) as fin:
            for line in fin:
                line = line.rstrip()
                if line == '' or line[0] == '#' or line == '//':
                    # markup line: only the reference annotation is needed
                    if 'GC RF' in line:
                        mask = line.split('GC RF')[1].strip()
                    continue

                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace(
                    '.', '-').strip()

        if seqs and mask is None:
            raise ValueError('No "#=GC RF" mask line found in alignment: ' +
                             inputFile)

        # output masked sequences in FASTA format
        with open(outputFile, 'w') as fout:
            for seqId, seq in seqs.items():
                fout.write('>' + seqId + '\n')

                maskedSeq = ''.join(
                    [ch for i, ch in enumerate(seq) if mask[i] == 'x'])
                fout.write(maskedSeq + '\n')

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold,
                           singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage.

        Args:
            taxaStr: lineage string passed to ``IMG.genomeIdsByTaxonomy``.
            metadata: genome metadata.
            ubiquityThreshold: forwarded to
                ``MarkerSetBuilder.buildMarkerGenes``.
            singleCopyThreshold: forwarded to
                ``MarkerSetBuilder.buildMarkerGenes``.

        Returns:
            tuple ``(genomeIds, markerGenes)``.
        """

        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        # NOTE(review): this Firmicutes-specific file is read regardless of
        # taxaStr - confirm this is intended.
        with open('./taxonomicTrees/firmicutes.nds') as fin:
            aceIds = set(line.strip() for line in fin)

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(
            genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold,
                       numThreads, outputGeneDir, outputModelDir,
                       outgroupSize):
        """Select genomes and marker genes and fetch the marker HMMs.

        The ingroup is 'Bacteria;Firmicutes'; up to ``outgroupSize`` genomes
        are randomly drawn from 'Bacteria;Coprothermobacter' as outgroup.
        Existing files in ``outputGeneDir`` are deleted.

        NOTE: the alignment step is currently disabled in this method (see
        the commented-out call at the end), so no alignments are produced
        here.

        Args:
            phyloUbiquityThreshold: ubiquity threshold for marker genes.
            phyloSingleCopyThreshold: single-copy threshold for marker genes.
            numThreads: number of processes for the (disabled) alignment.
            outputGeneDir: directory for gene sequences and alignments.
            outputModelDir: directory for marker HMMs.
            outgroupSize: maximum number of outgroup genomes to include.

        Returns:
            set of selected genome ids (ingroup plus sampled outgroup).
        """
        # make sure output directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        for f in os.listdir(outputGeneDir):
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print('')
        print('Identifying genomes and marker genes of interest:')
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers(
            'Bacteria;Firmicutes', metadata, phyloUbiquityThreshold,
            phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy(
            'Bacteria;Coprothermobacter', metadata)

        print('  Identified ingroup genomes: %d' % len(ingroupGenomeIds))
        print('  Identified outgroup genomes: %d' % len(outgroupGenomeIds))

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print('')
        print('  Selecting %d taxa from the outgroup.' % (numOutgroupTaxa))
        # random.sample() requires a sequence on Python >= 3.11; a set is no
        # longer accepted, so sample from a sorted list instead.
        genomeIds = ingroupGenomeIds.union(
            random.sample(sorted(outgroupGenomeIds), numOutgroupTaxa))

        # called for its side effect of reporting genomes without an ACE id
        self.imgIdsToAceIds(genomeIds)

        print('  Identified markers: %d' % len(ingroupMarkers))

        # get mapping of marker ids to gene ids for each genome
        print('  Determine genes for genomes of interest.')
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print('  Fetching HMM for each marker genes.')
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print('  Aligning marker genes:')
        #***self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Return the IMG id -> ACE id mapping and report unmapped ids.

        The mapping is read from 'ggg_tax_img.feb_2014.txt' in the current
        working directory (tab-separated: ACE id, IMG id).

        Args:
            imgIds: IMG genome ids to check against the mapping.

        Returns:
            dict mapping every IMG id in the file to its ACE id.
        """
        imgIdToAceId = {}
        with open('ggg_tax_img.feb_2014.txt') as fin:
            for line in fin:
                lineSplit = line.split('\t')
                imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = sum(1 for imgId in imgIds if imgId not in imgIdToAceId)

        print('  Number of genomes without an ACE id: ' + str(missing))

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Return the ACE id -> IMG id mapping.

        The mapping is read from 'ggg_tax_img.feb_2014.txt' in the current
        working directory (tab-separated: ACE id, IMG id).
        """
        aceIdsToImgIds = {}
        with open('ggg_tax_img.feb_2014.txt') as fin:
            for line in fin:
                lineSplit = line.split('\t')
                aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Run the complete genome tree workflow.

        Args:
            numThreads: number of processes used by the parallel steps.
            outgroupSize: maximum number of outgroup genomes to include.
        """

        # identify genes suitable for phylogenetic inference
        print('--- Identifying genes suitable for phylogenetic inference ---')
        genomeIds = self.inferGeneTrees(self.phyloUbiquity,
                                        self.phyloSingleCopy, numThreads,
                                        self.alignmentDir, self.hmmDir,
                                        outgroupSize)

        # infer gene trees
        print('')
        print('--- Inferring gene trees ---')
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa',
                      numThreads)

        # test gene trees for paralogs
        print('')
        print('--- Testing for paralogs in gene trees ---')
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre',
                        self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print('')
        print('--- Testing taxonomic consistency of gene trees ---')
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre',
                            self.consistencyAcceptPer, self.consistencyMinTaxa,
                            self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print('')
        print('--- Gathering phylogenetically informative HMMs ---')
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir,
                                self.phyloHMMsOut)

        # infer genome tree
        print('')
        print('--- Inferring full genome tree ---')
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir,
                            self.alignmentDir,
                            '.aln.masked.faa',
                            self.concatenatedAlignFile,
                            self.treeOut,
                            self.taxonomyOut,
                            bSupportValues=True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = f.read()

        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId,
                                    imgIdToAceId[genomeId])

        with open(self.treeOutAce, 'w') as fout:
            fout.write(tree)
示例#5
0
class MarkerSetStabilityTest(object):
    """Test marker set stability when 10% of a lineage's genomes are removed.

    For every lineage returned by ``IMG.lineagesByCriteria`` the marker set
    of all trusted genomes is compared with marker sets computed from 100
    random subsets containing 90% of the genomes. Lineages are processed by
    worker processes and the results are written to a tab-separated file by
    a dedicated writer process.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __processLineage(self, metadata, ubiquityThreshold,
                         singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group.

        Worker loop: lineages are read from ``queueIn`` until a ``None``
        sentinel is received. For each lineage one tuple
        ``(lineage, # genomes, # marker genes, # sampled genomes,
        mean % change, std % change)`` is put on ``queueOut``. Mean and
        standard deviation are both ``-1`` when the lineage has fewer than
        ``minGenomes`` genomes or when no marker genes were identified.

        Args:
            metadata: genome metadata passed through to
                ``IMG.genomeIdsByTaxonomy``.
            ubiquityThreshold: multiplied by the number of genomes before
                being handed to ``MarkerSet.markerGenes`` (presumably a
                fraction in [0, 1] - confirm against callers).
            singleCopyThreshold: handled like ``ubiquityThreshold``.
            minGenomes: minimum number of genomes required to analyse a
                lineage.
            queueIn: queue of lineage names terminated by ``None``.
            queueOut: queue receiving one result tuple per lineage.
        """
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata,
                                                     'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                geneCountTable = self.img.geneCountTable(genomeIds)
                markerGenes = self.markerset.markerGenes(
                    genomeIds, geneCountTable,
                    ubiquityThreshold * len(genomeIds),
                    singleCopyThreshold * len(genomeIds))
                tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)

                markerGenes = markerGenes - tigrToRemove

            # The percent change is relative to the size of the full marker
            # set, so it is undefined (division by zero) when that set is
            # empty; such lineages are reported with the -1 placeholders.
            if markerGenes:
                # random.sample() requires a sequence on Python >= 3.11, and
                # sampling from a sorted list (rather than in set iteration
                # order) keeps the seeded sampling reproducible.
                orderedGenomeIds = sorted(genomeIds)
                numMarkerGenes = len(markerGenes)

                for _ in range(0, 100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(orderedGenomeIds,
                                                    numGenomesToSelect)
                    geneCountTable = self.img.geneCountTable(subsetGenomeIds)
                    subsetMarkerGenes = self.markerset.markerGenes(
                        subsetGenomeIds, geneCountTable,
                        ubiquityThreshold * numGenomesToSelect,
                        singleCopyThreshold * numGenomesToSelect)
                    tigrToRemove = self.img.identifyRedundantTIGRFAMs(
                        subsetMarkerGenes)
                    subsetMarkerGenes = subsetMarkerGenes - tigrToRemove

                    # markers gained or lost, as a percentage of the full
                    # marker set
                    numChanged = len(
                        markerGenes.symmetric_difference(subsetMarkerGenes))
                    perChange.append(
                        float(numChanged) * 100.0 / numMarkerGenes)

            if perChange:
                queueOut.put(
                    (lineage, len(genomeIds), len(markerGenes),
                     numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                # -1 marks lineages for which no subsampling was performed
                queueOut.put((lineage, len(genomeIds), len(markerGenes),
                              numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file.

        Writer loop: result tuples produced by ``__processLineage`` are read
        from ``writerQueue`` until a tuple whose first element is ``None`` is
        received. One row per lineage is written to ``outputFile`` and a
        progress line is refreshed on stdout.

        Args:
            outputFile: path of the tab-separated file to (over)write.
            totalLineages: number of lineages expected; used only for the
                progress message.
            writerQueue: queue of 6-tuples terminated by a tuple of six
                ``None`` values.
        """
        # The context manager closes the file even if a row fails to format.
        with open(outputFile, 'w') as fout:
            fout.write(
                'Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n'
            )

            numProcessedLineages = 0
            while True:
                lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(
                    block=True, timeout=None)
                if lineage is None:
                    break

                numProcessedLineages += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (
                    numProcessedLineages, totalLineages,
                    float(numProcessedLineages) * 100 / totalLineages)
                # '\r' rewrites the same console line on every update
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' %
                           (lineage, numGenomes, numMarkerGenes,
                            numSampledGenomes, meanPerChange, stdPerChange))

            sys.stdout.write('\n')

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold,
            minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups.

        Args:
            outputFile: path of the tab-separated results file.
            ubiquityThreshold: forwarded to the worker processes.
            singleCopyThreshold: forwarded to the worker processes.
            minGenomes: minimum number of genomes a lineage must contain;
                forwarded to ``IMG.lineagesByCriteria`` and to the workers.
            mostSpecificRank: forwarded to ``IMG.lineagesByCriteria``.
            numThreads: number of worker processes to start.
        """

        print('  Testing stability of marker sets:')

        # fixed seed so that the random subsampling is repeatable
        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes,
                                               mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one termination sentinel per worker process
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [
            mp.Process(target=self.__processLineage,
                       args=(metadata, ubiquityThreshold, singleCopyThreshold,
                             minGenomes, workerQueue, writerQueue))
            for _ in range(numThreads)
        ]
        writeProc = mp.Process(target=self.__storeResults,
                               args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers are done: tell the writer to finish
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()
示例#6
0
class MarkerSetStability(object):
    """Assess how much a lineage's marker set changes under genome subsampling.

    For each lineage the marker set of all genomes is compared against the
    marker sets of random subsets containing 50% to 100% (in 5% steps) of
    the genomes. The mean and standard deviation of the percent change are
    written to a tab-separated file.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __markerGenesForGenomes(self, genomeIds, ubiquityThreshold, singleCopyThreshold):
        """Return the marker genes of genomeIds with redundant TIGRFAMs removed.

        The two thresholds are fractions; they are scaled by the number of
        genomes before being handed to ``MarkerSet.markerGenes``.
        """
        numGenomes = len(genomeIds)
        geneCountTable = self.img.geneCountTable(genomeIds)
        markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable,
                                                 ubiquityThreshold * numGenomes,
                                                 singleCopyThreshold * numGenomes)
        return markerGenes - self.img.identifyRedundantTIGRFAMs(markerGenes)

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group.

        Reads lineage names from queueIn until a None is received. For each
        lineage a tuple (lineage, # genomes, # marker genes, changes) is put
        on queueOut, where ``changes`` maps a subsample percentage to
        [mean % change, std % change]. ``changes`` is empty when the lineage
        has fewer than minGenomes genomes or no marker genes at all.
        """
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            changeMarkerSetSize = {}
            markerGenes = []
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                markerGenes = self.__markerGenesForGenomes(genomeIds, ubiquityThreshold, singleCopyThreshold)

            # percent change is relative to the full marker set, so it is
            # undefined (division by zero) when that set is empty
            if markerGenes:
                # random.sample() requires a sequence on Python 3.11+
                genomePool = list(genomeIds)

                for selectPer in range(50, 101, 5):
                    numGenomesToSelect = int(float(selectPer) / 100 * len(genomePool))
                    perChange = []
                    for _ in range(10):
                        # calculate marker set for subset of genomes
                        subsetGenomeIds = random.sample(genomePool, numGenomesToSelect)
                        subsetMarkerGenes = self.__markerGenesForGenomes(subsetGenomeIds, ubiquityThreshold, singleCopyThreshold)

                        numChanged = len(markerGenes.symmetric_difference(subsetMarkerGenes))
                        perChange.append(float(numChanged) * 100.0 / len(markerGenes))

                    changeMarkerSetSize[selectPer] = [mean(perChange), std(perChange)]

            queueOut.put((lineage, len(genomeIds), len(markerGenes), changeMarkerSetSize))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file.

        Consumes result tuples from writerQueue until one whose lineage is
        None arrives. One row is written per lineage and subsample
        percentage, and a progress line is refreshed on stdout.
        """
        # the context manager closes the file even if a record is malformed
        with open(outputFile, 'w') as fout:
            fout.write('Lineage\t# genomes\t# markers\tsubsample %\tmean % change\tstd % change\n')

            numProcessedLineages = 0
            while True:
                lineage, numGenomes, numMarkerGenes, changeMarkerSetSize = writerQueue.get(block=True, timeout=None)
                if lineage is None:
                    break

                numProcessedLineages += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                for selectPer in sorted(changeMarkerSetSize):
                    meanPerChange, stdPerChange = changeMarkerSetSize[selectPer][0], changeMarkerSetSize[selectPer][1]
                    fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, selectPer, meanPerChange, stdPerChange))

            sys.stdout.write('\n')

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups.

        Lineages are processed by numThreads worker processes while a
        separate process writes the results to outputFile.
        """
        print('  Calculating stability of marker sets:')

        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one None per worker signals that no work remains
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers have finished; tell the writer to stop
        writerQueue.put((None, None, None, None))
        writeProc.join()
class GenomeTreeWorkflow(object):
    """Workflow that infers a genome tree from marker gene alignments.

    The workflow identifies marker genes, aligns them with HMMER, builds
    and filters gene trees, and infers a genome tree. All results are
    written below the output directory given to the constructor.
    """

    def __init__(self, outputDir):
        """Set up output paths and parameters below a new outputDir.

        Exits with a non-zero status if outputDir already exists or if a
        required external program cannot be run.
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            print('[Error] Output directory already exists: ' + outputDir)
            sys.exit(1)

        # check the external programs before touching the file system so a
        # failed check does not leave behind a directory that blocks a rerun
        self.__checkForHMMER()
        self.__checkForFastTree()

        os.makedirs(outputDir)

        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir, 'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        self.consistencyOut = os.path.join(outputDir, 'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir, 'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir, 'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir, 'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # 0.906 is used for trees at the phylum-level; 0.95 was used for
        # trees at the class-level
        self.consistencyAcceptPer = 0.906
        self.consistencyMinTaxa = 20

        # create output directories
        for directory in (self.hmmDir, self.alignmentDir, self.geneTreeDir,
                          self.conspecificGeneTreeDir, self.finalGeneTreeDir):
            os.makedirs(directory)

    def __checkForProgram(self, command, program):
        """Run command through the shell and exit if it reports failure."""
        try:
            exit_status = os.system(command)
        except Exception:
            print('Unexpected error! %s' % (sys.exc_info()[0],))
            raise

        if exit_status != 0:
            print('[Error] %s is not on the system path' % program)
            # a missing program is a failure, so do not exit with status 0
            sys.exit(1)

    def __checkForHMMER(self):
        """Check to see if HMMER is on the system path."""
        self.__checkForProgram('hmmfetch -h > /dev/null', 'hmmfetch')

    def __checkForFastTree(self):
        """Check to see if FastTree is on the system path."""
        self.__checkForProgram('FastTree 2> /dev/null', 'FastTree')

    def __genesInGenomes(self, genomeIds):
        """Map each genome id to a dict from marker id to the set of gene ids.

        Marker ids are read from column 8 of a genome's Pfam table and
        column 6 of its TIGRFAM table; the gene id is column 0 of both.
        """
        genesInGenomes = {}
        for genomeId in genomeIds:
            genomePrefix = os.path.join(IMG.genomeDir, genomeId, genomeId)

            markerIdToGeneIds = defaultdict(set)
            # (file extension, index of the column holding the marker id)
            for extension, markerCol in ((IMG.pfamExtension, 8), (IMG.tigrExtension, 6)):
                with open(genomePrefix + extension) as fin:
                    for line in fin:
                        lineSplit = line.split('\t')
                        markerIdToGeneIds[lineSplit[markerCol]].add(lineSplit[0])

            genesInGenomes[genomeId] = markerIdToGeneIds

        return genesInGenomes

    def __fetchMarkerModels(self, universalMarkerGenes, outputModelDir):
        """Write one HMM file per marker gene into outputModelDir using hmmfetch."""
        pfamHmmFile = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
        tigrfamHmmDir = '/srv/whitlam/bio/db/tigrfam/13.0/'

        # hmmfetch looks models up by name, so map Pfam accessions to names
        markerIdToName = {}
        name = None
        with open(pfamHmmFile) as fin:
            for line in fin:
                # match the field tag only at the start of the line so text
                # inside other fields is not mistaken for a NAME/ACC record
                if line.startswith('NAME'):
                    name = line.split()[1].rstrip()
                elif line.startswith('ACC'):
                    acc = line.split()[1].rstrip()
                    markerId = acc.replace('PF', 'pfam')
                    markerId = markerId[0:markerId.rfind('.')]  # drop the version suffix
                    markerIdToName[markerId] = name

        for markerId in universalMarkerGenes:
            if 'pfam' in markerId:
                modelFile = os.path.join(outputModelDir, markerId.replace('pfam', 'PF') + '.hmm')
                os.system('hmmfetch  ' + pfamHmmFile + ' ' + markerIdToName[markerId] + ' > ' + modelFile)
            else:
                modelFile = os.path.join(outputModelDir, markerId + '.hmm')
                os.system('hmmfetch  ' + tigrfamHmmDir + markerId + '.HMM ' + markerId + ' > ' + modelFile)

    def __alignMarkers(self, genomeIds, markerGenes, genesInGenomes, numThreads, outputGeneDir, outputModelDir):
        """Align marker genes with HMMER using numThreads worker processes."""

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for markerId in markerGenes:
            workerQueue.put(markerId)

        # one None per worker signals that no work remains
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__runHmmAlign, args=(genomeIds, genesInGenomes, outputGeneDir, outputModelDir, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__reportThreads, args=(len(markerGenes), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

    def __runHmmAlign(self, genomeIds, genesInGenomes, outputGeneDir, outputModelDir, queueIn, queueOut):
        """Align the sequences of each marker gene taken from queueIn.

        Runs until a None is read. For every marker the gene sequences of
        all genomes are written to a FASTA file, aligned against the marker
        HMM and masked; the model name is then put on queueOut.
        """

        while True:
            markerId = queueIn.get(block=True, timeout=None)
            if markerId is None:
                break

            modelName = markerId
            if modelName.startswith('pfam'):
                modelName = modelName.replace('pfam', 'PF')

            markerSeqFile = os.path.join(outputGeneDir, modelName + '.faa')
            with open(markerSeqFile, 'w') as fout:
                for genomeId in genomeIds:
                    seqs = readFasta(IMG.genomeDir + '/' + genomeId + '/' + genomeId + '.genes.faa')

                    for geneId in genesInGenomes[genomeId].get(markerId, []):
                        if geneId not in seqs:
                            # this shouldn't be necessary, but the IMG metadata isn't always
                            # perfectly in sync with the sequence data
                            continue

                        fout.write('>' + genomeId + '|' + geneId + '\n')
                        fout.write(seqs[geneId] + '\n')

            alignedFile = os.path.join(outputGeneDir, modelName + '.aln.faa')
            hmmer = HMMERRunner('align')
            hmmer.align(os.path.join(outputModelDir, modelName + '.hmm'), markerSeqFile, alignedFile, trim=False, outputFormat='Pfam')
            self.__maskAlignment(alignedFile, os.path.join(outputGeneDir, modelName + '.aln.masked.faa'))

            queueOut.put(modelName)

    def __reportThreads(self, numGenes, writerQueue):
        """Report progress on stdout as marker genes finish processing.

        Consumes items from writerQueue until a None is received.
        """

        numProcessedGenes = 0
        while True:
            markerId = writerQueue.get(block=True, timeout=None)
            if markerId is None:
                break

            numProcessedGenes += 1
            statusStr = '    Finished processing %d of %d (%.2f%%) marker genes.' % (numProcessedGenes, numGenes, float(numProcessedGenes) * 100 / numGenes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

        sys.stdout.write('\n')

    def __maskAlignment(self, inputFile, outputFile):
        """Read HMMER alignment in STOCKHOLM format and output masked alignment in FASTA format.

        Only the columns marked 'x' in the '#=GC RF' annotation line are
        kept. Sequences are upper-cased and '.' is replaced by '-'.

        Raises ValueError if the alignment contains sequences but no
        '#=GC RF' line.
        """
        # read STOCKHOLM alignment
        seqs = {}
        mask = None
        with open(inputFile) as fin:
            for line in fin:
                line = line.rstrip()
                if line == '' or line[0] == '#' or line == '//':
                    if 'GC RF' in line:
                        mask = line.split('GC RF')[1].strip()
                    continue

                lineSplit = line.split()
                seqs[lineSplit[0]] = lineSplit[1].upper().replace('.', '-').strip()

        if seqs and mask is None:
            raise ValueError('No #=GC RF annotation found in alignment: ' + inputFile)

        # output masked sequences in FASTA format
        with open(outputFile, 'w') as fout:
            for seqId, seq in seqs.items():
                fout.write('>' + seqId + '\n')

                maskedSeq = ''.join([residue for i, residue in enumerate(seq) if mask[i] == 'x'])
                fout.write(maskedSeq + '\n')

    def __taxonomicMarkers(self, taxaStr, metadata, ubiquityThreshold, singleCopyThreshold):
        """Get genomes and marker genes for a specific lineage."""

        genomeIds = self.img.genomeIdsByTaxonomy(taxaStr, metadata)

        # hack to add in other genomes with incorrect taxonomy
        # NOTE(review): this list is applied regardless of taxaStr
        with open('./taxonomicTrees/firmicutes.nds') as fin:
            aceIds = set(line.strip() for line in fin)

        aceIdsToImgIds = self.aceIdsToImgIds()
        for aceId in aceIds:
            if aceId in aceIdsToImgIds:
                genomeId = aceIdsToImgIds[aceId]
                if genomeId in metadata:
                    genomeIds.add(genomeId)

        markerGenes = self.markerSetBuilder.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

        return genomeIds, markerGenes

    def inferGeneTrees(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, numThreads, outputGeneDir, outputModelDir, outgroupSize):
        """Fetch marker HMMs and build masked alignments for each marker gene.

        Returns the ids of the ingroup genomes plus up to outgroupSize
        randomly selected outgroup genomes.
        """
        # make sure output directories exist and the gene directory is empty
        if not os.path.exists(outputGeneDir):
            os.makedirs(outputGeneDir)

        if not os.path.exists(outputModelDir):
            os.makedirs(outputModelDir)

        for f in os.listdir(outputGeneDir):
            os.remove(os.path.join(outputGeneDir, f))

        # get genomes and marker genes for taxonomic groups of interest
        print('')
        print('Identifying genomes and marker genes of interest:')
        metadata = self.img.genomeMetadata()
        ingroupGenomeIds, ingroupMarkers = self.__taxonomicMarkers('Bacteria;Firmicutes', metadata, phyloUbiquityThreshold, phyloSingleCopyThreshold)
        outgroupGenomeIds = self.img.genomeIdsByTaxonomy('Bacteria;Coprothermobacter', metadata)

        print('  Identified ingroup genomes: %d' % len(ingroupGenomeIds))
        print('  Identified outgroup genomes: %d' % len(outgroupGenomeIds))

        numOutgroupTaxa = min(outgroupSize, len(outgroupGenomeIds))
        print('')
        print('  Selecting %d taxa from the outgroup.' % numOutgroupTaxa)
        # random.sample() requires a sequence on Python 3.11+
        genomeIds = ingroupGenomeIds.union(random.sample(list(outgroupGenomeIds), numOutgroupTaxa))

        # called for its report of genomes lacking an ACE id
        self.imgIdsToAceIds(genomeIds)

        print('  Identified markers: %d' % len(ingroupMarkers))

        # get mapping of marker ids to gene ids for each genome
        print('  Determine genes for genomes of interest.')
        genesInGenomes = self.__genesInGenomes(genomeIds)

        # get HMM for each marker gene
        print('  Fetching HMM for each marker genes.')
        self.__fetchMarkerModels(ingroupMarkers, outputModelDir)

        # align gene sequences and infer gene trees
        print('  Aligning marker genes:')
        self.__alignMarkers(genomeIds, ingroupMarkers, genesInGenomes, numThreads, outputGeneDir, outputModelDir)

        return genomeIds

    def imgIdsToAceIds(self, imgIds):
        """Return a dict from IMG id to ACE id read from the mapping file.

        The returned mapping covers every entry of the file; imgIds is only
        used to print how many of those ids have no ACE id.
        """
        imgIdToAceId = {}
        with open('ggg_tax_img.feb_2014.txt') as fin:
            for line in fin:
                lineSplit = line.split('\t')
                imgIdToAceId[lineSplit[1].rstrip()] = lineSplit[0]

        missing = sum(1 for imgId in imgIds if imgId not in imgIdToAceId)

        print('  Number of genomes without an ACE id: ' + str(missing))

        return imgIdToAceId

    def aceIdsToImgIds(self):
        """Return a dict from ACE id to IMG id read from the mapping file."""
        aceIdsToImgIds = {}
        with open('ggg_tax_img.feb_2014.txt') as fin:
            for line in fin:
                lineSplit = line.split('\t')
                aceIdsToImgIds[lineSplit[0].strip()] = lineSplit[1].strip()

        return aceIdsToImgIds

    def run(self, numThreads, outgroupSize):
        """Run the complete workflow and write the final genome trees."""

        # identify genes suitable for phylogenetic inference
        print('--- Identifying genes suitable for phylogenetic inference ---')
        genomeIds = self.inferGeneTrees(self.phyloUbiquity, self.phyloSingleCopy, numThreads, self.alignmentDir, self.hmmDir, outgroupSize)

        # infer gene trees
        print('')
        print('--- Inferring gene trees ---')
        makeTrees = MakeTrees()
        makeTrees.run(self.alignmentDir, self.geneTreeDir, '.aln.masked.faa', numThreads)

        # test gene trees for paralogs
        print('')
        print('--- Testing for paralogs in gene trees ---')
        paralogTest = ParalogTest()
        paralogTest.run(self.geneTreeDir, self.paralogAcceptPer, '.tre', self.conspecificGeneTreeDir)

        # test gene trees for consistency with IMG taxonomy
        print('')
        print('--- Testing taxonomic consistency of gene trees ---')
        consistencyTest = ConsistencyTest()
        consistencyTest.run(self.conspecificGeneTreeDir, '.tre', self.consistencyAcceptPer, self.consistencyMinTaxa, self.consistencyOut, self.finalGeneTreeDir)

        # gather phylogenetically informative HMMs into a single model file
        print('')
        print('--- Gathering phylogenetically informative HMMs ---')
        getPhylogeneticHMMs = GetPhylogeneticHMMs()
        getPhylogeneticHMMs.run(self.hmmDir, self.finalGeneTreeDir, self.phyloHMMsOut)

        # infer genome tree
        print('')
        print('--- Inferring full genome tree ---')
        inferGenomeTree = InferGenomeTree()
        inferGenomeTree.run(self.finalGeneTreeDir, self.alignmentDir, '.aln.masked.faa', self.concatenatedAlignFile, self.treeOut, self.taxonomyOut, bSupportValues=True)

        # replace IMG identifiers with ACE identifiers
        imgIdToAceId = self.imgIdsToAceIds(genomeIds)
        with open(self.treeOut) as f:
            tree = f.read()

        for genomeId in genomeIds:
            if genomeId in imgIdToAceId:
                tree = tree.replace('IMG_' + genomeId, imgIdToAceId[genomeId])

        with open(self.treeOutAce, 'w') as fout:
            fout.write(tree)
class MarkerSetStabilityTest(object):
    """Test how much a lineage's marker set changes for 90% genome subsamples.

    For each lineage the marker set of all genomes is compared against the
    marker sets of 100 random subsets holding 90% of the genomes. The mean
    and standard deviation of the percent change are written to a
    tab-separated file; -1 is reported for both when nothing was computed.
    """

    def __init__(self):
        self.img = IMG()
        self.markerset = MarkerSet()

    def __markerGenesForGenomes(self, genomeIds, ubiquityThreshold, singleCopyThreshold):
        """Return the marker genes of genomeIds with redundant TIGRFAMs removed.

        The two thresholds are fractions; they are scaled by the number of
        genomes before being handed to ``MarkerSet.markerGenes``.
        """
        numGenomes = len(genomeIds)
        geneCountTable = self.img.geneCountTable(genomeIds)
        markerGenes = self.markerset.markerGenes(genomeIds, geneCountTable,
                                                 ubiquityThreshold * numGenomes,
                                                 singleCopyThreshold * numGenomes)
        return markerGenes - self.img.identifyRedundantTIGRFAMs(markerGenes)

    def __processLineage(self, metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, queueIn, queueOut):
        """Assess stability of marker set for a specific named taxonomic group.

        Reads lineage names from queueIn until a None is received. For each
        lineage a tuple (lineage, # genomes, # marker genes, # sampled
        genomes, mean % change, std % change) is put on queueOut. Mean and
        std are -1 when the lineage has fewer than minGenomes genomes or no
        marker genes at all.
        """
        while True:
            lineage = queueIn.get(block=True, timeout=None)
            if lineage is None:
                break

            genomeIds = self.img.genomeIdsByTaxonomy(lineage, metadata, 'trusted')

            markerGenes = []
            perChange = []
            numGenomesToSelect = int(0.9 * len(genomeIds))
            if len(genomeIds) >= minGenomes:
                # calculate marker set for all genomes in lineage
                markerGenes = self.__markerGenesForGenomes(genomeIds, ubiquityThreshold, singleCopyThreshold)

            # percent change is relative to the full marker set, so it is
            # undefined (division by zero) when that set is empty
            if markerGenes:
                # random.sample() requires a sequence on Python 3.11+
                genomePool = list(genomeIds)

                for _ in range(100):
                    # calculate marker set for subset of genomes
                    subsetGenomeIds = random.sample(genomePool, numGenomesToSelect)
                    subsetMarkerGenes = self.__markerGenesForGenomes(subsetGenomeIds, ubiquityThreshold, singleCopyThreshold)

                    numChanged = len(markerGenes.symmetric_difference(subsetMarkerGenes))
                    perChange.append(float(numChanged) * 100.0 / len(markerGenes))

            if perChange:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, mean(perChange), std(perChange)))
            else:
                queueOut.put((lineage, len(genomeIds), len(markerGenes), numGenomesToSelect, -1, -1))

    def __storeResults(self, outputFile, totalLineages, writerQueue):
        """Store results to file.

        Consumes result tuples from writerQueue until one whose lineage is
        None arrives. One row is written per lineage and a progress line is
        refreshed on stdout.
        """
        # the context manager closes the file even if a record is malformed
        with open(outputFile, 'w') as fout:
            fout.write('Lineage\t# genomes\t# markers\t# sampled genomes\tmean % change\tstd % change\n')

            numProcessedLineages = 0
            while True:
                lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange = writerQueue.get(block=True, timeout=None)
                if lineage is None:
                    break

                numProcessedLineages += 1
                statusStr = '    Finished processing %d of %d (%.2f%%) lineages.' % (numProcessedLineages, totalLineages, float(numProcessedLineages) * 100 / totalLineages)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                fout.write('%s\t%d\t%d\t%d\t%f\t%f\n' % (lineage, numGenomes, numMarkerGenes, numSampledGenomes, meanPerChange, stdPerChange))

            sys.stdout.write('\n')

    def run(self, outputFile, ubiquityThreshold, singleCopyThreshold, minGenomes, mostSpecificRank, numThreads):
        """Calculate stability of marker sets for named taxonomic groups.

        Lineages are processed by numThreads worker processes while a
        separate process writes the results to outputFile.
        """
        print('  Testing stability of marker sets:')

        random.seed(1)

        # process each lineage in parallel
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        metadata = self.img.genomeMetadata()
        lineages = self.img.lineagesByCriteria(metadata, minGenomes, mostSpecificRank)

        for lineage in lineages:
            workerQueue.put(lineage)

        # one None per worker signals that no work remains
        for _ in range(numThreads):
            workerQueue.put(None)

        calcProc = [mp.Process(target=self.__processLineage, args=(metadata, ubiquityThreshold, singleCopyThreshold, minGenomes, workerQueue, writerQueue)) for _ in range(numThreads)]
        writeProc = mp.Process(target=self.__storeResults, args=(outputFile, len(lineages), writerQueue))

        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # all workers have finished; tell the writer to stop
        writerQueue.put((None, None, None, None, None, None))
        writeProc.join()