示例#1
0
        def run(self):
            """Run the PFAM and TIGRFAM HMMs on every 'Finished' genome.

            For each genome id returned by img.genomeIds('Finished'), the id
            is echoed to stdout and to ./data/evaluate_hmms_with_prodigal.txt,
            runPFAM and runTIGRFAM are invoked, and compareResults writes its
            report to the same output file.
            """
            img = IMG()

            # line-buffered (buffering=1) so results appear in the file as
            # each genome completes; 'with' closes the file even on errors
            with open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1) as fout:
                # get list of all marker genes
                markerset = MarkerSet()
                pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

                # each label reports the size of its own marker list
                print('PFAM marker genes: ' + str(len(pfamMarkers)))
                print('TIGR marker genes: ' + str(len(tigrMarkers)))
                print('')

                # run HMMs on each of the finished genomes
                for genomeId in img.genomeIds('Finished'):
                    print(genomeId + ':')
                    fout.write(genomeId + ':\n')

                    self.runPFAM(genomeId)
                    self.runTIGRFAM(genomeId)

                    fout.write('  ORF results:\n')
                    self.compareResults(genomeId, pfamMarkers, tigrMarkers,
                                        fout)

                    # NOTE: six-frame translation evaluation is disabled
                    #self.translateSixFrames(genomeId)
                    #self.runPFAM_SixFrames(genomeId)
                    #self.runTIGRFAM_SixFrames(genomeId)

                    #fout.write('  Six-frame translation results:\n')
                    #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)
示例#2
0
    def run(self, outputFile):
        """Write the gene count table of all valid IMG prokaryotic genomes.

        Genomes reported by img.genomesWithMissingData are removed from the
        prokaryotic genome ids before the count table is built. A short
        summary for 'pfam00318' (genome count, number of counts, mean count)
        is printed, and str(countTable) is written to outputFile.

        outputFile -- path of the file that receives the count table
        """
        img = IMG()

        print('Identifying all IMG prokaryotic genomes with valid data.')
        metadata = img.genomeMetadata()
        genomeIds = img.genomeIdsByTaxonomy('prokaryotes', metadata)
        genomeMissingData = img.genomesWithMissingData(genomeIds)
        genomeIds -= genomeMissingData

        print('  Identified %d valid genomes.' % (len(genomeIds)))

        print('Calculating gene copy number for each genome.')
        countTable = img.geneCountTable(genomeIds)

        # items() is available on both Python 2 and Python 3 mappings,
        # whereas iteritems() exists only on Python 2 dictionaries
        counts = [count for _, count in countTable['pfam00318'].items()]

        print(len(genomeIds))
        print(len(counts))
        print(mean(counts))

        # 'with' guarantees the file is closed even if the write fails
        with open(outputFile, 'w') as fout:
            fout.write(str(countTable))

        print('Gene count dictionary to: ' + outputFile)
示例#3
0
    def __init__(self, outputDir):
        """Create the output directory layout and record all output paths.

        The process exits with a non-zero status if outputDir already
        exists, so existing results are never overwritten. Otherwise
        outputDir and its sub-directories are created, the HMMER and
        FastTree checks are run, and the path of every output file plus the
        filtering thresholds are stored on the instance.

        outputDir -- directory to create; must not exist yet
        """
        self.img = IMG()
        self.markerSetBuilder = MarkerSetBuilder()

        if os.path.exists(outputDir):
            # a single parenthesised argument prints identically under
            # Python 2 and Python 3
            print('[Error] Output directory already exists: ' + outputDir)
            # non-zero exit status so callers can detect the failure
            sys.exit(1)

        os.makedirs(outputDir)

        self.__checkForHMMER()
        self.__checkForFastTree()

        # sub-directories
        self.hmmDir = os.path.join(outputDir, 'phylo_hmms')
        self.alignmentDir = os.path.join(outputDir, 'gene_alignments')
        self.geneTreeDir = os.path.join(outputDir, 'gene_trees')
        self.conspecificGeneTreeDir = os.path.join(outputDir,
                                                   'gene_trees_conspecific')
        self.finalGeneTreeDir = os.path.join(outputDir, 'gene_trees_final')

        # output files
        self.consistencyOut = os.path.join(outputDir,
                                           'genome_tree.consistency.tsv')
        self.concatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.faa')
        self.derepConcatenatedAlignFile = os.path.join(
            outputDir, 'genome_tree.concatenated.derep.fasta')
        self.treeOut = os.path.join(outputDir, 'genome_tree.tre')
        self.treeOutAce = os.path.join(outputDir, 'genome_tree.ace_ids.tre')

        self.treeRootedOut = os.path.join(outputDir, 'genome_tree.rooted.tre')
        self.treeTaxonomyOut = os.path.join(outputDir,
                                            'genome_tree.taxonomy.tre')
        self.treeDerepOut = os.path.join(outputDir, 'genome_tree.derep.tre')
        self.treeDerepRootedOut = os.path.join(outputDir,
                                               'genome_tree.derep.rooted.tre')
        self.treeDerepBootstrapOut = os.path.join(outputDir,
                                                  'genome_tree.derep.bs.tre')
        self.treeDerepFinalOut = os.path.join(outputDir,
                                              'genome_tree.final.tre')
        self.taxonomyOut = os.path.join(outputDir, 'genome_tree.taxonomy.tsv')
        self.treeMetadata = os.path.join(outputDir, 'genome_tree.metadata.tsv')
        self.phyloHMMsOut = os.path.join(outputDir, 'phylo.hmm')
        self.derepSeqFile = os.path.join(outputDir, 'genome_tree.derep.txt')

        # thresholds
        self.phyloUbiquity = 0.90
        self.phyloSingleCopy = 0.90
        self.paralogAcceptPer = 0.01
        # self.consistencyAcceptPer = 0.95    # for trees at the class-level
        self.consistencyAcceptPer = 0.906  # for trees at the phylum-level
        self.consistencyMinTaxa = 20

        # create output directories
        for subDir in (self.hmmDir, self.alignmentDir, self.geneTreeDir,
                       self.conspecificGeneTreeDir, self.finalGeneTreeDir):
            os.makedirs(subDir)
示例#4
0
    def __init__(self):
        """Store the simulation summary path and rank, and build the helpers."""
        # path of the simulation summary table, and the value kept as looRank
        # (presumably a taxonomic rank index -- TODO confirm)
        self.simFile, self.looRank = (
            './experiments/simulation.tuning.genus.summary.tsv', 5)

        # helper objects, constructed in the same order as before
        markerSetBuilder = MarkerSetBuilder()
        imgInterface = IMG()
        self.markerSetBuilder = markerSetBuilder
        self.img = imgInterface
示例#5
0
 def __init__(self):
     """Build the marker set builder and IMG helper; set simContigLen."""
     # helper objects, constructed in the same order as before
     markerSetBuilder = MarkerSetBuilder()
     imgInterface = IMG()
     self.markerSetBuilder = markerSetBuilder
     self.img = imgInterface

     # value stored as simContigLen
     # (presumably a simulated contig length in bases -- TODO confirm)
     self.simContigLen = 10000
示例#6
0
 def __init__(self):
     """Create the IMG and MarkerSet helper objects used by this instance."""
     # the right-hand side is evaluated left to right, so IMG() is still
     # constructed before MarkerSet()
     self.img, self.markerset = IMG(), MarkerSet()
示例#7
0
    def run(self, inputMetadataFile, outputMetadataFile, outputDir,
            ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination):
        """Split genomes into trusted and filtered sets and report statistics.

        Genomes are grouped by the second column of the metadata file. For
        each group a marker set is built from its 'Finished' genomes, and
        every genome of the group is then checked against that marker set.
        A genome is trusted when its completeness is >= trustedCompleteness
        and its contamination is <= trustedContamination.

        Files written:
          <outputDir>/genomes_all.tsv       -- one row per genome
          <outputDir>/genomes_trusted.tsv   -- rows of trusted genomes
          <outputDir>/genomes_filtered.tsv  -- rows of all other genomes
          <outputDir>/trusted_marker_sets_<lineage>.txt -- marker set per group
          <outputDir>/lineage_stats.tsv     -- genome counts per lineage
          outputMetadataFile -- header plus metadata lines of trusted genomes

        inputMetadataFile    -- tab-separated metadata; the first line is a
                                header, and columns 0-2 are read as genome
                                id, domain and status
        outputMetadataFile   -- path of the filtered metadata file to write
        outputDir            -- existing directory receiving the reports
        ubiquityThreshold    -- passed through to buildMarkerSet
        singleCopyThreshold  -- passed through to buildMarkerSet
        trustedCompleteness  -- minimum completeness of a trusted genome
        trustedContamination -- maximum contamination of a trusted genome
        """
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        # column header shared by the three per-genome reports
        reportHeader = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'

        allTrustedGenomeIds = set()

        # context managers guarantee every report is flushed and closed,
        # even if processing fails part-way through
        with open(os.path.join(outputDir, 'genomes_all.tsv'), 'w') as allOut, \
                open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w') as trustedOut, \
                open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w') as filteredOut, \
                open(outputMetadataFile, 'w') as metadataOut:
            allOut.write(reportHeader)
            trustedOut.write(reportHeader)
            filteredOut.write(reportHeader)

            # read input metadata file
            metadata = img.genomeMetadataFromFile(inputMetadataFile)

            finishedGenomes = defaultdict(set)
            allGenomes = defaultdict(set)

            # raw metadata line of each genome, echoed for trusted genomes
            metadataLine = {}

            with open(inputMetadataFile) as metadataIn:
                for lineNum, line in enumerate(metadataIn):
                    if lineNum == 0:
                        # header line is copied verbatim
                        metadataOut.write(line)
                        continue

                    lineSplit = line.split('\t')
                    genomeId = lineSplit[0]
                    domain = lineSplit[1]
                    status = lineSplit[2]

                    if status == 'Finished':
                        finishedGenomes[domain].add(genomeId)

                    allGenomes[domain].add(genomeId)
                    metadataLine[genomeId] = line

            for lineage, allLineageGenomeIds in allGenomes.items():
                print('[' + lineage + ']')
                print('  Number of genomes: %d' % len(allLineageGenomeIds))

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker genes for finished genomes
                print(
                    '\nDetermining initial marker gene sets for genome filtering.')
                markerSet = markerSetBuilder.buildMarkerSet(
                    finishedGenomes[lineage], ubiquityThreshold,
                    singleCopyThreshold)

                print(
                    '  Marker set consists of %s marker genes organized into %d sets.'
                    % (markerSet.numMarkers(), markerSet.numSets()))
                markerSetFile = os.path.join(
                    outputDir, 'trusted_marker_sets_' + lineage + '.txt')
                with open(markerSetFile, 'w') as fout:
                    fout.write(str(markerSet.markerSet))

                # identifying trusted genomes (highly complete, low contamination genomes)
                print('\nIdentifying highly complete, low contamination genomes.')
                trustedGenomeIds = set()
                filteredGenomes = set()
                retainedStatus = {}
                filteredStatus = {}
                geneCountTable = img.geneCountTable(allLineageGenomeIds)
                for genomeId in allLineageGenomeIds:
                    completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                        markerSet.markerSet, genomeId, geneCountTable)

                    genomeStr = self.__genomeString(genomeId, metadata,
                                                    completeness, contamination,
                                                    missingMarkers,
                                                    duplicateMarkers)

                    # every genome is listed in the 'all' report
                    allOut.write(genomeStr)

                    genomeStatus = metadata[genomeId]['status']
                    if completeness >= trustedCompleteness and contamination <= trustedContamination:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        retainedStatus[genomeStatus] = retainedStatus.get(
                            genomeStatus, 0) + 1

                        trustedOut.write(genomeStr)
                        metadataOut.write(metadataLine[genomeId])
                    else:
                        filteredGenomes.add(genomeId)
                        filteredStatus[genomeStatus] = filteredStatus.get(
                            genomeStatus, 0) + 1

                        filteredOut.write(genomeStr)

                print('  Filtered genomes: %d (%.2f%%)' %
                      (len(filteredGenomes),
                       len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
                print('  ' + str(filteredStatus))
                print('  \nTrusted genomes: %d (%.2f%%)' %
                      (len(trustedGenomeIds),
                       len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
                print('  ' + str(retainedStatus))

                # determine status of retained genomes
                print('\nTrusted genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(
                        taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print('  ' + phylum + ': %d of %d' %
                          (trustedPhylumCounts.get(phylum, 0), count))
                print('')

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        with open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) +
                           '\t' + str(trustedStats.get(lineage, 0)) + '\n')
示例#8
0
 def __init__(self):
     """Create the IMG helper object and keep it on the instance."""
     imgInterface = IMG()
     self.img = imgInterface
示例#9
0
 def __init__(self):
     """Create the marker set builder and IMG helper objects."""
     # the right-hand side is evaluated left to right, so MarkerSetBuilder()
     # is still constructed before IMG()
     self.markerSetBuilder, self.img = MarkerSetBuilder(), IMG()
示例#10
0
    def run(self, metadataFile, percentThreshold):
        """Write the PFAM/TIGRFAM pairs that usually hit the same gene.

        Only genomes whose metadata status is 'Finished' and whose PFAM hit
        file exists are examined. A (PFAM, TIGRFAM) pair is written to
        ../data/pfam/tigrfam2pfam.tsv when the number of genomes in which
        both hit the same gene, divided by the number of genomes containing
        the TIGRFAM, is at least percentThreshold.

        metadataFile     -- metadata file read via img.genomeMetadataFromFile
        percentThreshold -- minimum fraction of genomes; compared against a
                            ratio, so expected in [0, 1]
        """
        img = IMG()

        metadata = img.genomeMetadataFromFile(metadataFile)

        def readHits(hitFile, familyColumn):
            """Map each gene id (column 0) to the family ids in familyColumn."""
            geneIdToFamilies = {}
            with open(hitFile) as fin:
                next(fin, None)  # skip the header line
                for line in fin:
                    lineSplit = line.split('\t')
                    geneIdToFamilies.setdefault(lineSplit[0], set()).add(
                        lineSplit[familyColumn])
            return geneIdToFamilies

        # (pfamId, tigrId) -> genomes in which both families hit the same
        # gene; tuple keys stay unambiguous even if an id contains '-'
        matches = {}

        # tigrId -> genomes with at least one hit to that TIGRFAM
        tigrCount = {}

        numGenomes = len(metadata)
        for genomeCounter, genomeId in enumerate(metadata):
            statusStr = '  Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, numGenomes, float(genomeCounter+1)*100/numGenomes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            if metadata[genomeId]['status'] != 'Finished':
                continue

            genomePrefix = img.genomeDir + genomeId + '/' + genomeId
            pfamFile = genomePrefix + img.pfamExtension
            if not os.path.exists(pfamFile):
                continue

            # get PFAM hits (family id in column 8)
            geneIdToPfams = readHits(pfamFile, 8)

            # get TIGRFAM hits (family id in column 6)
            geneIdToTigr = readHits(genomePrefix + img.tigrExtension, 6)

            for tigrs in geneIdToTigr.values():
                for tigrId in tigrs:
                    tigrCount.setdefault(tigrId, set()).add(genomeId)

            # keep track of TIGRFAMs matching the same gene as a PFAM
            for geneId, pfams in geneIdToPfams.items():
                tigrs = geneIdToTigr.get(geneId)
                if tigrs is None:
                    continue

                for pfamId in pfams:
                    for tigrId in tigrs:
                        matches.setdefault((pfamId, tigrId),
                                           set()).add(genomeId)

        sys.stdout.write('\n')

        # find TIGRFAMs that generally hit the same gene as a PFAM
        with open('../data/pfam/tigrfam2pfam.tsv', 'w') as fout:
            # sorted so the output does not depend on hash ordering
            for (pfam, tigr), genomeSet in sorted(matches.items()):
                # deem a TIGRFAM HMM redundant if it almost always hits the
                # same ORF as a PFAM HMM
                if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                    fout.write(pfam + '\t' + tigr + '\n')