示例#1
0
    def run(self, metadataFile, percentThreshold, outputFile='../data/pfam/tigrfam2pfam.tsv'):
        """Write PFAM/TIGRFAM pairs where the TIGRFAM is redundant with the PFAM.

        For every genome in the metadata whose status is 'Finished' and whose
        PFAM hit file exists, the PFAM and TIGRFAM hit tables are read and each
        (PFAM, TIGRFAM) pair hitting the same gene is recorded. A pair is
        reported when the number of genomes in which the two families share a
        gene, divided by the number of genomes with any hit to that TIGRFAM,
        is at least percentThreshold.

        metadataFile: path handed to IMG.genomeMetadataFromFile().
        percentThreshold: minimum ratio needed to report a pair. Despite the
            name it is compared against a fraction (0 to 1), not a percentage.
        outputFile: path of the tab-separated output (PFAM id, TIGRFAM id per
            line); defaults to the location that used to be hard-coded.

        Raises IOError/OSError if a genome has a PFAM hit file but its
        TIGRFAM hit file cannot be opened.
        """
        def readHits(hitFile, familyColumn):
            """Map each gene id (column 0) to the set of ids in familyColumn."""
            geneIdToFamilies = {}
            with open(hitFile) as fin:
                next(fin, None)  # first line is a header
                for line in fin:
                    lineSplit = line.split('\t')
                    geneIdToFamilies.setdefault(lineSplit[0], set()).add(lineSplit[familyColumn])
            return geneIdToFamilies

        img = IMG()

        metadata = img.genomeMetadataFromFile(metadataFile)
        numGenomes = len(metadata)

        # (pfamId, tigrId) -> genomes where both families hit the same gene;
        # tuple keys avoid having to split a joined string back apart
        matches = {}
        # tigrId -> genomes with at least one hit to that TIGRFAM
        tigrCount = {}
        for genomeCounter, genomeId in enumerate(metadata):
            statusStr = '  Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, numGenomes, float(genomeCounter+1)*100/numGenomes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            if metadata[genomeId]['status'] != 'Finished':
                continue

            genomePrefix = img.genomeDir + genomeId + '/' + genomeId
            pfamFile = genomePrefix + img.pfamExtension
            if not os.path.exists(pfamFile):
                continue

            # PFAM ids are in column 8 and TIGRFAM ids in column 6 of the
            # respective hit tables
            geneIdToPfams = readHits(pfamFile, 8)
            geneIdToTigr = readHits(genomePrefix + img.tigrExtension, 6)

            for geneId, tigrs in geneIdToTigr.items():
                for tigrId in tigrs:
                    tigrCount.setdefault(tigrId, set()).add(genomeId)

                # keep track of TIGRFAMs matching the same gene as a PFAM
                pfams = geneIdToPfams.get(geneId)
                if pfams is None:
                    continue

                for pfamId in pfams:
                    for tigrId in tigrs:
                        matches.setdefault((pfamId, tigrId), set()).add(genomeId)

        sys.stdout.write('\n')

        # find TIGRFAMs that generally hit the same gene as a PFAM
        with open(outputFile, 'w') as fout:
            # sorted so the output is reproducible between runs
            for pfam, tigr in sorted(matches):
                genomeSet = matches[(pfam, tigr)]

                # deem a TIGRFAM HMM redundant if it almost always hits the
                # same ORF as a PFAM HMM
                if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                    fout.write(pfam + '\t' + tigr + '\n')
示例#2
0
    def run(self, metadataFile, percentThreshold, outputFile='../data/pfam/tigrfam2pfam.tsv'):
        """Write PFAM/TIGRFAM pairs where the TIGRFAM is redundant with the PFAM.

        For every genome in the metadata whose status is 'Finished' and whose
        PFAM hit file exists, the PFAM and TIGRFAM hit tables are read and each
        (PFAM, TIGRFAM) pair hitting the same gene is recorded. A pair is
        reported when the number of genomes in which the two families share a
        gene, divided by the number of genomes with any hit to that TIGRFAM,
        is at least percentThreshold.

        metadataFile: path handed to IMG.genomeMetadataFromFile().
        percentThreshold: minimum ratio needed to report a pair. Despite the
            name it is compared against a fraction (0 to 1), not a percentage.
        outputFile: path of the tab-separated output (PFAM id, TIGRFAM id per
            line); defaults to the location that used to be hard-coded.

        Raises IOError/OSError if a genome has a PFAM hit file but its
        TIGRFAM hit file cannot be opened.
        """
        def readHits(hitFile, familyColumn):
            """Map each gene id (column 0) to the set of ids in familyColumn."""
            geneIdToFamilies = {}
            with open(hitFile) as fin:
                next(fin, None)  # first line is a header
                for line in fin:
                    lineSplit = line.split('\t')
                    geneIdToFamilies.setdefault(lineSplit[0], set()).add(lineSplit[familyColumn])
            return geneIdToFamilies

        img = IMG()

        metadata = img.genomeMetadataFromFile(metadataFile)
        numGenomes = len(metadata)

        # (pfamId, tigrId) -> genomes where both families hit the same gene;
        # tuple keys avoid having to split a joined string back apart
        matches = {}
        # tigrId -> genomes with at least one hit to that TIGRFAM
        tigrCount = {}
        for genomeCounter, genomeId in enumerate(metadata):
            statusStr = '  Finished processing %d of %d (%.2f%%) genomes.' % (genomeCounter+1, numGenomes, float(genomeCounter+1)*100/numGenomes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            if metadata[genomeId]['status'] != 'Finished':
                continue

            genomePrefix = img.genomeDir + genomeId + '/' + genomeId
            pfamFile = genomePrefix + img.pfamExtension
            if not os.path.exists(pfamFile):
                continue

            # PFAM ids are in column 8 and TIGRFAM ids in column 6 of the
            # respective hit tables
            geneIdToPfams = readHits(pfamFile, 8)
            geneIdToTigr = readHits(genomePrefix + img.tigrExtension, 6)

            for geneId, tigrs in geneIdToTigr.items():
                for tigrId in tigrs:
                    tigrCount.setdefault(tigrId, set()).add(genomeId)

                # keep track of TIGRFAMs matching the same gene as a PFAM
                pfams = geneIdToPfams.get(geneId)
                if pfams is None:
                    continue

                for pfamId in pfams:
                    for tigrId in tigrs:
                        matches.setdefault((pfamId, tigrId), set()).add(genomeId)

        sys.stdout.write('\n')

        # find TIGRFAMs that generally hit the same gene as a PFAM
        with open(outputFile, 'w') as fout:
            # sorted so the output is reproducible between runs
            for pfam, tigr in sorted(matches):
                genomeSet = matches[(pfam, tigr)]

                # deem a TIGRFAM HMM redundant if it almost always hits the
                # same ORF as a PFAM HMM
                if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                    fout.write(pfam + '\t' + tigr + '\n')
示例#3
0
    def run(self, inputMetadataFile, outputMetadataFile, outputDir,
            ubiquityThreshold, singleCopyThreshold, trustedCompleteness,
            trustedContamination):
        """Split genomes into trusted and filtered sets and write reports.

        Genomes are grouped by the value in column 1 of the metadata file
        (the "lineage"). For each lineage a marker set is built from the
        genomes whose status (column 2) is 'Finished', and every genome of
        the lineage is then checked against it. A genome is trusted when its
        completeness is >= trustedCompleteness and its contamination is
        <= trustedContamination.

        Files written:
          outputDir/genomes_all.tsv, genomes_trusted.tsv, genomes_filtered.tsv
              one row per genome, as produced by __genomeString()
          outputDir/trusted_marker_sets_<lineage>.txt
              string form of the marker set of each lineage
          outputDir/lineage_stats.tsv
              genome and trusted-genome counts per lineage
          outputMetadataFile
              header of inputMetadataFile plus the lines of trusted genomes

        inputMetadataFile: tab-separated metadata with a header line; also
            handed to IMG.genomeMetadataFromFile().
        ubiquityThreshold, singleCopyThreshold: forwarded unchanged to
            MarkerSetBuilder.buildMarkerSet().
        """
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        # shared by the three genome tables
        tableHeader = ('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\t'
                       'Gene count\tCoding base count\tN50\tBiotic Relationship\t'
                       'Status\tCompleteness\tContamination\tMissing markers\t'
                       'Duplicate markers\n')

        # read input metadata file; this is completed before any output file
        # is opened so the input is never truncated while still being read
        metadata = img.genomeMetadataFromFile(inputMetadataFile)

        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)
        metadataLine = {}
        metadataHeader = ''
        with open(inputMetadataFile) as fin:
            for lineNum, line in enumerate(fin):
                if lineNum == 0:
                    metadataHeader = line
                    continue

                lineSplit = line.split('\t')
                genomeId = lineSplit[0]
                domain = lineSplit[1]
                status = lineSplit[2]

                if status == 'Finished':
                    finishedGenomes[domain].add(genomeId)

                allGenomes[domain].add(genomeId)
                metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        with open(os.path.join(outputDir, 'genomes_all.tsv'), 'w') as allOut, \
                open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w') as trustedOut, \
                open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w') as filteredOut, \
                open(outputMetadataFile, 'w') as metadataOut:
            allOut.write(tableHeader)
            trustedOut.write(tableHeader)
            filteredOut.write(tableHeader)
            metadataOut.write(metadataHeader)

            for lineage, allLineageGenomeIds in allGenomes.items():
                print('[' + lineage + ']')
                print('  Number of genomes: %d' % len(allLineageGenomeIds))

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker genes for finished genomes
                print('\nDetermining initial marker gene sets for genome filtering.')
                markerSet = markerSetBuilder.buildMarkerSet(
                    finishedGenomes[lineage], ubiquityThreshold,
                    singleCopyThreshold)

                print('  Marker set consists of %s marker genes organized into %d sets.'
                      % (markerSet.numMarkers(), markerSet.numSets()))
                markerSetFile = os.path.join(outputDir,
                                             'trusted_marker_sets_' + lineage + '.txt')
                with open(markerSetFile, 'w') as fout:
                    fout.write(str(markerSet.markerSet))

                # identifying trusted genomes (highly complete, low contamination genomes)
                print('\nIdentifying highly complete, low contamination genomes.')
                trustedGenomeIds = set()
                filteredGenomes = set()
                retainedStatus = {}
                filteredStatus = {}
                geneCountTable = img.geneCountTable(allLineageGenomeIds)
                for genomeId in allLineageGenomeIds:
                    completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                        markerSet.markerSet, genomeId, geneCountTable)

                    genomeStr = self.__genomeString(genomeId, metadata,
                                                    completeness, contamination,
                                                    missingMarkers,
                                                    duplicateMarkers)
                    genomeStatus = metadata[genomeId]['status']

                    # every genome is listed in the combined table
                    allOut.write(genomeStr)

                    if completeness >= trustedCompleteness and contamination <= trustedContamination:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        retainedStatus[genomeStatus] = retainedStatus.get(genomeStatus, 0) + 1

                        trustedOut.write(genomeStr)
                        metadataOut.write(metadataLine[genomeId])
                    else:
                        filteredGenomes.add(genomeId)
                        filteredStatus[genomeStatus] = filteredStatus.get(genomeStatus, 0) + 1

                        filteredOut.write(genomeStr)

                print('  Filtered genomes: %d (%.2f%%)' %
                      (len(filteredGenomes),
                       len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
                print('  ' + str(filteredStatus))
                print('  \nTrusted genomes: %d (%.2f%%)' %
                      (len(trustedGenomeIds),
                       len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
                print('  ' + str(retainedStatus))

                # determine status of retained genomes
                print('\nTrusted genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print('  ' + phylum + ': %d of %d' %
                          (trustedPhylumCounts.get(phylum, 0), count))
                print('')

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6):  # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r + 1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        with open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' +
                           str(trustedStats.get(lineage, 0)) + '\n')
示例#4
0
    def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
        """Split genomes into trusted and filtered sets and write reports.

        Genomes are grouped by the value in column 1 of the metadata file
        (the "lineage"). For each lineage a marker set is built from the
        genomes whose status (column 2) is 'Finished', and every genome of
        the lineage is then checked against it. A genome is trusted when its
        completeness is >= trustedCompleteness and its contamination is
        <= trustedContamination.

        Files written:
          outputDir/genomes_all.tsv, genomes_trusted.tsv, genomes_filtered.tsv
              one row per genome, as produced by __genomeString()
          outputDir/trusted_marker_sets_<lineage>.txt
              string form of the marker set of each lineage
          outputDir/lineage_stats.tsv
              genome and trusted-genome counts per lineage
          outputMetadataFile
              header of inputMetadataFile plus the lines of trusted genomes

        inputMetadataFile: tab-separated metadata with a header line; also
            handed to IMG.genomeMetadataFromFile().
        ubiquityThreshold, singleCopyThreshold: forwarded unchanged to
            MarkerSetBuilder.buildMarkerSet().

        Only constructs valid on both Python 2.7 and Python 3 are used
        (single-argument print(...), dict.items(), range).
        """
        img = IMG()
        markerSetBuilder = MarkerSetBuilder()

        # shared by the three genome tables
        tableHeader = ('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\t'
                       'Gene count\tCoding base count\tN50\tBiotic Relationship\t'
                       'Status\tCompleteness\tContamination\tMissing markers\t'
                       'Duplicate markers\n')

        # read input metadata file; this is completed before any output file
        # is opened so the input is never truncated while still being read
        metadata = img.genomeMetadataFromFile(inputMetadataFile)

        finishedGenomes = defaultdict(set)
        allGenomes = defaultdict(set)
        metadataLine = {}
        metadataHeader = ''
        with open(inputMetadataFile) as fin:
            for lineNum, line in enumerate(fin):
                if lineNum == 0:
                    metadataHeader = line
                    continue

                lineSplit = line.split('\t')
                genomeId = lineSplit[0]
                domain = lineSplit[1]
                status = lineSplit[2]

                if status == 'Finished':
                    finishedGenomes[domain].add(genomeId)

                allGenomes[domain].add(genomeId)
                metadataLine[genomeId] = line

        allTrustedGenomeIds = set()
        with open(os.path.join(outputDir, 'genomes_all.tsv'), 'w') as allOut, \
                open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w') as trustedOut, \
                open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w') as filteredOut, \
                open(outputMetadataFile, 'w') as metadataOut:
            allOut.write(tableHeader)
            trustedOut.write(tableHeader)
            filteredOut.write(tableHeader)
            metadataOut.write(metadataHeader)

            for lineage, allLineageGenomeIds in allGenomes.items():
                print('[' + lineage + ']')
                print('  Number of genomes: %d' % len(allLineageGenomeIds))

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker genes for finished genomes
                print('\nDetermining initial marker gene sets for genome filtering.')
                markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)

                print('  Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()))
                markerSetFile = os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt')
                with open(markerSetFile, 'w') as fout:
                    fout.write(str(markerSet.markerSet))

                # identifying trusted genomes (highly complete, low contamination genomes)
                print('\nIdentifying highly complete, low contamination genomes.')
                trustedGenomeIds = set()
                filteredGenomes = set()
                retainedStatus = {}
                filteredStatus = {}
                geneCountTable = img.geneCountTable(allLineageGenomeIds)
                for genomeId in allLineageGenomeIds:
                    completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

                    genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)
                    genomeStatus = metadata[genomeId]['status']

                    # every genome is listed in the combined table
                    allOut.write(genomeStr)

                    if completeness >= trustedCompleteness and contamination <= trustedContamination:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)
                        retainedStatus[genomeStatus] = retainedStatus.get(genomeStatus, 0) + 1

                        trustedOut.write(genomeStr)
                        metadataOut.write(metadataLine[genomeId])
                    else:
                        filteredGenomes.add(genomeId)
                        filteredStatus[genomeStatus] = filteredStatus.get(genomeStatus, 0) + 1

                        filteredOut.write(genomeStr)

                print('  Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds)))
                print('  ' + str(filteredStatus))
                print('  \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds)))
                print('  ' + str(retainedStatus))

                # determine status of retained genomes
                print('\nTrusted genomes by phylum:')
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

                for phylum, count in allPhylumCounts.items():
                    print('  ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
                print('')

        # write out lineage statistics for genome distribution
        allStats = {}
        trustedStats = {}

        for r in range(0, 6): # Domain to Genus
            for genomeId, data in metadata.items():
                taxaStr = ';'.join(data['taxonomy'][0:r+1])
                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
                if genomeId in allTrustedGenomeIds:
                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

        sortedLineages = img.lineagesSorted(metadata)

        with open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w') as fout:
            fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
            for lineage in sortedLineages:
                fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')