def run(self, metadataFile, percentThreshold):
    """Identify TIGRFAM HMMs that are redundant with a PFAM HMM.

    For every finished IMG genome, PFAM and TIGRFAM hits are read and each
    (PFAM, TIGRFAM) pair hitting the same gene is recorded. A TIGRFAM is
    deemed redundant when the fraction of genomes containing the TIGRFAM in
    which it co-hits a gene with a PFAM is >= percentThreshold. Redundant
    pairs are written to ../data/pfam/tigrfam2pfam.tsv.

    Args:
        metadataFile: IMG genome metadata file (read via IMG helper).
        percentThreshold: fraction (0-1) of genomes in which a TIGRFAM must
            co-hit a gene with a PFAM to be considered redundant.
    """
    img = IMG()
    metadata = img.genomeMetadataFromFile(metadataFile)

    # 'pfam-tigr' key -> set of genomes where the pair hits the same gene
    matches = defaultdict(set)
    pfamCount = defaultdict(set)  # pfamId -> genomes containing this PFAM
    tigrCount = defaultdict(set)  # tigrId -> genomes containing this TIGRFAM
    for genomeCounter, genomeId in enumerate(metadata):
        statusStr = ' Finished processing %d of %d (%.2f%%) genomes.' % (
            genomeCounter + 1, len(metadata),
            float(genomeCounter + 1) * 100 / len(metadata))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        # only finished genomes have trustworthy annotation files
        if metadata[genomeId]['status'] != 'Finished':
            continue

        pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension
        if not os.path.exists(pfamFile):
            continue

        # get PFAM hits; column 8 holds the PFAM identifier
        geneIdToPfams = defaultdict(set)
        with open(pfamFile) as f:
            next(f, None)  # skip header
            for line in f:
                lineSplit = line.split('\t')
                geneIdToPfams[lineSplit[0]].add(lineSplit[8])
                pfamCount[lineSplit[8]].add(genomeId)

        # get TIGRFAM hits; column 6 holds the TIGRFAM identifier
        tigrFile = img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension
        geneIdToTigr = defaultdict(set)
        with open(tigrFile) as f:
            next(f, None)  # skip header
            for line in f:
                lineSplit = line.split('\t')
                geneIdToTigr[lineSplit[0]].add(lineSplit[6])
                tigrCount[lineSplit[6]].add(genomeId)

        # keep track of TIGRFAMs matching the same gene as a PFAM
        for geneId in set(geneIdToPfams).intersection(geneIdToTigr):
            for pfamId in geneIdToPfams[geneId]:
                for tigrId in geneIdToTigr[geneId]:
                    matches[pfamId + '-' + tigrId].add(genomeId)

    sys.stdout.write('\n')

    # find TIGRFAMs that generally hit the same gene as a PFAM
    with open('../data/pfam/tigrfam2pfam.tsv', 'w') as fout:
        for key, genomeSet in matches.items():
            pfam, tigr = key.split('-')

            # deem a TIGRFAM HMM redundant if it almost always hits the
            # same ORF as a PFAM HMM
            if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                fout.write(pfam + '\t' + tigr + '\n')
def run(self, metadataFile, percentThreshold):
    """Identify TIGRFAM HMMs that are redundant with a PFAM HMM.

    Reads PFAM and TIGRFAM gene hits for every finished IMG genome and
    records each (PFAM, TIGRFAM) pair hitting the same gene. A TIGRFAM is
    deemed redundant when the fraction of genomes containing the TIGRFAM in
    which it co-hits a gene with a PFAM is >= percentThreshold. Redundant
    pairs are written to ../data/pfam/tigrfam2pfam.tsv.

    Args:
        metadataFile: IMG genome metadata file (read via IMG helper).
        percentThreshold: fraction (0-1) of genomes in which a TIGRFAM must
            co-hit a gene with a PFAM to be considered redundant.
    """
    img = IMG()
    metadata = img.genomeMetadataFromFile(metadataFile)

    matches = {}    # 'pfam-tigr' -> genomes where the pair hits the same gene
    pfamCount = {}  # pfamId -> genomes containing this PFAM
    tigrCount = {}  # tigrId -> genomes containing this TIGRFAM
    for genomeCounter, genomeId in enumerate(metadata):
        statusStr = ' Finished processing %d of %d (%.2f%%) genomes.' % (
            genomeCounter + 1, len(metadata),
            float(genomeCounter + 1) * 100 / len(metadata))
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        if metadata[genomeId]['status'] == 'Finished':
            pfamFile = img.genomeDir + genomeId + '/' + genomeId + img.pfamExtension
            if not os.path.exists(pfamFile):
                continue

            # get PFAM hits; column 8 holds the PFAM identifier
            geneIdToPfams = {}
            with open(pfamFile) as f:
                bHeader = True
                for line in f:
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    geneIdToPfams.setdefault(lineSplit[0], set()).add(lineSplit[8])
                    pfamCount.setdefault(lineSplit[8], set()).add(genomeId)

            # get TIGRFAM hits; column 6 holds the TIGRFAM identifier
            tigrFile = img.genomeDir + genomeId + '/' + genomeId + img.tigrExtension
            geneIdToTigr = {}
            with open(tigrFile) as f:
                bHeader = True
                for line in f:
                    if bHeader:
                        bHeader = False
                        continue

                    lineSplit = line.split('\t')
                    geneIdToTigr.setdefault(lineSplit[0], set()).add(lineSplit[6])
                    tigrCount.setdefault(lineSplit[6], set()).add(genomeId)

            # keep track of TIGRFAMs matching the same gene as a PFAM
            geneIds = set(geneIdToPfams.keys()).union(set(geneIdToTigr.keys()))
            for geneId in geneIds:
                pfams = geneIdToPfams.get(geneId)
                tigrs = geneIdToTigr.get(geneId)
                if pfams is None or tigrs is None:
                    continue

                for pfamId in pfams:
                    for tigrId in tigrs:
                        key = pfamId + '-' + tigrId
                        matches.setdefault(key, set()).add(genomeId)

    sys.stdout.write('\n')

    # find TIGRFAMs that generally hit the same gene as a PFAM
    with open('../data/pfam/tigrfam2pfam.tsv', 'w') as fout:
        for key, genomeSet in matches.items():
            pfam, tigr = key.split('-')

            # deem a TIGRFAM HMM redundant if it almost always hits the
            # same ORF as a PFAM HMM
            if float(len(genomeSet)) / len(tigrCount[tigr]) >= percentThreshold:
                fout.write(pfam + '\t' + tigr + '\n')
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted (highly complete, low contamination) genomes per domain.

    Builds a domain-specific marker set from finished genomes, scores every
    genome against it, and partitions genomes into trusted/filtered sets.
    Writes per-genome reports (genomes_all.tsv, genomes_trusted.tsv,
    genomes_filtered.tsv), a filtered metadata file, per-domain trusted
    marker sets, and lineage-level statistics (lineage_stats.tsv).

    Args:
        inputMetadataFile: IMG metadata file to read genomes from.
        outputMetadataFile: metadata file restricted to trusted genomes.
        outputDir: directory receiving the report files.
        ubiquityThreshold: ubiquity cutoff for marker gene selection.
        singleCopyThreshold: single-copy cutoff for marker gene selection.
        trustedCompleteness: minimum completeness to trust a genome.
        trustedContamination: maximum contamination to trust a genome.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    # one shared header for the all/trusted/filtered per-genome reports
    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write(
        'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n'
    )

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)  # domain -> genome ids with status 'Finished'
    allGenomes = defaultdict(set)       # domain -> all genome ids
    metadataLine = {}                   # genome id -> raw metadata line (for pass-through)
    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # echo the header straight into the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]
        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)
        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        # NOTE(review): taxonomy[1] is presumably the phylum rank — confirm
        # against the metadata parser
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print(
            '\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(
            finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(
            ' Marker set consists of %s marker genes organized into %d sets.'
            % (markerSet.numMarkers(), markerSet.numSets()))
        fout = open(
            os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'),
            'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}  # status string -> count among trusted genomes
        filteredStatus = {}  # status string -> count among filtered genomes
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(
                markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness,
                                            contamination, missingMarkers,
                                            duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                # trusted genome: appears in trusted report, all report,
                # and the pass-through metadata file
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]
                               ['status']] = retainedStatus.get(
                                   metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                metadataOut.write(metadataLine[genomeId])
            else:
                # filtered genome: appears in filtered and all reports only
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]
                               ['status']] = filteredStatus.get(
                                   metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' %
              (len(filteredGenomes),
               len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' %
              (len(trustedGenomeIds),
               len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' %
                  (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}      # truncated lineage string -> genomes with metadata
    trustedStats = {}  # truncated lineage string -> trusted genomes
    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' +
                   str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()
def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
    """Identify trusted (highly complete, low contamination) genomes per domain.

    Builds a domain-specific marker set from finished genomes, scores every
    genome against it, and partitions genomes into trusted/filtered sets.
    Writes per-genome reports (genomes_all.tsv, genomes_trusted.tsv,
    genomes_filtered.tsv), a filtered metadata file, per-domain trusted
    marker sets, and lineage-level statistics (lineage_stats.tsv).

    Ported from Python 2 (print statements, iteritems, xrange) to Python 3
    to match the rest of this module.

    Args:
        inputMetadataFile: IMG metadata file to read genomes from.
        outputMetadataFile: metadata file restricted to trusted genomes.
        outputDir: directory receiving the report files.
        ubiquityThreshold: ubiquity cutoff for marker gene selection.
        singleCopyThreshold: single-copy cutoff for marker gene selection.
        trustedCompleteness: minimum completeness to trust a genome.
        trustedContamination: maximum contamination to trust a genome.
    """
    img = IMG()
    markerSetBuilder = MarkerSetBuilder()

    # one shared header for the all/trusted/filtered per-genome reports
    allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
    allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
    trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
    filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')

    metadataOut = open(outputMetadataFile, 'w')

    # read input metadata file
    metadata = img.genomeMetadataFromFile(inputMetadataFile)
    finishedGenomes = defaultdict(set)  # domain -> genome ids with status 'Finished'
    allGenomes = defaultdict(set)       # domain -> all genome ids
    metadataLine = {}                   # genome id -> raw metadata line (for pass-through)
    bHeader = True
    for line in open(inputMetadataFile):
        if bHeader:
            # echo the header straight into the output metadata file
            metadataOut.write(line)
            bHeader = False
            continue

        lineSplit = line.split('\t')
        genomeId = lineSplit[0]
        domain = lineSplit[1]
        status = lineSplit[2]
        if status == 'Finished':
            finishedGenomes[domain].add(genomeId)
        allGenomes[domain].add(genomeId)
        metadataLine[genomeId] = line

    allTrustedGenomeIds = set()
    for lineage, allLineageGenomeIds in allGenomes.items():
        print('[' + lineage + ']')
        print(' Number of genomes: %d' % len(allLineageGenomeIds))

        # tabulate genomes from each phylum
        # NOTE(review): taxonomy[1] is presumably the phylum rank — confirm
        # against the metadata parser
        allPhylumCounts = {}
        for genomeId in allLineageGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

        # identify marker genes for finished genomes
        print('\nDetermining initial marker gene sets for genome filtering.')
        markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
        print(' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()))
        fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
        fout.write(str(markerSet.markerSet))
        fout.close()

        # identifying trusted genomes (highly complete, low contamination genomes)
        print('\nIdentifying highly complete, low contamination genomes.')
        trustedGenomeIds = set()
        filteredGenomes = set()
        retainedStatus = {}  # status string -> count among trusted genomes
        filteredStatus = {}  # status string -> count among filtered genomes
        geneCountTable = img.geneCountTable(allLineageGenomeIds)
        for genomeId in allLineageGenomeIds:
            completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)

            genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)

            if completeness >= trustedCompleteness and contamination <= trustedContamination:
                # trusted genome: appears in trusted report, all report,
                # and the pass-through metadata file
                trustedGenomeIds.add(genomeId)
                allTrustedGenomeIds.add(genomeId)
                retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1

                trustedOut.write(genomeStr)
                allOut.write(genomeStr)

                metadataOut.write(metadataLine[genomeId])
            else:
                # filtered genome: appears in filtered and all reports only
                filteredGenomes.add(genomeId)
                filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1

                filteredOut.write(genomeStr)
                allOut.write(genomeStr)

        print(' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(filteredStatus))
        print(' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds) * 100.0 / len(allLineageGenomeIds)))
        print(' ' + str(retainedStatus))

        # determine status of retained genomes
        print('\nTrusted genomes by phylum:')
        trustedPhylumCounts = {}
        for genomeId in trustedGenomeIds:
            taxon = metadata[genomeId]['taxonomy'][1]
            trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

        for phylum, count in allPhylumCounts.items():
            print(' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))
        print('')

    allOut.close()
    trustedOut.close()
    filteredOut.close()
    metadataOut.close()

    # write out lineage statistics for genome distribution
    allStats = {}      # truncated lineage string -> genomes with metadata
    trustedStats = {}  # truncated lineage string -> trusted genomes
    for r in range(0, 6):  # Domain to Genus
        for genomeId, data in metadata.items():
            taxaStr = ';'.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted(metadata)

    fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
    fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
    for lineage in sortedLineages:
        fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
    fout.close()