def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
    """Tabulate marker set size per lineage across a sweep of thresholds.

    For every lineage returned by ``img.lineagesSorted(mostSpecificRanks)``
    that keeps at least ``minGenomes`` genomes after intersecting with the
    trusted genome ids, a marker set is computed at each threshold and its
    size is written as one column of ``./data/markerSetSize.tsv``.
    Progress is echoed to stdout.

    Args:
        minThreshold: Passed as the ``stop`` argument of ``arange``.
        maxThreshold: Passed as the ``start`` argument of ``arange``.
        stepSize: Step magnitude; it is negated before being handed to
            ``arange`` so the sweep runs from ``maxThreshold`` downwards.
        minGenomes: Lineages with fewer trusted genomes are skipped.
        mostSpecificRanks: Forwarded unchanged to ``img.lineagesSorted``.
    """
    img = IMG()
    trustedGenomeIds = img.trustedGenomes()

    # Build the sweep once and materialise it, so the header columns and
    # the per-lineage columns are guaranteed to use the same values.
    # NOTE(review): arange is presumably numpy.arange -- confirm.
    thresholds = list(arange(maxThreshold, minThreshold, -stepSize))

    # The context manager closes the file even if a lookup below raises.
    with open("./data/markerSetSize.tsv", "w") as fout:
        fout.write("Lineage\t# genomes")
        for threshold in thresholds:
            fout.write("\t" + str(threshold))
        fout.write("\n")

        for lineage in img.lineagesSorted(mostSpecificRanks):
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trustedGenomeIds))
            numGenomes = len(genomeIds)
            if numGenomes < minGenomes:
                continue

            # print is called with one parenthesised string so the line
            # behaves identically as a Python 2 statement or Python 3 call.
            print("\nLineage " + lineage + " contains " + str(numGenomes) + " genomes.")
            fout.write(lineage + "\t" + str(numGenomes))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in thresholds:
                # The same scaled cutoff is used for both threshold
                # arguments of img.markerGenes.
                cutoff = threshold * numGenomes
                markerSet = img.markerGenes(genomeIds, pfamTable, cutoff, cutoff)
                fout.write("\t" + str(len(markerSet)))
                print(" Threshold = %.2f, marker set size = %d" % (threshold, len(markerSet)))
            fout.write("\n")
def run(self, minThreshold, maxThreshold, stepSize, minGenomes, mostSpecificRanks):
    """Write marker set sizes for each lineage over a range of thresholds.

    Produces ``./data/markerSetSize.tsv``: a header row listing every
    threshold in the sweep, then one row per lineage holding the lineage
    name, its number of trusted genomes, and the marker set size obtained
    at each threshold. Lineages with fewer than ``minGenomes`` trusted
    genomes are omitted. A progress line is printed for every lineage and
    every threshold.

    Args:
        minThreshold: ``stop`` argument given to ``arange``.
        maxThreshold: ``start`` argument given to ``arange``.
        stepSize: Step magnitude; passed to ``arange`` as ``-stepSize``.
        minGenomes: Minimum number of trusted genomes a lineage needs.
        mostSpecificRanks: Forwarded to ``img.lineagesSorted``.
    """
    img = IMG()
    trustedGenomeIds = img.trustedGenomes()

    # Evaluate the sweep a single time; the original rebuilt it for the
    # header and again for every lineage.
    # NOTE(review): arange is presumably numpy.arange -- confirm.
    sweep = list(arange(maxThreshold, minThreshold, -stepSize))

    # 'with' guarantees the handle is released if any img call fails.
    with open('./data/markerSetSize.tsv', 'w') as fout:
        fout.write('Lineage\t# genomes')
        for threshold in sweep:
            fout.write('\t' + str(threshold))
        fout.write('\n')

        lineages = img.lineagesSorted(mostSpecificRanks)
        for lineage in lineages:
            genomeIds = img.genomeIdsByTaxonomy(lineage)
            genomeIds = list(genomeIds.intersection(trustedGenomeIds))
            genomeCount = len(genomeIds)
            if genomeCount < minGenomes:
                continue

            print('\nLineage ' + lineage + ' contains ' + str(genomeCount) + ' genomes.')
            fout.write(lineage + '\t' + str(genomeCount))

            pfamTable = img.pfamTable(genomeIds)
            for threshold in sweep:
                # One scaled value serves as both threshold arguments.
                scaled = threshold * genomeCount
                markerSet = img.markerGenes(genomeIds, pfamTable, scaled, scaled)
                fout.write('\t' + str(len(markerSet)))
                print(' Threshold = %.2f, marker set size = %d' % (threshold, len(markerSet)))
            fout.write('\n')
def run(self, ubiquityThreshold, singleCopyThreshold, rank):
    """Compute marker genes per lineage at ``rank`` and merge them upward.

    First, marker genes are determined for every lineage whose
    ';'-separated taxonomy has exactly ``rank + 1`` entries and which has
    at least three genomes. Then, for each rank above it (``rank - 1`` down
    to 0), the marker genes of the child lineages are intersected to give
    the parent lineage's entry. Progress is printed to stdout.

    NOTE(review): in the visible code the resulting dictionary is neither
    returned nor written anywhere; the method returns None.

    Args:
        ubiquityThreshold: Multiplied by the genome count and passed to
            ``markerset.markerGenes``.
        singleCopyThreshold: Multiplied by the genome count and passed to
            ``markerset.markerGenes``.
        rank: Index of the taxonomic rank at which marker genes are first
            computed.
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata()
    print(' Genomes with metadata: ' + str(len(metadata)))

    # calculate marker set for each lineage at the specified rank
    sortedLineages = img.lineagesSorted(metadata, rank)
    markerGeneLists = {}
    for lineage in sortedLineages:
        taxonomy = lineage.split(';')
        if len(taxonomy) != rank + 1:
            continue

        genomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'Final')
        # NOTE(review): the count table is built before the size check
        # below, so it is also computed for lineages that are skipped.
        countTable = img.countTable(genomeIds)

        numGenomes = len(genomeIds)
        if numGenomes < 3:
            continue

        print('Lineage ' + lineage + ' contains ' + str(numGenomes) + ' genomes.')
        markerGenes = markerset.markerGenes(
            genomeIds,
            countTable,
            ubiquityThreshold * numGenomes,
            singleCopyThreshold * numGenomes)
        print(' Marker genes: ' + str(len(markerGenes)))
        print('')

        markerGeneLists[lineage] = markerGenes

    # calculate the intersection of marker gene lists for higher taxonomic
    # groups (the code intersects; it does not take a union)
    for r in range(rank - 1, -1, -1):
        print('Processing rank ' + str(r))
        rankMarkerGeneLists = {}

        # .items() rather than .iteritems(): this block already uses
        # print() and range(), and dict.iteritems() does not exist on
        # Python 3. .items() works on both major versions. The dictionary
        # is not modified while it is being iterated.
        for lineage, markerGenes in markerGeneLists.items():
            taxonomy = lineage.split(';')
            if len(taxonomy) != r + 2:
                continue

            # NOTE(review): the lineage is split on ';' but rebuilt with
            # '; '. If lineage strings already carry a space after each
            # ';', the rebuilt key gains an extra space -- confirm the
            # expected key format.
            curLineage = '; '.join(taxonomy[0:r + 1])
            if curLineage not in rankMarkerGeneLists:
                rankMarkerGeneLists[curLineage] = markerGenes
            else:
                curMarkerGenes = rankMarkerGeneLists[curLineage]
                rankMarkerGeneLists[curLineage] = curMarkerGenes.intersection(markerGenes)

        # combine marker gene list dictionaries
        markerGeneLists.update(rankMarkerGeneLists)
def run(
    self,
    ubiquityThreshold,
    singleCopyThreshold,
    minGenomes,
    minMarkers,
    mostSpecificRank,
    distThreshold,
    genomeThreshold
):
    """Report co-located marker gene sets for every sufficiently large lineage.

    Writes ``./data/colocated.tsv`` with one row per retained lineage:
    lineage name, genome count, marker gene count, co-located set count,
    and then one tab-separated column per co-located set (members joined
    with ", "). A lineage is skipped when it has fewer than ``minGenomes``
    genomes or fewer than ``minMarkers`` co-located sets. A summary of each
    retained lineage is printed to stdout.

    Args:
        ubiquityThreshold: Multiplied by the genome count and passed to
            ``markerset.markerGenes``.
        singleCopyThreshold: Multiplied by the genome count and passed to
            ``markerset.markerGenes``.
        minGenomes: Minimum number of genomes a lineage must contain.
        minMarkers: Minimum number of co-located sets a lineage must yield.
        mostSpecificRank: Forwarded to ``img.lineagesSorted``.
        distThreshold: Forwarded to ``markerset.colocatedGenes``.
        genomeThreshold: Forwarded to ``markerset.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)

    # buffering=1 requests line buffering; the context manager makes sure
    # the handle is closed even when processing a lineage raises.
    with open("./data/colocated.tsv", "w", 1) as fout:
        fout.write("Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n")

        # enumerate from 1 replaces the hand-maintained counter; skipped
        # lineages still consume a number, as before.
        for lineageCount, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, "Final")
            numGenomes = len(genomeIds)
            if numGenomes < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(
                genomeIds,
                countTable,
                ubiquityThreshold * numGenomes,
                singleCopyThreshold * numGenomes
            )

            geneDistTable = img.geneDistTable(genomeIds, markerGenes)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # Single parenthesised argument: prints the same text whether
            # print is the Python 2 statement or the Python 3 function.
            print(
                "\nLineage " + lineage + " contains " + str(numGenomes) + " genomes ("
                + str(lineageCount) + " of " + str(len(lineages)) + ")."
            )
            print(" Marker genes: " + str(len(markerGenes)))
            print(" Co-located gene sets: " + str(len(colocatedSets)))

            fout.write(
                lineage + "\t" + str(numGenomes) + "\t" + str(len(markerGenes))
                + "\t" + str(len(colocatedSets))
            )
            for cs in colocatedSets:
                fout.write("\t" + ", ".join(cs))
            fout.write("\n")
def run(self, ubiquityThreshold, singleCopyThreshold, rank):
    """Build marker gene sets at ``rank`` and fold them into parent lineages.

    Lineages whose ';'-separated taxonomy has ``rank + 1`` entries and that
    contain at least three genomes get a marker gene set. Each higher rank
    (``rank - 1`` down to 0) then receives, per parent lineage, the
    intersection of its children's sets. Progress goes to stdout; nothing
    is returned.
    """
    imgData = IMG()
    markerFinder = MarkerSet()

    # print is written in call form with a single argument, which emits the
    # same text as the Python 2 print statement.
    print('Reading metadata.')
    genomeInfo = imgData.genomeMetadata()
    print(' Genomes with metadata: ' + str(len(genomeInfo)))

    # marker genes for each lineage sitting exactly at the requested rank
    lineagesInOrder = imgData.lineagesSorted(genomeInfo, rank)
    genesByLineage = {}
    for lineage in lineagesInOrder:
        if len(lineage.split(';')) == rank + 1:
            ids = imgData.genomeIdsByTaxonomy(lineage, genomeInfo, 'Final')
            counts = imgData.countTable(ids)

            if len(ids) >= 3:
                print('Lineage ' + lineage + ' contains ' + str(len(ids)) + ' genomes.')
                genes = markerFinder.markerGenes(
                    ids,
                    counts,
                    ubiquityThreshold * len(ids),
                    singleCopyThreshold * len(ids))
                print(' Marker genes: ' + str(len(genes)))
                print('')

                genesByLineage[lineage] = genes

    # fold child lineages into their parents, one rank at a time
    for r in xrange(rank - 1, -1, -1):
        print('Processing rank ' + str(r))
        parentGenes = {}
        for lineage, genes in genesByLineage.iteritems():
            parts = lineage.split(';')
            if len(parts) != r + 2:
                continue

            parent = '; '.join(parts[:r + 1])
            if parent in parentGenes:
                # a later child narrows the parent's set
                parentGenes[parent] = parentGenes[parent].intersection(genes)
            else:
                # the first child seen seeds the parent's set
                parentGenes[parent] = genes

        # merge this rank's results into the overall dictionary
        genesByLineage.update(parentGenes)
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
    """Write the co-located marker gene sets of each qualifying lineage.

    Output goes to ``./data/colocated.tsv``: a header row followed by one
    row per retained lineage containing the lineage, its genome count, its
    marker gene count, its co-located set count, and one column per
    co-located set (members joined with ', '). Lineages with fewer than
    ``minGenomes`` genomes, or fewer than ``minMarkers`` co-located sets,
    are skipped. Retained lineages are also summarised on stdout.

    Args:
        ubiquityThreshold: Scaled by the genome count and passed to
            ``markerset.markerGenes``.
        singleCopyThreshold: Scaled by the genome count and passed to
            ``markerset.markerGenes``.
        minGenomes: Minimum genomes required in a lineage.
        minMarkers: Minimum co-located sets required for a lineage.
        mostSpecificRank: Forwarded to ``img.lineagesSorted``.
        distThreshold: Forwarded to ``markerset.colocatedGenes``.
        genomeThreshold: Forwarded to ``markerset.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)

    # Opening inside 'with' closes the (line-buffered) file even if one of
    # the per-lineage computations raises.
    with open('./data/colocated.tsv', 'w', 1) as fout:
        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

        lineageCount = 0
        for lineage in lineages:
            lineageCount += 1

            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            if len(genomeIds) < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(
                genomeIds,
                countTable,
                ubiquityThreshold * len(genomeIds),
                singleCopyThreshold * len(genomeIds))

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # Single-argument print(...) is valid both as a Python 2
            # statement and as a Python 3 call.
            print('\nLineage ' + lineage + ' contains ' + str(len(genomeIds))
                  + ' genomes (' + str(lineageCount) + ' of ' + str(len(lineages)) + ').')
            print(' Marker genes: ' + str(len(markerGenes)))
            print(' Co-located gene sets: ' + str(len(colocatedSets)))

            # Assemble the whole row first so it is written in one call and
            # a failure while formatting cannot leave a partial row behind.
            fields = [
                lineage,
                str(len(genomeIds)),
                str(len(markerGenes)),
                str(len(colocatedSets)),
            ]
            fields.extend(', '.join(cs) for cs in colocatedSets)
            fout.write('\t'.join(fields) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
    """Tabulate co-located marker gene sets per lineage.

    For each lineage from ``img.lineagesSorted(mostSpecificRank)`` with at
    least ``minGenomes`` genomes, marker genes and their co-located sets
    are computed. Lineages yielding at least ``minMarkers`` co-located sets
    are printed to stdout and appended as a row to
    ``./data/colocated.tsv``.

    Args:
        ubiquityThreshold: Scaled by the genome count and passed to
            ``markerset.markerGenes``.
        singleCopyThreshold: Scaled by the genome count and passed to
            ``markerset.markerGenes``.
        minGenomes: Lineages with fewer genomes are skipped.
        minMarkers: Lineages with fewer co-located sets are skipped.
        mostSpecificRank: Forwarded to ``img.lineagesSorted``.
        distThreshold: Forwarded to ``markerset.colocatedGenes``.
        genomeThreshold: Forwarded to ``markerset.colocatedGenes``.
    """
    img = IMG()
    markerset = MarkerSet()

    lineages = img.lineagesSorted(mostSpecificRank)

    # The file is opened line-buffered (third argument 1). Using 'with'
    # releases the handle on every exit path, including exceptions.
    with open('./data/colocated.tsv', 'w', 1) as fout:
        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')

        # Position is 1-based and counts skipped lineages too, matching the
        # "(i of N)" progress message.
        for position, lineage in enumerate(lineages, 1):
            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
            genomeTotal = len(genomeIds)
            if genomeTotal < minGenomes:
                continue

            countTable = img.countTable(genomeIds)
            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold * genomeTotal, singleCopyThreshold * genomeTotal)

            geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            if len(colocatedSets) < minMarkers:
                continue

            # One parenthesised string per print keeps the output identical
            # under Python 2 and Python 3.
            print('\nLineage ' + lineage + ' contains ' + str(genomeTotal) + ' genomes (' + str(position) + ' of ' + str(len(lineages)) + ').')
            print(' Marker genes: ' + str(len(markerGenes)))
            print(' Co-located gene sets: ' + str(len(colocatedSets)))

            fout.write(lineage + '\t' + str(genomeTotal) + '\t' + str(len(markerGenes)) + '\t' + str(len(colocatedSets)))
            for cs in colocatedSets:
                fout.write('\t' + ', '.join(cs))
            fout.write('\n')
def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
    """Split genomes into trusted and filtered sets and report statistics.

    For each of the 'Archaea' and 'Bacteria' lineages, marker genes and
    co-located marker sets are derived from a filtered gene count table and
    every genome is scored with ``markerset.genomeCheck``. A genome is
    trusted when its completeness is at least ``trustedCompleteness`` and
    its contamination is at most ``trustedContamination``. Trusted genomes
    are written to ``./data/trusted_genomes.tsv`` and all others to
    ``./data/filtered_genomes.tsv``. Per-lineage breakdowns are printed to
    stdout, and per-lineage genome counts are written to
    ``./data/lineage_stats.tsv``.

    Args:
        ubiquityThreshold: Used at 0.9x to filter the count table and,
            scaled by the genome count, to select marker genes.
        singleCopyThreshold: Used at 0.9x to filter the count table and,
            scaled by the genome count, to select marker genes.
        trustedCompleteness: Minimum completeness of a trusted genome.
        trustedContamination: Maximum contamination of a trusted genome.
        genomeCompleteness: Not used by this method; kept so existing
            callers keep working.
        genomeContamination: Not used by this method; kept so existing
            callers keep working.
    """
    img = IMG()
    markerset = MarkerSet()

    metadata = img.genomeMetadata()

    header = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

    def writeGenomeRow(out, genomeId, completeness, contamination):
        # Emit one report row; shared by the trusted and filtered outputs.
        info = metadata[genomeId]
        out.write(genomeId + '\t' + '; '.join(info['taxonomy']))
        out.write('\t%.2f' % (float(info['genome size']) / 1e6))
        out.write('\t' + str(info['scaffold count']))
        out.write('\t' + info['biotic relationships'])
        out.write('\t' + info['status'])
        out.write('\t%.3f\t%.3f' % (completeness, contamination) + '\n')

    def countBy(genomeIds, keyOf):
        # Count genomes grouped by keyOf(genomeId).
        counts = {}
        for genomeId in genomeIds:
            key = keyOf(genomeId)
            counts[key] = counts.get(key, 0) + 1
        return counts

    def phylumOf(genomeId):
        # The second taxonomy entry is what the report labels "phylum".
        return metadata[genomeId]['taxonomy'][1]

    allTrustedGenomeIds = set()

    # Both reports are managed by 'with' so they are closed even if a
    # lineage fails part-way through.
    with open('./data/trusted_genomes.tsv', 'w') as trustedOut, \
            open('./data/filtered_genomes.tsv', 'w') as filteredOut:
        trustedOut.write(header)
        filteredOut.write(header)

        for lineage in ['Archaea', 'Bacteria']:
            # get all genomes in lineage and build gene count table
            print('\nBuilding gene count table.')
            allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
            countTable = img.countTable(allLineageGenomeIds)
            countTable = img.filterTable(allLineageGenomeIds, countTable, 0.9 * ubiquityThreshold, 0.9 * singleCopyThreshold)

            print('Lineage ' + lineage + ' contains ' + str(len(allLineageGenomeIds)) + ' genomes.')

            # tabulate genomes from each phylum
            allPhylumCounts = countBy(allLineageGenomeIds, phylumOf)

            # identify marker set for genomes
            markerGenes = markerset.markerGenes(
                allLineageGenomeIds,
                countTable,
                ubiquityThreshold * len(allLineageGenomeIds),
                singleCopyThreshold * len(allLineageGenomeIds))
            print(' Marker genes: ' + str(len(markerGenes)))

            geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes, spacingBetweenContigs=1e6)
            colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
            print(' Marker set size: ' + str(len(colocatedSets)))

            # identifying trusted genomes (highly complete, low contamination genomes)
            trustedGenomeIds = set()
            for genomeId in allLineageGenomeIds:
                completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                if completeness >= trustedCompleteness and contamination <= trustedContamination:
                    trustedGenomeIds.add(genomeId)
                    allTrustedGenomeIds.add(genomeId)
                    writeGenomeRow(trustedOut, genomeId, completeness, contamination)
                else:
                    writeGenomeRow(filteredOut, genomeId, completeness, contamination)

            print(' Trusted genomes: ' + str(len(trustedGenomeIds)))

            # determine status of trusted genomes
            statusBreakdown = countBy(trustedGenomeIds, lambda gid: metadata[gid]['status'])

            print(' Trusted genome status breakdown: ')
            # .items() works on both Python 2 and Python 3 dictionaries.
            for status, count in statusBreakdown.items():
                print(' ' + status + ': ' + str(count))

            # determine status of retained genomes
            proposalNameBreakdown = countBy(trustedGenomeIds, lambda gid: metadata[gid]['proposal name'])

            print(' Retained genome proposal name breakdown: ')
            for pn, count in proposalNameBreakdown.items():
                # only proposal names containing one of these tags are shown
                if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                    print(' ' + pn + ': ' + str(count))

            print(' Filtered genomes by phylum:')
            trustedPhylumCounts = countBy(trustedGenomeIds, phylumOf)

            for phylum, count in allPhylumCounts.items():
                print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(6):  # Domain to Genus
        # NOTE(review): assumes metadata is a dict (or offers .items()).
        for genomeId, data in metadata.items():
            taxaStr = '; '.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted()

    with open('./data/lineage_stats.tsv', 'w') as fout:
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')
def run(self, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination, genomeCompleteness, genomeContamination):
    """Identify trusted genomes and write trusted/filtered/lineage reports.

    The 'Archaea' and 'Bacteria' lineages are processed in turn. For each,
    a filtered gene count table is built, marker genes and co-located
    marker sets are derived, and every genome is scored with
    ``markerset.genomeCheck``. Genomes with completeness of at least
    ``trustedCompleteness`` and contamination of at most
    ``trustedContamination`` go to ``./data/trusted_genomes.tsv``; the
    rest go to ``./data/filtered_genomes.tsv``. Summary breakdowns are
    printed per lineage, and ``./data/lineage_stats.tsv`` receives the
    total and trusted genome counts for each lineage returned by
    ``img.lineagesSorted()``.

    Args:
        ubiquityThreshold: Applied at 0.9x when filtering the count table
            and, scaled by the genome count, when picking marker genes.
        singleCopyThreshold: Applied at 0.9x when filtering the count
            table and, scaled by the genome count, when picking marker
            genes.
        trustedCompleteness: Lower completeness bound for trusted genomes.
        trustedContamination: Upper contamination bound for trusted
            genomes.
        genomeCompleteness: Unused here; retained for caller compatibility.
        genomeContamination: Unused here; retained for caller
            compatibility.
    """
    img = IMG()
    markerset = MarkerSet()

    metadata = img.genomeMetadata()

    reportHeader = 'Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tBiotic Relationship\tStatus\tCompleteness\tContamination\n'

    allTrustedGenomeIds = set()

    # Nested 'with' blocks close both reports on every exit path; the
    # original left them open if any step raised.
    with open('./data/trusted_genomes.tsv', 'w') as trustedOut:
        trustedOut.write(reportHeader)

        with open('./data/filtered_genomes.tsv', 'w') as filteredOut:
            filteredOut.write(reportHeader)

            for lineage in ['Archaea', 'Bacteria']:
                # get all genomes in lineage and build gene count table
                print('\nBuilding gene count table.')
                allLineageGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata, 'All')
                countTable = img.countTable(allLineageGenomeIds)
                countTable = img.filterTable(
                    allLineageGenomeIds, countTable,
                    0.9 * ubiquityThreshold, 0.9 * singleCopyThreshold)

                print('Lineage ' + lineage + ' contains ' + str(len(allLineageGenomeIds)) + ' genomes.')

                # tabulate genomes from each phylum
                allPhylumCounts = {}
                for genomeId in allLineageGenomeIds:
                    taxon = metadata[genomeId]['taxonomy'][1]
                    allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1

                # identify marker set for genomes
                markerGenes = markerset.markerGenes(
                    allLineageGenomeIds, countTable,
                    ubiquityThreshold * len(allLineageGenomeIds),
                    singleCopyThreshold * len(allLineageGenomeIds))
                print(' Marker genes: ' + str(len(markerGenes)))

                geneDistTable = img.geneDistTable(allLineageGenomeIds, markerGenes, spacingBetweenContigs=1e6)
                colocatedGenes = markerset.colocatedGenes(geneDistTable, metadata)
                colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
                print(' Marker set size: ' + str(len(colocatedSets)))

                # identifying trusted genomes (highly complete, low contamination genomes)
                trustedGenomeIds = set()
                for genomeId in allLineageGenomeIds:
                    completeness, contamination = markerset.genomeCheck(colocatedSets, genomeId, countTable)

                    isTrusted = completeness >= trustedCompleteness and contamination <= trustedContamination
                    if isTrusted:
                        trustedGenomeIds.add(genomeId)
                        allTrustedGenomeIds.add(genomeId)

                    # Both reports share one row layout, so pick the
                    # destination and write the row once.
                    out = trustedOut if isTrusted else filteredOut
                    info = metadata[genomeId]
                    out.write(genomeId + '\t' + '; '.join(info['taxonomy']))
                    out.write('\t%.2f' % (float(info['genome size']) / 1e6))
                    out.write('\t' + str(info['scaffold count']))
                    out.write('\t' + info['biotic relationships'])
                    out.write('\t' + info['status'])
                    out.write('\t%.3f\t%.3f' % (completeness, contamination) + '\n')

                print(' Trusted genomes: ' + str(len(trustedGenomeIds)))

                # Tally status, proposal name and phylum of the trusted
                # genomes in a single pass.
                statusBreakdown = {}
                proposalNameBreakdown = {}
                trustedPhylumCounts = {}
                for genomeId in trustedGenomeIds:
                    info = metadata[genomeId]
                    status = info['status']
                    statusBreakdown[status] = statusBreakdown.get(status, 0) + 1
                    proposalName = info['proposal name']
                    proposalNameBreakdown[proposalName] = proposalNameBreakdown.get(proposalName, 0) + 1
                    taxon = info['taxonomy'][1]
                    trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1

                # .items() replaces the Python-2-only .iteritems() and
                # behaves the same on both major versions.
                print(' Trusted genome status breakdown: ')
                for status, count in statusBreakdown.items():
                    print(' ' + status + ': ' + str(count))

                print(' Retained genome proposal name breakdown: ')
                for pn, count in proposalNameBreakdown.items():
                    # only proposal names containing one of these tags are listed
                    if 'KMG' in pn or 'GEBA' in pn or 'HMP' in pn:
                        print(' ' + pn + ': ' + str(count))

                print(' Filtered genomes by phylum:')
                for phylum, count in allPhylumCounts.items():
                    print(phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count))

    # write out lineage statistics for genome distribution
    allStats = {}
    trustedStats = {}

    for r in range(0, 6):  # Domain to Genus
        # NOTE(review): assumes metadata is a dict (or offers .items()).
        for genomeId, data in metadata.items():
            taxaStr = '; '.join(data['taxonomy'][0:r + 1])
            allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
            if genomeId in allTrustedGenomeIds:
                trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1

    sortedLineages = img.lineagesSorted()

    with open('./data/lineage_stats.tsv', 'w') as fout:
        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
        for lineage in sortedLineages:
            fout.write(lineage + '\t' + str(allStats.get(lineage, 0)) + '\t' + str(trustedStats.get(lineage, 0)) + '\n')