def __getUniversalMarkerGenes(self, phyloUbiquityThreshold, phyloSingleCopyThreshold, outputGeneDir):
    """Determine the marker genes shared by the Archaea and Bacteria lineages.

    For each lineage, the trusted genome ids are looked up through IMG and a
    marker gene set is built with MarkerSetBuilder.buildMarkerGenes(). The
    universal set is the intersection of the per-lineage sets. Its string
    representation is written to 'phylo_marker_set.txt' in outputGeneDir.

    Lineages with no trusted genomes are skipped and do not take part in
    the intersection. If every lineage is skipped, the universal set is empty.

    Args:
        phyloUbiquityThreshold: forwarded unchanged to buildMarkerGenes()
            (presumably the minimum ubiquity of a marker gene; confirm
            against MarkerSetBuilder).
        phyloSingleCopyThreshold: forwarded unchanged to buildMarkerGenes()
            (presumably the minimum single-copy rate; confirm against
            MarkerSetBuilder).
        outputGeneDir: directory in which 'phylo_marker_set.txt' is written.

    Returns:
        Tuple (allTrustedGenomeIds, universalMarkerGenes): the union of the
        trusted genome ids over all processed lineages, and the set of marker
        genes common to all processed lineages.
    """
    img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
    markerSetBuilder = MarkerSetBuilder()

    metadata = img.genomeMetadata()

    allTrustedGenomeIds = set()
    phyloMarkerGenes = {}
    for lineage in ['Archaea', 'Bacteria']:
        # get all genomes in lineage
        print('\nIdentifying all ' + lineage + ' genomes.')
        trustedGenomeIds = img.genomeIdsByTaxonomy(lineage, metadata)
        print(' Trusted genomes in lineage: ' + str(len(trustedGenomeIds)))
        if len(trustedGenomeIds) < 1:
            print(' Skipping lineage due to insufficient number of genomes.')
            continue

        allTrustedGenomeIds.update(trustedGenomeIds)

        print(' Building marker set.')
        phyloMarkerGenes[lineage] = markerSetBuilder.buildMarkerGenes(
            trustedGenomeIds, phyloUbiquityThreshold, phyloSingleCopyThreshold)

    # universal marker genes
    universalMarkerGenes = None
    for markerGenes in phyloMarkerGenes.values():
        if universalMarkerGenes is None:
            # Copy so the in-place intersection below never mutates the set
            # object handed back by the marker set builder.
            universalMarkerGenes = set(markerGenes)
        else:
            universalMarkerGenes.intersection_update(markerGenes)

    if universalMarkerGenes is None:
        # Every lineage was skipped: report an empty set rather than writing
        # 'None' to the output file and then failing on len(None).
        universalMarkerGenes = set()

    # The context manager closes the file even if the write fails.
    with open(os.path.join(outputGeneDir, 'phylo_marker_set.txt'), 'w') as fout:
        fout.write(str(universalMarkerGenes))

    print('')
    print(' Universal marker genes: ' + str(len(universalMarkerGenes)))

    return allTrustedGenomeIds, universalMarkerGenes
def __init__(self):
    """Set up the IMG helper, the Pfam HMM path and the marker set builder.

    All paths are hard-coded absolute locations. IMG is constructed from the
    IMG metadata table and the TIGRFAM-to-Pfam table paths, in that order.
    """
    imgMetadataFile = '/srv/whitlam/bio/db/checkm/img/img_metadata.tsv'
    tigrfamToPfamFile = '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv'

    self.img = IMG(imgMetadataFile, tigrfamToPfamFile)
    self.pfamHMMs = '/srv/whitlam/bio/db/pfam/27/Pfam-A.hmm'
    self.markerSetBuilder = MarkerSetBuilder()