def verifyLineageSet(self, markerSetFile, bRequireTaxonomy):
    """Verify output of lineage set command.

    Parses the tab-separated lineage marker set file and asserts the
    expected bin ID and marker set statistics.

    Args:
        markerSetFile: path to the tab-separated lineage marker set file.
        bRequireTaxonomy: whether the lineage set was built requiring a
            taxonomic label, which changes the expected counts.
    """
    with open(markerSetFile) as f:
        f.readline()  # skip header

        for line in f:
            if line.strip() != '':
                lineSplit = line.split('\t')
                binId = lineSplit[0]
                _numMarkers = int(lineSplit[1])  # parsed for validation; value unused
                uid = lineSplit[2]
                lineage = lineSplit[3]
                numGenomes = int(lineSplit[4])

                # NOTE(review): eval() on file content is unsafe for untrusted
                # input; acceptable here only because this file is produced by
                # the pipeline under test.
                markerSet = MarkerSet(uid, lineage, numGenomes, eval(lineSplit[5].rstrip()))

                np.testing.assert_almost_equal(int(binId), 637000110, err_msg="Failed bin ID test")

                if not bRequireTaxonomy:
                    # this might be a little unstable as it depends on HMMER and prodigal, but
                    # we will see how it goes
                    np.testing.assert_equal(markerSet.numSets(), 266, err_msg="Failed # marker set test")
                    np.testing.assert_equal(markerSet.numMarkers(), 2134, err_msg="Failed # markers test")
                    assert(uid == 'UID5199')
                else:
                    np.testing.assert_equal(markerSet.numSets(), 282, err_msg="Failed # marker set test")
                    np.testing.assert_equal(markerSet.numMarkers(), 1254, err_msg="Failed # markers test")
                    assert(lineage == 'f__Enterobacteriaceae')
def run(self):
    """Collect all taxonomic- and lineage-specific marker genes, expand them
    with HMMs from the same PFAM clan, and build an indexed HMM file."""

    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    for taxa in taxonParser.readMarkerSets().values():
        for ms in taxa.values():
            taxonomicMarkers |= ms.getMarkerGenes()
    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    for nodeId, stats in treeParser.readNodeMetadata().items():
        ms = MarkerSet(nodeId, 'NA', int(stats['# genomes']), eval(stats['marker set']))
        lineageMarkers |= ms.getMarkerGenes()
    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers | lineageMarkers
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    allMarkers = markerGenes | pfam.genesInSameClan(markerGenes)

    # create file with all model accession numbers
    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    with open(keyFile, 'w') as fout:
        for modelAcc in allMarkers:
            fout.write(modelAcc + '\n')

    # fetch specified models
    HF = HMMERRunner(mode='fetch')
    HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

    # index the HMM file, replacing any stale index
    if os.path.exists(self.outputHMMs + '.ssi'):
        os.remove(self.outputHMMs + '.ssi')
    HF.index(self.outputHMMs)

    # remove key file
    os.remove(keyFile)
def buildDomainMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None):
    """Build domain-specific marker sets for a genome in a LOO-fashion.

    Ascends the tree from curNode, and only for the bacterial (UID2) and
    archaeal (UID203) domain nodes builds a marker set from the genomes
    beneath that node — optionally excluding genomeIdsToRemove, which gives
    the leave-one-out behaviour — plus a lineage-refined copy of the set.

    Args:
        tree: genome tree (not referenced in this body; kept for interface parity).
        curNode: tree node for the genome of interest.
        ubiquityThreshold: ubiquity threshold for marker gene selection.
        singleCopyThreshold: single-copy threshold for marker gene selection.
        bMarkerSet: True to build co-located marker sets; False to build one
            flat set of marker genes.
        genomeIdsToRemove: genome IDs to exclude from the calculation, or None.

    Returns:
        Tuple of (binMarkerSets, refinedBinMarkerSet).
    """
    # determine marker sets for bin
    binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
    refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)

    # calculate marker set for bacterial or archaeal node
    uniqueId = curNode.label.split('|')[0]
    # genes flagged for removal for the starting node's lineage; applied to
    # every domain-level marker set built below
    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

    while curNode != None:
        uniqueId = curNode.label.split('|')[0]
        # only the bacterial (UID2) and archaeal (UID203) domain nodes are
        # processed; every other ancestor is skipped
        if uniqueId != 'UID2' and uniqueId != 'UID203':
            curNode = curNode.parent_node
            continue

        stats = self.uniqueIdToLineageStatistics[uniqueId]
        taxonomyStr = stats['taxonomy']
        if taxonomyStr == '':
            # unnamed node: borrow the taxonomy of the nearest named ancestor
            taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics)

        # gather all genomes below this node, including genomes removed from
        # the tree as duplicate sequences
        leafNodes = curNode.leaf_nodes()
        genomeIds = set()
        for leaf in leafNodes:
            genomeIds.add(leaf.taxon.label.replace('IMG_', ''))

            duplicateGenomes = self.duplicateSeqs.get(leaf.taxon.label, [])
            for dup in duplicateGenomes:
                genomeIds.add(dup.replace('IMG_', ''))

        # remove all genomes from the same taxonomic group as the genome of interest
        if genomeIdsToRemove != None:
            genomeIds.difference_update(genomeIdsToRemove)

        # need at least two genomes for ubiquity/single-copy statistics to
        # be meaningful
        if len(genomeIds) >= 2:
            if bMarkerSet:
                markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
            else:
                markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)])

            markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1]
            binMarkerSets.addMarkerSet(markerSet)

            #refinedMarkerSet = self.__refineMarkerSet(markerSet, lineageSpecificMarkerSet)
            # NOTE(review): four leading underscores matches the sibling
            # definition of ____removeInvalidLineageMarkerGenes in this class
            refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)
            #print 'Refinement: %d of %d' % (len(refinedMarkerSet.getMarkerGenes()), len(markerSet.getMarkerGenes()))
            refinedBinMarkerSet.addMarkerSet(refinedMarkerSet)

        curNode = curNode.parent_node

    return binMarkerSets, refinedBinMarkerSet
def run(self):
    """Collect all taxonomic- and lineage-specific marker genes, expand them
    with HMMs from the same PFAM clan, and build an indexed HMM file.

    Ported from Python 2-only syntax (`print` statements, `iteritems`) to
    forms valid under both Python 2 and 3, matching the other entry points
    in this codebase; the key file is now closed via a context manager.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for _, taxa in taxonMarkerSets.items():
        for _, markerSet in taxa.items():
            taxonomicMarkers = taxonomicMarkers.union(markerSet.getMarkerGenes())
    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() of serialized marker set data; safe only
        # because the metadata file ships with the application
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers = lineageMarkers.union(markerSet.getMarkerGenes())
    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    # create file with all model accession numbers
    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    with open(keyFile, 'w') as fout:
        for modelAcc in allMarkers:
            fout.write(modelAcc + '\n')

    # fetch specified models
    HF = HMMERRunner(mode='fetch')
    HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

    # index the HMM file
    if os.path.exists(self.outputHMMs + '.ssi'):
        os.remove(self.outputHMMs + '.ssi')
    HF.index(self.outputHMMs)

    # remove key file
    os.remove(keyFile)
def testMarkerSet(self):
    """Verify marker set data structure."""
    colocatedSets = [set(["a", "b"]), set(["c"])]
    markerSet = MarkerSet(0, "k__Bacteria", 100, colocatedSets)

    geneCount, setCount = markerSet.size()
    self.assertEqual(geneCount, 3)
    self.assertEqual(setCount, 2)
    self.assertEqual(markerSet.numMarkers(), 3)
    self.assertEqual(markerSet.numSets(), 2)
    self.assertEqual(markerSet.getMarkerGenes(), set(["a", "b", "c"]))
def testMarkerSet(self):
    """Verify marker set data structure."""
    ms = MarkerSet(0, 'k__Bacteria', 100, [{'a', 'b'}, {'c'}])

    # size() reports (total marker genes, number of co-located sets)
    markerGenes, markerSets = ms.size()
    self.assertEqual(markerGenes, 3)
    self.assertEqual(markerSets, 2)

    self.assertEqual(ms.numMarkers(), 3)
    self.assertEqual(ms.numSets(), 2)
    self.assertEqual(ms.getMarkerGenes(), {'a', 'b', 'c'})
def testBinMarkerSets(self):
    """Verify bin marker set data structure."""
    binSets = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)

    first = MarkerSet(1, 'k__Bacteria', 100, [{'a', 'b'}, {'c'}])
    second = MarkerSet(2, 'k__Bacteria', 100, [{'d', 'e'}, {'f'}])
    binSets.addMarkerSet(first)
    binSets.addMarkerSet(second)

    # union over all contained marker sets
    self.assertEqual(binSets.getMarkerGenes(), {'a', 'b', 'c', 'd', 'e', 'f'})
    # the first set added is both the most specific and the selected one
    self.assertEqual(binSets.mostSpecificMarkerSet(), first)
    self.assertEqual(binSets.selectedMarkerSet(), first)
def ____removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove):
    """Refine marker set to account for lineage-specific gene loss and duplication.

    Drops marker genes subject to lineage-specific gene loss and
    duplication; co-localization information is taken from the trusted set.
    """
    prunedSets = []
    for colocatedSet in markerSet.markerSet:
        # this code expects 'pfam'-style IDs; warn on PFAM-native 'PF' IDs
        for gene in colocatedSet:
            if gene.startswith('PF'):
                print('ERROR! Expected genes to start with pfam, not PF.')

        kept = {gene for gene in colocatedSet if gene not in lineageSpecificMarkersToRemove}
        if kept:
            prunedSets.append(kept)

    return MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, prunedSets)
def __getMarkerSet(self, parentNode, tree, uniqueIdToLineageStatistics, numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy): """Get marker set for next parent node meeting selection criteria.""" # ascend tree to root finding first node meeting all selection criteria selectedParentNode = parentNode taxonomyStr = 'root' while True: if selectedParentNode.label: # nodes inserted by PPLACER will not have a label trustedUniqueId = selectedParentNode.label.split('|')[0] nodeTaxonomy = selectedParentNode.label.split('|')[1] stats = uniqueIdToLineageStatistics[trustedUniqueId] if stats['# genomes'] >= numGenomesMarkers and stats['bootstrap'] >= bootstrap: if not bForceDomain or nodeTaxonomy in ['k__Bacteria', 'k__Archaea']: if not bRequireTaxonomy or stats['taxonomy'] != '': # get closest taxonomic label taxonomyStr = stats['taxonomy'] if not bRequireTaxonomy and stats['taxonomy'] == '': taxonomyStr = self.__getNextNamedNode(selectedParentNode, uniqueIdToLineageStatistics) # all criteria meet, so use marker set from this node break if selectedParentNode.parent_node == None: break # reached the root node so terminate selectedParentNode = selectedParentNode.parent_node # get marker set meeting all criteria required for a trusted marker set taxonomyStr = taxonomyStr.split(';')[-1] # extract most specific taxonomy identifier markerSet = MarkerSet(trustedUniqueId, taxonomyStr, int(stats['# genomes']), eval(stats['marker set'])) return selectedParentNode, markerSet
def __removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove):
    """Refine marker set to account for lineage-specific gene loss and duplication.

    Drops marker genes subject to lineage-specific gene loss and
    duplication; co-localization information is taken from the trusted set.
    """
    prunedSets = []
    for colocatedSet in markerSet.markerSet:
        kept = set()
        for gene in colocatedSet:
            # PFAM IDs are inconsistent between data sources: translate
            # 'PFxxxxx.y' to 'pfamxxxxx' before testing membership
            if gene.startswith('PF'):
                candidate = gene.replace('PF', 'pfam')
                candidate = candidate[0:candidate.rfind('.')]
            else:
                candidate = gene

            if candidate not in lineageSpecificMarkersToRemove:
                kept.add(gene)

        if kept:
            prunedSets.append(kept)

    return MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, prunedSets)
def readMarkerSets(self):
    """Read taxon-specific marker sets from the default marker set file.

    Returns:
        defaultdict of dicts: taxonMarkerSets[rank][taxon] -> MarkerSet,
        with the default excluded markers already removed.
    """
    taxonMarkerSets = defaultdict(dict)
    # context manager closes the handle deterministically (the original
    # relied on garbage collection to close the file)
    with open(DefaultValues.TAXON_MARKER_SETS) as f:
        for line in f:
            lineSplit = line.split('\t')
            rank = lineSplit[0]
            taxon = lineSplit[1]
            lineage = lineSplit[2]
            numGenomes = int(lineSplit[3])
            # NOTE(review): eval() of serialized data; acceptable only because
            # this file is distributed with the application, not user input
            markerSet = eval(lineSplit[6].rstrip())

            ms = MarkerSet(ranksByLabel[rank], lineage, numGenomes, markerSet)
            ms.removeMarkers(DefaultValues.MARKERS_TO_EXCLUDE)
            taxonMarkerSets[rank][taxon] = ms

    return taxonMarkerSets
def verifyLineageSet(self, markerSetFile, bRequireTaxonomy):
    """Verify output of lineage set command."""
    with open(markerSetFile) as f:
        f.readline()  # skip header

        for line in f:
            if not line.strip():
                continue

            fields = line.split('\t')
            binId = fields[0]
            _numMarkers = int(fields[1])
            uid = fields[2]
            lineage = fields[3]
            numGenomes = int(fields[4])
            _markerSet = MarkerSet(uid, lineage, numGenomes, eval(fields[5].rstrip()))

            np.testing.assert_almost_equal(int(binId), 637000110, err_msg="Failed bin ID test")

            # marker count checks are disabled in both branches: they are
            # unstable as they depend on HMMER and prodigal
            if not bRequireTaxonomy:
                # np.testing.assert_equal(markerSet.numSets(), 336, err_msg="Failed # marker set test")
                # np.testing.assert_equal(markerSet.numMarkers(), 1173, err_msg="Failed # markers test")
                pass
            else:
                # np.testing.assert_equal(markerSet.numSets(), 282, err_msg="Failed # marker set test")
                # np.testing.assert_equal(markerSet.numMarkers(), 1254, err_msg="Failed # markers test")
                pass
def buildMarkerSet(self, genomeIds, ubiquityThreshold, singleCopyThreshold, spacingBetweenContigs = 5000):
    """Infer marker set from specified genomes."""

    # identify candidate marker genes meeting the ubiquity and
    # single-copy thresholds
    markerGenes = self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)

    # group markers into co-located sets based on gene spacing
    geneDistTable = self.img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs)
    colocatedSets = self.colocatedSets(self.colocatedGenes(geneDistTable), markerGenes)

    return MarkerSet(0, 'NA', len(genomeIds), colocatedSets)
def run(self):
    """Annotate each internal-node label with [# genomes, # markers, # sets]
    statistics and write the annotated table.

    Ported from Python 2-only `dict.iteritems()` (removed in Python 3) to
    `items()`, matching the Python 3 variant of this routine; file handles
    are now closed via context managers.
    """
    # read internal nodes file
    metadata = {}
    with open('./experiments/classTree.internal_nodes.tsv') as f:
        for line in f:
            uid, label = [x.strip() for x in line.split('\t')]
            metadata[uid] = label

    # read all lineage-specific marker genes
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uid in metadata:
        stats = uniqueIdToLineageStatistics[uid]
        # NOTE(review): eval() of serialized marker set data; safe only
        # because the metadata file ships with the application
        markerSet = MarkerSet(uid, 'NA', int(stats['# genomes']), eval(stats['marker set']))
        metadata[uid] += ' [%d, %d, %d]' % (stats['# genomes'], markerSet.numMarkers(), markerSet.numSets())

    # write out results
    with open('./experiments/classTree.internal_nodes.metadata.tsv', 'w') as fout:
        for uid, label in metadata.items():
            fout.write(uid + '\t' + label + '\n')
def __refineMarkerSet(self, markerSet, binNode, tree, uniqueIdToLineageStatistics, numGenomesRefine): """Refine marker set to account for lineage-specific gene loss and duplication.""" # lineage-specific refine is done with the sister lineage to where the bin is inserted # get lineage-specific marker set which will be used to refine the above marker set curNode = binNode.sister_nodes()[0] while True: if curNode.label: # nodes inserted by PPLACER will not have a label uniqueId = curNode.label.split('|')[0] stats = uniqueIdToLineageStatistics[uniqueId] if stats['# genomes'] >= numGenomesRefine: break curNode = curNode.parent_node if curNode == None: break # reached the root node so terminate # get lineage-specific marker set lineageMarkerSet = eval(stats['marker set']) # refine marker set by finding the intersection between these two sets, # this removes markers that are not single-copy or ubiquitous in the # specific lineage of a bin # Note: co-localization information is taken from the trusted set # get all lineage-specific marker genes allLineageSpecificGenes = set() for m in lineageMarkerSet: for gene in m: allLineageSpecificGenes.add(gene) # remove genes not present in the lineage-specific gene set finalMarkerSet = [] for ms in markerSet.markerSet: s = set() for gene in ms: if gene in allLineageSpecificGenes: s.add(gene) if s: finalMarkerSet.append(s) refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet) return refinedMarkerSet
def run(self):
    """Append [# genomes, # markers, # sets] statistics to each internal-node
    label and write the annotated table."""

    # read internal nodes file
    metadata = {}
    for row in open('./experiments/classTree.internal_nodes.tsv'):
        nodeId, label = [field.strip() for field in row.split('\t')]
        metadata[nodeId] = label

    # read all lineage-specific marker genes
    nodeStats = TreeParser().readNodeMetadata()
    for nodeId in metadata:
        stats = nodeStats[nodeId]
        ms = MarkerSet(nodeId, 'NA', int(stats['# genomes']), eval(stats['marker set']))
        metadata[nodeId] += ' [%d, %d, %d]' % (stats['# genomes'], ms.numMarkers(), ms.numSets())

    # write out results
    fout = open('./experiments/classTree.internal_nodes.metadata.tsv', 'w')
    for nodeId, label in metadata.items():
        fout.write(nodeId + '\t' + label + '\n')
    fout.close()
def __refineMarkerSet(self, markerSet, lineageSpecificMarkerSet):
    """Refine marker set to account for lineage-specific gene loss and duplication.

    Intersects markerSet with the lineage-specific marker genes; this
    removes markers that are not single-copy or ubiquitous in the specific
    lineage of a bin. Co-localization information is taken from the
    trusted set.

    Args:
        markerSet: trusted MarkerSet to refine.
        lineageSpecificMarkerSet: MarkerSet providing the lineage-specific genes.

    Returns:
        Refined MarkerSet with emptied co-located sets dropped.
    """
    # hoist loop-invariant call: the original rebuilt the gene set via
    # getMarkerGenes() once per gene in the inner loop
    lineageSpecificGenes = lineageSpecificMarkerSet.getMarkerGenes()

    # remove genes not present in the lineage-specific gene set
    finalMarkerSet = []
    for ms in markerSet.markerSet:
        s = {gene for gene in ms if gene in lineageSpecificGenes}
        if s:
            finalMarkerSet.append(s)

    refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet)

    return refinedMarkerSet
def run(self, ubiquityThreshold, minGenomes):
    """Determine lineage-specific gene loss and duplication relative to the
    marker genes defined for each internal node and its ancestors.

    Python 2 `print` statements converted to the parenthesized form (valid
    under both Python 2 and 3), matching other entry points in this codebase.

    Args:
        ubiquityThreshold: ubiquity threshold for missing/duplicate gene tests.
        minGenomes: minimum genomes below a node for it to be analyzed.
    """
    # Pre-compute gene count table
    print('Computing gene count table.')
    start = time.time()
    metadata = self.img.genomeMetadata()
    self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
    end = time.time()
    print(' globalGeneCountTable: %.2f' % (end - start))

    # read selected node for defining marker set
    # NOTE(review): selectedMarkerNode is populated but never read in this
    # body — confirm whether it is dead code
    print('Reading node defining marker set for each internal node.')
    selectedMarkerNode = {}
    for line in open('/srv/whitlam/bio/db/checkm/selected_marker_sets.tsv'):
        lineSplit = line.split('\t')
        selectedMarkerNode[lineSplit[0].strip()] = lineSplit[1].strip()

    # read duplicate taxa
    print('Reading list of identical taxa in genome tree.')
    duplicateTaxa = {}
    for line in open('/srv/whitlam/bio/db/checkm/genome_tree/genome_tree.derep.txt'):
        lineSplit = line.rstrip().split()
        if len(lineSplit) > 1:
            duplicateTaxa[lineSplit[0]] = lineSplit[1:]

    # read in node metadata
    print('Reading node metadata.')
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()

    # read genome tree
    print('Reading in genome tree.')
    treeFile = '/srv/whitlam/bio/db/checkm/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre'
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

    # determine lineage-specific gene loss and duplication (relative to potential marker genes used by a node)
    print('Determining lineage-specific gene loss and duplication')
    fout = open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv', 'w')
    processed = 0
    numInternalNodes = len(tree.internal_nodes())
    for node in tree.internal_nodes():
        processed += 1
        statusStr = ' Finished processing %d of %d (%.2f%%) internal nodes.' % (processed, numInternalNodes, float(processed)*100/numInternalNodes)
        sys.stdout.write('%s\r' % statusStr)
        sys.stdout.flush()

        nodeId = node.label.split('|')[0]

        missingGenes = []
        duplicateGenes = []

        nodeStats = uniqueIdToLineageStatistics[nodeId]
        if nodeStats['# genomes'] >= minGenomes:
            # get marker genes defined for current node along with all parental nodes
            markerGenes = set()
            parentNode = node
            while parentNode != None:
                parentNodeId = parentNode.label.split('|')[0]
                stats = uniqueIdToLineageStatistics[parentNodeId]
                markerSet = MarkerSet(parentNodeId, stats['taxonomy'], stats['# genomes'], eval(stats['marker set']))
                markerGenes = markerGenes.union(markerSet.getMarkerGenes())
                parentNode = parentNode.parent_node

            # silly hack since PFAM ids are inconsistent between the PFAM data and IMG data
            revisedMarkerGeneIds = set()
            for mg in markerGenes:
                if mg.startswith('PF'):
                    revisedMarkerGeneIds.add(mg[0:mg.rfind('.')].replace('PF', 'pfam'))
                else:
                    revisedMarkerGeneIds.add(mg)

            # get all genomes below the internal node (including genomes removed as duplicates)
            genomeIds = []
            for leaf in node.leaf_nodes():
                genomeIds.append(leaf.taxon.label.replace('IMG_', ''))
                if leaf.taxon.label in duplicateTaxa:
                    for genomeId in duplicateTaxa[leaf.taxon.label]:
                        genomeIds.append(genomeId.replace('IMG_', ''))
                        # NOTE(review): the leaf's own ID is re-appended for
                        # every duplicate, producing repeated entries in
                        # genomeIds — confirm this is intentional
                        genomeIds.append(leaf.taxon.label.replace('IMG_', ''))

            missingGenes = self.markerSetBuilder.missingGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
            duplicateGenes = self.markerSetBuilder.duplicateGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)

        fout.write('%s\t%s\t%s\n' % (nodeId, str(missingGenes), str(duplicateGenes)))

    sys.stdout.write('\n')
    fout.close()