def testBinMarkerSets(self):
    """Check gene pooling and marker set selection of BinMarkerSets."""
    bms = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)

    # two marker sets carrying the same taxon label, added in id order
    firstSet = MarkerSet(1, 'k__Bacteria', 100, [{'a', 'b'}, {'c'}])
    bms.addMarkerSet(firstSet)

    secondSet = MarkerSet(2, 'k__Bacteria', 100, [{'d', 'e'}, {'f'}])
    bms.addMarkerSet(secondSet)

    # genes of every added marker set are reported as one pooled set
    expectedGenes = {'a', 'b', 'c', 'd', 'e', 'f'}
    self.assertEqual(bms.getMarkerGenes(), expectedGenes)

    # the marker set added first is expected from both selectors
    for selector in (bms.mostSpecificMarkerSet, bms.selectedMarkerSet):
        self.assertEqual(selector(), firstSet)
def buildDomainMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet=True, genomeIdsToRemove=None):
    """Build domain-specific marker sets for a genome in a LOO-fashion.

    Starting at curNode and moving towards the root, every node whose
    unique id (the label text before the first '|') is 'UID2' or 'UID203'
    (the bacterial or archaeal node) contributes one marker set. The set is
    built from the genomes below that node, including recorded duplicate
    sequences, after discarding genomeIdsToRemove. A node left with fewer
    than two genomes contributes nothing.

    Parameters:
      tree                 - not used by this method
      curNode              - node to start from; its unique id also selects
                             the lineage-specific genes to remove
      ubiquityThreshold    - forwarded to buildMarkerSet / buildMarkerGenes
      singleCopyThreshold  - forwarded to buildMarkerSet / buildMarkerGenes
      bMarkerSet           - if true use buildMarkerSet; otherwise wrap the
                             result of buildMarkerGenes in a single-entry
                             MarkerSet
      genomeIdsToRemove    - optional genome ids to exclude before building

    Returns:
      (binMarkerSets, refinedBinMarkerSet): the marker sets as built, and
      the same marker sets after removal of the lineage-specific genes.
    """
    startLabel = curNode.label
    binMarkerSets = BinMarkerSets(startLabel, BinMarkerSets.TREE_MARKER_SET)
    refinedBinMarkerSet = BinMarkerSets(startLabel, BinMarkerSets.TREE_MARKER_SET)

    # genes to strip are selected once, from the starting node's unique id
    genesToRemove = self.lineageSpecificGenesToRemove[startLabel.split('|')[0]]

    node = curNode
    while node is not None:
        nodeId = node.label.split('|')[0]

        # only the bacterial or archaeal node yields a marker set
        if nodeId in ('UID2', 'UID203'):
            taxonomyStr = self.uniqueIdToLineageStatistics[nodeId]['taxonomy']
            if taxonomyStr == '':
                taxonomyStr = self.__getNextNamedNode(node, self.uniqueIdToLineageStatistics)

            # genomes under this node, plus any recorded duplicates of them
            genomeIds = set()
            for leaf in node.leaf_nodes():
                leafLabel = leaf.taxon.label
                genomeIds.add(leafLabel.replace('IMG_', ''))
                genomeIds.update(dup.replace('IMG_', '')
                                 for dup in self.duplicateSeqs.get(leafLabel, []))

            # remove all genomes from the same taxonomic group as the genome of interest
            if genomeIdsToRemove is not None:
                genomeIds.difference_update(genomeIdsToRemove)

            if len(genomeIds) >= 2:
                if bMarkerSet:
                    builtSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
                else:
                    numGenomes = len(genomeIds)
                    markerGenes = self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)
                    builtSet = MarkerSet(0, 'NA', numGenomes, [markerGenes])

                builtSet.lineageStr = nodeId + ' | ' + taxonomyStr.split(';')[-1]
                binMarkerSets.addMarkerSet(builtSet)

                # NOTE(review): this name has four leading underscores; it must
                # match the spelling of the method definition - confirm.
                refinedSet = self.____removeInvalidLineageMarkerGenes(builtSet, genesToRemove)
                refinedBinMarkerSet.addMarkerSet(refinedSet)

        node = node.parent_node

    return binMarkerSets, refinedBinMarkerSet
def testBinMarkerSets(self):
    """Exercise BinMarkerSets with two marker sets that share a taxon label."""
    # (marker set id, grouped marker genes) in the order they are added
    fixtures = (
        (1, [set(["a", "b"]), set(["c"])]),
        (2, [set(["d", "e"]), set(["f"])]),
    )

    bms = BinMarkerSets(0, BinMarkerSets.TAXONOMIC_MARKER_SET)
    addedSets = []
    for uid, geneGroups in fixtures:
        ms = MarkerSet(uid, "k__Bacteria", 100, geneGroups)
        bms.addMarkerSet(ms)
        addedSets.append(ms)

    # every gene of every group must appear in the pooled result
    allGenes = set()
    for _, geneGroups in fixtures:
        for group in geneGroups:
            allGenes |= group
    self.assertEqual(bms.getMarkerGenes(), allGenes)

    # both selectors are expected to return the marker set added first
    earliest = addedSets[0]
    self.assertEqual(bms.mostSpecificMarkerSet(), earliest)
    self.assertEqual(bms.selectedMarkerSet(), earliest)
def markerSet(self, rank, taxon, markerFile):
    """Obtain specified taxonomic-specific marker set and write it to file.

    The marker set stored for `taxon` at `rank` supplies a ';'-separated
    lineage string. Its entries are visited in reverse order, and the marker
    set of each entry is collected into one BinMarkerSets object, which is
    then written to `markerFile` after the taxon marker file header.

    Parameters:
      rank        - taxonomic rank; must be a key of readMarkerSets()
      taxon       - taxon name; must be a key of readMarkerSets()[rank]
      markerFile  - path of the file to write

    Returns:
      False if the rank or the taxon is not recognized (an error is logged
      and no file is written), True once the file has been written.

    Side effects:
      The lineageStr of every collected marker set is overwritten with the
      taxon name it was looked up under.
    """
    taxonMarkerSets = self.readMarkerSets()

    if rank not in taxonMarkerSets:
        self.logger.error(' Unrecognized taxonomic rank: ' + rank)
        return False
    elif taxon not in taxonMarkerSets[rank]:
        self.logger.error(' Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
        return False

    # lineage entries, in reverse order of the stored lineage string
    taxonomy = taxonMarkerSets[rank][taxon].lineageStr.split(';')[::-1]
    binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

    numLevels = len(taxonomy)
    for i, lineageTaxon in enumerate(taxonomy):
        # the rank follows the position in the lineage unless it is 'life'
        if rank != 'life':
            rank = ranksByLevel[numLevels - i - 1]

        # species are keyed by the two leading entries joined with a space
        if rank == 'species':
            lineageTaxon = taxonomy[1] + ' ' + taxonomy[0]

        lineageMarkerSet = taxonMarkerSets[rank][lineageTaxon]
        numMarkers, numMarkerSets = lineageMarkerSet.size()
        self.logger.info(' Marker set for %s contains %d marker genes arranged in %d sets.' % (lineageTaxon, numMarkers, numMarkerSets))
        self.logger.info(' Marker set inferred from %d reference genomes.' % lineageMarkerSet.numGenomes)

        lineageMarkerSet.lineageStr = lineageTaxon
        binMarkerSets.addMarkerSet(lineageMarkerSet)

    # the context manager closes the file even if writing fails part-way
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
        binMarkerSets.write(fout)

    return True
def markerSet(self, rank, taxon, markerFile):
    """Obtain specified taxonomic-specific marker set and write it to file.

    Looks up the marker set for `taxon` at `rank`, splits its lineage string
    on ';' and visits the entries in reverse order. The marker set found for
    each entry is added to one BinMarkerSets object, which is written to
    `markerFile` below the taxon marker file header.

    Parameters:
      rank        - taxonomic rank; must be a key of readMarkerSets()
      taxon       - taxon name; must be a key of readMarkerSets()[rank]
      markerFile  - path of the output file

    Returns:
      True after the file has been written. False, with an error logged
      and no file created, when the rank or taxon is unknown.

    Side effects:
      Each marker set that is added has its lineageStr replaced by the
      taxon name used to look it up.
    """
    taxonMarkerSets = self.readMarkerSets()

    if rank not in taxonMarkerSets:
        self.logger.error(' Unrecognized taxonomic rank: ' + rank)
        return False
    elif taxon not in taxonMarkerSets[rank]:
        self.logger.error(
            ' Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
        return False

    requestedSet = taxonMarkerSets[rank][taxon]

    # reversed copy of the lineage entries
    taxonomy = requestedSet.lineageStr.split(';')[::-1]
    binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

    for i, curTaxon in enumerate(taxonomy):
        # 'life' is kept as-is; any other rank is derived from the position
        if rank != 'life':
            rank = ranksByLevel[len(taxonomy) - i - 1]

        # species entries are looked up as '<second entry> <first entry>'
        if rank == 'species':
            curTaxon = taxonomy[1] + ' ' + taxonomy[0]

        curMarkerSet = taxonMarkerSets[rank][curTaxon]
        numMarkers, numMarkerSets = curMarkerSet.size()
        self.logger.info(
            ' Marker set for %s contains %d marker genes arranged in %d sets.'
            % (curTaxon, numMarkers, numMarkerSets))
        self.logger.info(
            ' Marker set inferred from %d reference genomes.'
            % curMarkerSet.numGenomes)

        curMarkerSet.lineageStr = curTaxon
        binMarkerSets.addMarkerSet(curMarkerSet)

    # 'with' guarantees the handle is released if a write raises
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
        binMarkerSets.write(fout)

    return True
def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers, bootstrap, bNoLineageSpecificRefinement, bForceDomain, bRequireTaxonomy, resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin and write them to file.

    Every bin id found for `outDir` is looked up in the placement tree. A
    bin that is not in the tree receives the single marker set obtained
    from the root. For a bin in the tree, the tree is ascended to the root
    and each marker set returned by the selection routine is recorded,
    optionally with lineage-specific genes removed.

    Parameters:
      outDir              - output directory; supplies the bin ids and the
                            tree at storage/tree/<PPLACER_TREE_OUT>
      markerFile          - path of the lineage marker file to write
      numGenomesMarkers   - forwarded to the marker set selection
      bootstrap           - forwarded to the marker set selection
      bNoLineageSpecificRefinement
                          - if true, lineage-specific genes are not removed
      bForceDomain        - forwarded to the marker set selection; for bins
                            in the tree it is combined with the hit checks
      bRequireTaxonomy    - forwarded to the marker set selection
      resultsParser       - results[binId].countUniqueHits() must return
                            (unique hits, multi-copy hits)
      minUnique           - fewer unique hits than this forces the domain
      maxMulti            - more multi-copy hits than this forces the domain

    Returns:
      None. Progress is reported on stderr when the logger's effective
      level is INFO or lower.
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # the context manager closes the file even if a bin fails part-way
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        statusStr = ''
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                sys.stderr.write(' ' * len(statusStr) + '\r')  # clear previous line
                statusStr = ' Finished processing %d of %d (%.2f%%) bins (current: %s).' % (
                    numProcessedBins, len(binIds),
                    float(numProcessedBins) * 100 / len(binIds), binId)
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(
                    rootNode, tree, uniqueIdToLineageStatistics,
                    numGenomesMarkers, bootstrap, bForceDomain,
                    bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # reset for every bin so the decision can never be inherited
                # from a previous bin when no ancestor carries a label
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = parentNode.parent_node is None
                        break
                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                if not bNoLineageSpecificRefinement:
                    # NOTE(review): if no ancestor carries a label, parentNode
                    # is None here and this lookup raises AttributeError.
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    curNode, markerSet = self.__getMarkerSet(
                        curNode.parent_node, tree, uniqueIdToLineageStatistics,
                        numGenomesMarkers, bootstrap, tempForceDomain,
                        bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        # finish the progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')
def getBinMarkerSets(self, outDir, markerFile, numGenomesMarkers, bootstrap, bNoLineageSpecificRefinement, bForceDomain, bRequireTaxonomy, resultsParser, minUnique, maxMulti):
    """Determine marker sets for each bin and write them to file.

    Each bin id found for `outDir` is searched for in the placement tree.
    A bin missing from the tree gets the one marker set obtained from the
    root. Otherwise the tree is ascended from the bin to the root and every
    marker set returned by the selection routine is recorded, with
    lineage-specific genes removed unless that refinement is disabled.

    Parameters:
      outDir              - output directory; supplies the bin ids and the
                            tree at storage/tree/<PPLACER_TREE_OUT>
      markerFile          - path of the lineage marker file to write
      numGenomesMarkers   - forwarded to the marker set selection
      bootstrap           - forwarded to the marker set selection
      bNoLineageSpecificRefinement
                          - if true, lineage-specific genes are not removed
      bForceDomain        - forwarded to the marker set selection; for bins
                            in the tree it is combined with the hit checks
      bRequireTaxonomy    - forwarded to the marker set selection
      resultsParser       - results[binId].countUniqueHits() must return
                            (unique hits, multi-copy hits)
      minUnique           - fewer unique hits than this forces the domain
      maxMulti            - more multi-copy hits than this forces the domain

    Returns:
      None. Progress is reported on stderr when the logger's effective
      level is INFO or lower.
    """
    self.logger.info(' Determining marker sets for each genome bin.')

    # get all bin ids
    binIds = getBinIdsFromOutDir(outDir)

    # get statistics for internal nodes
    uniqueIdToLineageStatistics = self.readNodeMetadata()

    # determine marker set for each bin
    treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
    # NOTE(review): 'as_rooted' looks like the older DendroPy keyword; newer
    # releases spell this rooting="force-rooted" - confirm against the
    # installed DendroPy version before changing it.
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)
    rootNode = tree.find_node(filter_fn=lambda n: n.parent_node is None)

    # 'with' releases the file handle even when processing a bin raises
    with open(markerFile, 'w') as fout:
        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')

        numProcessedBins = 0
        for binId in binIds:
            if self.logger.getEffectiveLevel() <= logging.INFO:
                numProcessedBins += 1
                statusStr = ' Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, len(binIds), float(numProcessedBins)*100/len(binIds))
                sys.stderr.write('%s\r' % statusStr)
                sys.stderr.flush()

            node = tree.find_node_with_taxon_label(binId)
            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
            if node is None:
                # bin is not in tree
                node, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics, numGenomesMarkers, bootstrap, bForceDomain, bRequireTaxonomy)
                binMarkerSets.addMarkerSet(markerSet)
            else:
                # special case: if node is on the bacterial or archaeal branch descendant from the root,
                # then move down the tree to include the domain-specific marker set

                # start from a known value for each bin; otherwise a bin with
                # no labeled ancestor would reuse the previous bin's answer
                bRoot = False
                parentNode = node.parent_node
                while parentNode is not None:
                    if parentNode.label:
                        bRoot = parentNode.parent_node is None
                        break
                    parentNode = parentNode.parent_node

                if bRoot:
                    # since the root is the first labeled node, we need to descend the
                    # tree to incorporate the domain-specific marker set
                    domainNode = self.__findDomainNode(node)
                    curNode = domainNode.child_nodes()[0]
                else:
                    curNode = node

                # get lineage specific refinement for first node with an id
                if not bNoLineageSpecificRefinement:
                    # NOTE(review): parentNode is None here when no ancestor
                    # carries a label, in which case this raises AttributeError.
                    uniqueId = parentNode.label.split('|')[0]
                    self.__readLineageSpecificGenesToRemove()
                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]

                # ascend tree to root, recording all marker sets meeting selection criteria
                while curNode.parent_node is not None:
                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)

                    curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics, numGenomesMarkers, bootstrap, tempForceDomain, bRequireTaxonomy)

                    if not bNoLineageSpecificRefinement:
                        markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)

                    binMarkerSets.addMarkerSet(markerSet)

            binMarkerSets.write(fout)

        # finish the progress line
        if self.logger.getEffectiveLevel() <= logging.INFO:
            sys.stderr.write('\n')