def treeQA(self, options):
    """Run the tree QA command.

    Loads the per-bin HMM model information stored under
    ``options.tree_dir``, analyses the phylogenetic marker results for
    each bin, and asks the tree parser to print a summary.

    Reads ``tree_dir``, ``out_format``, ``bTabTable`` and ``file`` from
    ``options``. When ``options.file`` is not the empty string, the output
    location is also logged.
    """
    self.logger.info('[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')

    checkDirExists(options.tree_dir)

    # work out which HMM models belong to each bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(
        options.tree_dir,
        'storage',
        DefaultValues.PHYLO_HMM_MODEL_INFO,
    )
    modelsPerBin = modelParser.loadBinModels(modelInfoPath)

    # gather marker gene statistics for every bin
    resultsParser = ResultsParser(modelsPerBin)
    statsPerBin = resultsParser.analyseResults(
        options.tree_dir,
        DefaultValues.BIN_STATS_PHYLO_OUT,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
    )

    # report the taxonomy inferred for each bin
    summaryWriter = TreeParser()
    summaryWriter.printSummary(
        options.out_format,
        options.tree_dir,
        resultsParser,
        options.bTabTable,
        options.file,
        statsPerBin,
    )

    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def run(self):
    """Fetch and index the HMMs for all marker genes and their clan members.

    Collects the taxonomic-specific and lineage-specific marker genes,
    adds every gene reported by ``PFAM.genesInSameClan`` for them, writes
    the resulting accessions to a temporary key file, and uses
    ``HMMERRunner`` to fetch those models from ``self.hmms`` into
    ``self.outputHMMs`` and index the result.

    The temporary key file is removed even when fetching or indexing
    raises.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever text is stored in the
        # 'marker set' field; only safe while the metadata file is trusted.
        # ast.literal_eval may be a drop-in replacement -- confirm the
        # field only ever holds Python literals before switching.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index must be removed first
        if os.path.exists(self.outputHMMs + '.ssi'):
            os.remove(self.outputHMMs + '.ssi')
        HF.index(self.outputHMMs)
    finally:
        # remove key file, also on failure, so temp files do not pile up
        if os.path.exists(keyFile):
            os.remove(keyFile)
def run(self):
    """Fetch and index the HMMs for all marker genes and their clan members.

    Collects the taxonomic-specific and lineage-specific marker genes,
    adds every gene reported by ``PFAM.genesInSameClan`` for them, writes
    the resulting accessions to a temporary key file, and uses
    ``HMMERRunner`` to fetch those models from ``self.hmms`` into
    ``self.outputHMMs`` and index the result.

    The temporary key file is removed even when fetching or indexing
    raises. Printing and dictionary iteration use forms that behave the
    same under Python 2 and Python 3.
    """
    # read all taxonomic-specific marker genes
    print('Reading taxonomic-specific marker genes.')
    taxonomicMarkers = set()
    taxonParser = TaxonParser()
    taxonMarkerSets = taxonParser.readMarkerSets()
    for taxa in taxonMarkerSets.values():
        for markerSet in taxa.values():
            taxonomicMarkers.update(markerSet.getMarkerGenes())

    print(' Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

    # read all lineage-specific marker genes
    print('Reading lineage-specific marker genes.')
    lineageMarkers = set()
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uniqueId, d in uniqueIdToLineageStatistics.items():
        # NOTE(review): eval() executes whatever text is stored in the
        # 'marker set' field; only safe while the metadata file is trusted.
        markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
        lineageMarkers.update(markerSet.getMarkerGenes())

    print(' Lineage-specific marker genes: %d' % len(lineageMarkers))

    # gather all marker genes
    markerGenes = taxonomicMarkers.union(lineageMarkers)
    print(' Total marker genes: %d' % len(markerGenes))

    # get genes from same clan as marker genes
    print('Gathering HMMs from the same clan as marker genes.')
    pfam = PFAM()
    genesInSameClan = pfam.genesInSameClan(markerGenes)
    allMarkers = markerGenes.union(genesInSameClan)

    keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    try:
        # create file with all model accession numbers
        with open(keyFile, 'w') as fout:
            for modelAcc in allMarkers:
                fout.write(modelAcc + '\n')

        # fetch specified models
        HF = HMMERRunner(mode='fetch')
        HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

        # index the HMM file; a stale index must be removed first
        indexFile = self.outputHMMs + '.ssi'
        if os.path.exists(indexFile):
            os.remove(indexFile)
        HF.index(self.outputHMMs)
    finally:
        # remove key file, also on failure, so temp files do not pile up
        if os.path.exists(keyFile):
            os.remove(keyFile)
def run(self):
    """Append marker-set statistics to each internal node label.

    Reads ``uid<TAB>label`` pairs from
    ``./experiments/classTree.internal_nodes.tsv``, looks up each uid in
    the node metadata returned by ``TreeParser.readNodeMetadata``, and
    appends ``' [<# genomes>, <# markers>, <# sets>]'`` to the label. The
    annotated labels are written to
    ``./experiments/classTree.internal_nodes.metadata.tsv``.

    Raises:
        KeyError: if a uid from the input file has no node metadata.
        ValueError: if a non-blank input line does not split into exactly
            two tab-separated fields.
    """
    # read internal nodes file
    metadata = {}
    with open('./experiments/classTree.internal_nodes.tsv') as fin:
        for line in fin:
            if not line.strip():
                # a blank (e.g. trailing) line carries no uid/label pair
                continue
            uid, label = [x.strip() for x in line.split('\t')]
            metadata[uid] = label

    # read all lineage-specific marker genes
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uid in metadata:
        stats = uniqueIdToLineageStatistics[uid]

        # convert once so MarkerSet and the %d format below see the same
        # integer even if the metadata stores the count as text
        numGenomes = int(stats['# genomes'])

        # NOTE(review): eval() executes whatever text is stored in the
        # 'marker set' field; only safe while the metadata file is trusted.
        markerSet = MarkerSet(uid, 'NA', numGenomes, eval(stats['marker set']))
        metadata[uid] += ' [%d, %d, %d]' % (numGenomes, markerSet.numMarkers(), markerSet.numSets())

    # write out results
    with open('./experiments/classTree.internal_nodes.metadata.tsv', 'w') as fout:
        # .items() iterates the same pairs on Python 2 and Python 3
        for uid, label in metadata.items():
            fout.write(uid + '\t' + label + '\n')
def lineageSet(self, options, db=None):
    """Run the lineage set command.

    Loads the per-bin HMM model information stored under
    ``options.tree_dir``, analyses the phylogenetic marker results, and
    has the tree parser derive the marker sets written to
    ``options.marker_file``.

    ``options.num_genomes_markers``, ``options.bootstrap`` and
    ``options.bRequireTaxonomy`` are overwritten on the passed-in object
    with fixed values before use. ``db`` is accepted but not used here.
    """
    self.logger.info('[CheckM - lineage_set] Inferring lineage-specific marker sets.')

    checkDirExists(options.tree_dir)

    # work out which HMM models belong to each bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(
        options.tree_dir,
        'storage',
        DefaultValues.PHYLO_HMM_MODEL_INFO,
    )
    modelsPerBin = modelParser.loadBinModels(modelInfoPath)

    # gather marker gene statistics (return value is not needed here)
    markerResults = ResultsParser(modelsPerBin)
    markerResults.analyseResults(
        options.tree_dir,
        DefaultValues.BIN_STATS_PHYLO_OUT,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
    )

    # These options are incompatible with how the lineage-specific marker
    # set is selected, so the default values are currently hard-coded
    options.num_genomes_markers = 2
    options.bootstrap = 0
    options.bRequireTaxonomy = False

    markerSetSelector = TreeParser()
    markerSetSelector.getBinMarkerSets(
        options.tree_dir,
        options.marker_file,
        options.num_genomes_markers,
        options.bootstrap,
        options.bNoLineageSpecificRefinement,
        options.bForceDomain,
        options.bRequireTaxonomy,
        markerResults,
        options.unique,
        options.multi,
    )

    self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def run(self):
    """Append marker-set statistics to each internal node label.

    Reads ``uid<TAB>label`` pairs from
    ``./experiments/classTree.internal_nodes.tsv``, looks up each uid in
    the node metadata returned by ``TreeParser.readNodeMetadata``, and
    appends ``' [<# genomes>, <# markers>, <# sets>]'`` to the label. The
    annotated labels are written to
    ``./experiments/classTree.internal_nodes.metadata.tsv``.

    Raises:
        KeyError: if a uid from the input file has no node metadata.
        ValueError: if a non-blank input line does not split into exactly
            two tab-separated fields.
    """
    # read internal nodes file
    metadata = {}
    with open('./experiments/classTree.internal_nodes.tsv') as fin:
        for line in fin:
            if not line.strip():
                # a blank (e.g. trailing) line carries no uid/label pair
                continue
            uid, label = [x.strip() for x in line.split('\t')]
            metadata[uid] = label

    # read all lineage-specific marker genes
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
    for uid in metadata:
        stats = uniqueIdToLineageStatistics[uid]

        # convert once so MarkerSet and the %d format below see the same
        # integer even if the metadata stores the count as text
        numGenomes = int(stats['# genomes'])

        # NOTE(review): eval() executes whatever text is stored in the
        # 'marker set' field; only safe while the metadata file is trusted.
        markerSet = MarkerSet(uid, 'NA', numGenomes, eval(stats['marker set']))
        metadata[uid] += ' [%d, %d, %d]' % (numGenomes, markerSet.numMarkers(), markerSet.numSets())

    # write out results
    with open('./experiments/classTree.internal_nodes.metadata.tsv', 'w') as fout:
        for uid, label in metadata.items():
            fout.write(uid + '\t' + label + '\n')
def run(self, ubiquityThreshold, minGenomes):
    """Report missing and duplicated marker genes for each internal tree node.

    For every internal node of the genome tree whose metadata lists at
    least ``minGenomes`` genomes, the marker genes of the node and of all
    its ancestors are pooled. ``self.markerSetBuilder`` is then asked which
    of those genes are missing or duplicated across the genomes below the
    node. One tab-separated line per internal node (node id, missing
    genes, duplicate genes) is written to ``missing_duplicate_genes_50.tsv``
    under the hard-coded CheckM database directory; nodes below the
    ``minGenomes`` cut-off are written with empty lists.

    Args:
        ubiquityThreshold: passed unchanged to
            ``markerSetBuilder.missingGenes`` and
            ``markerSetBuilder.duplicateGenes``.
        minGenomes: minimum ``'# genomes'`` value a node needs to be
            analysed.
    """
    # Pre-compute gene count table
    print('Computing gene count table.')
    start = time.time()
    metadata = self.img.genomeMetadata()
    self.markerSetBuilder.cachedGeneCountTable = self.img.geneCountTable(metadata.keys())
    end = time.time()
    print(' globalGeneCountTable: %.2f' % (end - start))

    # read selected node for defining marker set
    # NOTE(review): this mapping is not used later in this method; the read
    # is kept so a missing or malformed file still fails as before.
    print('Reading node defining marker set for each internal node.')
    selectedMarkerNode = {}
    with open('/srv/whitlam/bio/db/checkm/selected_marker_sets.tsv') as fin:
        for line in fin:
            lineSplit = line.split('\t')
            selectedMarkerNode[lineSplit[0].strip()] = lineSplit[1].strip()

    # read duplicate taxa
    print('Reading list of identical taxa in genome tree.')
    duplicateTaxa = {}
    with open('/srv/whitlam/bio/db/checkm/genome_tree/genome_tree.derep.txt') as fin:
        for line in fin:
            lineSplit = line.rstrip().split()
            if len(lineSplit) > 1:
                duplicateTaxa[lineSplit[0]] = lineSplit[1:]

    # read in node metadata
    print('Reading node metadata.')
    treeParser = TreeParser()
    uniqueIdToLineageStatistics = treeParser.readNodeMetadata()

    # read genome tree
    print('Reading in genome tree.')
    treeFile = '/srv/whitlam/bio/db/checkm/genome_tree/genome_tree_prok.refpkg/genome_tree.final.tre'
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

    # determine lineage-specific gene loss and duplication (relative to
    # potential marker genes used by a node)
    print('Determining lineage-specific gene loss and duplication')

    # collect the internal nodes once instead of walking the tree twice
    internalNodes = tree.internal_nodes()
    numInternalNodes = len(internalNodes)

    with open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv', 'w') as fout:
        for processed, node in enumerate(internalNodes, 1):
            statusStr = ' Finished processing %d of %d (%.2f%%) internal nodes.' % (
                processed, numInternalNodes, float(processed) * 100 / numInternalNodes)
            sys.stdout.write('%s\r' % statusStr)
            sys.stdout.flush()

            nodeId = node.label.split('|')[0]

            missingGenes = []
            duplicateGenes = []

            nodeStats = uniqueIdToLineageStatistics[nodeId]
            if nodeStats['# genomes'] >= minGenomes:
                # get marker genes defined for current node along with all
                # parental nodes
                markerGenes = set()
                parentNode = node
                while parentNode is not None:
                    parentNodeId = parentNode.label.split('|')[0]

                    stats = uniqueIdToLineageStatistics[parentNodeId]
                    # NOTE(review): eval() executes whatever text is stored
                    # in the 'marker set' field; only safe while the
                    # metadata file is trusted.
                    markerSet = MarkerSet(parentNodeId, stats['taxonomy'], stats['# genomes'], eval(stats['marker set']))
                    markerGenes.update(markerSet.getMarkerGenes())

                    parentNode = parentNode.parent_node

                # silly hack since PFAM ids are inconsistent between the
                # PFAM data and IMG data
                revisedMarkerGeneIds = set()
                for mg in markerGenes:
                    if mg.startswith('PF'):
                        revisedMarkerGeneIds.add(mg[0:mg.rfind('.')].replace('PF', 'pfam'))
                    else:
                        revisedMarkerGeneIds.add(mg)

                # get all genomes below the internal node (including genomes
                # removed as duplicates); each genome is listed exactly once
                # -- the leaf's own id used to be appended a second time
                genomeIds = []
                for leaf in node.leaf_nodes():
                    leafLabel = leaf.taxon.label
                    genomeIds.append(leafLabel.replace('IMG_', ''))
                    for genomeId in duplicateTaxa.get(leafLabel, []):
                        genomeIds.append(genomeId.replace('IMG_', ''))

                missingGenes = self.markerSetBuilder.missingGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)
                duplicateGenes = self.markerSetBuilder.duplicateGenes(genomeIds, revisedMarkerGeneIds, ubiquityThreshold)

            fout.write('%s\t%s\t%s\n' % (nodeId, str(missingGenes), str(duplicateGenes)))

    # finish the carriage-return progress line
    sys.stdout.write('\n')