def treeQA(self, options):
    """Tree QA command.

    Loads the per-bin HMM model information stored under
    ``options.tree_dir``, parses the phylogenetic marker results and asks
    ``TreeParser`` to print a summary using ``options.out_format``,
    ``options.bTabTable`` and ``options.file``.
    """
    self.logger.info(
        '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.'
    )

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # recover the HMM models that were recorded for every bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(
        treeDir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
    modelsByBin = modelParser.loadBinModels(modelInfoPath)

    # gather marker gene statistics from the stored HMMER tables
    parsedResults = ResultsParser(modelsByBin)
    statsByBin = parsedResults.analyseResults(
        treeDir,
        DefaultValues.BIN_STATS_PHYLO_OUT,
        DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # report the taxonomy of each bin
    TreeParser().printSummary(
        options.out_format,
        treeDir,
        parsedResults,
        options.bTabTable,
        options.file,
        statsByBin)

    # an empty file name means nothing was written to disk
    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def tree(self, options):
    """Tree command.

    Finds phylogenetic marker genes in the bins located via
    ``options.bin_dir`` / ``options.extension``, records the models and bin
    statistics under ``options.output_dir``, aligns the markers and runs
    pplacer on the result. Returns early (``None``) when the input
    sequence check fails.
    """
    self.logger.info('[CheckM - tree] Placing bins in reference genome tree.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are checked as proteins, everything else as nucleotides
    if options.bCalledGenes:
        if not checkProteinSeqs(binFiles):
            return
    elif not checkNuclotideSeqs(binFiles):
        return

    outDir = options.output_dir

    # prepare the output directory layout
    for subDir in ('bins', 'storage'):
        makeSurePathExists(os.path.join(outDir, subDir))

    # search the bins for phylogenetically informative genes
    geneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = geneFinder.find(
        binFiles,
        outDir,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        DefaultValues.HMMER_PHYLO_OUT,
        DefaultValues.PHYLO_HMM_MODELS,
        options.bKeepAlignment,
        options.bNucORFs,
        options.bCalledGenes)

    # persist which models were used for which bin
    modelParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(
        outDir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
    modelParser.writeBinModels(binIdToModels, modelInfoPath)

    # per-bin statistics
    statsCalculator = BinStatistics(options.threads)
    statsCalculator.calculate(
        binFiles, outDir, DefaultValues.BIN_STATS_PHYLO_OUT)

    # align the marker genes that were identified
    aligner = HmmerAligner(options.threads)
    treeAlignmentDir = os.path.join(outDir, 'storage', 'tree')
    resultsParser = aligner.makeAlignmentToPhyloMarkers(
        outDir,
        DefaultValues.PHYLO_HMM_MODELS,
        DefaultValues.HMMER_TABLE_PHYLO_OUT,
        binIdToModels,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        False,
        treeAlignmentDir)

    # place bins into the genome tree; pplacer has its own thread setting
    # NOTE(review): presumably kept separate from options.threads to bound
    # memory use -- confirm against the command-line defaults
    placer = PplacerRunner(threads=options.pplacer_threads)
    placer.run(binFiles, resultsParser, outDir, options.bReducedTree)

    self.timeKeeper.printTimeStamp()
def qa(self, options):
    """QA command.

    Tabulates genome statistics from the results stored in
    ``options.analyze_dir``: runs the AAI calculation, loads the per-bin
    models and marker sets, parses the HMMER results with the thresholds
    given in ``options``, prints the summary and caches the results.
    """
    self.logger.info('[CheckM - qa] Tabulating genome statistics.')

    analyzeDir = options.analyze_dir
    checkDirExists(analyzeDir)

    # the exclusion list is optional; only validate it when supplied
    if options.exclude_markers:
        checkFileExists(options.exclude_markers)

    # calculate AAI between markers with multiple hits in a single bin
    aai = AminoAcidIdentity()
    aai.run(options.aai_strain, analyzeDir, options.alignment_file)

    # recover the HMM models and marker sets belonging to each bin
    modelParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(
        analyzeDir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
    modelsByBin = modelParser.loadBinModels(modelInfoPath)
    markerSetsByBin = modelParser.getMarkerSets(
        analyzeDir,
        getBinIdsFromOutDir(analyzeDir),
        options.marker_file,
        options.exclude_markers)

    # parse the results of every bin
    parsedResults = ResultsParser(modelsByBin)
    parsedResults.analyseResults(
        analyzeDir,
        DefaultValues.BIN_STATS_OUT,
        DefaultValues.HMMER_TABLE_OUT,
        bIgnoreThresholds=options.bIgnoreThresholds,
        evalueThreshold=options.e_value,
        lengthThreshold=options.length,
        bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
        bSkipAdjCorrection=options.bSkipAdjCorrection)

    parsedResults.printSummary(
        options.out_format,
        aai,
        markerSetsByBin,
        options.bIndividualMarkers,
        options.coverage_file,
        options.bTabTable,
        options.file,
        anaFolder=analyzeDir)

    parsedResults.cacheResults(
        analyzeDir, markerSetsByBin, options.bIndividualMarkers)

    # an empty file name means nothing was written to disk
    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def __init__(self):
    """Set up the logger and load the bacterial and archaeal marker sets.

    The most specific marker set of each taxonomic marker set file
    (``CHECKM_BAC_MS`` and ``CHECKM_AR_MS``) is stored on the instance;
    the ``*_markers_on_contigs`` attributes start out as ``None``.
    """
    self.logger = logging.getLogger('timestamp')

    msParser = MarkerSetParser()

    # bacterial marker set
    self.bac_ms = msParser.parseTaxonomicMarkerSetFile(
        CHECKM_BAC_MS).mostSpecificMarkerSet()

    # archaeal marker set
    self.ar_ms = msParser.parseTaxonomicMarkerSetFile(
        CHECKM_AR_MS).mostSpecificMarkerSet()

    # not assigned here; left as None until set elsewhere
    self.bac_markers_on_contigs = None
    self.ar_markers_on_contigs = None
def __processBin(self, outDir, tableOut, hmmerOut, markerFile,
                 bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Worker loop: call genes and run HMMER for each bin taken from a queue.

    Bin files are read from ``queueIn`` until a ``None`` sentinel is
    received. For every bin a ``(binId, hmmModelFile)`` tuple is put on
    ``queueOut``.

    Parameters:
        outDir: output directory; per-bin results go to ``outDir/bins/<binId>``.
        tableOut: file name of the HMMER table output inside the bin directory.
        hmmerOut: file name of the full HMMER output inside the bin directory.
        markerFile: marker file handed to ``createHmmModelFile``.
        bKeepAlignment: when false, ``--noali`` is added to the HMMER options.
        bNucORFs: forwarded to the Prodigal runner.
        bCalledGenes: when true, the bin file is used directly as the amino
            acid gene file (and copied into the bin directory) instead of
            running Prodigal.
        queueIn: queue yielding bin file paths, terminated by ``None``.
        queueOut: queue receiving the ``(binId, hmmModelFile)`` results.
    """
    markerSetParser = MarkerSetParser(self.threadsPerSearch)

    while True:
        binFile = queueIn.get(block=True, timeout=None)
        # None is the sentinel telling this worker to stop; identity is the
        # correct comparison for the None singleton
        if binFile is None:
            break

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # run Prodigal unless genes have already been called
        if not bCalledGenes:
            prodigal = ProdigalRunner(binDir)
            if not prodigal.areORFsCalled(bNucORFs):
                prodigal.run(binFile, bNucORFs)
            aaGeneFile = prodigal.aaGeneFile
        else:
            aaGeneFile = binFile
            shutil.copyfile(
                aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

        # extract HMMs into temporary file
        hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

        # run HMMER
        hmmer = HMMERRunner()
        tableOutPath = os.path.join(binDir, tableOut)
        hmmerOutPath = os.path.join(binDir, hmmerOut)

        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmerOptions = ('--cpu ' + str(self.threadsPerSearch)
                        + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr)
        hmmer.search(hmmModelFile, aaGeneFile, tableOutPath, hmmerOutPath,
                     hmmerOptions, bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def __processBin(self, outDir, tableOut, hmmerOut, markerFile,
                 bKeepAlignment, bNucORFs, bCalledGenes, queueIn, queueOut):
    """Process bins from ``queueIn`` until a ``None`` sentinel arrives.

    Each bin gets its own directory ``outDir/bins/<binId>``. Unless
    ``bCalledGenes`` is set, Prodigal is run (if ORFs are not already
    called) to obtain the amino acid gene file; otherwise the bin file
    itself is used and copied to ``DefaultValues.PRODIGAL_AA`` in the bin
    directory. An HMM model file is then created from ``markerFile`` and
    searched against the genes with HMMER, writing ``tableOut`` and
    ``hmmerOut`` into the bin directory. ``--noali`` is passed to HMMER
    when ``bKeepAlignment`` is false. ``bNucORFs`` is forwarded to the
    Prodigal runner.

    A ``(binId, hmmModelFile)`` tuple is put on ``queueOut`` per bin.
    """
    markerSetParser = MarkerSetParser(self.threadsPerSearch)

    while True:
        binFile = queueIn.get(block=True, timeout=None)
        # stop on the None sentinel (identity test, per PEP 8)
        if binFile is None:
            break

        binId = binIdFromFilename(binFile)
        binDir = os.path.join(outDir, 'bins', binId)
        makeSurePathExists(binDir)

        # run Prodigal
        if not bCalledGenes:
            prodigal = ProdigalRunner(binDir)
            if not prodigal.areORFsCalled(bNucORFs):
                prodigal.run(binFile, bNucORFs)
            aaGeneFile = prodigal.aaGeneFile
        else:
            # genes were supplied by the caller; keep a copy with the
            # standard Prodigal output name inside the bin directory
            aaGeneFile = binFile
            shutil.copyfile(
                aaGeneFile, os.path.join(binDir, DefaultValues.PRODIGAL_AA))

        # extract HMMs into temporary file
        hmmModelFile = markerSetParser.createHmmModelFile(binId, markerFile)

        # run HMMER
        hmmer = HMMERRunner()
        tableOutPath = os.path.join(binDir, tableOut)
        hmmerOutPath = os.path.join(binDir, hmmerOut)

        keepAlignStr = '' if bKeepAlignment else '--noali'
        hmmer.search(
            hmmModelFile,
            aaGeneFile,
            tableOutPath,
            hmmerOutPath,
            '--cpu ' + str(self.threadsPerSearch)
            + ' --notextw -E 0.1 --domE 0.1 ' + keepAlignStr,
            bKeepAlignment)

        queueOut.put((binId, hmmModelFile))
def merge(self, options):
    """Merge command.

    Searches the bins for marker genes and runs ``Merger`` to compare the
    markers found in each bin. Returns early (``None``) when the sequence
    check fails or when ``options.marker_file`` is a tree marker set,
    which this command does not accept.
    """
    self.logger.info(
        '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
    )

    checkDirExists(options.bin_dir)
    binFiles = self.binFiles(options.bin_dir, options.extension)

    # called genes are checked as proteins, everything else as nucleotides
    if options.bCalledGenes:
        if not checkProteinSeqs(binFiles):
            return
    elif not checkNuclotideSeqs(binFiles):
        return

    # tree marker sets cannot be used for merging
    markerSetParser = MarkerSetParser()
    fileType = markerSetParser.markerFileType(options.marker_file)
    if fileType == BinMarkerSets.TREE_MARKER_SET:
        self.logger.error(
            'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
        )
        return

    outDir = options.output_dir

    # prepare the output directory layout
    requiredDirs = (
        outDir,
        os.path.join(outDir, 'bins'),
        os.path.join(outDir, 'storage'),
        os.path.join(outDir, 'storage', 'hmms'),
    )
    for requiredDir in requiredDirs:
        makeSurePathExists(requiredDir)

    binIds = [binIdFromFilename(path) for path in binFiles]

    # search the bins for marker genes
    tableName = 'merger.table.txt'
    geneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = geneFinder.find(
        binFiles,
        outDir,
        tableName,
        'merger.hmmer3',
        options.marker_file,
        False,
        False,
        options.bCalledGenes)

    # marker sets for each bin (a fresh parser is created, as before)
    markerSetParser = MarkerSetParser()
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        outDir, binIds, options.marker_file)

    # compare the markers found in each bin
    outputFile = Merger().run(
        binFiles,
        outDir,
        tableName,
        binIdToModels,
        binIdToBinMarkerSets,
        options.delta_comp,
        options.delta_cont,
        options.merged_comp,
        options.merged_cont)

    self.logger.info('Merger information written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def lineageSet(self, options, db=None):
    """Lineage set command.

    Loads the phylogenetic results stored under ``options.tree_dir`` and
    asks ``TreeParser.getBinMarkerSets`` to produce the marker set file
    ``options.marker_file``. ``db`` is accepted but not used here.

    Note: ``options.num_genomes_markers``, ``options.bootstrap`` and
    ``options.bRequireTaxonomy`` are overwritten on the passed-in object.
    """
    self.logger.info(
        '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # recover the HMM models that were recorded for every bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(
        treeDir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
    modelsByBin = modelParser.loadBinModels(modelInfoPath)

    # gather marker gene statistics
    parsedResults = ResultsParser(modelsByBin)
    parsedResults.analyseResults(
        treeDir,
        DefaultValues.BIN_STATS_PHYLO_OUT,
        DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # These options are incompatible with how the lineage-specific marker
    # set is selected, so their values are hard-coded for now.
    options.num_genomes_markers = 2
    options.bootstrap = 0
    options.bRequireTaxonomy = False

    TreeParser().getBinMarkerSets(
        treeDir,
        options.marker_file,
        options.num_genomes_markers,
        options.bootstrap,
        options.bNoLineageSpecificRefinement,
        options.bForceDomain,
        options.bRequireTaxonomy,
        parsedResults,
        options.unique,
        options.multi)

    self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def analyze(self, options, db=None):
    """Analyze command.

    Identifies marker genes in the bins found via ``options.bin_dir`` /
    ``options.extension`` and writes the results under
    ``options.output_dir``: HMMER output per bin, the per-bin model
    information, alignments of markers with multiple hits (in
    ``storage/aai_qa``) and bin statistics. When ``options.bAlignTopHit``
    is set, top-hit alignments and an ``alignment_info.tsv`` table are
    additionally written to ``storage/alignments``.

    Returns early (``None``) when the input sequence check fails.
    ``db`` is accepted but not used here.
    """
    self.logger.info('[CheckM - analyze] Identifying marker genes in bins.')

    binFiles = self.binFiles(options.bin_dir, options.extension)

    if not options.bCalledGenes:
        if not checkNuclotideSeqs(binFiles):
            return
    else:
        if not checkProteinSeqs(binFiles):
            return

    # setup directory structure
    makeSurePathExists(options.output_dir)
    makeSurePathExists(os.path.join(options.output_dir, 'bins'))
    makeSurePathExists(os.path.join(options.output_dir, 'storage'))
    makeSurePathExists(os.path.join(options.output_dir, 'storage', 'aai_qa'))

    # find marker genes in genome bins
    mgf = MarkerGeneFinder(options.threads)
    binIdToModels = mgf.find(binFiles,
                             options.output_dir,
                             DefaultValues.HMMER_TABLE_OUT,
                             DefaultValues.HMMER_OUT,
                             options.marker_file,
                             options.bKeepAlignment,
                             options.bNucORFs,
                             options.bCalledGenes)

    markerSetParser = MarkerSetParser(options.threads)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(
        options.output_dir,
        getBinIdsFromOutDir(options.output_dir),
        options.marker_file)

    hmmModelInfoFile = os.path.join(options.output_dir,
                                    'storage',
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

    self.timeKeeper.printTimeStamp()

    # HMM model file: use the user's file when it is an HMM model set,
    # otherwise fall back to the default models
    if markerSetParser.markerFileType(
            options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
        markerFile = options.marker_file
    else:
        markerFile = DefaultValues.HMM_MODELS

    # align marker genes with multiple hits within a bin
    HA = HmmerAligner(options.threads)
    HA.makeAlignmentsOfMultipleHits(
        options.output_dir,
        markerFile,
        DefaultValues.HMMER_TABLE_OUT,
        binIdToModels,
        binIdToBinMarkerSets,
        False,
        DefaultValues.E_VAL,
        DefaultValues.LENGTH,
        os.path.join(options.output_dir, 'storage', 'aai_qa'))

    self.timeKeeper.printTimeStamp()

    # calculate statistics for each genome bin
    binStats = BinStatistics(options.threads)
    binStats.calculate(binFiles, options.output_dir,
                       DefaultValues.BIN_STATS_OUT)

    self.timeKeeper.printTimeStamp()

    # align top hit to each marker if requested
    if options.bAlignTopHit:
        alignmentOutputFolder = os.path.join(options.output_dir,
                                             'storage',
                                             'alignments')
        makeSurePathExists(alignmentOutputFolder)

        HA = HmmerAligner(options.threads)
        resultsParser = HA.makeAlignmentTopHit(
            options.output_dir,
            options.marker_file,
            DefaultValues.HMMER_TABLE_OUT,
            binIdToModels,
            False,
            DefaultValues.E_VAL,
            DefaultValues.LENGTH,
            True,
            alignmentOutputFolder)

        # report marker gene data using the models of the first bin;
        # look them up once instead of once per marker, and tolerate an
        # empty model table (only the header is written in that case)
        firstBinModels = next(iter(resultsParser.models.values()), {})

        alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                         'alignment_info.tsv')
        # the context manager closes the file even if a write fails
        with open(alignmentInfoFile, 'w') as fout:
            fout.write('Marker Id\tLength (bp)\n')
            for markerId, model in firstBinModels.items():
                fout.write('%s\t%d\n' % (markerId, model.leng))

        self.logger.info('Alignments to top hits stored in: '
                         + alignmentOutputFolder)

        self.timeKeeper.printTimeStamp()