def treeQA(self, options):
    """Run the tree_qa command.

    Reads the per-bin HMM model information stored under
    ``options.tree_dir``, analyses the phylogenetic marker results and
    prints a summary via ``TreeParser.printSummary``.

    Parameters
    ----------
    options
        Parsed command-line options; this method reads ``tree_dir``,
        ``out_format``, ``bTabTable`` and ``file``.
    """
    self.logger.info('[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # HMM model information recorded for each bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(treeDir,
                                 'storage',
                                 DefaultValues.PHYLO_HMM_MODEL_INFO)
    binIdToModels = modelParser.loadBinModels(modelInfoPath)

    # marker gene statistics for each bin
    parser = ResultsParser(binIdToModels)
    binStats = parser.analyseResults(treeDir,
                                     DefaultValues.BIN_STATS_PHYLO_OUT,
                                     DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # report the placement summary of each bin
    TreeParser().printSummary(options.out_format,
                              treeDir,
                              parser,
                              options.bTabTable,
                              options.file,
                              binStats)

    # only announce an output file when one was requested
    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def makeAlignmentsOfMultipleHits(self,
                                 outDir,
                                 markerFile,
                                 hmmTableFile,
                                 binIdToModels,
                                 binIdToBinMarkerSets,
                                 bIgnoreThresholds,
                                 evalueThreshold,
                                 lengthThreshold,
                                 alignOutputDir,
                                 ):
    """Align markers with multiple hits within a bin.

    Bin identifiers are distributed to ``self.totalThreads`` worker
    processes through a queue; a separate process consumes the writer
    queue to report progress.

    Parameters
    ----------
    outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
        Forwarded to ``ResultsParser.parseBinHits``.
    markerFile, binIdToBinMarkerSets
        Forwarded to the worker processes.
    binIdToModels
        Mapping whose keys are the bin identifiers to process.
    alignOutputDir
        Directory for the alignments; created if it does not exist.

    Raises
    ------
    Any exception raised while starting or waiting on the processes is
    re-raised after the processes that are still running have been
    terminated.
    """
    makeSurePathExists(alignOutputDir)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

    # align any markers with multiple hits in a bin
    self.logger.info(' Aligning marker genes with multiple hits in a single bin:')

    # process each bin in parallel
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for binId in binIdToModels:
        workerQueue.put(binId)

    # one None per worker process (presumably the stop marker consumed by
    # __createMSA -- confirm against that method)
    for _ in range(self.totalThreads):
        workerQueue.put(None)

    # Build the process objects before entering the try block so that the
    # cleanup handler can always refer to them.
    calcProc = [mp.Process(target=self.__createMSA,
                           args=(resultsParser,
                                 binIdToBinMarkerSets,
                                 markerFile,
                                 outDir,
                                 alignOutputDir,
                                 workerQueue,
                                 writerQueue))
                for _ in range(self.totalThreads)]
    writeProc = mp.Process(target=self.__reportBinProgress,
                           args=(len(binIdToModels), writerQueue))

    try:
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        # tell the progress reporter that no further results will arrive
        writerQueue.put(None)
        writeProc.join()
    except BaseException:
        # make sure all processes are terminated; processes that were never
        # started (or already finished) are skipped because terminate() is
        # only valid on a running child
        for p in calcProc + [writeProc]:
            if p.is_alive():
                p.terminate()

        # do not hide the failure (or a Ctrl-C) from the caller
        raise
def makeAlignmentToPhyloMarkers(self,
                                outDir,
                                hmmModelFile,
                                hmmTableFile,
                                binIdToModels,
                                bIgnoreThresholds,
                                evalueThreshold,
                                lengthThreshold,
                                bReportHitStats,
                                alignOutputDir,
                                bKeepUnmaskedAlign=False):
    """Align hits to a set of common marker genes.

    Parses the HMM hits of every bin, extracts the marker sequences,
    builds the per-marker HMM files needed for alignment from the models
    of the first bin, aligns the markers into ``alignOutputDir`` and
    finally deletes the temporary HMM files.

    Returns
    -------
    The ``ResultsParser`` instance holding the parsed bin hits.
    """
    self.logger.info("Extracting marker genes to align.")

    # parse the HMM hits of every bin
    parser = ResultsParser(binIdToModels)
    parser.parseBinHits(outDir,
                        hmmTableFile,
                        False,
                        bIgnoreThresholds,
                        evalueThreshold,
                        lengthThreshold)

    # sequences (and their statistics) that will be aligned
    seqs, seqStats = self.__extractMarkerSeqsUnique(outDir, parser)

    # the models of the first bin are used to create the individual HMMs
    firstBinId = list(binIdToModels)[0]
    tmpModelFiles = {}
    self.__makeAlignmentModels(hmmModelFile, binIdToModels[firstBinId], tmpModelFiles)

    # align each marker gene
    makeSurePathExists(alignOutputDir)
    self.__alignMarkerGenes(seqs,
                            seqStats,
                            bReportHitStats,
                            tmpModelFiles,
                            alignOutputDir,
                            bKeepUnmaskedAlign)

    # the per-marker HMM files are temporary
    for tmpPath in tmpModelFiles.values():
        os.remove(tmpPath)

    return parser
def qa(self, options):
    """Run the qa command.

    Computes the AAI between markers with multiple hits, loads the HMM
    models and marker sets of every bin found in ``options.analyze_dir``,
    analyses the HMMER results, prints the summary and caches the results.

    Parameters
    ----------
    options
        Parsed command-line options (analysis directory, thresholds,
        output format and output file, among others).
    """
    self.logger.info('[CheckM - qa] Tabulating genome statistics.')

    analyzeDir = options.analyze_dir
    checkDirExists(analyzeDir)

    # the marker exclusion file is optional
    if options.exclude_markers:
        checkFileExists(options.exclude_markers)

    # AAI between markers with multiple hits in a single bin
    aai = AminoAcidIdentity()
    aai.run(options.aai_strain, analyzeDir, options.alignment_file)

    # HMM models and marker sets of each bin
    markerSetParser = MarkerSetParser(options.threads)
    modelInfoPath = os.path.join(analyzeDir,
                                 'storage',
                                 DefaultValues.CHECKM_HMM_MODEL_INFO)
    binIdToModels = markerSetParser.loadBinModels(modelInfoPath)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(analyzeDir,
                                                         getBinIdsFromOutDir(analyzeDir),
                                                         options.marker_file,
                                                         options.exclude_markers)

    # results for each bin
    parser = ResultsParser(binIdToModels)
    analysisSettings = {
        'bIgnoreThresholds': options.bIgnoreThresholds,
        'evalueThreshold': options.e_value,
        'lengthThreshold': options.length,
        'bSkipPseudoGeneCorrection': options.bSkipPseudoGeneCorrection,
        'bSkipAdjCorrection': options.bSkipAdjCorrection,
    }
    parser.analyseResults(analyzeDir,
                          DefaultValues.BIN_STATS_OUT,
                          DefaultValues.HMMER_TABLE_OUT,
                          **analysisSettings)

    parser.printSummary(options.out_format,
                        aai,
                        binIdToBinMarkerSets,
                        options.bIndividualMarkers,
                        options.coverage_file,
                        options.bTabTable,
                        options.file,
                        anaFolder=analyzeDir)
    parser.cacheResults(analyzeDir,
                        binIdToBinMarkerSets,
                        options.bIndividualMarkers)

    # only announce an output file when one was requested
    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def markerPlot(self, options):
    """Run the marker_plot command.

    Creates one marker gene position plot per bin file found in
    ``options.bin_dir`` and saves it to ``options.output_dir``. Bins
    without an entry in the marker gene statistics or the extended bin
    statistics are skipped.
    """
    self.logger.info('[CheckM - marker_plot] Creating marker gene position plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # previously computed statistics for all bins
    parser = ResultsParser(None)
    markerGeneStats = parser.parseMarkerGeneStats(options.results_dir)
    binStats = parser.parseBinStatsExt(options.results_dir)

    plot = MarkerGenePosPlot(options)

    # generate a plot for each bin
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting marker gene position plot for %s (%d of %d)' % (binId, fileNum, len(binFiles)))

        if not (binId in markerGeneStats and binId in binStats):
            continue  # bin has no marker genes

        if not plot.plot(binFile, markerGeneStats[binId], binStats[binId]):
            self.logger.info('No marker genes found in bin.')
            continue

        outputFile = os.path.join(options.output_dir, binId) + '.marker_pos_plot.' + options.image_type
        plot.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def makeAlignmentToPhyloMarkers(self,
                                outDir,
                                hmmModelFile,
                                hmmTableFile,
                                binIdToModels,
                                bIgnoreThresholds,
                                evalueThreshold,
                                lengthThreshold,
                                bReportHitStats,
                                alignOutputDir,
                                bKeepUnmaskedAlign=False
                                ):
    """Align hits to a set of common marker genes.

    Parses the HMM hits of every bin, extracts the marker sequences,
    creates the individual HMM files required for alignment from the
    models of the first bin, aligns the marker genes into
    ``alignOutputDir`` and removes the temporary HMM files afterwards.

    Parameters
    ----------
    outDir, hmmTableFile, bIgnoreThresholds, evalueThreshold, lengthThreshold
        Forwarded to ``ResultsParser.parseBinHits``.
    hmmModelFile
        HMM model file from which the individual alignment models are made.
    binIdToModels
        Mapping from bin identifier to its models; must not be empty.
    bReportHitStats, bKeepUnmaskedAlign
        Forwarded to the alignment step.
    alignOutputDir
        Directory for the alignments; created if it does not exist.

    Returns
    -------
    The ``ResultsParser`` instance holding the parsed bin hits.
    """
    self.logger.info(" Extracting marker genes to align.")

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile, False, bIgnoreThresholds, evalueThreshold, lengthThreshold)

    # extract the ORFs to align
    markerSeqs, markerStats = self.__extractMarkerSeqsUnique(outDir, resultsParser)

    # generate individual HMMs required to create multiple sequence alignments
    # (dict.keys() is a view on Python 3 and cannot be indexed, so it is
    # materialised as a list first; this also works on Python 2)
    binId = list(binIdToModels.keys())[0]
    hmmModelFiles = {}
    self.__makeAlignmentModels(hmmModelFile, binIdToModels[binId], hmmModelFiles)

    # align each of the marker genes
    makeSurePathExists(alignOutputDir)
    self.__alignMarkerGenes(markerSeqs, markerStats, bReportHitStats, hmmModelFiles, alignOutputDir, bKeepUnmaskedAlign)

    # remove the temporary HMM files
    for fileName in hmmModelFiles:
        os.remove(hmmModelFiles[fileName])

    return resultsParser
def binQAPlot(self, options):
    """Run the bin_qa_plot command.

    Draws a bar plot of bin quality for the bin files in
    ``options.bin_dir``. Unless ``options.bIgnoreHetero`` is set, the AAI
    of the results directory is computed first and its ``aaiHetero``
    values are passed to the plot.
    """
    self.logger.info('[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # extended bin statistics computed by an earlier command
    binStatsExt = ResultsParser(None).parseBinStatsExt(options.results_dir)

    qaPlot = BinQAPlot(options)

    # heterogeneity information is only needed when it is not ignored
    aaiHetero = None
    if not options.bIgnoreHetero:
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, options.results_dir, None)
        aaiHetero = aai.aaiHetero

    plotCreated = qaPlot.plot(binFiles, binStatsExt, options.bIgnoreHetero, aaiHetero)

    if plotCreated:
        outputFile = os.path.join(options.output_dir, 'bin_qa_plot.' + options.image_type)
        qaPlot.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def lineageSet(self, options, db=None):
    """Run the lineage_set command.

    Analyses the phylogenetic marker results stored in
    ``options.tree_dir`` and asks ``TreeParser.getBinMarkerSets`` to write
    the marker sets to ``options.marker_file``.

    Note that ``options.num_genomes_markers``, ``options.bootstrap`` and
    ``options.bRequireTaxonomy`` are overwritten on the passed-in object.
    The ``db`` argument is not used by this method.
    """
    self.logger.info('[CheckM - lineage_set] Inferring lineage-specific marker sets.')

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # HMM model information recorded for each bin
    modelParser = MarkerSetParser()
    modelInfoPath = os.path.join(treeDir,
                                 'storage',
                                 DefaultValues.PHYLO_HMM_MODEL_INFO)
    binIdToModels = modelParser.loadBinModels(modelInfoPath)

    # marker gene statistics
    parser = ResultsParser(binIdToModels)
    parser.analyseResults(treeDir,
                          DefaultValues.BIN_STATS_PHYLO_OUT,
                          DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # These options are incompatible with how the lineage-specific marker
    # set is selected, so their values are hard-coded here.
    for optionName, fixedValue in (('num_genomes_markers', 2),
                                   ('bootstrap', 0),
                                   ('bRequireTaxonomy', False)):
        setattr(options, optionName, fixedValue)

    TreeParser().getBinMarkerSets(treeDir,
                                  options.marker_file,
                                  options.num_genomes_markers,
                                  options.bootstrap,
                                  options.bNoLineageSpecificRefinement,
                                  options.bForceDomain,
                                  options.bRequireTaxonomy,
                                  parser,
                                  options.unique,
                                  options.multi)

    self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def verifyAnalyze(self, outdir):
    """Verify output of analyze command.

    Compares the statistics parsed for bin ``637000110`` against
    independently verified ground truth values and raises an
    ``AssertionError`` (via ``numpy.testing``) on the first mismatch.
    Coding density and the number of predicted genes are not checked
    because they depend on the exact version of prodigal.
    """
    parsedStats = ResultsParser(None).parseBinStats(outdir, DefaultValues.BIN_STATS_OUT)
    stats = parsedStats['637000110']

    # floating point values are compared approximately
    np.testing.assert_almost_equal(stats['GC'], 0.508, decimal=3, err_msg="Failed GC test")
    np.testing.assert_almost_equal(stats['GC std'], 0.0, err_msg="Failed GC std test")

    # the remaining values must match exactly: (key, expected value, failure message)
    exactChecks = (
        ('# contigs', 1, "Failed # contigs test"),
        ('# scaffolds', 1, "Failed # scaffolds test"),
        ('Longest contig', 4646332, "Failed longest contig test"),
        ('Longest scaffold', 4646332, "Failed longest scaffold test"),
        ('N50 (contigs)', 4646332, "Failed N50 (contigs) test"),
        ('N50 (scaffolds)', 4646332, "Failed N50 (scaffolds) test"),
        ('Genome size', 4646332, "Failed genome size test"),
    )
    for statName, expected, failureMsg in exactChecks:
        np.testing.assert_equal(stats[statName], expected, err_msg=failureMsg)
def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
        binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
        maxMergedCont):
    """Report pairs of bins that are candidates for merging.

    For every pair of bins the marker hits are pooled and the
    completeness and contamination of the pooled hits are calculated. A
    pair is written to ``merger.tsv`` in ``outDir`` when

    * merged completeness >= ``minMergedComp`` and merged contamination
      < ``maxMergedCont``, and
    * the change in completeness is >= ``minDeltaComp`` and the change in
      contamination is < ``maxDeltaCont``, both measured against the
      larger of the two individual values.

    Parameters
    ----------
    binFiles
        Not used by this method.
    outDir
        Existing directory holding the HMM results; also receives the
        output table.
    hmmTableFile
        Forwarded to ``ResultsParser.parseBinHits``.
    binIdToModels
        Passed to the ``ResultsParser`` constructor.
    binIdToBinMarkerSets
        Mapping from bin identifier to its marker sets; every bin must use
        the same most specific marker set.
    minDeltaComp, maxDeltaCont, minMergedComp, maxMergedCont
        Reporting thresholds described above.

    Returns
    -------
    Path of the written ``merger.tsv`` file.

    Raises
    ------
    SystemExit
        With a non-zero status when the bins do not share a marker set.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set; the first bin seen
    # provides the reference (no indexing into dict views, which is not
    # possible on Python 3)
    bHaveReference = False
    referenceMarkerGenes = None
    for binMarkerSets in binIdToBinMarkerSets.values():
        markerGenes = binMarkerSets.mostSpecificMarkerSet().getMarkerGenes()
        if not bHaveReference:
            referenceMarkerGenes = markerGenes
            bHaveReference = True
        elif referenceMarkerGenes != markerGenes:
            self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.')
            # an error must not be reported to the shell as success
            sys.exit(1)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile)

    columns = ('Bin Id 1', 'Bin Id 2',
               'Bin 1 completeness', 'Bin 1 contamination',
               'Bin 2 completeness', 'Bin 2 contamination',
               'Delta completeness', 'Delta contamination', 'Merger delta',
               'Merged completeness', 'Merged contamination')
    rowFormat = '%s\t%s' + '\t%.2f' * 9 + '\n'

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")

    # the context manager closes the file even if a bin fails to process
    with open(outputFile, 'w') as fout:
        fout.write('\t'.join(columns) + '\n')

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        for i, binIdI in enumerate(binIds):
            hitsI = binMarkerHits[binIdI]
            geneCountsI = hitsI.geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                                           hitsI.markerHits,
                                           True)
            # elements 6 and 7 are used as completeness and contamination
            completenessI, contaminationI = geneCountsI[6:8]

            for binIdJ in binIds[i + 1:]:
                hitsJ = binMarkerHits[binIdJ]
                geneCountsJ = hitsJ.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                               hitsJ.markerHits,
                                               True)
                completenessJ, contaminationJ = geneCountsJ[6:8]

                # merge together hits from both bins; every list is a copy
                # so the per-bin hit lists are never shared or modified
                mergedHits = {}
                for markerId, hits in hitsI.markerHits.items():
                    mergedHits[markerId] = list(hits)

                for markerId, hits in hitsJ.markerHits.items():
                    mergedHits.setdefault(markerId, []).extend(hits)

                # calculate completeness and contamination of the merger
                geneCountsMerged = hitsI.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                    mergedHits,
                                                    True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write(rowFormat % (binIdI, binIdJ,
                                            completenessI, contaminationI,
                                            completenessJ, contaminationJ,
                                            deltaComp, deltaCont, delta,
                                            completenessMerged, contaminationMerged))

    return outputFile
def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
        binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
        maxMergedCont):
    """Report pairs of bins that are candidates for merging.

    The marker hits of every pair of bins are pooled, and completeness and
    contamination are calculated for the pooled hits. A pair is written to
    ``merger.tsv`` in ``outDir`` when the merged completeness is
    >= ``minMergedComp``, the merged contamination is < ``maxMergedCont``,
    the gain in completeness is >= ``minDeltaComp`` and the gain in
    contamination is < ``maxDeltaCont`` (gains are relative to the larger
    of the two individual values).

    Parameters
    ----------
    binFiles
        Not used by this method.
    outDir
        Existing directory holding the HMM results; also receives the
        output table.
    hmmTableFile
        Forwarded to ``ResultsParser.parseBinHits``.
    binIdToModels
        Passed to the ``ResultsParser`` constructor.
    binIdToBinMarkerSets
        Mapping from bin identifier to its marker sets; every bin must use
        the same most specific marker set.
    minDeltaComp, maxDeltaCont, minMergedComp, maxMergedCont
        Reporting thresholds described above.

    Returns
    -------
    Path of the written ``merger.tsv`` file.

    Raises
    ------
    SystemExit
        With a non-zero status when the bins do not share a marker set.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set; iterating the values
    # avoids indexing dict.keys(), which only works on Python 2
    bHaveReference = False
    referenceMarkerGenes = None
    for binMarkerSets in binIdToBinMarkerSets.values():
        markerGenes = binMarkerSets.mostSpecificMarkerSet().getMarkerGenes()
        if not bHaveReference:
            referenceMarkerGenes = markerGenes
            bHaveReference = True
        elif referenceMarkerGenes != markerGenes:
            self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.')
            # exit with a failure status: this is an error path
            sys.exit(1)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile)

    header = '\t'.join(['Bin Id 1', 'Bin Id 2',
                        'Bin 1 completeness', 'Bin 1 contamination',
                        'Bin 2 completeness', 'Bin 2 contamination',
                        'Delta completeness', 'Delta contamination', 'Merger delta',
                        'Merged completeness', 'Merged contamination']) + '\n'
    rowFormat = '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")

    # "with" guarantees the file is closed if processing a pair raises
    with open(outputFile, 'w') as fout:
        fout.write(header)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits.keys())

        # range() and items() behave the same on Python 2 and Python 3,
        # unlike xrange() and iteritems()
        for i in range(len(binIds)):
            binIdI = binIds[i]
            resultsI = binMarkerHits[binIdI]
            geneCountsI = resultsI.geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(),
                                              resultsI.markerHits,
                                              True)
            # elements 6 and 7 are used as completeness and contamination
            completenessI, contaminationI = geneCountsI[6:8]

            for j in range(i + 1, len(binIds)):
                binIdJ = binIds[j]
                resultsJ = binMarkerHits[binIdJ]
                geneCountsJ = resultsJ.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                  resultsJ.markerHits,
                                                  True)
                completenessJ, contaminationJ = geneCountsJ[6:8]

                # merge together hits from both bins; lists are copied so
                # the merged table never aliases a bin's own hit lists
                mergedHits = {}
                for markerId, hits in resultsI.markerHits.items():
                    mergedHits[markerId] = list(hits)

                for markerId, hits in resultsJ.markerHits.items():
                    if markerId in mergedHits:
                        mergedHits[markerId].extend(hits)
                    else:
                        mergedHits[markerId] = list(hits)

                # calculate completeness and contamination of the merger
                geneCountsMerged = resultsI.geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                                                       mergedHits,
                                                       True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write(rowFormat % (binIdI, binIdJ,
                                            completenessI, contaminationI,
                                            completenessJ, contaminationJ,
                                            deltaComp, deltaCont, delta,
                                            completenessMerged, contaminationMerged))

    return outputFile