def merge(self, options):
    """Merge command: report bins with complementary sets of marker genes.

    Marker genes are located in every bin found in options.bin_dir and the
    per-bin results are handed to Merger; the path of the file it writes is
    logged. Returns None in all cases. The method returns early when the
    sequence check on the bin files fails or when options.marker_file is a
    tree marker set.
    """
    self.logger.info(
        '[CheckM - merge] Identifying bins with complementary sets of marker genes.')

    checkDirExists(options.bin_dir)
    binFiles = self.binFiles(options.bin_dir, options.extension)

    # bCalledGenes selects the protein check; otherwise the nucleotide check runs
    if options.bCalledGenes:
        bSeqsOk = checkProteinSeqs(binFiles)
    else:
        bSeqsOk = checkNuclotideSeqs(binFiles)
    if not bSeqsOk:
        return

    markerFileType = MarkerSetParser().markerFileType(options.marker_file)
    if markerFileType == BinMarkerSets.TREE_MARKER_SET:
        self.logger.error(
            'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n')
        return

    # setup directory structure
    outDir = options.output_dir
    makeSurePathExists(outDir)
    for subDirs in (('bins',), ('storage',), ('storage', 'hmms')):
        makeSurePathExists(os.path.join(outDir, *subDirs))

    binIds = [binIdFromFilename(binFile) for binFile in binFiles]

    # find marker genes in genome bins
    markerGeneFinder = MarkerGeneFinder(options.threads)
    binIdToModels = markerGeneFinder.find(binFiles,
                                          outDir,
                                          "merger.table.txt",
                                          "merger.hmmer3",
                                          options.marker_file,
                                          False,
                                          False,
                                          options.bCalledGenes)

    # get HMM file for each bin
    binIdToBinMarkerSets = MarkerSetParser().getMarkerSets(
        outDir, binIds, options.marker_file)

    # compare markers found in each bin
    outputFile = Merger().run(binFiles,
                              outDir,
                              "merger.table.txt",
                              binIdToModels,
                              binIdToBinMarkerSets,
                              options.delta_comp,
                              options.delta_cont,
                              options.merged_comp,
                              options.merged_cont)

    self.logger.info('Merger information written to: ' + outputFile)
    self.timeKeeper.printTimeStamp()
def nxPlot(self, options):
    """Nx-plot command: save one Nx-plot image per bin in options.output_dir."""
    self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    plotter = NxPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info('Plotting Nx-plot for %s (%d of %d)'
                         % (binId, fileNum, numBins))

        plotter.plot(binFile)

        # image is named after the bin id and the requested image type
        plotPrefix = os.path.join(options.output_dir, binId)
        outputFile = plotPrefix + '.nx_plot.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def lengthHistogram(self, options):
    """Sequence length histogram command: save one histogram image per bin."""
    self.logger.info(
        '[CheckM - len_hist] Creating sequence length histogram.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    histogram = LengthHistogram(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info(
            'Plotting sequence length histogram for %s (%d of %d)'
            % (binId, fileNum, numBins))

        histogram.plot(binFile)

        plotPrefix = os.path.join(options.output_dir, binId)
        outputFile = plotPrefix + '.len_hist.' + options.image_type
        histogram.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def distributionPlots(self, options):
    """Reference distribution plot command: save one GC/CD/TD plot per bin.

    Tetranucleotide signatures are read once from options.tetra_profile and
    reused for every bin.
    """
    self.logger.info(
        '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    tetraSigs = GenomicSignatures(K=4, threads=1).read(options.tetra_profile)

    plotter = DistributionPlots(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        # progress is reported with the bin file, not the bin id
        self.logger.info(
            'Plotting reference distribution plots for %s (%d of %d)'
            % (binFile, fileNum, numBins))

        binId = binIdFromFilename(binFile)
        plotter.plot(binFile, tetraSigs, options.distributions)

        plotPrefix = os.path.join(options.output_dir, binId)
        outputFile = plotPrefix + '.ref_dist_plots.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def getInsertionBranchId(self, outDir, binIds):
    """Map each bin to the UID of its closest labelled ancestor in the genome tree.

    The tree is read from <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>.
    For every bin, the ancestors of its leaf are visited from parent towards
    the root and the first non-empty node label is split on '|'; the first
    token is taken as the UID.

    Args:
        outDir: output directory; it and its 'storage/tree' subdirectory are
            checked with checkDirExists.
        binIds: iterable of bin identifiers to look up as taxon labels.

    Returns:
        dict mapping each bin id to a UID string. The value is 'NA' when the
        bin is not in the tree or when none of its ancestors has a label.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    # read genome tree
    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    binIdToUID = {}
    for binId in binIds:
        # Reset for every bin: previously 'uid' was left unbound (or kept the
        # previous bin's value) when no ancestor carried a label.
        uid = 'NA'

        node = tree.find_node_with_taxon_label(binId)
        ancestor = node.parent_node if node is not None else None

        # find first node decorated with a UID string between leaf and root
        while ancestor is not None:
            if ancestor.label:
                uid = ancestor.label.split('|')[0]
                break
            ancestor = ancestor.parent_node

        binIdToUID[binId] = uid

    return binIdToUID
def treeQA(self, options):
    """Tree QA command: summarise the phylogenetic markers found in each bin.

    Bin models are loaded from the 'storage' folder of options.tree_dir, the
    marker statistics are computed with ResultsParser, and TreeParser prints
    the summary (to options.file when it is not the empty string).
    """
    self.logger.info(
        '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # set HMM file for each bin
    hmmModelInfoFile = os.path.join(treeDir,
                                    'storage',
                                    DefaultValues.PHYLO_HMM_MODEL_INFO)
    binIdToModels = MarkerSetParser().loadBinModels(hmmModelInfoFile)

    # calculate marker gene statistics
    resultsParser = ResultsParser(binIdToModels)
    binStats = resultsParser.analyseResults(treeDir,
                                            DefaultValues.BIN_STATS_PHYLO_OUT,
                                            DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # determine taxonomy of each bin
    TreeParser().printSummary(options.out_format,
                              treeDir,
                              resultsParser,
                              options.bTabTable,
                              options.file,
                              binStats)

    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def codingDensityPlot(self, options):
    """Coding density plot command: save one coding density plot per bin."""
    self.logger.info(
        '[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    plotter = CodingDensityPlots(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        # progress is reported with the bin file, not the bin id
        self.logger.info(
            'Plotting coding density plots for %s (%d of %d)'
            % (binFile, fileNum, numBins))

        plotter.plot(binFile, options.distributions)

        plotPrefix = os.path.join(options.output_dir,
                                  binIdFromFilename(binFile))
        outputFile = plotPrefix + '.coding_density_plots.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def run(self, binFiles, resultsParser, outDir):
    """Place bins in the reference genome tree with pplacer, then extract the tree.

    A concatenated alignment is created under <outDir>/storage/tree. When that
    file is empty, the reference tree
    (DefaultValues.PPLACER_REF_PACKAGE/DefaultValues.GENOME_TREE_FINAL) is
    copied to the tree output path and no external tool is run. Otherwise
    'pplacer' is run with its standard output captured in
    DefaultValues.PPLACER_OUT, followed by 'guppy tog' to write the tree.

    Args:
        binFiles: bin files to place; also used for the progress message.
        resultsParser: passed through to the alignment builder.
        outDir: output directory; it and 'storage/tree' must already exist.

    Returns:
        None. A tool that cannot be started or exits with a non-zero status
        is reported through self.logger.error; no exception is raised for it.
    """
    import subprocess  # local import keeps this block self-contained

    def runTool(args, stdout=None):
        # Run without a shell so paths containing spaces or shell
        # metacharacters reach the tool verbatim.
        try:
            exitCode = subprocess.call(args, stdout=stdout)
        except OSError as e:
            self.logger.error('Unable to execute %s: %s' % (args[0], e))
            return
        if exitCode != 0:
            self.logger.error('%s exited with status %d.' % (args[0], exitCode))

    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    pplacerJsonOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_JSON_OUT)
    pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT)

    # create concatenated alignment file for each bin
    concatenatedAlignFile = self.__createConcatenatedAlignment(binFiles,
                                                               resultsParser,
                                                               alignOutputDir)

    # check if concatenated alignment file is empty
    # (this can occur when all genomes have no phylogenetically informative marker genes)
    if os.path.getsize(concatenatedAlignFile) == 0:
        self.logger.info(' No genomes were identified that could be placed in the reference genome tree.')
        shutil.copyfile(os.path.join(DefaultValues.PPLACER_REF_PACKAGE,
                                     DefaultValues.GENOME_TREE_FINAL),
                        treeFile)
        return

    # run pplacer to place bins in reference genome tree
    self.logger.info(' Placing %d bins into the genome tree with pplacer (be patient).' % len(binFiles))
    with open(pplacerOut, 'w') as pplacerLog:
        runTool(['pplacer',
                 '-j', '%d' % self.numThreads,
                 '-c', DefaultValues.PPLACER_REF_PACKAGE,
                 '-o', pplacerJsonOut,
                 concatenatedAlignFile],
                stdout=pplacerLog)

    # extract tree (still attempted after a pplacer failure, as before)
    runTool(['guppy', 'tog', '-o', treeFile, pplacerJsonOut])
def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
    """Print the taxonomy table for all bins found in outDir.

    A simple summary table is printed unless bLineageStatistics is set, in
    which case the sister-lineage taxonomy and lineage metadata are gathered
    as well and the full table is printed.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    checkDirExists(os.path.join(outDir, 'storage', 'tree'))

    # taxonomy for every bin in the output directory
    binIds = getBinIdsFromOutDir(outDir)
    binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

    if not bLineageStatistics:
        self.__printSimpleSummaryTable(binIdToTaxonomy,
                                       resultsParser,
                                       bTabTable,
                                       outFile)
        return

    # full table also needs the sister lineage and lineage statistics
    binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
    binIdToLineageStatistics = self.readLineageMetadata(outDir, binIds)

    self.__printFullTable(binIdToTaxonomy,
                          binIdToSisterTaxonomy,
                          binIdToLineageStatistics,
                          resultsParser,
                          binStats,
                          bTabTable,
                          outFile)
def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
    """Print the taxonomy table for all bins found in outDir.

    A simple summary table is printed unless bLineageStatistics is set, in
    which case the sister-lineage taxonomy, insertion branch UIDs and lineage
    metadata are gathered as well and the full table is printed.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    checkDirExists(os.path.join(outDir, 'storage', 'tree'))

    # taxonomy for every bin in the output directory
    binIds = getBinIdsFromOutDir(outDir)
    binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

    if not bLineageStatistics:
        self.__printSimpleSummaryTable(binIdToTaxonomy,
                                       resultsParser,
                                       bTabTable,
                                       outFile)
        return

    # full table also needs sister lineage, insertion branch and lineage statistics
    binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
    binIdToUID = self.getInsertionBranchId(outDir, binIds)
    binIdToLineageStatistics = self.readLineageMetadata(outDir, binIds)

    self.__printFullTable(binIdToUID,
                          binIdToTaxonomy,
                          binIdToSisterTaxonomy,
                          binIdToLineageStatistics,
                          resultsParser,
                          binStats,
                          bTabTable,
                          outFile)
def coveragePcaPlot(self, options):
    """PCA plot of coverage profiles: save one PCA plot per bin.

    The coverage file is parsed into one profile per sequence (one value per
    entry of that sequence's coverage dictionary). The PCA is computed once
    over all profiles and then plotted for each bin.

    Exits with status 1 when no coverage profiles are found or when the
    profiles have fewer than two dimensions.
    """
    self.logger.info(
        '[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

    checkDirExists(options.bin_dir)
    checkFileExists(options.coverage_file)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    coverageParser = Coverage(threads=1)
    coverageStats = coverageParser.parseCoverage(options.coverage_file)

    # one row per sequence; seqIds[i] names row i of the profile matrix
    seqIds = []
    coverageProfiles = []
    for seqDict in coverageStats.values():
        for seqId, bamDict in seqDict.items():
            seqIds.append(seqId)
            coverageProfiles.append(list(bamDict.values()))

    coverageProfiles = np.array(coverageProfiles)

    # An empty profile list yields a 1-D array, so guard on ndim before
    # touching shape[1] (this used to raise IndexError).
    if coverageProfiles.ndim < 2 or coverageProfiles.shape[1] < 2:
        if coverageProfiles.size == 0:
            self.logger.error(
                'No coverage profiles were found in the coverage file.')
        else:
            self.logger.error(
                'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.')
        sys.exit(1)

    self.logger.info('Computing PCA of coverage profiles.\n')
    pca = PCA()
    pc, variance = pca.pcaMatrix(coverageProfiles,
                                 fraction=1.0,
                                 bCenter=True,
                                 bScale=False)

    plots = PcaPlot(options)
    numBins = len(binFiles)
    for fileNum, binFile in enumerate(binFiles, 1):
        self.logger.info(
            'Plotting PCA of coverage profiles for %s (%d of %d)'
            % (binFile, fileNum, numBins))

        plots.plot(binFile, seqIds, pc, variance)

        binId = binIdFromFilename(binFile)
        outputFile = os.path.join(
            options.output_dir, binId) + '.cov_pca_plots.' + options.image_type
        plots.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def qa(self, options):
    """QA command: tabulate genome statistics for an analyzed set of bins.

    Results are read from options.analyze_dir, summarised with ResultsParser
    (printed to options.file when it is not the empty string) and cached back
    into the analyze directory.
    """
    self.logger.info('[CheckM - qa] Tabulating genome statistics.')

    analyzeDir = options.analyze_dir
    checkDirExists(analyzeDir)

    if options.exclude_markers:
        checkFileExists(options.exclude_markers)

    # calculate AAI between marks with multiple hits in a single bin
    aai = AminoAcidIdentity()
    aai.run(options.aai_strain, analyzeDir, options.alignment_file)

    # get HMM file for each bin
    markerSetParser = MarkerSetParser(options.threads)
    hmmModelInfoFile = os.path.join(analyzeDir,
                                    'storage',
                                    DefaultValues.CHECKM_HMM_MODEL_INFO)
    binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)

    binIds = getBinIdsFromOutDir(analyzeDir)
    binIdToBinMarkerSets = markerSetParser.getMarkerSets(analyzeDir,
                                                         binIds,
                                                         options.marker_file,
                                                         options.exclude_markers)

    # get results for each bin
    resultsParser = ResultsParser(binIdToModels)
    resultsParser.analyseResults(
        analyzeDir,
        DefaultValues.BIN_STATS_OUT,
        DefaultValues.HMMER_TABLE_OUT,
        bIgnoreThresholds=options.bIgnoreThresholds,
        evalueThreshold=options.e_value,
        lengthThreshold=options.length,
        bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
        bSkipAdjCorrection=options.bSkipAdjCorrection)

    resultsParser.printSummary(options.out_format,
                               aai,
                               binIdToBinMarkerSets,
                               options.bIndividualMarkers,
                               options.coverage_file,
                               options.bTabTable,
                               options.file,
                               anaFolder=analyzeDir)
    resultsParser.cacheResults(analyzeDir,
                               binIdToBinMarkerSets,
                               options.bIndividualMarkers)

    if options.file != '':
        self.logger.info('QA information written to: ' + options.file)

    self.timeKeeper.printTimeStamp()
def binUnion(self, options):
    """Bin union command: reduce several sets of bins to a single set.

    options.bin_or_checkm_qa_table alternates bin folders (even positions)
    and QA table files (odd positions). At least two bin folders are
    required, each paired with a QA table; otherwise the process exits with
    status 1.
    """
    self.logger.info(
        '[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.')

    outputDir = options.output_dir
    makeSurePathExists(outputDir)

    # split the alternating argument list, validating each entry as it is seen
    binDirs = []
    qaTables = []
    for position, argument in enumerate(options.bin_or_checkm_qa_table):
        if position % 2:
            checkFileExists(argument)
            qaTables.append(argument)
        else:
            checkDirExists(argument)
            binDirs.append(argument)

    numBinDirs = len(binDirs)
    if numBinDirs < 2:
        self.logger.error(
            "Need to specify at least two bin folders, found %i: " % numBinDirs)
        sys.exit(1)
    if numBinDirs != len(qaTables):
        self.logger.error(
            "Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: "
            % (numBinDirs, len(qaTables)))
        sys.exit(1)

    binFileSets = []
    for binDir in binDirs:
        self.logger.info(
            "Reading fasta files with extension %s from bin folder %s"
            % (options.extension, binDir))
        binFileSets.append(self.binFiles(binDir, options.extension))

    unionBinOutputFile = os.path.join(outputDir, 'union.txt')
    contigConflictsOutputFile = os.path.join(outputDir, 'contigConflicts.csv')

    BinUnion().report(binDirs,
                      binFileSets,
                      qaTables,
                      unionBinOutputFile,
                      contigConflictsOutputFile,
                      options.min_completeness,
                      options.max_contamination)
def parallelCoordPlot(self, options):
    """Parallel coordinate plot command: save one GC/coverage plot per bin.

    Coverage statistics are read from options.coverage_file and sequence
    statistics are calculated for all bins before any plot is drawn.
    """
    self.logger.info(
        '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)
    checkFileExists(options.coverage_file)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # read coverage stats file
    coverageStats = Coverage(threads=1).parseCoverage(options.coverage_file)

    # calculate sequence stats for all bins
    self.logger.info('Calculating sequence statistics for each bin.')
    binStatistics = BinStatistics()
    seqStats = {}
    for binFile in binFiles:
        binId = binIdFromFilename(binFile)
        seqStats[binId] = binStatistics.sequenceStats(options.results_dir,
                                                      binFile)

    # create plot for each bin
    plotter = ParallelCoordPlot(options)
    numBins = len(binFiles)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info(
            'Plotting parallel coordinates for %s (%d of %d)'
            % (binId, fileNum, numBins))

        plotter.plot(binId, seqStats, coverageStats)

        plotPrefix = os.path.join(options.output_dir, binId)
        outputFile = plotPrefix + '.paralel_coord_plot.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def unbinned(self, options):
    """Unbinned command: identify the sequences that are not in any bin.

    The work is delegated to Unbinned.run; the sequence and statistics output
    paths from options are logged afterwards.
    """
    self.logger.info('[CheckM - unbinned] Identify unbinned sequences.')

    checkDirExists(options.bin_dir)
    binFiles = self.binFiles(options.bin_dir, options.extension)

    seqOutput = options.output_seq_file
    statsOutput = options.output_stats_file

    Unbinned().run(binFiles,
                   options.seq_file,
                   seqOutput,
                   statsOutput,
                   options.min_seq_len)

    self.logger.info('Unbinned sequences written to: ' + seqOutput)
    self.logger.info('Unbinned sequences statistics written to: ' + statsOutput)

    self.timeKeeper.printTimeStamp()
def binCompare(self, options):
    """Bin compare command: compare the bins in two directories.

    Each directory is read with its own file extension; the report is
    produced by BinComparer.report and written to options.output_file.
    """
    self.logger.info('[CheckM - bin_compare] Comparing two sets of bins.')

    for binDir in (options.bin_dir1, options.bin_dir2):
        checkDirExists(binDir)

    firstBinFiles = self.binFiles(options.bin_dir1, options.extension1)
    secondBinFiles = self.binFiles(options.bin_dir2, options.extension2)

    BinComparer().report(firstBinFiles,
                         secondBinFiles,
                         options.seq_file,
                         options.output_file)

    self.logger.info('Detailed bin comparison written to: '
                     + options.output_file)

    self.timeKeeper.printTimeStamp()
def getBinTaxonomy(self, outDir, binIds):
    """Build a taxonomy string for each bin from the labels above it in the tree.

    The tree is read from <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>.
    The second '|'-separated token of every labelled ancestor is collected
    from the leaf towards the root and joined with ';', root-most first.

    Returns:
        dict of bin id -> taxonomy string; 'NA' for bins not in the tree, and
        the domain node's taxon followed by ' (root)' when no ancestor
        supplies a taxon.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    # read genome tree
    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    binIdToTaxonomy = {}
    for binId in binIds:
        leaf = tree.find_node_with_taxon_label(binId)
        if leaf is None:
            binIdToTaxonomy[binId] = 'NA'
            continue

        # walk towards the root, prepending each non-empty taxon string
        lineage = None
        ancestor = leaf.parent_node
        while ancestor is not None:
            if ancestor.label:
                taxon = ancestor.label.split('|')[1]
                if taxon != '':
                    lineage = (taxon + ';' + lineage) if lineage else taxon
            ancestor = ancestor.parent_node

        if not lineage:
            # nothing found on the path: fall back to the domain node
            domainNode = self.__findDomainNode(leaf)
            lineage = domainNode.label.split('|')[1] + ' (root)'

        binIdToTaxonomy[leaf.taxon.label] = lineage

    return binIdToTaxonomy
def run(self, binFiles, resultsParser, outDir, bReducedTree):
    """Place bins in the reference genome tree with pplacer, then extract the tree.

    A concatenated alignment is created under <outDir>/storage/tree. When that
    file is empty, the reference package's DefaultValues.GENOME_TREE is
    copied to the tree output path and no external tool is run. Otherwise
    'pplacer' is run with its standard output captured in
    DefaultValues.PPLACER_OUT, followed by 'guppy tog' to write the tree.

    Args:
        binFiles: bin files to place; also used for the progress message.
        resultsParser: passed through to the alignment builder.
        outDir: output directory; it and 'storage/tree' must already exist.
        bReducedTree: use DefaultValues.PPLACER_REF_PACKAGE_REDUCED instead
            of DefaultValues.PPLACER_REF_PACKAGE_FULL.

    Returns:
        None. A tool that cannot be started or exits with a non-zero status
        is reported through self.logger.error; no exception is raised for it.
    """
    import subprocess  # local import keeps this block self-contained

    def runTool(args, stdout=None):
        # Run without a shell so paths containing spaces or shell
        # metacharacters reach the tool verbatim.
        try:
            exitCode = subprocess.call(args, stdout=stdout)
        except OSError as e:
            self.logger.error('Unable to execute %s: %s' % (args[0], e))
            return
        if exitCode != 0:
            self.logger.error('%s exited with status %d.' % (args[0], exitCode))

    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    pplacerJsonOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_JSON_OUT)
    pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT)

    # create concatenated alignment file for each bin
    concatenatedAlignFile = self.__createConcatenatedAlignment(
        binFiles, resultsParser, alignOutputDir)

    if bReducedTree:
        pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_REDUCED
    else:
        pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_FULL

    # check if concatenated alignment file is empty
    # (this can occur when all genomes have no phylogenetically informative marker genes)
    if os.path.getsize(concatenatedAlignFile) == 0:
        self.logger.info(
            ' No genomes were identified that could be placed in the reference genome tree.')
        shutil.copyfile(
            os.path.join(pplacerRefPkg, DefaultValues.GENOME_TREE), treeFile)
        return

    # run pplacer to place bins in reference genome tree
    self.logger.info(
        ' Placing %d bins into the genome tree with pplacer (be patient).'
        % len(binFiles))
    with open(pplacerOut, 'w') as pplacerLog:
        runTool(['pplacer',
                 '-j', '%d' % self.numThreads,
                 '-c', pplacerRefPkg,
                 '-o', pplacerJsonOut,
                 concatenatedAlignFile],
                stdout=pplacerLog)

    # extract tree (still attempted after a pplacer failure, as before)
    runTool(['guppy', 'tog', '-o', treeFile, pplacerJsonOut])
def outliers(self, options):
    """Outlier command: identify outlier sequences in the bins.

    The directory part of options.output_file is created if needed, then the
    work is delegated to BinTools.identifyOutliers.
    """
    self.logger.info('[CheckM - outlier] Identifying outliers in bins.')

    checkDirExists(options.bin_dir)
    checkFileExists(options.tetra_profile)

    outputFile = options.output_file
    makeSurePathExists(os.path.dirname(outputFile))

    binFiles = self.binFiles(options.bin_dir, options.extension)

    BinTools().identifyOutliers(options.results_dir,
                                binFiles,
                                options.tetra_profile,
                                options.distributions,
                                options.report_type,
                                outputFile)

    self.logger.info('Outlier information written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def coverage(self, options):
    """Coverage command: calculate coverage of the sequences in each bin.

    The directory part of options.output_file is created if needed, then the
    work is delegated to Coverage.run using options.threads.
    """
    self.logger.info(
        '[CheckM - coverage] Calculating coverage of sequences.')

    checkDirExists(options.bin_dir)

    outputFile = options.output_file
    makeSurePathExists(os.path.dirname(outputFile))

    binFiles = self.binFiles(options.bin_dir, options.extension)

    coverageCalculator = Coverage(options.threads)
    coverageCalculator.run(binFiles,
                           options.bam_files,
                           outputFile,
                           options.all_reads,
                           options.min_align,
                           options.max_edit_dist,
                           options.min_qc)

    self.logger.info('Coverage information written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def getBinTaxonomy(self, outDir, binIds):
    """Build a taxonomy string for each bin from the labels above it in the tree.

    The tree is read (as rooted) from
    <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>. The second
    '|'-separated token of every labelled ancestor is collected from the leaf
    towards the root and joined with ';', root-most first.

    Returns:
        dict of bin id -> taxonomy string; 'NA' for bins not in the tree, and
        the domain node's taxon followed by ' (root)' when no ancestor
        supplies a taxon.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    # read genome tree
    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile,
                                       schema='newick',
                                       as_rooted=True,
                                       preserve_underscores=True)

    binIdToTaxonomy = {}
    for binId in binIds:
        binNode = tree.find_node_with_taxon_label(binId)
        if binNode is None:
            binIdToTaxonomy[binId] = 'NA'
            continue

        # gather taxon strings leaf-to-root, then emit them root-to-leaf
        taxa = []
        curNode = binNode.parent_node
        while curNode is not None:
            if curNode.label:
                taxon = curNode.label.split('|')[1]
                if taxon != '':
                    taxa.append(taxon)
            curNode = curNode.parent_node

        if taxa:
            taxaStr = ';'.join(reversed(taxa))
        else:
            # nothing found on the path: fall back to the domain node
            domainNode = self.__findDomainNode(binNode)
            taxaStr = domainNode.label.split('|')[1] + ' (root)'

        binIdToTaxonomy[binNode.taxon.label] = taxaStr

    return binIdToTaxonomy
def markerPlot(self, options):
    """Marker gene position plot command: save one position plot per bin.

    Bins missing from either the marker gene statistics or the extended bin
    statistics are skipped. A plot is saved only when the plotter reports
    that it drew something.
    """
    self.logger.info(
        '[CheckM - marker_plot] Creating marker gene position plot.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    # generate plot for each bin
    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    statsParser = ResultsParser(None)
    markerGeneStats = statsParser.parseMarkerGeneStats(options.results_dir)
    binStats = statsParser.parseBinStatsExt(options.results_dir)

    plotter = MarkerGenePosPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        binId = binIdFromFilename(binFile)
        self.logger.info(
            'Plotting marker gene position plot for %s (%d of %d)'
            % (binId, fileNum, numBins))

        if not (binId in markerGeneStats and binId in binStats):
            continue  # bin has no marker genes

        if plotter.plot(binFile, markerGeneStats[binId], binStats[binId]):
            plotPrefix = os.path.join(options.output_dir, binId)
            outputFile = plotPrefix + '.marker_pos_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)
        else:
            self.logger.info('No marker genes found in bin.')

    self.timeKeeper.printTimeStamp()
def binQAPlot(self, options):
    """Bin QA plot command: save a single bar plot of bin quality.

    Unless options.bIgnoreHetero is set, amino acid identity is computed
    first and its aaiHetero attribute is passed to the plot. The image is
    saved only when the plotter returns a true value.
    """
    self.logger.info(
        '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)

    # read sequence stats file
    binStatsExt = ResultsParser(None).parseBinStatsExt(options.results_dir)

    # create plot for each bin
    plotter = BinQAPlot(options)

    if options.bIgnoreHetero:
        aaiHetero = None
    else:
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, options.results_dir, None)
        aaiHetero = aai.aaiHetero

    bMakePlot = plotter.plot(binFiles,
                             binStatsExt,
                             options.bIgnoreHetero,
                             aaiHetero)

    if bMakePlot:
        outputFile = os.path.join(options.output_dir,
                                  'bin_qa_plot.' + options.image_type)
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def tetraPcaPlot(self, options):
    """PCA plot of tetranucleotide signatures: save one PCA plot per bin.

    The PCA is computed once from options.tetra_profile and reused for every
    bin.
    """
    self.logger.info(
        '[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    self.logger.info('Computing PCA of tetranuclotide signatures.\n')
    seqIds, pc, variance = PCA().pcaFile(options.tetra_profile,
                                         fraction=1.0,
                                         bCenter=True,
                                         bScale=False)

    plotter = PcaPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        # progress is reported with the bin file, not the bin id
        self.logger.info(
            'Plotting PCA of tetranuclotide signatures for %s (%d of %d)'
            % (binFile, fileNum, numBins))

        plotter.plot(binFile, seqIds, pc, variance)

        plotPrefix = os.path.join(options.output_dir,
                                  binIdFromFilename(binFile))
        outputFile = plotPrefix + '.tetra_pca_plots.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def lineageSet(self, options, db=None):
    """Lineage set command: infer lineage-specific marker sets for each bin.

    Note that this method overwrites options.num_genomes_markers,
    options.bootstrap and options.bRequireTaxonomy with fixed values before
    calling TreeParser.getBinMarkerSets. The db parameter is not used here.
    """
    self.logger.info(
        '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

    treeDir = options.tree_dir
    checkDirExists(treeDir)

    # set HMM file for each bin
    hmmModelInfoFile = os.path.join(treeDir,
                                    'storage',
                                    DefaultValues.PHYLO_HMM_MODEL_INFO)
    binIdToModels = MarkerSetParser().loadBinModels(hmmModelInfoFile)

    # calculate marker gene statistics
    resultsParser = ResultsParser(binIdToModels)
    resultsParser.analyseResults(treeDir,
                                 DefaultValues.BIN_STATS_PHYLO_OUT,
                                 DefaultValues.HMMER_TABLE_PHYLO_OUT)

    # These options are incompatible with how the lineage-specific marker set
    # is selected, so the default values are currently hard-coded
    options.num_genomes_markers = 2
    options.bootstrap = 0
    options.bRequireTaxonomy = False

    TreeParser().getBinMarkerSets(treeDir,
                                  options.marker_file,
                                  options.num_genomes_markers,
                                  options.bootstrap,
                                  options.bNoLineageSpecificRefinement,
                                  options.bForceDomain,
                                  options.bRequireTaxonomy,
                                  resultsParser,
                                  options.unique,
                                  options.multi)

    self.logger.info('Marker set written to: ' + options.marker_file)

    self.timeKeeper.printTimeStamp()
def gcBiasPlot(self, options):
    """GC bias plot command: save one coverage-versus-GC plot per bin.

    The coverage profile is computed once over all bins with CoverageWindows
    and reused for every plot.
    """
    self.logger.info(
        '[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.')

    checkDirExists(options.bin_dir)
    makeSurePathExists(options.output_dir)

    binFiles = self.binFiles(options.bin_dir, options.extension)
    numBins = len(binFiles)

    windowCoverage = CoverageWindows(options.threads)
    coverageProfile = windowCoverage.run(binFiles,
                                         options.bam_file,
                                         options.all_reads,
                                         options.min_align,
                                         options.max_edit_dist,
                                         options.window_size)

    plotter = GcBiasPlot(options)
    for fileNum, binFile in enumerate(binFiles, 1):
        # progress is reported with the bin file, not the bin id
        self.logger.info('Plotting GC plots for %s (%d of %d)'
                         % (binFile, fileNum, numBins))

        plotter.plot(binFile, coverageProfile)

        plotPrefix = os.path.join(options.output_dir,
                                  binIdFromFilename(binFile))
        outputFile = plotPrefix + '.gc_bias_plot.' + options.image_type
        plotter.savePlot(outputFile, dpi=options.dpi)
        self.logger.info('Plot written to: ' + outputFile)

    self.timeKeeper.printTimeStamp()
def getBinSisterTaxonomy(self, outDir, binIds):
    """Infer the taxonomy of the sister lineage of each bin in the genome tree.

    The tree is read from <outDir>/storage/tree/<DefaultValues.PPLACER_TREE_OUT>
    and the per-leaf taxonomy strings from the tab-separated file
    DefaultValues.GENOME_TREE_DIR/DefaultValues.GENOME_TREE_TAXONOMY (first
    column: leaf id, second column: taxonomy).

    For every bin, labels are collected from all nodes below its sister
    nodes. Internal node labels take precedence; leaf taxonomy is used only
    when no internal label exists. Walking taxonomicPrefixes in order, a rank
    is appended (followed by ';') when exactly one label contains the prefix,
    and the walk stops at the first rank with more than one candidate.

    Args:
        outDir: output directory; it and 'storage/tree' must already exist.
        binIds: iterable of bin identifiers to look up as taxon labels.

    Returns:
        dict of bin id -> sister taxonomy string, or 'unresolved' when no
        rank could be determined (including bins absent from the tree).
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    # read genome tree
    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile,
                                       schema='newick',
                                       rooting="force-rooted",
                                       preserve_underscores=True)

    # read taxonomy string for each IMG genome; 'with' closes the file,
    # which the previous bare open() never did
    leafIdToTaxonomy = {}
    taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR,
                                DefaultValues.GENOME_TREE_TAXONOMY)
    with open(taxonomyFile) as fin:
        for line in fin:
            lineSplit = line.split('\t')
            leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip()

    # find LCA of all labeled node in sister lineage
    binIdToSisterTaxonomy = {}
    for binId in binIds:
        node = tree.find_node_with_taxon_label(binId)

        taxaStr = ''
        if node is not None:
            # get taxonomic labels of all nodes in sister lineages
            internalTaxonomyLabels = set()
            leafTaxonomyLabels = set()
            for sisterNode in node.sister_nodes():
                for curNode in sisterNode.postorder_iter():
                    if curNode.is_leaf():
                        if curNode.taxon.label:
                            taxonomy = leafIdToTaxonomy.get(curNode.taxon.label)
                            # inserted bins will not have an assigned taxonomy
                            if taxonomy is not None:
                                leafTaxonomyLabels.update(
                                    taxa.strip() for taxa in taxonomy.split(';'))
                    elif curNode.label:
                        tokens = curNode.label.split('|')
                        if tokens[1] != '':
                            internalTaxonomyLabels.update(tokens[1].split(';'))

            # find LCA of taxonomic labels in rank order;
            # only consider leaf node labels if there were no internal labels
            labels = internalTaxonomyLabels or leafTaxonomyLabels

            for prefix in taxonomicPrefixes:
                taxa = [taxon for taxon in labels if prefix in taxon]
                if len(taxa) == 1:
                    # unambiguous label at this rank
                    taxaStr += taxa[0] + ';'
                elif len(taxa) > 1:
                    # unable to resolve taxonomy at this rank
                    break

        if not taxaStr:
            taxaStr = 'unresolved'
        binIdToSisterTaxonomy[binId] = taxaStr

    return binIdToSisterTaxonomy
def run(self, binFiles, outDir, hmmTableFile, binIdToModels, binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp, maxMergedCont):
    """Write a table of bin pairs whose merger passes the given thresholds.

    Marker hits are parsed for every bin, then each unordered pair of bins
    (in sorted bin id order) is evaluated by pooling their hits and
    recomputing completeness and contamination.

    Args:
        binFiles: not used by this method.
        outDir: existing output directory; 'merger.tsv' is written into it.
        hmmTableFile: HMM table name passed to ResultsParser.parseBinHits.
        binIdToModels: passed to the ResultsParser constructor.
        binIdToBinMarkerSets: dict of bin id -> object that provides
            mostSpecificMarkerSet().
        minDeltaComp: minimum gain in completeness over the more complete
            of the two bins.
        maxDeltaCont: exclusive upper bound on the gain in contamination
            over the more contaminated of the two bins.
        minMergedComp: minimum completeness of the merged bin.
        maxMergedCont: exclusive upper bound on contamination of the
            merged bin.

    Returns:
        Path of the written 'merger.tsv' file.

    Exits the process with status 1 when the bins do not all use the same
    marker set.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set
    # (iteration instead of keys()[0] so this also runs on Python 3)
    markerSetsIter = iter(binIdToBinMarkerSets.values())
    firstBinMarkerSets = next(markerSetsIter, None)
    if firstBinMarkerSets is not None:
        markerGenesI = firstBinMarkerSets.mostSpecificMarkerSet().getMarkerGenes()
        for binMarkerSets in markerSetsIter:
            if markerGenesI != binMarkerSets.mostSpecificMarkerSet().getMarkerGenes():
                self.logger.error(
                    ' [Error] All bins must use the same marker set to assess potential mergers.')
                sys.exit(1)  # non-zero: this is an error exit

    # parse HMM information and get HMM hits to each bin
    resultsParser = ResultsParser(binIdToModels)
    resultsParser.parseBinHits(outDir, hmmTableFile)

    binMarkerHits = resultsParser.results
    binIds = sorted(binMarkerHits.keys())

    # (completeness, contamination) of each bin on its own, computed once per
    # bin rather than once per pair
    binQuality = {}
    for binId in binIds:
        geneCounts = binMarkerHits[binId].geneCounts(
            binIdToBinMarkerSets[binId].mostSpecificMarkerSet(),
            binMarkerHits[binId].markerHits,
            True)
        binQuality[binId] = geneCounts[6:8]

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id 1\tBin Id 2')
        fout.write('\tBin 1 completeness\tBin 1 contamination')
        fout.write('\tBin 2 completeness\tBin 2 contamination')
        fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
        fout.write('\tMerged completeness\tMerged contamination\n')

        for i, binIdI in enumerate(binIds):
            completenessI, contaminationI = binQuality[binIdI]

            for binIdJ in binIds[i + 1:]:
                completenessJ, contaminationJ = binQuality[binIdJ]

                # merge together hits from both bins; lists are copied so the
                # parsed results are never modified
                mergedHits = {}
                for markerId, hits in binMarkerHits[binIdI].markerHits.items():
                    mergedHits[markerId] = list(hits)
                for markerId, hits in binMarkerHits[binIdJ].markerHits.items():
                    mergedHits.setdefault(markerId, []).extend(hits)

                geneCountsMerged = binMarkerHits[binIdI].geneCounts(
                    binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                    mergedHits,
                    True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp
                        and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write(
                        '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
                        % (binIdI, binIdJ, completenessI, contaminationI,
                           completenessJ, contaminationJ, deltaComp,
                           deltaCont, delta, completenessMerged,
                           contaminationMerged))

    return outputFile
def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
        binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
        maxMergedCont):
    """Identify pairs of bins that are candidates for merging.

    The marker gene hits of each unordered pair of bins are pooled, and
    completeness and contamination are recomputed on the pooled hits. A
    pair is reported when the merged values meet minMergedComp and
    maxMergedCont, and the change against the larger of the two original
    values meets minDeltaComp and maxDeltaCont.

    Parameters:
        binFiles: accepted for interface compatibility; not used here.
        outDir: existing output directory; 'merger.tsv' is written into it.
        hmmTableFile: name of the HMM table handed to ResultsParser.parseBinHits.
        binIdToModels: passed unchanged to the ResultsParser constructor.
        binIdToBinMarkerSets: maps bin id -> object exposing mostSpecificMarkerSet().
        minDeltaComp: minimum gain in completeness (inclusive).
        maxDeltaCont: upper bound on the gain in contamination (exclusive).
        minMergedComp: minimum completeness of the merged bin (inclusive).
        maxMergedCont: upper bound on contamination of the merged bin (exclusive).

    Returns:
        Path of the tab-separated report that was written.

    Exits the process with status 1 if the bins do not all share the same
    most specific marker set.
    """
    checkDirExists(outDir)

    self.logger.info(' Comparing marker sets between all pairs of bins.')

    # ensure all bins are using the same marker set; values() is iterated
    # directly because indexing keys() only works on Python 2 and fails
    # outright when no bins are present
    remainingMarkerSets = iter(binIdToBinMarkerSets.values())
    referenceMarkerSets = next(remainingMarkerSets, None)
    if referenceMarkerSets is not None:
        markerGenesI = referenceMarkerSets.mostSpecificMarkerSet().getMarkerGenes()
        for otherMarkerSets in remainingMarkerSets:
            if markerGenesI != otherMarkerSets.mostSpecificMarkerSet().getMarkerGenes():
                self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.')
                # signal failure to the calling shell with a non-zero status
                sys.exit(1)

    # parse HMM information
    resultsParser = ResultsParser(binIdToModels)

    # get HMM hits to each bin
    resultsParser.parseBinHits(outDir, hmmTableFile)

    binMarkerHits = resultsParser.results
    binIds = sorted(binMarkerHits.keys())

    # Completeness/contamination of an individual bin does not depend on
    # its partner, so compute it once per bin instead of once per pair
    # (assumes geneCounts has no side effects).
    perBinQuality = {}
    for binId in binIds:
        counts = binMarkerHits[binId].geneCounts(
            binIdToBinMarkerSets[binId].mostSpecificMarkerSet(),
            binMarkerHits[binId].markerHits,
            True)
        comp, cont = counts[6:8]
        perBinQuality[binId] = (comp, cont)

    # determine union and intersection of marker sets for each pair of bins
    outputFile = os.path.join(outDir, "merger.tsv")
    with open(outputFile, 'w') as fout:
        fout.write('Bin Id 1\tBin Id 2')
        fout.write('\tBin 1 completeness\tBin 1 contamination')
        fout.write('\tBin 2 completeness\tBin 2 contamination')
        fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
        fout.write('\tMerged completeness\tMerged contamination\n')

        for i, binIdI in enumerate(binIds):
            completenessI, contaminationI = perBinQuality[binIdI]

            for binIdJ in binIds[i + 1:]:
                completenessJ, contaminationJ = perBinQuality[binIdJ]

                # merge together hits from both bins and calculate completeness
                # and contamination; every list is copied so the merged view
                # never aliases a bin's own hit list
                mergedHits = {}
                for markerId, hits in binMarkerHits[binIdI].markerHits.items():
                    mergedHits[markerId] = list(hits)

                for markerId, hits in binMarkerHits[binIdJ].markerHits.items():
                    mergedHits.setdefault(markerId, []).extend(hits)

                geneCountsMerged = binMarkerHits[binIdI].geneCounts(
                    binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                    mergedHits,
                    True)
                completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                    continue

                # calculate merged statistics
                deltaComp = completenessMerged - max(completenessI, completenessJ)
                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                delta = deltaComp - deltaCont

                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                    fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
                               (binIdI, binIdJ,
                                completenessI, contaminationI,
                                completenessJ, contaminationJ,
                                deltaComp, deltaCont, delta,
                                completenessMerged, contaminationMerged))

    return outputFile
def getBinSisterTaxonomy(self, outDir, binIds):
    """Derive a taxonomy string for each bin from its sister lineages in the genome tree.

    For every bin found in the tree, the taxonomic labels of the nodes in
    its sister lineages are collected. Labels on internal nodes take
    precedence; labels of leaf genomes are only used when no internal node
    contributed a label. Ranks are then walked in the order given by
    taxonomicPrefixes, appending a rank while exactly one label matches it
    and stopping at the first rank with more than one match.

    Parameters:
        outDir: output directory containing 'storage/tree' with the tree file.
        binIds: iterable of bin identifiers to look up as taxon labels.

    Returns:
        Dictionary mapping each bin id to its ';'-terminated taxonomy
        string, or to 'unresolved' when the bin is not in the tree or no
        rank could be assigned.
    """
    # make sure output and tree directories exist
    checkDirExists(outDir)
    alignOutputDir = os.path.join(outDir, 'storage', 'tree')
    checkDirExists(alignOutputDir)

    # read genome tree
    treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
    tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

    # read taxonomy string for each IMG genome
    leafIdToTaxonomy = {}
    taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR, 'genome_tree.taxonomy.tsv')
    with open(taxonomyFile) as fin:  # close the handle deterministically
        for line in fin:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            lineSplit = line.split('\t')
            leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip()

    # find LCA of all labeled node in sister lineage
    binIdToSisterTaxonomy = {}
    for binId in binIds:
        node = tree.find_node_with_taxon_label(binId)

        taxaStr = ''
        if node is not None:
            # get taxonomic labels of all internal nodes in sister lineages
            internalTaxonomyLabels = set()
            leafTaxonomyLabels = set()

            for sn in node.sister_nodes():
                for curNode in sn.postorder_iter():
                    if curNode.is_leaf():
                        if curNode.taxon.label:
                            taxonomy = leafIdToTaxonomy.get(curNode.taxon.label, None)
                            if taxonomy is not None:  # inserted bins will not have an assigned taxonomy
                                leafTaxonomyLabels.update(taxa.strip() for taxa in taxonomy.split(';'))
                    elif curNode.label:
                        # the taxonomy is the second '|'-separated field;
                        # labels without that field carry no taxonomy
                        tokens = curNode.label.split('|')
                        if len(tokens) > 1 and tokens[1] != '':
                            internalTaxonomyLabels.update(tokens[1].split(';'))

            # find LCA of taxonomic labels in rank order;
            # only consider leaf node labels if there were no internal labels
            labels = internalTaxonomyLabels or leafTaxonomyLabels

            for prefix in taxonomicPrefixes:
                taxa = [taxon for taxon in labels if prefix in taxon]

                if len(taxa) == 1:
                    # unambiguous label at this rank
                    taxaStr += taxa[0] + ';'
                elif len(taxa) > 1:
                    # unable to resolve taxonomy at this rank
                    break

        if not taxaStr:
            taxaStr = 'unresolved'

        binIdToSisterTaxonomy[binId] = taxaStr

    return binIdToSisterTaxonomy