示例#1
0
    def merge(self, options):
        """Merge command.

        Searches the bins in options.bin_dir for marker gene content and hands
        the results to Merger, which writes a report whose path is logged.
        Returns early (None) if the input sequences fail validation or if the
        marker file is a tree-based marker set.
        """
        self.logger.info(
            '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
        )

        checkDirExists(options.bin_dir)
        binFiles = self.binFiles(options.bin_dir, options.extension)

        # validate input: proteins when genes are already called, nucleotides otherwise
        if options.bCalledGenes:
            bValidSeqs = checkProteinSeqs(binFiles)
        else:
            bValidSeqs = checkNuclotideSeqs(binFiles)
        if not bValidSeqs:
            return

        # tree-based marker sets are not accepted by this command
        markerFileType = MarkerSetParser().markerFileType(options.marker_file)
        if markerFileType == BinMarkerSets.TREE_MARKER_SET:
            self.logger.error(
                'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
            )
            return

        # create the output directory layout
        makeSurePathExists(options.output_dir)
        for subDirParts in (('bins',), ('storage',), ('storage', 'hmms')):
            makeSurePathExists(os.path.join(options.output_dir, *subDirParts))

        binIds = [binIdFromFilename(binFile) for binFile in binFiles]

        # search every bin for marker genes
        geneFinder = MarkerGeneFinder(options.threads)
        binIdToModels = geneFinder.find(
            binFiles, options.output_dir, "merger.table.txt", "merger.hmmer3",
            options.marker_file, False, False, options.bCalledGenes)

        # marker sets associated with each bin
        binIdToBinMarkerSets = MarkerSetParser().getMarkerSets(
            options.output_dir, binIds, options.marker_file)

        # compare the markers found in each bin
        outputFile = Merger().run(
            binFiles, options.output_dir, "merger.table.txt", binIdToModels,
            binIdToBinMarkerSets, options.delta_comp, options.delta_cont,
            options.merged_comp, options.merged_cont)

        self.logger.info('Merger information written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#2
0
    def nxPlot(self, options):
        """Nx-plot command: save one Nx-plot image per bin file."""
        self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        plotter = NxPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progress = (binId, fileNum, len(binFiles))
            self.logger.info('Plotting Nx-plot for %s (%d of %d)' % progress)
            plotter.plot(binFile)

            # image is named <binId>.nx_plot.<image_type> in the output directory
            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.nx_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#3
0
    def lengthHistogram(self, options):
        """Sequence length histogram command: one histogram image per bin file."""
        self.logger.info(
            '[CheckM - len_hist] Creating sequence length histogram.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        histogram = LengthHistogram(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progress = (binId, fileNum, len(binFiles))
            self.logger.info(
                'Plotting sequence length histogram for %s (%d of %d)' % progress)
            histogram.plot(binFile)

            # image is named <binId>.len_hist.<image_type> in the output directory
            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.len_hist.' + options.image_type
            histogram.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#4
0
    def distributionPlots(self, options):
        """Reference distribution plot command: GC, CD and TD plots per bin file."""
        self.logger.info(
            '[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # tetranucleotide signatures are read once and shared by every plot
        tetraSigs = GenomicSignatures(K=4, threads=1).read(options.tetra_profile)

        plotter = DistributionPlots(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            # progress message reports the bin file itself, not the bin id
            progress = (binFile, fileNum, len(binFiles))
            self.logger.info(
                'Plotting reference distribution plots for %s (%d of %d)' % progress)

            binId = binIdFromFilename(binFile)
            plotter.plot(binFile, tetraSigs, options.distributions)

            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.ref_dist_plots.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#5
0
    def getInsertionBranchId(self, outDir, binIds):
        """Map each bin to the UID of the first labelled node above it in the tree.

        Reads the tree stored under <outDir>/storage/tree and, for each bin,
        walks from its leaf towards the root until a node with a label is
        found; the text before the first '|' of that label is used as the UID.

        Args:
            outDir: output directory; must contain the storage/tree directory.
            binIds: iterable of bin identifiers to look up.

        Returns:
            dict mapping bin id to UID string. Bins that are not in the tree,
            or that have no labelled ancestor, map to 'NA'.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        binIdToUID = {}
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)

        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)
            if node is None:
                binIdToUID[binId] = 'NA'
                continue

            # Reset for every bin: without this, a bin with no labelled ancestor
            # would raise UnboundLocalError (first bin) or silently inherit the
            # UID of the previously processed bin.
            uid = 'NA'

            # find first node decorated with a UID string between leaf and root
            parentNode = node.parent_node
            while parentNode is not None:
                if parentNode.label:
                    uid = parentNode.label.split('|')[0]
                    break

                parentNode = parentNode.parent_node

            binIdToUID[binId] = uid

        return binIdToUID
示例#6
0
    def treeQA(self, options):
        """Tree QA command: summarise the phylogenetic markers found in each bin."""
        self.logger.info(
            '[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.'
        )

        checkDirExists(options.tree_dir)

        # load the HMM models recorded for each bin
        modelParser = MarkerSetParser()
        modelInfoPath = os.path.join(options.tree_dir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        binIdToModels = modelParser.loadBinModels(modelInfoPath)

        # marker gene statistics for each bin
        resultsParser = ResultsParser(binIdToModels)
        binStats = resultsParser.analyseResults(
            options.tree_dir,
            DefaultValues.BIN_STATS_PHYLO_OUT,
            DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # print the summary in the requested format
        TreeParser().printSummary(options.out_format, options.tree_dir,
                                  resultsParser, options.bTabTable,
                                  options.file, binStats)

        # an empty file name means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
示例#7
0
    def codingDensityPlot(self, options):
        """Coding density plot command: histogram and delta-CD plot per bin file."""
        self.logger.info(
            '[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        plotter = CodingDensityPlots(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            # progress message reports the bin file itself, not the bin id
            progress = (binFile, fileNum, len(binFiles))
            self.logger.info(
                'Plotting coding density plots for %s (%d of %d)' % progress)

            plotter.plot(binFile, options.distributions)

            binId = binIdFromFilename(binFile)
            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.coding_density_plots.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#8
0
    def run(self, binFiles, resultsParser, outDir):
        """Place bins into the reference genome tree with pplacer and extract the tree.

        Builds a concatenated alignment for the bins, runs pplacer on it and
        then uses 'guppy tog' to write the resulting tree. When the alignment
        is empty, the reference genome tree is copied to the tree output file
        instead. External tools are invoked with argument lists (no shell), so
        paths containing spaces or shell metacharacters are handled correctly.

        Args:
            binFiles: bin files to place (only its length is used directly here).
            resultsParser: passed through to the alignment builder.
            outDir: output directory; must contain the storage/tree directory.

        Returns:
            None. Failures of pplacer or guppy are logged as errors.
        """
        # local import: the file's import block is outside the visible source
        import subprocess

        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        pplacerJsonOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_JSON_OUT)
        pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT)

        # create concatenated alignment file for each bin
        concatenatedAlignFile = self.__createConcatenatedAlignment(binFiles, resultsParser, alignOutputDir)

        # check if concatenated alignment file is empty
        # (this can occur when all genomes have no phylogenetically informative marker genes)
        if os.stat(concatenatedAlignFile)[stat.ST_SIZE] == 0:
            self.logger.info('  No genomes were identified that could be placed in the reference genome tree.')
            shutil.copyfile(os.path.join(DefaultValues.PPLACER_REF_PACKAGE, DefaultValues.GENOME_TREE_FINAL), treeFile)
            return

        # run pplacer to place bins in reference genome tree
        self.logger.info('  Placing %d bins into the genome tree with pplacer (be patient).' % len(binFiles))
        pplacerCmd = ['pplacer',
                      '-j', '%d' % self.numThreads,
                      '-c', DefaultValues.PPLACER_REF_PACKAGE,
                      '-o', pplacerJsonOut,
                      concatenatedAlignFile]
        try:
            # pplacer's standard output is captured in its own log file
            with open(pplacerOut, 'w') as pplacerLog:
                exitCode = subprocess.call(pplacerCmd, stdout=pplacerLog)
        except OSError as e:
            self.logger.error('  Unable to run pplacer: %s' % e)
            return

        if exitCode != 0:
            # do not extract a tree from a missing or stale placement file
            self.logger.error('  pplacer exited with status %d.' % exitCode)
            return

        # extract tree
        guppyCmd = ['guppy', 'tog', '-o', treeFile, pplacerJsonOut]
        try:
            exitCode = subprocess.call(guppyCmd)
        except OSError as e:
            self.logger.error('  Unable to run guppy: %s' % e)
            return

        if exitCode != 0:
            self.logger.error('  guppy exited with status %d.' % exitCode)
示例#9
0
    def getInsertionBranchId(self, outDir, binIds):
        """Map each bin to the UID of the first labelled node above it in the tree.

        Reads the tree stored under <outDir>/storage/tree and, for each bin,
        walks from its leaf towards the root until a node with a label is
        found; the text before the first '|' of that label is used as the UID.

        Args:
            outDir: output directory; must contain the storage/tree directory.
            binIds: iterable of bin identifiers to look up.

        Returns:
            dict mapping bin id to UID string. Bins that are not in the tree,
            or that have no labelled ancestor, map to 'NA'.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        binIdToUID = {}
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)
            if node is None:
                binIdToUID[binId] = 'NA'
                continue

            # Reset for every bin: without this, a bin with no labelled ancestor
            # would raise UnboundLocalError (first bin) or silently inherit the
            # UID of the previously processed bin.
            uid = 'NA'

            # find first node decorated with a UID string between leaf and root
            parentNode = node.parent_node
            while parentNode is not None:
                if parentNode.label:
                    uid = parentNode.label.split('|')[0]
                    break

                parentNode = parentNode.parent_node

            binIdToUID[binId] = uid

        return binIdToUID
示例#10
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
        """Print the taxonomy of every bin found in outDir.

        A full table (taxonomy, sister lineage taxonomy and lineage statistics)
        is printed when bLineageStatistics is set; otherwise a simple summary
        table is printed.
        """
        # both the output directory and its tree directory must already exist
        checkDirExists(outDir)
        checkDirExists(os.path.join(outDir, 'storage', 'tree'))

        binIds = getBinIdsFromOutDir(outDir)
        binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

        if bLineageStatistics:
            # the full table additionally needs sister lineages and lineage metadata
            sisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
            lineageStatistics = self.readLineageMetadata(outDir, binIds)
            self.__printFullTable(binIdToTaxonomy, sisterTaxonomy, lineageStatistics, resultsParser, binStats, bTabTable, outFile)
            return

        self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser, bTabTable, outFile)
示例#11
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile,
                          binStats, bLineageStatistics):
        """Print the taxonomy of every bin found in outDir.

        A full table (insertion branch UID, taxonomy, sister lineage taxonomy
        and lineage statistics) is printed when bLineageStatistics is set;
        otherwise a simple summary table is printed.
        """
        # both the output directory and its tree directory must already exist
        checkDirExists(outDir)
        checkDirExists(os.path.join(outDir, 'storage', 'tree'))

        binIds = getBinIdsFromOutDir(outDir)
        binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

        if bLineageStatistics:
            # the full table additionally needs sister lineages, branch UIDs
            # and lineage metadata
            sisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
            branchUIDs = self.getInsertionBranchId(outDir, binIds)
            lineageStatistics = self.readLineageMetadata(outDir, binIds)
            self.__printFullTable(branchUIDs, binIdToTaxonomy, sisterTaxonomy,
                                  lineageStatistics, resultsParser, binStats,
                                  bTabTable, outFile)
            return

        self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser,
                                       bTabTable, outFile)
示例#12
0
    def coveragePcaPlot(self, options):
        """PCA plot of coverage profiles.

        Parses the coverage file, computes a PCA over the per-sequence coverage
        profiles and saves one plot per bin file as
        <binId>.cov_pca_plots.<image_type> in options.output_dir.

        Exits with status 1 when the coverage file does not yield a
        2 dimensional profile matrix with at least two columns.
        """
        self.logger.info(
            '[CheckM - cov_pca] Creating PCA plot of coverage profiles.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.coverage_file)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        coverageParser = Coverage(threads=1)
        coverageStats = coverageParser.parseCoverage(options.coverage_file)

        # one row per sequence; seqIds[i] names the sequence of row i
        seqIds = []
        coverageProfiles = []
        for seqDict in coverageStats.values():
            for seqId, bamDict in seqDict.items():
                seqIds.append(seqId)
                coverageProfiles.append(list(bamDict.values()))

        coverageProfiles = np.array(coverageProfiles)
        if coverageProfiles.ndim < 2:
            # e.g. no sequences in the coverage file: the array has no second
            # axis, so indexing shape[1] would raise IndexError
            self.logger.error(
                'Unable to build a 2 dimensional coverage profile matrix from the coverage file.'
            )
            sys.exit(1)

        if coverageProfiles.shape[1] < 2:
            self.logger.error(
                'Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.'
            )
            sys.exit(1)

        self.logger.info('Computing PCA of coverage profiles.\n')
        pca = PCA()
        pc, variance = pca.pcaMatrix(coverageProfiles,
                                     fraction=1.0,
                                     bCenter=True,
                                     bScale=False)

        plots = PcaPlot(options)
        for filesProcessed, f in enumerate(binFiles, 1):
            self.logger.info(
                'Plotting PCA of coverage profiles for %s (%d of %d)' %
                (f, filesProcessed, len(binFiles)))

            plots.plot(f, seqIds, pc, variance)

            binId = binIdFromFilename(f)
            outputFile = os.path.join(
                options.output_dir,
                binId) + '.cov_pca_plots.' + options.image_type
            plots.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#13
0
    def qa(self, options):
        """QA command: tabulate statistics for the bins of a previous analysis."""
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        # the file of markers to exclude is optional
        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # calculate AAI between markers with multiple hits in a single bin
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, analyzeDir, options.alignment_file)

        # load the HMM models and marker sets of each bin
        markerSetParser = MarkerSetParser(options.threads)
        modelInfoPath = os.path.join(analyzeDir, 'storage',
                                     DefaultValues.CHECKM_HMM_MODEL_INFO)
        binIdToModels = markerSetParser.loadBinModels(modelInfoPath)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            analyzeDir, getBinIdsFromOutDir(analyzeDir),
            options.marker_file, options.exclude_markers)

        # parse the results of each bin
        resultsParser = ResultsParser(binIdToModels)
        resultsParser.analyseResults(
            analyzeDir,
            DefaultValues.BIN_STATS_OUT,
            DefaultValues.HMMER_TABLE_OUT,
            bIgnoreThresholds=options.bIgnoreThresholds,
            evalueThreshold=options.e_value,
            lengthThreshold=options.length,
            bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
            bSkipAdjCorrection=options.bSkipAdjCorrection)

        # report the results, then cache them in the analysis directory
        resultsParser.printSummary(options.out_format,
                                   aai,
                                   binIdToBinMarkerSets,
                                   options.bIndividualMarkers,
                                   options.coverage_file,
                                   options.bTabTable,
                                   options.file,
                                   anaFolder=analyzeDir)
        resultsParser.cacheResults(analyzeDir, binIdToBinMarkerSets,
                                   options.bIndividualMarkers)

        # an empty file name means nothing was written to disk
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
示例#14
0
    def binUnion(self, options):
        """Bin union command.

        options.bin_or_checkm_qa_table alternates bin directories (even
        positions) and CheckM QA tables (odd positions). At least two bin
        directories, each paired with a QA table, are required; otherwise the
        program exits with status 1.
        """
        self.logger.info(
            '[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.'
        )

        output_dir = options.output_dir
        makeSurePathExists(output_dir)

        # split the alternating arguments, validating each one as it is seen
        bin_dirs = []
        checkmQaTsvs = []
        for position, path in enumerate(options.bin_or_checkm_qa_table):
            if position % 2:
                checkFileExists(path)
                checkmQaTsvs.append(path)
            else:
                checkDirExists(path)
                bin_dirs.append(path)

        numBinDirs = len(bin_dirs)
        if numBinDirs < 2:
            self.logger.error(
                "Need to specify at least two bin folders, found %i: " %
                numBinDirs)
            sys.exit(1)

        numQaTables = len(checkmQaTsvs)
        if numBinDirs != numQaTables:
            self.logger.error(
                "Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: "
                % (numBinDirs, numQaTables))
            sys.exit(1)

        # gather the bin files of every bin folder
        binFileSets = []
        for bin_dir in bin_dirs:
            self.logger.info(
                "Reading fasta files with extension %s from bin folder %s" %
                (options.extension, bin_dir))
            binFileSets.append(self.binFiles(bin_dir, options.extension))

        reporter = BinUnion()

        conflictsPath = os.path.join(output_dir, 'contigConflicts.csv')
        unionPath = os.path.join(output_dir, 'union.txt')
        reporter.report(bin_dirs, binFileSets, checkmQaTsvs, unionPath,
                        conflictsPath, options.min_completeness,
                        options.max_contamination)
示例#15
0
    def parallelCoordPlot(self, options):
        """Parallel coordinate plot command: one GC/coverage plot per bin file."""
        self.logger.info(
            '[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)
        checkFileExists(options.coverage_file)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # coverage statistics come from the coverage file
        coverageStats = Coverage(threads=1).parseCoverage(options.coverage_file)

        # sequence statistics are gathered for all bins before any plotting
        self.logger.info('Calculating sequence statistics for each bin.')
        statsCalculator = BinStatistics()
        seqStats = {}
        for binFile in binFiles:
            seqStats[binIdFromFilename(binFile)] = statsCalculator.sequenceStats(
                options.results_dir, binFile)

        # one plot per bin
        plotter = ParallelCoordPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progress = (binId, fileNum, len(binFiles))
            self.logger.info(
                'Plotting parallel coordinates for %s (%d of %d)' % progress)

            plotter.plot(binId, seqStats, coverageStats)

            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.paralel_coord_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#16
0
    def unbinned(self, options):
        """Unbinned command: identify unbinned sequences and report where they were written."""
        self.logger.info('[CheckM - unbinned] Identify unbinned sequences.')

        checkDirExists(options.bin_dir)
        binFiles = self.binFiles(options.bin_dir, options.extension)

        Unbinned().run(binFiles,
                       options.seq_file,
                       options.output_seq_file,
                       options.output_stats_file,
                       options.min_seq_len)

        # report both output files
        seqMsg = 'Unbinned sequences written to: ' + options.output_seq_file
        self.logger.info(seqMsg)
        statsMsg = ('Unbinned sequences statistics written to: ' +
                    options.output_stats_file)
        self.logger.info(statsMsg)

        self.timeKeeper.printTimeStamp()
示例#17
0
    def binCompare(self, options):
        """Bin compare command: compare the bins of two directories."""
        self.logger.info('[CheckM - bin_compare] Comparing two sets of bins.')

        # both bin directories must exist before either is read
        for binDir in (options.bin_dir1, options.bin_dir2):
            checkDirExists(binDir)

        firstBinFiles = self.binFiles(options.bin_dir1, options.extension1)
        secondBinFiles = self.binFiles(options.bin_dir2, options.extension2)

        BinComparer().report(firstBinFiles, secondBinFiles,
                             options.seq_file, options.output_file)

        writtenMsg = 'Detailed bin comparison written to: ' + options.output_file
        self.logger.info(writtenMsg)

        self.timeKeeper.printTimeStamp()
示例#18
0
    def getBinTaxonomy(self, outDir, binIds):
        """Build a taxonomy string for each bin from the labels above it in the tree.

        Reads the tree stored under <outDir>/storage/tree. For each bin, every
        labelled ancestor whose label has a non-empty second '|'-separated
        field contributes that field; the fields are joined with ';'.

        Args:
            outDir: output directory; must contain the storage/tree directory.
            binIds: iterable of bin identifiers to look up.

        Returns:
            dict mapping bin id to taxonomy string. Bins not in the tree map to
            'NA'; bins without any taxonomy-bearing ancestor get the domain
            node's field followed by ' (root)'.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        binIdToTaxonomy = {}
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)
            if node is None:
                binIdToTaxonomy[binId] = 'NA'
                continue

            # Collect the taxonomy field of every labelled ancestor. Each new
            # field is prepended, so the final string runs from the ancestor
            # nearest the root down to the one nearest the leaf.
            taxaStr = None
            parentNode = node.parent_node
            while parentNode is not None:
                if parentNode.label:
                    tokens = parentNode.label.split('|')

                    # a label without '|' has no taxonomy field; skip it
                    # instead of raising IndexError
                    if len(tokens) > 1 and tokens[1] != '':
                        if taxaStr:
                            taxaStr = tokens[1] + ';' + taxaStr
                        else:
                            taxaStr = tokens[1]

                parentNode = parentNode.parent_node

            if not taxaStr:
                # no taxonomy-bearing ancestor: fall back to the domain node
                domainNode = self.__findDomainNode(node)
                taxaStr = domainNode.label.split('|')[1] + ' (root)'

            binIdToTaxonomy[node.taxon.label] = taxaStr

        return binIdToTaxonomy
    def run(self, binFiles, resultsParser, outDir, bReducedTree):
        """Place bins into the reference genome tree with pplacer and extract the tree.

        Builds a concatenated alignment for the bins, runs pplacer on it and
        then uses 'guppy tog' to write the resulting tree. When the alignment
        is empty, the genome tree of the selected reference package is copied
        to the tree output file instead. External tools are invoked with
        argument lists (no shell), so paths containing spaces or shell
        metacharacters are handled correctly.

        Args:
            binFiles: bin files to place (only its length is used directly here).
            resultsParser: passed through to the alignment builder.
            outDir: output directory; must contain the storage/tree directory.
            bReducedTree: use the reduced reference package instead of the full one.

        Returns:
            None. Failures of pplacer or guppy are logged as errors.
        """
        # local import: the file's import block is outside the visible source
        import subprocess

        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        pplacerJsonOut = os.path.join(alignOutputDir,
                                      DefaultValues.PPLACER_JSON_OUT)
        pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT)

        # create concatenated alignment file for each bin
        concatenatedAlignFile = self.__createConcatenatedAlignment(
            binFiles, resultsParser, alignOutputDir)

        pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_FULL
        if bReducedTree:
            pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_REDUCED

        # check if concatenated alignment file is empty
        # (this can occur when all genomes have no phylogenetically informative marker genes)
        if os.stat(concatenatedAlignFile)[stat.ST_SIZE] == 0:
            self.logger.info(
                '  No genomes were identified that could be placed in the reference genome tree.'
            )
            shutil.copyfile(
                os.path.join(pplacerRefPkg, DefaultValues.GENOME_TREE),
                treeFile)
            return

        # run pplacer to place bins in reference genome tree
        self.logger.info(
            '  Placing %d bins into the genome tree with pplacer (be patient).'
            % len(binFiles))
        pplacerCmd = ['pplacer',
                      '-j', '%d' % self.numThreads,
                      '-c', pplacerRefPkg,
                      '-o', pplacerJsonOut,
                      concatenatedAlignFile]
        try:
            # pplacer's standard output is captured in its own log file
            with open(pplacerOut, 'w') as pplacerLog:
                exitCode = subprocess.call(pplacerCmd, stdout=pplacerLog)
        except OSError as e:
            self.logger.error('  Unable to run pplacer: %s' % e)
            return

        if exitCode != 0:
            # do not extract a tree from a missing or stale placement file
            self.logger.error('  pplacer exited with status %d.' % exitCode)
            return

        # extract tree
        guppyCmd = ['guppy', 'tog', '-o', treeFile, pplacerJsonOut]
        try:
            exitCode = subprocess.call(guppyCmd)
        except OSError as e:
            self.logger.error('  Unable to run guppy: %s' % e)
            return

        if exitCode != 0:
            self.logger.error('  guppy exited with status %d.' % exitCode)
示例#20
0
    def outliers(self, options):
        """Outlier command: identify outliers in bins and write them to a report file."""
        self.logger.info('[CheckM - outlier] Identifying outliers in bins.')

        checkDirExists(options.bin_dir)
        checkFileExists(options.tetra_profile)

        # the directory that will hold the report must exist
        reportDir = os.path.dirname(options.output_file)
        makeSurePathExists(reportDir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        BinTools().identifyOutliers(options.results_dir,
                                    binFiles,
                                    options.tetra_profile,
                                    options.distributions,
                                    options.report_type,
                                    options.output_file)

        writtenMsg = 'Outlier information written to: ' + options.output_file
        self.logger.info(writtenMsg)

        self.timeKeeper.printTimeStamp()
示例#21
0
    def coverage(self, options):
        """Coverage command: calculate sequence coverage from BAM files."""
        self.logger.info(
            '[CheckM - coverage] Calculating coverage of sequences.')

        checkDirExists(options.bin_dir)

        # the directory that will hold the coverage file must exist
        coverageDir = os.path.dirname(options.output_file)
        makeSurePathExists(coverageDir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        calculator = Coverage(options.threads)
        calculator.run(binFiles,
                       options.bam_files,
                       options.output_file,
                       options.all_reads,
                       options.min_align,
                       options.max_edit_dist,
                       options.min_qc)

        writtenMsg = 'Coverage information written to: ' + options.output_file
        self.logger.info(writtenMsg)

        self.timeKeeper.printTimeStamp()
示例#22
0
    def getBinTaxonomy(self, outDir, binIds):
        """Build a taxonomy string for each bin from the labels above it in the tree.

        Reads the tree stored under <outDir>/storage/tree. For each bin, every
        labelled ancestor whose label has a non-empty second '|'-separated
        field contributes that field; the fields are joined with ';'.

        Args:
            outDir: output directory; must contain the storage/tree directory.
            binIds: iterable of bin identifiers to look up.

        Returns:
            dict mapping bin id to taxonomy string. Bins not in the tree map to
            'NA'; bins without any taxonomy-bearing ancestor get the domain
            node's field followed by ' (root)'.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        binIdToTaxonomy = {}
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        # NOTE(review): 'as_rooted' looks like the older DendroPy keyword for
        # rooting="force-rooted" -- confirm against the DendroPy version in use.
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)
            if node is None:
                binIdToTaxonomy[binId] = 'NA'
                continue

            # Collect the taxonomy field of every labelled ancestor. Each new
            # field is prepended, so the final string runs from the ancestor
            # nearest the root down to the one nearest the leaf.
            taxaStr = None
            parentNode = node.parent_node
            while parentNode is not None:
                if parentNode.label:
                    tokens = parentNode.label.split('|')

                    # a label without '|' has no taxonomy field; skip it
                    # instead of raising IndexError
                    if len(tokens) > 1 and tokens[1] != '':
                        if taxaStr:
                            taxaStr = tokens[1] + ';' + taxaStr
                        else:
                            taxaStr = tokens[1]

                parentNode = parentNode.parent_node

            if not taxaStr:
                # no taxonomy-bearing ancestor: fall back to the domain node
                domainNode = self.__findDomainNode(node)
                taxaStr = domainNode.label.split('|')[1] + ' (root)'

            binIdToTaxonomy[node.taxon.label] = taxaStr

        return binIdToTaxonomy
示例#23
0
    def markerPlot(self, options):
        """Marker gene position plot command: one plot per bin with marker genes."""
        self.logger.info(
            '[CheckM - marker_plot] Creating marker gene position plot.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # marker gene and bin statistics are parsed from the results directory
        statsParser = ResultsParser(None)
        markerGeneStats = statsParser.parseMarkerGeneStats(options.results_dir)
        binStats = statsParser.parseBinStatsExt(options.results_dir)

        plotter = MarkerGenePosPlot(options)
        for fileNum, binFile in enumerate(binFiles, 1):
            binId = binIdFromFilename(binFile)
            progress = (binId, fileNum, len(binFiles))
            self.logger.info(
                'Plotting marker gene position plot for %s (%d of %d)' % progress)

            # bins missing from either statistics table have no marker genes
            if not (binId in markerGeneStats and binId in binStats):
                continue

            if not plotter.plot(binFile, markerGeneStats[binId], binStats[binId]):
                self.logger.info('No marker genes found in bin.')
                continue

            imagePrefix = os.path.join(options.output_dir, binId)
            outputFile = imagePrefix + '.marker_pos_plot.' + options.image_type
            plotter.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#24
0
    def binQAPlot(self, options):
        """Bin QA plot command: a single bar plot of bin quality for all bins."""
        self.logger.info(
            '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # extended bin statistics are parsed from the results directory
        binStatsExt = ResultsParser(None).parseBinStatsExt(options.results_dir)

        qaPlot = BinQAPlot(options)

        # AAI heterogeneity is only computed when it is not being ignored
        aaiHetero = None
        if not options.bIgnoreHetero:
            aai = AminoAcidIdentity()
            aai.run(options.aai_strain, options.results_dir, None)
            aaiHetero = aai.aaiHetero

        bMakePlot = qaPlot.plot(binFiles, binStatsExt, options.bIgnoreHetero,
                                aaiHetero)

        if bMakePlot:
            imageName = 'bin_qa_plot.' + options.image_type
            outputFile = os.path.join(options.output_dir, imageName)
            qaPlot.savePlot(outputFile, dpi=options.dpi)

            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#25
0
    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
        """Print a taxonomy table for every bin found in outDir.

        When bLineageStatistics is false only the simple summary table is
        printed. Otherwise the sister lineage taxonomy, insertion branch id
        and lineage metadata of each bin are gathered as well and the full
        table is printed.
        """
        # both the output directory and its tree storage directory must exist
        checkDirExists(outDir)
        checkDirExists(os.path.join(outDir, 'storage', 'tree'))

        binIds = getBinIdsFromOutDir(outDir)
        binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)

        if not bLineageStatistics:
            self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser, bTabTable, outFile)
            return

        # additional per-bin information required by the full table
        binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
        binIdToUID = self.getInsertionBranchId(outDir, binIds)
        binIdToLineageStatistics = self.readLineageMetadata(outDir, binIds)

        self.__printFullTable(binIdToUID,
                              binIdToTaxonomy,
                              binIdToSisterTaxonomy,
                              binIdToLineageStatistics,
                              resultsParser,
                              binStats,
                              bTabTable,
                              outFile)
示例#26
0
    def tetraPcaPlot(self, options):
        """Create a PCA plot of tetranucleotide signatures for each bin.

        The PCA is computed once from options.tetra_profile and then plotted
        for every bin file found in options.bin_dir. One image per bin is
        saved to options.output_dir.
        """
        self.logger.info(
            '[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        self.logger.info('Computing PCA of tetranuclotide signatures.\n')
        seqIds, pc, variance = PCA().pcaFile(options.tetra_profile,
                                             fraction=1.0,
                                             bCenter=True,
                                             bScale=False)

        pcaPlot = PcaPlot(options)
        numBins = len(binFiles)
        for fileNum, binFile in enumerate(binFiles, 1):
            self.logger.info(
                'Plotting PCA of tetranuclotide signatures for %s (%d of %d)' %
                (binFile, fileNum, numBins))

            pcaPlot.plot(binFile, seqIds, pc, variance)

            plotPrefix = os.path.join(options.output_dir,
                                      binIdFromFilename(binFile))
            outputFile = plotPrefix + '.tetra_pca_plots.' + options.image_type
            pcaPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#27
0
    def lineageSet(self, options, db=None):
        """Infer lineage-specific marker sets for the bins in options.tree_dir.

        Loads the phylogenetic HMM model info stored under options.tree_dir,
        analyses the matching HMMER results and passes them to
        TreeParser.getBinMarkerSets together with options.marker_file, which
        is then reported in the log as the output location.

        Note that options.num_genomes_markers, options.bootstrap and
        options.bRequireTaxonomy are overwritten on the passed-in object.
        The db argument is not used by this method.
        """
        self.logger.info(
            '[CheckM - lineage_set] Inferring lineage-specific marker sets.')

        treeDir = options.tree_dir
        checkDirExists(treeDir)

        # set HMM file for each bin
        modelInfoFile = os.path.join(treeDir, 'storage',
                                     DefaultValues.PHYLO_HMM_MODEL_INFO)
        binIdToModels = MarkerSetParser().loadBinModels(modelInfoFile)

        # calculate marker gene statistics
        resultsParser = ResultsParser(binIdToModels)
        resultsParser.analyseResults(treeDir,
                                     DefaultValues.BIN_STATS_PHYLO_OUT,
                                     DefaultValues.HMMER_TABLE_PHYLO_OUT)

        # These options are incompatible with how the lineage-specific marker
        # set is selected, so their values are hard-coded here.
        options.num_genomes_markers = 2
        options.bootstrap = 0
        options.bRequireTaxonomy = False

        TreeParser().getBinMarkerSets(
            treeDir,
            options.marker_file,
            options.num_genomes_markers,
            options.bootstrap,
            options.bNoLineageSpecificRefinement,
            options.bForceDomain,
            options.bRequireTaxonomy,
            resultsParser,
            options.unique,
            options.multi)

        self.logger.info('Marker set written to: ' + options.marker_file)

        self.timeKeeper.printTimeStamp()
示例#28
0
    def gcBiasPlot(self, options):
        """Plot bin coverage as a function of GC for each bin.

        A coverage profile is computed once for all bin files from
        options.bam_file, then a plot is drawn for every bin and saved to
        options.output_dir.
        """

        self.logger.info(
            '[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.'
        )

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # coverage of all bins is computed up front and shared by every plot
        windows = CoverageWindows(options.threads)
        coverageProfile = windows.run(binFiles,
                                      options.bam_file,
                                      options.all_reads,
                                      options.min_align,
                                      options.max_edit_dist,
                                      options.window_size)

        gcPlot = GcBiasPlot(options)
        numBins = len(binFiles)
        for fileNum, binFile in enumerate(binFiles, 1):
            self.logger.info('Plotting GC plots for %s (%d of %d)' %
                             (binFile, fileNum, numBins))

            gcPlot.plot(binFile, coverageProfile)

            plotPrefix = os.path.join(options.output_dir,
                                      binIdFromFilename(binFile))
            outputFile = plotPrefix + '.gc_bias_plot.' + options.image_type
            gcPlot.savePlot(outputFile, dpi=options.dpi)
            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
示例#29
0
    def getBinSisterTaxonomy(self, outDir, binIds):
        """Determine the taxonomy of the sister lineage of each bin.

        Args:
            outDir: output directory containing 'storage/tree' with the
                genome tree the bins were inserted into.
            binIds: iterable of bin identifiers to look up in the tree.

        Returns:
            Dictionary mapping each bin id to a string of ';'-terminated
            taxonomic labels, or to 'unresolved' when the bin is not in the
            tree or no unambiguous label could be found.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        # read taxonomy string for each IMG genome; the context manager
        # guarantees the file handle is released even if a line is malformed
        leafIdToTaxonomy = {}
        taxonomyFile = os.path.join(DefaultValues.GENOME_TREE_DIR,
                                    DefaultValues.GENOME_TREE_TAXONOMY)
        with open(taxonomyFile) as fin:
            for line in fin:
                lineSplit = line.split('\t')
                leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip()

        # find LCA of all labeled node in sister lineage
        binIdToSisterTaxonomy = {}
        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)

            taxaStr = ''
            if node is not None:
                # get taxonomic labels of all internal nodes in sister lineages
                internalTaxonomyLabels = set()
                leafTaxonomyLabels = set()
                for sn in node.sister_nodes():
                    for curNode in sn.postorder_iter():
                        if curNode.is_leaf():
                            if curNode.taxon.label:
                                taxonomy = leafIdToTaxonomy.get(
                                    curNode.taxon.label)
                                # inserted bins will not have an assigned taxonomy
                                if taxonomy is not None:
                                    for taxa in taxonomy.split(';'):
                                        leafTaxonomyLabels.add(taxa.strip())
                        elif curNode.label:
                            # the second '|'-separated field of an internal
                            # label holds its ';'-separated taxonomy
                            tokens = curNode.label.split('|')
                            if tokens[1] != '':
                                internalTaxonomyLabels.update(
                                    tokens[1].split(';'))

                # find LCA of taxonomic labels in rank order;
                # only consider leaf node labels if there were no internal labels
                labels = internalTaxonomyLabels or leafTaxonomyLabels

                for prefix in taxonomicPrefixes:
                    taxa = [taxon for taxon in labels if prefix in taxon]

                    if len(taxa) == 1:
                        # unambiguous label at this rank
                        taxaStr += taxa[0] + ';'
                    elif len(taxa) > 1:
                        # unable to resolve taxonomy at this rank
                        break

            if not taxaStr:
                taxaStr = 'unresolved'
            binIdToSisterTaxonomy[binId] = taxaStr

        return binIdToSisterTaxonomy
示例#30
0
文件: merger.py 项目: HadrienG/CheckM
    def run(self, binFiles, outDir, hmmTableFile, binIdToModels,
            binIdToBinMarkerSets, minDeltaComp, maxDeltaCont, minMergedComp,
            maxMergedCont):
        """Report pairs of bins whose merger improves marker gene statistics.

        Every pair of bins is merged in silico by pooling their marker hits.
        A pair is written to 'merger.tsv' in outDir when the merged bin has
        completeness >= minMergedComp and contamination < maxMergedCont, and
        the change relative to the better of the two bins satisfies
        delta completeness >= minDeltaComp and
        delta contamination < maxDeltaCont.

        Args:
            binFiles: not used by this method.
            outDir: existing directory holding the HMM results; also receives
                the output table.
            hmmTableFile: name of the HMM table passed to parseBinHits.
            binIdToModels: models handed to ResultsParser.
            binIdToBinMarkerSets: dictionary mapping bin id to its marker
                sets; all bins must share the same marker genes.
            minDeltaComp, maxDeltaCont, minMergedComp, maxMergedCont:
                thresholds described above.

        Returns:
            Path of the tab-separated output file.

        Raises:
            SystemExit: with a non-zero status if the bins do not all use the
                same marker set.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set; next(iter(...)) picks
        # the first key on both Python 2 and Python 3 (dict.keys()[0] is
        # Python 2 only)
        if binIdToBinMarkerSets:
            firstBinId = next(iter(binIdToBinMarkerSets))
            markerGenesI = binIdToBinMarkerSets[
                firstBinId].mostSpecificMarkerSet().getMarkerGenes()
            for binIdJ in binIdToBinMarkerSets:
                if markerGenesI != binIdToBinMarkerSets[
                        binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                    self.logger.error(
                        '  [Error] All bins must use the same marker set to assess potential mergers.'
                    )
                    # signal failure to the calling process
                    sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits)

        # completeness and contamination of each individual bin; computed once
        # per bin instead of once per pair
        # NOTE(review): assumes geneCounts() has no side effects - confirm
        binQuality = {}
        for binId in binIds:
            geneCounts = binMarkerHits[binId].geneCounts(
                binIdToBinMarkerSets[binId].mostSpecificMarkerSet(),
                binMarkerHits[binId].markerHits, True)
            binQuality[binId] = geneCounts[6:8]

        header = ['Bin Id 1', 'Bin Id 2',
                  'Bin 1 completeness', 'Bin 1 contamination',
                  'Bin 2 completeness', 'Bin 2 contamination',
                  'Delta completeness', 'Delta contamination', 'Merger delta',
                  'Merged completeness', 'Merged contamination']

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('\t'.join(header) + '\n')

            for i, binIdI in enumerate(binIds):
                completenessI, contaminationI = binQuality[binIdI]

                for binIdJ in binIds[i + 1:]:
                    completenessJ, contaminationJ = binQuality[binIdJ]

                    # merge together hits from both bins and calculate
                    # completeness and contamination; hit lists are copied so
                    # the per-bin results are never modified or shared
                    mergedHits = {}
                    for markerId, hits in binMarkerHits[
                            binIdI].markerHits.items():
                        mergedHits[markerId] = list(hits)

                    for markerId, hits in binMarkerHits[
                            binIdJ].markerHits.items():
                        if markerId in mergedHits:
                            mergedHits[markerId].extend(hits)
                        else:
                            mergedHits[markerId] = list(hits)

                    geneCountsMerged = binMarkerHits[binIdI].geneCounts(
                        binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(),
                        mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[
                        6:8]

                    if not (completenessMerged >= minMergedComp
                            and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI,
                                                         completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI,
                                                          contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write(
                            '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
                            % (binIdI, binIdJ, completenessI, contaminationI,
                               completenessJ, contaminationJ, deltaComp,
                               deltaCont, delta, completenessMerged,
                               contaminationMerged))

        return outputFile
示例#31
0
    def run(self, binFiles, outDir, hmmTableFile,
                binIdToModels, binIdToBinMarkerSets,
                minDeltaComp, maxDeltaCont,
                minMergedComp, maxMergedCont):
        """Identify pairs of bins that are candidates for merging.

        The marker hits of every pair of bins are pooled and re-evaluated. A
        pair is written to 'merger.tsv' in outDir when the merged bin has
        completeness >= minMergedComp and contamination < maxMergedCont, and,
        relative to the better of the two bins, delta completeness is
        >= minDeltaComp and delta contamination is < maxDeltaCont.

        Args:
            binFiles: not used by this method.
            outDir: existing directory holding the HMM results; also receives
                the output table.
            hmmTableFile: name of the HMM table passed to parseBinHits.
            binIdToModels: models handed to ResultsParser.
            binIdToBinMarkerSets: dictionary mapping bin id to its marker
                sets; all bins must share the same marker genes.
            minDeltaComp, maxDeltaCont, minMergedComp, maxMergedCont:
                thresholds described above.

        Returns:
            Path of the tab-separated output file.

        Raises:
            SystemExit: with a non-zero status if the bins do not all use the
                same marker set.
        """
        checkDirExists(outDir)

        self.logger.info('  Comparing marker sets between all pairs of bins.')

        # ensure all bins are using the same marker set; an empty dictionary
        # has nothing to compare
        if binIdToBinMarkerSets:
            # next(iter(...)) works on Python 2 and 3, unlike keys()[0]
            firstBinId = next(iter(binIdToBinMarkerSets))
            markerGenesI = binIdToBinMarkerSets[firstBinId].mostSpecificMarkerSet().getMarkerGenes()
            for binIdJ in binIdToBinMarkerSets:
                if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
                    self.logger.error('  [Error] All bins must use the same marker set to assess potential mergers.')
                    # exit with a failure status so callers can detect the error
                    sys.exit(1)

        # parse HMM information
        resultsParser = ResultsParser(binIdToModels)

        # get HMM hits to each bin
        resultsParser.parseBinHits(outDir, hmmTableFile)

        binMarkerHits = resultsParser.results
        binIds = sorted(binMarkerHits)

        # completeness and contamination of each bin on its own; evaluated
        # once per bin rather than again for every pair it takes part in
        # NOTE(review): assumes geneCounts() has no side effects - confirm
        binQuality = {}
        for binId in binIds:
            geneCounts = binMarkerHits[binId].geneCounts(binIdToBinMarkerSets[binId].mostSpecificMarkerSet(), binMarkerHits[binId].markerHits, True)
            binQuality[binId] = geneCounts[6:8]

        columns = ['Bin Id 1', 'Bin Id 2',
                   'Bin 1 completeness', 'Bin 1 contamination',
                   'Bin 2 completeness', 'Bin 2 contamination',
                   'Delta completeness', 'Delta contamination', 'Merger delta',
                   'Merged completeness', 'Merged contamination']
        rowFormat = '%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'

        # determine union and intersection of marker sets for each pair of bins
        outputFile = os.path.join(outDir, "merger.tsv")
        with open(outputFile, 'w') as fout:
            fout.write('\t'.join(columns) + '\n')

            for i, binIdI in enumerate(binIds):
                completenessI, contaminationI = binQuality[binIdI]

                for binIdJ in binIds[i + 1:]:
                    completenessJ, contaminationJ = binQuality[binIdJ]

                    # merge together hits from both bins and calculate completeness and contamination;
                    # lists are copied so the stored per-bin hits are never shared with the merged set
                    mergedHits = {}
                    for markerId, hits in binMarkerHits[binIdI].markerHits.items():
                        mergedHits[markerId] = list(hits)

                    for markerId, hits in binMarkerHits[binIdJ].markerHits.items():
                        if markerId in mergedHits:
                            mergedHits[markerId].extend(hits)
                        else:
                            mergedHits[markerId] = list(hits)

                    geneCountsMerged = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), mergedHits, True)
                    completenessMerged, contaminationMerged = geneCountsMerged[6:8]

                    if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
                        continue

                    # calculate merged statistics
                    deltaComp = completenessMerged - max(completenessI, completenessJ)
                    deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
                    delta = deltaComp - deltaCont

                    if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
                        fout.write(rowFormat % (binIdI, binIdJ,
                                                completenessI, contaminationI,
                                                completenessJ, contaminationJ,
                                                deltaComp, deltaCont, delta,
                                                completenessMerged, contaminationMerged))

        return outputFile
示例#32
0
    def getBinSisterTaxonomy(self, outDir, binIds):
        """Determine the taxonomy of the sister lineage of each bin.

        Args:
            outDir: output directory containing 'storage/tree' with the
                genome tree the bins were inserted into.
            binIds: iterable of bin identifiers to look up in the tree.

        Returns:
            Dictionary mapping each bin id to a string of ';'-terminated
            taxonomic labels, or to 'unresolved' when the bin is not in the
            tree or no unambiguous label could be found.
        """
        # make sure output and tree directories exist
        checkDirExists(outDir)
        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
        checkDirExists(alignOutputDir)

        # read genome tree
        # NOTE(review): 'as_rooted' appears to be the DendroPy 3 keyword
        # (DendroPy 4 uses 'rooting') - confirm against the installed version
        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', as_rooted=True, preserve_underscores=True)

        # read taxonomy string for each IMG genome; the context manager
        # closes the file even if a malformed line raises
        leafIdToTaxonomy = {}
        with open(os.path.join(DefaultValues.GENOME_TREE_DIR, 'genome_tree.taxonomy.tsv')) as fin:
            for line in fin:
                lineSplit = line.split('\t')
                leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip()

        # find LCA of all labeled node in sister lineage
        binIdToSisterTaxonomy = {}
        for binId in binIds:
            node = tree.find_node_with_taxon_label(binId)

            taxaStr = ''
            if node is not None:
                # get taxonomic labels of all internal nodes in sister lineages
                sisterNodes = node.sister_nodes()
                internalTaxonomyLabels = set()
                leafTaxonomyLabels = set()
                for sn in sisterNodes:
                    for curNode in sn.postorder_iter():
                        if curNode.is_leaf():
                            if curNode.taxon.label:
                                taxonomy = leafIdToTaxonomy.get(curNode.taxon.label, None)
                                if taxonomy is not None:  # inserted bins will not have an assigned taxonomy
                                    for taxa in taxonomy.split(';'):
                                        leafTaxonomyLabels.add(taxa.strip())
                        else:
                            if curNode.label:
                                # the second '|'-separated field of an internal
                                # label holds its ';'-separated taxonomy
                                tokens = curNode.label.split('|')
                                if tokens[1] != '':
                                    for taxa in tokens[1].split(';'):
                                        internalTaxonomyLabels.add(taxa)

                # find LCA of taxonomic labels in rank order;
                # only consider leaf node labels if there were no internal labels
                labels = internalTaxonomyLabels
                if not labels:
                    labels = leafTaxonomyLabels

                for prefix in taxonomicPrefixes:
                    taxa = [taxon for taxon in labels if prefix in taxon]

                    if len(taxa) == 1:
                        # unambiguous label at this rank
                        taxaStr += taxa[0] + ';'
                    elif len(taxa) > 1:
                        # unable to resolve taxonomy at this rank
                        break

            if not taxaStr:
                taxaStr = 'unresolved'
            binIdToSisterTaxonomy[binId] = taxaStr

        return binIdToSisterTaxonomy