Exemplo n.º 1
0
    def merge(self, options):
        """Merge command.

        Searches the bins in ``options.bin_dir`` for pairs with complementary
        sets of marker genes and logs where the merger information was
        written. Returns early (with no output) when the sequence check
        fails or when the marker file is a tree marker set.
        """
        self.logger.info(
            '[CheckM - merge] Identifying bins with complementary sets of marker genes.'
        )

        checkDirExists(options.bin_dir)
        binFiles = self.binFiles(options.bin_dir, options.extension)

        # called genes are checked as protein sequences, anything else as
        # nucleotide sequences
        seqsAreValid = (checkProteinSeqs
                        if options.bCalledGenes else checkNuclotideSeqs)
        if not seqsAreValid(binFiles):
            return

        # tree marker sets are rejected by this command
        markerFileType = MarkerSetParser().markerFileType(options.marker_file)
        if markerFileType == BinMarkerSets.TREE_MARKER_SET:
            self.logger.error(
                'Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n'
            )
            return

        # setup directory structure
        outDir = options.output_dir
        makeSurePathExists(outDir)
        for subDirs in (('bins',), ('storage',), ('storage', 'hmms')):
            makeSurePathExists(os.path.join(outDir, *subDirs))

        binIds = [binIdFromFilename(binFile) for binFile in binFiles]

        # find marker genes in genome bins
        geneFinder = MarkerGeneFinder(options.threads)
        binIdToModels = geneFinder.find(binFiles, outDir, "merger.table.txt",
                                        "merger.hmmer3", options.marker_file,
                                        False, False, options.bCalledGenes)

        # get HMM file for each bin (uses a freshly constructed parser)
        binIdToBinMarkerSets = MarkerSetParser().getMarkerSets(
            outDir, binIds, options.marker_file)

        # compare markers found in each bin
        outputFile = Merger().run(binFiles, outDir, "merger.table.txt",
                                  binIdToModels, binIdToBinMarkerSets,
                                  options.delta_comp, options.delta_cont,
                                  options.merged_comp, options.merged_cont)

        self.logger.info('Merger information written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
Exemplo n.º 2
0
    def qa(self, options):
        """QA command.

        Tabulates statistics for the bins found in ``options.analyze_dir``:
        runs the AAI step, parses the stored results, prints the summary in
        the requested format and caches the results in the analyze directory.
        """
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        # the exclusion list is optional; only validate it when one is given
        if options.exclude_markers:
            checkFileExists(options.exclude_markers)

        # AAI between markers that have multiple hits in a single bin
        aminoAcidIdentity = AminoAcidIdentity()
        aminoAcidIdentity.run(options.aai_strain, analyzeDir,
                              options.alignment_file)

        # get HMM file for each bin
        parser = MarkerSetParser(options.threads)
        binIdToModels = parser.loadBinModels(
            os.path.join(analyzeDir, 'storage',
                         DefaultValues.CHECKM_HMM_MODEL_INFO))

        binIds = getBinIdsFromOutDir(analyzeDir)
        binIdToBinMarkerSets = parser.getMarkerSets(analyzeDir, binIds,
                                                    options.marker_file,
                                                    options.exclude_markers)

        # get results for each bin
        resultsParser = ResultsParser(binIdToModels)
        resultsParser.analyseResults(
            analyzeDir,
            DefaultValues.BIN_STATS_OUT,
            DefaultValues.HMMER_TABLE_OUT,
            bIgnoreThresholds=options.bIgnoreThresholds,
            evalueThreshold=options.e_value,
            lengthThreshold=options.length,
            bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
            bSkipAdjCorrection=options.bSkipAdjCorrection)

        resultsParser.printSummary(options.out_format,
                                   aminoAcidIdentity,
                                   binIdToBinMarkerSets,
                                   options.bIndividualMarkers,
                                   options.coverage_file,
                                   options.bTabTable,
                                   options.file,
                                   anaFolder=analyzeDir)
        resultsParser.cacheResults(analyzeDir, binIdToBinMarkerSets,
                                   options.bIndividualMarkers)

        # the output location is only logged when options.file is not ''
        if options.file != '':
            self.logger.info('QA information written to: ' + options.file)

        self.timeKeeper.printTimeStamp()
Exemplo n.º 3
0
    def analyze(self, options, db=None):
        """Analyze command.

        Identifies marker genes in the bins under ``options.bin_dir``, writes
        the per-bin HMM model info, aligns marker genes with multiple hits
        within a bin, calculates bin statistics and, when
        ``options.bAlignTopHit`` is set, aligns the top hit to each marker and
        writes ``alignment_info.tsv`` into the alignment folder.

        Args:
            options: parsed command options; this method reads ``bin_dir``,
                ``extension``, ``bCalledGenes``, ``output_dir``, ``threads``,
                ``marker_file``, ``bKeepAlignment``, ``bNucORFs`` and
                ``bAlignTopHit``.
            db: not used by this method.

        Returns:
            None. Returns early, before any output directory is created,
            when the sequence check on the bin files fails.
        """
        self.logger.info(
            '[CheckM - analyze] Identifying marker genes in bins.')

        binFiles = self.binFiles(options.bin_dir, options.extension)

        if not options.bCalledGenes:
            if not checkNuclotideSeqs(binFiles):
                return
        else:
            if not checkProteinSeqs(binFiles):
                return

        # setup directory structure
        aaiQaFolder = os.path.join(options.output_dir, 'storage', 'aai_qa')
        makeSurePathExists(options.output_dir)
        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
        makeSurePathExists(aaiQaFolder)

        # find marker genes in genome bins
        mgf = MarkerGeneFinder(options.threads)
        binIdToModels = mgf.find(binFiles, options.output_dir,
                                 DefaultValues.HMMER_TABLE_OUT,
                                 DefaultValues.HMMER_OUT, options.marker_file,
                                 options.bKeepAlignment, options.bNucORFs,
                                 options.bCalledGenes)

        markerSetParser = MarkerSetParser(options.threads)
        binIdToBinMarkerSets = markerSetParser.getMarkerSets(
            options.output_dir, getBinIdsFromOutDir(options.output_dir),
            options.marker_file)

        hmmModelInfoFile = os.path.join(options.output_dir, 'storage',
                                        DefaultValues.CHECKM_HMM_MODEL_INFO)
        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)

        self.timeKeeper.printTimeStamp()

        # HMM model file: use the user's file only when it is itself a set of
        # HMM models, otherwise fall back to the default models
        if markerSetParser.markerFileType(
                options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
            markerFile = options.marker_file
        else:
            markerFile = DefaultValues.HMM_MODELS

        # align marker genes with multiple hits within a bin
        HA = HmmerAligner(options.threads)
        HA.makeAlignmentsOfMultipleHits(
            options.output_dir, markerFile, DefaultValues.HMMER_TABLE_OUT,
            binIdToModels, binIdToBinMarkerSets, False, DefaultValues.E_VAL,
            DefaultValues.LENGTH, aaiQaFolder)

        self.timeKeeper.printTimeStamp()

        # calculate statistics for each genome bin
        binStats = BinStatistics(options.threads)
        binStats.calculate(binFiles, options.output_dir,
                           DefaultValues.BIN_STATS_OUT)

        self.timeKeeper.printTimeStamp()

        # align top hit to each marker if requested
        if options.bAlignTopHit:
            alignmentOutputFolder = os.path.join(options.output_dir, 'storage',
                                                 'alignments')
            makeSurePathExists(alignmentOutputFolder)

            HA = HmmerAligner(options.threads)
            resultsParser = HA.makeAlignmentTopHit(
                options.output_dir, options.marker_file,
                DefaultValues.HMMER_TABLE_OUT, binIdToModels, False,
                DefaultValues.E_VAL, DefaultValues.LENGTH, True,
                alignmentOutputFolder)

            # report marker gene data
            alignmentInfoFile = os.path.join(alignmentOutputFolder,
                                             'alignment_info.tsv')
            # 'with' guarantees the file is closed even if a write or a model
            # lookup raises part-way through
            with open(alignmentInfoFile, 'w') as fout:
                fout.write('Marker Id\tLength (bp)\n')

                # Rows come from the models of the first bin in the parser;
                # look that bin up once instead of once per marker. An empty
                # models mapping yields a header-only file.
                firstBinModels = next(iter(resultsParser.models.values()), {})
                for markerId, model in firstBinModels.items():
                    fout.write('%s\t%d\n' % (markerId, model.leng))

            self.logger.info('Alignments to top hits stored in: ' +
                             alignmentOutputFolder)

            self.timeKeeper.printTimeStamp()