示例#1
0
    def qa(self, options):
        """Run the `qa` command: tabulate statistics for analyzed bins.

        Steps, in order: verify that the analysis directory (and the optional
        marker exclusion file) exist, run the AAI calculation, load the
        per-bin HMM models and marker sets, parse the stored results, then
        print and cache the summary.

        `options` is the parsed command-line namespace; every setting used
        below is read from its attributes.
        """
        self.logger.info('[CheckM - qa] Tabulating genome statistics.')

        analyzeDir = options.analyze_dir
        checkDirExists(analyzeDir)

        excludeMarkers = options.exclude_markers
        if excludeMarkers:
            checkFileExists(excludeMarkers)

        # AAI between markers that have multiple hits within a single bin
        aai = AminoAcidIdentity()
        aai.run(options.aai_strain, analyzeDir, options.alignment_file)

        # HMM models recorded for each bin
        markerSetParser = MarkerSetParser(options.threads)
        modelInfoPath = os.path.join(analyzeDir, 'storage',
                                     DefaultValues.CHECKM_HMM_MODEL_INFO)
        binIdToModels = markerSetParser.loadBinModels(modelInfoPath)

        # marker sets for every bin found in the analysis directory
        binIds = getBinIdsFromOutDir(analyzeDir)
        binMarkerSets = markerSetParser.getMarkerSets(analyzeDir, binIds,
                                                      options.marker_file,
                                                      excludeMarkers)

        # parse the results for each bin
        resultsParser = ResultsParser(binIdToModels)
        parseSettings = {
            'bIgnoreThresholds': options.bIgnoreThresholds,
            'evalueThreshold': options.e_value,
            'lengthThreshold': options.length,
            'bSkipPseudoGeneCorrection': options.bSkipPseudoGeneCorrection,
            'bSkipAdjCorrection': options.bSkipAdjCorrection,
        }
        resultsParser.analyseResults(analyzeDir,
                                     DefaultValues.BIN_STATS_OUT,
                                     DefaultValues.HMMER_TABLE_OUT,
                                     **parseSettings)

        # report the summary, then cache it in the analysis directory
        resultsParser.printSummary(options.out_format,
                                   aai,
                                   binMarkerSets,
                                   options.bIndividualMarkers,
                                   options.coverage_file,
                                   options.bTabTable,
                                   options.file,
                                   anaFolder=analyzeDir)
        resultsParser.cacheResults(analyzeDir, binMarkerSets,
                                   options.bIndividualMarkers)

        # an empty string means no output file was requested
        outputFile = options.file
        if outputFile != '':
            self.logger.info('QA information written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
    def testMultiCopyStrainHetero(self):
        """Check strain heterogeneity when each entry holds several AAI scores."""
        calculator = AminoAcidIdentity()

        scoresByBin = defaultdict(dict)
        scoresByBin['b1'] = {
            'g1': [0.95, 0.95, 0.95],
            'g2': [0.1, 0.1, 0.1],
            'g3': [0.95, 0.1, 0.1],
        }

        hetero, meanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

        # expected per-entry values for the 0.9 threshold passed above
        expected = (('g1', 1.0), ('g2', 0.0), ('g3', 1.0 / 3.0))
        for key, value in expected:
            self.assertAlmostEqual(hetero['b1'][key], value)

        # expected value for the bin as a whole
        self.assertAlmostEqual(meanBinHetero['b1'], 4.0 * 100 / 9.0)
    def testMixedStrainHetero(self):
        """Check strain heterogeneity when only one entry scores above the threshold."""
        calculator = AminoAcidIdentity()

        scoresByBin = defaultdict(dict)
        scoresByBin['b1'] = {'g1': [0.95], 'g2': [0.1], 'g3': [0.1]}

        hetero, meanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

        # expected per-entry values for the 0.9 threshold passed above
        expected = (('g1', 1.0), ('g2', 0.0), ('g3', 0.0))
        for key, value in expected:
            self.assertAlmostEqual(hetero['b1'][key], value)

        # expected value for the bin as a whole
        self.assertAlmostEqual(meanBinHetero['b1'], 1.0 * 100 / 3.0)
    def testAllStrainHetero(self):
        """Check strain heterogeneity when every entry scores above the threshold."""
        calculator = AminoAcidIdentity()

        scoresByBin = defaultdict(dict)
        scoresByBin['b1'] = {'g1': [0.95], 'g2': [0.95], 'g3': [0.95]}

        hetero, meanBinHetero = calculator.strainHetero(scoresByBin, 0.9)

        # every entry is expected to come out as 1.0
        for key in ('g1', 'g2', 'g3'):
            self.assertAlmostEqual(hetero['b1'][key], 1.0)

        # expected value for the bin as a whole
        self.assertAlmostEqual(meanBinHetero['b1'], 100.0)
    def testAllStrainHetero(self):
        """Strain heterogeneity for a bin whose scores are all 0.95 (threshold 0.9)."""
        identityCalc = AminoAcidIdentity()

        binScores = defaultdict(dict)
        binScores['b1'] = dict.fromkeys(('g1', 'g2', 'g3'))
        for key in binScores['b1']:
            # a fresh single-score list per entry
            binScores['b1'][key] = [0.95]

        perEntry, perBin = identityCalc.strainHetero(binScores, 0.9)

        b1 = perEntry['b1']
        self.assertAlmostEqual(b1['g1'], 1.0)
        self.assertAlmostEqual(b1['g2'], 1.0)
        self.assertAlmostEqual(b1['g3'], 1.0)

        self.assertAlmostEqual(perBin['b1'], 100.0)
    def testMixedStrainHetero(self):
        """Strain heterogeneity for a bin with one high and two low scores (threshold 0.9)."""
        identityCalc = AminoAcidIdentity()

        binScores = defaultdict(dict)
        binScores['b1'] = {
            'g1': [0.95],
            'g2': [0.1],
            'g3': [0.1],
        }

        perEntry, perBin = identityCalc.strainHetero(binScores, 0.9)

        b1 = perEntry['b1']
        self.assertAlmostEqual(b1['g1'], 1.0)
        self.assertAlmostEqual(b1['g2'], 0.0)
        self.assertAlmostEqual(b1['g3'], 0.0)

        # one of the three entries is above the threshold
        self.assertAlmostEqual(perBin['b1'], 1.0 * 100 / 3.0)
    def testMultiCopyStrainHetero(self):
        """Strain heterogeneity for a bin whose entries each carry three scores."""
        identityCalc = AminoAcidIdentity()

        binScores = defaultdict(dict)
        binScores['b1'] = {'g1': [0.95, 0.95, 0.95],
                           'g2': [0.1, 0.1, 0.1],
                           'g3': [0.95, 0.1, 0.1]}

        perEntry, perBin = identityCalc.strainHetero(binScores, 0.9)

        b1 = perEntry['b1']
        self.assertAlmostEqual(b1['g1'], 1.0)
        self.assertAlmostEqual(b1['g2'], 0.0)
        self.assertAlmostEqual(b1['g3'], 1.0 / 3.0)

        # four of the nine scores are above the 0.9 threshold
        self.assertAlmostEqual(perBin['b1'], 4.0 * 100 / 9.0)
示例#8
0
    def binQAPlot(self, options):
        """Run the `bin_qa_plot` command: draw a bar plot of bin quality.

        Reads the bin files from `options.bin_dir` and the extended bin
        statistics from `options.results_dir`. Unless `options.bIgnoreHetero`
        is set, the AAI calculation is run first and its `aaiHetero` result is
        handed to the plot. The figure is saved in `options.output_dir` only
        when the plot call returns a true value.
        """
        self.logger.info(
            '[CheckM - bin_qa_plot] Creating bar plot of bin quality.')

        checkDirExists(options.bin_dir)
        makeSurePathExists(options.output_dir)

        binFiles = self.binFiles(options.bin_dir, options.extension)

        # Only the sequence stats are read here; no HMM model info is loaded,
        # so the parser is constructed without bin models.
        statsParser = ResultsParser(None)
        binStatsExt = statsParser.parseBinStatsExt(options.results_dir)

        # create plot for each bin
        plot = BinQAPlot(options)

        # heterogeneity data is only computed when it will be plotted
        aaiHetero = None
        if not options.bIgnoreHetero:
            aai = AminoAcidIdentity()
            aai.run(options.aai_strain, options.results_dir, None)
            aaiHetero = aai.aaiHetero

        plotCreated = plot.plot(binFiles, binStatsExt, options.bIgnoreHetero,
                                aaiHetero)

        if plotCreated:
            imageName = 'bin_qa_plot.' + options.image_type
            outputFile = os.path.join(options.output_dir, imageName)
            plot.savePlot(outputFile, dpi=options.dpi)

            self.logger.info('Plot written to: ' + outputFile)

        self.timeKeeper.printTimeStamp()
    def testDiffSeqs(self):
        """AAI of two sequences that differ at every position is expected to be 0."""
        identity = AminoAcidIdentity().aai('ACGT', 'TGCA')
        self.assertAlmostEqual(identity, 0.0)
    def testIdenticalInsertionSeqs(self):
        """AAI of two identical gapped sequences is expected to be 1."""
        identity = AminoAcidIdentity().aai('A-C-G-T', 'A-C-G-T')
        self.assertAlmostEqual(identity, 1.0)
    def testDiffInsertionSeqs(self):
        """AAI of a gapped sequence against an ungapped one is expected to be 4/7."""
        identity = AminoAcidIdentity().aai('A-C-G-T', 'AACCGGT')
        self.assertAlmostEqual(identity, 4.0 / 7.0)
    def testIncompleteSeqs(self):
        """AAI against a sequence with leading and trailing gaps is expected to be 3/4."""
        identity = AminoAcidIdentity().aai('AAAACGTTTT', '---ACGG---')
        self.assertAlmostEqual(identity, 3.0 / 4.0)
    def testSimilarSeqs(self):
        """AAI of two sequences that differ at one of four positions is expected to be 3/4."""
        identity = AminoAcidIdentity().aai('ACGT', 'ACGG')
        self.assertAlmostEqual(identity, 3.0 / 4.0)
    def testNonOverlappingSeqs(self):
        """AAI of two sequences with no overlapping residues is expected to be 0."""
        identity = AminoAcidIdentity().aai('ACGT----', '----TGCA')
        self.assertAlmostEqual(identity, 0.0)
    def testPartialOverlappingSeqs(self):
        """AAI of two sequences whose shared region matches is expected to be 1."""
        identity = AminoAcidIdentity().aai('ACGT--', '--GTAC')
        self.assertAlmostEqual(identity, 1.0)
    def testIdentiticalSeqs(self):
        """AAI of a sequence compared with an identical copy is expected to be 1."""
        identity = AminoAcidIdentity().aai('ACGT', 'ACGT')
        self.assertAlmostEqual(identity, 1.0)
    def testDiffSeqs(self):
        """Expect an AAI of 0.0 for 'ACGT' versus 'TGCA'."""
        seqA, seqB = 'ACGT', 'TGCA'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 0.0)
    def testPartialOverlappingSeqs(self):
        """Expect an AAI of 1.0 for 'ACGT--' versus '--GTAC'."""
        seqA, seqB = 'ACGT--', '--GTAC'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 1.0)
    def testNonOverlappingSeqs(self):
        """Expect an AAI of 0.0 for 'ACGT----' versus '----TGCA'."""
        seqA, seqB = 'ACGT----', '----TGCA'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 0.0)
    def testIdentiticalSeqs(self):
        """Expect an AAI of 1.0 when 'ACGT' is compared with itself."""
        seqA, seqB = 'ACGT', 'ACGT'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 1.0)
    def testDiffInsertionSeqs(self):
        """Expect an AAI of 4/7 for 'A-C-G-T' versus 'AACCGGT'."""
        seqA, seqB = 'A-C-G-T', 'AACCGGT'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 4.0 / 7.0)
    def testIdenticalInsertionSeqs(self):
        """Expect an AAI of 1.0 when 'A-C-G-T' is compared with itself."""
        seqA, seqB = 'A-C-G-T', 'A-C-G-T'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 1.0)
    def testSimilarSeqs(self):
        """Expect an AAI of 3/4 for 'ACGT' versus 'ACGG'."""
        seqA, seqB = 'ACGT', 'ACGG'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 3.0 / 4.0)
    def testIncompleteSeqs(self):
        """Expect an AAI of 3/4 for 'AAAACGTTTT' versus '---ACGG---'."""
        seqA, seqB = 'AAAACGTTTT', '---ACGG---'
        calculator = AminoAcidIdentity()

        self.assertAlmostEqual(calculator.aai(seqA, seqB), 3.0 / 4.0)