예제 #1
0
    def test_q_score(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        obs_drop_ambig, stats = q_score(view, quality_window=2,
                                        min_quality=20,
                                        min_length_fraction=0.25)

        exp_drop_ambig = ["@foo_1",
                          "ATGCATGC",
                          "+",
                          "DDDDBBDD"]
        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                             ('bar', 1, 0, 0, 0, 1)],
                                            columns=columns)
        exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')
        obs = []
        iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')])
        self.assertEqual(obs, exp_drop_ambig)
        pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

        obs_trunc, stats = q_score(view, quality_window=1, min_quality=33,
                                   min_length_fraction=0.25)
        exp_trunc = ["@foo_1",
                     "ATGCATGC",
                     "+",
                     "DDDDBBDD",
                     "@bar_1",
                     "ATA",
                     "+",
                     "DDD"]
        exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                        ('bar', 1, 1, 1, 0, 0)],
                                       columns=columns)
        exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

        obs = []
        for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
            obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')])
        self.assertEqual(sorted(obs), sorted(exp_trunc))
        pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
예제 #2
0
    def test_q_score_numeric_ids(self):
        ar = Artifact.load(self.get_data_path('numeric_ids.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        exp_sids = {'00123', '0.4560'}
        obs, stats = q_score(view, min_quality=20)
        obs_manifest = obs.manifest.view(obs.manifest.format)
        obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#')
        obs_manifest.set_index('sample-id', inplace=True)

        obs_sids = set(obs_manifest.index)
        self.assertEqual(obs_sids, exp_sids)
        self.assertEqual(set(stats.index), exp_sids)
예제 #3
0
    def test_q_score_all_dropped(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)

        with self.assertRaisesRegex(ValueError, "filtered out"):
            q_score(view, min_quality=50)
예제 #4
0
    def test_q_score_real(self):
        ar = Artifact.load(self.get_data_path('real_data.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        obs_result, stats = q_score(view, min_quality=40,
                                    min_length_fraction=0.24)

        # All input reads are represented here in their post-quality filtered
        # form. Reads that are commented out were manually identified as being
        # filtered by the q_score method. For the commented reads, the comments
        # denote why the read is not retained.

        # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of
        # total read length and is indicative of a sequence at the
        # min_length_fraction boundary.
        exp_result = [
                      "@HWI-EAS440_0386:1:32:15467:1432#0/1",
                      "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
                      "+",
                      "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh",

                      # too short
                      # "@HWI-EAS440_0386:1:36:9986:17043#0/1",
                      # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG",
                      # "+",
                      # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh",

                      "@HWI-EAS440_0386:1:37:13343:14820#0/1",
                      ("TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT"
                       "GGATGTTTAAGTCAGTTGTG"),
                      "+",
                      ("hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh"
                       "hgdhhhhghghhhdhhhhgh"),

                      "@HWI-EAS440_0386:1:41:18215:15404#0/1",
                      "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA",
                      "+",
                      "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh",

                      # too short
                      # "@HWI-EAS440_0386:1:42:5423:19606#0/1",
                      # "TACGTAGGGAGCAAGCGTT",
                      # "+",
                      # "hhhhghhhhhhhhhghhfh",

                      "@HWI-EAS440_0386:1:52:7507:5841#0/1",
                      "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
                      "+",
                      "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh",

                      "@HWI-EAS440_0386:1:53:18599:4074#0/1",
                      "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG",
                      "+",
                      "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh",

                      # too short
                      # "@HWI-EAS440_0386:1:55:16425:9514#0/1",
                      # "TACGGAGGATCCGAGCGTTATCCGGATT",
                      # "+",
                      # "hhhhhhhhhhhhfghhhghghhhhhbgh",

                      "@HWI-EAS440_0386:1:65:12049:5619#0/1",
                      "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG",
                      "+",
                      "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh",

                      # @HWI-EAS440_0386:1:95:4837:16388#0/1
                      # starts off < Q40
                      ]

        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)],
                                 columns=columns)
        exp_stats = exp_stats.set_index('sample-id')
        obs = []
        iterator = obs_result.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')])
        self.assertEqual(obs, exp_result)
        pdt.assert_frame_equal(stats, exp_stats.loc[stats.index])