def test_q_score(self): ar = Artifact.load(self.get_data_path('simple.qza')) view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt) obs_drop_ambig, stats = q_score(view, quality_window=2, min_quality=20, min_length_fraction=0.25) exp_drop_ambig = ["@foo_1", "ATGCATGC", "+", "DDDDBBDD"] columns = ['sample-id', 'total-input-reads', 'total-retained-reads', 'reads-truncated', 'reads-too-short-after-truncation', 'reads-exceeding-maximum-ambiguous-bases'] exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), ('bar', 1, 0, 0, 0, 1)], columns=columns) exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id') obs = [] iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat) for sample_id, fp in iterator: obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')]) self.assertEqual(obs, exp_drop_ambig) pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index]) obs_trunc, stats = q_score(view, quality_window=1, min_quality=33, min_length_fraction=0.25) exp_trunc = ["@foo_1", "ATGCATGC", "+", "DDDDBBDD", "@bar_1", "ATA", "+", "DDD"] exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1), ('bar', 1, 1, 1, 0, 0)], columns=columns) exp_trunc_stats = exp_trunc_stats.set_index('sample-id') obs = [] for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat): obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')]) self.assertEqual(sorted(obs), sorted(exp_trunc)) pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
def test_q_score_numeric_ids(self): ar = Artifact.load(self.get_data_path('numeric_ids.qza')) view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt) exp_sids = {'00123', '0.4560'} obs, stats = q_score(view, min_quality=20) obs_manifest = obs.manifest.view(obs.manifest.format) obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#') obs_manifest.set_index('sample-id', inplace=True) obs_sids = set(obs_manifest.index) self.assertEqual(obs_sids, exp_sids) self.assertEqual(set(stats.index), exp_sids)
def test_q_score_all_dropped(self): ar = Artifact.load(self.get_data_path('simple.qza')) view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt) with self.assertRaisesRegex(ValueError, "filtered out"): q_score(view, min_quality=50)
def test_q_score_real(self): ar = Artifact.load(self.get_data_path('real_data.qza')) view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt) obs_result, stats = q_score(view, min_quality=40, min_length_fraction=0.24) # All input reads are represented here in their post-quality filtered # form. Reads that are commented out were manually identified as being # filtered by the q_score method. For the commented reads, the comments # denote why the read is not retained. # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of # total read length and is indicative of a sequence at the # min_length_fraction boundary. exp_result = [ "@HWI-EAS440_0386:1:32:15467:1432#0/1", "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", "+", "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh", # too short # "@HWI-EAS440_0386:1:36:9986:17043#0/1", # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG", # "+", # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh", "@HWI-EAS440_0386:1:37:13343:14820#0/1", ("TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT" "GGATGTTTAAGTCAGTTGTG"), "+", ("hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh" "hgdhhhhghghhhdhhhhgh"), "@HWI-EAS440_0386:1:41:18215:15404#0/1", "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA", "+", "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh", # too short # "@HWI-EAS440_0386:1:42:5423:19606#0/1", # "TACGTAGGGAGCAAGCGTT", # "+", # "hhhhghhhhhhhhhghhfh", "@HWI-EAS440_0386:1:52:7507:5841#0/1", "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT", "+", "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh", "@HWI-EAS440_0386:1:53:18599:4074#0/1", "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG", "+", "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh", # too short # "@HWI-EAS440_0386:1:55:16425:9514#0/1", # "TACGGAGGATCCGAGCGTTATCCGGATT", # "+", # "hhhhhhhhhhhhfghhhghghhhhhbgh", "@HWI-EAS440_0386:1:65:12049:5619#0/1", "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG", "+", "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh", # @HWI-EAS440_0386:1:95:4837:16388#0/1 # starts off < Q40 ] columns = ['sample-id', 'total-input-reads', 'total-retained-reads', 'reads-truncated', 'reads-too-short-after-truncation', 'reads-exceeding-maximum-ambiguous-bases'] exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], columns=columns) exp_stats = exp_stats.set_index('sample-id') obs = [] iterator = obs_result.sequences.iter_views(FastqGzFormat) for sample_id, fp in iterator: obs.extend([l.strip() for l in gzip.open(str(fp), 'rt')]) self.assertEqual(obs, exp_result) pdt.assert_frame_equal(stats, exp_stats.loc[stats.index])