def test_load_with_archive_filepath_modified(self):
    # Regression test: an artifact loaded from a path must keep serving its
    # own data even after a *different* artifact is saved to that same path.
    #
    # `load` used to be lazy, only extracting data when it needed to (e.g.
    # when `save` or `view` was called). That was buggy because the file
    # could have been deleted or, worse, replaced by a different .qza in the
    # meantime, so the wrong archive (or none at all) could be extracted on
    # demand. There is no easy cross-platform way to guard against that, so
    # Artifact.load is no longer lazy and always extracts immediately. The
    # real motivation for lazy loading -- quick inspection of an archive
    # without extracting/copying data -- is now served by Artifact.peek.
    archive_fp = os.path.join(self.test_dir.name, 'artifact.qza')

    # First artifact: written to disk, then loaded back.
    first_saved = Artifact.import_data(FourInts, [-1, 42, 0, 43])
    first_saved.save(archive_fp)
    artifact1 = Artifact.load(archive_fp)

    # Second artifact: overwrites the very same file, then is loaded too.
    second_saved = Artifact.import_data(FourInts, [10, 11, 12, 13])
    second_saved.save(archive_fp)
    artifact2 = Artifact.load(archive_fp)

    # Each loaded artifact must still produce the view of its own data.
    self.assertEqual(artifact1.view(list), [-1, 42, 0, 43])
    self.assertEqual(artifact2.view(list), [10, 11, 12, 13])
def test_filter_features_nooverlap(self):
    # filter_features is expected to raise a ValueError when called with the
    # 'counts_nooverlap' table and the 'tree_reject' tree fixtures.
    ar_tree = Artifact.load(self.get_data_path('tree_reject.qza'))
    ar_table = Artifact.load(
        self.get_data_path('counts_nooverlap.biom.qza'))
    with self.assertRaises(ValueError):
        # The call must raise, so its return value is deliberately not
        # bound to any (otherwise unused) local names.
        filter_features(
            table=ar_table.view(biom.Table),
            tree=ar_tree.view(NewickFormat))
def test_load_different_type_with_multiple_view_types(self):
    # Save an IntSequence1 artifact, load it back and check that its type,
    # uuid and two different view types (list, Counter) are as expected.
    archive_fp = os.path.join(self.test_dir.name, 'artifact.qza')
    saved_artifact = Artifact.import_data(IntSequence1,
                                          [42, 42, 43, -999, 42])
    saved_artifact.save(archive_fp)

    loaded = Artifact.load(archive_fp)

    self.assertEqual(loaded.type, IntSequence1)
    self.assertEqual(loaded.uuid, saved_artifact.uuid)

    # Every view is requested twice so that repeated views are exercised.
    for _ in range(2):
        self.assertEqual(loaded.view(list), [42, 42, 43, -999, 42])
    for _ in range(2):
        self.assertEqual(loaded.view(collections.Counter),
                         collections.Counter({42: 3, 43: 1, -999: 1}))
def test_load_and_save(self):
    # A loaded artifact can be saved over its own source file as well as to
    # a new file; both archives must contain the same set of members.
    fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
    fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')

    Artifact.import_data(FourInts, [-1, 42, 0, 43]).save(fp1)
    loaded = Artifact.load(fp1)

    # Overwriting its source file works.
    loaded.save(fp1)
    # Saving to a new file works.
    loaded.save(fp2)

    expected_members = (
        'VERSION',
        'metadata.yaml',
        'data/file1.txt',
        'data/file2.txt',
        'data/nested/file3.txt',
        'data/nested/file4.txt',
        'provenance/metadata.yaml',
        'provenance/VERSION',
        'provenance/action/action.yaml',
    )
    for archive_fp in (fp1, fp2):
        # A fresh root dir string and a fresh set are built for every
        # archive that is checked.
        root_dir = str(loaded.uuid)
        self.assertArchiveMembers(archive_fp, root_dir,
                                  set(expected_members))
def test_classify_otus_experimental(self):
    # Exercises classify_otus_experimental against two insertion trees with
    # known expected taxonomies, plus two error scenarios.
    tiny_tree = Artifact.load(self.get_data_path('sepp_tree_tiny.qza'))
    repseqs = Artifact.load(self.get_data_path('real_data.qza'))

    # tiny insertion tree
    observed_tiny = classify_otus_experimental(
        repseqs.view(DNASequencesDirectoryFormat),
        tiny_tree.view(NewickFormat))
    expected_tiny = pd.read_csv(
        self.get_data_path('taxonomy_real_data_tiny_otus.tsv'),
        index_col=0, sep="\t")
    expected_tiny = expected_tiny.fillna("")
    assert_frame_equal(observed_tiny, expected_tiny)

    # small insertion tree
    small_tree = Artifact.load(self.get_data_path('sepp_tree_small.qza'))
    observed_small = classify_otus_experimental(
        repseqs.view(DNASequencesDirectoryFormat),
        small_tree.view(NewickFormat))
    expected_small = pd.read_csv(
        self.get_data_path('taxonomy_real_data_small_otus.tsv'),
        index_col=0, sep="\t")
    expected_small = expected_small.fillna("")
    assert_frame_equal(observed_small, expected_small)

    # passing the tiny reference phylogeny as the tree must be rejected
    tiny_refphylo = Artifact.load(
        self.get_data_path('reference_phylogeny_tiny.qza'))
    tiny_refphylo_view = tiny_refphylo.view(NewickFormat)
    with self.assertRaises(ValueError):
        classify_otus_experimental(
            repseqs.view(DNASequencesDirectoryFormat),
            tiny_refphylo_view)

    # test that missing taxon mappings result in an error
    incomplete_taxonomy = Artifact.load(
        self.get_data_path('taxonomy_missingotus.qza'))
    # capture stderr message and check if its content is as expected
    stderr_buffer = StringIO()
    with redirect_stderr(stderr_buffer), self.assertRaises(ValueError):
        classify_otus_experimental(
            repseqs.view(DNASequencesDirectoryFormat),
            tiny_tree.view(NewickFormat),
            reference_taxonomy=incomplete_taxonomy.view(pd.DataFrame))
    self.assertIn('The taxonomy artifact you provided does not cont',
                  stderr_buffer.getvalue())
    self.assertIn('539572', stderr_buffer.getvalue())
def test_eq_same_uuid(self):
    # An artifact and the copy loaded back from its saved archive must
    # compare equal.
    archive_fp = os.path.join(self.test_dir.name, 'artifact.qza')

    original = Artifact.import_data(FourInts, [-1, 42, 0, 43])
    original.save(archive_fp)
    reloaded = Artifact.load(archive_fp)

    self.assertEqual(original, reloaded)
def test_roundtrip(self):
    # save -> load -> save -> load: the two loaded artifacts must agree on
    # type, format, uuid and their list view.
    fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
    fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')

    Artifact.import_data(FourInts, [-1, 42, 0, 43]).save(fp1)
    artifact1 = Artifact.load(fp1)
    artifact1.save(fp2)
    artifact2 = Artifact.load(fp2)

    for attribute in ('type', 'format', 'uuid'):
        self.assertEqual(getattr(artifact1, attribute),
                         getattr(artifact2, attribute))

    # the view is compared twice to make sure multiple views can be taken
    for _ in range(2):
        self.assertEqual(artifact1.view(list), artifact2.view(list))
def test_load(self):
    # A saved FourInts artifact must load back with the same type, uuid and
    # list view.
    archive_fp = os.path.join(self.test_dir.name, 'artifact.qza')
    saved_artifact = Artifact.import_data(FourInts, [-1, 42, 0, 43])
    saved_artifact.save(archive_fp)

    loaded = Artifact.load(archive_fp)

    self.assertEqual(loaded.type, FourInts)
    self.assertEqual(loaded.uuid, saved_artifact.uuid)
    # The list view is requested twice on purpose.
    for _ in range(2):
        self.assertEqual(loaded.view(list), [-1, 42, 0, 43])
def test_exercise_sepp(self):
    # Runs sepp with the 'small' reference alignment/phylogeny and checks
    # that every input sequence id shows up as a tip of the resulting tree.
    repseqs = Artifact.load(self.get_data_path('real_data.qza'))
    repseqs_view = repseqs.view(DNASequencesDirectoryFormat)

    refphylo = Artifact.load(
        self.get_data_path('reference_phylogeny_small.qza'))
    refphylo_view = refphylo.view(NewickFormat)

    refaln = Artifact.load(
        self.get_data_path('reference_alignment_small.qza'))
    refaln_view = refaln.view(AlignedDNASequencesDirectoryFormat)

    # Only the tree is inspected; the placements output is not used here.
    obs_tree, _ = sepp(repseqs_view,
                       reference_alignment=refaln_view,
                       reference_phylogeny=refphylo_view)

    inserted_tree = skbio.TreeNode.read(str(obs_tree))
    tip_names = {tip.name for tip in inserted_tree.tips()}
    sequence_ids = {
        record.metadata['id'] for record in repseqs.view(DNAIterator)}

    for sequence_id in sequence_ids:
        self.assertIn(sequence_id, tip_names)
def test_q_score_numeric_ids(self):
    # Sample ids that look like numbers ('00123', '0.4560') must survive
    # q_score unchanged, both in the output manifest and in the stats index.
    ar = Artifact.load(self.get_data_path('numeric_ids.qza'))
    view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    exp_sids = {'00123', '0.4560'}

    obs, stats = q_score(view, min_quality=20)

    manifest_fmt = obs.manifest.view(obs.manifest.format)
    # Close the manifest handle deterministically instead of leaving it to
    # the garbage collector. dtype=str keeps pandas from turning the ids
    # into numbers.
    with manifest_fmt.open() as manifest_fh:
        obs_manifest = pd.read_csv(manifest_fh, dtype=str, comment='#')
    obs_manifest = obs_manifest.set_index('sample-id')

    obs_sids = set(obs_manifest.index)
    self.assertEqual(obs_sids, exp_sids)
    self.assertEqual(set(stats.index), exp_sids)
def test_ne_subclass_same_uuid(self):
    # An instance of an Artifact subclass must not compare equal to a plain
    # Artifact loaded from the archive it saved, in either operand order.
    class ArtifactSubclass(Artifact):
        pass

    archive_fp = os.path.join(self.test_dir.name, 'artifact.qza')

    subclass_artifact = ArtifactSubclass.import_data(FourInts,
                                                     [-1, 42, 0, 43])
    subclass_artifact.save(archive_fp)
    plain_artifact = Artifact.load(archive_fp)

    self.assertNotEqual(subclass_artifact, plain_artifact)
    self.assertNotEqual(plain_artifact, subclass_artifact)
def test_q_score(self):
    # Runs the 'q_score' plugin method on 'simple.qza' with two parameter
    # sets and compares the retained reads and the per-sample statistics
    # against hand-written expectations.
    ar = Artifact.load(self.get_data_path('simple.qza'))
    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']

    # --- first parameter set: quality_window=2, min_quality=20 ---
    with redirected_stdio(stdout=os.devnull):
        obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score'](
            ar, quality_window=2, min_quality=20, min_length_fraction=0.25)
    obs_drop_ambig = obs_drop_ambig_ar.view(
        SingleLanePerSampleSingleEndFastqDirFmt)
    stats = stats_ar.view(pd.DataFrame)

    exp_drop_ambig = ["@foo_1",
                      "ATGCATGC",
                      "+",
                      "DDDDBBDD"]
    exp_drop_ambig_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                         ('bar', 1., 0., 0., 0., 1.)],
                                        columns=columns)
    exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')

    obs = []
    for _, fp in obs_drop_ambig.sequences.iter_views(FastqGzFormat):
        # Use a context manager so every gzip handle is closed again.
        with gzip.open(str(fp), 'rt') as fastq_fh:
            obs.extend(line.strip() for line in fastq_fh)
    self.assertEqual(obs, exp_drop_ambig)
    pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

    # --- second parameter set: quality_window=1, min_quality=33 ---
    with redirected_stdio(stdout=os.devnull):
        obs_trunc_ar, stats_ar = self.plugin.methods['q_score'](
            ar, quality_window=1, min_quality=33, min_length_fraction=0.25)
    obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    stats = stats_ar.view(pd.DataFrame)

    exp_trunc = ["@foo_1",
                 "ATGCATGC",
                 "+",
                 "DDDDBBDD",
                 "@bar_1",
                 "ATA",
                 "+",
                 "DDD"]
    exp_trunc_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                    ('bar', 1., 1., 1., 0., 0.)],
                                   columns=columns)
    exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

    obs = []
    for _, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
        with gzip.open(str(fp), 'rt') as fastq_fh:
            obs.extend(line.strip() for line in fastq_fh)
    # This comparison is made on sorted lines, i.e. independent of the
    # order in which the per-sample files are visited.
    self.assertEqual(sorted(obs), sorted(exp_trunc))
    pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
def test_refmismatch(self):
    # sepp must raise a ValueError when only one of the two references is
    # given, and when the 'small' alignment is combined with the 'tiny'
    # phylogeny.
    small_phylo_artifact = Artifact.load(
        self.get_data_path('reference_phylogeny_small.qza'))
    small_phylo = small_phylo_artifact.view(NewickFormat)

    small_aln_artifact = Artifact.load(
        self.get_data_path('reference_alignment_small.qza'))
    small_aln = small_aln_artifact.view(AlignedDNASequencesDirectoryFormat)

    # phylogeny without alignment
    with self.assertRaises(ValueError):
        sepp(None, reference_phylogeny=small_phylo)

    # alignment without phylogeny
    with self.assertRaises(ValueError):
        sepp(None, reference_alignment=small_aln)

    tiny_phylo_artifact = Artifact.load(
        self.get_data_path('reference_phylogeny_tiny.qza'))
    tiny_phylo = tiny_phylo_artifact.view(NewickFormat)

    # 'small' alignment together with 'tiny' phylogeny
    with self.assertRaises(ValueError):
        sepp(None,
             reference_alignment=small_aln,
             reference_phylogeny=tiny_phylo)
def test_filter_features(self):
    # filter_features splits the 'counts_reject' table into two tables;
    # check their totals, sample ids and feature ids.
    tree_artifact = Artifact.load(self.get_data_path('tree_reject.qza'))
    table_artifact = Artifact.load(
        self.get_data_path('counts_reject.biom.qza'))

    tbl_positive, tbl_negative = filter_features(
        table=table_artifact.view(biom.Table),
        tree=tree_artifact.view(NewickFormat))

    self.assertEqual(tbl_positive.sum(), 715)
    self.assertEqual(tbl_negative.sum(), 133)

    # An empty symmetric difference means the two id sets are identical.
    no_difference = set()

    exp_sample_ids = {'sample_a', 'sample_b', 'sample_c', 'sample_d'}
    self.assertEqual(
        set(tbl_positive.ids()).symmetric_difference(exp_sample_ids),
        no_difference)
    self.assertEqual(
        set(tbl_negative.ids()).symmetric_difference(exp_sample_ids),
        no_difference)

    exp_pos_feature_ids = {
        'testseqa', 'testseqb', 'testseqc', 'testseqd', 'testseqe',
        'testseqf', 'testseqg', 'testseqh', 'testseqi', 'testseqj'}
    obs_pos_feature_ids = set(tbl_positive.ids(axis='observation'))
    self.assertEqual(
        obs_pos_feature_ids.symmetric_difference(exp_pos_feature_ids),
        no_difference)

    exp_neg_feature_ids = {'testseq_reject_1', 'testseq_reject_2'}
    obs_neg_feature_ids = set(tbl_negative.ids(axis='observation'))
    self.assertEqual(
        obs_neg_feature_ids.symmetric_difference(exp_neg_feature_ids),
        no_difference)
def test_q_score_numeric_ids(self):
    # Sample ids that look like numbers ('00123', '0.4560') must survive the
    # 'q_score' plugin method unchanged, both in the output manifest and in
    # the stats index.
    ar = Artifact.load(self.get_data_path('numeric_ids.qza'))
    exp_sids = {'00123', '0.4560'}

    with redirected_stdio(stdout=os.devnull):
        obs_ar, stats_ar = self.plugin.methods['q_score'](
            ar, min_quality=20)
    obs = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    stats = stats_ar.view(pd.DataFrame)

    manifest_fmt = obs.manifest.view(obs.manifest.format)
    # Close the manifest handle deterministically instead of leaving it to
    # the garbage collector. dtype=str keeps pandas from turning the ids
    # into numbers.
    with manifest_fmt.open() as manifest_fh:
        obs_manifest = pd.read_csv(manifest_fh, dtype=str, comment='#')
    obs_manifest = obs_manifest.set_index('sample-id')

    obs_sids = set(obs_manifest.index)
    self.assertEqual(obs_sids, exp_sids)
    self.assertEqual(set(stats.index), exp_sids)
def test_load_and_save(self):
    # A loaded artifact can be saved over its own source file as well as to
    # a new file; both archives must contain the same set of members.
    fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
    fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')

    Artifact.import_data(FourInts, [-1, 42, 0, 43]).save(fp1)
    loaded = Artifact.load(fp1)

    # Overwriting its source file works.
    loaded.save(fp1)
    # Saving to a new file works.
    loaded.save(fp2)

    expected_members = (
        'VERSION',
        'checksums.md5',
        'metadata.yaml',
        'data/file1.txt',
        'data/file2.txt',
        'data/nested/file3.txt',
        'data/nested/file4.txt',
        'provenance/metadata.yaml',
        'provenance/VERSION',
        'provenance/citations.bib',
        'provenance/action/action.yaml',
    )
    for archive_fp in (fp1, fp2):
        # A fresh root dir string and a fresh set are built for every
        # archive that is checked.
        root_dir = str(loaded.uuid)
        self.assertArchiveMembers(archive_fp, root_dir,
                                  set(expected_members))
def test_q_score(self):
    # Calls q_score directly on 'simple.qza' with two parameter sets and
    # compares the retained reads and the per-sample statistics against
    # hand-written expectations.
    ar = Artifact.load(self.get_data_path('simple.qza'))
    view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']

    # --- first parameter set: quality_window=2 ---
    obs_drop_ambig, stats = q_score(view, quality_window=2,
                                    min_length_fraction=0.25)

    exp_drop_ambig = ["@foo_1",
                      "ATGCATGC",
                      "+",
                      "DDDDBBDD"]
    exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                         ('bar', 1, 0, 0, 0, 1)],
                                        columns=columns)
    exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')

    obs = []
    for _, fp in obs_drop_ambig.sequences.iter_views(FastqGzFormat):
        # Use a context manager so every gzip handle is closed again.
        with gzip.open(str(fp), 'rt') as fastq_fh:
            obs.extend(line.strip() for line in fastq_fh)
    self.assertEqual(obs, exp_drop_ambig)
    pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

    # --- second parameter set: quality_window=1, min_quality=33 ---
    obs_trunc, stats = q_score(view, quality_window=1, min_quality=33,
                               min_length_fraction=0.25)

    exp_trunc = ["@foo_1",
                 "ATGCATGC",
                 "+",
                 "DDDDBBDD",
                 "@bar_1",
                 "ATA",
                 "+",
                 "DDD"]
    exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                    ('bar', 1, 1, 1, 0, 0)],
                                   columns=columns)
    exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

    obs = []
    for _, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
        with gzip.open(str(fp), 'rt') as fastq_fh:
            obs.extend(line.strip() for line in fastq_fh)
    # This comparison is made on sorted lines, i.e. independent of the
    # order in which the per-sample files are visited.
    self.assertEqual(sorted(obs), sorted(exp_trunc))
    pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
def test_load_different_type_with_multiple_view_types(self):
    # Round-trip an IntSequence1 artifact through disk and verify its type,
    # uuid, and that both the list and the Counter view can be taken
    # repeatedly.
    saved_artifact = Artifact.import_data(IntSequence1,
                                          [42, 42, 43, -999, 42])
    target_fp = os.path.join(self.test_dir.name, 'artifact.qza')
    saved_artifact.save(target_fp)

    reloaded = Artifact.load(target_fp)

    self.assertEqual(reloaded.type, IntSequence1)
    self.assertEqual(reloaded.uuid, saved_artifact.uuid)

    repeats = 2  # each view type is requested this many times
    for _ in range(repeats):
        self.assertEqual(reloaded.view(list), [42, 42, 43, -999, 42])
    for _ in range(repeats):
        counted = reloaded.view(collections.Counter)
        self.assertEqual(counted,
                         collections.Counter({42: 3, 43: 1, -999: 1}))
def export_artifact(uuid):
    """Export the data of the artifact registered under ``uuid``.

    The destination is taken from the ``path`` entry of the request's JSON
    body. The artifact file looked up in ``ARTIFACTS`` is loaded, its data
    is exported to that destination, and the destination is echoed back as
    a JSON object of the form ``{'path': ...}``.

    NOTE(review): a request without a JSON body or without a ``path`` entry
    is not handled explicitly here -- confirm that callers always send one.
    """
    payload = request.get_json()
    destination = payload.get('path')

    artifact = Artifact.load(ARTIFACTS[uuid])
    artifact.export_data(destination)

    return jsonify({'path': destination})
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame = None) -> pd.DataFrame:
    """Assign a lineage to every fragment that was inserted into ``tree``.

    For each fragment of ``representative_sequences`` that is found in the
    insertion tree, the tree is climbed from the fragment's node towards the
    root until a sub-tree contains at least one node whose name has an entry
    in ``reference_taxonomy``. The fragment's lineage is the rank-wise
    longest common prefix of the lineages of all such nodes.

    Parameters
    ----------
    representative_sequences : DNASequencesDirectoryFormat
        The fragments to classify; their ``metadata['id']`` is looked up in
        the tree.
    tree : NewickFormat
        The insertion tree, read via ``skbio.TreeNode.read``.
    reference_taxonomy : pd.DataFrame, optional
        Table indexed by feature ID with a 'Taxon' column holding
        ';'-separated lineage strings. If None, 'taxonomy_gg99.qza' from
        the SEPP reference directory is loaded. The passed DataFrame is not
        modified.

    Returns
    -------
    pd.DataFrame
        Indexed by 'Feature ID' with one column 'Taxon'. 'Taxon' is NaN for
        fragments for which no labelled node was found up to the root.
        Fragments that are not part of the tree are omitted.

    Raises
    ------
    ValueError
        If a tip of the tree is neither a fragment nor listed in
        ``reference_taxonomy`` (details are written to stderr), or if no
        lineage could be determined for any fragment.
    """
    if reference_taxonomy is None:
        filename_default_taxonomy = os.path.join(_sepp_refs_path(),
                                                 'taxonomy_gg99.qza')
        reference_taxonomy = Artifact.load(filename_default_taxonomy).view(
            pd.DataFrame)

    # Convert the feature IDs to str (depending on pandas type inference
    # they might come as integers) so that they have the same type as the
    # node names in the tree. This is done on a copy: re-labelling the
    # index in place would silently alter the caller's DataFrame.
    reference_taxonomy = reference_taxonomy.copy()
    reference_taxonomy.index = [
        str(feature_id) for feature_id in reference_taxonomy.index]
    # Built once; used for the consistency check and for every membership
    # test during the tree traversal below.
    reference_ids = set(reference_taxonomy.index)

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # Ensure that all reference tips in the tree (those that are not
    # inserted fragments) have a mapping in the taxonomy table.
    names_tips = {node.name for node in tree.tips()}
    names_fragments = {
        fragment.metadata['id']
        for fragment in representative_sequences.file.view(DNAIterator)}
    missing_features = (names_tips - names_fragments) - reference_ids
    if missing_features:
        # QIIME2 users can run with --verbose and see stderr and stdout.
        # Thus, we here report more details about the mismatch. The IDs are
        # sorted to make the report reproducible between runs.
        sys.stderr.write(
            ("The taxonomy artifact you provided does not contain lineage "
             "information for the following %i features:\n%s") %
            (len(missing_features), "\n".join(sorted(missing_features))))
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy.")

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        fragment_id = fragment.metadata['id']

        # First, check if the fragment has been inserted at all. `find`
        # raises for unknown names, so no separate None check is needed.
        try:
            curr_node = tree.find(fragment_id)
        except skbio.tree.MissingNodeError:
            continue

        # Starting at the inserted node, move as few levels as possible
        # towards the root and collect, per level, all nodes of the
        # sub-tree that have a taxonomy mapping.
        #
        # SEPP insertion - especially of multiple very similar sequences -
        # can result in a rather complex topology change if all those
        # sequences are inserted into the same branch leading to one
        # OTU-tip. Thus, it is not sufficient to look at siblings or direct
        # descendants only; the whole sub-tree is traversed.
        found_otus = []
        while not found_otus:
            found_otus = [
                node.name for node in curr_node.postorder()
                if node.name is not None and node.name in reference_ids]
            # Stop if the whole tree has been searched without success,
            # e.g. if the taxonomy contains no matching OTU-IDs at all.
            if curr_node.is_root():
                break
            # prepare next iteration by moving up to the parent node
            curr_node = curr_node.parent

        lineage_str = np.nan
        if found_otus:
            # With exactly one OTU the result is simply its lineage. If the
            # fragment cannot be placed unambiguously, several OTUs are
            # found whose lineages may agree only up to a certain rank.
            # Thus, the longest common prefix is computed - per taxonomic
            # rank, not per character - which is why every lineage string
            # is first split into a list of ranks.
            split_lineages = [
                [rank.strip()
                 for rank in reference_taxonomy.loc[otu, 'Taxon'].split(';')]
                for otu in found_otus]
            # os.path.commonprefix compares element-wise and therefore
            # also works on lists of ranks.
            lineage_str = "; ".join(os.path.commonprefix(split_lineages))

        taxonomy.append({'Feature ID': fragment_id,
                         'Taxon': lineage_str})

    pd_taxonomy = pd.DataFrame(taxonomy)
    # Fail if no fragment was found in the tree (checked first, as an empty
    # frame has no 'Taxon' column) or if no lineage could be determined.
    if (not taxonomy) or pd_taxonomy['Taxon'].dropna().empty:
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
def load_artifacts(**kwargs):
    """Load the artifacts referenced by the given keyword arguments.

    Every keyword value is used as a key into ``ARTIFACTS``; the entry found
    there is passed to ``Artifact.load``. Keywords whose value is the empty
    string are skipped.

    Returns a dict mapping each remaining keyword name to its loaded
    artifact.
    """
    loaded = {}
    for name, artifact_key in kwargs.items():
        if artifact_key != '':
            loaded[name] = Artifact.load(ARTIFACTS[artifact_key])
    return loaded
def test_q_score_all_dropped(self):
    # With min_quality=50 the 'q_score' plugin method is expected to fail
    # with a ValueError whose message matches "filtered out".
    artifact = Artifact.load(self.get_data_path('simple.qza'))
    q_score_method = self.plugin.methods['q_score']

    with self.assertRaisesRegex(ValueError, "filtered out"), \
            redirected_stdio(stdout=os.devnull):
        q_score_method(artifact, min_quality=50)
def test_q_score_real_joined(self):
    # Runs q_score_joined on 'real_data_joined.qza' and compares the
    # retained reads and the per-sample statistics against hand-curated
    # expectations.
    ar = Artifact.load(self.get_data_path('real_data_joined.qza'))
    view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
    obs_result, stats = q_score_joined(
        view, min_quality=40, min_length_fraction=0.24)

    # All input reads are represented here in their post-quality filtered
    # form. Reads that are commented out were manually identified as being
    # filtered by the q_score method. For the commented reads, the comments
    # denote why the read is not retained.

    # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of
    # total read length and is indicative of a sequence at the
    # min_length_fraction boundary.
    exp_result = [
        "@HWI-EAS440_0386:1:32:15467:1432#0/1",
        "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
        "+",
        "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh",

        # too short
        # "@HWI-EAS440_0386:1:36:9986:17043#0/1",
        # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG",
        # "+",
        # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh",

        "@HWI-EAS440_0386:1:37:13343:14820#0/1",
        "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT"
        "GGATGTTTAAGTCAGTTGTG",
        "+",
        "hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh"
        "hgdhhhhghghhhdhhhhgh",

        "@HWI-EAS440_0386:1:41:18215:15404#0/1",
        "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA",
        "+",
        "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh",

        # too short
        # "@HWI-EAS440_0386:1:42:5423:19606#0/1",
        # "TACGTAGGGAGCAAGCGTT",
        # "+",
        # "hhhhghhhhhhhhhghhfh",

        "@HWI-EAS440_0386:1:52:7507:5841#0/1",
        "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
        "+",
        "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh",

        "@HWI-EAS440_0386:1:53:18599:4074#0/1",
        "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG",
        "+",
        "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh",

        # too short
        # "@HWI-EAS440_0386:1:55:16425:9514#0/1",
        # "TACGGAGGATCCGAGCGTTATCCGGATT",
        # "+",
        # "hhhhhhhhhhhhfghhhghghhhhhbgh",

        "@HWI-EAS440_0386:1:65:12049:5619#0/1",
        "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG",
        "+",
        "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh",

        # @HWI-EAS440_0386:1:95:4837:16388#0/1
        # starts off < Q40
    ]

    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']
    exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)], columns=columns)
    exp_stats = exp_stats.set_index('sample-id')

    obs = []
    for _, fp in obs_result.sequences.iter_views(FastqGzFormat):
        # Use a context manager so every gzip handle is closed again.
        with gzip.open(str(fp), 'rt') as fastq_fh:
            obs.extend(line.strip() for line in fastq_fh)

    self.assertEqual(obs, exp_result)
    pdt.assert_frame_equal(stats, exp_stats.loc[stats.index])
def test_q_score_all_dropped(self):
    # With min_quality=50, q_score is expected to fail with a ValueError
    # whose message matches "filtered out".
    artifact = Artifact.load(self.get_data_path('simple.qza'))
    demux_view = artifact.view(SingleLanePerSampleSingleEndFastqDirFmt)

    expected_failure = self.assertRaisesRegex(ValueError, "filtered out")
    with expected_failure:
        q_score(demux_view, min_quality=50)
def load_artifacts(**kwargs):
    """Map keyword names to the artifacts their values refer to.

    Each keyword value is looked up in ``ARTIFACTS`` and the result is
    handed to ``Artifact.load``. Keywords given as the empty string are
    left out of the returned dict.
    """
    requested = (
        (name, artifact_key)
        for name, artifact_key in kwargs.items()
        if artifact_key != ''
    )
    return dict(
        (name, Artifact.load(ARTIFACTS[artifact_key]))
        for name, artifact_key in requested
    )