Example #1
    def test_load_with_archive_filepath_modified(self):
        # Save an artifact for use in the following test case.
        fp = os.path.join(self.test_dir.name, 'artifact.qza')
        Artifact.import_data(FourInts, [-1, 42, 0, 43]).save(fp)

        # Load the artifact from a filepath then save a different artifact to
        # the same filepath. Assert that both artifacts produce the correct
        # views of their data.
        #
        # `load` used to be lazy, only extracting data when it needed to (e.g.
        # when `save` or `view` was called). This was buggy as the filepath
        # could have been deleted, or worse, modified to contain a different
        # .qza file. Thus, the wrong archive could be extracted on demand, or
        # the archive could be missing altogether. There isn't an easy
        # cross-platform compatible way to solve this problem, so Artifact.load
        # is no longer lazy and always extracts its data immediately. The real
        # motivation for lazy loading was for quick inspection of archives
        # without extracting/copying data, so that API is now provided through
        # Artifact.peek.
        artifact1 = Artifact.load(fp)
        Artifact.import_data(FourInts, [10, 11, 12, 13]).save(fp)
        artifact2 = Artifact.load(fp)

        self.assertEqual(artifact1.view(list), [-1, 42, 0, 43])
        self.assertEqual(artifact2.view(list), [10, 11, 12, 13])
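
The comment above notes that quick, extraction-free inspection now lives in
Artifact.peek. Below is a minimal sketch of that inspection API, assuming it
returns QIIME 2's ResultMetadata named tuple with uuid, type, and format
fields:

    # Hedged sketch: peek reads only the archive metadata, so nothing is
    # extracted to disk; field names assume the ResultMetadata named tuple.
    from qiime2 import Artifact

    info = Artifact.peek('artifact.qza')
    print(info.uuid)    # the artifact's UUID, as a string
    print(info.type)    # e.g. 'FourInts'
    print(info.format)  # the directory format the data is stored in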
Example #2
    def test_filter_features_nooverlap(self):
        ar_tree = Artifact.load(self.get_data_path('tree_reject.qza'))
        ar_table = Artifact.load(
            self.get_data_path('counts_nooverlap.biom.qza'))

        with self.assertRaises(ValueError):
            tbl_positive, tbl_negative = filter_features(
                table=ar_table.view(biom.Table),
                tree=ar_tree.view(NewickFormat))
Example #3
    def test_load_different_type_with_multiple_view_types(self):
        saved_artifact = Artifact.import_data(IntSequence1,
                                              [42, 42, 43, -999, 42])
        fp = os.path.join(self.test_dir.name, 'artifact.qza')
        saved_artifact.save(fp)

        artifact = Artifact.load(fp)

        self.assertEqual(artifact.type, IntSequence1)
        self.assertEqual(artifact.uuid, saved_artifact.uuid)

        self.assertEqual(artifact.view(list), [42, 42, 43, -999, 42])
        self.assertEqual(artifact.view(list), [42, 42, 43, -999, 42])

        self.assertEqual(artifact.view(collections.Counter),
                         collections.Counter({
                             42: 3,
                             43: 1,
                             -999: 1
                         }))
        self.assertEqual(artifact.view(collections.Counter),
                         collections.Counter({
                             42: 3,
                             43: 1,
                             -999: 1
                         }))
Example #4
    def test_load_and_save(self):
        fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
        fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')
        artifact = Artifact.import_data(FourInts, [-1, 42, 0, 43])
        artifact.save(fp1)

        artifact = Artifact.load(fp1)
        # Overwriting its source file works.
        artifact.save(fp1)
        # Saving to a new file works.
        artifact.save(fp2)

        root_dir = str(artifact.uuid)
        expected = {
            'VERSION', 'metadata.yaml', 'data/file1.txt', 'data/file2.txt',
            'data/nested/file3.txt', 'data/nested/file4.txt',
            'provenance/metadata.yaml', 'provenance/VERSION',
            'provenance/action/action.yaml'
        }

        self.assertArchiveMembers(fp1, root_dir, expected)

        root_dir = str(artifact.uuid)
        expected = {
            'VERSION', 'metadata.yaml', 'data/file1.txt', 'data/file2.txt',
            'data/nested/file3.txt', 'data/nested/file4.txt',
            'provenance/metadata.yaml', 'provenance/VERSION',
            'provenance/action/action.yaml'
        }

        self.assertArchiveMembers(fp2, root_dir, expected)
Example #5
    def test_classify_otus_experimental(self):
        ar_tree = Artifact.load(self.get_data_path('sepp_tree_tiny.qza'))
        ar_repseq = Artifact.load(self.get_data_path('real_data.qza'))

        obs_classification = classify_otus_experimental(
            ar_repseq.view(DNASequencesDirectoryFormat),
            ar_tree.view(NewickFormat))
        exp_classification = pd.read_csv(
            self.get_data_path('taxonomy_real_data_tiny_otus.tsv'),
            index_col=0,
            sep="\t").fillna("")
        assert_frame_equal(obs_classification, exp_classification)

        ar_tree_small = Artifact.load(
            self.get_data_path('sepp_tree_small.qza'))
        obs_classification_small = classify_otus_experimental(
            ar_repseq.view(DNASequencesDirectoryFormat),
            ar_tree_small.view(NewickFormat))

        exp_classification_small = pd.read_csv(
            self.get_data_path('taxonomy_real_data_small_otus.tsv'),
            index_col=0,
            sep="\t").fillna("")
        assert_frame_equal(obs_classification_small, exp_classification_small)

        ar_refphylo_tiny = Artifact.load(
            self.get_data_path('reference_phylogeny_tiny.qza'))
        ref_phylo_tiny = ar_refphylo_tiny.view(NewickFormat)
        with self.assertRaises(ValueError):
            classify_otus_experimental(
                ar_repseq.view(DNASequencesDirectoryFormat), ref_phylo_tiny)

        # test that missing taxon mappings result in an error
        ar_taxonomy = Artifact.load(
            self.get_data_path('taxonomy_missingotus.qza'))

        # capture stderr message and check if its content is as expected
        captured_stderr = StringIO()
        with redirect_stderr(captured_stderr):
            with self.assertRaises(ValueError):
                classify_otus_experimental(
                    ar_repseq.view(DNASequencesDirectoryFormat),
                    ar_tree.view(NewickFormat),
                    reference_taxonomy=ar_taxonomy.view(pd.DataFrame))
        self.assertIn('The taxonomy artifact you provided does not cont',
                      captured_stderr.getvalue())
        self.assertIn('539572', captured_stderr.getvalue())
Example #6
    def test_eq_same_uuid(self):
        fp = os.path.join(self.test_dir.name, 'artifact.qza')
        artifact1 = Artifact.import_data(FourInts, [-1, 42, 0, 43])
        artifact1.save(fp)

        artifact2 = Artifact.load(fp)

        self.assertEqual(artifact1, artifact2)
Example #7
    def test_roundtrip(self):
        fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
        fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')
        artifact = Artifact.import_data(FourInts, [-1, 42, 0, 43])

        artifact.save(fp1)

        artifact1 = Artifact.load(fp1)
        artifact1.save(fp2)
        artifact2 = Artifact.load(fp2)

        self.assertEqual(artifact1.type, artifact2.type)
        self.assertEqual(artifact1.format, artifact2.format)
        self.assertEqual(artifact1.uuid, artifact2.uuid)
        self.assertEqual(artifact1.view(list), artifact2.view(list))
        # double view to make sure multiple views can be taken
        self.assertEqual(artifact1.view(list), artifact2.view(list))
Example #8
    def test_load(self):
        saved_artifact = Artifact.import_data(FourInts, [-1, 42, 0, 43])
        fp = os.path.join(self.test_dir.name, 'artifact.qza')
        saved_artifact.save(fp)

        artifact = Artifact.load(fp)

        self.assertEqual(artifact.type, FourInts)
        self.assertEqual(artifact.uuid, saved_artifact.uuid)
        self.assertEqual(artifact.view(list), [-1, 42, 0, 43])
        self.assertEqual(artifact.view(list), [-1, 42, 0, 43])
Example #9
    def test_exercise_sepp(self):
        ar = Artifact.load(self.get_data_path('real_data.qza'))
        view = ar.view(DNASequencesDirectoryFormat)

        ar_refphylo = Artifact.load(
            self.get_data_path('reference_phylogeny_small.qza'))
        ref_phylo_small = ar_refphylo.view(NewickFormat)

        ar_refaln = Artifact.load(
            self.get_data_path('reference_alignment_small.qza'))
        ref_aln_small = ar_refaln.view(AlignedDNASequencesDirectoryFormat)

        obs_tree, obs_placements = sepp(view,
                                        reference_alignment=ref_aln_small,
                                        reference_phylogeny=ref_phylo_small)

        tree = skbio.TreeNode.read(str(obs_tree))
        obs = {n.name for n in tree.tips()}
        seqs = {r.metadata['id'] for r in ar.view(DNAIterator)}
        for seq in seqs:
            self.assertIn(seq, obs)
Example #10
    def test_q_score_numeric_ids(self):
        ar = Artifact.load(self.get_data_path('numeric_ids.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        exp_sids = {'00123', '0.4560'}
        obs, stats = q_score(view, min_quality=20)
        obs_manifest = obs.manifest.view(obs.manifest.format)
        # dtype=str keeps sample IDs like '00123' from being coerced to ints
        obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#')
        obs_manifest.set_index('sample-id', inplace=True)

        obs_sids = set(obs_manifest.index)
        self.assertEqual(obs_sids, exp_sids)
        self.assertEqual(set(stats.index), exp_sids)
Example #11
    def test_ne_subclass_same_uuid(self):
        class ArtifactSubclass(Artifact):
            pass

        fp = os.path.join(self.test_dir.name, 'artifact.qza')
        artifact1 = ArtifactSubclass.import_data(FourInts, [-1, 42, 0, 43])
        artifact1.save(fp)

        artifact2 = Artifact.load(fp)

        self.assertNotEqual(artifact1, artifact2)
        self.assertNotEqual(artifact2, artifact1)
Example #12
    def test_q_score(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))
        with redirected_stdio(stdout=os.devnull):
            obs_drop_ambig_ar, stats_ar = self.plugin.methods['q_score'](
                ar, quality_window=2, min_quality=20, min_length_fraction=0.25)
        obs_drop_ambig = obs_drop_ambig_ar.view(
            SingleLanePerSampleSingleEndFastqDirFmt)
        stats = stats_ar.view(pd.DataFrame)

        exp_drop_ambig = ["@foo_1",
                          "ATGCATGC",
                          "+",
                          "DDDDBBDD"]
        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_drop_ambig_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                             ('bar', 1., 0., 0., 0., 1.)],
                                            columns=columns)
        exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')
        obs = []
        iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')])
        self.assertEqual(obs, exp_drop_ambig)
        pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

        with redirected_stdio(stdout=os.devnull):
            obs_trunc_ar, stats_ar = self.plugin.methods['q_score'](
                ar, quality_window=1, min_quality=33, min_length_fraction=0.25)
        obs_trunc = obs_trunc_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        stats = stats_ar.view(pd.DataFrame)

        exp_trunc = ["@foo_1",
                     "ATGCATGC",
                     "+",
                     "DDDDBBDD",
                     "@bar_1",
                     "ATA",
                     "+",
                     "DDD"]
        exp_trunc_stats = pd.DataFrame([('foo', 2., 1., 0., 0., 1.),
                                        ('bar', 1., 1., 1., 0., 0.)],
                                       columns=columns)
        exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

        obs = []
        for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
            obs.extend([x.strip() for x in gzip.open(str(fp), 'rt')])
        self.assertEqual(sorted(obs), sorted(exp_trunc))
        pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
Example #13
    def test_refmismatch(self):
        ar_refphylo = Artifact.load(self.get_data_path(
            'reference_phylogeny_small.qza'))
        ref_phylo_small = ar_refphylo.view(NewickFormat)

        ar_refaln = Artifact.load(self.get_data_path(
            'reference_alignment_small.qza'))
        ref_aln_small = ar_refaln.view(AlignedDNASequencesDirectoryFormat)

        with self.assertRaises(ValueError):
            sepp(None, reference_phylogeny=ref_phylo_small)

        with self.assertRaises(ValueError):
            sepp(None, reference_alignment=ref_aln_small)

        ar_refphylo_tiny = Artifact.load(self.get_data_path(
            'reference_phylogeny_tiny.qza'))
        ref_phylo_tiny = ar_refphylo_tiny.view(NewickFormat)

        with self.assertRaises(ValueError):
            sepp(None, reference_alignment=ref_aln_small,
                 reference_phylogeny=ref_phylo_tiny)
Example #14
    def test_filter_features(self):
        ar_tree = Artifact.load(self.get_data_path('tree_reject.qza'))
        ar_table = Artifact.load(self.get_data_path('counts_reject.biom.qza'))

        tbl_positive, tbl_negative = filter_features(
            table=ar_table.view(biom.Table),
            tree=ar_tree.view(NewickFormat)
        )
        self.assertEqual(tbl_positive.sum(), 715)
        self.assertEqual(tbl_negative.sum(), 133)

        exp_sample_ids = set(['sample_a', 'sample_b', 'sample_c', 'sample_d'])
        self.assertEqual(set(tbl_positive.ids()) ^ exp_sample_ids, set())
        self.assertEqual(set(tbl_negative.ids()) ^ exp_sample_ids, set())

        exp_pos_feature_ids = set([
            'testseqa', 'testseqb', 'testseqc', 'testseqd', 'testseqe',
            'testseqf', 'testseqg', 'testseqh', 'testseqi', 'testseqj'])
        self.assertEqual(set(tbl_positive.ids(
            axis='observation')) ^ exp_pos_feature_ids, set())
        exp_neg_feature_ids = set(['testseq_reject_1', 'testseq_reject_2'])
        self.assertEqual(set(tbl_negative.ids(
            axis='observation')) ^ exp_neg_feature_ids, set())
Example #15
    def test_q_score_numeric_ids(self):
        ar = Artifact.load(self.get_data_path('numeric_ids.qza'))
        exp_sids = {'00123', '0.4560'}

        with redirected_stdio(stdout=os.devnull):
            obs_ar, stats_ar = self.plugin.methods['q_score'](
                ar, min_quality=20)
        obs = obs_ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        stats = stats_ar.view(pd.DataFrame)
        obs_manifest = obs.manifest.view(obs.manifest.format)
        # dtype=str keeps sample IDs like '00123' from being coerced to ints
        obs_manifest = pd.read_csv(obs_manifest.open(), dtype=str, comment='#')
        obs_manifest.set_index('sample-id', inplace=True)

        obs_sids = set(obs_manifest.index)
        self.assertEqual(obs_sids, exp_sids)
        self.assertEqual(set(stats.index), exp_sids)
Example #16
    def test_load_and_save(self):
        fp1 = os.path.join(self.test_dir.name, 'artifact1.qza')
        fp2 = os.path.join(self.test_dir.name, 'artifact2.qza')
        artifact = Artifact.import_data(FourInts, [-1, 42, 0, 43])
        artifact.save(fp1)

        artifact = Artifact.load(fp1)
        # Overwriting its source file works.
        artifact.save(fp1)
        # Saving to a new file works.
        artifact.save(fp2)

        root_dir = str(artifact.uuid)
        expected = {
            'VERSION',
            'checksums.md5',
            'metadata.yaml',
            'data/file1.txt',
            'data/file2.txt',
            'data/nested/file3.txt',
            'data/nested/file4.txt',
            'provenance/metadata.yaml',
            'provenance/VERSION',
            'provenance/citations.bib',
            'provenance/action/action.yaml'
        }

        self.assertArchiveMembers(fp1, root_dir, expected)

        root_dir = str(artifact.uuid)
        expected = {
            'VERSION',
            'checksums.md5',
            'metadata.yaml',
            'data/file1.txt',
            'data/file2.txt',
            'data/nested/file3.txt',
            'data/nested/file4.txt',
            'provenance/metadata.yaml',
            'provenance/VERSION',
            'provenance/citations.bib',
            'provenance/action/action.yaml'
        }

        self.assertArchiveMembers(fp2, root_dir, expected)
Example #17
    def test_q_score(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        obs_drop_ambig, stats = q_score(view, quality_window=2,
                                        min_length_fraction=0.25)

        exp_drop_ambig = ["@foo_1",
                          "ATGCATGC",
                          "+",
                          "DDDDBBDD"]
        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_drop_ambig_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                             ('bar', 1, 0, 0, 0, 1)],
                                            columns=columns)
        exp_drop_ambig_stats = exp_drop_ambig_stats.set_index('sample-id')
        obs = []
        iterator = obs_drop_ambig.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            obs.extend([line.strip() for line in gzip.open(str(fp), 'rt')])
        self.assertEqual(obs, exp_drop_ambig)
        pdt.assert_frame_equal(stats, exp_drop_ambig_stats.loc[stats.index])

        obs_trunc, stats = q_score(view, quality_window=1, min_quality=33,
                                   min_length_fraction=0.25)
        exp_trunc = ["@foo_1",
                     "ATGCATGC",
                     "+",
                     "DDDDBBDD",
                     "@bar_1",
                     "ATA",
                     "+",
                     "DDD"]
        exp_trunc_stats = pd.DataFrame([('foo', 2, 1, 0, 0, 1),
                                        ('bar', 1, 1, 1, 0, 0)],
                                       columns=columns)
        exp_trunc_stats = exp_trunc_stats.set_index('sample-id')

        obs = []
        for sample_id, fp in obs_trunc.sequences.iter_views(FastqGzFormat):
            obs.extend([line.strip() for line in gzip.open(str(fp), 'rt')])
        self.assertEqual(sorted(obs), sorted(exp_trunc))
        pdt.assert_frame_equal(stats, exp_trunc_stats.loc[stats.index])
Example #18
def export_artifact(uuid):
    output = request.get_json().get('path')
    Artifact.load(ARTIFACTS[uuid]).export_data(output)
    return jsonify({'path': output})
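
This handler presupposes a Flask app and a module-level ARTIFACTS dict mapping
artifact UUIDs to .qza paths; neither is shown in the source. A hedged sketch
of how it might be wired up, with the route and registry as assumptions:

    from flask import Flask, request, jsonify
    from qiime2 import Artifact

    app = Flask(__name__)
    ARTIFACTS = {}  # assumed registry: uuid -> path to a .qza file

    # Hypothetical route; the original source does not show the URL scheme.
    @app.route('/artifacts/<uuid>/export', methods=['POST'])
    def export_artifact(uuid):
        output = request.get_json().get('path')
        Artifact.load(ARTIFACTS[uuid]).export_data(output)
        return jsonify({'path': output})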
Example #19
def classify_otus_experimental(
        representative_sequences: DNASequencesDirectoryFormat,
        tree: NewickFormat,
        reference_taxonomy: pd.DataFrame = None) -> pd.DataFrame:
    if reference_taxonomy is None:
        filename_default_taxonomy = os.path.join(_sepp_refs_path(),
                                                 'taxonomy_gg99.qza')
        reference_taxonomy = Artifact.load(filename_default_taxonomy).view(
            pd.DataFrame)

    # convert type of feature IDs to str (depending on pandas type inference
    # they might come as integers), to make sure they are of the same type as
    # in the tree.
    reference_taxonomy.index = map(str, reference_taxonomy.index)

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # ensure that all reference tips in the tree (i.e. those that are not
    # inserted fragments) have a mapping in the user-provided taxonomy table
    names_tips = {node.name for node in tree.tips()}
    names_fragments = {
        fragment.metadata['id']
        for fragment in representative_sequences.file.view(DNAIterator)
    }
    missing_features = (names_tips - names_fragments) -\
        set(reference_taxonomy.index)
    if len(missing_features) > 0:
        # QIIME 2 users can run with --verbose to see stderr and stdout,
        # so we report more detail about the mismatch here:
        sys.stderr.write(
            ("The taxonomy artifact you provided does not contain lineage "
             "information for the following %i features:\n%s") %
            (len(missing_features), "\n".join(missing_features)))
        raise ValueError("Not all OTUs in the provided insertion tree have "
                         "mappings in the provided reference taxonomy.")

    taxonomy = []
    for fragment in representative_sequences.file.view(DNAIterator):
        # for every inserted fragment we now try to find the closest OTU tip
        # in the tree and an available mapping from the OTU-ID to a lineage
        # string:
        lineage_str = np.nan
        # first, let us check if the fragment has been inserted at all ...
        try:
            curr_node = tree.find(fragment.metadata['id'])
        except skbio.tree.MissingNodeError:
            continue
        # if it has, we start from the inserted node and traverse as little
        # of the tree as possible towards the root, checking at every level
        # whether one or several OTU-tips are within the sub-tree.
        if curr_node is not None:
            foundOTUs = []
            # Traversal is stopped at a certain level, if one or more OTU-tips
            # have been found in the sub-tree OR ... (see break below)
            while len(foundOTUs) == 0:
                # SEPP insertion - especially for multiple very similar
                # sequences - can result in a rather complex topology change
                # if all those sequences are inserted into the same branch
                # leading to one OTU-tip. Thus, we cannot simply visit only
                # siblings or descendants and instead need to traverse the
                # whole sub-tree. The average case should be well behaved,
                # so this is acceptable.
                for node in curr_node.postorder():
                    if (node.name is not None) and \
                       (node.name in reference_taxonomy.index):
                        # if a suitable OTU-tip node is found AND this OTU-ID
                        # has a mapping in the user provided reference_taxonomy
                        # we store the OTU-ID in the growing result list
                        foundOTUs.append(node.name)
                # ... if the whole tree has been traversed without success,
                # e.g. if the user-provided reference_taxonomy did not
                # contain any matching OTU-IDs.
                if curr_node.is_root():
                    break
                # prepare next while iteration, by changing to the parent node
                curr_node = curr_node.parent

            if len(foundOTUs) > 0:
                # If the search above identified exactly one OTU-tip, the
                # resulting lineage string is simply the one provided by the
                # user's reference_taxonomy. However, if the inserted
                # fragment cannot be placed unambiguously into the reference
                # tree, the search will find multiple OTU-IDs, whose lineage
                # strings in the user-provided reference_taxonomy might agree
                # up to a certain rank and differ e.g. at genus and species.
                # Thus, we find the longest common prefix of all lineage
                # strings. We don't operate per character, but per taxonomic
                # rank. Therefore, we first "convert" every lineage string
                # into a list of taxa, one per rank.
                split_lineages = []
                for otu in foundOTUs:
                    # find lineage string for OTU
                    lineage = reference_taxonomy.loc[otu, 'Taxon']
                    # necessary to split lineage apart to ensure that
                    # the longest common prefix operates on atomic ranks
                    # instead of characters
                    split_lineages.append(
                        list(map(str.strip, lineage.split(';'))))
                # find the longest common prefix rank-wise and concatenate to
                # one lineage string, separated by ;
                lineage_str = "; ".join(os.path.commonprefix(split_lineages))
            taxonomy.append({
                'Feature ID': fragment.metadata['id'],
                'Taxon': lineage_str
            })
    pd_taxonomy = pd.DataFrame(taxonomy)
    # test if dataframe is completely empty, or if no lineages could be found
    if (len(taxonomy) == 0) or \
       (pd_taxonomy['Taxon'].dropna().shape[0] == 0):
        raise ValueError(
            ("None of the representative-sequences can be found in the "
             "insertion tree. Please double check that both inputs match up, "
             "i.e. are results from the same 'sepp' run."))

    return pd_taxonomy.set_index('Feature ID')
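
The longest-common-prefix step above relies on os.path.commonprefix comparing
its inputs element-wise, so splitting each lineage into a per-rank list makes
it truncate at the deepest shared rank rather than mid-word. A self-contained
illustration with made-up Greengenes-style lineages:

    import os

    # Two invented lineages that agree down to the family rank.
    lineages = [
        'k__Bacteria; p__Firmicutes; c__Bacilli; f__Streptococcaceae;'
        ' g__Streptococcus',
        'k__Bacteria; p__Firmicutes; c__Bacilli; f__Streptococcaceae;'
        ' g__Lactococcus',
    ]
    # Split into lists of ranks, as classify_otus_experimental does.
    split_lineages = [list(map(str.strip, lin.split(';'))) for lin in lineages]
    # commonprefix compares the lists item by item, not character by character.
    print('; '.join(os.path.commonprefix(split_lineages)))
    # -> k__Bacteria; p__Firmicutes; c__Bacilli; f__Streptococcaceae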
Example #20
def load_artifacts(**kwargs):
    return {
        k: Artifact.load(ARTIFACTS[v])
        for k, v in kwargs.items() if v != ''
    }
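
A usage sketch for the helper above, again assuming the ARTIFACTS registry
maps identifiers to .qza paths; empty strings mark parameters the caller did
not supply, so they are skipped rather than loaded:

    # Hypothetical registry and call; the kwarg names become the dict keys.
    ARTIFACTS = {'abc-123': 'feature-table.qza', 'def-456': 'tree.qza'}

    artifacts = load_artifacts(table='abc-123', tree='def-456', metadata='')
    # -> {'table': <Artifact>, 'tree': <Artifact>}; 'metadata' is omitted.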
Example #21
    def test_q_score_all_dropped(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))

        with self.assertRaisesRegex(ValueError, "filtered out"):
            with redirected_stdio(stdout=os.devnull):
                self.plugin.methods['q_score'](ar, min_quality=50)
Example #22
    def test_q_score_real_joined(self):
        ar = Artifact.load(self.get_data_path('real_data_joined.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)
        obs_result, stats = q_score_joined(
            view, min_quality=40, min_length_fraction=0.24)

        # All input reads are represented here in their post-quality filtered
        # form. Reads that are commented out were manually identified as being
        # filtered by the q_score method. For the commented reads, the comments
        # denote why the read is not retained.

        # The first read, @HWI-EAS440_0386:1:32:15467:1432#0/1, is 25% of
        # total read length and is indicative of a sequence at the
        # min_length_fraction boundary.
        exp_result = [
                      "@HWI-EAS440_0386:1:32:15467:1432#0/1",
                      "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
                      "+",
                      "hhhhhhhhhhhhfghhhghghghhhchhhahhhhhfhh",

                      # too short
                      # "@HWI-EAS440_0386:1:36:9986:17043#0/1",
                      # "TACGTAGGTGGCAAGCGTTATCCGGATTTATTG",
                      # "+",
                      # "hhhhhhhhhhhhhhhhhhhhhhhhhffhhghhh",

                      "@HWI-EAS440_0386:1:37:13343:14820#0/1",
                      "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGAT"
                      "GGATGTTTAAGTCAGTTGTG",
                      "+",
                      "hhhhhhhhhhhhhfhhhhhfhhhhghhhhghhhhhhhhhgghhhgghhhgghh"
                      "hgdhhhhghghhhdhhhhgh",

                      "@HWI-EAS440_0386:1:41:18215:15404#0/1",
                      "TACGTAGGTGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCATGTA",
                      "+",
                      "hhhhhhhhhhhhghhhhhhhhhhhhffhhghhhhghhghgghghhhhhgh",

                      # too short
                      # "@HWI-EAS440_0386:1:42:5423:19606#0/1",
                      # "TACGTAGGGAGCAAGCGTT",
                      # "+",
                      # "hhhhghhhhhhhhhghhfh",

                      "@HWI-EAS440_0386:1:52:7507:5841#0/1",
                      "TACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTT",
                      "+",
                      "hhhhhhhhhghhfghhhhhhhhhhgfhhhghhhghdhh",

                      "@HWI-EAS440_0386:1:53:18599:4074#0/1",
                      "TACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGTG",
                      "+",
                      "hhhhfhhhhhfhhhhhhfhffhghhfgghggghdcbh",

                      # too short
                      # "@HWI-EAS440_0386:1:55:16425:9514#0/1",
                      # "TACGGAGGATCCGAGCGTTATCCGGATT",
                      # "+",
                      # "hhhhhhhhhhhhfghhhghghhhhhbgh",

                      "@HWI-EAS440_0386:1:65:12049:5619#0/1",
                      "TACGTAGGTGGCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTG",
                      "+",
                      "hhhhhhhhhhhhhhhhhhhhhhhhhfhhhhhhhghdhghhhhhghcfh",

                      # @HWI-EAS440_0386:1:95:4837:16388#0/1
                      # starts off < Q40
                      ]

        columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
                   'reads-truncated',
                   'reads-too-short-after-truncation',
                   'reads-exceeding-maximum-ambiguous-bases']
        exp_stats = pd.DataFrame([('foo', 10, 6, 10, 4, 0)],
                                 columns=columns)
        exp_stats = exp_stats.set_index('sample-id')
        obs = []
        iterator = obs_result.sequences.iter_views(FastqGzFormat)
        for sample_id, fp in iterator:
            obs.extend([line.strip() for line in gzip.open(str(fp), 'rt')])
        self.assertEqual(obs, exp_result)
        pdt.assert_frame_equal(stats, exp_stats.loc[stats.index])
Example #23
    def test_q_score_all_dropped(self):
        ar = Artifact.load(self.get_data_path('simple.qza'))
        view = ar.view(SingleLanePerSampleSingleEndFastqDirFmt)

        with self.assertRaisesRegex(ValueError, "filtered out"):
            q_score(view, min_quality=50)