Exemplo n.º 1
0
    def setUp(self):
        # Create a temporary directory and fill it with the metadata files,
        # artifacts, exported data and visualization used as test fixtures.
        plugin = get_dummy_plugin()

        self.runner = CliRunner()
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')
        workdir = self.tempdir

        # Metadata file with a numeric column and a string column.
        mixed_fp = os.path.join(workdir, 'metadata-mixed-types.tsv')
        self.metadata_file_mixed_types = mixed_fp
        with open(mixed_fp, 'w') as fh:
            fh.write('id\tnumbers\tstrings\n0\t42\tabc\n1\t-1.5\tdef\n')

        # Metadata file whose first column header is 'wrong'.
        bad_fp = os.path.join(workdir, 'bad-metadata.tsv')
        self.bad_metadata_file = bad_fp
        with open(bad_fp, 'w') as fh:
            fh.write('wrong\tnumbers\tstrings\nid1\t42\tabc\nid2\t-1.5\tdef\n')

        self.metadata_artifact = os.path.join(workdir, 'metadata.qza')
        mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
        mapping.save(self.metadata_artifact)

        # The same artifact is both saved (ints1) and exported (ints2).
        self.ints1 = os.path.join(workdir, 'ints1.qza')
        int_seq = Artifact.import_data('IntSequence1', [0, 42, 43], list)
        int_seq.save(self.ints1)

        self.ints2 = os.path.join(workdir, 'ints')
        int_seq.export_data(self.ints2)

        # self.viz first holds the requested path and is then replaced by
        # whatever save() returns.
        self.viz = os.path.join(workdir, 'viz.qzv')
        make_viz = plugin.actions['most_common_viz']
        rendered = make_viz(int_seq)
        self.viz = rendered.visualization.save(self.viz)
Exemplo n.º 2
0
    def test_split_ints(self):
        # `split-ints` must exit cleanly and write both output artifacts
        # with the expected contents.
        plugin_cmd = RootCommand().get_command(ctx=None, name='dummy-plugin')

        # build output file names
        left_fp, right_fp = (os.path.join(self.tempdir, filename)
                             for filename in ('left.qza', 'right.qza'))

        # TODO: currently must pass `--verbose` to commands invoked by Click's
        # test runner because redirecting stdout/stderr raises an
        # "io.UnsupportedOperation: fileno" error. Likely related to Click
        # mocking a filesystem in the test runner.
        cli_args = ['split-ints', '--i-ints', self.artifact1_path,
                    '--o-left', left_fp, '--o-right', right_fp,
                    '--verbose']
        result = self.runner.invoke(plugin_cmd, cli_args)

        # command completes successfully and creates the correct
        # output files
        self.assertEqual(result.exit_code, 0)
        for fp in (left_fp, right_fp):
            self.assertTrue(os.path.exists(fp))

        # results are correct
        left_artifact = Artifact.load(left_fp)
        right_artifact = Artifact.load(right_fp)
        self.assertEqual(left_artifact.view(list), [0])
        self.assertEqual(right_artifact.view(list), [42, 43])
Exemplo n.º 3
0
    def test_variadic_inputs(self):
        # Options given several times on the command line must all reach
        # `variadic-input-method`; the output is checked against 1..13.
        plugin_cmd = RootCommand().get_command(ctx=None, name='dummy-plugin')
        output_fp = os.path.join(self.tempdir, 'output.qza')

        def _stage(semantic_type, view, filename):
            # Import `view`, save it in the temp dir, return save()'s result.
            return Artifact.import_data(semantic_type, view).save(
                os.path.join(self.tempdir, filename))

        ints_a = _stage('IntSequence1', [1, 2, 3], 'ints1.qza')
        ints_b = _stage('IntSequence2', [4, 5, 6], 'ints2.qza')
        set_a = _stage('SingleInt', 7, 'set1.qza')
        set_b = _stage('SingleInt', 8, 'set2.qza')

        cli_args = ['variadic-input-method']
        for fp in (ints_a, ints_b):
            cli_args += ['--i-ints', fp]
        for fp in (set_a, set_b):
            cli_args += ['--i-int-set', fp]
        for num in ('9', '10'):
            cli_args += ['--p-nums', num]
        for num in ('11', '12', '13'):
            cli_args += ['--p-opt-nums', num]
        cli_args += ['--o-output', output_fp, '--verbose']

        result = self.runner.invoke(plugin_cmd, cli_args)

        self.assertEqual(result.exit_code, 0)
        self.assertTrue(os.path.exists(output_fp))

        produced = Artifact.load(output_fp)
        self.assertEqual(produced.view(list), list(range(1, 14)))
Exemplo n.º 4
0
    def test_repeated_multiple_option(self):
        # `--m-metadata-file` may be passed twice to `identity-with-metadata`;
        # the output must equal the input sequence.
        ints_fp = os.path.join(self.tempdir, 'ints.qza')
        Artifact.import_data(IntSequence1, [0, 42, 43], list).save(ints_fp)

        metadata_fps = []
        for filename, contents in (
                ('metadata1.tsv', 'id\tcol1\nid1\tfoo\nid2\tbar\n'),
                ('metadata2.tsv', 'id\tcol2\nid1\tbaz\nid2\tbaa\n')):
            fp = os.path.join(self.tempdir, filename)
            with open(fp, 'w') as fh:
                fh.write(contents)
            metadata_fps.append(fp)
        first_md_fp, second_md_fp = metadata_fps

        out_fp = os.path.join(self.tempdir, 'out.qza')

        plugin_cmd = RootCommand().get_command(ctx=None, name='dummy-plugin')

        cli_args = ['identity-with-metadata', '--i-ints', ints_fp,
                    '--o-out', out_fp, '--m-metadata-file',
                    first_md_fp, '--m-metadata-file', second_md_fp,
                    '--verbose']
        result = self.runner.invoke(plugin_cmd, cli_args)

        self.assertEqual(result.exit_code, 0)
        self.assertTrue(os.path.exists(out_fp))
        produced = Artifact.load(out_fp)
        self.assertEqual(produced.view(list), [0, 42, 43])
Exemplo n.º 5
0
    def test_core_metrics_phylogenetic_multiple_jobs(self):
        # Run the phylogenetic core-metrics pipeline with n_jobs=2 and check
        # the number of results, a few result types and one result's values.
        counts = np.array([[0, 11, 11], [13, 11, 11]])
        biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

        newick = io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;')
        tree = Artifact.import_data('Phylogeny[Rooted]',
                                    skbio.TreeNode.read(newick))

        md_frame = pd.DataFrame(
            {'foo': ['1', '2', '3']},
            index=pd.Index(['S1', 'S2', 'S3'], name='id'))
        metadata = Metadata(md_frame)

        results = self.core_metrics_phylogenetic(table, tree, 13, metadata,
                                                 n_jobs=2)

        self.assertEqual(len(results), 17)

        bray_curtis_type = repr(results.bray_curtis_distance_matrix.type)
        self.assertEqual(bray_curtis_type, 'DistanceMatrix')
        emperor_type = repr(results.jaccard_emperor.type)
        self.assertEqual(emperor_type, 'Visualization')

        # pipelines preserve the output's type, in this case, beta_phylogenetic
        # returns this type, and that is passed through to the final output
        # (as long as the type is a subtype of the signature).
        faith_pd_type = repr(results.faith_pd_vector.type)
        self.assertEqual(
            faith_pd_type,
            "SampleData[AlphaDiversity] % Properties(['phylogenetic'])")

        observed = results[2].view(pd.Series)
        expected = pd.Series({'S1': 1, 'S2': 2, 'S3': 2},
                             name='observed_otus')
        pdt.assert_series_equal(observed, expected)
Exemplo n.º 6
0
    def test_add_artifacts(self):
        # Artifacts added over two calls must be exposed, in insertion
        # order, through the `artifacts` attribute.
        # First two artifacts have the same data but different UUIDs.
        first = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
        self.mdc._add_artifacts([first])

        second = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
        third = Artifact.import_data('IntSequence1', [1, 2, 3, 4])
        self.mdc._add_artifacts([second, third])

        expected = (first, second, third)
        self.assertEqual(self.mdc.artifacts, expected)
Exemplo n.º 7
0
    def test_artifact_mismatch(self):
        # Metadata created from different artifacts shouldn't compare equal,
        # even if the data is the same.
        first, second = (
            Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
            for _ in range(2))

        md_first = first.view(Metadata)
        md_second = second.view(Metadata)

        # The underlying data is identical ...
        pdt.assert_frame_equal(md_first.to_dataframe(),
                               md_second.to_dataframe())
        # ... yet the Metadata objects must differ.
        self.assertReallyNotEqual(md_first, md_second)
Exemplo n.º 8
0
    def test_add_duplicate_artifact(self):
        # Adding an already-registered artifact must raise ValueError and
        # leave the registered artifacts unchanged.
        mapping = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
        int_seq = Artifact.import_data('IntSequence1', [1, 2, 3, 4])
        self.mdc._add_artifacts([mapping, int_seq])

        error_pattern = ("Duplicate source artifacts.*DummyMetadataColumn.*"
                         "artifact: Mapping")
        with self.assertRaisesRegex(ValueError, error_pattern):
            self.mdc._add_artifacts([mapping])

        # Test that the object hasn't been mutated.
        self.assertEqual(self.mdc.artifacts, (mapping, int_seq))
Exemplo n.º 9
0
    def setUp(self):
        # Call get_dummy_plugin() (its return value is not used here), then
        # save an IntSequence1 artifact and a Mapping artifact in a fresh
        # temporary directory.
        get_dummy_plugin()
        self.runner = CliRunner()
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')
        self.artifact1_path, self.mapping_path = (
            os.path.join(self.tempdir, filename)
            for filename in ('a1.qza', 'mapping.qza'))

        int_seq = Artifact.import_data(IntSequence1, [0, 42, 43])
        int_seq.save(self.artifact1_path)
        # The artifact's UUID, as a string, is kept for later use.
        self.artifact1_root_dir = str(int_seq.uuid)

        Artifact.import_data('Mapping', {'foo': '42'}).save(self.mapping_path)
Exemplo n.º 10
0
    def setUp(self):
        # Build the CLI runner, the `dummy-plugin` command and a temporary
        # directory holding the artifacts, metadata files and command config
        # file used as fixtures.
        get_dummy_plugin()
        self.runner = CliRunner()
        self.plugin_command = RootCommand().get_command(
            ctx=None, name='dummy-plugin')
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

        def in_tempdir(filename):
            # Path of `filename` inside the temporary directory.
            return os.path.join(self.tempdir, filename)

        def write_text(path, text):
            # (Over)write `path` with `text`.
            with open(path, 'w') as fh:
                fh.write(text)

        self.input_artifact = in_tempdir('in.qza')
        int_seq = Artifact.import_data(IntSequence1, [0, 42, 43], list)
        int_seq.save(self.input_artifact)
        self.output_artifact = in_tempdir('out.qza')

        self.metadata_file1 = in_tempdir('metadata1.tsv')
        write_text(self.metadata_file1, 'id\tcol1\n0\tfoo\nid1\tbar\n')

        # Same rows as metadata1.tsv, but with a '#SampleID' ID header.
        self.metadata_file_alt_id_header = in_tempdir(
            'metadata-alt-id-header.tsv')
        write_text(self.metadata_file_alt_id_header,
                   '#SampleID\tcol1\n0\tfoo\nid1\tbar\n')

        self.metadata_file2 = in_tempdir('metadata2.tsv')
        write_text(self.metadata_file2, 'id\tcol2\n0\tbaz\nid1\tbaa\n')

        self.metadata_file_mixed_types = in_tempdir(
            'metadata-mixed-types.tsv')
        write_text(self.metadata_file_mixed_types,
                   'id\tnumbers\tstrings\nid1\t42\tabc\nid2\t-1.5\tdef\n')

        self.metadata_artifact = in_tempdir('metadata.qza')
        mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
        mapping.save(self.metadata_artifact)

        # Config file with one section per metadata-consuming action, each
        # pointing at metadata1.tsv.
        self.cmd_config = in_tempdir('conf.ini')
        md_fp = self.metadata_file1
        with open(self.cmd_config, 'w') as cfg:
            cfg.write('[dummy-plugin.identity-with-metadata]\n'
                      'm-metadata-file=%s\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-optional-metadata]\n'
                      'm-metadata-file=%s\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-metadata-column]\n'
                      'm-metadata-file=%s\n'
                      'm-metadata-column=col1\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-optional-metadata-column]\n'
                      'm-metadata-file=%s\n'
                      'm-metadata-column=col1\n' % md_fp)
    def setUp(self):
        # After the parent setUp, cache the pipeline under test and import
        # the bundled FASTA file as a FeatureData[Sequence] artifact.
        super().setUp()
        pipelines = self.plugin.pipelines
        self.align_to_tree_mafft_fasttree = pipelines[
            'align_to_tree_mafft_fasttree']

        fasta_fp = self.get_data_path('dna-sequences-1.fasta')
        self.input_sequences = Artifact.import_data(
            'FeatureData[Sequence]', fasta_fp)
Exemplo n.º 12
0
    def test_add_non_artifact(self):
        # A non-Artifact entry (here the int 42) must be rejected with a
        # TypeError, and nothing from the same call may be registered.
        mapping = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
        mixed_inputs = [mapping, 42]

        with self.assertRaisesRegex(TypeError, "Artifact object.*42"):
            self.mdc._add_artifacts(mixed_inputs)

        # Test that the object hasn't been mutated.
        self.assertEqual(self.mdc.artifacts, ())
Exemplo n.º 13
0
    def test_no_optional_artifacts_provided(self):
        # Only the `--i-ints` input and `--p-num1` are passed to
        # `optional-artifacts-method`; the output is the input followed by 42.
        cli_args = ('optional-artifacts-method', '--i-ints', self.ints1,
                    '--p-num1', 42, '--o-output', self.output, '--verbose')
        result = self._run_command(*cli_args)

        self.assertEqual(result.exit_code, 0)
        produced = Artifact.load(self.output)
        self.assertEqual(produced.view(list), [0, 42, 43, 42])
Exemplo n.º 14
0
    def test_with_artifacts(self):
        # filter_ids() must return a column that still reports the source
        # artifacts of the column it was derived from.
        first = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
        second = Artifact.import_data('Mapping', {'d': '4'})

        def make_column(values, ids):
            # DummyMetadataColumn named 'col1' with an 'id' index.
            return DummyMetadataColumn(pd.Series(
                values, name='col1', index=pd.Index(ids, name='id')))

        mdc = make_column([1, 2, 3], ['a', 'b', 'c'])
        mdc._add_artifacts([first, second])

        obs = mdc.filter_ids({'a', 'c'})

        exp = make_column([1, 3], ['a', 'c'])
        exp._add_artifacts([first, second])

        self.assertEqual(obs, exp)
        self.assertEqual(obs.artifacts, (first, second))
Exemplo n.º 15
0
    def test_artifacts_mismatch(self):
        # Columns built from the same series must compare unequal when their
        # source artifacts differ.
        first = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
        second = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
        index = pd.Index(['id1', 'id2'], name='id')
        series = pd.Series([42, 43], name='col1', index=index)

        # No artifacts
        without_artifact = DummyMetadataColumn(series)

        # Has an artifact
        with_first = DummyMetadataColumn(series)
        with_first._add_artifacts([first])

        # Has a different artifact
        with_second = DummyMetadataColumn(series)
        with_second._add_artifacts([second])

        self.assertReallyNotEqual(without_artifact, with_first)
        self.assertReallyNotEqual(with_first, with_second)
Exemplo n.º 16
0
    def setUp(self):
        # Save one IntSequence1 artifact, built with import provenance, in a
        # fresh temporary directory and remember its path and UUID.
        self.runner = CliRunner()
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-test-temp-')
        self.artifact1_path = os.path.join(self.tempdir, 'a1.qza')

        provenance = ImportProvenanceCapture()
        int_seq = Artifact._from_view(
            IntSequence1, [0, 42, 43], list, provenance_capture=provenance)
        int_seq.save(self.artifact1_path)
        self.artifact1_root_dir = str(int_seq.uuid)
Exemplo n.º 17
0
    def test_core_metrics_phylogenetic_rarefy_drops_sample(self):
        # Sample S1 totals 12 counts, below the sampling depth of 13 passed
        # to the pipeline; the expected series contains only S2 and S3.
        counts = np.array([[0, 11, 11], [12, 11, 11]])
        biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

        newick = io.StringIO('((O1:0.25, O2:0.50):0.25, O3:0.75)root;')
        tree = Artifact.import_data('Phylogeny[Rooted]',
                                    skbio.TreeNode.read(newick))

        md_frame = pd.DataFrame({'foo': ['1', '2', '3']},
                                index=['S1', 'S2', 'S3'])
        metadata = Metadata(md_frame)

        results = self.core_metrics_phylogenetic(table, tree, 13, metadata)

        self.assertEqual(len(results), 17)

        observed = results[2].view(pd.Series)
        expected = pd.Series({'S2': 2, 'S3': 2},
                             name='observed_otus')
        pdt.assert_series_equal(observed, expected)
Exemplo n.º 18
0
    def test_artifacts_are_propagated(self):
        # A column fetched from artifact-backed Metadata must report that
        # artifact as its source.
        source = Artifact.import_data('Mapping', {'a': '1', 'b': '3'})
        column = source.view(Metadata).get_column('b')

        # TODO update to use MetadataColumn.__eq__
        self.assertEqual(column.artifacts, (source,))
        expected = pd.Series(
            ['3'], index=pd.Index(['0'], name='id'), name='b')
        pdt.assert_series_equal(column.to_series(), expected)
Exemplo n.º 19
0
    def test_source_mismatch(self):
        # Metadata created from an artifact vs not shouldn't compare equal,
        # even if the data is the same.
        source = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
        with_source = source.view(Metadata)

        # Rebuild from the dataframe alone, so no artifact is attached here.
        without_source = Metadata(with_source.to_dataframe())

        left_df = with_source.to_dataframe()
        right_df = without_source.to_dataframe()
        pdt.assert_frame_equal(left_df, right_df)
        self.assertReallyNotEqual(with_source, without_source)
Exemplo n.º 20
0
    def test_with_artifacts(self):
        # merge() must combine the columns and report the source artifacts
        # of the merged Metadata objects.
        first = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})
        second = Artifact.import_data('Mapping', {'d': '4'})

        md_first = first.view(Metadata)
        md_second = second.view(Metadata)
        plain_df = pd.DataFrame(
            {'c': ['3', '42']}, index=pd.Index(['0', '1'], name='id'))
        md_plain = Metadata(plain_df)

        # Merge three metadata objects -- the first has an artifact, the second
        # does not, and the third has an artifact.
        obs_md = md_first.merge(md_plain, md_second)

        exp_md = Metadata(pd.DataFrame(
            {'a': '1', 'b': '2', 'c': '3', 'd': '4'},
            index=pd.Index(['0'], name='id')))
        exp_md._add_artifacts((first, second))

        self.assertEqual(obs_md, exp_md)
        self.assertEqual(obs_md.artifacts, (first, second))
Exemplo n.º 21
0
    def test_equality_with_artifact(self):
        # Two columns with equal data and the same source artifact must
        # compare equal.
        source = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

        def make_column():
            # Fresh column over a fresh series, tagged with `source`.
            index = pd.Index(['id1', 'id2'], name='id')
            column = DummyMetadataColumn(
                pd.Series([42, 43], name='col1', index=index))
            column._add_artifacts([source])
            return column

        left = make_column()
        right = make_column()

        self.assertReallyEqual(left, right)
Exemplo n.º 22
0
    def test_without_inputs_or_parameters(self):
        # `no-input-method` is invoked with only an output option and must
        # still write an artifact.
        plugin_cmd = RootCommand().get_command(ctx=None, name='dummy-plugin')
        output_fp = os.path.join(self.tempdir, 'output.qza')

        cli_args = ['no-input-method', '--o-out', output_fp, '--verbose']
        result = self.runner.invoke(plugin_cmd, cli_args)

        self.assertEqual(result.exit_code, 0)
        self.assertTrue(os.path.exists(output_fp))

        produced = Artifact.load(output_fp)
        self.assertEqual(produced.view(dict), {'foo': '42'})
Exemplo n.º 23
0
    def setUp(self):
        # After the parent setUp, cache the `beta_correlation` pipeline and
        # build a 3-sample distance matrix artifact plus a numeric metadata
        # column over the same sample IDs.
        super().setUp()
        self.beta_correlation = self.plugin.pipelines['beta_correlation']

        distances = [[0, 1, 2],
                     [1, 0, 1],
                     [2, 1, 0]]
        matrix = skbio.DistanceMatrix(
            distances, ids=['sample1', 'sample2', 'sample3'])
        self.dm = Artifact.import_data('DistanceMatrix', matrix)

        md_index = pd.Index(['sample1', 'sample2', 'sample3'], name='id')
        numbers = pd.Series([1, 2, 3], name='number', index=md_index)
        self.md = qiime2.NumericMetadataColumn(numbers)
Exemplo n.º 24
0
    def _assertMetadataOutput(self, result, *, exp_tsv, exp_yaml):
        # Check a finished command: exit code 0, the provenance
        # `metadata.tsv` equals `exp_tsv` (or is absent when `exp_tsv` is
        # None), and `action.yaml` contains `exp_yaml`.
        self.assertEqual(result.exit_code, 0)

        produced = Artifact.load(self.output_artifact)
        action_dir = produced._archiver.provenance_dir / 'action'
        tsv_fp = action_dir / 'metadata.tsv'
        yaml_fp = action_dir / 'action.yaml'

        if exp_tsv is not None:
            with tsv_fp.open() as fh:
                self.assertEqual(fh.read(), exp_tsv)
        else:
            self.assertFalse(tsv_fp.exists())

        with yaml_fp.open() as fh:
            self.assertIn(exp_yaml, fh.read())
Exemplo n.º 25
0
    def test_multiple_metadata(self):
        # Three metadata sources are passed and category `col2` is selected;
        # the same expectations apply to both commands.
        commands = ('identity-with-metadata-category',
                    'identity-with-optional-metadata-category')
        for command in commands:
            cli_args = (
                command, '--i-ints', self.input_artifact, '--o-out',
                self.output_artifact, '--m-metadata-file', self.metadata_file1,
                '--m-metadata-file', self.metadata_file2, '--m-metadata-file',
                self.metadata_artifact, '--m-metadata-category', 'col2',
                '--verbose')
            result = self._run_command(*cli_args)

            source_uuid = Artifact.load(self.metadata_artifact).uuid
            exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (
                source_uuid)
            self._assertMetadataOutput(result, exp_tsv='0\tbaz\n',
                                       exp_yaml=exp_yaml)
Exemplo n.º 26
0
    def test_qza_extension(self):
        # Output paths are given without an extension; the files are
        # expected to appear with `.qza` appended.
        plugin_cmd = RootCommand().get_command(ctx=None, name='dummy-plugin')

        # build output parameter arguments and expected output file names
        left_arg = os.path.join(self.tempdir, 'left')
        right_arg = os.path.join(self.tempdir, 'right')
        left_expected = os.path.join(self.tempdir, 'left.qza')
        right_expected = os.path.join(self.tempdir, 'right.qza')

        cli_args = ['split-ints', '--i-ints', self.artifact1_path,
                    '--o-left', left_arg, '--o-right', right_arg,
                    '--verbose']
        result = self.runner.invoke(plugin_cmd, cli_args)

        # command completes successfully and creates the correct
        # output files
        self.assertEqual(result.exit_code, 0)
        for fp in (left_expected, right_expected):
            self.assertTrue(os.path.exists(fp))

        # results are correct
        left_artifact = Artifact.load(left_expected)
        right_artifact = Artifact.load(right_expected)
        self.assertEqual(left_artifact.view(list), [0])
        self.assertEqual(right_artifact.view(list), [42, 43])
Exemplo n.º 27
0
    def setUp(self):
        # Build the CLI runner, the `dummy-plugin` command and a temporary
        # directory holding the artifacts, metadata files and command config
        # file used as fixtures.
        get_dummy_plugin()
        self.runner = CliRunner()
        self.plugin_command = RootCommand().get_command(
            ctx=None, name='dummy-plugin')
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

        def in_tempdir(filename):
            # Path of `filename` inside the temporary directory.
            return os.path.join(self.tempdir, filename)

        def write_text(path, text):
            # (Over)write `path` with `text`.
            with open(path, 'w') as fh:
                fh.write(text)

        self.input_artifact = in_tempdir('in.qza')
        int_seq = Artifact.import_data(IntSequence1, [0, 42, 43], list)
        int_seq.save(self.input_artifact)
        self.output_artifact = in_tempdir('out.qza')

        self.metadata_file1 = in_tempdir('metadata1.tsv')
        write_text(self.metadata_file1, 'id\tcol1\n0\tfoo\nid1\tbar\n')

        self.metadata_file2 = in_tempdir('metadata2.tsv')
        write_text(self.metadata_file2, 'id\tcol2\n0\tbaz\nid1\tbaa\n')

        self.metadata_artifact = in_tempdir('metadata.qza')
        mapping = Artifact.import_data('Mapping', {'a': 'dog', 'b': 'cat'})
        mapping.save(self.metadata_artifact)

        # Config file with one section per metadata-consuming action, each
        # pointing at metadata1.tsv.
        self.cmd_config = in_tempdir('conf.ini')
        md_fp = self.metadata_file1
        with open(self.cmd_config, 'w') as cfg:
            cfg.write('[dummy-plugin.identity-with-metadata]\n'
                      'm-metadata-file=%s\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-optional-metadata]\n'
                      'm-metadata-file=%s\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-metadata-category]\n'
                      'm-metadata-file=%s\n'
                      'm-metadata-category=col1\n' % md_fp)
            cfg.write('[dummy-plugin.identity-with-optional-metadata-category]\n'
                      'm-metadata-file=%s\n'
                      'm-metadata-category=col1\n' % md_fp)
Exemplo n.º 28
0
    def test_core_metrics(self):
        # Run the core-metrics pipeline on a small table and check the
        # number of results, two result types and one result's values.
        counts = np.array([[0, 11, 11], [13, 11, 11]])
        biom_table = biom.Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])
        table = Artifact.import_data('FeatureTable[Frequency]', biom_table)

        md_frame = pd.DataFrame({'foo': ['1', '2', '3']},
                                index=['S1', 'S2', 'S3'])
        metadata = Metadata(md_frame)

        results = self.core_metrics(table, 13, metadata)

        self.assertEqual(len(results), 10)
        bray_curtis_type = repr(results.bray_curtis_distance_matrix.type)
        self.assertEqual(bray_curtis_type, 'DistanceMatrix')
        emperor_type = repr(results.jaccard_emperor.type)
        self.assertEqual(emperor_type, 'Visualization')

        observed = results[1].view(pd.Series)
        expected = pd.Series({'S1': 1, 'S2': 2, 'S3': 2}, name='observed_otus')
        pdt.assert_series_equal(observed, expected)
Exemplo n.º 29
0
    def test_artifacts_are_propagated(self):
        # drop_missing_values() must return a column that still reports the
        # source artifact of the column it was derived from.
        source = Artifact.import_data('Mapping', {'a': '1', 'b': '2'})

        def make_column(values, ids):
            # DummyMetadataColumn named 'col1' with a 'sampleid' index.
            return DummyMetadataColumn(pd.Series(
                values, name='col1', index=pd.Index(ids, name='sampleid')))

        mdc = make_column([0.0, np.nan, 3.3, np.nan, np.nan, 4.4],
                          ['a', 'b', 'c', 'd', 'e', 'f'])
        mdc._add_artifacts([source])

        obs = mdc.drop_missing_values()

        exp = make_column([0.0, 3.3, 4.4], ['a', 'c', 'f'])
        exp._add_artifacts([source])

        self.assertEqual(obs, exp)
        self.assertEqual(obs.artifacts, (source,))
Exemplo n.º 30
0
    def test_multiple_metadata(self):
        # Three metadata sources (one with an alternate ID header, one plain
        # TSV, one artifact) are passed; the same expectations apply to both
        # commands.
        commands = ('identity-with-metadata',
                    'identity-with-optional-metadata')
        for command in commands:
            cli_args = (
                command, '--i-ints', self.input_artifact, '--o-out',
                self.output_artifact, '--m-metadata-file',
                self.metadata_file_alt_id_header, '--m-metadata-file',
                self.metadata_file2, '--m-metadata-file',
                self.metadata_artifact, '--verbose')
            result = self._run_command(*cli_args)

            exp_tsv = (
                'id\tcol1\tcol2\ta\tb\n'
                '#q2:types\tcategorical\tcategorical\tcategorical\tcategorical'
                '\n0\tfoo\tbaz\tdog\tcat\n'
            )
            source_uuid = Artifact.load(self.metadata_artifact).uuid
            exp_yaml = "metadata: !metadata '%s:metadata.tsv'" % (
                source_uuid)
            self._assertMetadataOutput(result, exp_tsv=exp_tsv,
                                       exp_yaml=exp_yaml)
Exemplo n.º 31
0
    def setUp(self):
        # Build the CLI runner and `dummy-plugin` command, then save three
        # integer-sequence artifacts in a fresh temporary directory.
        get_dummy_plugin()

        self.runner = CliRunner()
        self.plugin_command = RootCommand().get_command(ctx=None,
                                                        name='dummy-plugin')
        self.tempdir = tempfile.mkdtemp(prefix='qiime2-q2cli-test-temp-')

        # (attribute name, file name, semantic type, values)
        fixtures = (
            ('ints1', 'ints1.qza', IntSequence1, [0, 42, 43]),
            ('ints2', 'ints2.qza', IntSequence1, [99, -22]),
            ('ints3', 'ints3.qza', IntSequence2, [43, 43]),
        )
        for attr, filename, semantic_type, values in fixtures:
            path = os.path.join(self.tempdir, filename)
            setattr(self, attr, path)
            Artifact.import_data(semantic_type, values, list).save(path)

        self.output = os.path.join(self.tempdir, 'output.qza')
Exemplo n.º 32
0
def demultiplex_manifests(fastq_files,
                          primers,
                          regions=None,
                          split_on_header=True,
                          threads=16):
    """Demultiplex fastq files into variable region origins.

    The work is done inside a temporary directory, which is removed (and the
    previous working directory restored) before returning, even on error.

    Args:
        fastq_files: Paths of fastq files; only those whose name contains
            ``'_R1.fastq'`` are processed.
        primers: Mapping keyed by region name.
        regions: Region names to process. When ``None``, every key of
            ``primers`` is used; otherwise ``primers`` is restricted to
            these regions.
        split_on_header: When true, run ``seqkit_worker`` on every
            (fastq, region) pair; otherwise run ``cutadapt_worker``.
        threads: Size of the multiprocessing pools.

    Returns:
        Dict mapping each region that produced a manifest to the
        ``Artifact`` loaded for it.
    """
    if regions is None:
        # The original default crashed later when iterating `None`; fall
        # back to every region that has a primer entry.
        regions = list(primers)
    else:
        # Materialize once so a one-shot iterable can be reused below.
        regions = list(regions)
        primers = {k: v for k, v in primers.items() if k in regions}
    r1 = [abspath(i) for i in fastq_files if '_R1.fastq' in i]
    rundir = mkdtemp()  # run in a tmpdir
    cwd = os.path.abspath(os.curdir)
    os.chdir(rundir)
    try:
        with mp.Pool(threads) as pool:
            if split_on_header:
                args_iter = itertools.product(r1, regions)
                pool.starmap(seqkit_worker, args_iter)
            else:
                # NOTE(review): the original passed an undefined name
                # (`primer_subset`) as Pool.map's third argument, which is
                # `chunksize`. This assumes cutadapt_worker takes
                # (fastq_path, primers) -- confirm against its definition.
                args_iter = zip(r1, itertools.repeat(primers))
                pool.starmap(cutadapt_worker, args_iter)

        manifest_filenames = {}
        for r in regions:
            R1 = glob.glob(join(rundir, '*_{}_R1.fastq'.format(r)))
            df = pandas_manifest(R1)
            manifest_fn = r + '_manifest.csv'
            # pandas_manifest may return None; such regions are skipped.
            if df is not None:
                df.to_csv(manifest_fn, index=False, sep='\t')
                manifest_filenames[r] = manifest_fn

        adata = {}
        with mp.Pool(threads) as pool:
            pool.map(import_data_worker, manifest_filenames.values())

        for r, fn in manifest_filenames.items():
            write_message('importing data ({}) from {}'.format(r, fn))
            # The .qza name is derived from the manifest name's prefix.
            adata[r] = Artifact.load(fn.split('_')[0] + '.qza')

        return adata
    finally:
        # clean up tmpdir, also when a worker or import step failed
        os.chdir(cwd)
        shutil.rmtree(rundir)
Exemplo n.º 33
0
def main():
    """Sanity-check the Shannon diversity of the per-habitat weight tables.

    For every habitat, the artifact ``<habitat>.qza`` is loaded from both
    the ``515f-806r`` and ``full_length`` sub-directories of the data
    directory, and the first value of its Shannon alpha-diversity vector is
    recorded. The function then asserts, for each of the two collections,
    that no two habitats share the same value and that the ``average``
    entry is at least as large as every other entry.

    Raises:
        AssertionError: if one of the checks above fails.
    """
    cdir = Path("./data/silva_138_1")
    habitats = """
      animal-distal-gut
      animal-surface
      animal-secretion
      water-non-saline
      animal-proximal-gut
      animal-corpus
      plant-rhizosphere
      water-saline
      sediment-saline
      sediment-non-saline
      plant-corpus
      plant-surface
      surface-saline
      soil-non-saline
      human-stool
      human-oral
      average
    """
    habitats = habitats.split()

    v4 = {}
    fl = {}
    for habitat in habitats:
        for ddirs, collection in [("515f-806r", v4), ("full_length", fl)]:
            art = Artifact.load(cdir / ddirs / (habitat + ".qza"))
            alpha_vector = diversity.actions.alpha(metric="shannon",
                                                   table=art)[0].view(Series)
            # Use positional access explicitly: a bare `[0]` on a Series
            # with a non-integer index relies on deprecated fallback.
            collection[habitat] = alpha_vector.iloc[0]

    # Uniqueness is checked for both collections before the 'average'
    # comparison, preserving the original order of the assertions.
    for weights in (v4, fl):
        assert len(weights.values()) == len(set(weights.values())), \
            "WARNING: two sets of weights are the same"
    for weights in (v4, fl):
        for alpha in weights.values():
            assert weights['average'] >= alpha, \
                "WARNING: average weights are not the most diverse"
    def test_parse_q2_data_wrong_semantic_type(self):
        # A QZA of an unexpected semantic type must be rejected with a
        # ConfigurationError naming the expected and the received type.
        qza_path = self.create_tempfile(suffix='.qza').name
        taxonomy = pd.Series({'feature1': 'k__1', 'feature2': 'k__2'},
                             name='Taxon')
        taxonomy.index.name = 'Feature ID'
        # the distinction here is that this is not alpha diversity
        wrong_type_artifact = Artifact.import_data("FeatureData[Taxonomy]",
                                                   taxonomy)
        wrong_type_artifact.save(qza_path)

        error_pattern = (r"Expected (.*) "
                         r"'SampleData\[AlphaDiversity\]'. "
                         r"Received 'FeatureData\[Taxonomy\]'.")
        with self.assertRaisesRegex(ConfigurationError, error_pattern):
            _parse_q2_data(qza_path,
                           SampleData[AlphaDiversity],
                           view_type=pd.Series)
Exemplo n.º 35
0
def _parse_q2_data(filepath,
                   semantic_type,
                   view_type=None,
                   ignore_predicate=True):
    """Load a QIIME 2 artifact and check that it has the expected type.

    Args:
        filepath: Path of the artifact to load.
        semantic_type: Type the loaded artifact must compare equal to.
        view_type: When not ``None``, the artifact is converted with
            ``Artifact.view(view_type=...)`` before being returned.
        ignore_predicate: When true, the artifact's type is rebuilt from
            its template and fields only before the comparison.

    Returns:
        The loaded ``Artifact``, or its view when ``view_type`` is given.

    Raises:
        ConfigurationError: if loading raises ``ValueError`` or the type
            does not match ``semantic_type``.
    """
    try:
        data = Artifact.load(filepath)
    except ValueError as e:
        # Chain explicitly so the original load failure stays attached as
        # the cause of the configuration error.
        raise ConfigurationError(*e.args) from e

    data_type = data.type
    if ignore_predicate:
        data_type = TypeExp(data_type.template, fields=data_type.fields)

    if data_type != semantic_type:
        # The message reports the artifact's full type, not the rebuilt one.
        raise ConfigurationError(f"Expected QZA '{filepath}' to have type "
                                 f"'{semantic_type}'. "
                                 f"Received '{data.type}'.")
    if view_type is not None:
        data = data.view(view_type=view_type)

    return data
Exemplo n.º 36
0
    def validate_denoise_input(sequence_data):
        """
        Precheck input files prior to running the denoise step.

        Input:
            - sequence_data: sequence data in QIIME2 artifact format

        Returns:
            - (200, message) when the artifact loads and has type
              SampleData[PairedEndSequencesWithQuality]
            - (400, message) when loading raises ValueError or the type
              differs
        """
        expected_type = "SampleData[PairedEndSequencesWithQuality]"

        # Check Artifact type
        try:
            loaded = Artifact.load(sequence_data)
            observed_type = str(loaded.type)
        except ValueError as err:
            return 400, str(err)

        if observed_type != expected_type:
            return 400, "Input QIIME2 Artifact is not of type 'SampleData[PairedEndSequencesWithQuality]'!"

        return 200, "Imported data good!"
Exemplo n.º 37
0
def import_qiime2_feature_table(feature_table_filepath):
    """
    Load a QIIME2 feature table artifact as a transposed DataFrame.

    The artifact must be of type FeatureTable[Frequency] or
    FeatureTable[RelativeFrequency]; otherwise ValueError is raised. The
    returned frame is the transposed table with its index, named
    "SampleID", reset into a regular column.
    """
    loaded = Artifact.load(feature_table_filepath)
    accepted_types = ("FeatureTable[Frequency]",
                      "FeatureTable[RelativeFrequency]")

    # raise error if not feature table artifact
    if str(loaded.type) not in accepted_types:
        raise ValueError(
            "Input artifact is not of type FeatureTable[Frequency] or FeatureTable[RelativeFrequency]!"
        )

    # return transposed version for better view
    by_sample = loaded.view(pd.DataFrame).T
    by_sample.index.name = "SampleID"
    return by_sample.reset_index()
Exemplo n.º 38
0
def convert(artifact_path):
    """
    Convert a QIIME2 artifact to pandas objects, if its type is supported.

    Input:
        - artifact_path: path to QIIME2 artifact (.qza)
    Returns:
        - Dictionary with pandas series or dataframe as values:
            * feature tables -> {"feature_table": DataFrame}
            * PCoAResults    -> {"eigvals", "coordinates",
                                 "proportion_explained"}
          An empty dictionary (plus a logged warning) for any other type.
    """
    artifact = Artifact.load(artifact_path)
    artifact_type = str(artifact.type)

    if artifact_type in ("FeatureTable[Frequency]",
                         "FeatureTable[RelativeFrequency]"):
        return {"feature_table": artifact.view(pd.DataFrame)}

    if artifact_type == "PCoAResults":
        ordination_result = artifact.view(ordination.OrdinationResults)

        # All three attributes are read from the same local; the previous
        # code referenced an undefined ``ordination_results`` here and
        # raised NameError for every PCoAResults artifact.
        return {
            "eigvals": ordination_result.eigvals,  # pd.Series
            "coordinates": ordination_result.samples,  # pd.DataFrame
            "proportion_explained":
                ordination_result.proportion_explained,  # pd.Series
        }

    logger.warning("Could not convert specified QIIME2 artifact.")
    return {}
Exemplo n.º 39
0
 def setUp(self):
     """Build the ranks artifact plus taxonomy and metabolite columns."""
     feature_ids = list('ABCD')
     metabolite_ids = ['m1', 'm2', 'm3']

     # ranks: one row per feature, one column per metabolite
     rank_values = [[4.1, 1.3, 2.1], [0.1, 0.3, 0.2],
                    [2.2, 4.3, 3.2], [-6.3, -4.4, 2.1]]
     rank_frame = pd.DataFrame(rank_values,
                               index=pd.Index(feature_ids, name='id'),
                               columns=['m1', 'm2', 'm3'])
     self.ranks = Artifact.import_data('FeatureData[Conditional]',
                                       rank_frame)

     # taxonomy strings, one per feature id
     lineages = [
         'k__Bacteria; p__Proteobacteria; c__Deltaproteobacteria; '
         'o__Desulfobacterales; f__Desulfobulbaceae; g__; s__',
         'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta',
         'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; '
         'o__Rickettsiales; f__mitochondria; g__Lardizabala; s__biternata',
         'k__Archaea; p__Euryarchaeota; c__Methanomicrobia; '
         'o__Methanosarcinales; f__Methanosarcinaceae; g__Methanosarcina',
     ]
     taxon_series = pd.Series(
         lineages,
         index=pd.Index(feature_ids, name='feature-id'),
         name='Taxon')
     self.taxa = CategoricalMetadataColumn(taxon_series)

     # pathway label for each metabolite column
     pathway_series = pd.Series(
         ['amino acid', 'carbohydrate', 'drug metabolism'],
         index=pd.Index(metabolite_ids, name='feature-id'),
         name='Super Pathway')
     self.metabolites = CategoricalMetadataColumn(pathway_series)
Exemplo n.º 40
0
def merge_data(tables, taxas, sequences, samples):
    """Merge per-region tables, taxonomies and sequences into single results.

    Parameters
    ----------
    tables : mapping
        Region name -> artifact that can be viewed as a ``pd.DataFrame``.
    taxas : mapping
        Region name -> result object exposing ``.classification``. Its key
        order drives the iteration, so all collected lists line up.
    sequences : mapping
        Region name -> sequence artifact passed to ``merge_seqs``.
    samples
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(merged_table, merged_taxa, merged_seq, meta)`` where ``meta`` is
        a ``Metadata`` with one ``region`` column indexed by ``feature-id``.
        A feature seen in several regions gets the region names joined
        with ``_``.
    """
    taxa_list = []
    table_list = []
    seq_list = []
    meta_region = {}
    write_message('merging region results ...')
    # ensure same ordering of dicts
    for r in taxas:
        write_message('collecting data from {}'.format(r))
        taxa_list.append(taxas[r].classification)
        df = tables[r].view(pd.DataFrame)
        # The region tag is literal text, not a pattern: pin regex=False so
        # the result does not depend on the pandas version's default.
        df.index = df.index.str.replace('_{}'.format(r), '', regex=False)
        for seq_id in df.columns:
            if seq_id in meta_region:
                meta_region[seq_id] = meta_region[seq_id] + '_' + r
            else:
                meta_region[seq_id] = r
        table_list.append(Artifact.import_data('FeatureTable[Frequency]', df))
        seq_list.append(sequences[r])
    merged_taxa = feature_table.methods.merge_taxa(taxa_list)
    merged_seq = feature_table.methods.merge_seqs(seq_list)
    merged_table = feature_table.methods.merge(table_list,
                                               overlap_method='sum')

    # View the merged table once and reuse its columns for both the values
    # and the index of the region metadata.
    merged_features = merged_table.merged_table.view(pd.DataFrame).columns
    meta = pd.DataFrame(
        {'region': [meta_region[seq_id] for seq_id in merged_features]},
        index=pd.Index(merged_features, name='feature-id'))
    meta = Metadata(meta)

    return merged_table, merged_taxa, merged_seq, meta
Exemplo n.º 41
0
    def test__qiime2_rclr(self):
        """Tests q2-rclr matches standalone rclr.

        Writes a table built from ``self.cdata`` to disk, runs the
        standalone ``rclr`` command on it, runs the QIIME2
        ``rclr_transformation`` on the same table, and checks that both
        results agree with each other and with ``self.true``.
        """

        # make mock table to write
        samps_ids = ['s%i' % i for i in range(self.cdata.shape[0])]
        feats_ids = ['f%i' % i for i in range(self.cdata.shape[1])]
        table_test = Table(self.cdata.T, feats_ids, samps_ids)
        # write table
        # the output directory is the directory that holds 'test.biom'
        in_ = get_data_path('test.biom', subfolder='data')
        out_path = os_path_sep.join(in_.split(os_path_sep)[:-1])
        test_path = os.path.join(out_path, 'rclr-test.biom')
        with biom_open(test_path, 'w') as wf:
            table_test.to_hdf5(wf, "test")
        # run standalone
        runner = CliRunner()
        result = runner.invoke(sdc.commands['rclr'],
                               ['--in-biom', test_path,
                                '--output-dir', out_path])
        # NOTE(review): the result table is loaded before the exit code is
        # checked below, so a failed command may surface here as a load
        # error first. Presumably 'rclr-table.biom' is what the command
        # writes into out_path; confirm.
        out_table = get_data_path('rclr-table.biom',
                                  subfolder='data')
        res_table = load_table(out_table)
        standalone_mat = res_table.matrix_data.toarray().T
        # check that exit code was 0 (indicating success)
        try:
            self.assertEqual(0, result.exit_code)
        except AssertionError:
            # re-raise with the traceback of the exception Click captured
            ex = result.exception
            error = Exception('Command failed with non-zero exit code')
            raise error.with_traceback(ex.__traceback__)
        # run QIIME2
        q2_table_test = Artifact.import_data("FeatureTable[Frequency]",
                                             table_test)
        q2_res = rclr_transformation(q2_table_test).rclr_table.view(Table)
        q2_res_mat = q2_res.matrix_data.toarray().T
        # check same and check both correct
        npt.assert_allclose(standalone_mat, q2_res_mat)
        npt.assert_allclose(standalone_mat, self.true)
        npt.assert_allclose(q2_res_mat, self.true)
def qiime2PCoA(sample_metadata,
               df,
               out_dir,
               norm=True,
               scale=False,
               metric='canberra'):
    """Compute a PCoA from a peak-area table and write an Emperor plot.

    Parameters
    ----------
    sample_metadata : pd.DataFrame
        Sample metadata with a ``filename`` (or ``#SampleID``) column.
        NOTE: this frame is modified in place (columns renamed, whitespace
        in column names replaced by ``_``, ``#SampleID`` moved to the index).
    df : pd.DataFrame
        Feature table with a ``row ID`` column and one ``... Peak area``
        column per sample file.
    out_dir : str
        Output location. If it contains ``.qzv`` the visualization is saved
        as a file, otherwise it is exported as a directory.
    norm : bool
        Divide each sample row by its sum.
    scale : bool
        Standardize each feature column (subtract mean, divide by std).
    metric : str
        Distance metric name passed to ``pdist``.

    Returns
    -------
    The results object returned by ``diversity.methods.pcoa``.
    """
    sample_metadata.rename(index=str,
                           columns={"filename": "#SampleID"},
                           inplace=True)
    # regex=True is required: pandas >= 2.0 treats the pattern literally by
    # default, which would leave whitespace in the column names untouched.
    sample_metadata.columns = sample_metadata.columns.str.replace(
        r'\s', '_', regex=True)

    sample_metadata.index = sample_metadata['#SampleID']
    sample_metadata.drop(['#SampleID'], axis=1, inplace=True)
    qsample_metadata = qiime2.metadata.Metadata(sample_metadata)

    # keep only the peak-area columns and strip them down to the file name
    df2 = df[df.columns[df.columns.str.contains(' Peak area')]]
    df2.columns = [re.sub(r'(.+\.mzX?ML) .+', r'\1', a) for a in df2.columns]
    df2.index = df['row ID'].astype(str)
    df2 = df2.T

    if norm:
        df2 = df2.apply(lambda a: a / sum(a), axis=1)

    if scale:
        df2 = (df2 - df2.mean()) / df2.std()

    dm1 = squareform(pdist(df2, metric=metric))
    dm1 = skbio.DistanceMatrix(dm1, ids=df2.index.tolist())
    dm1 = Artifact.import_data("DistanceMatrix", dm1)
    pcoa = diversity.methods.pcoa(dm1)
    emperor_plot = emperor.visualizers.plot(pcoa.pcoa, qsample_metadata)

    if '.qzv' in out_dir:
        emperor_plot.visualization.save(out_dir)
    else:
        emperor_plot.visualization.export_data(out_dir)
    return pcoa
Exemplo n.º 43
0
def cross_validate_for_weights(ref_taxa, ref_seqs, weights, obs_dir,
                               results_dir, intermediate_dir, n_jobs, log_file,
                               log_level):
    """Classify the simulated test samples of every fold using weights.

    For each ``fold-*`` directory under ``intermediate_dir`` the test
    samples are loaded, training artifacts are derived, a classification
    is produced with ``classify_samples`` and the result is stored with
    ``save_observed``.
    """
    # configure logging, then record the call arguments
    setup_logging(log_level, log_file)
    logging.info(locals())

    # taxonomy-level sample table
    taxonomy_samples = biom.load_table(
        join(intermediate_dir, 'taxonomy_samples.biom'))
    logging.info('Got taxonomy samples')

    # taxon defaults and the list of fold directories
    with open(join(intermediate_dir, 'taxon_defaults.json')) as defaults_fh:
        taxon_defaults = json.load(defaults_fh)
    fold_dirs = glob.glob(join(intermediate_dir, 'fold-*'))
    logging.info('Got folds')

    # the weights artifact is loaded once and shared by all folds
    weights_artifact = Artifact.load(weights)
    for fold_dir in fold_dirs:
        held_out = load_simulated_samples(fold_dir, results_dir)
        # training taxa, seqs, reference seqs and the fold's reduced weights
        train_taxa, _train_seqs, ref_seqs_art, fold_weights = \
            get_train_artifacts(taxonomy_samples, fold_dir, taxon_defaults,
                                ref_taxa, ref_seqs, weights_artifact)
        # train the weighted classifier and classify the held-out samples
        predicted = classify_samples(held_out, train_taxa, ref_seqs_art,
                                     0.7, n_jobs, fold_weights)
        # persist the classified taxonomy artifacts
        save_observed(results_dir, held_out, predicted, obs_dir)
        logging.info('Done ' + fold_dir)
    def setUp(self):
        """Create temp-file paths, two alpha series and an update mapping."""
        super().setUp()

        # temp-file backed paths used by the tests
        self.qza_resource_fp = self.create_tempfile(suffix='.qza').name
        self.qza_resource_fp2 = self.create_tempfile(suffix='.qza').name
        self.qza_resource_fh2 = self.create_tempfile(suffix='.qza')
        self.qza_resource_fh2.close()
        self.qza_resource_dne = self.qza_resource_fh2.name
        self.non_qza_resource_fp = \
            self.create_tempfile(suffix='.some_ext').name

        # per-sample values for two differently named series
        chao1_values = {'sample1': 7.15, 'sample2': 9.04}
        faith_pd_values = {'sample1': 7.16, 'sample2': 9.01}
        self.test_series = pd.Series(chao1_values, name='chao1')
        self.test_series2 = pd.Series(faith_pd_values, name='faith_pd')
        self.resources = ResourceManager(some_key='some_value')

        # store the first series as an artifact at the first .qza path
        Artifact.import_data("SampleData[AlphaDiversity]",
                             self.test_series).save(self.qza_resource_fp)

        alpha_resources = {
            'chao1': self.qza_resource_fp,
            'faith_pd': 9,
        }
        self.update_with = {
            'random-value': 7.24,
            'alpha_resources': alpha_resources,
            'other': {'dict': {'of': 'things'}},
        }
Exemplo n.º 45
0
def taxon2fasta(taxonomy, sequences, taxon, path):
    '''
    Export the sequences whose taxonomy annotation equals ``taxon``.

    taxonomy is an artifact of type FeatureData[Taxonomy]
    sequences is an artifact of type FeatureData[Sequence]
    taxon is the annotated OTU we are interested in. input string
    path is where to export the fasta files. input string
    '''
    # rows of the taxonomy table annotated exactly as `taxon`
    annotations = taxonomy.view(pd.DataFrame)
    is_target = annotations.loc[:, 'Taxon'] == taxon
    target_ids = annotations.loc[is_target].index

    # pick the matching sequences out of the full sequence series
    all_seqs = sequences.view(pd.Series)
    target_seqs = all_seqs[target_ids]

    # wrap the subset in an artifact and export it to the given path
    Artifact.import_data('FeatureData[Sequence]',
                         target_seqs).export_data(path)
Exemplo n.º 46
0
    def test_integration(self):
        """Run the sidle workflow on the bundled integration dataset.

        Exercises region extraction, regional alignment, database
        reconstruction and count reconstruction, comparing each output
        with the artifacts stored in the ``known`` directory.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        base_dir = os.path.join(here, 'files/integration')
        test_dir = os.path.join(base_dir, 'test')
        known_dir = os.path.join(base_dir, 'known')
        data_dir = os.path.join(base_dir, 'data')
        # remove the test directory if it is present
        if os.path.exists(test_dir):
            shutil.rmtree(test_dir)

        def load_input(name):
            # artifact from the input data directory
            return Artifact.load(os.path.join(data_dir, name))

        def load_known(name):
            # artifact holding an expected result
            return Artifact.load(os.path.join(known_dir, name))

        def frame(artifact):
            return artifact.view(pd.DataFrame)

        def str_series(artifact):
            return artifact.view(pd.Series).astype(str)

        def sorted_alignment(artifact):
            return frame(artifact).sort_values(['kmer', 'asv'])

        # --- Sequence extraction ---
        region1_seqs, region1_map = sidle.prepare_extracted_region(
            load_input('region1-extract-seqs.qza'),
            fwd_primer='TGGCGGACGGGTGAGTAA',
            rev_primer='CTGCTGCCTCCCGTAGGA',
            trim_length=50,
            region='1',
            debug=True,
        )
        expected = load_known('region1-kmer-seqs.qza')
        pdt.assert_series_equal(str_series(region1_seqs),
                                str_series(expected))
        expected = load_known('region1-kmer-map.qza')
        # only the region 1 map is compared after sorting both indexes
        pdt.assert_frame_equal(frame(expected).sort_index(),
                               frame(region1_map).sort_index())

        region2_seqs, region2_map = sidle.prepare_extracted_region(
            load_input('region2-extract-seqs.qza'),
            fwd_primer='CAGCAGCCGCGGTAATAC',
            rev_primer='CGCATTTCACCGCTACAC',
            trim_length=50,
            region='2',
            debug=True,
        )
        expected = load_known('region2-kmer-seqs.qza')
        pdt.assert_series_equal(str_series(region2_seqs),
                                str_series(expected))
        expected = load_known('region2-kmer-map.qza')
        pdt.assert_frame_equal(frame(expected), frame(region2_map))

        region3_seqs, region3_map = sidle.prepare_extracted_region(
            load_input('region3-extract-seqs.qza'),
            fwd_primer='GCACAAGCGGTGGAGCAT',
            rev_primer='CGCTCGTTGCGGGACTTA',
            trim_length=50,
            region='3',
            debug=True,
        )
        expected = load_known('region3-kmer-seqs.qza')
        pdt.assert_series_equal(str_series(region3_seqs),
                                str_series(expected))
        expected = load_known('region3-kmer-map.qza')
        pdt.assert_frame_equal(frame(expected), frame(region3_map))

        # --- Regional alignment ---
        # region 1 is the only alignment run with an explicit chunk_size
        align1 = sidle.align_regional_kmers(
            region1_seqs,
            load_input('region1-rep-seq.qza'),
            region='1',
            max_mismatch=2,
            debug=True,
            chunk_size=1,
        ).regional_alignment
        expected = load_known('region1-align-map.qza')
        pdt.assert_frame_equal(sorted_alignment(align1), frame(expected))

        align2 = sidle.align_regional_kmers(
            region2_seqs,
            load_input('region2-rep-seq.qza'),
            region='2',
            max_mismatch=2,
            debug=True,
        ).regional_alignment
        expected = load_known('region2-align-map.qza')
        pdt.assert_frame_equal(sorted_alignment(align2), frame(expected))

        align3 = sidle.align_regional_kmers(
            region3_seqs,
            load_input('region3-rep-seq.qza'),
            region='3',
            max_mismatch=2,
            debug=True,
        ).regional_alignment
        expected = load_known('region3-align-map.qza')
        pdt.assert_frame_equal(sorted_alignment(align3), frame(expected))

        count1 = load_input('region1-counts.qza')
        count2 = load_input('region2-counts.qza')
        count3 = load_input('region3-counts.qza')

        # --- Reconstruction ---
        regions = ['1', '2', '3']
        alignments = [align1, align2, align3]
        map_, summary = sidle.reconstruct_database(
            region=regions,
            kmer_map=[region1_map, region2_map, region3_map],
            regional_alignment=alignments,
            count_degenerates=False,
            debug=True,
        )

        def check_summary():
            # ASV mapping was optional in the original sidle and is tested
            # elsewhere, so the 'mapped-asvs' column is dropped first.
            known_summary = load_known('reconstructed-summary.qza')
            pdt.assert_frame_equal(
                frame(known_summary),
                frame(summary).drop(columns=['mapped-asvs']))

        check_summary()
        expected = load_known('sidle-reconstruction.qza')
        pdt.assert_series_equal(
            expected.view(pd.Series).sort_index(), map_.view(pd.Series))

        table = sidle.reconstruct_counts(
            region=['1', '2', '3'],
            regional_alignment=[align1, align2, align3],
            regional_table=[count1, count2, count3],
            database_map=map_,
            database_summary=summary,
            debug=True,
            min_counts=100,
            min_abund=1e-5,
        ).reconstructed_table
        expected = load_known('reconstructed-table.qza')
        pdt.assert_frame_equal(frame(expected), frame(table))
        # the summary is compared once more after the count reconstruction
        check_summary()
Exemplo n.º 47
0
 def test_reconstruct_fragment_rep_seqs(self):
     """Check the fragments returned for a small reconstruction fixture."""
     # reconstruction map: one row per database sequence
     map_columns = [
         'clean_name', 'first-region', 'first-fwd-primer',
         'last-region', 'last-fwd-primer', 'last-kmer-length'
     ]
     map_values = np.array([
         ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
         ['seq01|seq02', 0, 'WANTCAT', 0, 'WANTCAT', 15],
         ['seq03|seq04', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
         ['seq03|seq04', 0, 'CACCTCGTN', 1, 'CACCTCGTN', 15],
         ['seq05', 0, 'WANTCAT', 1, 'CACCTCGTN', 15],
     ], dtype=object)
     map_frame = pd.DataFrame(
         data=map_values,
         index=pd.Index(['seq01', 'seq02', 'seq03', 'seq04', 'seq05'],
                        name='db-seq'),
         columns=map_columns,
     )
     recon_map = Artifact.import_data('FeatureData[SidleReconstruction]',
                                      map_frame)

     # reconstruction summary: one row per reconstructed feature
     summary_frame = pd.DataFrame(
         data=[[1, 2, 2, 0, 'asv01|asv02'],
               [2, 3, 1.5, np.std([1, 2], ddof=1), 'asv03|asv04'],
               [2, 2, 1, 0, 'asv07|asv08']],
         index=pd.Index(['seq01|seq02', 'seq03|seq04', 'seq05'],
                        name='feature-id'),
         columns=[
             'num-regions', 'total-kmers-mapped',
             'mean-kmer-per-region', 'stdv-kmer-per-region',
             'mapped-asvs'
         ])
     recon_summary = Artifact.import_data(
         'FeatureData[ReconstructionSummary]', Metadata(summary_frame))

     # aligned database sequences as (id, sequence) pairs
     aligned_records = [
         ('seq01',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq02',
          'CTAGTCATGCGAAGCGGCTCAGGATGATGATGAAGAC-------------------'
          '--------------'),
         ('seq03',
          'CATAGTCATWTCCGCGTTGGAGTTATGATGATGAWACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGC-'),
         ('seq04',
          '------------------GGAGTTATGATGA--AGACCACCTCGTCCCAGTTCCGC'
          'GCTTCTGACGTGCC'),
         ('seq05',
          'CATAGTCATCGTTTATGTATGCCCATGATGATGCGAGCACCTCGTATGGATGTAGA'
          'GCCACTGACGTGCG'),
     ]
     msa = skbio.TabularMSA([
         DNA(bases, metadata={'id': seq_id})
         for seq_id, bases in aligned_records
     ])
     aligned_seqs = Artifact.import_data('FeatureData[AlignedSequence]',
                                         msa)

     known = pd.Series(
         data=[
             'GCGAAGCGGCTCAGG',
             'WTCCGCGTTGGAGTTATGATGATGAGACCACCTCGTCCCAGTTCCGCGCTTC'
         ],
         index=pd.Index(['seq01|seq02', 'seq03|seq04']),
     )

     results = sidle.reconstruct_fragment_rep_seqs(
         reconstruction_map=recon_map,
         reconstruction_summary=recon_summary,
         aligned_sequences=aligned_seqs,
     )
     test = results.representative_fragments
     pdt.assert_series_equal(known, test.view(pd.Series).astype(str))
Exemplo n.º 48
0
    def test_mismatch_nested(self):
        """A 'C1[Foo]' / 'Foo' input pair must raise 'No solution ... C1'."""
        nested = Artifact.import_data('C1[Foo]', "element 1", view_type=str)
        plain = Artifact.import_data('Foo', "element 2", view_type=str)

        with self.assertRaisesRegex(ValueError, 'No solution.*C1'):
            _viz, = self.run_action(a=nested, b=plain)
Exemplo n.º 49
0
    def test_intsequence2(self):
        """The action output for an IntSequence2 input is IntSequence2."""
        ints_artifact = Artifact.import_data('IntSequence2', [1])

        results = self.run_action(ints=ints_artifact,
                                  strs1=['a'],
                                  strs2={'a'})
        self.assertEqual(results.output.type, IntSequence2)
Exemplo n.º 50
0
    tax = re.sub('; ', ';', tax)
    tax = re.sub(r'[\-| ]+', '_', tax)
    tax = re.sub(';$', '', tax)
    return(tax)


def ch_col(df, index, val):
    """Rename the column at position ``index`` to ``val``.

    The frame is modified in place and also returned.
    """
    renamed = [*df.columns]
    renamed[index] = val
    df.columns = renamed
    return df


# load otu table from qiime2 Artifact
if opt.otu.endswith('.qza'):
    otu = Artifact.load(opt.otu)
    otu = otu.view(DataFrame)
    taxon = Artifact.load(opt.taxon)
    taxon = taxon.view(DataFrame)
# otherwise load otu table from tsv table
else:
    skip_rows = []
    with open(opt.otu) as f:
        for num, line in enumerate(f):
            if len(line.split('\t')) < 2:
                skip_rows.append(num)
    otu = read_csv(opt.otu, sep='\t', skiprows=skip_rows, index_col=0)
    otu_t = otu.iloc[:, -1]
    taxon = DataFrame({
        'Taxon': otu_t.copy(),
        'Confidence': [0.9] * otu_t.shape[0]
Exemplo n.º 51
0
def main(sequence_artifact, seq_samplesize, output_dir):
    """Summarize a loaded sequence artifact and export the visualization.

    ``sequence_artifact`` is loaded with ``Artifact.load``,
    ``seq_samplesize`` is forwarded as ``n`` to ``demux`` summarize, and
    the resulting visualization is exported to ``output_dir``.
    """
    summary = demux.visualizers.summarize(
        Artifact.load(sequence_artifact), n=seq_samplesize)
    summary.visualization.export_data(output_dir)
    help="the reference_reads path")
# Optional path to the reference taxonomy artifact; the default is a
# machine-specific absolute path.
parser.add_argument(
    '-t',
    '--ref_taxa',
    dest='reference_taxonomy',
    type=str,
    required=False,
    default="/mnt/d/Lab/TaxaIdentification/classifier/silva-138-99-tax.qza",
    help="the reference_taxonomy path")

args = parser.parse_args()

# Normalize every path argument to an absolute path.
inputDir = os.path.abspath(args.fileDir)
outputDir = os.path.abspath(args.OpDir)
ref_reads = os.path.abspath(args.reference_reads)
ref_taxa = os.path.abspath(args.reference_taxonomy)

# Import the query sequences found at inputDir as a sequence artifact.
artifact = Artifact.import_data('FeatureData[Sequence]', inputDir)

# Load the reference reads and reference taxonomy artifacts.
reference_reads = Artifact.load(ref_reads)
reference_taxonomy = Artifact.load(ref_taxa)

# Classify the query sequences against the reference (8 threads).
taxonomy = classify_consensus_vsearch(artifact,
                                      reference_reads,
                                      reference_taxonomy,
                                      threads=8)

# Build the alignment/tree results; the second positional argument is 8
# (presumably the thread count -- confirm against the action signature).
mafft_alignment = align_to_tree_mafft_fasttree(artifact, 8)

# export_data is called through the class with the artifact as the first
# argument. Both results are exported into the same outputDir.
Artifact.export_data(taxonomy.classification, outputDir)
Artifact.export_data(mafft_alignment.tree, outputDir)
Exemplo n.º 53
0
 def test_beta_empty_table(self):
     """``self.beta`` must raise ValueError ('empty') for an empty table."""
     empty_biom = Table(np.array([]), [], [])
     empty_artifact = Artifact.import_data('FeatureTable[Frequency]',
                                           empty_biom)
     with self.assertRaisesRegex(ValueError, 'empty'):
         self.beta(table=empty_artifact, metric='braycurtis')
Exemplo n.º 54
0
def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it is missing."""
    parent = os.path.dirname(str(path))
    # A bare file name has no parent component; there is nothing to create.
    if parent:
        os.makedirs(parent, exist_ok=True)


def tada(phylogeny: NewickFormat, table: biom.Table,
         meta_data: NumericMetadataColumn = None,
         seed_num: Int = 0, xgen: Int = 0,
         n_beta: Int = 1, n_binom: Int = 5, var_method: Str = 'br_penalized',
         stat_method: Str = 'binom', prior_weight: Float = 0,
         coef: Float = 200, exponent: Float = 0.5,
         pseudo_branch_length: Float = 1e-6, pseudo_cnt: Float = 5,
         normalized: Bool = False, output_log_fp: Str = None,
         original_table: Str = None, augmented_table: Str = None,
         concatenate_meta: Metadata = None,
         sampling_strategy: Str = None) -> (NewickFormat, biom.Table):
    """Generate augmented samples and return them merged with the originals.

    Inputs are prepared by ``_read_inputs``; a ``SampleGenerator`` configured
    from the keyword arguments produces the original and augmented tables.

    Optional outputs:
        - concatenate_meta: where the combined original + augmented labels
          are saved; required when the generation strategy is 'balancing'.
        - output_log_fp: destination for a copy of the generator's log file.
        - original_table / augmented_table: paths at which the respective
          tables are saved as FeatureTable[Frequency] artifacts.
    Missing parent directories of these paths are created.

    Returns:
        (pruned_phylogeny, concat_biom) where ``concat_biom`` is the
        original table merged with the augmented table.

    Raises:
        ValueError: if 'balancing' is used without ``concatenate_meta``, if
            the generator's original table does not match the input table,
            or if original and augmented feature ids differ.
    """
    _table, y, _phylogeny, generate_strategy, pruned_phylogeny = \
        _read_inputs(biom_table=table, phylogeny_fp=phylogeny,
                     meta_data=meta_data)
    # Compare strings by value; identity (`is`) only works by accident.
    balancing = generate_strategy == 'balancing'
    if balancing and (concatenate_meta is None):
        raise ValueError(
            "Expected a path to write out the generated and original labels and metadata!"
        )
    tmp = tempfile.mkdtemp()
    try:
        sG = SampleGenerator(seed_num=seed_num, logger=None,
                             generate_strategy=generate_strategy, tmp_dir=tmp,
                             xgen=xgen, n_beta=n_beta, n_binom=n_binom,
                             var_method=var_method, stat_method=stat_method,
                             prior_weight=prior_weight,
                             coef=coef, exponent=exponent,
                             pseudo=pseudo_branch_length,
                             pseudo_cnt=pseudo_cnt, normalized=normalized)

        orig_biom, orig_labels, augm_biom, augm_labels = \
            sG.fit_transform(table=_table, y=y, tree=_phylogeny,
                             sampling_strategy=sampling_strategy)
        # NOTE(review): this is a signed sum, so positive and negative
        # differences can cancel; left unchanged to keep accepting the
        # inputs it accepts today.
        if np.sum(orig_biom.matrix_data - table.matrix_data) > 1e-20:
            raise ValueError(
                "The original biom table doesn't match the "
                "output of generator function! Please double check")
        if balancing:
            orig_pd, augm_pd = make_data_frame(orig_biom, augm_biom,
                                               orig_labels, augm_labels)
            _ensure_parent_dir(concatenate_meta)

            concat_pd = pd.concat([orig_pd, augm_pd])
            concat_meta = qiime2.Metadata(concat_pd)
            concat_meta.save(concatenate_meta)

        if output_log_fp is not None:
            _ensure_parent_dir(output_log_fp)
            shutil.copyfile(sG.log_fp, output_log_fp)
        if not np.array_equal(orig_biom.ids('observation'),
                              augm_biom.ids('observation')):
            raise ValueError(
                "The order of features in original and augmented data "
                "is different. Please make sure that your phylogeny doesn't "
                "have extra features"
            )
        # Each output path gets its own parent directory; previously the
        # augmented-table directory was skipped whenever original_table was
        # given, and the existence check looked at the wrong path.
        if augmented_table is not None:
            _ensure_parent_dir(augmented_table)
            augm_qza = \
                Artifact.import_data("FeatureTable[Frequency]", augm_biom)
            augm_qza.save(augmented_table)
        if original_table is not None:
            _ensure_parent_dir(original_table)
            orig_qza = \
                Artifact.import_data("FeatureTable[Frequency]", orig_biom)
            orig_qza.save(original_table)

        concat_biom = orig_biom.merge(augm_biom)
    except Exception:
        # Report failures only; the message used to print on success too.
        print("Something went wrong")
        raise
    finally:
        # The scratch directory is no longer needed once the results are in
        # memory and the log (if requested) has been copied out.
        shutil.rmtree(tmp, ignore_errors=True)

    return pruned_phylogeny, concat_biom
Exemplo n.º 55
0
    def test_mismatch(self):
        """An input typed "Foo % Properties('X')" must raise TypeError."""
        with_property = Artifact.import_data("Foo % Properties('X')",
                                             'element 1',
                                             view_type=str)

        with self.assertRaises(TypeError):
            self.run_action(a=with_property)
Exemplo n.º 56
0
    def test_true(self):
        """With ``b=True`` the single output has type 'C1[Foo]'."""
        bar_artifact = Artifact.import_data('Bar', 'element', view_type=str)

        output, = self.run_action(a=bar_artifact, b=True)

        self.assertEqual(repr(output.type), 'C1[Foo]')
Exemplo n.º 57
0
def run_integration_test(
    input_dir_name,
    output_dir_name,
    ranks_name,
    table_name,
    sample_metadata_name,
    feature_metadata_name=None,
    use_q2=False,
    q2_ranking_tool="songbird",
    expected_unsupported_samples=0,
    expected_unsupported_features=0,
    expect_all_unsupported_samples=False,
    q2_table_biom_format="BIOMV210Format",
    extreme_feature_count=None,
):
    """Run Qurro on a test dataset and do a high-level check of the output.

    Input files are looked up under ``qurro/tests/input/<input_dir_name>``
    and the visualization is written to ``docs/demos/<output_dir_name>``.
    Qurro is run either through its QIIME 2 plugin (``use_q2=True``) or
    through the standalone command line interface (the default).

    This is a coarse check only: it does not cover the many corner cases
    that can come up when running Qurro. Its purpose is to confirm that
    things look right overall and that the data is faithfully represented
    in the generated main.js file.

    The ``expected_unsupported_samples``, ``expected_unsupported_features``
    and ``expect_all_unsupported_samples`` values are forwarded to
    ``validate_standalone_result`` in standalone mode only.

    Returns ``(None, None)`` when the run was expected to fail on invalid
    input (``expect_all_unsupported_samples`` is truthy or
    ``expected_unsupported_features > 0``); otherwise returns the rank,
    sample and count JSONs produced by ``validate_main_js``.

    Raises ``ValueError`` if ``use_q2`` is set and ``q2_ranking_tool`` is
    neither ``"songbird"`` nor ``"DEICODE"``.
    """

    # Resolve every input/output location up front.
    input_dir = os.path.join("qurro", "tests", "input", input_dir_name)
    ranks_path = os.path.join(input_dir, ranks_name)
    table_path = os.path.join(input_dir, table_name)
    sample_md_path = os.path.join(input_dir, sample_metadata_name)
    feature_md_path = (
        None
        if feature_metadata_name is None
        else os.path.join(input_dir, feature_metadata_name)
    )
    output_dir = os.path.join("docs", "demos", output_dir_name)

    if not use_q2:
        # Standalone mode, i.e. Qurro is run outside of QIIME 2.
        cli_args = [
            "--ranks",
            ranks_path,
            "--table",
            table_path,
            "--sample-metadata",
            sample_md_path,
            "--output-dir",
            output_dir,
        ]
        if feature_md_path is not None:
            cli_args.extend(["--feature-metadata", feature_md_path])
        if extreme_feature_count is not None:
            cli_args.extend(
                ["--extreme-feature-count", extreme_feature_count]
            )
        cli_result = CliRunner().invoke(rrvp.plot, cli_args)
        # Check that the exit code and the recorded output are as expected.
        validate_standalone_result(
            cli_result,
            expected_unsupported_samples=expected_unsupported_samples,
            expect_all_unsupported_samples=expect_all_unsupported_samples,
            expected_unsupported_features=expected_unsupported_features,
        )
    else:
        # QIIME 2 mode: pick the plugin action and the semantic type of the
        # ranks artifact that belong to the selected ranking tool.
        if q2_ranking_tool == "songbird":
            plugin_action = q2qurro.actions.differential_plot
            rank_semantic_type = "FeatureData[Differential]"
        elif q2_ranking_tool == "DEICODE":
            plugin_action = q2qurro.actions.loading_plot
            rank_semantic_type = "PCoAResults % Properties(['biplot'])"
        else:
            raise ValueError(
                "Unknown q2_ranking_tool: {}".format(q2_ranking_tool)
            )

        # Bring the input files into QIIME 2 as artifacts / metadata.
        ranks_artifact = Artifact.import_data(rank_semantic_type, ranks_path)
        table_artifact = Artifact.import_data(
            "FeatureTable[Frequency]",
            table_path,
            view_type=q2_table_biom_format,
        )
        sample_md = Metadata.load(sample_md_path)
        feature_md = (
            Metadata.load(feature_md_path)
            if feature_md_path is not None
            else None
        )

        # Everything is imported; run the Qurro action and dump the
        # resulting visualization into the output directory.
        action_results = plugin_action(
            ranks=ranks_artifact,
            table=table_artifact,
            sample_metadata=sample_md,
            feature_metadata=feature_md,
            extreme_feature_count=extreme_feature_count,
        )
        action_results.visualization.export_data(output_dir)

    # A run that was expected to fail on invalid inputs produces no output
    # files (input validity checking is done in generate.process_input(),
    # before generate.gen_visualization() creates anything), so there is
    # nothing to validate.
    if expect_all_unsupported_samples or expected_unsupported_features > 0:
        return None, None

    # The JSONs are only validated when no extreme feature count (-x) was
    # passed.
    ranks_json, samples_json, counts_json = validate_main_js(
        output_dir,
        ranks_path,
        table_path,
        sample_md_path,
        validate_jsons=extreme_feature_count is None,
    )
    return ranks_json, samples_json, counts_json
Exemplo n.º 58
0
    def test_false(self):
        """With ``b=False`` the single result of the action is a ``Foo``."""
        bar_artifact = Artifact.import_data('Bar', 'element', view_type=str)

        # The action is expected to yield exactly one output.
        (output,) = self.run_action(a=bar_artifact, b=False)

        observed_type = repr(output.type)
        self.assertEqual(observed_type, 'Foo')
Exemplo n.º 59
0
    def run(self, factory):
        """Prepare and submit a hyperparameter search per configuration.

        The factory is validated and checked for unique names. For every
        configuration it generates, the data is preprocessed, the resulting
        table and target are saved as QIIME 2 artifacts under
        ``dataset/imsms-mlab/<analysis_name>/disease_binary``, and the
        search script returned by ``orchestrate_hyperparameter_search`` is
        submitted with ``qsub`` as an array job.

        A failure while handling one configuration (including a non-zero
        exit status from ``qsub``) is printed together with its traceback;
        the remaining configurations are still processed.

        :param factory: object providing ``validate()`` and
            ``gen_configurations()``.
        """
        factory.validate()
        _check_unique_names(factory)

        # Generate the configurations exactly once so that the callbacks and
        # the loop below work on the same list.
        configs = list(factory.gen_configurations())
        self.callbacks.batch_info(configs)

        # Fixed layout of the result tree.
        base_dir = "dataset"
        dataset = "imsms-mlab"
        target_name = "disease_binary"

        # Run job indices 1...10 inclusive, letting 5 jobs run at once.
        # Each job has 100 parameter sets, for a total of 1000 parameter sets.
        start = 1
        end = 10
        n_concurrent_jobs = 5
        chunk_size = 100

        for config in configs:
            try:
                print(config.analysis_name)

                # Run this custom preprocessing
                final_biom, target, _, _ = run_preprocessing(
                    config, self.callbacks)

                preparation = config.analysis_name
                algorithm = config.mlab_algorithm
                if algorithm is None:
                    algorithm = "RandomForestClassifier"

                # Create the expected file structure
                results_dir = path.join(base_dir, dataset, preparation,
                                        target_name)
                if not path.exists(results_dir):
                    makedirs(results_dir)

                # Save table and metadata
                table_fp = path.join(results_dir, "filtered_table.qza")
                table_artifact = Artifact.import_data(
                    "FeatureTable[Frequency]", final_biom)
                table_artifact.save(table_fp)

                metadata_fp = path.join(results_dir, "filtered_metadata.qza")
                metadata_artifact = Artifact.import_data(
                    "SampleData[Target]", target)
                metadata_artifact.save(metadata_fp)

                # Only the generated script path is needed for submission.
                script_fp, _params_fp, _run_info_fp = \
                    orchestrate_hyperparameter_search(
                        dataset=dataset,
                        preparation=preparation,
                        target=target_name,
                        algorithm=algorithm,
                        repeats=3,  # num CV repeats
                        # Directory with mlab structure containing datasets
                        base_dir=base_dir,
                        ppn=4,  # processors per node
                        memory=32,  # memory in GB
                        wall=50,  # walltime in hours
                        # num parameter sets to run per job
                        chunk_size=chunk_size,
                        randomize=True,  # randomly shuffle parameter sets
                        force=False,  # force overwrite of existing results
                        dry=False,  # dry runs
                        dataset_path=table_fp,
                        metadata_path=metadata_fp,
                    )

                cmd = [
                    "qsub", "-t", f"{start}-{end}%{n_concurrent_jobs}",
                    script_fp
                ]
                # check=True turns a failed submission into an exception so
                # that it is reported by the handler below instead of being
                # silently ignored.
                subprocess.run(cmd, check=True)

            except Exception:
                print(f"TEST FAILURE. CONFIG: {config.analysis_name}")
                traceback.print_exc()
Exemplo n.º 60
0
 def setUp(self):
     self.q2table = Artifact.import_data("FeatureTable[Frequency]",
                                         create_test_table())