Exemplo n.º 1
0
    def test_no_artifacts(self):
        # Merging two Metadata objects built directly from DataFrames should
        # yield a result whose `artifacts` attribute is an empty tuple.
        left_frame = pd.DataFrame(
            {'a': [1, 2]}, index=pd.Index(['id1', 'id2'], name='id'))
        left = Metadata(left_frame)

        right_frame = pd.DataFrame(
            {'b': [3, 4]}, index=pd.Index(['id1', 'id2'], name='id'))
        right = Metadata(right_frame)

        merged = left.merge(right)

        self.assertEqual(merged.artifacts, ())
Exemplo n.º 2
0
    def test_id_column_only(self):
        # Merge three column-less Metadata objects. Only 'id1' and 'id2'
        # appear in all three ID lists, so those are the expected result.
        id_lists = (
            ['id1', 'id2', 'id3'],
            ['id2', 'X', 'id1'],
            ['id1', 'id3', 'id2'],
        )
        first, second, third = [
            Metadata(pd.DataFrame({}, index=pd.Index(ids, name='id')))
            for ids in id_lists
        ]

        obs = first.merge(second, third)

        exp_index = pd.Index(['id1', 'id2'], name='id')
        exp = Metadata(pd.DataFrame({}, index=exp_index))
        self.assertEqual(obs, exp)
Exemplo n.º 3
0
    def test_empty_metadata(self):
        # A DataFrame without any IDs must be rejected with a ValueError,
        # whether or not it declares columns.
        expected_msg = 'Metadata.*at least one ID'

        # No index, no columns.
        without_columns = pd.DataFrame([], index=pd.Index([], name='id'))
        with self.assertRaisesRegex(ValueError, expected_msg):
            Metadata(without_columns)

        # No index, has columns.
        with_columns = pd.DataFrame(
            [], index=pd.Index([], name='id'), columns=['a', 'b'])
        with self.assertRaisesRegex(ValueError, expected_msg):
            Metadata(with_columns)
Exemplo n.º 4
0
    def test_invalid_column_dtype_w_null(self):
        # A null column name (float NaN, then None) is expected to raise a
        # TypeError whose message names the offending value.
        cases = (
            (float('nan'), 'non-string.*column name.*nan'),
            (None, 'non-string.*column name.*None'),
        )
        for null_name, pattern in cases:
            columns = pd.Index(['a', null_name], dtype=object)
            with self.assertRaisesRegex(TypeError, pattern):
                # The DataFrame is built inside the context manager, as in
                # the original layout of this test.
                frame = pd.DataFrame(
                    [['val1', 'val2']],
                    index=pd.Index(['x'], name='id'),
                    columns=columns)
                Metadata(frame)
Exemplo n.º 5
0
    def test_merged_id_column_name(self):
        # The two inputs name their ID column differently ('sample ID' and
        # 'feature ID'); the merged result is expected to use 'id'.
        sample_index = pd.Index(['id1', 'id2'], name='sample ID')
        sample_md = Metadata(pd.DataFrame({'a': [1, 2]}, index=sample_index))

        feature_index = pd.Index(['id1', 'id2'], name='feature ID')
        feature_md = Metadata(
            pd.DataFrame({'b': [3, 4]}, index=feature_index))

        obs = sample_md.merge(feature_md)

        exp_index = pd.Index(['id1', 'id2'], name='id')
        exp_frame = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=exp_index)
        exp = Metadata(exp_frame)
        self.assertEqual(obs, exp)
Exemplo n.º 6
0
    def test_inner_join(self):
        # Merging is expected to keep only the IDs shared by every Metadata
        # object involved. The expected frames list the shared IDs in the
        # order they appear in md1.
        frame1 = pd.DataFrame(
            {'a': [1, 2, 3], 'b': [4, 5, 6]},
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        frame2 = pd.DataFrame(
            {'c': [7, 8, 9], 'd': [10, 11, 12]},
            index=pd.Index(['id2', 'X', 'Y'], name='id'))
        frame3 = pd.DataFrame(
            {'e': [13, 14, 15], 'f': [16, 17, 18]},
            index=pd.Index(['X', 'id3', 'id2'], name='id'))
        md1 = Metadata(frame1)
        md2 = Metadata(frame2)
        md3 = Metadata(frame3)

        # Single shared ID.
        obs = md1.merge(md2, md3)

        exp_frame = pd.DataFrame(
            {'a': [2], 'b': [5], 'c': [7], 'd': [10], 'e': [15], 'f': [18]},
            index=pd.Index(['id2'], name='id'))
        self.assertEqual(obs, Metadata(exp_frame))

        # Multiple shared IDs.
        obs = md1.merge(md3)

        exp_frame = pd.DataFrame(
            {'a': [2, 3], 'b': [5, 6], 'e': [15, 14], 'f': [18, 17]},
            index=pd.Index(['id2', 'id3'], name='id'))
        self.assertEqual(obs, Metadata(exp_frame))
Exemplo n.º 7
0
    def test_equality_without_artifact(self):
        # Two Metadata objects built independently from identical data are
        # expected to compare equal.
        def build():
            frame = pd.DataFrame(
                {'a': '1', 'b': '3'}, index=pd.Index(['0'], name='id'))
            return Metadata(frame)

        first = build()
        second = build()

        self.assertReallyEqual(first, second)
Exemplo n.º 8
0
    def test_merging_two(self):
        # Two Metadata objects with identical IDs and distinct columns are
        # expected to merge into one object holding all four columns.
        left_frame = pd.DataFrame(
            {'a': [1, 2, 3], 'b': [4, 5, 6]},
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        right_frame = pd.DataFrame(
            {'c': [7, 8, 9], 'd': [10, 11, 12]},
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        left = Metadata(left_frame)
        right = Metadata(right_frame)

        obs = left.merge(right)

        exp_columns = {
            'a': [1, 2, 3],
            'b': [4, 5, 6],
            'c': [7, 8, 9],
            'd': [10, 11, 12],
        }
        exp_index = pd.Index(['id1', 'id2', 'id3'], name='id')
        exp = Metadata(pd.DataFrame(exp_columns, index=exp_index))
        self.assertEqual(obs, exp)
Exemplo n.º 9
0
    def test_data_mismatch(self):
        # Same ID and column names, but column 'b' holds '3' in one object
        # and '2' in the other, so the two must not compare equal.
        def build(b_value):
            frame = pd.DataFrame(
                {'a': '1', 'b': b_value}, index=pd.Index(['0'], name='id'))
            return Metadata(frame)

        first = build('3')
        second = build('2')

        self.assertReallyNotEqual(first, second)
Exemplo n.º 10
0
    def test_case_insensitive_duplicate_column_names(self):
        # Column names differing only by case ('column' vs 'Column') are
        # expected to be accepted and kept as two separate columns.
        row_ids = pd.Index(['a', 'b', 'c'], name='id')
        cells = {
            'column': ['1', '2', '3'],
            'Column': ['4', '5', '6'],
        }
        metadata = Metadata(pd.DataFrame(cells, index=row_ids))

        observed_names = set(metadata.columns)
        self.assertEqual(observed_names, {'column', 'Column'})
Exemplo n.º 11
0
    def test_duplicate_indices(self):
        # The ID 'b' appears twice; construction is expected to fail with a
        # ValueError that names the duplicated ID.
        row_ids = pd.Index(['a', 'b', 'b'], name='id', dtype=object)
        frame = pd.DataFrame({'foo': [1, 2, 3]}, index=row_ids)

        expected_msg = "IDs must be unique.*'b'"
        with self.assertRaisesRegex(ValueError, expected_msg):
            Metadata(frame)
Exemplo n.º 12
0
    def test_invalid_index_dtype_w_null(self):
        # A null ID (float NaN, then None) is expected to raise a TypeError
        # whose message names the offending value.
        cases = (
            (['a', float('nan'), 'b'], 'non-string.*ID.*nan'),
            (['a', None, 'c'], 'non-string.*ID.*None'),
        )
        for ids, pattern in cases:
            index = pd.Index(ids, name='id', dtype=object)
            with self.assertRaisesRegex(TypeError, pattern):
                # The DataFrame is built inside the context manager, as in
                # the original layout of this test.
                frame = pd.DataFrame(
                    {'x': [1, 2, 3], 'y': [4, 5, 6]}, index=index)
                Metadata(frame)
Exemplo n.º 13
0
    def test_various_numbers(self):
        # Save a single numeric column covering zero, negative zero, NaN,
        # small/large magnitudes and a value beyond 15 significant digits,
        # then compare the written file text against the expected TSV.
        numbers = [
            0.0, -0.0, np.nan, 1.0, 42.0, -33.0, 1e-10, 1.5e15, 0.0003, -4.234,
            # This last number should be rounded because it exceeds 15 digits
            # of precision.
            12.34567891234567
        ]
        # IDs run from 'id1' through 'id11', one per number.
        ids = ['id%d' % position for position in range(1, 12)]
        frame = pd.DataFrame({'numbers': numbers},
                             index=pd.Index(ids, name='ID'))
        md = Metadata(frame)

        md.save(self.filepath)

        with open(self.filepath, 'r') as handle:
            obs = handle.read()

        exp_rows = [
            "ID\tnumbers",
            "#q2:types\tnumeric",
            "id1\t0",
            "id2\t-0",
            "id3\t",
            "id4\t1",
            "id5\t42",
            "id6\t-33",
            "id7\t1e-10",
            "id8\t1.5e+15",
            "id9\t0.0003",
            "id10\t-4.234",
            "id11\t12.3456789123457",
        ]
        # Every row, including the last, is newline-terminated.
        exp = "".join(row + "\n" for row in exp_rows)

        self.assertEqual(obs, exp)
Exemplo n.º 14
0
    def test_ids_and_column_names_as_numeric_strings(self):
        # IDs and column names that look like numbers are strings here; the
        # expected file text keeps them verbatim (e.g. '0.004000', '42.0')
        # while the numeric cell values are written in a shortened form.
        row_ids = pd.Index(['0.000001', '0.004000', '0.000000'],
                           dtype=object, name='id')
        column_names = ['42.0', '1000', '-4.2']
        rows = [
            [2.0, 'b', 2.5],
            [1.0, 'b', 4.2],
            [3.0, 'c', -9.999]
        ]
        md = Metadata(
            pd.DataFrame(rows, index=row_ids, columns=column_names))

        md.save(self.filepath)

        with open(self.filepath, 'r') as handle:
            obs = handle.read()

        exp_rows = [
            "id\t42.0\t1000\t-4.2",
            "#q2:types\tnumeric\tcategorical\tnumeric",
            "0.000001\t2\tb\t2.5",
            "0.004000\t1\tb\t4.2",
            "0.000000\t3\tc\t-9.999",
        ]
        # Every row, including the last, is newline-terminated.
        exp = "".join(row + "\n" for row in exp_rows)

        self.assertEqual(obs, exp)
Exemplo n.º 15
0
    def test_duplicate_columns_self_merge(self):
        # Merging a Metadata object with itself makes every column overlap;
        # a ValueError listing both 'a' and 'b' is expected.
        row_ids = pd.Index(['id1', 'id2'], name='id')
        frame = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}, index=row_ids)
        md = Metadata(frame)

        expected_msg = "columns overlap: 'a', 'b'"
        with self.assertRaisesRegex(ValueError, expected_msg):
            md.merge(md)
Exemplo n.º 16
0
    def test_valid_metadata_id_column_only(self):
        # A DataFrame with three IDs and no columns is expected to be valid
        # and to report id_count == 3 and column_count == 0.
        row_ids = pd.Index(['a', 'b', 'c'], name='ID', dtype=object)
        metadata = Metadata(pd.DataFrame({}, index=row_ids, dtype=object))

        observed_counts = (metadata.id_count, metadata.column_count)
        self.assertEqual(observed_counts[0], 3)
        self.assertEqual(observed_counts[1], 0)
Exemplo n.º 17
0
    def test_non_standard_characters(self):
        # Checks that unusual characters in IDs, column names and cells are
        # loaded correctly. This is not exhaustive (it does not cover every
        # Unicode character; that would be a useful future addition). It is
        # meant as an integration test of how robust the reader is to
        # non-standard data. Many of the characters, and where they sit in
        # the data file, come from use-cases/bugs reported on the forum,
        # Slack, etc. The data file carries comments explaining these
        # choices in more detail.
        obs_md = Metadata.load(
            get_data_path('valid/non-standard-characters.tsv'))

        exp_columns = [
            '↩c@l1™', 'col(#2)', "#col'3", '"<col_4>"', 'col\t  \r\n5'
        ]
        exp_index = pd.Index(
            ['©id##1', '((id))2', "'id_3<>'", '"id#4"', 'i d\r\t\n5'],
            name='id')
        # NOTE(review): the final row has only three values for five
        # columns; it presumably relies on pandas padding short rows with
        # missing values -- confirm before changing its length.
        exp_data = [['ƒoo', '(foo)', '#f o #o', 'fo\ro', np.nan],
                    ["''2''", 'b#r', 'ba\nr', np.nan, np.nan],
                    ['b"ar', 'c\td', '4\r\n2', np.nan, np.nan],
                    ['b__a_z', '<42>', '>42', np.nan, np.nan],
                    ['baz', np.nan, '42']]
        exp_md = Metadata(
            pd.DataFrame(exp_data, index=exp_index, columns=exp_columns))

        self.assertEqual(obs_md, exp_md)
Exemplo n.º 18
0
    def test_simple_expression(self):
        # Each `where` clause below is a single equality test on one column.
        # `get_ids` is expected to return the set of matching IDs, or an
        # empty set when no row matches.
        df = pd.DataFrame(
            {
                'Subject': ['subject-1', 'subject-1', 'subject-2'],
                'SampleType': ['gut', 'tongue', 'gut']
            },
            index=pd.Index(['S1', 'S2', 'S3'], name='id'))
        metadata = Metadata(df)

        # (where clause, expected IDs) pairs, checked in this order.
        cases = [
            ("Subject='subject-1'", {'S1', 'S2'}),
            ("Subject='subject-2'", {'S3'}),
            ("Subject='subject-3'", set()),
            ("SampleType='gut'", {'S1', 'S3'}),
            ("SampleType='tongue'", {'S2'}),
        ]
        for where, expected in cases:
            # subTest reports each failing clause on its own instead of
            # aborting the whole test at the first mismatch.
            with self.subTest(where=where):
                actual = metadata.get_ids(where)
                self.assertEqual(actual, expected)
Exemplo n.º 19
0
    def test_artifacts(self):
        # Metadata constructed directly from a DataFrame is expected to
        # expose an empty tuple as its `artifacts`.
        row_ids = pd.Index(['a', 'b', 'c'], name='id', dtype=object)
        frame = pd.DataFrame(
            {'col1': ['2', '1', '3']}, index=row_ids, dtype=object)

        observed = Metadata(frame).artifacts

        self.assertEqual(observed, ())
Exemplo n.º 20
0
    def test_duplicate_columns(self):
        # Both inputs define a column named 'b'; merging them is expected to
        # fail with a ValueError that names the overlapping column.
        left_frame = pd.DataFrame(
            {'a': [1, 2], 'b': [3, 4]},
            index=pd.Index(['id1', 'id2'], name='id'))
        right_frame = pd.DataFrame(
            {'c': [5, 6], 'b': [7, 8]},
            index=pd.Index(['id1', 'id2'], name='id'))
        left = Metadata(left_frame)
        right = Metadata(right_frame)

        expected_msg = "columns overlap: 'b'"
        with self.assertRaisesRegex(ValueError, expected_msg):
            left.merge(right)
Exemplo n.º 21
0
    def test_disjoint_indices(self):
        # The two inputs have no ID in common; merging them is expected to
        # fail with a ValueError mentioning 'no IDs shared'.
        left_frame = pd.DataFrame(
            {'a': [1, 2, 3], 'b': [4, 5, 6]},
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        right_frame = pd.DataFrame(
            {'c': [7, 8, 9], 'd': [10, 11, 12]},
            index=pd.Index(['X', 'Y', 'Z'], name='id'))
        left = Metadata(left_frame)
        right = Metadata(right_frame)

        expected_msg = 'no IDs shared'
        with self.assertRaisesRegex(ValueError, expected_msg):
            left.merge(right)
Exemplo n.º 22
0
 def test_invalid_columns_dtype(self):
     # An integer column name (42) is expected to be rejected with a
     # TypeError that mentions the offending name.
     expected_msg = 'non-string.*column name.*42'
     with self.assertRaisesRegex(TypeError, expected_msg):
         # The DataFrame is built inside the context manager, as in the
         # original layout of this test.
         cells = {'foo': ['a', 'b'], 42: ['c', 'd']}
         frame = pd.DataFrame(cells, index=pd.Index(['0', '1'], name='id'))
         Metadata(frame)
Exemplo n.º 23
0
    def test_merging_nothing(self):
        # Calling merge() with no arguments is expected to raise a
        # ValueError explaining that there is nothing to merge.
        row_ids = pd.Index(['id1', 'id2', 'id3'], name='id')
        frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=row_ids)
        md = Metadata(frame)

        expected_msg = 'At least one Metadata.*nothing to merge'
        with self.assertRaisesRegex(ValueError, expected_msg):
            md.merge()
Exemplo n.º 24
0
    def test_duplicate_columns(self):
        # Build a frame with two distinct columns, then rename both to 'foo'
        # so the column names collide; construction is expected to fail
        # with a ValueError naming the duplicate.
        row_ids = pd.Index(['a', 'b'], name='id', dtype=object)
        frame = pd.DataFrame({'foo': [1, 2], 'bar': [3, 4]}, index=row_ids)
        frame.columns = ['foo', 'foo']

        expected_msg = "column names must be unique.*'foo'"
        with self.assertRaisesRegex(ValueError, expected_msg):
            Metadata(frame)
Exemplo n.º 25
0
    def test_valid_metadata_str(self):
        # With the index and frame both declared as dtype=str, column 'col1'
        # is expected to be reported as 'categorical'.
        row_ids = pd.Index(['a', 'b', 'c'], name='sample id', dtype=str)
        frame = pd.DataFrame(
            {'col1': ['2', '1', '3']}, index=row_ids, dtype=str)
        metadata = Metadata(frame)

        col1 = metadata.columns['col1']

        self.assertEqual(col1.type, 'categorical')
Exemplo n.º 26
0
    def test_valid_metadata(self):
        # With the index and frame both declared as dtype=object, column
        # 'col1' is expected to be reported as 'categorical'.
        row_ids = pd.Index(['a', 'b', 'c'], name='feature ID', dtype=object)
        frame = pd.DataFrame(
            {'col1': ['2', '1', '3']}, index=row_ids, dtype=object)
        metadata = Metadata(frame)

        col1 = metadata.columns['col1']

        self.assertEqual(col1.type, 'categorical')
Exemplo n.º 27
0
    def test_invalid_where(self):
        # A `where` clause that refers to a name which is not a column of
        # the metadata is expected to raise ValueError.
        cells = {
            'Subject': ['subject-1', 'subject-1', 'subject-2'],
            'SampleType': ['gut', 'tongue', 'gut'],
        }
        row_ids = pd.Index(['S1', 'S2', 'S3'], name='sampleid')
        metadata = Metadata(pd.DataFrame(cells, index=row_ids))

        bad_clause = "not-a-column-name='subject-1'"
        with self.assertRaises(ValueError):
            metadata.get_ids(bad_clause)
Exemplo n.º 28
0
    def test_query_by_id(self):
        # The `where` clause filters on the ID column itself (named 'id')
        # and is expected to select S1 and S2.
        cells = {
            'Subject': ['subject-1', 'subject-1', 'subject-2'],
            'SampleType': ['gut', 'tongue', 'gut'],
        }
        row_ids = pd.Index(['S1', 'S2', 'S3'], name='id')
        metadata = Metadata(pd.DataFrame(cells, index=row_ids))

        selected = metadata.get_ids(where="id='S2' OR id='S1'")

        self.assertEqual(selected, {'S1', 'S2'})
Exemplo n.º 29
0
    def test_single_column(self):
        # Loading 'valid/single-column.tsv' is expected to produce three IDs
        # and one column, 'col1', holding the floats 1.0, 2.0 and 3.0.
        obs_md = Metadata.load(get_data_path('valid/single-column.tsv'))

        exp_frame = pd.DataFrame(
            {'col1': [1.0, 2.0, 3.0]},
            index=pd.Index(['id1', 'id2', 'id3'], name='id'))
        exp_md = Metadata(exp_frame)

        self.assertEqual(obs_md, exp_md)
Exemplo n.º 30
0
    def test_no_columns(self):
        # Loading 'valid/no-columns.tsv' is expected to produce the three
        # IDs 'a', 'b' and 'my-id' with no columns at all.
        obs_md = Metadata.load(get_data_path('valid/no-columns.tsv'))

        exp_frame = pd.DataFrame(
            {}, index=pd.Index(['a', 'b', 'my-id'], name='id'))
        exp_md = Metadata(exp_frame)

        self.assertEqual(obs_md, exp_md)