示例#1
0
    def test_importer_date(self):
        """Date strings imported via the schema importer are stored as UTC
        midnight timestamps, both via the session API and in the raw file."""
        birthday_strings = [
            '1990-01-01', '1980-03-04', '1970-04-05', '1960-04-05',
            '1950-04-05'
        ]
        # Each date is expected to round-trip as midnight UTC.
        expected_timestamps = [
            datetime.strptime(d, "%Y-%m-%d").replace(
                tzinfo=timezone.utc).timestamp() for d in birthday_strings
        ]

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False, {}, {},
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            df = s.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(df['birthday'].data[:].tolist(),
                             expected_timestamps)

        # The same values must be readable straight from the HDF5 layout.
        with h5py.File(bio, 'r') as hf:
            self.assertEqual(
                hf['schema_key']['birthday']['values'][:].tolist(),
                expected_timestamps)
示例#2
0
    def test_read_csv_only_datetime_field(self):
        """read_csv with include=['updated_at'] imports the datetime column
        (as UTC timestamps) plus its derived day field."""
        bio = BytesIO()
        with session.Session() as s:
            df = s.open_dataset(bio, 'w', 'dst').create_dataframe('df')

            parsers.read_csv(self.csv_file_name,
                             df,
                             self.schema_dict,
                             include=['updated_at'])

            raw_timestamps = [
                '2020-05-12 07:00:00', '2020-05-13 01:00:00',
                '2020-05-14 03:00:00', '2020-05-15 03:00:00',
                '2020-05-16 03:00:00'
            ]
            expected_timestamps = [
                datetime.strptime(t, "%Y-%m-%d %H:%M:%S").replace(
                    tzinfo=timezone.utc).timestamp() for t in raw_timestamps
            ]
            expected_days = [
                b'2020-05-12', b'2020-05-13', b'2020-05-14', b'2020-05-15',
                b'2020-05-16'
            ]
            self.assertEqual(df['updated_at'].data[:].tolist(),
                             expected_timestamps)
            self.assertEqual(df['updated_at_day'].data[:].tolist(),
                             expected_days)
    def test_ordered_map_valid_stream(self):
        """ordered_map_valid_stream gathers data values through an ordered
        map, writing 0 wherever the map entry is INVALID_INDEX."""
        s = session.Session()
        bio = BytesIO()
        with h5py.File(bio, 'w') as hf:
            map_values = [0, 0, 0, 1, 1, 3, 3, 3, 3, 5, 5, 5, 5,
                          ops.INVALID_INDEX, ops.INVALID_INDEX, 7, 7, 7]
            map_field = np.asarray(map_values, dtype=np.int64)
            data_field = np.asarray([-1, -2, -3, -4, -5, -6, -8, -9],
                                    dtype=np.int32)

            f_map_field = s.create_numeric(hf, "map_field", "int64")
            f_map_field.data.write(map_field)
            f_data_field = s.create_numeric(hf, "data_field", "int32")
            f_data_field.data.write(data_field)

            result = np.zeros(len(map_field), dtype=np.int32)
            # Chunk length 4 forces several streaming steps.
            ops.ordered_map_valid_stream(f_data_field, f_map_field, result, 4)

            expected = np.asarray([-1, -1, -1, -2, -2, -4, -4, -4, -4,
                                   -6, -6, -6, -6, 0, 0, -9, -9, -9],
                                  dtype=np.int32)
            self.assertTrue(np.array_equal(result, expected))
示例#4
0
    def test_dataframe_init(self):
        """Exercises DataFrame construction, field add/get/contains,
        iteration order, and deletion semantics."""
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'w', 'dst')

            # Construction, including copy-construction from another frame.
            df = dst.create_dataframe('dst')
            self.assertTrue(isinstance(df, dataframe.DataFrame))
            numf = df.create_numeric('numf', 'uint32')
            df2 = dst.create_dataframe('dst2', dataframe=df)
            self.assertTrue(isinstance(df2, dataframe.DataFrame))

            # Membership: the copy-constructed frame carries 'numf' too.
            self.assertTrue('numf' in df)
            self.assertTrue('numf' in df2)
            cat = s.create_categorical(df2, 'cat', 'int8', {'a': 1, 'b': 2})
            self.assertFalse('cat' in df)
            self.assertFalse(df.contains_field(cat))
            df['cat'] = cat
            self.assertTrue('cat' in df)

            # Lookup returns the identical field object, by either API.
            self.assertEqual(id(numf), id(df.get_field('numf')))
            self.assertEqual(id(numf), id(df['numf']))

            # Iteration yields field names in insertion order.
            field_names = iter(df)
            self.assertEqual('numf', next(field_names))
            self.assertEqual('cat', next(field_names))

            # Deletion by name works; deleting via a field object owned by
            # another dataframe must raise.
            del df['numf']
            self.assertFalse('numf' in df)
            with self.assertRaises(ValueError, msg="This field is owned by a different dataframe"):
                df.delete_field(cat)
            self.assertFalse(df.contains_field(cat))
示例#5
0
    def tests_merge_left_compound_key(self):
        """Left merge on a compound (two-column) key.

        Fixes a prior defect: the final assertion compared 'r_id_2' with
        itself (always true); it now verifies that the left and right
        second key columns agree, mirroring the first-key check.
        """
        l_id_1 = np.asarray([0, 0, 0, 0, 1, 1, 1, 1], dtype='int32')
        l_id_2 = np.asarray([0, 1, 2, 3, 0, 1, 2, 3], dtype='int32')
        r_id_1 = np.asarray([0, 1, 0, 1, 0, 1, 0, 1], dtype='int32')
        r_id_2 = np.asarray([0, 0, 1, 1, 2, 2, 3, 3], dtype='int32')
        l_vals = ['00', '01', '02', '03', '10', '11', '12', '13']
        r_vals = ['00', '10', '01', '11', '02', '12', '03', '13']
        expected = ['00', '01', '02', '03', '10', '11', '12', '13']

        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'w', 'dst')
            ldf = dst.create_dataframe('ldf')
            rdf = dst.create_dataframe('rdf')
            ldf.create_numeric('l_id_1', 'int32').data.write(l_id_1)
            ldf.create_numeric('l_id_2', 'int32').data.write(l_id_2)
            ldf.create_indexed_string('l_vals').data.write(l_vals)
            rdf.create_numeric('r_id_1', 'int32').data.write(r_id_1)
            rdf.create_numeric('r_id_2', 'int32').data.write(r_id_2)
            rdf.create_indexed_string('r_vals').data.write(r_vals)
            ddf = dst.create_dataframe('ddf')
            dataframe.merge(ldf, rdf, ddf, ('l_id_1', 'l_id_2'), ('r_id_1', 'r_id_2'), how='left')
            self.assertEqual(expected, ddf['l_vals'].data[:])
            self.assertEqual(expected, ddf['r_vals'].data[:])
            # Matched rows must agree on both components of the key.
            self.assertEqual(ddf['l_id_1'].data[:].tolist(), ddf['r_id_1'].data[:].tolist())
            self.assertEqual(ddf['l_id_2'].data[:].tolist(), ddf['r_id_2'].data[:].tolist())
示例#6
0
    def test_streaming_sort_merge(self):
        """Sort 25-element segments locally, then let streaming_sort_merge
        combine them into globally sorted values with matching argsort
        indices.

        Fixes a prior inconsistency: the target fields were created with
        swapped names and dtypes — the index target was named 'tgt_values'
        (int32) and the value target 'tgt_indices' (int64). They now mirror
        the source fields: indices are int64, values int32.
        """
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'r+', 'dst')
            hf = dst.create_dataframe('hf')
            rs = np.random.RandomState(12345678)
            length = 105
            segment_length = 25
            chunk_length = 8
            src_values = np.arange(length, dtype=np.int32)
            src_values += 1000
            rs.shuffle(src_values)
            src_v_f = s.create_numeric(hf, 'src_values', 'int32')
            src_v_f.data.write(src_values)
            src_i_f = s.create_numeric(hf, 'src_indices', 'int64')
            src_i_f.data.write(np.arange(length, dtype=np.int64))

            # Pre-sort each segment independently; the streaming merge
            # assumes segment-local ordering.
            for c in utils.chunks(length, segment_length):
                sorted_index = np.argsort(src_v_f.data[c[0]:c[1]])
                src_v_f.data[c[0]:c[1]] =\
                    s.apply_index(sorted_index, src_v_f.data[c[0]:c[1]])
                src_i_f.data[c[0]:c[1]] =\
                    s.apply_index(sorted_index, src_i_f.data[c[0]:c[1]])

            tgt_i_f = s.create_numeric(hf, 'tgt_indices', 'int64')
            tgt_v_f = s.create_numeric(hf, 'tgt_values', 'int32')
            ops.streaming_sort_merge(src_i_f, src_v_f, tgt_i_f, tgt_v_f,
                                     segment_length, chunk_length)

            self.assertTrue(
                np.array_equal(tgt_v_f.data[:], np.sort(src_values[:])))
            self.assertTrue(
                np.array_equal(tgt_i_f.data[:], np.argsort(src_values)))
示例#7
0
    def tests_merge_outer(self):
        """Outer merge: matched rows first, then right-only rows whose left
        values come back as empty strings."""
        r_id = np.asarray([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32')
        l_id = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
        r_vals = [
            'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven'
        ]
        l_vals = [
            'bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '',
            'ccc2'
        ]
        expected_left = [
            'bb1', 'bb2', 'ccc1', 'ccc2', '', '', 'dddd1', 'ggggggg1',
            'ffffff1', '', ''
        ]
        expected_right = [
            'two', 'two', 'three', 'three', 'zero', 'zero', 'four', 'seven',
            'six', 'one', 'five'
        ]

        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'w', 'dst')
            left_frame = dst.create_dataframe('ldf')
            right_frame = dst.create_dataframe('rdf')
            left_frame.create_numeric('l_id', 'int32').data.write(l_id)
            left_frame.create_indexed_string('l_vals').data.write(l_vals)
            right_frame.create_numeric('r_id', 'int32').data.write(r_id)
            right_frame.create_indexed_string('r_vals').data.write(r_vals)
            merged = dst.create_dataframe('ddf')
            dataframe.merge(left_frame, right_frame, merged,
                            'l_id', 'r_id', how='outer')
            self.assertEqual(expected_left, merged['l_vals'].data[:])
            self.assertEqual(expected_right, merged['r_vals'].data[:])
示例#8
0
    def test_ordered_merge_inner(self):
        """ordered_merge_inner on raw ndarrays: unique left key, non-unique
        right key, two value fields per side."""
        l_id = np.asarray([b'a', b'b', b'd', b'f', b'g', b'h'], dtype='S1')
        l_vals = np.asarray([100, 200, 400, 600, 700, 800])
        l_vals_2 = np.asarray([10000, 20000, 40000, 60000, 70000, 80000])

        r_id = np.asarray(
            [b'a', b'c', b'c', b'd', b'd', b'e', b'e', b'f', b'f', b'h', b'h'],
            dtype='S1')
        r_vals = np.asarray(
            [1000, 3000, 3001, 4000, 4001, 5000, 5001, 6000, 6001, 8000, 8001])
        r_vals_2 = np.asarray([
            100000, 300001, 300000, 400001, 400000, 500001, 50000, 600001,
            600000, 800001, 800000
        ])

        # The inner join keeps keys present on both sides: a, d, d, f, f, h, h.
        expected = (
            (np.asarray([100, 400, 400, 600, 600, 800, 800], dtype=np.int32),
             np.asarray([10000, 40000, 40000, 60000, 60000, 80000, 80000],
                        dtype=np.int32)),
            (np.asarray([1000, 4000, 4001, 6000, 6001, 8000, 8001],
                        dtype=np.int32),
             np.asarray([100000, 400001, 400000, 600001, 600000, 800001,
                         800000], dtype=np.int32)),
        )

        s = session.Session()
        actual = s.ordered_merge_inner(l_id,
                                       r_id,
                                       left_field_sources=(l_vals, l_vals_2),
                                       right_field_sources=(r_vals, r_vals_2),
                                       left_unique=True,
                                       right_unique=False)
        # Compare every (side, field) result against its expectation.
        for side in range(2):
            for field in range(2):
                self.assertTrue(
                    np.array_equal(actual[side][field], expected[side][field]))
示例#9
0
    def test_dataset_sort_index_groups(self):
        """dataset_sort_index over (a, b) yields a two-key sort order which
        apply_index then applies to each stored field in place."""
        s = session.Session(10)
        vx = np.asarray([b'a', b'b', b'c', b'd', b'e'], dtype='S1')
        va = np.asarray([1, 2, 2, 1, 1])
        vb = np.asarray([5, 4, 3, 2, 1])

        bio = BytesIO()
        with h5py.File(bio, 'w') as hf:
            s.create_fixed_string(hf, 'x', 1).data.write(vx)
            s.create_numeric(hf, 'a', 'int32').data.write(va)
            s.create_numeric(hf, 'b', 'int32').data.write(vb)

            sindex = s.dataset_sort_index((hf['a'], hf['b']),
                                          np.arange(5, dtype='uint32'))

            # Re-order every column by the computed sort index.
            for name in ('a', 'b', 'x'):
                s.get(hf[name]).writeable().data[:] = \
                    s.apply_index(sindex, hf[name])

            self.assertListEqual([1, 1, 1, 2, 2],
                                 s.get(hf['a']).data[:].tolist())
            self.assertListEqual([1, 2, 5, 3, 4],
                                 s.get(hf['b']).data[:].tolist())
            self.assertListEqual([b'e', b'd', b'a', b'c', b'b'],
                                 s.get(hf['x']).data[:].tolist())
示例#10
0
    def test_categorical_field_importer_with_small_chunk_size(self):
        """Categorical import with a chunk size small enough to force
        multiple chunks per column."""
        chunk_row_size = 20  # chunk_row_size * column_count < total_bytes

        expected_values = [1, 3, 2, 0, 4]
        expected_key_names = [b'', b'NW1', b'E1', b'SW1P', b'NW3']
        expected_key_values = [0, 1, 2, 3, 4]

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        True,
                                        None,
                                        None,
                                        self.ts,
                                        chunk_row_size=chunk_row_size)
            df = s.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(df['postcode'].data[:].tolist(),
                             expected_values)
            self.assertEqual(list(df['postcode'].keys.values()),
                             expected_key_names)

        with h5py.File(bio, 'r') as hf:
            postcode = hf['schema_key']['postcode']
            self.assertEqual(postcode['values'][:].tolist(), expected_values)
            # NOTE(review): key_names is deliberately not asserted against
            # the raw file here; only key_values is checked.
            self.assertEqual(postcode['key_values'][:].tolist(),
                             expected_key_values)
示例#11
0
    def test_fixed_string_field_importer(self):
        """Fixed-string import: values round-trip through both the session
        API and the raw h5py file."""
        expected_patient_ids = [b'E1', b'E123', b'E234', b'', b'E456']

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False,
                                        None,
                                        None,
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            df = s.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(df['patient_id'].data[:].tolist(),
                             expected_patient_ids)

        with h5py.File(bio, 'r') as hf:
            self.assertEqual(
                hf['schema_key']['patient_id']['values'][:].tolist(),
                expected_patient_ids)
示例#12
0
    def test_indexed_string_importer_with_small_chunk_size(self):
        """Indexed-string import with a chunk size that forces multiple
        chunks; verifies both field data and the raw index/values layout."""
        chunk_row_size = 20  # chunk_row_size * column_count < total_bytes

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False,
                                        None,
                                        None,
                                        self.ts,
                                        chunk_row_size=chunk_row_size)
            df = s.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(df['name'].data[:],
                             ['a', 'bb', 'ccc', 'dddd', 'eeeee'])

        with h5py.File(bio, 'r') as hf:
            name_group = hf['schema_key']['name']
            indices = name_group['index'][:]
            values = name_group['values'][:]

            self.assertListEqual(list(indices), [0, 1, 3, 6, 10, 15])
            # Spot-check that index pairs delimit the right byte spans.
            self.assertEqual(values[indices[0]:indices[1]].tobytes(), b'a')
            self.assertEqual(values[indices[3]:indices[4]].tobytes(), b'dddd')
示例#13
0
    def test_numeric_importer_in_allow_empty_mode(self):
        """Numeric import in allow-empty mode: a fully-populated column gets
        an all-True validity field, and no validity field is created for
        'weight_change'.

        Fixes a prior defect: the raw-file checks used assertTrue with the
        expected list as the (ignored) msg argument, so they could never
        fail; they are now real equality assertions.
        """
        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False, {}, {},
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            ds = s.get_dataset(self.ds_name)
            df = ds.get_dataframe('schema_key')
            self.assertEqual(df['age_valid'].data[:].tolist(),
                             [True, True, True, True, True])
            self.assertTrue('weight_change_valid' not in df)

        with h5py.File(bio, 'r') as hf:
            self.assertEqual(hf['schema_key']['age']['values'][:].tolist(),
                             [30, 40, 50, 60, 70])
            self.assertEqual(
                hf['schema_key']['age_valid']['values'][:].tolist(),
                [True, True, True, True, True])
            self.assertTrue(
                'weight_change_valid' not in set(hf['schema_key'].keys()))
示例#14
0
    def test_numeric_importer_with_non_numeric_value_in_strict_mode(self):
        """A non-numeric value ('5@') in a strict-mode numeric field must
        abort the import with a descriptive ValueError.

        Also fixes resource handling: the mkstemp descriptor is closed
        immediately (the file is re-opened by name for writing anyway), and
        the temporary file is removed afterwards instead of being leaked.
        """
        TEST_CSV_CONTENTS_EMPTY_VALUE = '\n'.join(
            ('name, id', 'a,     1', 'c,     5@'))

        fd_csv, csv_file_name = tempfile.mkstemp(suffix='.csv')
        os.close(fd_csv)
        try:
            with open(csv_file_name, 'w') as fcsv:
                fcsv.write(TEST_CSV_CONTENTS_EMPTY_VALUE)

            files = {'schema_key': csv_file_name}

            bio = BytesIO()
            with self.assertRaises(ValueError) as context:
                with session.Session() as s:
                    importer.import_with_schema(s,
                                                bio,
                                                self.ds_name,
                                                self.schema,
                                                files,
                                                False, {}, {},
                                                self.ts,
                                                chunk_row_size=self.chunk_row_size)

            self.assertEqual(
                str(context.exception),
                "Field 'id' contains values that cannot be converted to float in 'strict' mode"
            )
        finally:
            os.remove(csv_file_name)
示例#15
0
    def test_ordered_map_valid_stream(self):
        """ordered_map_valid_stream over session dataframe fields: gathers
        data through the map, writing 0 for INVALID_INDEX entries."""
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'r+', 'dst')
            hf = dst.create_dataframe('hf')
            map_values = [0, 0, 0, 1, 1, 3, 3, 3, 3, 5, 5, 5, 5,
                          ops.INVALID_INDEX, ops.INVALID_INDEX, 7, 7, 7]
            map_field = np.asarray(map_values, dtype=np.int64)
            data_field = np.asarray([-1, -2, -3, -4, -5, -6, -8, -9],
                                    dtype=np.int32)
            f_map_field = s.create_numeric(hf, "map_field", "int64")
            f_map_field.data.write(map_field)
            f_data_field = s.create_numeric(hf, "data_field", "int32")
            f_data_field.data.write(data_field)

            result = np.zeros(len(map_field), dtype=np.int32)
            # Chunk length 4 forces several streaming steps.
            ops.ordered_map_valid_stream(f_data_field, f_map_field, result, 4)
            expected = np.asarray([-1, -1, -1, -2, -2, -4, -4, -4, -4,
                                   -6, -6, -6, -6, 0, 0, -9, -9, -9],
                                  dtype=np.int32)
            self.assertTrue(np.array_equal(result, expected))
示例#16
0
    def test_apply_spans_concat_field(self):
        """apply_spans_concat joins each span's string values with commas,
        double-quoting any element that itself contains a comma."""
        # Three spans over vals: rows 0-4, rows 5-7 and rows 8-14.
        idx = np.asarray([0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2],
                         dtype=np.int32)
        vals = [
            'a', "'b'", 'what', 'some, information', 'x', '', 'foo', 'flop',
            "'dun'", "'mun'", "'race, track?'", '', "for, too", 'z', 'now!'
        ]

        # vals = ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b']
        bio = BytesIO()
        with session.Session() as s:
            spans = s.get_spans(idx)
            # results = s.apply_spans_concat(spans, vals)
            # self.assertListEqual([0, 8, 6, 9], results.tolist())

            ds = s.open_dataset(bio, "w", "ds")
            # s.apply_spans_concat(spans, vals, dest=s.create_indexed_string(ds, 'result'))
            # self.assertListEqual([0, 8, 6, 9], s.get(ds['result']).data[:].tolist())

            s.create_indexed_string(ds, 'vals').data.write(vals)
            s.apply_spans_concat(spans,
                                 s.get(ds['vals']),
                                 dest=s.create_indexed_string(ds, 'result'))
            # Comma-containing elements are wrapped in double quotes and
            # empty strings are dropped from the joined output.
            self.assertListEqual([
                'a,\'b\',what,"some, information",x', 'foo,flop',
                '\'dun\',\'mun\',"\'race, track?\'","for, too",z,now!'
            ],
                                 s.get(ds['result']).data[:])
示例#17
0
 def test_ordered_inner_map_left_unique_streamed(self):
     """ordered_inner_map_left_unique_streamed over two ordered id fields:
     emits, for every matching id pair, the row index on each side."""
     bio = BytesIO()
     with session.Session() as s:
         dst = s.open_dataset(bio, 'r+', 'dst')
         frame = dst.create_dataframe('hf')
         # Left ids are unique; right ids contain duplicates.
         left_ids = np.asarray(
             [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18],
             dtype=np.int64)
         right_ids = np.asarray(
             [0, 1, 1, 2, 4, 5, 5, 6, 8, 9, 9, 10, 12, 13, 13, 14, 16, 17,
              17, 18],
             dtype=np.int64)
         left_field = s.create_numeric(frame, 'a_ids', 'int64')
         left_field.data.write(left_ids)
         right_field = s.create_numeric(frame, 'b_ids', 'int64')
         right_field.data.write(right_ids)
         left_out = s.create_numeric(frame, 'left_result', 'int64')
         right_out = s.create_numeric(frame, 'right_result', 'int64')
         ops.ordered_inner_map_left_unique_streamed(left_field, right_field,
                                                    left_out, right_out)
         # Indices into the left ids, repeated where a right id repeats.
         left_expected = np.asarray(
             [0, 1, 1, 2, 4, 4, 5, 7, 8, 10, 11, 11, 13, 14, 14, 15],
             dtype=np.int32)
         self.assertTrue(np.array_equal(left_out.data[:], left_expected))
         # Corresponding indices into the right ids.
         right_expected = np.asarray(
             [0, 1, 2, 3, 5, 6, 7, 8, 11, 12, 13, 14, 16, 17, 18, 19],
             dtype=np.int32)
         self.assertTrue(np.array_equal(right_out.data[:], right_expected))
示例#18
0
    def test_merge_left_dataset(self):
        """merge_left maps right-hand values ('p/val') onto the left key
        order ('a/pid') across two HDF5 files; pids absent from 'p' (700)
        produce 0 in the output.

        Fixes a prior defect: the final assertion called `.data[:]` on a
        numpy array (relying on ndarray.data being a memoryview); it now
        compares the array's tolist() directly. The unused local `a_val`
        was also removed.
        """
        bio1 = BytesIO()
        with h5py.File(bio1, 'w') as src:
            s = session.Session()
            p_id = np.array([100, 200, 300, 400, 500, 600, 800, 900])
            p_val = np.array([-1, -2, -3, -4, -5, -6, -8, -9])
            a_pid = np.array([
                100, 100, 100, 200, 200, 400, 400, 400, 400, 600, 600, 600,
                700, 700, 900, 900, 900
            ])
            src.create_group('p')
            s.create_numeric(src['p'], 'id', 'int32').data.write(p_id)
            s.create_numeric(src['p'], 'val', 'int32').data.write(p_val)
            src.create_group('a')
            s.create_numeric(src['a'], 'pid', 'int32').data.write(a_pid)

        bio2 = BytesIO()
        with h5py.File(bio1, 'r') as src:
            with h5py.File(bio2, 'w') as snk:
                s.merge_left(s.get(src['a']['pid']),
                             s.get(src['p']['id']),
                             right_fields=(s.get(src['p']['val']), ),
                             right_writers=(s.create_numeric(
                                 snk, 'val', 'int32'), ))
                expected = [
                    -1, -1, -1, -2, -2, -4, -4, -4, -4, -6, -6, -6, 0, 0, -9,
                    -9, -9
                ]
                actual = s.get(snk['val']).data[:]
                self.assertListEqual(expected, actual.tolist())
示例#19
0
    def tests_merge_right(self):
        """Right merge: output ordered by the right key; left values come
        back empty where no left row matches, with 'valid_l' marking the
        rows that do."""
        r_id = np.asarray([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32')
        l_id = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
        l_vals = [
            'bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '',
            'ccc2'
        ]
        expected = [
            '', '', '', 'bb1', 'bb2', 'ccc1', 'ccc2', 'dddd1', '', 'ffffff1',
            'ggggggg1'
        ]

        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'w', 'dst')
            left_frame = dst.create_dataframe('ldf')
            right_frame = dst.create_dataframe('rdf')
            left_frame.create_numeric('l_id', 'int32').data.write(l_id)
            left_frame.create_indexed_string('l_vals').data.write(l_vals)
            right_frame.create_numeric('r_id', 'int32').data.write(r_id)
            merged = dst.create_dataframe('ddf')
            dataframe.merge(left_frame, right_frame, merged,
                            'l_id', 'r_id', how='right')
            self.assertEqual(expected, merged['l_vals'].data[:])
            # Wherever the left row is valid, the two key columns agree.
            keys_agree = merged['l_id'].data[:] == merged['r_id'].data[:]
            left_invalid = np.logical_not(merged['valid_l'].data[:])
            self.assertTrue(np.all(keys_agree | left_invalid))
示例#20
0
    def test_indexed_string_importer(self):
        """IndexedStringImporter round-trip, including multi-byte UTF-8:
        'ä' encodes as the two bytes 195, 164."""
        s = session.Session()
        bio = BytesIO()
        with h5py.File(bio, 'w') as hf:
            values = [
                '', '', '1.0.0', '', '1.0.ä', '1.0.0', '1.0.0', '1.0.0', '',
                '', '1.0.0', '1.0.0', '', '1.0.0', '1.0.ä', '1.0.0', ''
            ]
            im = fields.IndexedStringImporter(s, hf, 'x')
            im.write(values)
            f = s.get(hf['x'])

            # Data round-trips unchanged.
            self.assertListEqual(values, f.data[:])

            # Index offsets: empty strings repeat the previous offset;
            # '1.0.0' adds 5 bytes, '1.0.ä' adds 6 (two-byte 'ä').
            expected_indices = [
                0, 0, 0, 5, 5, 11, 16, 21, 26, 26, 26, 31, 36, 36, 41, 47, 52,
                52
            ]
            self.assertListEqual(expected_indices, f.indices[:].tolist())

            # Raw UTF-8 byte stream backing the field.
            expected_bytes = [
                49, 46, 48, 46, 48, 49, 46, 48, 46, 195, 164, 49, 46, 48, 46,
                48, 49, 46, 48, 46, 48, 49, 46, 48, 46, 48, 49, 46, 48, 46, 48,
                49, 46, 48, 46, 48, 49, 46, 48, 46, 48, 49, 46, 48, 46, 195,
                164, 49, 46, 48, 46, 48
            ]
            self.assertListEqual(expected_bytes, f.values[:].tolist())
示例#21
0
    def test_dataframe_create_indexed_string(self):
        """Indexed string field: bulk write, read-back, clear and rewrite;
        checks data, index offsets and raw byte values."""
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'r+', 'dst')
            hf = dst.create_dataframe('dst')
            np.random.seed(12345678)
            lengths = np.random.randint(low=0, high=4, size=200000)
            svalues = ['x' * int(n) for n in lengths]
            a = hf.create_indexed_string('a', 8)
            a.data.write(svalues)

            distinct = np.unique(a.data[:])
            self.assertListEqual(['', 'x', 'xx', 'xxx'], distinct.tolist())

            # Rewrite every entry with a 'y' suffix after clearing.
            suffixed = [v + 'y' for v in a.data[:]]
            a.data.clear()
            a.data.write(suffixed)

            self.assertListEqual(
                ['xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'],
                suffixed[:10])
            self.assertListEqual([0, 4, 7, 11, 12, 14, 15, 19, 23, 25],
                                 a.indices[:10].tolist())
            # 120 == ord('x'), 121 == ord('y').
            self.assertListEqual(
                [120, 120, 120, 121, 120, 120, 121, 120, 120, 120],
                a.values[:10].tolist())
            self.assertListEqual(
                ['xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy', 'xy', 'y'],
                a.data[:10])
示例#22
0
    def test_read_csv_with_fields_out_of_order(self):
        """read_csv with an include list whose order differs from the CSV
        column order; checks values and validity fields per column."""
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'w', 'dst')
            df = dst.create_dataframe('df')

            parsers.read_csv(self.csv_file_name,
                             df,
                             self.schema_dict,
                             include=['weight_change', 'height', 'BMI'])

            # height: float32 with two invalid entries.
            expected_heights = list(
                np.asarray([170.9, 180.2, 160.5, 160.5, 161.0],
                           dtype=np.float32))
            self.assertEqual(list(df['height'].data[:]), expected_heights)
            self.assertEqual(list(df['height_valid_test'].data[:]),
                             [True, True, False, False, True])

            # weight_change: invalid entry filled with the float32 minimum;
            # no separate validity field is created for it.
            expected_weight_changes = list(
                np.asarray(
                    [21.2,
                     utils.get_min_max('float32')[0], -17.5, -17.5, 2.5],
                    dtype=np.float32))
            self.assertEqual(list(df['weight_change'].data[:]),
                             expected_weight_changes)
            self.assertTrue('weight_change_valid' not in df)

            # BMI: float64, all entries valid.
            expected_bmis = list(
                np.asarray([20.5, 25.4, 27.2, 27.2, 20.2], dtype=np.float64))
            self.assertEqual(list(df['BMI'].data[:]), expected_bmis)
            self.assertEqual(list(df['BMI_valid'].data[:]),
                             [True, True, True, True, True])
示例#23
0
    def test_dataframe_create_mem_numeric(self):
        """Arithmetic and bitwise operators on numeric fields (and numpy
        arrays) produce new fields assignable into the dataframe."""
        bio = BytesIO()
        with session.Session() as s:
            dst = s.open_dataset(bio, 'r+', 'dst')
            df = dst.create_dataframe('dst')
            num = df.create_numeric('num', 'uint32')
            num.data.write([1, 2, 3, 4])
            self.assertEqual([1, 2, 3, 4], num.data[:].tolist())
            num2 = df.create_numeric('num2', 'uint32')
            num2.data.write([1, 2, 3, 4])

            operand = np.array([1, 2, 3, 4])

            df['num3'] = num + num2
            df['num4'] = num - operand
            df['num5'] = num * operand
            df['num6'] = df['num5'] / operand
            df['num7'] = df['num'] & df['num2']
            df['num8'] = df['num'] | df['num2']
            df['num9'] = df['num'] ^ df['num2']
            df['num10'] = df['num'] % df['num2']

            expectations = {
                'num3': [2, 4, 6, 8],    # addition
                'num4': [0, 0, 0, 0],    # subtraction
                'num5': [1, 4, 9, 16],   # multiplication
                'num6': [1, 2, 3, 4],    # division
                'num7': [1, 2, 3, 4],    # AND of equal operands
                'num8': [1, 2, 3, 4],    # OR of equal operands
                'num9': [0, 0, 0, 0],    # XOR of equal operands
                'num10': [0, 0, 0, 0],   # modulo
            }
            for name, expected in expectations.items():
                self.assertEqual(expected, df[name].data[:].tolist())
示例#24
0
 def test_ordered_map_to_right_left_unique_streamed(self):
     """For each left id, the index of its first occurrence among the right
     ids, or INVALID_INDEX when absent.

     NOTE(review): the method name says 'left_unique' but the call is to
     ordered_map_to_right_right_unique_streamed — presumably a naming slip;
     confirm against the ops module before renaming.
     """
     s = session.Session()
     bio = BytesIO()
     with h5py.File(bio, 'w') as hf:
         left_ids = np.asarray(
             [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18],
             dtype=np.int64)
         right_ids = np.asarray(
             [0, 1, 1, 2, 4, 5, 5, 6, 8, 9, 9, 10, 12, 13, 13, 14, 16, 17,
              17, 18],
             dtype=np.int64)
         left_field = s.create_numeric(hf, 'a_ids', 'int64')
         left_field.data.write(left_ids)
         right_field = s.create_numeric(hf, 'b_ids', 'int64')
         right_field.data.write(right_ids)
         mapping = s.create_numeric(hf, 'left_result', 'int64')
         ops.ordered_map_to_right_right_unique_streamed(
             left_field, right_field, mapping)
         # Unmatched left ids (3, 7, 11, 15) map to INVALID_INDEX.
         expected = np.asarray([
             0, 1, 3, ops.INVALID_INDEX, 5, 7, ops.INVALID_INDEX, 8, 11,
             ops.INVALID_INDEX, 12, 13, ops.INVALID_INDEX, 16, 17, 19
         ])
         self.assertTrue(np.array_equal(mapping.data[:], expected))
示例#25
0
    def test_match_assessment(self):
        """match_assessment pairs test rows with assessment rows by patient
        id within a day window; the optional flag tightens the match."""
        bio = BytesIO()
        with esess.Session() as s:
            src = s.open_dataset(bio, 'w', 'src')
            # tests dataframe: one row per patient, dates ascending
            tests = src.create_dataframe('tests')
            tests.create_numeric('patient_id', 'int32').data.write(
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
            tests.create_timestamp('created_at').data.write(
                [datetime(2020, 1, day).timestamp() for day in range(5, 15)])
            tests.create_numeric('result', 'int32').data.write(
                [3, 4, 3, 4, 3, 4, 3, 4, 3, 4])

            # assessment dataframe: same patients, dates descending
            asmt = src.create_dataframe('assessments')
            asmt.create_numeric('patient_id', 'int32').data.write(
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
            asmt.create_timestamp('created_at').data.write(
                [datetime(2020, 1, day).timestamp()
                 for day in reversed(range(7, 17))])

            result = src.create_dataframe('result')
            match_assessment(tests, asmt, result, 5)
            self.assertListEqual(result['patient_id_l'].data[:].tolist(),
                                 [7, 8, 9])
            result2 = src.create_dataframe('result2')
            match_assessment(tests, asmt, result2, 5, True)
            self.assertListEqual(result2['patient_id_l'].data[:].tolist(),
                                 [8])
示例#26
0
    def test_importer_with_arg_include(self):
        """Only the fields listed under 'include' for a schema key should be
        imported; verify both through the session API and the raw hdf5 file."""
        include = {'schema_key': ['id', 'name']}
        exclude = {}
        expected_ids = [1, 2, 3, 4, 5]
        expected_names = ['a', 'bb', 'ccc', 'dddd', 'eeeee']

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s, bio, self.ds_name, self.schema,
                                        self.files, False, include, exclude,
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)

            df = s.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(df['id'].data[:].tolist(), expected_ids)
            self.assertEqual(df['name'].data[:], expected_names)

        # the backing hdf5 file holds the included fields' raw values/index
        with h5py.File(bio, 'r') as hf:
            self.assertListEqual(list(hf.keys()), ['schema_key'])
            self.assertTrue(set(hf['schema_key'].keys()) >= {'id', 'name'})
            self.assertEqual(hf['schema_key']['id']['values'][:].tolist(),
                             expected_ids)
            self.assertEqual(hf['schema_key']['name']['index'][:].tolist(),
                             [0, 1, 3, 6, 10, 15])
示例#27
0
    def test_dataframe_create_with_dataframe(self):
        """create_dataframe(..., dataframe=other) copies every field (data and
        metadata); mutating the copy must leave the original untouched."""
        idx_strs1 = ['a', 'bb', 'ccc', 'dddd']
        idx_strs2 = ['eeee', 'fff', 'gg', 'h']
        fix_strs1 = [v.encode() for v in idx_strs1]
        fix_strs2 = [v.encode() for v in idx_strs2]
        cats1 = np.array([1, 2, 2, 1], dtype=np.int8)
        cats2 = np.array([2, 1, 1, 2], dtype=np.int8)
        nums1 = np.array([1, 2, 3, 4], dtype=np.int32)
        nums2 = np.array([5, 6, 7, 8], dtype=np.int32)
        from datetime import datetime as D
        stamps1 = np.array([D(2020, 1, d).timestamp() for d in (1, 2, 3, 4)])
        stamps2 = np.array([D(2021, 1, d).timestamp() for d in (1, 2, 3, 4)])

        bio = BytesIO()
        with session.Session() as s:
            ds = s.open_dataset(bio, 'w', 'ds')
            df1 = ds.create_dataframe('df1')
            df1.create_indexed_string('is_foo').data.write(idx_strs1)
            df1.create_fixed_string('fs_foo', 4).data.write(fix_strs1)
            df1.create_categorical('c_foo', 'int8',
                                   {b'a': 1, b'b': 2}).data.write(cats1)
            df1.create_numeric('n_foo', 'uint32').data.write(nums1)
            df1.create_timestamp('t_foo').data.write(stamps1)

            # df2 starts as a field-for-field copy of df1
            df2 = ds.create_dataframe('df2', dataframe=df1)

            # indexed string: rewrite the copy, original stays intact
            self.assertListEqual(idx_strs1, df1['is_foo'].data[:])
            self.assertListEqual(idx_strs1, df2['is_foo'].data[:])
            df2['is_foo'].data.clear()
            df2['is_foo'].data.write(idx_strs2)
            self.assertListEqual(idx_strs1, df1['is_foo'].data[:])
            self.assertListEqual(idx_strs2, df2['is_foo'].data[:])

            # fixed string
            self.assertListEqual(fix_strs1, df1['fs_foo'].data[:].tolist())
            self.assertListEqual(fix_strs1, df2['fs_foo'].data[:].tolist())
            df2['fs_foo'].data[:] = fix_strs2
            self.assertListEqual(fix_strs1, df1['fs_foo'].data[:].tolist())
            self.assertListEqual(fix_strs2, df2['fs_foo'].data[:].tolist())

            # categorical: both the values and the key mapping are copied
            self.assertListEqual(cats1.tolist(), df1['c_foo'].data[:].tolist())
            self.assertListEqual(cats1.tolist(), df2['c_foo'].data[:].tolist())
            df2['c_foo'].data[:] = cats2
            self.assertListEqual(cats1.tolist(), df1['c_foo'].data[:].tolist())
            self.assertListEqual(cats2.tolist(), df2['c_foo'].data[:].tolist())
            self.assertDictEqual({1: b'a', 2: b'b'}, df1['c_foo'].keys)
            self.assertDictEqual({1: b'a', 2: b'b'}, df2['c_foo'].keys)

            # numeric
            self.assertListEqual(nums1.tolist(), df1['n_foo'].data[:].tolist())
            self.assertListEqual(nums1.tolist(), df2['n_foo'].data[:].tolist())
            df2['n_foo'].data[:] = np.array(nums2, dtype=np.uint32)
            self.assertListEqual(nums1.tolist(), df1['n_foo'].data[:].tolist())
            self.assertListEqual(nums2.tolist(), df2['n_foo'].data[:].tolist())

            # timestamp
            self.assertListEqual(stamps1.tolist(), df1['t_foo'].data[:].tolist())
            self.assertListEqual(stamps1.tolist(), df2['t_foo'].data[:].tolist())
            df2['t_foo'].data[:] = np.array(stamps2, dtype=np.float64)
            self.assertListEqual(stamps1.tolist(), df1['t_foo'].data[:].tolist())
            self.assertListEqual(stamps2.tolist(), df2['t_foo'].data[:].tolist())
示例#28
0
def join_tests():
    """Join test records onto the previously built assessment/vaccine merge.

    Inner-joins the source ``tests`` dataframe onto ``asmt_v`` (built by
    asmt_merge_vacc) on patient id, then filters the merged rows by the
    relationship between vaccine date and test date.

    NOTE(review): the original docstring said "filter out subjects [that
    have] test records within 10 days after vaccine", but ``apply_filter``
    KEEPS the rows the filter selects — as written, the rows retained are
    exactly the tests taken within 10 days after the vaccine. Confirm which
    behaviour is intended.
    """
    with sess.Session() as s:
        # open related datasets
        src = s.open_dataset(ADATA, 'r', 'asmt')
        tests_src = src['tests']
        dst = s.open_dataset(DSTDATA, 'r+', 'dst')
        vacc = dst['asmt_v']
        tests_m = dst.create_dataframe('tests_m')
        dataframe.merge(vacc,
                        tests_src,
                        tests_m,
                        'patient_id_l',
                        'patient_id',
                        how='inner')

        # date_taken_specific_l is the vaccine date, date_taken_specific_r
        # the test date; keep rows where the test is strictly after the
        # vaccine ...
        test_filter = tests_m['date_taken_specific_l'] < tests_m[
            'date_taken_specific_r']  # test after vaccine
        # ... and no more than 10 days (expressed in seconds) after it
        test_filter &= tests_m['date_taken_specific_l'] > (
            tests_m['date_taken_specific_r'] - 3600 * 24 * 10)
        tests_m.apply_filter(test_filter)
示例#29
0
    def test_dataset_init_with_data(self):
        """Opening a dataset over pre-existing hdf5 content exposes its
        groups as DataFrames, and supports copy / delete / assignment of
        dataframes between datasets."""
        bio = BytesIO()
        with session.Session() as s:
            # write a group with one numeric field straight through h5py
            h5file = h5py.File(bio, 'w')
            hgrp1 = h5file.create_group("grp1")
            num1 = s.create_numeric(hgrp1, 'num1', 'uint32')
            num1.data.write(np.array([0, 1, 2, 3, 4]))
            h5file.close()

            # reopen through the session API: the group appears as a DataFrame
            dst = s.open_dataset(bio, 'r+', 'dst')
            self.assertTrue(isinstance(dst['grp1'], DataFrame))
            self.assertEqual([0, 1, 2, 3, 4],
                             s.get(dst['grp1']['num1']).data[:].tolist())

            # build a second dataset holding one fixed-string dataframe
            bio2 = BytesIO()
            ds2 = s.open_dataset(bio2, 'w', 'ds2')
            df2 = ds2.create_dataframe('df2')
            df2.create_fixed_string('fs', 1).data.write([b'a', b'b', b'c', b'd'])

            # copy the dataframe into the first dataset
            dst.copy(df2, 'df2')
            self.assertTrue(isinstance(dst['df2'], DataFrame))
            self.assertEqual([b'a', b'b', b'c', b'd'],
                             dst['df2']['fs'].data[:].tolist())

            # deletion removes it from the wrapper and the backing file alike
            del dst['df2']
            self.assertEqual(1, len(dst.keys()))
            self.assertEqual(1, len(dst._file.keys()))

            # item assignment also copies a dataframe across datasets
            dst['df3'] = df2
            self.assertTrue(isinstance(dst['df3'], DataFrame))
            self.assertEqual([b'a', b'b', b'c', b'd'],
                             dst['df3']['fs'].data[:].tolist())
示例#30
0
def asmt_merge_vacc():
    """Merge the assessment dataframe with the vaccine dataframe and filter
    the joined rows by symptom status around the vaccine date.

    Pipeline: (1) keep only vaccine doses of brand 2 or 3; (2) inner-join
    assessments to those doses on patient id; (3) apply two successive
    filters to the joined frame — first keeping rows with no symptoms and an
    assessment in the 10 days before the vaccine, then, on that result,
    keeping rows with symptoms within 10 days after the vaccine.

    NOTE(review): the original docstring said "filter out subject [that] has
    a healthy assessment before vaccine date", but ``apply_filter`` KEEPS the
    selected rows — confirm the intended direction of each filter.
    """
    with sess.Session() as s:
        # open related datasets
        src = s.open_dataset(ADATA, 'r', 'asmt')
        asmt = src['assessments']
        vacc = s.open_dataset(VDATA, 'r', 'vacc')
        dst = s.open_dataset(DSTDATA, 'w', 'dst')

        # keep only vaccine doses whose brand code is 2 or 3
        vbrand_filter = (vacc['vaccine_doses']['brand'].data[:] == 2) | \
                        (vacc['vaccine_doses']['brand'].data[:] == 3)
        dvacc = dst.create_dataframe('vacc')
        vacc['vaccine_doses'].apply_filter(vbrand_filter, ddf=dvacc)

        # join asmt with vaccine using patient_id, write result to asmt_v
        asmt_v = dst.create_dataframe('asmt_v')
        dataframe.merge(asmt,
                        dvacc,
                        asmt_v,
                        'patient_id',
                        'patient_id',
                        how='inner')

        # keep rows with a symptom-free assessment within 10 days before the
        # vaccine date

        # columns coded so that a value > 1 means the symptom is present
        symp_list = [
            'persistent_cough', 'fever', 'fatigue', 'delirium',
            'shortness_of_breath', 'diarrhoea', 'abdominal_pain', 'chest_pain',
            'hoarse_voice', 'skipped_meals', 'loss_of_smell', 'headache',
            'sore_throat', 'chills_or_shivers', 'eye_soreness', 'nausea',
            'blisters_on_feet', 'unusual_muscle_pains', 'runny_nose',
            'red_welts_on_face_or_lips', 'dizzy_light_headed',
            'swollen_glands', 'sneezing', 'skin_burning', 'earache',
            'altered_smell', 'brain_fog', 'irregular_heartbeat'
        ]
        # seed with 'persistent_cough'; note symp_list also contains it, so
        # the loop ORs it in a second time (harmless, just redundant)
        symp_filter = asmt_v['persistent_cough'].data[:] > 1  # has symptom
        for symptom1 in symp_list:
            symp_filter |= asmt_v[symptom1].data[:] > 1  # has symptom
        symp_filter = ~symp_filter  # has no symptom
        symp_filter &= asmt_v['date_taken_specific'].data[:] > asmt_v[
            'updated_at_l'].data[:]  # asmt before vaccine
        symp_filter &= asmt_v['updated_at_l'].data[:] > asmt_v[
            'date_taken_specific'].data[:] - 3600 * 24 * 10  # 10 days
        asmt_v.apply_filter(symp_filter)

        # on the already-filtered rows, additionally require a symptomatic
        # assessment after the vaccine
        yes_symp_filter = asmt_v['persistent_cough'].data[:] > 1
        for symptom1 in symp_list:
            yes_symp_filter |= asmt_v[symptom1].data[:] > 1  # has symptom
        yes_symp_filter &= asmt_v['date_taken_specific'].data[:] < asmt_v[
            'updated_at_l'].data[:]  # assessment after vaccine
        yes_symp_filter &= asmt_v[
            'date_taken_specific'].data[:] + 3600 * 24 * 10 > asmt_v[
                'updated_at_l'].data[:]  # assessment within 10 days of vaccine
        asmt_v.apply_filter(yes_symp_filter)
        print("finish asmt join vaccine.")