예제 #1
0
    def test_categorical_field_importer_with_small_chunk_size(self):
        """Import the categorical 'postcode' field with a small chunk size.

        The imported codes and key names are checked through the session
        dataframe; the codes and key values are then checked again by
        opening the output buffer directly with h5py.
        """
        # chunk_row_size * column_count < total_bytes
        small_chunk = 20

        postcode_codes = [1, 3, 2, 0, 4]
        key_names = [b'', b'NW1', b'E1', b'SW1P', b'NW3']
        key_values = [0, 1, 2, 3, 4]

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                True, None, None, self.ts,
                chunk_row_size=small_chunk)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')

            imported_codes = frame['postcode'].data[:].tolist()
            self.assertEqual(imported_codes, postcode_codes)

            imported_names = list(frame['postcode'].keys.values())
            self.assertEqual(imported_names, key_names)

        with h5py.File(buffer, 'r') as h5:
            postcode_group = h5['schema_key']['postcode']
            # Only 'values' and 'key_values' are compared against the raw
            # file content; 'key_names' is not asserted here.
            self.assertEqual(postcode_group['values'][:].tolist(),
                             postcode_codes)
            self.assertEqual(postcode_group['key_values'][:].tolist(),
                             key_values)
예제 #2
0
    def test_fixed_string_field_importer(self):
        """Import the fixed-string 'patient_id' field and verify its values.

        The column is checked through the session dataframe and again by
        reading the 'values' dataset from the output buffer with h5py.
        """
        patient_ids = [b'E1', b'E123', b'E234', b'', b'E456']

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, None, None, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            imported_ids = frame['patient_id'].data[:].tolist()
            self.assertEqual(imported_ids, patient_ids)

        with h5py.File(buffer, 'r') as h5:
            stored_ids = h5['schema_key']['patient_id']['values'][:].tolist()
            self.assertEqual(stored_ids, patient_ids)
예제 #3
0
    def test_importer_date(self):
        """Import the 'birthday' date field and compare it as UTC timestamps.

        Each expected date string is parsed, tagged as UTC and converted to a
        POSIX timestamp; the result must equal the imported values both in
        the session dataframe and in the HDF5 output.
        """
        birthdays = [
            '1990-01-01', '1980-03-04', '1970-04-05', '1960-04-05',
            '1950-04-05'
        ]
        # Expected timestamps are identical for both checks, so build once.
        birthday_timestamps = []
        for day in birthdays:
            parsed = datetime.strptime(day, "%Y-%m-%d")
            birthday_timestamps.append(
                parsed.replace(tzinfo=timezone.utc).timestamp())

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, {}, {}, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(frame['birthday'].data[:].tolist(),
                             birthday_timestamps)

        with h5py.File(buffer, 'r') as h5:
            stored = h5['schema_key']['birthday']['values'][:].tolist()
            self.assertEqual(stored, birthday_timestamps)
예제 #4
0
    def test_indexed_string_importer_with_small_chunk_size(self):
        """Import the indexed-string 'name' field with a small chunk size.

        The strings are checked through the session dataframe; the raw
        'index' and 'values' datasets are then read with h5py and the index
        offsets are used to slice individual strings out of 'values'.
        """
        # chunk_row_size * column_count < total_bytes
        small_chunk = 20

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, None, None, self.ts,
                chunk_row_size=small_chunk)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            expected_names = ['a', 'bb', 'ccc', 'dddd', 'eeeee']
            self.assertEqual(frame['name'].data[:], expected_names)

        with h5py.File(buffer, 'r') as h5:
            name_group = h5['schema_key']['name']
            offsets = name_group['index'][:]
            raw_bytes = name_group['values'][:]

            self.assertListEqual(list(offsets), [0, 1, 3, 6, 10, 15])
            # Consecutive offsets delimit one string inside the byte array.
            first = raw_bytes[offsets[0]:offsets[1]].tobytes()
            fourth = raw_bytes[offsets[3]:offsets[4]].tobytes()
            self.assertEqual(first, b'a')
            self.assertEqual(fourth, b'dddd')
예제 #5
0
    def test_numeric_importer_in_allow_empty_mode(self):
        """Check the 'age' field and its validity flags after import.

        Verifies that 'age_valid' is all True, that no
        'weight_change_valid' field is produced, and that the HDF5 output
        holds the expected 'age' and 'age_valid' values.
        """
        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False, {}, {},
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            ds = s.get_dataset(self.ds_name)
            df = ds.get_dataframe('schema_key')
            self.assertEqual(df['age_valid'].data[:].tolist(),
                             [True, True, True, True, True])
            self.assertNotIn('weight_change_valid', df)

        with h5py.File(bio, 'r') as hf:
            # assertTrue(actual, expected) would treat `expected` as the
            # failure message and pass for any non-empty list, so the values
            # are compared with assertEqual instead.
            self.assertEqual(hf['schema_key']['age']['values'][:].tolist(),
                             [30, 40, 50, 60, 70])
            self.assertEqual(
                hf['schema_key']['age_valid']['values'][:].tolist(),
                [True, True, True, True, True])
            self.assertNotIn('weight_change_valid',
                             set(hf['schema_key'].keys()))
예제 #6
0
    def test_numeric_importer_with_non_numeric_value_in_strict_mode(self):
        """Expect a ValueError when the 'id' column holds a non-numeric value.

        A temporary CSV whose 'id' column contains '5@' is written and
        imported; the import must raise ValueError with the exact message
        asserted below. The temporary file is always cleaned up.
        """
        csv_contents = '\n'.join(
            ('name, id', 'a,     1', 'c,     5@'))

        fd_csv, csv_file_name = tempfile.mkstemp(suffix='.csv')
        try:
            # Wrap the descriptor returned by mkstemp so it is closed as soon
            # as the content is written, even if a later assertion fails.
            with os.fdopen(fd_csv, 'w') as fcsv:
                fcsv.write(csv_contents)

            files = {'schema_key': csv_file_name}

            bio = BytesIO()
            with self.assertRaises(ValueError) as context:
                with session.Session() as s:
                    importer.import_with_schema(
                        s,
                        bio,
                        self.ds_name,
                        self.schema,
                        files,
                        False, {}, {},
                        self.ts,
                        chunk_row_size=self.chunk_row_size)

            self.assertEqual(
                str(context.exception),
                "Field 'id' contains values that cannot be converted to float in 'strict' mode"
            )
        finally:
            # mkstemp never deletes the file itself.
            os.remove(csv_file_name)
예제 #7
0
    def test_importer_with_arg_include(self):
        """Import with an include mapping for 'id' and 'name' on 'schema_key'.

        Checks both included fields through the session dataframe, then
        checks the top-level keys, the presence of both fields, the 'id'
        values and the 'name' index in the HDF5 output.
        """
        included_fields = ['id', 'name']
        include = {'schema_key': included_fields}
        exclude = {}

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, include, exclude, self.ts,
                chunk_row_size=self.chunk_row_size)

            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(frame['id'].data[:].tolist(), [1, 2, 3, 4, 5])
            self.assertEqual(frame['name'].data[:],
                             ['a', 'bb', 'ccc', 'dddd', 'eeeee'])

        with h5py.File(buffer, 'r') as h5:
            self.assertListEqual(list(h5.keys()), ['schema_key'])

            table = h5['schema_key']
            stored_fields = set(table.keys())
            self.assertTrue(stored_fields.issuperset({'id', 'name'}))

            self.assertEqual(table['id']['values'][:].tolist(),
                             [1, 2, 3, 4, 5])
            self.assertEqual(table['name']['index'][:].tolist(),
                             [0, 1, 3, 6, 10, 15])
예제 #8
0
    def test_numeric_importer_with_non_empty_valid_value_in_strict_mode(self):
        """Import the 'id' field and confirm no 'id_valid' field is created.

        The 'id' values and the absence of 'id_valid' are checked through
        the session dataframe and again in the HDF5 output.
        """
        expected_ids = [1, 2, 3, 4, 5]

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, {}, {}, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(frame['id'].data[:].tolist(), expected_ids)
            self.assertNotIn('id_valid', frame)

        with h5py.File(buffer, 'r') as h5:
            table = h5['schema_key']
            self.assertEqual(table['id']['values'][:].tolist(), expected_ids)
            self.assertNotIn('id_valid', set(table.keys()))
예제 #9
0
    def test_numeric_field_importer_with_small_chunk_size(self):
        """Import the numeric fields 'age', 'height' and 'weight_change'.

        Expected values are built as numpy arrays of the matching dtype and
        compared with the session dataframe and with the HDF5 output.
        """
        # numeric int field
        ages = list(np.array([30, 40, 50, 60, 70], dtype=np.int32))

        # numeric float field with default value
        heights = list(
            np.array([170.9, 180.2, 160.5, 160.5, 161.0], dtype=np.float32))

        # numeric float field with min_default_value
        float32_min = utils.get_min_max('float32')[0]
        weight_changes = list(
            np.array([21.2, float32_min, -17.5, -17.5, 2.5],
                     dtype=np.float32))

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, {}, {}, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertEqual(frame['age'].data[:].tolist(), ages)
            self.assertEqual(frame['height'].data[:].tolist(), heights)
            self.assertEqual(frame['weight_change'].data[:].tolist(),
                             weight_changes)

        with h5py.File(buffer, 'r') as h5:
            table = h5['schema_key']
            self.assertListEqual(table['age']['values'][:].tolist(), ages)
            self.assertListEqual(table['height']['values'][:].tolist(),
                                 heights)
            self.assertListEqual(
                table['weight_change']['values'][:].tolist(),
                weight_changes)
예제 #10
0
    def test_importer_datetime_with_create_day_field(self):
        """Import 'updated_at' and check the accompanying 'updated_at_day'.

        Expected datetimes are converted to UTC POSIX timestamps and compared
        with the imported 'updated_at' values; 'updated_at_day' must hold
        the matching date strings as bytes. Both are checked through the
        session dataframe and in the HDF5 output.
        """
        expected_updated_at_list = [
            '2020-05-12 07:00:00', '2020-05-13 01:00:00',
            '2020-05-14 03:00:00', '2020-05-15 03:00:00', '2020-05-16 03:00:00'
        ]
        expected_updated_at_date_list = [
            b'2020-05-12', b'2020-05-13', b'2020-05-14', b'2020-05-15',
            b'2020-05-16'
        ]
        # Both checks compare against the same timestamps, so build once.
        expected_updated_at_timestamps = [
            datetime.strptime(x, "%Y-%m-%d %H:%M:%S").replace(
                tzinfo=timezone.utc).timestamp()
            for x in expected_updated_at_list
        ]

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False, {}, {},
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            ds = s.get_dataset(self.ds_name)
            df = ds.get_dataframe('schema_key')
            self.assertEqual(df['updated_at'].data[:].tolist(),
                             expected_updated_at_timestamps)
            self.assertEqual(df['updated_at_day'].data[:].tolist(),
                             expected_updated_at_date_list)

        with h5py.File(bio, 'r') as hf:
            # assertAlmostEqual cannot subtract lists: on a mismatch it
            # raises TypeError instead of reporting the difference, so a
            # list comparison is used here.
            self.assertListEqual(
                hf['schema_key']['updated_at']['values'][:].tolist(),
                expected_updated_at_timestamps)
            self.assertEqual(
                hf['schema_key']['updated_at_day']['values'][:].tolist(),
                expected_updated_at_date_list)
예제 #11
0
    def test_importer_with_arg_exclude(self):
        """Import with 'updated_at' excluded and confirm it is absent.

        The field must be missing from the session dataframe and from the
        'schema_key' group of the HDF5 output.
        """
        excluded_field = 'updated_at'
        include = {}
        exclude = {'schema_key': [excluded_field]}
        buffer = BytesIO()

        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, include, exclude, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')
            self.assertNotIn('updated_at', frame)

        with h5py.File(buffer, 'r') as h5:
            stored_fields = set(h5['schema_key'].keys())
            self.assertNotIn('updated_at', stored_fields)
예제 #12
0
    def test_importer_with_wrong_arg_include(self):
        """Expect an error when the include mapping names an unknown table.

        The include mapping refers to 'schema_wrong_key', which is not a key
        of the input files; the import must raise with the exact message
        asserted below.
        """
        bio = BytesIO()
        include, exclude = {'schema_wrong_key': ['id', 'name']}, {}

        # Use the session as a context manager so it is closed once the test
        # finishes, matching the other tests here.
        with session.Session() as s:
            with self.assertRaises(Exception) as context:
                importer.import_with_schema(
                    s,
                    bio,
                    self.ds_name,
                    self.schema,
                    self.files,
                    False,
                    include,
                    exclude,
                    self.ts,
                    chunk_row_size=self.chunk_row_size)

        self.assertEqual(
            str(context.exception),
            "-n/--include: the following include table(s) are not part of "
            "any input files: {'schema_wrong_key'}")
예제 #13
0
    def test_numeric_importer_in_relaxed_mode(self):
        """Check 'height', 'BMI' and their validity-flag fields after import.

        Verifies the 'height' values, that no 'height_valid' field exists
        while 'height_valid_test' holds the expected flags, and that 'BMI' /
        'BMI_valid' contain the expected content. Checks are made through
        the session dataframe and in the HDF5 output.
        """
        expected_height_list = list(
            np.asarray([170.9, 180.2, 160.5, 160.5, 161.0], dtype=np.float32))

        bio = BytesIO()
        with session.Session() as s:
            importer.import_with_schema(s,
                                        bio,
                                        self.ds_name,
                                        self.schema,
                                        self.files,
                                        False,
                                        None,
                                        None,
                                        self.ts,
                                        chunk_row_size=self.chunk_row_size)
            ds = s.get_dataset(self.ds_name)
            df = ds.get_dataframe('schema_key')
            self.assertEqual(df['height'].data[:].tolist(),
                             expected_height_list)
            self.assertNotIn('height_valid', df)
            self.assertEqual(df['height_valid_test'].data[:].tolist(),
                             [True, True, False, False, True])
            self.assertEqual(df['BMI_valid'].data[:].tolist(),
                             [True, True, True, True, True])

        with h5py.File(bio, 'r') as hf:
            self.assertEqual(hf['schema_key']['height']['values'][:].tolist(),
                             expected_height_list)
            self.assertNotIn('height_valid', set(hf['schema_key'].keys()))
            # assertTrue(actual, expected) would treat `expected` as the
            # failure message and pass for any non-empty list, so the stored
            # values are compared for real below.
            self.assertEqual(
                hf['schema_key']['height_valid_test']['values'][:].tolist(),
                [True, True, False, False, True])
            # NOTE(review): the stored dtype of 'BMI' is not visible here, so
            # compare with a tolerance that covers float32 rounding.
            np.testing.assert_allclose(
                hf['schema_key']['BMI']['values'][:],
                [20.5, 25.4, 27.2, 27.2, 20.2],
                rtol=1e-6)
            self.assertEqual(
                hf['schema_key']['BMI_valid']['values'][:].tolist(),
                [True, True, True, True, True])
예제 #14
0
    def test_leaky_categorical_field_importer(self):
        """Import 'degree' together with its 'degree_freetext' field.

        The 'degree' codes and the index/values arrays of 'degree_freetext'
        are checked through the session dataframe and again in the HDF5
        output.
        """
        degree_codes = [1, 2, 0, -1, 3]
        freetext_index = [0, 0, 0, 0, 4, 4]
        # The expected free text as a list of uint8 values.
        freetext_bytes = list(np.frombuffer(b'prof', dtype=np.uint8))

        buffer = BytesIO()
        with session.Session() as sess:
            importer.import_with_schema(
                sess, buffer, self.ds_name, self.schema, self.files,
                False, None, None, self.ts,
                chunk_row_size=self.chunk_row_size)
            frame = sess.get_dataset(self.ds_name).get_dataframe('schema_key')

            self.assertEqual(frame['degree'].data[:].tolist(), degree_codes)
            self.assertEqual(frame['degree_freetext'].indices[:].tolist(),
                             freetext_index)
            self.assertEqual(frame['degree_freetext'].values[:].tolist(),
                             freetext_bytes)

        with h5py.File(buffer, 'r') as h5:
            table = h5['schema_key']
            stored_codes = list(table['degree']['values'][:])
            stored_index = list(table['degree_freetext']['index'][:])
            stored_bytes = list(table['degree_freetext']['values'][:])

            self.assertEqual(stored_codes, degree_codes)
            self.assertEqual(stored_index, freetext_index)
            self.assertEqual(stored_bytes, freetext_bytes)