def test_process_csv_file(self):
    """Check that the builder's CSV processing yields the expected rows.

    Builds a schema from ``self.test_csv_file_path``, registers ``col_0``
    as a categorical column, runs ``Dataset.Builder._process_csv_files``
    with ``parallelism_level=2`` and verifies that 10 rows come back and
    that ``col_0`` ends up with 4 categories in its metadata.
    """
    schema = Schema.from_csv(csv_path=self.test_csv_file_path)
    # Named ``model_input`` so the ``input`` builtin is not shadowed.
    model_input = Input(schema)
    model_input.add_categorical_column('col_0')

    builder = Dataset.Builder(
        input=model_input,
        name="test",
        root_dir=self.test_dir,
        parallelism_level=2,
    )
    rows = builder._process_csv_files()
    self.assertEqual(len(rows), 10)

    for column in model_input.columns:
        if column.name == 'col_0':
            # assertEqual, not assertTrue: with assertTrue the ``4`` was
            # taken as the failure message, so any non-empty category
            # list passed the check.
            self.assertEqual(len(column.metadata.categories), 4)
def create_test_dataset(test_dir, test_csv_file_path, dataset_name, header=False, is_related_path=False):
    """Build and return a test dataset from a CSV file.

    Args:
        test_dir: Passed to ``Dataset.Builder`` as its third positional
            argument.
        test_csv_file_path: Path of the CSV file the schema is read from.
        dataset_name: Passed to ``Dataset.Builder`` as its second
            positional argument.
        header: Forwarded to ``Schema.from_csv``. When truthy, the
            ``_h``-suffixed column names are used instead of the plain ones.
        is_related_path: Forwarded to the ``Img2DColumn`` constructor.

    Returns:
        Whatever ``Dataset.Builder(...).build()`` returns.
    """
    # Column names differ depending on whether the CSV carries a header row.
    if header:
        categorical_name, numeric_name, image_name = 'col_0_h', 'col_1_h', 'col_5_h'
    else:
        categorical_name, numeric_name, image_name = 'col_0', 'col_1', 'col_5'

    csv_schema = Schema.from_csv(csv_path=test_csv_file_path, header=header)
    # Merge the column range (2, 4) into a single 'col_vector' column.
    csv_schema.merge_columns_in_range('col_vector', (2, 4))

    model_input = Input(csv_schema)
    model_input.add_categorical_column(categorical_name)
    model_input.add_numeric_column(numeric_name)
    model_input.add_vector_column('col_vector')
    model_input.add_column(image_name, Img2DColumn(is_related_path=is_related_path))

    builder = Dataset.Builder(model_input, dataset_name, test_dir, parallelism_level=2)
    return builder.build()