def dataset(mockfs):
    """Build a ``ds.Dataset`` from the Parquet files found under 'subdir'.

    Files are selected recursively from ``'subdir'`` on ``mockfs`` and are
    partitioned by directory using two fields: ``group`` (int32) and
    ``key`` (string). The dataset schema is the one inspected by the
    source factory.
    """
    parquet_format = ds.ParquetFileFormat()
    file_selector = fs.FileSelector('subdir', recursive=True)
    factory_options = ds.FileSystemFactoryOptions('subdir')

    # Directory levels below 'subdir' map onto these fields, in order.
    partition_fields = [
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]
    factory_options.partitioning = ds.DirectoryPartitioning(
        pa.schema(partition_fields)
    )

    source_factory = ds.FileSystemSourceFactory(
        mockfs, file_selector, parquet_format, factory_options
    )
    # Inspect before finishing so the dataset uses the discovered schema.
    inspected_schema = source_factory.inspect()
    return ds.Dataset([source_factory.finish()], inspected_schema)
def dataset(mockfs):
    """Build a ``ds.Dataset`` from the Parquet files found under 'subdir'.

    Files are selected recursively from ``'subdir'`` on ``mockfs``. The
    partition scheme is a ``ds.SchemaPartitionScheme`` over two fields:
    ``group`` (int32) and ``key`` (string). The dataset schema is the one
    inspected by the data source discovery object.
    """
    parquet_format = ds.ParquetFileFormat()
    file_selector = fs.FileSelector('subdir', recursive=True)
    discovery_options = ds.FileSystemDiscoveryOptions('subdir')

    # Fields of the partition scheme, in order.
    partition_fields = [
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]
    discovery_options.partition_scheme = ds.SchemaPartitionScheme(
        pa.schema(partition_fields)
    )

    source_discovery = ds.FileSystemDataSourceDiscovery(
        mockfs, file_selector, parquet_format, discovery_options
    )
    # Inspect before finishing so the dataset uses the discovered schema.
    inspected_schema = source_discovery.inspect()
    return ds.Dataset([source_discovery.finish()], inspected_schema)
def test_file_system_factory(mockfs, paths_or_selector):
    """Check ``ds.FileSystemSourceFactory`` end to end.

    Covers the factory options' default values, schema inspection, source
    creation, and scanning of the resulting dataset. The scan is expected
    to produce two tasks whose batches carry four columns: the int64 and
    float64 data columns followed by the ``group`` and ``key`` partition
    columns.
    """
    parquet_format = ds.ParquetFileFormat()

    factory_options = ds.FileSystemFactoryOptions('subdir')
    partition_schema = pa.schema([
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    factory_options.partitioning = ds.DirectoryPartitioning(partition_schema)

    # Option values expected right after construction.
    assert factory_options.partition_base_dir == 'subdir'
    assert factory_options.ignore_prefixes == ['.', '_']
    assert factory_options.exclude_invalid_files is True

    factory = ds.FileSystemSourceFactory(
        mockfs, paths_or_selector, parquet_format, factory_options
    )

    inspected_schema = factory.inspect()
    assert isinstance(factory.inspect(), pa.Schema)
    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(
        factory.finish(inspected_schema), ds.FileSystemSource
    )
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    source = factory.finish()
    assert isinstance(source, ds.Source)

    dset = ds.Dataset([source], inspected_schema)
    scanner = dset.new_scan().finish()
    assert len(list(scanner.scan())) == 2

    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())

    # (group, key) partition values expected for each scan task, in order.
    expected_partitions = [(1, 'xxx'), (2, 'yyy')]
    for task, (group, key) in zip(scanner.scan(), expected_partitions):
        expected_columns = [
            expected_i64,
            expected_f64,
            pa.array([group] * 5, type=pa.int32()),
            pa.array([key] * 5, type=pa.string()),
        ]
        for batch in task.execute():
            assert batch.num_columns == 4
            for position, expected in enumerate(expected_columns):
                assert batch[position].equals(expected)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 4