import pytest

import pyarrow as pa
import pyarrow.dataset as ds


def test_scanner(dataset):
    scanner = ds.Scanner(dataset, memory_pool=pa.default_memory_pool())
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    with pytest.raises(pa.ArrowInvalid):
        dataset.scan(columns=['unknown'])

    scanner = ds.Scanner(dataset, columns=['i64'],
                         memory_pool=pa.default_memory_pool())
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2
    for task in scanner.scan():
        for batch in task.execute():
            assert batch.num_columns == 1


def test_dataset(dataset):
    assert isinstance(dataset, ds.Dataset)
    assert isinstance(dataset.schema, pa.Schema)

    # TODO(kszucs): test non-boolean Exprs for filter do raise

    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task in dataset.scan():
        assert isinstance(task, ds.ScanTask)
        for batch in task.execute():
            assert batch.column(0).equals(expected_i64)
            assert batch.column(1).equals(expected_f64)

    batches = dataset.to_batches()
    assert all(isinstance(batch, pa.RecordBatch) for batch in batches)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10

    condition = ds.field('i64') == 1
    scanner = ds.Scanner(dataset, use_threads=True, filter=condition)
    result = scanner.to_table().to_pydict()

    # don't rely on the scanning order
    assert result['i64'] == [1, 1]
    assert result['f64'] == [1., 1.]
    assert sorted(result['group']) == [1, 2]
    assert sorted(result['key']) == ['xxx', 'yyy']
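

# The two tests above rely on a `dataset` fixture defined elsewhere. The
# version below is only a hedged sketch of what they assume, not the real
# fixture: two partitioned parquet fragments of five rows each, yielding
# the i64, f64, group and key columns (and the 10-row total) the
# assertions check. It assumes `tempdir` is a pathlib.Path fixture.
import pyarrow.parquet as pq


@pytest.fixture
def dataset(tempdir):
    # Write one five-row fragment per (group, key) partition directory.
    for group, key in [(1, 'xxx'), (2, 'yyy')]:
        directory = tempdir / str(group) / key
        directory.mkdir(parents=True)
        table = pa.table({
            'i64': pa.array(list(range(5)), type=pa.int64()),
            'f64': pa.array(list(range(5)), type=pa.float64()),
        })
        pq.write_table(table, str(directory / 'data.parquet'))
    # Recover group/key from the directory structure as partition columns.
    partitioning = ds.DirectoryPartitioning(
        pa.schema([('group', pa.int32()), ('key', pa.string())]))
    return ds.dataset(str(tempdir), format='parquet',
                      partitioning=partitioning)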


def test_filesystem_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(mockfs, paths_or_selector, format,
                                          options)
    inspected_schema = factory.inspect()

    assert factory.inspect().equals(pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ]), check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, 2, 3, 4], type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        for batch in task.execute():
            assert batch.num_columns == 6
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_str)
            assert batch[3].equals(expected_const)
            assert batch[4].equals(expected_group)
            assert batch[5].equals(expected_key)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
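

# A hedged sketch of the `mockfs` and `paths_or_selector` fixtures the
# factory tests assume (the real fixtures live elsewhere in this module):
# an in-memory filesystem holding one five-row parquet file per partition
# directory under 'subdir', addressed either as an explicit path list or a
# recursive selector. `fs._MockFileSystem` is an internal pyarrow class
# assumed here; any in-memory filesystem would do. Note this sketch is
# sized to the six-column schema asserted in test_filesystem_factory above;
# the four-column test_file_system_factory below would need a variant
# without the str/const columns.
from pyarrow import fs


@pytest.fixture
def mockfs():
    mockfs = fs._MockFileSystem()
    for i, directory in enumerate(['subdir/1/xxx', 'subdir/2/yyy']):
        mockfs.create_dir(directory)
        table = pa.table({
            'i64': pa.array(list(range(5)), type=pa.int64()),
            'f64': pa.array(list(range(5)), type=pa.float64()),
            'str': pa.array([str(j) for j in range(5)], type=pa.string()),
            # const equals group - 1, matching expected_const above
            'const': pa.array([i] * 5, type=pa.int64()),
        })
        path = '{}/file{}.parquet'.format(directory, i)
        with mockfs.open_output_stream(path) as out:
            pq.write_table(table, out)
    return mockfs


@pytest.fixture(params=['paths', 'selector'])
def paths_or_selector(request):
    # The factories accept either explicit file paths or a FileSelector.
    if request.param == 'paths':
        return ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    return fs.FileSelector('subdir', recursive=True)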


def test_filter_implicit_cast(tempdir):
    # ARROW-7652
    table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())})
    _, path = _create_single_file(tempdir, table)
    dataset = ds.dataset(str(path))

    filter_ = ds.field('a') > 2
    scanner = ds.Scanner(dataset, filter=filter_)
    result = scanner.to_table()
    assert len(result) == 3
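

# A hedged sketch of the `_create_single_file` helper used above (the real
# helper is defined elsewhere in this module): it is assumed to write
# `table` to one parquet file under `base_dir` and return the table
# together with the file path; the file name is a placeholder.
def _create_single_file(base_dir, table):
    path = base_dir / 'single-file.parquet'
    pq.write_table(table, str(path))
    return table, path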


def test_file_system_factory(mockfs, paths_or_selector):
    format = ds.ParquetFileFormat()

    options = ds.FileSystemFactoryOptions('subdir')
    options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string())
        ])
    )
    assert options.partition_base_dir == 'subdir'
    assert options.ignore_prefixes == ['.', '_']
    assert options.exclude_invalid_files is True

    factory = ds.FileSystemSourceFactory(mockfs, paths_or_selector, format,
                                         options)
    inspected_schema = factory.inspect()
    assert isinstance(factory.inspect(), pa.Schema)
    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema), ds.FileSystemSource)
    assert factory.root_partition.equals(ds.ScalarExpression(True))

    source = factory.finish()
    assert isinstance(source, ds.Source)

    dataset = ds.Dataset([source], inspected_schema)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner(dataset)
    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        expected_group_column = pa.array([group] * 5, type=pa.int32())
        expected_key_column = pa.array([key] * 5, type=pa.string())
        for batch in task.execute():
            assert batch.num_columns == 4
            assert batch[0].equals(expected_i64)
            assert batch[1].equals(expected_f64)
            assert batch[2].equals(expected_group_column)
            assert batch[3].equals(expected_key_column)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 4