def test_partitioning():
    """Check construction and path parsing of the partitioning classes.

    Both ``ds.DirectoryPartitioning`` and ``ds.HivePartitioning`` must be
    constructible from a schema, and ``parse`` must turn a path into the
    conjunction of per-field equality expressions.
    """
    basic_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    for partitioning_cls in (ds.DirectoryPartitioning, ds.HivePartitioning):
        assert isinstance(partitioning_cls(basic_schema), ds.Partitioning)

    # Directory flavour: path segments map positionally onto schema fields.
    directory = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int64()),
            pa.field('key', pa.float64()),
        ])
    )
    parsed = directory.parse('/3/3.14')
    assert isinstance(parsed, ds.Expression)
    group_matches = ds.field('group') == 3
    key_matches = ds.field('key') == 3.14
    assert parsed.equals(group_matches & key_matches)

    # This path is expected to be rejected by the parser.
    with pytest.raises(pa.ArrowInvalid):
        directory.parse('/prefix/3/aaa')

    # Hive flavour: segments are written as ``name=value``.
    hive = ds.HivePartitioning(
        pa.schema([
            pa.field('alpha', pa.int64()),
            pa.field('beta', pa.int64()),
        ])
    )
    parsed = hive.parse('/alpha=0/beta=3')
    alpha_matches = ds.field('alpha') == ds.scalar(0)
    beta_matches = ds.field('beta') == ds.scalar(3)
    assert parsed.equals(alpha_matches & beta_matches)
def test_expression_construction():
    """Smoke-test building expressions through Python operators.

    The constructed expressions are deliberately discarded: the test only
    checks which constructions succeed and which raise ``TypeError``.
    """
    int_zero = ds.scalar(0)
    int_one = ds.scalar(1)
    yes = ds.scalar(True)
    no = ds.scalar(False)
    text = ds.scalar("string")
    column = ds.field("field")

    # Valid constructions, evaluated only for the absence of exceptions.
    int_zero | int_one == text
    ~yes == no
    for bool_type in ("bool", pa.bool_()):
        column.cast(bool_type) == yes
    column.isin([1, 2])

    # isin() with a bare int must be rejected.
    with pytest.raises(TypeError):
        column.isin(1)

    # operations with non-scalar values
    invalid_constructions = (
        lambda: column == [1],
        lambda: column != {1},
        lambda: column & [1],
        lambda: column | [1],
    )
    for construct in invalid_constructions:
        with pytest.raises(TypeError):
            construct()
def test_expression_ergonomics():
    """Check that the operator shortcuts build the explicit expression classes.

    Every expression written with ``ds.scalar`` / ``ds.field`` and Python
    operators is compared against the same expression spelled out with
    the concrete ``ds.*Expression`` constructors.
    """
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    # Helper-built scalars versus explicitly constructed ones.
    scalar_cases = (
        (one, 1),
        (zero, 0),
        (true, True),
        (false, False),
        (string, "string"),
    )
    for built, raw_value in scalar_cases:
        assert built.equals(ds.ScalarExpression(raw_value))
    assert field.equals(ds.FieldExpression("field"))

    # ``&`` and ``|`` also accept a plain Python value on either side.
    and_expected = ds.AndExpression(
        ds.ScalarExpression(1), ds.ScalarExpression(0)
    )
    for candidate in [one & zero, 1 & zero, one & 0]:
        assert candidate.equals(and_expected)

    or_expected = ds.OrExpression(
        ds.ScalarExpression(1), ds.ScalarExpression(0)
    )
    for candidate in [one | zero, 1 | zero, one | 0]:
        assert candidate.equals(or_expected)

    # Python comparison operator -> dataset comparison operator.
    operator_table = {
        operator.eq: ds.CompareOperator.Equal,
        operator.ne: ds.CompareOperator.NotEqual,
        operator.ge: ds.CompareOperator.GreaterEqual,
        operator.le: ds.CompareOperator.LessEqual,
        operator.lt: ds.CompareOperator.Less,
        operator.gt: ds.CompareOperator.Greater,
    }
    for python_op, dataset_op in operator_table.items():
        built = python_op(zero, one)
        assert built.equals(ds.ComparisonExpression(dataset_op, zero, one))

    # Negation combined with equality.
    negated = ~true == false
    negated_expected = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False),
    )
    assert negated.equals(negated_expected)

    # The cast target may be given as a type name or as a DataType.
    for bool_type in ("bool", pa.bool_()):
        cast_cmp = field.cast(bool_type) == true
        cast_expected = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True),
        )
        assert cast_cmp.equals(cast_expected)

    membership = field.isin([1, 2])
    membership_expected = ds.InExpression(
        ds.FieldExpression("field"), pa.array([1, 2])
    )
    assert membership.equals(membership_expected)

    with pytest.raises(TypeError):
        field.isin(1)
def test_expression_serialization():
    """Check validate/assume on a comparison and pickle round-trips.

    Each expression in the sample list must be a ``ds.Expression`` and
    must compare equal to itself after ``pickle.dumps``/``pickle.loads``.
    """
    a = ds.scalar(1)
    b = ds.scalar(1.1)
    c = ds.scalar(True)
    d = ds.scalar("string")
    e = ds.scalar(None)

    greater_than_five = ds.field('i64') > 5
    table_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    assert greater_than_five.validate(table_schema) == pa.bool_()

    # Assuming a concrete value for the field folds the comparison.
    assumed_five = greater_than_five.assume(ds.field('i64') == 5)
    assert assumed_five.equals(ds.scalar(False))
    assumed_seven = greater_than_five.assume(ds.field('i64') == 7)
    assert assumed_seven.equals(ds.scalar(True))

    samples = [
        a, b, c, d, e,
        a == b,
        a > b,
        a & b,
        a | b,
        ~c,
        d.is_valid(),
        # NOTE(review): the same unsafe cast is listed twice; possibly one
        # was meant to use safe=True -- confirm before changing.
        a.cast(pa.int32(), safe=False),
        a.cast(pa.int32(), safe=False),
        a.isin([1, 2, 3]),
        ds.field('i64') > 5,
        ds.field('i64') == 5,
        ds.field('i64') == 7,
    ]
    for sample in samples:
        assert isinstance(sample, ds.Expression)
        round_tripped = pickle.loads(pickle.dumps(sample))
        assert sample.equals(round_tripped)
'!=': operator.ne, 'and': operator.and_, 'or': operator.or_, 'in': lambda _x, _l: _x in _l, 'not in': lambda _x, _l: _x not in _l, # todo: dont know how this will be used filter tuple that uses three # elements # '~': lambda _x: ~x, # todo: not sure about this # 'not': lambda _x: ~x, } # do not try to reuse code for operators as we encounter problem of using # last value while looping over dict _OP_MAPPER (loop rolling issue with python) _OP_MAPPER_EXP = { '=': lambda _x, _y: operator.eq(pds.field(_x), pds.scalar(_y)), '==': lambda _x, _y: operator.eq(pds.field(_x), pds.scalar(_y)), '<': lambda _x, _y: operator.lt(pds.field(_x), pds.scalar(_y)), '>': lambda _x, _y: operator.gt(pds.field(_x), pds.scalar(_y)), '<=': lambda _x, _y: operator.le(pds.field(_x), pds.scalar(_y)), '>=': lambda _x, _y: operator.ge(pds.field(_x), pds.scalar(_y)), '!=': lambda _x, _y: operator.ne(pds.field(_x), pds.scalar(_y)), 'and': lambda _x, _y: operator.and_(pds.field(_x), pds.scalar(_y)), 'or': lambda _x, _y: operator.or_(pds.field(_x), pds.scalar(_y)), 'in': lambda _x, _l: pds.field(_x).isin(_l), 'not in': lambda _x, _l: operator.inv(pds.field(_x).isin(_l)), # todo: dont know how this will be used filter tuple that uses three # elements # '~': lambda _x: operator.inv(pds.scalar(_x)), # todo: not sure about this # 'not': lambda _x: operator.inv(pds.scalar(_x)),
def test_filesystem_factory(mockfs, paths_or_selector):
    """Build a dataset through ``ds.FileSystemDatasetFactory`` and scan it.

    Checks the factory option defaults, the inspected schema (file
    columns followed by the partition fields), and the contents of every
    record batch produced by scanning the finished dataset.
    """
    file_format = ds.ParquetFileFormat(
        read_options=ds.ParquetReadOptions(dictionary_columns={"str"})
    )

    factory_options = ds.FileSystemFactoryOptions('subdir')
    factory_options.partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int32()),
            pa.field('key', pa.string()),
        ])
    )
    assert factory_options.partition_base_dir == 'subdir'
    assert factory_options.selector_ignore_prefixes == ['.', '_']
    assert factory_options.exclude_invalid_files is False

    factory = ds.FileSystemDatasetFactory(
        mockfs, paths_or_selector, file_format, factory_options
    )
    inspected_schema = factory.inspect()

    expected_schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
        pa.field('str', pa.dictionary(pa.int32(), pa.string())),
        pa.field('const', pa.int64()),
        pa.field('group', pa.int32()),
        pa.field('key', pa.string()),
    ])
    assert factory.inspect().equals(expected_schema, check_metadata=False)

    assert isinstance(factory.inspect_schemas(), list)
    assert isinstance(factory.finish(inspected_schema),
                      ds.FileSystemDataset)
    assert factory.root_partition.equals(ds.scalar(True))

    dataset = factory.finish()
    assert isinstance(dataset, ds.FileSystemDataset)
    assert len(list(dataset.scan())) == 2

    scanner = ds.Scanner.from_dataset(dataset)

    # Columns whose expected content is the same for every scan task.
    row_values = [0, 1, 2, 3, 4]
    expected_i64 = pa.array(row_values, type=pa.int64())
    expected_f64 = pa.array(row_values, type=pa.float64())
    expected_str = pa.DictionaryArray.from_arrays(
        pa.array(row_values, type=pa.int32()),
        pa.array("0 1 2 3 4".split(), type=pa.string())
    )

    for task, group, key in zip(scanner.scan(), [1, 2], ['xxx', 'yyy']):
        # Columns whose expected content depends on the task's partition.
        expected_group = pa.array([group] * 5, type=pa.int32())
        expected_key = pa.array([key] * 5, type=pa.string())
        expected_const = pa.array([group - 1] * 5, type=pa.int64())
        expected_columns = (
            expected_i64,
            expected_f64,
            expected_str,
            expected_const,
            expected_group,
            expected_key,
        )
        for batch in task.execute():
            assert batch.num_columns == 6
            for position, expected_column in enumerate(expected_columns):
                assert batch[position].equals(expected_column)

    table = dataset.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10
    assert table.num_columns == 6
def test_filesystem_dataset(mockfs):
    """Exercise direct construction of ``ds.FileSystemDataset``.

    Covers keyword and positional construction, rejection of missing or
    mistyped arguments with ``TypeError``, and the partition expressions,
    paths and row-group fragments exposed by the resulting dataset.
    """
    schema = pa.schema([
        pa.field('const', pa.int64())
    ])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.scalar(True), ds.scalar(True)]

    dataset = ds.FileSystemDataset(
        schema=schema,
        format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)
    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.field('level') == ds.scalar(1337)
    partitions = [ds.field('part') == x for x in range(1, 3)]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(partition)
        assert fragment.path == path
        assert isinstance(fragment.format, ds.ParquetFileFormat)
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        # Type-check the row-group fragment itself; the parent `fragment`
        # was already checked above, so repeating that check verified
        # nothing about the object returned here.
        assert isinstance(row_group_fragments[0], ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # Filtering on the 'const' column still yields both fragments.
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2