def test_expression():
    """Exercise the Expression class hierarchy: construction, accessors,
    equality, simplification, string forms, and pickle round-tripping."""
    # Scalar literals covering each supported Python type (incl. None).
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")
    e = ds.ScalarExpression(None)

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    greater = a > b
    assert equal.op == ds.CompareOperator.Equal

    # Boolean conjunction exposes both operands and structural equality.
    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    or_ = ds.OrExpression(a, b)
    not_ = ds.NotExpression(ds.OrExpression(a, b))
    is_valid = ds.IsValidExpression(a)
    cast_safe = ds.CastExpression(a, pa.int32())
    cast_unsafe = ds.CastExpression(a, pa.int32(), safe=False)
    in_ = ds.InExpression(a, pa.array([1, 2, 3]))

    assert is_valid.operand == a
    assert in_.set_.equals(pa.array([1, 2, 3]))
    assert cast_unsafe.to == pa.int32()
    assert cast_unsafe.safe is False
    assert cast_safe.safe is True

    condition = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5),
    )
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    # A comparison validates to a boolean type against a matching schema.
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5),
    )
    i64_is_7 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(7),
    )
    # assume() simplifies under a given predicate: i64 == 5 falsifies
    # (i64 > 5), while i64 == 7 satisfies it.
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))

    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)

    # Every expression variant above must survive a pickle round-trip.
    all_exprs = [
        a, b, c, d, e, equal, greater, and_, or_, not_,
        is_valid, cast_unsafe, cast_safe, in_, condition,
        i64_is_5, i64_is_7,
    ]
    for expr in all_exprs:
        restored = pickle.loads(pickle.dumps(expr))
        assert expr.equals(restored)
def test_partitioning():
    """Check that partitioning schemes construct from a schema and parse
    paths into the expected equality expressions."""
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    # Both concrete schemes are subclasses of the Partitioning base.
    for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]:
        assert isinstance(klass(schema), ds.Partitioning)

    # Directory-style: positional path segments map onto schema fields.
    partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int64()),
            pa.field('key', pa.float64()),
        ])
    )
    expr = partitioning.parse('/3/3.14')
    assert isinstance(expr, ds.Expression)

    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('group'),
            ds.ScalarExpression(3),
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('key'),
            ds.ScalarExpression(3.14),
        ),
    )
    assert expr.equals(expected)

    # A path whose segments cannot be parsed against the schema raises.
    with pytest.raises(pa.ArrowInvalid):
        partitioning.parse('/prefix/3/aaa')

    # Hive-style: key=value segments map by name, order-independent.
    partitioning = ds.HivePartitioning(
        pa.schema([
            pa.field('alpha', pa.int64()),
            pa.field('beta', pa.int64()),
        ])
    )
    expr = partitioning.parse('/alpha=0/beta=3')
    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('alpha'),
            ds.ScalarExpression(0),
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('beta'),
            ds.ScalarExpression(3),
        ),
    )
    assert expr.equals(expected)
def test_filesystem_dataset(mockfs):
    """Build a FileSystemDataset over the mock filesystem and verify its
    format, files, and per-fragment partition expressions."""
    schema = pa.schema([])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    # Construction with a None root partition and trivially-true
    # per-file partitions.
    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # Rebuild with real partition expressions on the root and each file.
    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337),
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1),
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2),
        ),
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format,
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    # Each fragment carries (root AND its own) partition expression.
    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]
def test_expression():
    """Exercise Expression construction, accessors, equality,
    simplification, and string representations."""
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    # NOTE(review): op is called as a method here, while the other
    # test_expression in this file reads it as a property (.op) —
    # confirm which matches the pyarrow API version under test.
    assert equal.op() == ds.CompareOperator.Equal

    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    # Variadic and unary constructors; only construction is checked.
    ds.AndExpression(a, b, c)
    ds.OrExpression(a, b)
    ds.OrExpression(a, b, c, d)
    ds.NotExpression(ds.OrExpression(a, b, c))
    ds.IsValidExpression(a)
    ds.CastExpression(a, pa.int32())
    ds.CastExpression(a, pa.int32(), safe=True)
    ds.InExpression(a, pa.array([1, 2, 3]))

    condition = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5),
    )
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64()),
    ])
    # A comparison validates to a boolean type against a matching schema.
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5),
    )
    i64_is_7 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(7),
    )
    # assume() simplifies under a given predicate: i64 == 5 falsifies
    # (i64 > 5), while i64 == 7 satisfies it.
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))

    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)
def test_expression_ergonomics():
    """Verify the ds.scalar / ds.field convenience constructors and the
    Python operator overloads that build expressions."""
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    # Shorthand constructors are equivalent to the explicit classes.
    assert one.equals(ds.ScalarExpression(1))
    assert zero.equals(ds.ScalarExpression(0))
    assert true.equals(ds.ScalarExpression(True))
    assert false.equals(ds.ScalarExpression(False))
    assert string.equals(ds.ScalarExpression("string"))
    assert field.equals(ds.FieldExpression("field"))

    # & and | build And/Or expressions; plain Python values on either
    # side are wrapped via the reflected operators.
    expected = ds.AndExpression(ds.ScalarExpression(1),
                                ds.ScalarExpression(0))
    for expr in [one & zero, 1 & zero, one & 0]:
        assert expr.equals(expected)

    expected = ds.OrExpression(ds.ScalarExpression(1),
                               ds.ScalarExpression(0))
    for expr in [one | zero, 1 | zero, one | 0]:
        assert expr.equals(expected)

    # Each rich-comparison operator maps onto a CompareOperator value.
    comparison_ops = [
        (operator.eq, ds.CompareOperator.Equal),
        (operator.ne, ds.CompareOperator.NotEqual),
        (operator.ge, ds.CompareOperator.GreaterEqual),
        (operator.le, ds.CompareOperator.LessEqual),
        (operator.lt, ds.CompareOperator.Less),
        (operator.gt, ds.CompareOperator.Greater),
    ]
    for op, compare_op in comparison_ops:
        expr = op(zero, one)
        expected = ds.ComparisonExpression(compare_op, zero, one)
        assert expr.equals(expected)

    # ~ negates; it binds tighter than ==, so this compares (~true).
    expr = ~true == false
    expected = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False),
    )
    assert expr.equals(expected)

    # cast() accepts both a type name string and a DataType instance.
    for typ in ("bool", pa.bool_()):
        expr = field.cast(typ) == true
        expected = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True),
        )
        assert expr.equals(expected)

    # isin() takes an iterable of values; a bare scalar is rejected.
    expr = field.isin([1, 2])
    expected = ds.InExpression(ds.FieldExpression("field"),
                               pa.array([1, 2]))
    assert expr.equals(expected)

    with pytest.raises(TypeError):
        field.isin(1)
def test_filesystem_dataset(mockfs):
    """Build a FileSystemDataset over the mock filesystem and verify its
    format, files, fragments, row-group fragments, and predicate
    pushdown via row-group metadata."""
    schema = pa.schema([pa.field('const', pa.int64())])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    # Construction with a None root partition and trivially-true
    # per-file partitions.
    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # Rebuild with real partition expressions on the root and each file.
    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337),
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1),
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2),
        ),
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format,
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        # Each fragment carries (root AND its own) partition expression.
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        # A whole-file fragment has no row-group restriction yet.
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # Predicate pushdown using row-group metadata: the filter prunes
    # row groups in the second file but not the first.
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0