def test_expression_ergonomics():
    """Exercise the convenience constructors and operator overloads on
    dataset expressions, checking each against its explicit class form."""
    scalar_zero = ds.scalar(0)
    scalar_one = ds.scalar(1)
    scalar_true = ds.scalar(True)
    scalar_false = ds.scalar(False)
    scalar_str = ds.scalar("string")
    field_expr = ds.field("field")

    # ds.scalar / ds.field are shorthand for the explicit expression classes
    assert scalar_one.equals(ds.ScalarExpression(1))
    assert scalar_zero.equals(ds.ScalarExpression(0))
    assert scalar_true.equals(ds.ScalarExpression(True))
    assert scalar_false.equals(ds.ScalarExpression(False))
    assert scalar_str.equals(ds.ScalarExpression("string"))
    assert field_expr.equals(ds.FieldExpression("field"))

    # & builds an AndExpression; plain Python values are coerced on either side
    conjunction = ds.AndExpression(ds.ScalarExpression(1),
                                   ds.ScalarExpression(0))
    for combined in [scalar_one & scalar_zero,
                     1 & scalar_zero,
                     scalar_one & 0]:
        assert combined.equals(conjunction)

    # | builds an OrExpression, with the same coercion behavior
    disjunction = ds.OrExpression(ds.ScalarExpression(1),
                                  ds.ScalarExpression(0))
    for combined in [scalar_one | scalar_zero,
                     1 | scalar_zero,
                     scalar_one | 0]:
        assert combined.equals(disjunction)

    # each rich-comparison operator maps onto a ComparisonExpression
    operator_pairs = [
        (operator.eq, ds.CompareOperator.Equal),
        (operator.ne, ds.CompareOperator.NotEqual),
        (operator.ge, ds.CompareOperator.GreaterEqual),
        (operator.le, ds.CompareOperator.LessEqual),
        (operator.lt, ds.CompareOperator.Less),
        (operator.gt, ds.CompareOperator.Greater),
    ]
    for py_op, compare_op in operator_pairs:
        built = py_op(scalar_zero, scalar_one)
        explicit = ds.ComparisonExpression(compare_op, scalar_zero, scalar_one)
        assert built.equals(explicit)

    # ~ negates; the negation composes with comparison
    built = ~scalar_true == scalar_false
    explicit = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False))
    assert built.equals(explicit)

    # .cast accepts both a type name string and a pyarrow DataType
    for target_type in ("bool", pa.bool_()):
        built = field_expr.cast(target_type) == scalar_true
        explicit = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True))
        assert built.equals(explicit)

    # .isin takes a sequence and wraps it in an InExpression
    built = field_expr.isin([1, 2])
    explicit = ds.InExpression(ds.FieldExpression("field"), pa.array([1, 2]))
    assert built.equals(explicit)

    # a non-sequence argument to .isin is rejected
    with pytest.raises(TypeError):
        field_expr.isin(1)

    # operations with non-scalar values
    with pytest.raises(TypeError):
        field_expr == [1]
    with pytest.raises(TypeError):
        field_expr != {1}
    with pytest.raises(TypeError):
        field_expr & [1]
    with pytest.raises(TypeError):
        field_expr | [1]
def test_filesystem_dataset(mockfs):
    """Construct FileSystemDataset objects against the mock filesystem,
    verifying keyword defaults, argument validation, partition expressions
    on fragments, and row-group-level predicate pushdown."""
    schema = pa.schema([pa.field('const', pa.int64())])
    file_format = ds.ParquetFileFormat()
    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema=schema,
                                   root_partition=None,
                                   format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # the root_partition and partitions keywords have defaults
    dataset = ds.FileSystemDataset(
        paths, schema, format=file_format, filesystem=mockfs,
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    # validation of required arguments
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, format=file_format, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, filesystem=mockfs)
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format)

    # validation of root_partition
    with pytest.raises(TypeError, match="incorrect type"):
        ds.FileSystemDataset(paths, schema=schema, format=file_format,
                             filesystem=mockfs, root_partition=1)

    root_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                             ds.FieldExpression('level'),
                                             ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    # every fragment carries (root_partition AND its own partition)
    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0