Пример #1
0
def test_filesystem_source(mockfs):
    schema = pa.schema([])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemSource(schema,
                                 source_partition=None,
                                 file_format=file_format,
                                 filesystem=mockfs,
                                 paths_or_selector=paths,
                                 partitions=partitions)

    source_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                               ds.FieldExpression('source'),
                                               ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    source = ds.FileSystemSource(paths_or_selector=paths,
                                 schema=schema,
                                 source_partition=source_partition,
                                 filesystem=mockfs,
                                 partitions=partitions,
                                 file_format=file_format)
    assert source.partition_expression.equals(source_partition)
Пример #2
0
def test_filesystem_data_source(mockfs):
    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    source = ds.FileSystemDataSource(mockfs,
                                     paths,
                                     partitions,
                                     source_partition=None,
                                     file_format=file_format)

    source_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                               ds.FieldExpression('source'),
                                               ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    source = ds.FileSystemDataSource(mockfs,
                                     paths,
                                     partitions,
                                     source_partition=source_partition,
                                     file_format=file_format)
    assert source.partition_expression.equals(source_partition)
Пример #3
0
def test_expression_ergonomics():
    zero = ds.scalar(0)
    one = ds.scalar(1)
    true = ds.scalar(True)
    false = ds.scalar(False)
    string = ds.scalar("string")
    field = ds.field("field")

    assert one.equals(ds.ScalarExpression(1))
    assert zero.equals(ds.ScalarExpression(0))
    assert true.equals(ds.ScalarExpression(True))
    assert false.equals(ds.ScalarExpression(False))
    assert string.equals(ds.ScalarExpression("string"))
    assert field.equals(ds.FieldExpression("field"))

    expected = ds.AndExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one & zero, 1 & zero, one & 0]:
        assert expr.equals(expected)

    expected = ds.OrExpression(ds.ScalarExpression(1), ds.ScalarExpression(0))
    for expr in [one | zero, 1 | zero, one | 0]:
        assert expr.equals(expected)

    comparison_ops = [
        (operator.eq, ds.CompareOperator.Equal),
        (operator.ne, ds.CompareOperator.NotEqual),
        (operator.ge, ds.CompareOperator.GreaterEqual),
        (operator.le, ds.CompareOperator.LessEqual),
        (operator.lt, ds.CompareOperator.Less),
        (operator.gt, ds.CompareOperator.Greater),
    ]
    for op, compare_op in comparison_ops:
        expr = op(zero, one)
        expected = ds.ComparisonExpression(compare_op, zero, one)
        assert expr.equals(expected)

    expr = ~true == false
    expected = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.NotExpression(ds.ScalarExpression(True)),
        ds.ScalarExpression(False)
    )
    assert expr.equals(expected)

    for typ in ("bool", pa.bool_()):
        expr = field.cast(typ) == true
        expected = ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.CastExpression(ds.FieldExpression("field"), pa.bool_()),
            ds.ScalarExpression(True)
        )
        assert expr.equals(expected)

    expr = field.isin([1, 2])
    expected = ds.InExpression(ds.FieldExpression("field"), pa.array([1, 2]))
    assert expr.equals(expected)

    with pytest.raises(TypeError):
        field.isin(1)
Пример #4
0
def test_expression():
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")
    e = ds.ScalarExpression(None)

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    greater = a > b
    assert equal.op == ds.CompareOperator.Equal

    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    or_ = ds.OrExpression(a, b)
    not_ = ds.NotExpression(ds.OrExpression(a, b))
    is_valid = ds.IsValidExpression(a)
    cast_safe = ds.CastExpression(a, pa.int32())
    cast_unsafe = ds.CastExpression(a, pa.int32(), safe=False)
    in_ = ds.InExpression(a, pa.array([1, 2, 3]))

    assert is_valid.operand == a
    assert in_.set_.equals(pa.array([1, 2, 3]))
    assert cast_unsafe.to == pa.int32()
    assert cast_unsafe.safe is False
    assert cast_safe.safe is True

    condition = ds.ComparisonExpression(ds.CompareOperator.Greater,
                                        ds.FieldExpression('i64'),
                                        ds.ScalarExpression(5))
    schema = pa.schema(
        [pa.field('i64', pa.int64()),
         pa.field('f64', pa.float64())])
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(5))
    i64_is_7 = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                       ds.FieldExpression('i64'),
                                       ds.ScalarExpression(7))
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))
    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)

    all_exprs = [
        a, b, c, d, e, equal, greater, and_, or_, not_, is_valid, cast_unsafe,
        cast_safe, in_, condition, i64_is_5, i64_is_7
    ]
    for expr in all_exprs:
        restored = pickle.loads(pickle.dumps(expr))
        assert expr.equals(restored)
Пример #5
0
def test_partitioning():
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    for klass in [ds.DirectoryPartitioning, ds.HivePartitioning]:
        partitioning = klass(schema)
        assert isinstance(partitioning, ds.Partitioning)

    partitioning = ds.DirectoryPartitioning(
        pa.schema([
            pa.field('group', pa.int64()),
            pa.field('key', pa.float64())
        ])
    )
    expr = partitioning.parse('/3/3.14')
    assert isinstance(expr, ds.Expression)

    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('group'),
            ds.ScalarExpression(3)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('key'),
            ds.ScalarExpression(3.14)
        )
    )
    assert expr.equals(expected)

    with pytest.raises(pa.ArrowInvalid):
        partitioning.parse('/prefix/3/aaa')

    partitioning = ds.HivePartitioning(
        pa.schema([
            pa.field('alpha', pa.int64()),
            pa.field('beta', pa.int64())
        ])
    )
    expr = partitioning.parse('/alpha=0/beta=3')
    expected = ds.AndExpression(
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('alpha'),
            ds.ScalarExpression(0)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('beta'),
            ds.ScalarExpression(3)
        )
    )
    assert expr.equals(expected)
Пример #6
0
def test_filesystem_dataset(mockfs):
    schema = pa.schema([pa.field('const', pa.int64())])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(schema,
                                   root_partition=None,
                                   file_format=file_format,
                                   filesystem=mockfs,
                                   paths_or_selector=paths,
                                   partitions=partitions)
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                             ds.FieldExpression('level'),
                                             ds.ScalarExpression(1337))
    partitions = [
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(1)),
        ds.ComparisonExpression(ds.CompareOperator.Equal,
                                ds.FieldExpression('part'),
                                ds.ScalarExpression(2))
    ]
    dataset = ds.FileSystemDataset(paths_or_selector=paths,
                                   schema=schema,
                                   root_partition=root_partition,
                                   filesystem=mockfs,
                                   partitions=partitions,
                                   file_format=file_format)
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    for fragment, partition, path in zip(fragments, partitions, paths):
        assert fragment.partition_expression.equals(
            ds.AndExpression(root_partition, partition))
        assert fragment.path == path
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert fragment.row_groups is None

        row_group_fragments = list(fragment.get_row_group_fragments())
        assert len(row_group_fragments) == 1
        assert isinstance(fragment, ds.ParquetFileFragment)
        assert row_group_fragments[0].path == path
        assert row_group_fragments[0].row_groups == {0}

    # test predicate pushdown using row group metadata
    fragments = list(dataset.get_fragments(filter=ds.field("const") == 0))
    assert len(fragments) == 2
    assert len(list(fragments[0].get_row_group_fragments())) == 1
    assert len(list(fragments[1].get_row_group_fragments())) == 0
Пример #7
0
def test_filesystem_dataset(mockfs):
    schema = pa.schema([])

    file_format = ds.ParquetFileFormat()

    paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet']
    partitions = [ds.ScalarExpression(True), ds.ScalarExpression(True)]

    dataset = ds.FileSystemDataset(
        schema,
        root_partition=None,
        file_format=file_format,
        filesystem=mockfs,
        paths_or_selector=paths,
        partitions=partitions
    )
    assert isinstance(dataset.format, ds.ParquetFileFormat)

    root_partition = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('level'),
        ds.ScalarExpression(1337)
    )
    partitions = [
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(1)
        ),
        ds.ComparisonExpression(
            ds.CompareOperator.Equal,
            ds.FieldExpression('part'),
            ds.ScalarExpression(2)
        )
    ]
    dataset = ds.FileSystemDataset(
        paths_or_selector=paths,
        schema=schema,
        root_partition=root_partition,
        filesystem=mockfs,
        partitions=partitions,
        file_format=file_format
    )
    assert dataset.partition_expression.equals(root_partition)
    assert set(dataset.files) == set(paths)

    fragments = list(dataset.get_fragments())
    assert fragments[0].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[0]))
    assert fragments[1].partition_expression.equals(
        ds.AndExpression(root_partition, partitions[1]))
    assert fragments[0].path == paths[0]
    assert fragments[1].path == paths[1]
Пример #8
0
def test_expression():
    a = ds.ScalarExpression(1)
    b = ds.ScalarExpression(1.1)
    c = ds.ScalarExpression(True)
    d = ds.ScalarExpression("string")

    equal = ds.ComparisonExpression(ds.CompareOperator.Equal, a, b)
    assert equal.op() == ds.CompareOperator.Equal

    and_ = ds.AndExpression(a, b)
    assert and_.left_operand.equals(a)
    assert and_.right_operand.equals(b)
    assert and_.equals(ds.AndExpression(a, b))
    assert and_.equals(and_)

    ds.AndExpression(a, b, c)
    ds.OrExpression(a, b)
    ds.OrExpression(a, b, c, d)
    ds.NotExpression(ds.OrExpression(a, b, c))
    ds.IsValidExpression(a)
    ds.CastExpression(a, pa.int32())
    ds.CastExpression(a, pa.int32(), safe=True)
    ds.InExpression(a, pa.array([1, 2, 3]))

    condition = ds.ComparisonExpression(
        ds.CompareOperator.Greater,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    schema = pa.schema([
        pa.field('i64', pa.int64()),
        pa.field('f64', pa.float64())
    ])
    assert condition.validate(schema) == pa.bool_()

    i64_is_5 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(5)
    )
    i64_is_7 = ds.ComparisonExpression(
        ds.CompareOperator.Equal,
        ds.FieldExpression('i64'),
        ds.ScalarExpression(7)
    )
    assert condition.assume(i64_is_5).equals(ds.ScalarExpression(False))
    assert condition.assume(i64_is_7).equals(ds.ScalarExpression(True))
    assert str(condition) == "(i64 > 5:int64)"
    assert "(i64 > 5:int64)" in repr(condition)
Пример #9
0
def test_dataset(dataset):
    assert isinstance(dataset, ds.Dataset)
    assert isinstance(dataset.schema, pa.Schema)

    # TODO(kszucs): test non-boolean expressions for filter do raise
    builder = dataset.new_scan()
    assert isinstance(builder, ds.ScannerBuilder)

    scanner = builder.finish()
    assert isinstance(scanner, ds.Scanner)
    assert len(list(scanner.scan())) == 2

    expected_i64 = pa.array([0, 1, 2, 3, 4], type=pa.int64())
    expected_f64 = pa.array([0, 1, 2, 3, 4], type=pa.float64())
    for task in scanner.scan():
        assert isinstance(task, ds.ScanTask)
        for batch in task.execute():
            assert batch.column(0).equals(expected_i64)
            assert batch.column(1).equals(expected_f64)

    table = scanner.to_table()
    assert isinstance(table, pa.Table)
    assert len(table) == 10

    condition = ds.ComparisonExpression(ds.CompareOperator.Equal,
                                        ds.FieldExpression('i64'),
                                        ds.ScalarExpression(1))
    scanner = dataset.new_scan().use_threads(True).filter(condition).finish()
    result = scanner.to_table().to_pydict()

    # don't rely on the scanning order
    assert result['i64'] == [1, 1]
    assert result['f64'] == [1., 1.]
    assert sorted(result['group']) == [1, 2]
    assert sorted(result['key']) == ['xxx', 'yyy']
Пример #10
0
def test_filter_implicit_cast(tempdir):
    # ARROW-7652
    table = pa.table({'a': pa.array([0, 1, 2, 3, 4, 5], type=pa.int8())})
    _, path = _create_single_file(tempdir, table)
    dataset = ds.dataset(str(path))

    filter_ = ds.ComparisonExpression(ds.CompareOperator.Greater,
                                      ds.FieldExpression('a'),
                                      ds.ScalarExpression(2))

    scanner_builder = dataset.new_scan()
    scanner_builder.filter(filter_)
    result = scanner_builder.finish().to_table()
    assert len(result) == 3