Example #1
# Assumed imports for Examples #1-#4; `ep` is presumed to be pyarrow's
# internal `_exec_plan` module, which these tests exercise.
import pytest
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from pyarrow import _exec_plan as ep


def test_filter_table_errors():
    t = pa.table({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})

    # A non-boolean expression is rejected as a filter.
    with pytest.raises(pa.ArrowTypeError):
        ep._filter_table(t,
                         pc.divide(pc.field("a"), pc.scalar(2)),
                         output_type=pa.Table)

    # Referencing a field that is not in the schema is invalid.
    with pytest.raises(pa.ArrowInvalid):
        ep._filter_table(t, (pc.field("Z") <= pc.scalar(2)),
                         output_type=pa.Table)
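Since `_filter_table` is a private helper, here is a minimal sketch of equivalent filtering through the public `pyarrow.dataset` API (wrapping the table in a dataset mirrors Example #3; `to_table(filter=...)` is the public pushdown entry point):

# A hedged sketch, not pyarrow's test code: push the same kind of
# predicate into a scan over an in-memory dataset.
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

t = pa.table({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})
filtered = ds.dataset([t]).to_table(filter=pc.field("a") <= pc.scalar(3))
assert filtered == pa.table({"a": [1, 2, 3], "b": [10, 20, 30]})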
Example #2
def test_complex_filter_table():
    t = pa.table({
        "a": [1, 2, 3, 4, 5, 6, 6],
        "b": [10, 20, 30, 40, 50, 60, 61]
    })

    # Keep rows where "a" is even (lowest bit is 0) and where a * 10 == b.
    result = ep._filter_table(
        t, ((pc.bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) &
            (pc.multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))))

    assert result == pa.table({
        "a": [2, 4, 6],  # the second 6 is dropped because 6 * 10 != 61
        "b": [20, 40, 60]
    })
Example #3
def test_filter_table(use_datasets):
    # `use_datasets` is presumably a parametrized fixture, so the same
    # filters run against both a plain Table and a Dataset wrapping it.
    t = pa.table({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})
    if use_datasets:
        t = ds.dataset([t])

    result = ep._filter_table(
        t, (pc.field("a") <= pc.scalar(3)) & (pc.field("b") == pc.scalar(20)),
        output_type=pa.Table if not use_datasets else ds.InMemoryDataset)
    if use_datasets:
        result = result.to_table()
    assert result == pa.table({"a": [2], "b": [20]})

    result = ep._filter_table(
        t,
        pc.field("b") > pc.scalar(30),
        output_type=pa.Table if not use_datasets else ds.InMemoryDataset)
    if use_datasets:
        result = result.to_table()
    assert result == pa.table({"a": [4, 5], "b": [40, 50]})
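The `output_type` switch above yields either a materialized `pa.Table` or a lazy `ds.InMemoryDataset`. A minimal sketch of that wrapper's round trip, assuming the public constructor accepts a table directly:

# Hedged sketch: InMemoryDataset wraps in-memory data; to_table() scans
# it back into a Table, which is why the test calls result.to_table().
import pyarrow as pa
import pyarrow.dataset as ds

t = pa.table({"a": [1, 2], "b": [10, 20]})
mem = ds.InMemoryDataset(t)
assert mem.to_table() == t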
Example #4
def test_filter_table_ordering():
    table1 = pa.table({'a': [1, 2, 3, 4], 'b': ['a'] * 4})
    table2 = pa.table({'a': [1, 2, 3, 4], 'b': ['b'] * 4})
    table = pa.concat_tables([table1, table2])

    for _ in range(20):
        # 20 iterations seem to be enough to consistently trigger errors
        # when order is not preserved. If the ordering problem is ever
        # reintroduced, this test will become flaky, which is still a
        # signal that order is not preserved.
        r = ep._filter_table(table, pc.field('a') == 1)
        assert r["b"] == pa.chunked_array([["a"], ["b"]])
Example #5
    # Assumed module-level imports for Examples #5-#6: os, duckdb,
    # pyarrow.dataset, pyarrow.compute as pc, and
    # `from pyarrow.dataset import Scanner`; `can_run` is presumed to be
    # a module-level flag gating on pyarrow availability.
    def test_parallel_scanner_default_conn(self, duckdb_cursor):
        if not can_run:
            return

        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'data', 'userdata1.parquet')

        # The same file is listed three times to build a multi-fragment
        # dataset that can be scanned in parallel.
        arrow_dataset = pyarrow.dataset.dataset([
            parquet_filename,
            parquet_filename,
            parquet_filename,
        ], format="parquet")

        scanner_filter = ((pc.field("first_name") == pc.scalar('Jose')) &
                          (pc.field("salary") > pc.scalar(134708.82)))

        # The filter is pushed down into the Arrow scanner, so DuckDB only
        # sees the matching rows.
        arrow_scanner = Scanner.from_dataset(arrow_dataset, filter=scanner_filter)

        rel = duckdb.from_arrow(arrow_scanner)

        assert rel.aggregate('count(*)').execute().fetchone()[0] == 12
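A possible follow-up, reusing `arrow_dataset` and `scanner_filter` from the example above: `Scanner.from_dataset` also accepts a `columns` projection, so both the filter and the column selection are applied on the Arrow side before DuckDB consumes the stream.

        # Hedged sketch: project only the columns the query actually needs.
        projected_scanner = Scanner.from_dataset(
            arrow_dataset,
            columns=["first_name", "salary"],
            filter=scanner_filter,
        )
        rel = duckdb.from_arrow(projected_scanner)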
Example #6
    def test_parallel_scanner_replacement_scans(self, duckdb_cursor):
        if not can_run:
            return

        duckdb_conn = duckdb.connect()
        duckdb_conn.execute("PRAGMA threads=4")

        parquet_filename = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'data', 'userdata1.parquet')

        arrow_dataset = pyarrow.dataset.dataset([
            parquet_filename,
            parquet_filename,
            parquet_filename,
        ], format="parquet")

        scanner_filter = ((pc.field("first_name") == pc.scalar('Jose')) &
                          (pc.field("salary") > pc.scalar(134708.82)))

        arrow_scanner = Scanner.from_dataset(arrow_dataset, filter=scanner_filter)

        # `arrow_scanner` is never registered explicitly: the query below
        # relies on DuckDB's replacement scans to resolve the name.
        assert duckdb_conn.execute("select count(*) from arrow_scanner").fetchone()[0] == 12
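The final query works because DuckDB's replacement scans resolve the unknown table name `arrow_scanner` against the caller's Python variables. An explicit alternative, as a sketch (the view name `userdata` is made up here), is to register the dataset on the connection:

        # Hedged sketch: register() exposes a Python object under a chosen
        # name, decoupling the SQL from the local variable's identifier.
        duckdb_conn.register("userdata", arrow_dataset)  # hypothetical name
        print(duckdb_conn.execute("SELECT count(*) FROM userdata").fetchone()[0])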