import pytest

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# "ep" is assumed to be PyArrow's private exec-plan helper module,
# which exposes _filter_table (e.g. `from pyarrow import _exec_plan as ep`).
from pyarrow import _exec_plan as ep


def test_filter_table_errors():
    t = pa.table({"a": [1, 2, 3, 4, 5],
                  "b": [10, 20, 30, 40, 50]})

    # A non-boolean expression is not a valid filter predicate.
    with pytest.raises(pa.ArrowTypeError):
        ep._filter_table(t, pc.divide(pc.field("a"), pc.scalar(2)),
                         output_type=pa.Table)

    # Referencing a column that does not exist raises ArrowInvalid.
    with pytest.raises(pa.ArrowInvalid):
        ep._filter_table(t, (pc.field("Z") <= pc.scalar(2)),
                         output_type=pa.Table)
def test_complex_filter_table():
    t = pa.table({
        "a": [1, 2, 3, 4, 5, 6, 6],
        "b": [10, 20, 30, 40, 50, 60, 61]
    })

    # Keep rows where "a" is even and "a" * 10 equals "b".
    result = ep._filter_table(
        t,
        ((pc.bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) &
         (pc.multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))))

    assert result == pa.table({
        "a": [2, 4, 6],  # the second 6 must be omitted because 6 * 10 != 61
        "b": [20, 40, 60]
    })
# use_datasets is assumed to be parametrized so that both the Table and the
# Dataset code paths are exercised.
@pytest.mark.parametrize("use_datasets", [False, True])
def test_filter_table(use_datasets):
    t = pa.table({"a": [1, 2, 3, 4, 5],
                  "b": [10, 20, 30, 40, 50]})
    if use_datasets:
        t = ds.dataset([t])

    result = ep._filter_table(
        t, (pc.field("a") <= pc.scalar(3)) & (pc.field("b") == pc.scalar(20)),
        output_type=pa.Table if not use_datasets else ds.InMemoryDataset)
    if use_datasets:
        result = result.to_table()
    assert result == pa.table({"a": [2], "b": [20]})

    result = ep._filter_table(
        t, pc.field("b") > pc.scalar(30),
        output_type=pa.Table if not use_datasets else ds.InMemoryDataset)
    if use_datasets:
        result = result.to_table()
    assert result == pa.table({"a": [4, 5], "b": [40, 50]})
def test_filter_table_ordering():
    table1 = pa.table({'a': [1, 2, 3, 4], 'b': ['a'] * 4})
    table2 = pa.table({'a': [1, 2, 3, 4], 'b': ['b'] * 4})
    table = pa.concat_tables([table1, table2])

    for _ in range(20):
        # 20 iterations seem to consistently cause errors when order is not
        # preserved. If the order problem is reintroduced this test will become
        # flaky, which is still a signal that the order is not preserved.
        r = ep._filter_table(table, pc.field('a') == 1)
        assert r["b"] == pa.chunked_array([["a"], ["b"]])
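# For reference: a minimal sketch (not part of the original suite) of the same
# kind of predicate applied through PyArrow's public API rather than the
# private ep._filter_table helper. It assumes a PyArrow version in which
# Table.filter() accepts a compute Expression and Dataset.to_table() accepts
# a filter keyword; the function name is illustrative only.
def example_public_api_filter_sketch():
    t = pa.table({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})
    expr = (pc.field("a") <= pc.scalar(3)) & (pc.field("b") == pc.scalar(20))

    # Table.filter evaluates the expression and keeps only the matching rows.
    assert t.filter(expr) == pa.table({"a": [2], "b": [20]})

    # The same expression can be passed as a pushed-down filter when scanning
    # an in-memory dataset.
    assert ds.dataset([t]).to_table(filter=expr) == pa.table({"a": [2], "b": [20]})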
def test_parallel_scanner_default_conn(self, duckdb_cursor):
    if not can_run:
        return
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    arrow_dataset = pyarrow.dataset.dataset([
        parquet_filename,
        parquet_filename,
        parquet_filename,
    ], format="parquet")

    scanner_filter = (pc.field("first_name") == pc.scalar('Jose')) & \
        (pc.field("salary") > pc.scalar(134708.82))

    arrow_scanner = Scanner.from_dataset(arrow_dataset, filter=scanner_filter)
    rel = duckdb.from_arrow(arrow_scanner)
    assert rel.aggregate('count(*)').execute().fetchone()[0] == 12
def test_parallel_scanner_replacement_scans(self, duckdb_cursor):
    if not can_run:
        return
    duckdb_conn = duckdb.connect()
    duckdb_conn.execute("PRAGMA threads=4")
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    arrow_dataset = pyarrow.dataset.dataset([
        parquet_filename,
        parquet_filename,
        parquet_filename,
    ], format="parquet")

    scanner_filter = (pc.field("first_name") == pc.scalar('Jose')) & \
        (pc.field("salary") > pc.scalar(134708.82))

    arrow_scanner = Scanner.from_dataset(arrow_dataset, filter=scanner_filter)
    # The local variable name "arrow_scanner" is picked up by DuckDB's
    # replacement scan, so it can be queried directly in SQL.
    assert duckdb_conn.execute("select count(*) from arrow_scanner").fetchone()[0] == 12
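# A minimal sketch (not part of the original suite) of the same query with the
# predicate expressed in SQL instead of an Arrow scanner filter. It assumes
# DuckDBPyConnection.register() accepts a pyarrow Dataset and that pushing the
# filter down into the Arrow scan yields the same 12 matching rows; the method
# name and the "userdata" view name are illustrative only.
def test_parallel_scanner_sql_filter_sketch(self, duckdb_cursor):
    if not can_run:
        return
    duckdb_conn = duckdb.connect()
    parquet_filename = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'data', 'userdata1.parquet')
    arrow_dataset = pyarrow.dataset.dataset(
        [parquet_filename] * 3, format="parquet")
    # Register the dataset under an explicit name and filter it in SQL.
    duckdb_conn.register("userdata", arrow_dataset)
    count = duckdb_conn.execute(
        "select count(*) from userdata "
        "where first_name = 'Jose' and salary > 134708.82").fetchone()[0]
    assert count == 12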