def test_joins(jointype, expected, use_threads, use_datasets):
    """Check the result of ``ep._perform_join`` for one join type.

    Two small inputs are joined on ``colA`` / ``colB`` with coalesced keys,
    either as in-memory tables or wrapped in datasets, and the sorted
    result is compared with ``expected``.
    """
    # Build the expected table inside the test instead of in parametrize:
    # this prevents having Arrow-allocated memory around forever.
    expected_table = pa.table(expected)

    left = pa.Table.from_pydict({"colA": [1, 2, 6], "col2": ["a", "b", "f"]})
    right = pa.Table.from_pydict({"colB": [99, 2, 1], "col3": ["Z", "B", "A"]})
    if use_datasets:
        left = ds.dataset([left])
        right = ds.dataset([right])

    joined = ep._perform_join(
        jointype, left, "colA", right, "colB",
        use_threads=use_threads, coalesce_keys=True,
    ).combine_chunks()

    # Sort on the key of the right input for "right" joins and on the key
    # of the left input otherwise, so the comparison does not depend on
    # the row order produced by the join.
    sort_column = "colB" if "right" in jointype else "colA"
    assert joined.sort_by(sort_column) == expected_table
def test_table_join_keys_order():
    """Check column layout of a coalesced full outer join.

    The left key (``colA``) is not the first column of the left table and
    the right key (``colX``) is not the first column of the right table.
    The expected result keeps the left columns in their original order,
    holds the coalesced key under the left key's name, and disambiguates
    the colliding ``colVals`` columns with the ``_l`` / ``_r`` suffixes.
    """
    t1 = pa.table({
        "colB": [10, 20, 60],
        "colA": [1, 2, 6],
        "colVals": ["a", "b", "f"]
    })

    t2 = pa.table({
        "colVals": ["Z", "B", "A"],
        "colX": [99, 2, 1],
    })

    result = ep._perform_join("full outer", t1, "colA", t2, "colX",
                              left_suffix="_l", right_suffix="_r",
                              coalesce_keys=True)
    result = result.combine_chunks()
    # This test is about column layout, not row order. Sort on the
    # coalesced key (as test_joins does) so the comparison does not rely
    # on the order in which the join happens to emit rows.
    result = result.sort_by("colA")
    assert result == pa.table({
        "colB": [10, 20, 60, None],
        "colA": [1, 2, 6, 99],
        "colVals_l": ["a", "b", "f", None],
        "colVals_r": ["A", "B", None, "Z"],
    })
def test_joins_corner_cases():
    """Check that invalid ``ep._perform_join`` calls raise.

    Three invalid calls are exercised, in order: empty key names
    (``pa.ArrowInvalid``), ``None`` as the left operand (``TypeError``)
    and an unknown join type (``ValueError``).
    """
    left = pa.Table.from_pydict({
        "colA": [1, 2, 3, 4, 5, 6],
        "col2": ["a", "b", "c", "d", "e", "f"],
    })
    right = pa.Table.from_pydict({
        "colB": [1, 2, 3, 4, 5],
        "col3": ["A", "B", "C", "D", "E"],
    })

    # (expected exception, positional arguments for ep._perform_join)
    invalid_calls = [
        (pa.ArrowInvalid, ("left outer", left, "", right, "")),
        (TypeError, ("left outer", None, "colA", right, "colB")),
        (ValueError, ("super mario join", left, "colA", right, "colB")),
    ]
    for expected_error, join_args in invalid_calls:
        with pytest.raises(expected_error):
            ep._perform_join(*join_args)
def test_table_join_collisions():
    """Check how full outer joins name columns present in both inputs.

    Three cases are covered:

    * joining on ``["colA", "colB"]`` without suffixes keeps the
      duplicated column names as they are;
    * joining on ``colA`` with ``right_suffix="_r"`` and
      ``coalesce_keys=False`` suffixes every colliding right column,
      including the right key;
    * the same join with ``coalesce_keys=True`` merges the keys into a
      single ``colA`` column and emits no ``colA_r``.

    Every result is sorted before comparison so that the assertions do not
    rely on the order in which the join emits rows.
    """
    t1 = pa.table({
        "colA": [1, 2, 6],
        "colB": [10, 20, 60],
        "colVals": ["a", "b", "f"]
    })

    t2 = pa.table({
        "colB": [99, 20, 10],
        "colVals": ["Z", "B", "A"],
        "colUniq": [100, 200, 300],
        "colA": [99, 2, 1],
    })

    result = ep._perform_join(
        "full outer", t1, ["colA", "colB"], t2, ["colA", "colB"])
    result = result.combine_chunks()
    # "colA", "colB" and "colVals" each appear twice in this result, so
    # sort on "colUniq", the only unambiguous column name. The row that
    # exists only in the left table has a null "colUniq" and sorts last.
    result = result.sort_by("colUniq")
    assert result == pa.table([
        [None, 2, 1, 6],
        [None, 20, 10, 60],
        [None, "b", "a", "f"],
        [99, 20, 10, None],
        ["Z", "B", "A", None],
        [100, 200, 300, None],
        [99, 2, 1, None],
    ], names=[
        "colA", "colB", "colVals", "colB", "colVals", "colUniq", "colA"
    ])

    result = ep._perform_join("full outer", t1, "colA", t2, "colA",
                              right_suffix="_r", coalesce_keys=False)
    result = result.combine_chunks()
    # Without coalescing, the right-only row has a null left "colA" and
    # therefore sorts last.
    result = result.sort_by("colA")
    assert result == pa.table({
        "colA": [1, 2, 6, None],
        "colB": [10, 20, 60, None],
        "colVals": ["a", "b", "f", None],
        "colB_r": [10, 20, None, 99],
        "colVals_r": ["A", "B", None, "Z"],
        "colUniq": [300, 200, None, 100],
        "colA_r": [1, 2, None, 99],
    })

    result = ep._perform_join("full outer", t1, "colA", t2, "colA",
                              right_suffix="_r", coalesce_keys=True)
    result = result.combine_chunks()
    result = result.sort_by("colA")
    assert result == pa.table({
        "colA": [1, 2, 6, 99],
        "colB": [10, 20, 60, None],
        "colVals": ["a", "b", "f", None],
        "colB_r": [10, 20, None, 99],
        "colVals_r": ["A", "B", None, "Z"],
        "colUniq": [300, 200, None, 100]
    })