예제 #1
0
def test_joins(jointype, expected, use_threads, use_datasets):
    """Join two small tables on ``colA``/``colB`` and compare to ``expected``.

    ``expected`` arrives as plain data and is converted to a table inside the
    test body instead of in the parametrization, which avoids keeping
    Arrow-allocated memory around indefinitely. When ``use_datasets`` is
    true, both inputs are wrapped in datasets before the join.
    """
    expected_table = pa.table(expected)

    left = pa.Table.from_pydict({"colA": [1, 2, 6], "col2": ["a", "b", "f"]})
    right = pa.Table.from_pydict({"colB": [99, 2, 1], "col3": ["Z", "B", "A"]})
    if use_datasets:
        left, right = ds.dataset([left]), ds.dataset([right])

    joined = ep._perform_join(
        jointype,
        left,
        "colA",
        right,
        "colB",
        use_threads=use_threads,
        coalesce_keys=True,
    )

    # Order rows by the right-hand key for "right" join types and by the
    # left-hand key otherwise, then compare against the expected table.
    sort_key = "colB" if "right" in jointype else "colA"
    actual = joined.combine_chunks().sort_by(sort_key)
    assert actual == expected_table
예제 #2
0
def test_table_join_keys_order():
    """Full outer join with coalesced keys where the key is not column 0.

    The expected table keeps the left table's column order, holds the
    coalesced key under the left key name (``colA``), and distinguishes the
    shared ``colVals`` column with the ``_l`` / ``_r`` suffixes.
    """
    left = pa.table(
        {
            "colB": [10, 20, 60],
            "colA": [1, 2, 6],
            "colVals": ["a", "b", "f"],
        }
    )
    right = pa.table(
        {
            "colVals": ["Z", "B", "A"],
            "colX": [99, 2, 1],
        }
    )

    joined = ep._perform_join(
        "full outer",
        left,
        "colA",
        right,
        "colX",
        left_suffix="_l",
        right_suffix="_r",
        coalesce_keys=True,
    )

    expected = pa.table(
        {
            "colB": [10, 20, 60, None],
            "colA": [1, 2, 6, 99],
            "colVals_l": ["a", "b", "f", None],
            "colVals_r": ["A", "B", None, "Z"],
        }
    )
    assert joined.combine_chunks() == expected
예제 #3
0
def test_joins_corner_cases():
    """Check that malformed join requests raise the expected exception types."""
    left = pa.Table.from_pydict(
        {
            "colA": [1, 2, 3, 4, 5, 6],
            "col2": ["a", "b", "c", "d", "e", "f"],
        }
    )
    right = pa.Table.from_pydict(
        {
            "colB": [1, 2, 3, 4, 5],
            "col3": ["A", "B", "C", "D", "E"],
        }
    )
    join_type = "left outer"

    # Empty strings passed as the key names on both sides.
    with pytest.raises(pa.ArrowInvalid):
        ep._perform_join(join_type, left, "", right, "")

    # ``None`` passed in place of the left operand.
    with pytest.raises(TypeError):
        ep._perform_join(join_type, None, "colA", right, "colB")

    # A join type string that is not a recognized join.
    with pytest.raises(ValueError):
        ep._perform_join("super mario join", left, "colA", right, "colB")
예제 #4
0
def test_table_join_collisions():
    """Full outer joins between tables whose column names overlap.

    Three scenarios are checked against hand-written expected tables:
    a two-key join with no suffixes, a single-key join with a right suffix
    and ``coalesce_keys=False``, and the same join with ``coalesce_keys=True``.
    """
    left = pa.table(
        {
            "colA": [1, 2, 6],
            "colB": [10, 20, 60],
            "colVals": ["a", "b", "f"],
        }
    )
    right = pa.table(
        {
            "colB": [99, 20, 10],
            "colVals": ["Z", "B", "A"],
            "colUniq": [100, 200, 300],
            "colA": [99, 2, 1],
        }
    )

    # Scenario 1: join on both shared keys without any suffix. The expected
    # column names repeat, so the table is built from a list of columns plus
    # ``names`` rather than from a dict.
    joined = ep._perform_join(
        "full outer", left, ["colA", "colB"], right, ["colA", "colB"]
    )
    expected_columns = [
        [1, 2, 6, None],
        [10, 20, 60, None],
        ["a", "b", "f", None],
        [10, 20, None, 99],
        ["A", "B", None, "Z"],
        [300, 200, None, 100],
        [1, 2, None, 99],
    ]
    expected_names = [
        "colA",
        "colB",
        "colVals",
        "colB",
        "colVals",
        "colUniq",
        "colA",
    ]
    assert joined.combine_chunks() == pa.table(
        expected_columns, names=expected_names
    )

    # Scenario 2: keys are not coalesced, so the right key is expected to
    # show up as its own suffixed column (``colA_r``).
    joined = ep._perform_join(
        "full outer",
        left,
        "colA",
        right,
        "colA",
        right_suffix="_r",
        coalesce_keys=False,
    )
    expected_uncoalesced = pa.table(
        {
            "colA": [1, 2, 6, None],
            "colB": [10, 20, 60, None],
            "colVals": ["a", "b", "f", None],
            "colB_r": [10, 20, None, 99],
            "colVals_r": ["A", "B", None, "Z"],
            "colUniq": [300, 200, None, 100],
            "colA_r": [1, 2, None, 99],
        }
    )
    assert joined.combine_chunks() == expected_uncoalesced

    # Scenario 3: keys are coalesced, so a single ``colA`` is expected to
    # carry key values from both sides and no ``colA_r`` column appears.
    joined = ep._perform_join(
        "full outer",
        left,
        "colA",
        right,
        "colA",
        right_suffix="_r",
        coalesce_keys=True,
    )
    expected_coalesced = pa.table(
        {
            "colA": [1, 2, 6, 99],
            "colB": [10, 20, 60, None],
            "colVals": ["a", "b", "f", None],
            "colB_r": [10, 20, None, 99],
            "colVals_r": ["A", "B", None, "Z"],
            "colUniq": [300, 200, None, 100],
        }
    )
    assert joined.combine_chunks() == expected_coalesced