예제 #1
0
def test_align_partitions_unknown_divisions():
    """Expect ``align_partitions`` to raise ``ValueError`` on unknown divisions.

    Two situations are covered: only one input has unknown divisions, and
    both inputs have unknown divisions.
    """
    frame = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6, 7], "b": [7, 6, 5, 4, 3, 2, 1]})

    # Case 1: the first collection is built normally, the second with
    # sort=False, which (as asserted below) leaves its divisions unknown.
    with_divisions = dd.from_pandas(frame, npartitions=2)
    without_divisions = dd.from_pandas(frame, npartitions=2, sort=False)
    assert not without_divisions.known_divisions

    with pytest.raises(ValueError):
        align_partitions(with_divisions, without_divisions)

    # Case 2: neither collection has known divisions.
    unknown_left = dd.from_pandas(frame + 1, npartitions=2, sort=False)
    unknown_right = dd.from_pandas(frame, npartitions=2, sort=False)
    assert not unknown_left.known_divisions
    assert not unknown_right.known_divisions

    with pytest.raises(ValueError):
        align_partitions(unknown_left, unknown_right)
예제 #2
0
파일: test_multi.py 프로젝트: roxyboy/dask
def test_align_partitions():
    """Exercise ``align_partitions`` on frames with partly overlapping divisions.

    The first part aligns two repartitioned frames and checks the types of
    both the inputs and the aligned outputs, the merged divisions, and the
    per-division partition keys. The second part round-trips frames built
    with ``from_pandas`` at several partition counts, once with the default
    index and once with partly overlapping string indexes, and checks that
    alignment leaves the data unchanged.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(A, [10, 40, 60])

    B = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100])
    b = dd.repartition(B, [30, 80, 100])

    (aa, bb), divisions, L = align_partitions(a, b)
    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    # Check the aligned outputs as well; asserting only on the inputs would
    # let a wrongly-typed result from align_partitions go unnoticed.
    assert isinstance(aa, dd.DataFrame)
    assert isinstance(bb, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(L, list)
    # One entry of L per interval between consecutive divisions.
    assert len(divisions) == 1 + len(L)
    assert L == [[(aa._name, 0), (bb._name, 0)],
                 [(aa._name, 1), (bb._name, 1)],
                 [(aa._name, 2), (bb._name, 2)],
                 [(aa._name, 3), (bb._name, 3)],
                 [(aa._name, 4), (bb._name, 4)]]

    def _assert_roundtrip(ldf, rdf):
        # Align ldf/rdf at several partition counts and require that each
        # aligned result still equals its pandas source.
        for lparts, rparts in [(1, 1), (2, 2), (2, 3), (3, 2)]:
            lhs = dd.from_pandas(ldf, lparts)
            rhs = dd.from_pandas(rdf, rparts)
            (lresult, rresult), _, _ = dd.multi.align_partitions(lhs, rhs)
            assert eq(lresult, ldf)
            assert eq(rresult, rdf)

    # same (default) index on both sides
    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]})
    _assert_roundtrip(ldf, rdf)

    # different index
    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('abcdefg'))
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('fghijkl'))
    _assert_roundtrip(ldf, rdf)
예제 #3
0
파일: test_multi.py 프로젝트: jayhetee/dask
def test_align_partitions():
    """Align two frames with partly overlapping divisions and check the layout.

    ``a`` has divisions (10, 40, 60) and ``b`` has divisions (30, 80, 100).
    The test checks the types of the inputs and of the aligned outputs, the
    merged divisions, and the per-division list of partition keys.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(A, [10, 40, 60])

    B = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100])
    b = dd.repartition(B, [30, 80, 100])

    (aa, bb), divisions, L = align_partitions(a, b)
    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    # Check the aligned outputs as well; asserting only on the inputs would
    # let a wrongly-typed result from align_partitions go unnoticed.
    assert isinstance(aa, dd.DataFrame)
    assert isinstance(bb, dd.DataFrame)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(L, list)
    # One entry of L per interval between consecutive divisions.
    assert len(divisions) == 1 + len(L)
    # NOTE(review): None presumably marks an interval the corresponding frame
    # does not cover (b starts at 30, a ends at 60) -- confirm against the
    # align_partitions implementation this test targets.
    assert L == [[(aa._name, 0), None],
                 [(aa._name, 1), (bb._name, 0)],
                 [(aa._name, 2), (bb._name, 1)],
                 [None, (bb._name, 2)],
                 [None, (bb._name, 3)]]
예제 #4
0
def test_align_partitions():
    """Check ``align_partitions`` for two frames, with and without a scalar.

    Covers: aligning two repartitioned frames with partly overlapping
    divisions; aligning the same frames with a ``Scalar`` passed between
    them; and round-tripping ``from_pandas`` frames at several partition
    counts with a default index and with partly overlapping string indexes.
    """
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': list('abdabd')},
                     index=[10, 20, 30, 40, 50, 60])
    a = dd.repartition(A, [10, 40, 60])

    B = pd.DataFrame({'x': [1, 2, 3, 4], 'y': list('abda')},
                     index=[30, 70, 80, 100])
    b = dd.repartition(B, [30, 80, 100])

    s = dd.core.Scalar({('s', 0): 10}, 's', 'i8')

    def _check(a, b, aa, bb):
        # Reads `divisions` and `L` from the enclosing scope at call time, so
        # it always validates the most recent align_partitions result.
        for frame in (a, b, aa, bb):
            assert isinstance(frame, dd.DataFrame)
        for original, aligned in ((a, aa), (b, bb)):
            assert eq(original, aligned)
        assert divisions == (10, 30, 40, 60, 80, 100)
        assert isinstance(L, list)
        assert len(divisions) == 1 + len(L)

    # Two frames only.
    (aa, bb), divisions, L = align_partitions(a, b)
    _check(a, b, aa, bb)
    assert L == [[(aa._name, i), (bb._name, i)] for i in range(5)]

    # Same frames with a scalar in the middle; its slot in L is None.
    (aa, ss, bb), divisions, L = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    assert L == [[(aa._name, i), None, (bb._name, i)] for i in range(5)]
    assert eq(ss, 10)

    partition_counts = [(1, 1), (2, 2), (2, 3), (3, 2)]

    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]})

    pairs = [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
             for nleft, nright in partition_counts]
    for lhs, rhs in pairs:
        (lresult, rresult), _, _ = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)

    # different index
    ldf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('abcdefg'))
    rdf = pd.DataFrame({'c': [1, 2, 3, 4, 5, 6, 7],
                        'd': [7, 6, 5, 4, 3, 2, 1]},
                       index=list('fghijkl'))

    pairs = [(dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
             for nleft, nright in partition_counts]
    for lhs, rhs in pairs:
        (lresult, rresult), _, _ = align_partitions(lhs, rhs)
        assert eq(lresult, ldf)
        assert eq(rresult, rdf)
예제 #5
0
def test_align_partitions():
    """Check ``align_partitions`` for two frames, with and without a scalar.

    Covers: aligning two repartitioned frames with partly overlapping
    divisions; aligning the same frames with a ``Scalar`` passed between
    them; and round-tripping ``from_pandas`` frames at several partition
    counts with a default index and with partly overlapping string indexes.
    """
    A = pd.DataFrame(
        {"x": [1, 2, 3, 4, 5, 6], "y": list("abdabd")},
        index=[10, 20, 30, 40, 50, 60],
    )
    a = dd.repartition(A, [10, 40, 60])

    B = pd.DataFrame(
        {"x": [1, 2, 3, 4], "y": list("abda")},
        index=[30, 70, 80, 100],
    )
    b = dd.repartition(B, [30, 80, 100])

    s = dd.core.Scalar({("s", 0): 10}, "s", "i8")

    expected_divisions = (10, 30, 40, 60, 80, 100)

    def _check(a, b, aa, bb):
        # `divisions` and `L` come from the enclosing scope and are looked up
        # when the helper runs, i.e. after the latest align_partitions call.
        assert all(isinstance(frame, dd.DataFrame) for frame in (a, b, aa, bb))
        assert eq(a, aa)
        assert eq(b, bb)
        assert divisions == expected_divisions
        assert isinstance(L, list)
        assert len(L) + 1 == len(divisions)

    def _roundtrip(ldf, rdf):
        # Build every (left, right) pair up front, then require that aligning
        # each pair gives results equal to the pandas sources.
        pairs = [
            (dd.from_pandas(ldf, nleft), dd.from_pandas(rdf, nright))
            for nleft, nright in ((1, 1), (2, 2), (2, 3), (3, 2))
        ]
        for lhs, rhs in pairs:
            aligned, _, _ = align_partitions(lhs, rhs)
            lresult, rresult = aligned
            assert eq(lresult, ldf)
            assert eq(rresult, rdf)

    # Two frames only.
    (aa, bb), divisions, L = align_partitions(a, b)
    _check(a, b, aa, bb)
    assert L == [[(aa._name, part), (bb._name, part)] for part in range(5)]

    # Same frames with a scalar in the middle; its slot in L is None.
    (aa, ss, bb), divisions, L = align_partitions(a, s, b)
    _check(a, b, aa, bb)
    assert L == [[(aa._name, part), None, (bb._name, part)] for part in range(5)]
    assert eq(ss, 10)

    values_up = [1, 2, 3, 4, 5, 6, 7]
    values_down = [7, 6, 5, 4, 3, 2, 1]

    # default index on both sides
    _roundtrip(
        pd.DataFrame({"a": values_up, "b": values_down}),
        pd.DataFrame({"c": values_up, "d": values_down}),
    )

    # different index
    _roundtrip(
        pd.DataFrame({"a": values_up, "b": values_down}, index=list("abcdefg")),
        pd.DataFrame({"c": values_up, "d": values_down}, index=list("fghijkl")),
    )