예제 #1
0
def test_add():
    """Adding two partitioned frames must match the in-memory ``add`` result.

    The second operand is a copy of the first with its index shifted by
    one, so the two frames only partially overlap.
    """
    base = pd.DataFrame(np.identity(12))
    shifted = base.copy()
    shifted.index += 1

    # Reference result computed with the in-memory sparse frames.
    expected = sp.SparseFrame(base).add(sp.SparseFrame(shifted)).todense()

    # Same operation on frames split into four partitions each.
    lhs = dsp.from_pandas(base, npartitions=4)
    rhs = dsp.from_pandas(shifted, npartitions=4)
    actual = lhs.add(rhs).compute().todense()

    pdt.assert_frame_equal(actual, expected)
예제 #2
0
def test_loc(iindexer, correct_shape):
    """``.loc[iindexer]`` must compute to a SparseFrame of ``correct_shape``.

    Parameters
    ----------
    iindexer
        Indexer handed to ``.loc`` on a frame labelled 'A'..'J'.
    correct_shape
        Shape the computed selection is expected to have.
    """
    labels = list('ABCDEFGHIJ')
    frame = pd.DataFrame(np.random.rand(10, 2), index=labels)

    selection = dsp.from_pandas(frame, npartitions=2).loc[iindexer]
    computed = selection.compute()

    assert isinstance(computed, sp.SparseFrame)
    assert computed.shape == correct_shape
예제 #3
0
def test_todense_series():
    """``todense`` on a single selected column yields a ``dd.Series``.

    The computed series is compared with the source column, ignoring dtype.
    """
    source = pd.DataFrame(np.random.rand(10, 2))
    column = dsp.from_pandas(source, npartitions=3)[0]

    dense = column.todense()
    assert isinstance(dense, dd.Series)

    pdt.assert_series_equal(dense.compute(), source[0], check_dtype=False)
예제 #4
0
def test_distributed_join(how):
    """Joining two partitioned frames must match ``DataFrame.join``.

    Parameters
    ----------
    how
        Join type forwarded both to pandas and to the partitioned join.
    """
    # Row labels 0..9 and 5..14 overlap only partially.
    left_index = np.arange(10)
    right_index = np.arange(5, 15)

    left = pd.DataFrame(np.identity(10), index=left_index,
                        columns=list('ABCDEFGHIJ'))
    right = pd.DataFrame(np.identity(10), index=right_index,
                         columns=list('KLMNOPQRST'))

    # Missing cells are filled with 0 before comparing with the dense result.
    expected = left.join(right, how=how).fillna(0)

    lazy_left = dsp.from_pandas(left, npartitions=2)
    lazy_right = dsp.from_pandas(right, npartitions=2)
    actual = lazy_left.join(lazy_right, how=how).compute().todense()

    pdt.assert_frame_equal(expected, actual)
예제 #5
0
def test_map_partitions():
    """Mapping the identity over partitions keeps the type and shape."""
    frame = pd.DataFrame(np.random.rand(10, 2))
    partitioned = dsp.from_pandas(frame, npartitions=3)

    # The existing meta is passed along because the function changes nothing.
    mapped = partitioned.map_partitions(lambda x: x, partitioned._meta)
    computed = mapped.compute()

    assert isinstance(computed, sp.SparseFrame)
    assert computed.shape == (10, 2)
예제 #6
0
def test_repartition_n_divisions(start_part, end_part):
    """Repartitioning by partition count keeps the data and the frame type.

    Parameters
    ----------
    start_part
        Number of partitions the frame is created with.
    end_part
        Number of partitions requested from ``repartition``.
    """
    original = pd.DataFrame(np.identity(10))
    lazy = dsp.from_pandas(original, npartitions=start_part)

    reshaped = lazy.repartition(npartitions=end_part)

    assert isinstance(reshaped, dsp.SparseFrame)
    assert reshaped.npartitions == end_part

    # The data must round-trip unchanged.
    round_tripped = reshaped.compute().todense()
    pdt.assert_frame_equal(original, round_tripped)
예제 #7
0
def test_assign_column():
    """Assigning a dask series as a new column matches ``DataFrame.assign``."""
    frame = pd.DataFrame(np.random.rand(10, 2), columns=['a', 'b'])
    series = pd.Series(np.arange(10))

    lazy_series = dd.from_pandas(series, npartitions=2)
    lazy_frame = dsp.from_pandas(frame, npartitions=2).assign(new=lazy_series)

    # The meta object is checked for emptiness after the assignment.
    assert lazy_frame._meta.empty

    dense = lazy_frame.compute().todense()
    expected = frame.assign(new=series)
    cell_matches = (dense == expected).values
    assert np.all(cell_matches)
예제 #8
0
def test_repartition_divisions(arg_dict):
    """Repartitioning with arbitrary keyword arguments keeps the data.

    Parameters
    ----------
    arg_dict
        Keyword arguments unpacked into ``repartition``. When it contains
        ``'divisions'``, the resulting divisions must equal that value.
    """
    original = pd.DataFrame(np.identity(100))
    lazy = dsp.from_pandas(original, npartitions=4)

    reshaped = lazy.repartition(**arg_dict)
    assert isinstance(reshaped, dsp.SparseFrame)

    if 'divisions' in arg_dict:
        requested = tuple(arg_dict['divisions'])
        assert tuple(reshaped.divisions) == requested

    round_tripped = reshaped.compute().todense()
    pdt.assert_frame_equal(original, round_tripped)
예제 #9
0
def test_map_partitions_mappable():
    """A generator keyword argument supplies one value per partition.

    The frame of ones is split into chunks of five rows. The assertions
    expect the first five rows to equal 4 (x=2, y=2) and the last five
    rows to equal 6 (x=3, y=2).
    """
    ones = pd.DataFrame(np.ones((10, 2)))
    lazy = dsp.from_pandas(ones, chunksize=5)

    def foo(sf, x, y):
        # Scale the partition's data and rebuild a frame with the same labels.
        scaled = sf.data * x * y
        return sp.SparseFrame(scaled, index=sf.index, columns=sf.columns)

    mapped = lazy.map_partitions(foo, lazy._meta,
                                 x=(i for i in range(2, 4)), y=2)
    dense = mapped.compute().todense()

    assert dense.shape == (10, 2)

    first_half = dense.iloc[:5, :]
    second_half = dense.iloc[5:, :]
    assert (first_half == 4).all().all()
    assert (second_half == 6).all().all()
예제 #10
0
def test_groupby_sum(idx, sorted):
    """``groupby_sum`` must match pandas' ``groupby(level=0).sum()``.

    Parameters
    ----------
    idx
        Index used for the 100-row frame of ones.
    sorted
        When falsy, the divisions are replaced with ``None`` entries
        before grouping.
    """
    frame = pd.DataFrame(dict(A=np.ones(100), B=np.ones(100)), index=idx)
    expected = frame.groupby(level=0).sum()
    expected.sort_index(inplace=True)

    lazy = dsp.from_pandas(frame, npartitions=2)
    if not sorted:
        # One division boundary more than there are partitions.
        lazy.divisions = [None for _ in range(lazy.npartitions + 1)]
    assert lazy.npartitions == 2

    summed = lazy.groupby_sum(split_out=3)
    assert summed.npartitions == 3

    actual = summed.compute().todense()
    actual.sort_index(inplace=True)

    pdt.assert_frame_equal(actual, expected)
예제 #11
0
def test_getitem(item, raises):
    """Column selection with ``[]`` either raises or matches pandas.

    Parameters
    ----------
    item
        A single column label or a list of labels.
    raises
        When truthy, the selection is expected to raise ``KeyError``.
    """
    frame = pd.DataFrame(np.random.rand(10, 3),
                         columns=list('XYZ'),
                         index=list('ABCDEFGHIJ'))
    lazy = dsp.from_pandas(frame, npartitions=2)

    if raises:
        with pytest.raises(KeyError):
            lazy[item]
        return

    is_multi = isinstance(item, list)
    expected_cols = item if is_multi else [item]

    selection = lazy[item]
    assert selection.columns.tolist() == expected_cols

    computed = selection.compute()
    assert computed.columns.tolist() == expected_cols

    # A list selects a frame; a single label selects a series.
    compare = pdt.assert_frame_equal if is_multi else pdt.assert_series_equal
    compare(frame[item], computed.todense())
예제 #12
0
def test_repr():
    """``__repr__`` returns a string for frames with 2 and with 100 columns."""
    for n_cols in (2, 100):
        frame = pd.DataFrame(np.random.rand(10, n_cols))
        lazy = dsp.from_pandas(frame, npartitions=3)
        assert isinstance(lazy.__repr__(), str)
예제 #13
0
def test_from_pandas():
    """A frame built with ``from_pandas`` computes to a (10, 2) SparseFrame."""
    frame = pd.DataFrame(np.random.rand(10, 2))
    computed = dsp.from_pandas(frame, npartitions=3).compute()

    assert isinstance(computed, sp.SparseFrame)
    assert computed.shape == (10, 2)
예제 #14
0
def dsf_arange(sf_arange):
    """Build a partitioned frame from ``sf_arange`` in chunks of five rows."""
    dense = sf_arange.todense()
    return dsp.from_pandas(dense, chunksize=5)
예제 #15
0
def dsf():
    """Return a 10x2 random frame with columns 'A' and 'B' in 3 partitions."""
    frame = pd.DataFrame(np.random.rand(10, 2), columns=['A', 'B'])
    return dsp.from_pandas(frame, npartitions=3)