示例#1
0
def test_compare_frame(seed, nelem):
    """Check ``batcher_sortnet._compare_frame`` on two random frames.

    Two frames of ``nelem`` random rows each are compared on column ``a``
    and then on column ``b``.  For each key, the first returned frame must
    hold the ``nelem`` smallest values of the combined key column and the
    second returned frame the remaining ones, both in ascending order.

    ``seed`` and ``nelem`` are supplied by the caller (the parametrization
    is not part of this block).
    """
    np.random.seed(seed)
    max_part_size = nelem

    # Draw the columns in the order lhs.a, lhs.b, rhs.a, rhs.b so the data
    # are reproducible for a given seed.
    lhs = pygdf.DataFrame()
    lhs_a = np.random.random(nelem)
    lhs['a'] = lhs_a
    lhs_b = np.random.random(nelem)
    lhs['b'] = lhs_b

    rhs = pygdf.DataFrame()
    rhs_a = np.random.random(nelem)
    rhs['a'] = rhs_a
    rhs_b = np.random.random(nelem)
    rhs['b'] = rhs_b

    host_columns = {'a': (lhs_a, rhs_a), 'b': (lhs_b, rhs_b)}
    for key in ('a', 'b'):
        got = batcher_sortnet._compare_frame(lhs, rhs, max_part_size, by=key)

        # Reference: sort the concatenation of both key columns on the host.
        expected = np.sort(np.hstack(list(host_columns[key])))

        low = getattr(got[0], key).to_array()
        np.testing.assert_array_equal(low, expected[:nelem])
        high = getattr(got[1], key).to_array()
        np.testing.assert_array_equal(high, expected[nelem:])
示例#2
0
def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how):
    """Compare a dask_gdf index join against the single-GPU pygdf join.

    Both frames get a random integer key column ``x`` and a float value
    column ``a``.  The reference result comes from joining the pygdf frames
    directly; the result under test comes from joining their dask_gdf
    counterparts.  The indexes must match exactly, and for each index value
    the sorted left (``al``) and right (``ar``) values must match.

    All parameters are supplied by the caller (the parametrization is not
    part of this block).
    """
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows, dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how=how,
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDf
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how=how,
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values,
                                  got.index.values)

    # Check rows in each groups
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        # Row order within a key group is not compared, so sort both value
        # columns before storing them.
        cola = np.sort(np.asarray(df.al))
        colb = np.sort(np.asarray(df.ar))

        grows[df['index'].values[0]] = (cola, colb)

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    # Gather from the dask result.  The original gathered from ``expect``
    # again, so this section compared the reference with itself.
    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    for k in expect_rows:
        np.testing.assert_array_equal(expect_rows[k][0],
                                      got_rows[k][0])
        np.testing.assert_array_equal(expect_rows[k][1],
                                      got_rows[k][1])
示例#3
0
def pd2pygdf(df):
    """Copy the columns of *df* into a new ``pygdf.DataFrame``.

    A NumPy array is handed to ``np2pygdf``.  For any other input, the
    items yielded by iterating *df* are used as lookup keys into *df*, and
    each looked-up column is stored in the result under its integer
    position (0, 1, 2, ...) rather than under its original label.
    """
    if isinstance(df, np.ndarray):
        return np2pygdf(df)

    converted = pygdf.DataFrame()
    position = 0
    for label in df:
        converted[position] = df[label]
        position += 1
    return converted
示例#4
0
def test_dataframe_to_delayed():
    """Check that ``to_delayed`` yields one delayed object per partition.

    The concatenation of all computed partitions must equal the source
    frame, and each partition must equal the slice of the source frame
    delimited by the corresponding pair of divisions.
    """
    nelem = 100

    source = gd.DataFrame()
    source['x'] = np.arange(nelem)
    source['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(source, npartitions=5)
    delays = ddf.to_delayed()
    n_parts = len(delays)
    assert n_parts == 5

    # Whole-frame check: stitch the partitions back together.
    computed_parts = []
    for delayed_part in delays:
        computed_parts.append(delayed_part.compute())
    stitched = gd.concat(computed_parts)
    assert_frame_equal(stitched.to_pandas(), source.to_pandas())

    # Per-partition check against the divisions.
    divisions = ddf.divisions
    assert len(divisions) == n_parts + 1

    for idx, delayed_part in enumerate(delays):
        start = divisions[idx]
        # The final partition is sliced open-ended rather than stopping at
        # the last division.
        if idx + 1 == n_parts:
            stop = None
        else:
            stop = divisions[idx + 1]
        expected_part = source[start:stop].to_pandas()
        actual_part = delayed_part.compute().to_pandas()
        assert_frame_equal(actual_part, expected_part)
示例#5
0
def test_serialize_dataframe():
    """Round-trip a DataFrame through ``serialize`` / ``deserialize``.

    The frame has an integer column, a float32 column and a categorical
    column; the deserialized frame must equal the original once both are
    converted to pandas.
    """
    df = pygdf.DataFrame()
    df['a'] = np.arange(100)
    df['b'] = np.arange(100, dtype=np.float32)
    # '_' is not one of the declared categories, so the categorical column
    # also contains entries outside the category list.
    df['c'] = pd.Categorical(['a', 'b', 'c', '_', '_'] * 20,
                             categories=['a', 'b', 'c'])
    outdf = deserialize(*serialize(df))
    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_frame_equal(df.to_pandas(), outdf.to_pandas())
示例#6
0
 def make_empty():
     """Build a zero-row frame with the columns of the joined result.

     Columns are created in this order: the ``on`` keys under their own
     names, then the left value columns renamed via
     ``fix_name(k, lsuffix)``, then the right value columns renamed via
     ``fix_name(k, rsuffix)``.  Each column is an empty array whose dtype
     is taken from ``dtypes[k]``.
     """
     empty = gd.DataFrame()
     column_groups = (
         (on, lambda key: key),
         (left_val_names, lambda key: fix_name(key, lsuffix)),
         (right_val_names, lambda key: fix_name(key, rsuffix)),
     )
     for keys, rename in column_groups:
         for key in keys:
             empty[rename(key)] = np.asarray([], dtype=dtypes[key])
     return empty
示例#7
0
文件: core.py 项目: sklam/dask_gdf
def query(df, expr, callenv):
    """Return a new frame holding the rows of *df* selected by *expr*.

    ``gd.queryutils.query_execute`` evaluates *expr* against *df* with the
    calling environment *callenv*; its result is wrapped in a ``gd.Series``
    and used to index every column of *df*.  The filtered columns are
    collected, in their original order, into a fresh ``gd.DataFrame``.
    """
    mask = gd.Series.from_array(
        gd.queryutils.query_execute(df, expr, callenv))

    filtered = gd.DataFrame()
    for name in df.columns:
        filtered[name] = df[name][mask]
    return filtered
示例#8
0
def test_sort_values(nelem, nparts, by):
    """Compare dask_gdf ``sort_values`` with the pygdf result.

    Column ``a`` counts down from ``nelem - 1`` and column ``b`` counts up
    from 100, so sorting by either column is well defined.  The pygdf
    result has its index reset before comparison.

    ``nelem``, ``nparts`` and ``by`` are supplied by the caller (the
    parametrization is not part of this block).
    """
    df = pygdf.DataFrame()
    # The reversed view has a negative stride; make it contiguous before
    # storing it as a column.
    df['a'] = np.ascontiguousarray(np.arange(nelem)[::-1])
    df['b'] = np.arange(100, nelem + 100)
    ddf = dgd.from_pygdf(df, npartitions=nparts)

    got = ddf.sort_values(by=by).compute().to_pandas()
    expect = df.sort_values(by=by).to_pandas().reset_index(drop=True)
    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_frame_equal(got, expect)
示例#9
0
def test_serialize_dataframe_with_index():
    """Round-trip a sorted DataFrame through ``serialize`` / ``deserialize``.

    The frame is sorted by the random column ``b`` before serialization,
    and the deserialized frame must equal it once both are converted to
    pandas.
    """
    df = pygdf.DataFrame()
    df['a'] = np.arange(100)
    df['b'] = np.random.random(100)
    df['c'] = pd.Categorical(['a', 'b', 'c', '_', '_'] * 20,
                             categories=['a', 'b', 'c'])
    # NOTE(review): presumably sorting leaves the frame with a reordered,
    # non-default index, which is what the test name refers to -- confirm.
    df = df.sort_values('b')
    outdf = deserialize(*serialize(df))
    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_frame_equal(df.to_pandas(), outdf.to_pandas())
示例#10
0
def test_compare_frame_with_none():
    """Check ``_compare_frame`` when one or both inputs are ``None``.

    With exactly one real frame, whichever side it is passed on, the first
    element of the result must be a frame and the second must be ``None``.
    With two ``None`` inputs the result must be ``(None, None)``.
    """
    df = pygdf.DataFrame()
    max_part_size = 1
    df['a'] = [0]

    # The original wrote ``assert x, y``, which makes ``y`` the failure
    # message; the second condition was therefore never checked.
    res = batcher_sortnet._compare_frame(df, None, max_part_size, by='a')
    assert res[0] is not None
    assert res[1] is None

    res = batcher_sortnet._compare_frame(None, df, max_part_size, by='a')
    assert res[0] is not None
    assert res[1] is None

    res = batcher_sortnet._compare_frame(None, None, max_part_size, by='a')
    assert res == (None, None)
示例#11
0
def test_merge_left(left_nrows, right_nrows, left_nkeys, right_nkeys,
                    how='left'):
    """Compare a dask_gdf two-key merge against the pygdf merge.

    Both frames carry random integer keys ``x`` and ``y`` and a float value
    column ``a``.  The two results are sorted by all four output columns
    and re-indexed before comparison, so row order does not matter.

    The size parameters are supplied by the caller (the parametrization is
    not part of this block); ``how`` defaults to ``'left'``.
    """
    print(left_nrows, right_nrows, left_nkeys, right_nkeys)
    chunksize = 3
    sort_columns = ['x', 'y', 'a_x', 'a_y']

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'y': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows, dtype=np.float64)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'y': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows,
                                                dtype=np.float64)}.items())

    print(left.to_pandas())
    print(right.to_pandas())

    expect = left.merge(right, on=('x', 'y'), how=how)
    expect = expect.to_pandas().sort_values(sort_columns)\
        .reset_index(drop=True)

    print("Expect".center(80, '='))
    print(expect)

    # Dask GDf
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.merge(right, on=('x', 'y'), how=how)

    print("Got".center(80, '='))
    got = joined.compute().to_pandas()

    got = got.sort_values(sort_columns).reset_index(drop=True)
    print(got)

    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_frame_equal(expect, got)
示例#12
0
def test_serialize_groupby():
    """Round-trip a groupby object through ``serialize`` / ``deserialize``.

    The mean computed from the deserialized groupby must equal the mean
    computed from the original groupby.
    """
    df = pygdf.DataFrame()
    df['key'] = np.random.randint(0, 20, 100)
    df['val'] = np.arange(100, dtype=np.float32)
    gb = df.groupby('key')
    outgb = deserialize(*serialize(gb))

    # The original groupby is the reference; the round-tripped one is the
    # object under test (the original names were swapped).
    expect = gb.mean()
    got = outgb.mean()
    # ``pd.testing`` is the public home of the assertion helpers;
    # ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_frame_equal(got.to_pandas(), expect.to_pandas())
示例#13
0
def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
    """Compare a dask_gdf inner index join against the pygdf join.

    Both frames get a random integer key column ``x`` and an integer value
    column ``a``.  The indexes must match exactly, and for each index value
    the sets of left (``al``) and right (``ar``) values must match.

    All parameters are supplied by the caller (the parametrization is not
    part of this block).
    """
    chunksize = 50

    np.random.seed(0)

    # PyGDF
    left = gd.DataFrame({'x': np.random.randint(0, left_nkeys,
                                                size=left_nrows),
                         'a': np.arange(left_nrows)}.items())
    right = gd.DataFrame({'x': np.random.randint(0, right_nkeys,
                                                 size=right_nrows),
                          'a': 1000 * np.arange(right_nrows)}.items())

    expect = left.set_index('x').join(right.set_index('x'), how='inner',
                                      sort=True, lsuffix='l', rsuffix='r')
    expect = expect.to_pandas()

    # Dask GDf
    left = dgd.from_pygdf(left, chunksize=chunksize)
    right = dgd.from_pygdf(right, chunksize=chunksize)

    joined = left.set_index('x').join(right.set_index('x'), how='inner',
                                      lsuffix='l', rsuffix='r')
    got = joined.compute().to_pandas()

    # Check index
    np.testing.assert_array_equal(expect.index.values,
                                  got.index.values)

    # Check rows in each groups
    expect_rows = {}
    got_rows = {}

    def gather(df, grows):
        # Sets make the comparison independent of row order within a key.
        grows[df['index'].values[0]] = (set(df.al), set(df.ar))

    expect.reset_index().groupby('index')\
        .apply(partial(gather, grows=expect_rows))

    # Gather from the dask result.  The original gathered from ``expect``
    # again, so the final assertion compared the reference with itself.
    got.reset_index().groupby('index')\
        .apply(partial(gather, grows=got_rows))

    assert got_rows == expect_rows
示例#14
0
 def fix_left(df):
     """Reshape a left-only frame into the joined-result column layout.

     The index of *df* is reset first.  The ``on`` keys are copied under
     their own names, the left value columns are copied under
     ``fix_name(k, lsuffix)``, and each right value column is filled by
     ``nullcolumn`` with the frame's length and the dtype from
     ``dtypes[k]`` under ``fix_name(k, rsuffix)``.
     """
     result = gd.DataFrame()
     flat = df.reset_index()

     for key in on:
         result[key] = flat[key]

     for name in left_val_names:
         result[fix_name(name, lsuffix)] = flat[name]

     # The right side has no data for these rows; placeholders come from
     # nullcolumn.
     for name in right_val_names:
         result[fix_name(name, rsuffix)] = nullcolumn(len(flat), dtypes[name])

     return result
示例#15
0
def test_from_scalar_typing(data_type):
    """Check that assigning a NumPy scalar to a column keeps its dtype.

    A random scalar of ``data_type`` is assigned to column ``b`` of a
    five-row frame.  The resulting column must have dtype ``data_type``
    and the same length as column ``a``.

    ``data_type`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    wants_datetime = data_type == 'datetime64[ms]'

    # For the datetime case the value is first built as an int64 scalar
    # and then cast to the datetime dtype.
    scalar_type = np.dtype('int64' if wants_datetime else data_type).type
    scalar = scalar_type(np.random.randint(0, 5))
    if wants_datetime:
        scalar = scalar.astype('datetime64[ms]')

    frame = gd.DataFrame()
    frame['a'] = [1, 2, 3, 4, 5]
    frame['b'] = scalar

    filled = frame['b']
    assert (filled.dtype == np.dtype(data_type))
    assert (len(frame['b']) == len(frame['a']))
示例#16
0
def test_pca_inverse_transform(datatype):
    """Check that ``cuTSVD.inverse_transform`` approximately restores input.

    A six-row, two-column frame is reduced to one component with
    ``fit_transform`` and mapped back with ``inverse_transform``.  The
    reconstruction is compared with the input through ``array_equal``
    using a tolerance argument of 0.4 and ``with_sign=True``.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    first_column = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    second_column = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    original = pygdf.DataFrame()
    original['0'] = first_column
    original['1'] = second_column

    model = cuTSVD(n_components=1)
    reduced = model.fit_transform(original)

    print("Calling inverse_transform")
    restored = model.inverse_transform(reduced)
    print(restored)

    matches = array_equal(restored, original, 0.4, with_sign=True)
    assert matches
示例#17
0
def test_pca_inverse_transform(datatype):
    """Check that ``cuPCA.inverse_transform`` restores the input.

    A six-row, two-column frame is transformed with two components and
    mapped back with ``inverse_transform``.  The reconstruction is compared
    with the input through ``array_equal`` using a tolerance argument of
    1e-3 and ``with_sign=True``.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    first_column = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    second_column = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    original = pygdf.DataFrame()
    original['0'] = first_column
    original['1'] = second_column

    model = cuPCA(n_components=2)
    projected = model.fit_transform(original)

    print("Calling inverse_transform")
    restored = model.inverse_transform(projected)

    matches = array_equal(restored, original, 1e-3, with_sign=True)
    assert matches
示例#18
0
def test_frame_dtype_error():
    """Check that ``from_delayed`` rejects partitions with differing dtypes.

    Two frames each get a column named ``bad``, ending up as float64 in one
    and float32 in the other.  Computing a collection built from the
    delayed partitions of both must raise ``ValueError`` with a metadata
    mismatch message naming the column and both dtypes.
    """
    nelem = 20

    frame64 = gd.DataFrame()
    frame64['bad'] = np.arange(nelem)
    # The column is assigned a second time with the dtype under test.
    frame64['bad'] = np.arange(nelem, dtype=np.float64)

    frame32 = gd.DataFrame()
    frame32['bad'] = np.arange(nelem)
    frame32['bad'] = np.arange(nelem, dtype=np.float32)

    parts64 = dgd.from_pygdf(frame64, npartitions=5)
    parts32 = dgd.from_pygdf(frame32, npartitions=5)

    delayed_parts = parts64.to_delayed() + parts32.to_delayed()
    combined = dgd.from_delayed(delayed_parts)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    print("out")
    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    table_row = r"\s+\|\s+".join(['bad', 'float32', 'float64'])
    raises.match(table_row)
示例#19
0
def test_pca_fit_transform(datatype):
    """Compare ``cuPCA.fit_transform`` with scikit-learn's PCA.

    The same six points are given to cuPCA as a pygdf frame and to skPCA
    as a NumPy array.  The two projections are compared through
    ``array_equal`` with a tolerance argument of 1e-3 and
    ``with_sign=False``.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    frame = pygdf.DataFrame()
    frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    points = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype)

    print("Calling fit_transform")
    gpu_model = cuPCA(n_components=2)
    gpu_projection = gpu_model.fit_transform(frame)
    cpu_model = skPCA(n_components=2)
    cpu_projection = cpu_model.fit_transform(points)

    matches = array_equal(gpu_projection, cpu_projection, 1e-3,
                          with_sign=False)
    assert matches
示例#20
0
def test_dbscan_predict(datatype):
    """Compare ``cuDBSCAN.fit_predict`` labels with scikit-learn's DBSCAN.

    The same six points are clustered by both implementations with
    ``eps=3`` and ``min_samples=2``; the labels must agree point by point.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    frame = pygdf.DataFrame()
    frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    points = np.array(
        [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
        dtype=datatype)

    print("Calling fit_predict")
    gpu_model = cuDBSCAN(eps=3, min_samples=2)
    cu_labels = gpu_model.fit_predict(frame)
    cpu_model = skDBSCAN(eps=3, min_samples=2)
    sk_labels = cpu_model.fit_predict(points)

    n_points = points.shape[0]
    print(n_points)

    # Compare by position; both label containers are indexed explicitly.
    position = 0
    while position < n_points:
        assert cu_labels[position] == sk_labels[position]
        position += 1
示例#21
0
        def fix_column(lhs):
            """Rename the columns of *lhs* and append masked right columns.

            Every column of *lhs* is copied under its name plus
            ``lsuffix``.  For each ``(name, dtype)`` pair in ``rhs_dtypes``
            a masked series of the same length is added under the name
            plus ``rsuffix``, re-indexed to the result's index.
            """
            result = gd.DataFrame()
            for name in lhs.columns:
                result[name + lsuffix] = lhs[name]

            for name, dtype in rhs_dtypes:
                values = np.zeros(len(lhs), dtype=dtype)
                # All-zero mask with null_count equal to the size:
                # presumably this marks every entry as null -- confirm
                # against the mask convention of gd.utils.
                mask_length = gd.utils.calc_chunk_size(values.size,
                                                       gd.utils.mask_bitsize)
                null_mask = np.zeros(mask_length, dtype=gd.utils.mask_dtype)
                placeholder = gd.Series.from_masked_array(
                    data=values,
                    mask=null_mask,
                    null_count=values.size)

                result[name + rsuffix] = placeholder.set_index(result.index)

            return result
示例#22
0
def test_pca_fit(datatype):
    """Compare fitted ``cuPCA`` attributes with scikit-learn's PCA.

    Both models are fitted on the same six points with two components.
    Each listed attribute is compared through ``array_equal`` with a
    tolerance argument of 1e-3; ``components_`` is compared with
    ``with_sign=False`` and every other attribute with ``with_sign=True``.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    frame = pygdf.DataFrame()
    frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    points = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype)

    print("Calling fit")
    gpu_model = cuPCA(n_components=2)
    gpu_model.fit(frame)
    cpu_model = skPCA(n_components=2)
    cpu_model.fit(points)

    compared_attributes = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
        'noise_variance_',
    )
    for attribute in compared_attributes:
        check_sign = attribute != 'components_'
        gpu_value = getattr(gpu_model, attribute)
        cpu_value = getattr(cpu_model, attribute)
        assert array_equal(gpu_value, cpu_value, 1e-3, with_sign=check_sign)
示例#23
0
def test_frame_extra_columns_error():
    """Check that ``from_delayed`` rejects partitions with extra columns.

    A second dask frame is created after column ``z`` has been added to
    the source frame.  Computing a collection built from the delayed
    partitions of both must raise ``ValueError`` with a metadata mismatch
    message that mentions the extra columns.
    """
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_pygdf(df, npartitions=5)

    # Adding 'z' after ddf1 was created gives ddf2 one more column.
    df['z'] = np.arange(nelem)
    ddf2 = dgd.from_pygdf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        # Only the exception matters, so the result is not bound.
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"extra columns")
示例#24
0
def test_mixing_series_frame_error():
    """Check that ``from_delayed`` rejects mixed frame/series partitions.

    The delayed partitions of a dask frame are combined with the delayed
    partitions of one of its columns.  Computing the result must raise
    ``ValueError`` with a metadata mismatch message stating that a
    ``DataFrame`` partition was expected but a ``Series`` was found.
    """
    nelem = 20

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        # Only the exception matters, so the result is not bound.
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")
示例#25
0
def test_sort_values_binned():
    """Check that ``sort_values_binned`` keeps each key in one partition.

    A column of random integers drawn from [1, 5) is split over five
    partitions and binned-sorted.  The sets of unique keys found in any
    two resulting partitions must not overlap.
    """
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'

    frame = pygdf.DataFrame()
    frame['a'] = np.random.randint(1, 5, nelem)
    ddf = dgd.from_pygdf(frame, npartitions=nparts)

    delayed_parts = ddf.sort_values_binned(by=by).to_delayed()
    keys_per_part = [
        set(dask.compute(delayed)[0].a.unique())
        for delayed in delayed_parts
    ]

    # Partitions do not have intersecting keys
    for position, earlier in enumerate(keys_per_part):
        for later in keys_per_part[position + 1:]:
            assert not (earlier & later), \
                    "should have empty intersection"
示例#26
0
def test_tsvd_fit(datatype):
    """Compare fitted ``cuTSVD`` attributes with scikit-learn's TSVD.

    Both models are fitted on the same six points with one component.
    Each listed attribute is compared through ``array_equal`` with a
    tolerance argument of 0.4; ``components_`` is compared with
    ``with_sign=False`` and the other attributes with ``with_sign=True``.

    ``datatype`` is supplied by the caller (the parametrization is not
    part of this block).
    """
    frame = pygdf.DataFrame()
    frame['0'] = np.asarray([-1, -2, -3, 1, 2, 3], dtype=datatype)
    frame['1'] = np.asarray([-1, -1, -2, 1, 1, 2], dtype=datatype)

    points = np.array(
        [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
        dtype=datatype)

    print("Calling fit")
    gpu_model = cuTSVD(n_components=1)
    gpu_model.fit(frame)
    cpu_model = skTSVD(n_components=1)
    cpu_model.fit(points)

    compared_attributes = (
        'singular_values_',
        'components_',
        'explained_variance_ratio_',
    )
    for attribute in compared_attributes:
        check_sign = attribute != 'components_'
        gpu_value = getattr(gpu_model, attribute)
        cpu_value = getattr(cpu_model, attribute)
        assert array_equal(gpu_value, cpu_value, 0.4, with_sign=check_sign)
示例#27
0
def test_index_in_dataframe_constructor():
    """Check that the ``index`` constructor argument matches pandas.

    A pandas frame and a ``gd.DataFrame`` are built from the same data and
    the same float index.  They must be equal as a whole, and so must the
    label-based slices ``.loc[4:]`` of each.
    """
    a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])

    pd.testing.assert_frame_equal(a, b.to_pandas())
    # assert_frame_equal raises on mismatch and returns None on success,
    # so wrapping it in ``assert`` (as the original did) fails exactly
    # when the frames are equal.
    pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas())
示例#28
0
def np2pygdf(df):
    """Copy the columns of a 2-D array into a new ``pygdf.DataFrame``.

    Column ``c`` of *df* (``df[:, c]``) is stored in the result under the
    integer name ``c``, for every ``c`` below ``df.shape[1]``.
    """
    converted = pygdf.DataFrame()
    n_columns = df.shape[1]
    for position in range(n_columns):
        converted[position] = df[:, position]
    return converted
示例#29
0
def load_data(nelem, ident):
    """Build a ``gd.DataFrame`` of *nelem* rows tagged with *ident*.

    Column ``x`` holds ``0 .. nelem - 1`` and column ``ident`` repeats
    *ident* in every row.
    """
    positions = np.arange(nelem)
    tags = np.asarray([ident] * nelem)

    frame = gd.DataFrame()
    frame['x'] = positions
    frame['ident'] = tags
    return frame