Пример #1
0
def test_series_to_delayed():
    """Split a Series into delayed partitions and verify each one.

    The Series is distributed over a fixed number of partitions, turned into
    delayed objects, and checked twice: once after concatenating all computed
    partitions, and once partition-by-partition against the slice of the
    source delimited by the collection's divisions.
    """
    size = 100
    nparts = 5

    source = gd.Series(np.random.randint(size, size=size))
    dask_series = dgd.from_pygdf(source, npartitions=nparts)

    parts = dask_series.to_delayed()
    assert len(parts) == nparts

    # Gluing the computed partitions back together must reproduce the input
    computed = [delayed_part.compute() for delayed_part in parts]
    combined = gd.concat(computed)
    assert isinstance(combined, gd.Series)
    np.testing.assert_array_equal(combined.to_pandas(), source.to_pandas())

    # There is one more division boundary than there are partitions
    bounds = dask_series.divisions
    assert len(bounds) == len(parts) + 1

    final_pos = len(parts) - 1
    for pos, delayed_part in enumerate(parts):
        start = bounds[pos]
        # The final partition is sliced open-ended (stop=None)
        stop = bounds[pos + 1] if pos != final_pos else None
        expected = source[start:stop].to_pandas()
        actual = delayed_part.compute().to_pandas()
        np.testing.assert_array_equal(actual, expected)
Пример #2
0
def test_serialize_ipc():
    """Check normal vs. IPC serialization of a Series and load it in a child.

    Without a context the data buffer is serialized with kind ``'normal'``.
    When the context names the same address as sender and recipient, the
    buffer kind must be ``'ipc'`` and the IPC handle must be cached on the
    buffer. The IPC payload is then handed to a freshly spawned process
    (``_load_ipc``), whose result is compared with the source Series.
    """
    sr = pygdf.Series(np.arange(10))
    # Non-IPC
    header, frames = serialize(sr)
    assert header['column']['data_buffer']['kind'] == 'normal'
    # IPC: sender and recipient are given the same address
    hostport = 'tcp://0.0.0.0:8888'
    fake_context = {
        'recipient': hostport,
        'sender': hostport,
    }

    # No handle may exist before the first IPC serialization
    assert sr._column.data._cached_ipch is None
    header, frames = serialize(sr, context=fake_context)
    assert header['column']['data_buffer']['kind'] == 'ipc'
    # Check that _cached_ipch is set on the buffer
    assert isinstance(sr._column.data._cached_ipch,
                      cuda.cudadrv.devicearray.IpcArrayHandle)

    # Spawn a new process to test the IPC handle deserialization
    mpctx = mp.get_context('spawn')
    result_queue = mpctx.Queue()

    proc = mpctx.Process(target=_load_ipc, args=(header, frames, result_queue))
    proc.start()
    try:
        # Bound the wait: if the child dies before putting a result, an
        # unbounded get() would hang the whole test run. The limit is
        # generous because a spawned child starts a fresh interpreter.
        out = result_queue.get(timeout=60)
    finally:
        proc.join(3)
        if proc.is_alive():
            # Do not leave a stuck child behind; it would block interpreter
            # shutdown, which joins non-daemonic children.
            proc.terminate()
            proc.join()
    # Verify that the output array matches the source
    np.testing.assert_array_equal(out.to_array(), sr.to_array())
Пример #3
0
def test_to_arrow_missing_categorical():
    """A categorical Arrow array must survive Series -> ``to_arrow`` unchanged.

    The input value ``'c'`` is not among the declared categories
    ``['a', 'b']``; the test name suggests this is meant to exercise a
    missing entry in the categorical data.
    """
    pandas_cat = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'])
    arrow_arr = pa.array(pandas_cat, from_pandas=True)
    series = gd.Series(arrow_arr)

    assert isinstance(series, gd.Series)
    round_tripped = series.to_arrow()
    assert pa.Array.equals(arrow_arr, round_tripped)
Пример #4
0
def query(df, expr, callenv):
    """Return a new DataFrame holding the rows of *df* selected by *expr*.

    Parameters
    ----------
    df : DataFrame whose ``columns`` are iterated and indexed by name.
    expr : query expression, passed through to ``query_execute``.
    callenv : calling environment, passed through to ``query_execute``.

    Returns
    -------
    gd.DataFrame
        A fresh frame with the same column names as *df*, each column
        indexed by the mask that ``query_execute`` produced.
    """
    mask = gd.Series(gd.queryutils.query_execute(df, expr, callenv))

    filtered = gd.DataFrame()
    for name in df.columns:
        # Apply the same mask to every column, one column at a time
        filtered[name] = df[name][mask]
    return filtered
Пример #5
0
def test_from_arrow_missing_categorical():
    """A Series built from a categorical Arrow array must match it in pandas.

    The input value ``'c'`` is not among the declared categories
    ``['a', 'b']``; the test name suggests this is meant to exercise a
    missing entry in the categorical data.
    """
    pandas_cat = pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'])
    arrow_arr = pa.array(pandas_cat, from_pandas=True)
    series = gd.Series(arrow_arr)

    assert isinstance(series, gd.Series)

    # PyArrow returns a pd.Categorical, so wrap it in a pd.Series to compare
    expected = pd.Series(arrow_arr.to_pandas())
    actual = series.to_pandas()
    pd.testing.assert_series_equal(expected, actual)
Пример #6
0
def test_assign():
    """``assign`` must add a partitioned Series as a new column.

    The original ``x``/``y`` columns have to come back unchanged and the new
    ``z`` column has to equal the Series that was assigned.
    """
    np.random.seed(0)
    nrows = 20
    frame = pd.DataFrame({
        'x': np.random.randint(0, 5, size=nrows),
        'y': np.random.normal(size=nrows),
    })

    dask_frame = dgd.from_pygdf(gd.DataFrame.from_pandas(frame),
                                npartitions=2)

    # The new column is split into as many partitions as the frame itself
    extra = pd.Series(np.arange(nrows) + 1000)
    extra_dask = dgd.from_pygdf(gd.Series(extra),
                                npartitions=dask_frame.npartitions)

    assigned = dask_frame.assign(z=extra_dask)
    result = assigned.compute().to_pandas()

    assert_frame_equal(result.loc[:, ['x', 'y']], frame)
    np.testing.assert_array_equal(result['z'], extra)
Пример #7
0
def make_meta(x):
    """Build a pygdf metadata object describing *x*.

    Resolution order:

    1. If *x* exposes a ``_meta`` attribute, that attribute is returned.
    2. If *x* is a pygdf ``Series``, ``DataFrame`` or ``Index``, its first
       two entries (``x[:2]``) are returned, copied when the slice has a
       ``copy`` method.
    3. Otherwise *x* is handed to ``dd.utils.make_meta``. A pandas result is
       run through ``dd.utils.meta_nonempty`` and converted to the matching
       pygdf type; any other result is returned unchanged.

    Parameters
    ----------
    x : dict, tuple, list, pd.Series, pd.DataFrame, pd.Index, dtype, scalar
        Anything ``dd.utils.make_meta`` accepts, e.g. a ``{name: dtype}``
        mapping or an iterable of ``(name, dtype)`` tuples for a DataFrame,
        a single ``(name, dtype)`` tuple for a Series, or a dtype/scalar.
        A pygdf object or an object carrying ``_meta`` is also accepted.

    Returns
    -------
    A pygdf DataFrame, Series, RangeIndex or GenericIndex when the metadata
    is frame-, series- or index-like; otherwise whatever
    ``dd.utils.make_meta`` returned.
    """
    # Objects that already carry metadata just hand it over
    if hasattr(x, '_meta'):
        return x._meta

    # pygdf objects describe themselves with a short leading slice
    if isinstance(x, (gd.Series, gd.DataFrame, gd.index.Index)):
        head = x[:2]
        if hasattr(head, 'copy'):
            return head.copy()
        return head

    meta = dd.utils.make_meta(x)
    if not isinstance(meta, (pd.DataFrame, pd.Series, pd.Index)):
        # Not a pandas container: nothing to convert
        return meta

    nonempty = dd.utils.meta_nonempty(meta)
    if isinstance(nonempty, pd.DataFrame):
        return gd.DataFrame.from_pandas(nonempty)
    if isinstance(nonempty, pd.Series):
        return gd.Series(nonempty)
    # Only index types remain; RangeIndex keeps its start/stop bounds
    if isinstance(nonempty, pd.RangeIndex):
        return gd.index.RangeIndex(nonempty.start, nonempty.stop)
    return gd.index.GenericIndex(nonempty)
Пример #8
0
def test_take(nelem, nparts):
    """``take`` on a dask_gdf frame must match ``pandas.DataFrame.take``.

    Parameters
    ----------
    nelem : number of rows in the generated frame.
    nparts : number of partitions of the source dask DataFrame.
    """
    np.random.seed(0)

    xs = np.random.randint(0, nelem, size=nelem)
    ys = np.random.random(nelem)

    # Positions to pick: half as many as there are rows
    picks = np.random.randint(0, nelem - 1, size=nelem // 2)

    frame = pd.DataFrame({'x': xs, 'y': ys})

    max_parts = 5
    dask_frame = dd.from_pandas(frame, npartitions=nparts)
    dask_gdf = dgd.from_dask_dataframe(dask_frame)
    taken = dask_gdf.take(gd.Series(picks), npartitions=max_parts)
    result = taken.compute().to_pandas()

    expected = frame.take(picks)
    assert 1 < taken.npartitions <= max_parts
    # The result index is expected to be a fresh 0..len-1 range
    np.testing.assert_array_equal(result.index, np.arange(len(result)))
    np.testing.assert_array_equal(result.x, expected.x)
    np.testing.assert_array_equal(result.y, expected.y)
Пример #9
0
def test_serialize_generic_index():
    """A GenericIndex must compare equal to its serialize/deserialize copy."""
    values = pygdf.Series(np.arange(10))
    original = pygdf.index.GenericIndex(values)

    payload = serialize(original)
    restored = deserialize(*payload)

    assert original == restored
Пример #10
0
def test_serialize_series():
    """A Series must be unchanged by a serialize/deserialize round trip.

    Both the source and the restored Series are converted to pandas and
    compared with pandas' own series assertion helper.
    """
    sr = pygdf.Series(np.arange(100))
    outsr = deserialize(*serialize(sr))
    # Use the public pd.testing namespace: pd.util.testing is deprecated and
    # no longer exists in current pandas releases.
    pd.testing.assert_series_equal(sr.to_pandas(), outsr.to_pandas())
Пример #11
0
def sum_of_squares(x):
    """Reduce *x* with libgdf's ``gdf_sum_squared_generic``.

    *x* is cast to ``'f8'`` (float64) first; the reduction runs on its
    underlying column and the outcome is wrapped in a new ``gd.Series``.
    """
    column = x.astype('f8')._column
    reduced = gd._gdf.apply_reduce(libgdf.gdf_sum_squared_generic, column)
    return gd.Series(reduced)