def test_series_to_delayed():
    """Check that ``to_delayed`` yields one delayed object per partition.

    The computed partitions must concatenate back into the source series,
    and each partition must match the slice of the source bounded by the
    corresponding pair of divisions.
    """
    size = 100
    num_parts = 5

    source = gd.Series(np.random.randint(size, size=size))
    dask_series = dgd.from_pygdf(source, npartitions=num_parts)
    delayed_parts = dask_series.to_delayed()
    assert len(delayed_parts) == num_parts

    # Concatenating every computed partition must reproduce the source.
    computed = [chunk.compute() for chunk in delayed_parts]
    combined = gd.concat(computed)
    assert isinstance(combined, gd.Series)
    np.testing.assert_array_equal(combined.to_pandas(), source.to_pandas())

    # Each partition must match the source slice between its divisions.
    divisions = dask_series.divisions
    assert len(divisions) == len(delayed_parts) + 1

    final = len(delayed_parts) - 1
    for pos, delayed in enumerate(delayed_parts):
        lower = divisions[pos]
        # The closing division is the last index value, so the final slice
        # is left open-ended instead of stopping at that value.
        upper = None if pos == final else divisions[pos + 1]
        expected = source[lower:upper].to_pandas()
        actual = delayed.compute().to_pandas()
        np.testing.assert_array_equal(actual, expected)
def test_serialize_ipc():
    """Serialize a Series both without and with a sender/recipient context.

    Without a context the data buffer header is tagged ``'normal'``.  With a
    context whose sender and recipient are the same address it is tagged
    ``'ipc'`` and an IPC handle is cached on the buffer.  The IPC payload is
    then handed to ``_load_ipc`` in a spawned process and the array it sends
    back is compared against the source.
    """
    series = pygdf.Series(np.arange(10))

    # Non-IPC: no context supplied.
    plain_header, _ = serialize(series)
    assert plain_header['column']['data_buffer']['kind'] == 'normal'

    # IPC: sender and recipient share one address.
    address = 'tcp://0.0.0.0:8888'
    context = dict(recipient=address, sender=address)
    assert series._column.data._cached_ipch is None
    ipc_header, ipc_frames = serialize(series, context=context)
    assert ipc_header['column']['data_buffer']['kind'] == 'ipc'

    # Serializing with the context must have cached the handle on the buffer.
    handle_type = cuda.cudadrv.devicearray.IpcArrayHandle
    assert isinstance(series._column.data._cached_ipch, handle_type)

    # Deserialize the IPC handle in a separate, freshly spawned process.
    spawn_ctx = mp.get_context('spawn')
    results = spawn_ctx.Queue()
    worker = spawn_ctx.Process(
        target=_load_ipc,
        args=(ipc_header, ipc_frames, results),
    )
    worker.start()
    # The result is read from the queue before the worker is joined.
    loaded = results.get()
    worker.join(3)

    # The array produced by the worker must match the source.
    np.testing.assert_array_equal(loaded.to_array(), series.to_array())
def test_to_arrow_missing_categorical():
    """Round-trip a categorical containing an out-of-category value to Arrow.

    ``'c'`` is not one of the declared categories; the Arrow array built
    from the categorical must survive ``gd.Series(...).to_arrow()``
    unchanged.
    """
    categories = ['a', 'b']
    pandas_cat = pd.Categorical(['a', 'b', 'c'], categories=categories)
    arrow_cat = pa.array(pandas_cat, from_pandas=True)
    series = gd.Series(arrow_cat)

    assert isinstance(series, gd.Series)
    round_tripped = series.to_arrow()
    assert pa.Array.equals(arrow_cat, round_tripped)
def query(df, expr, callenv):
    """Return a new DataFrame holding the rows of *df* selected by *expr*.

    The three arguments are forwarded unchanged to
    ``gd.queryutils.query_execute``.  Its result is wrapped in a
    ``gd.Series`` and used to index every column of *df*; the selected
    columns are gathered, in ``df.columns`` order, into a fresh
    ``gd.DataFrame``.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are filtered.
    expr
        Query expression handed to ``query_execute``.
    callenv
        Calling environment handed to ``query_execute``.

    Returns
    -------
    gd.DataFrame
        A new frame with one filtered column per column of *df*.
    """
    mask = gd.Series(gd.queryutils.query_execute(df, expr, callenv))

    result = gd.DataFrame()
    for name in df.columns:
        result[name] = df[name][mask]
    return result
def test_from_arrow_missing_categorical():
    """Build a Series from an Arrow categorical with an out-of-category value.

    Converting the resulting series to pandas must give the same values as
    converting the Arrow array to pandas directly.
    """
    categories = ['a', 'b']
    pandas_cat = pd.Categorical(['a', 'b', 'c'], categories=categories)
    arrow_cat = pa.array(pandas_cat, from_pandas=True)
    series = gd.Series(arrow_cat)

    assert isinstance(series, gd.Series)

    # PyArrow returns a pd.Categorical, so wrap it in a Series to compare.
    expected = pd.Series(arrow_cat.to_pandas())
    actual = series.to_pandas()
    pd.testing.assert_series_equal(expected, actual)
def test_assign():
    """Assign a dask-backed column to a dask-backed frame and verify it.

    The original ``x``/``y`` columns must be unchanged and the new ``z``
    column must carry the assigned values.
    """
    np.random.seed(0)
    # Draw order (x first, then y) is kept so the seeded data is unchanged.
    x_values = np.random.randint(0, 5, size=20)
    y_values = np.random.normal(size=20)
    pdf = pd.DataFrame({'x': x_values, 'y': y_values})

    base = dgd.from_pygdf(gd.DataFrame.from_pandas(pdf), npartitions=2)

    extra = pd.Series(np.arange(20) + 1000)
    # The new column uses the same partition count as the frame.
    extra_dask = dgd.from_pygdf(gd.Series(extra),
                                npartitions=base.npartitions)
    assigned = base.assign(z=extra_dask)

    result = assigned.compute().to_pandas()
    assert_frame_equal(result.loc[:, ['x', 'y']], pdf)
    np.testing.assert_array_equal(result['z'], extra)
def make_meta(x):
    """Build a pygdf metadata object describing *x*.

    The input is resolved in this order:

    1. If *x* has a ``_meta`` attribute, that attribute is returned as-is.
    2. If *x* is a pygdf ``Series``, ``DataFrame`` or ``Index``, its first
       two elements (``x[:2]``) are returned, copied when the slice exposes
       a ``copy`` method.
    3. Otherwise *x* is handed to ``dd.utils.make_meta``.  A pandas
       ``DataFrame``, ``Series`` or ``Index`` result is passed through
       ``dd.utils.meta_nonempty`` and converted to the matching pygdf
       type; any other result is returned unchanged.

    Parameters
    ----------
    x : object
        Either an object carrying ``_meta``, a pygdf object, or anything
        accepted by ``dd.utils.make_meta``.

    Returns
    -------
    object
        ``x._meta``, a pygdf ``Series``/``DataFrame``/``Index``, or
        whatever ``dd.utils.make_meta`` produced when that is not a pandas
        container.
    """
    if hasattr(x, '_meta'):
        return x._meta

    if isinstance(x, (gd.Series, gd.DataFrame, gd.index.Index)):
        head = x[:2]
        if hasattr(head, 'copy'):
            return head.copy()
        return head

    described = dd.utils.make_meta(x)
    if not isinstance(described, (pd.DataFrame, pd.Series, pd.Index)):
        # Non-pandas results are passed straight through.
        return described

    filled = dd.utils.meta_nonempty(described)
    if isinstance(filled, pd.DataFrame):
        return gd.DataFrame.from_pandas(filled)
    if isinstance(filled, pd.Series):
        return gd.Series(filled)
    # Only pandas indexes remain; a RangeIndex keeps its start/stop bounds.
    if isinstance(filled, pd.RangeIndex):
        return gd.index.RangeIndex(filled.start, filled.stop)
    return gd.index.GenericIndex(filled)
def test_take(nelem, nparts):
    """Compare ``take`` on a dask-backed frame against pandas ``take``.

    Parameters
    ----------
    nelem : int
        Number of rows in the generated frame.
    nparts : int
        Number of partitions used when building the dask dataframe.

    (Both are presumably supplied by test parametrization that is not
    visible in this function.)
    """
    np.random.seed(0)

    # NOTE(review): an earlier comment here mentioned using a unique index
    # range because the sort may not be stable; the values below are drawn
    # with replacement -- confirm whether that note still applies.
    # Draw order (x, y, then the positions) is kept so the seeded data is
    # unchanged.
    x_values = np.random.randint(0, nelem, size=nelem)
    y_values = np.random.random(nelem)
    positions = np.random.randint(0, nelem - 1, size=nelem // 2)

    pdf = pd.DataFrame({'x': x_values, 'y': y_values})
    dask_pdf = dd.from_pandas(pdf, npartitions=nparts)
    dask_gdf = dgd.from_dask_dataframe(dask_pdf)

    taken = dask_gdf.take(gd.Series(positions), npartitions=5)
    actual = taken.compute().to_pandas()
    expected = pdf.take(positions)

    assert 1 < taken.npartitions <= 5
    # The result is expected to carry a fresh 0..len-1 index.
    np.testing.assert_array_equal(actual.index, np.arange(len(actual)))
    np.testing.assert_array_equal(actual.x, expected.x)
    np.testing.assert_array_equal(actual.y, expected.y)
def test_serialize_generic_index():
    """A GenericIndex must compare equal to its serialize/deserialize copy."""
    values = pygdf.Series(np.arange(10))
    index = pygdf.index.GenericIndex(values)

    payload = serialize(index)
    restored = deserialize(*payload)

    assert index == restored
def test_serialize_series():
    """A Series must survive a serialize/deserialize round trip unchanged."""
    sr = pygdf.Series(np.arange(100))
    outsr = deserialize(*serialize(sr))
    # Use the public ``pd.testing`` namespace, as the other tests in this
    # file do; ``pd.util.testing`` is deprecated and removed in pandas 2.0.
    pd.testing.assert_series_equal(sr.to_pandas(), outsr.to_pandas())
def sum_of_squares(x):
    """Reduce *x* with ``libgdf.gdf_sum_squared_generic``.

    *x* is cast to ``'f8'`` first; its underlying column is then reduced
    through ``gd._gdf.apply_reduce`` and the outcome is wrapped in a
    ``gd.Series``.

    Parameters
    ----------
    x : Series
        Input exposing ``astype`` and, after the cast, a ``_column``
        attribute.

    Returns
    -------
    gd.Series
        Series wrapping the reduction result.
    """
    column = x.astype('f8')._column
    reduced = gd._gdf.apply_reduce(libgdf.gdf_sum_squared_generic, column)
    return gd.Series(reduced)