# Module-level imports consolidated here for the snippets below; the exact
# module paths vary across cudf versions (see the local imports in
# _parse_tdf_gpu further down).
import ctypes
from types import MethodType

import numpy as np
import pyarrow as pa

import rmm
import cudf
from cudf import DataFrame, Series
from cudf.comm.gpuarrow import GpuArrowReader
from cudf.tests.utils import assert_eq
from numba import cuda


def test_gpu_parse_arrow_int(dtype):
    depdelay = np.array([0, 0, -3, -2, 11, 6, -7, -4, 4, -3], dtype=dtype)
    arrdelay = np.array([5, -3, 1, -2, 22, 11, -12, -5, 4, -9], dtype=dtype)
    d_depdelay = pa.array(depdelay)
    d_arrdelay = pa.array(arrdelay)
    batch = pa.RecordBatch.from_arrays(
        [d_depdelay, d_arrdelay], ['depdelay', 'arrdelay']
    )

    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatches_bytes = batch.serialize().to_pybytes()

    schema = np.ndarray(
        shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes)
    )
    rb_cpu_data = np.ndarray(
        shape=len(recordbatches_bytes),
        dtype=np.byte,
        buffer=bytearray(recordbatches_bytes),
    )
    rb_gpu_data = rmm.to_device(rb_cpu_data)

    gar = GpuArrowReader(schema, rb_gpu_data)
    columns = gar.to_dict()
    assert columns['depdelay'].dtype == dtype
    assert set(columns) == {"depdelay", "arrdelay"}
    assert list(columns['depdelay']) == [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
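The `dtype` argument implies a pytest parametrization that is not shown in the snippet. A minimal sketch of how the test might be driven; the exact dtype list used in the original is an assumption:

import pytest

# Hypothetical parametrization; the original dtype list is not shown here.
@pytest.mark.parametrize("dtype", [np.int8, np.int16, np.int32, np.int64])
def test_gpu_parse_arrow_int(dtype):
    ...  # body as above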
def test_gpu_parse_arrow_data():
    batch = make_gpu_parse_arrow_data_batch()
    schema_data = batch.schema.serialize()
    recbatch_data = batch.serialize()

    # To ensure compatibility with OmniSci, create these numpy arrays
    # read-only, since that is how numpy arrays created from foreign
    # memory buffers are set.
    cpu_schema = np.frombuffer(schema_data, dtype=np.uint8)
    cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8)
    gpu_data = rmm.to_device(cpu_data)
    del cpu_data

    # test reader
    reader = GpuArrowReader(cpu_schema, gpu_data)
    assert reader[0].name == 'dest_lat'
    assert reader[1].name == 'dest_lon'
    lat = reader[0].data.copy_to_host()
    lon = reader[1].data.copy_to_host()
    assert lat.size == 23
    assert lon.size == 23
    np.testing.assert_array_less(lat, 42)
    np.testing.assert_array_less(27, lat)
    np.testing.assert_array_less(lon, -76)
    np.testing.assert_array_less(-105, lon)

    dct = reader.to_dict()
    np.testing.assert_array_equal(lat, dct['dest_lat'].to_array())
    np.testing.assert_array_equal(lon, dct['dest_lon'].to_array())
def test_gpu_parse_arrow_data():
    batch = make_gpu_parse_arrow_data_batch()
    schema_data = batch.schema.serialize().to_pybytes()
    recbatch_data = batch.serialize().to_pybytes()

    cpu_schema = np.ndarray(
        shape=len(schema_data), dtype=np.byte, buffer=bytearray(schema_data)
    )
    cpu_data = np.ndarray(
        shape=len(recbatch_data), dtype=np.byte, buffer=bytearray(recbatch_data)
    )
    gpu_data = rmm.to_device(cpu_data)
    del cpu_data

    # test reader
    reader = GpuArrowReader(cpu_schema, gpu_data)
    assert reader[0].name == 'dest_lat'
    assert reader[1].name == 'dest_lon'
    lat = reader[0].data.copy_to_host()
    lon = reader[1].data.copy_to_host()
    assert lat.size == 23
    assert lon.size == 23
    np.testing.assert_array_less(lat, 42)
    np.testing.assert_array_less(27, lat)
    np.testing.assert_array_less(lon, -76)
    np.testing.assert_array_less(-105, lon)

    dct = reader.to_dict()
    np.testing.assert_array_equal(lat, dct['dest_lat'].to_array())
    np.testing.assert_array_equal(lon, dct['dest_lon'].to_array())
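Several of these tests rely on a `make_gpu_parse_arrow_data_batch` helper that is not shown. A minimal sketch consistent with the assertions above (23 rows, latitudes in (27, 42), longitudes in (-105, -76)); the exact values and seed in the original helper are assumptions:

def make_gpu_parse_arrow_data_batch():
    # Sketch only: any 23 coordinates inside the asserted ranges will do.
    np.random.seed(1234)
    lat = np.random.uniform(low=27, high=42, size=23).astype(np.float32)
    lon = np.random.uniform(low=-105, high=-76, size=23).astype(np.float32)
    dest_lat = pa.array(lat)
    dest_lon = pa.array(lon)
    batch = pa.RecordBatch.from_arrays(
        [dest_lat, dest_lon], ['dest_lat', 'dest_lon']
    )
    return batch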
def test_gpu_parse_arrow_timestamps(dtype):
    timestamp = (
        cudf.datasets.timeseries(
            start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={}
        )
        .reset_index()["timestamp"]
        .reset_index(drop=True)
    )
    gdf = cudf.DataFrame({"timestamp": timestamp.astype(dtype)})
    pdf = gdf.to_arrow(preserve_index=False)
    schema_data = pdf.schema.serialize()
    recbatch_data = pdf.to_batches()[0].serialize()

    # To ensure compatibility with OmniSci, create these numpy arrays
    # read-only, since that is how numpy arrays created from foreign
    # memory buffers are set.
    cpu_schema = np.frombuffer(schema_data, dtype=np.uint8)
    cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8)
    gpu_data = rmm.to_device(cpu_data)
    del cpu_data

    # test reader
    reader = GpuArrowReader(cpu_schema, gpu_data)
    assert reader[0].name == "timestamp"
    timestamp_arr = reader[0].data.copy_to_host()
    np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_array())
    dct = reader.to_dict()
    np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_array())
def test_gpu_parse_arrow_cats():
    batch = make_gpu_parse_arrow_cats_batch()

    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatches_bytes = batch.serialize().to_pybytes()

    schema = np.ndarray(
        shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes)
    )
    rb_cpu_data = np.ndarray(
        shape=len(recordbatches_bytes),
        dtype=np.byte,
        buffer=bytearray(recordbatches_bytes),
    )
    rb_gpu_data = rmm.to_device(rb_cpu_data)

    gar = GpuArrowReader(schema, rb_gpu_data)
    columns = gar.to_dict()

    sr_idx = columns['idx']
    sr_name = columns['name']
    sr_weight = columns['weight']

    assert sr_idx.dtype == np.int32
    assert sr_name.dtype == 'category'
    assert sr_weight.dtype == np.double
    assert set(sr_name) == {'apple', 'pear', 'orange', 'grape'}

    expected = get_expected_values()
    for i in range(len(sr_idx)):
        got_idx = sr_idx[i]
        got_name = sr_name[i]
        got_weight = sr_weight[i]

        # the serialized data is not in the original row order
        exp_idx, exp_name, exp_weight = expected[got_idx]
        assert got_idx == exp_idx
        assert got_name == exp_name
        np.testing.assert_almost_equal(got_weight, exp_weight)
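The cats tests depend on `make_gpu_parse_arrow_cats_batch` and `get_expected_values`, neither of which is shown. A sketch consistent with the assertions (int32 `idx`, dictionary-encoded `name` drawn from four fruits, float64 `weight`); the row count and weights are assumptions. `idx` doubles as the row key so that `expected[got_idx]` recovers a row regardless of serialized order:

def get_expected_values():
    # Sketch only: eight rows so every category appears at least once.
    names = ['apple', 'pear', 'orange', 'grape'] * 2
    np.random.seed(0)
    weights = np.random.random(len(names))
    return [(i, names[i], weights[i]) for i in range(len(names))]


def make_gpu_parse_arrow_cats_batch():
    rows = get_expected_values()
    idx = pa.array([r[0] for r in rows], type=pa.int32())
    name = pa.array([r[1] for r in rows]).dictionary_encode()
    weight = pa.array([r[2] for r in rows], type=pa.float64())
    return pa.RecordBatch.from_arrays(
        [idx, name, weight], ['idx', 'name', 'weight']
    )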
def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema():
    batch = make_gpu_parse_arrow_data_batch()
    schema_data = batch.schema.serialize()
    recbatch_data = batch.serialize()

    # To ensure compatibility with OmniSci, create these numpy arrays
    # read-only, since that is how numpy arrays created from foreign
    # memory buffers are set.
    cpu_schema = np.frombuffer(schema_data, dtype=np.uint8)
    cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8)

    # Concatenate the schema and recordbatch into a single GPU buffer
    gpu_data = cuda.to_device(np.concatenate([cpu_schema, cpu_data]))
    del cpu_data
    del cpu_schema

    # test reader
    reader = GpuArrowReader(b"", gpu_data)
    assert reader[0].name == "dest_lat"
    assert reader[1].name == "dest_lon"
    lat = reader[0].data.copy_to_host()
    lon = reader[1].data.copy_to_host()
    assert lat.size == 23
    assert lon.size == 23
    np.testing.assert_array_less(lat, 42)
    np.testing.assert_array_less(27, lat)
    np.testing.assert_array_less(lon, -76)
    np.testing.assert_array_less(-105, lon)

    dct = reader.to_dict()
    np.testing.assert_array_equal(lat, dct["dest_lat"].to_array())
    np.testing.assert_array_equal(lon, dct["dest_lon"].to_array())
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    # note: the response column name carries a trailing space in the source data
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine categorical and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near-constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Check dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()
    assert mat.max() == 1
    assert mat.min() == 0
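`read_data` is not shown here; the snippets in this section use both a two-tuple `(schema, darr)` form and a three-tuple `(pdf, schema, darr)` form, so two versions evidently existed. A minimal sketch of the two-tuple form, assuming the fixture lives in an Arrow IPC file; the path 'data.arrow' is hypothetical:

def read_data():
    # Sketch only: 'data.arrow' is a hypothetical fixture path; the original
    # loads a dataset with nullable ("sparse") columns.
    reader = pa.ipc.open_file('data.arrow')
    batch = reader.get_batch(0)

    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatch_bytes = batch.serialize().to_pybytes()

    schema = np.ndarray(
        shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes)
    )
    cpu_data = np.ndarray(
        shape=len(recordbatch_bytes),
        dtype=np.byte,
        buffer=bytearray(recordbatch_bytes),
    )
    darr = rmm.to_device(cpu_data)
    return schema, darr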
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.dataframe import DataFrame
    from numba import cuda
    from numba.cuda.cudadrv import drvapi

    ipc_handle = drvapi.cu_ipc_mem_handle(*tdf.df_handle)
    ipch = cuda.driver.IpcHandle(None, ipc_handle, size=tdf.df_size)
    ctx = cuda.current_context()
    dptr = ipch.open(ctx)

    schema_buffer = load_buffer(tdf.sm_handle, tdf.sm_size)
    # save the pointer value before it is overwritten by the
    # np.frombuffer() copy below
    ptr = schema_buffer[1]
    # TODO: extra copy.
    schema_buffer = np.frombuffer(schema_buffer[0].to_pybytes(),
                                  dtype=np.uint8)

    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(shape=dptr.size,
                                          strides=dtype.itemsize,
                                          dtype=dtype,
                                          gpu_data=dptr)
    reader = GpuArrowReader(schema_buffer, darr)
    df = DataFrame()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(ptr, ctypes.c_void_p))  # noqa
    return df
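`load_buffer`, `shmdt`, `set_tdf`, and `get_tdf` come from the surrounding module (pymapd's shared-memory helpers) and are not shown. A rough sketch of what they do, attaching a SysV shared-memory segment via ctypes and wrapping it in an Arrow buffer; the exact flags, key handling, and signatures in the original are assumptions:

# SysV IPC prototypes (assumed; see pymapd's shm helpers for the originals)
libc = ctypes.CDLL(None)
libc.shmget.restype = ctypes.c_int
libc.shmget.argtypes = [ctypes.c_int, ctypes.c_size_t, ctypes.c_int]
libc.shmat.restype = ctypes.c_void_p
libc.shmat.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int]
libc.shmdt.restype = ctypes.c_int
libc.shmdt.argtypes = [ctypes.c_void_p]


def shmdt(ptr):
    # Detach the shared-memory segment.
    return libc.shmdt(ptr)


def load_buffer(handle, size):
    # Attach the segment identified by `handle` and expose it as an
    # Arrow buffer without copying; returns (buffer, raw pointer).
    shmid = libc.shmget(handle, size, 0)
    ptr = libc.shmat(shmid, None, 0)
    buf = pa.foreign_buffer(ptr, size)
    return buf, ptr


def set_tdf(self, tdf):
    # Stash the TDataFrame on the cudf DataFrame so it can be released later.
    self._tdf = tdf


def get_tdf(self):
    return self._tdf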
def test_gpu_parse_arrow_cats():
    batch = make_gpu_parse_arrow_cats_batch()

    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()

    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatches_bytes = stream.getvalue().to_pybytes()[len(schema_bytes):]

    schema = np.ndarray(
        shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes)
    )
    rb_cpu_data = np.ndarray(
        shape=len(recordbatches_bytes),
        dtype=np.byte,
        buffer=bytearray(recordbatches_bytes),
    )
    rb_gpu_data = rmm.to_device(rb_cpu_data)

    gar = GpuArrowReader(schema, rb_gpu_data)
    columns = gar.to_dict()

    sr_idx = columns["idx"]
    sr_name = columns["name"]
    sr_weight = columns["weight"]

    assert sr_idx.dtype == np.int32
    assert sr_name.dtype == "category"
    assert sr_weight.dtype == np.double
    assert set(sr_name) == {"apple", "pear", "orange", "grape"}

    expected = get_expected_values()
    for i in range(len(sr_idx)):
        got_idx = sr_idx[i]
        got_name = sr_name[i]
        got_weight = sr_weight[i]

        # the serialized data is not in the original row order
        exp_idx, exp_name, exp_weight = expected[got_idx]
        assert got_idx == exp_idx
        assert got_name == exp_name
        np.testing.assert_almost_equal(got_weight, exp_weight)
def test_gpu_parse_arrow_int16():
    batch = make_gpu_parse_arrow_int16_batch()

    schema_bytes = batch.schema.serialize().to_pybytes()
    recordbatches_bytes = batch.serialize().to_pybytes()

    schema = np.ndarray(
        shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes)
    )
    rb_cpu_data = np.ndarray(
        shape=len(recordbatches_bytes),
        dtype=np.byte,
        buffer=bytearray(recordbatches_bytes),
    )
    rb_gpu_data = rmm.to_device(rb_cpu_data)

    gar = GpuArrowReader(schema, rb_gpu_data)
    columns = gar.to_dict()
    assert columns['depdelay'].dtype == np.int16
    assert set(columns) == {"depdelay", "arrdelay"}
    assert list(columns['depdelay']) == [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
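`make_gpu_parse_arrow_int16_batch` is not shown, but the expected values match the arrays used in `test_gpu_parse_arrow_int` above, so a plausible sketch is:

def make_gpu_parse_arrow_int16_batch():
    # Same delay values as test_gpu_parse_arrow_int, fixed to int16.
    depdelay = np.array([0, 0, -3, -2, 11, 6, -7, -4, 4, -3], dtype=np.int16)
    arrdelay = np.array([5, -3, 1, -2, 22, 11, -12, -5, 4, -9], dtype=np.int16)
    return pa.RecordBatch.from_arrays(
        [pa.array(depdelay), pa.array(arrdelay)], ['depdelay', 'arrdelay']
    )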
def test_fillna():
    _, schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)
    masked_col = gar[8]
    sr = Series(data=masked_col.data)
    dense = sr.nans_to_nulls().fillna(123)
    np.testing.assert_equal(123, dense.to_array())
    assert len(dense) == len(sr)
    assert dense.null_count == 0
def test_fillna():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)
    masked_col = gar[8]
    assert masked_col.null_count
    sr = Series.from_masked_array(data=masked_col.data,
                                  mask=masked_col.null,
                                  null_count=masked_col.null_count)
    dense = sr.fillna(123)
    np.testing.assert_equal(123, dense.to_array())
    assert len(dense) == len(sr)
    assert dense.null_count == 0
def test_reading_arrow_sparse_data():
    pdf, schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)
    gdf = DataFrame(gar.to_dict().items())
    assert_eq(pdf, gdf)
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """
    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to deserialize on the C++ side
    # is not exposed on the pyarrow side, so we need to handle this on
    # our own.
    dict_memo = {}

    try:
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []

        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))

        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded columns
        pass

    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )
    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa
    return df
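For context, `_parse_tdf_gpu` is the internal parser behind pymapd's GPU IPC result path. A hedged usage sketch from the public API; the connection parameters and table name are hypothetical:

from pymapd import connect

# Hypothetical connection details; select_ipc_gpu returns a cudf DataFrame
# built by _parse_tdf_gpu under the hood.
con = connect(user="admin", password="HyperInteractive",
              host="localhost", dbname="omnisci")
gdf = con.select_ipc_gpu("SELECT dest_lat, dest_lon FROM flights LIMIT 100")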