Пример #1
0
def test_gpu_parse_arrow_int(dtype):
    """Parse a two-column integer record batch with ``GpuArrowReader``.

    A record batch with columns ``depdelay`` and ``arrdelay`` of the given
    ``dtype`` is serialized (schema and batch separately), the batch bytes
    are passed through ``rmm.to_device`` and the parsed ``depdelay`` column
    is compared against the input values.
    """
    expected_depdelay = [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
    arrdelay_values = [5, -3, 1, -2, 22, 11, -12, -5, 4, -9]

    def as_byte_array(raw):
        # View a fresh bytearray copy of ``raw`` as a 1-D np.byte array.
        return np.ndarray(shape=len(raw),
                          dtype=np.byte,
                          buffer=bytearray(raw))

    batch = pa.RecordBatch.from_arrays(
        [
            pa.array(np.array(expected_depdelay, dtype=dtype)),
            pa.array(np.array(arrdelay_values, dtype=dtype)),
        ],
        ['depdelay', 'arrdelay'],
    )

    schema = as_byte_array(batch.schema.serialize().to_pybytes())
    host_batch = as_byte_array(batch.serialize().to_pybytes())

    device_batch = rmm.to_device(host_batch)
    reader = GpuArrowReader(schema, device_batch)
    parsed = reader.to_dict()
    assert parsed['depdelay'].dtype == dtype
    assert set(parsed) == {"depdelay", "arrdelay"}
    assert list(parsed['depdelay']) == expected_depdelay
Пример #2
0
def test_gpu_parse_arrow_data():
    """Parse the lat/lon record batch from ``np.frombuffer`` host arrays.

    The schema array is given to ``GpuArrowReader`` as-is, the record batch
    goes through ``rmm.to_device`` first. Columns are checked by name, size
    and value range, and ``to_dict()`` must match per-column access.
    """
    batch = make_gpu_parse_arrow_data_batch()
    serialized_schema = batch.schema.serialize()
    serialized_batch = batch.serialize()

    # np.frombuffer is used on purpose: for OmniSci compatibility these
    # arrays should be read-only, which is how numpy arrays created from
    # foreign memory buffers will be set.
    host_schema = np.frombuffer(serialized_schema, dtype=np.uint8)
    host_batch = np.frombuffer(serialized_batch, dtype=np.uint8)
    device_batch = rmm.to_device(host_batch)
    del host_batch

    # test reader
    reader = GpuArrowReader(host_schema, device_batch)
    for position, expected_name in enumerate(('dest_lat', 'dest_lon')):
        assert reader[position].name == expected_name

    latitudes = reader[0].data.copy_to_host()
    longitudes = reader[1].data.copy_to_host()
    for host_column in (latitudes, longitudes):
        assert host_column.size == 23

    # Strict bounds: 27 < lat < 42 and -105 < lon < -76.
    np.testing.assert_array_less(latitudes, 42)
    np.testing.assert_array_less(27, latitudes)
    np.testing.assert_array_less(longitudes, -76)
    np.testing.assert_array_less(-105, longitudes)

    as_dict = reader.to_dict()
    np.testing.assert_array_equal(latitudes, as_dict['dest_lat'].to_array())
    np.testing.assert_array_equal(longitudes, as_dict['dest_lon'].to_array())
def test_gpu_parse_arrow_data():
    """Parse the lat/lon record batch from bytearray-backed host arrays.

    Schema and batch are serialized to Python bytes and wrapped in
    ``np.byte`` arrays over fresh bytearrays; the batch array goes through
    ``rmm.to_device`` before being handed to ``GpuArrowReader``.
    """
    batch = make_gpu_parse_arrow_data_batch()
    schema_bytes = batch.schema.serialize().to_pybytes()
    batch_bytes = batch.serialize().to_pybytes()

    host_schema = np.ndarray(
        shape=len(schema_bytes),
        dtype=np.byte,
        buffer=bytearray(schema_bytes),
    )
    host_batch = np.ndarray(
        shape=len(batch_bytes),
        dtype=np.byte,
        buffer=bytearray(batch_bytes),
    )
    device_batch = rmm.to_device(host_batch)
    del host_batch

    # test reader
    reader = GpuArrowReader(host_schema, device_batch)
    for position, expected_name in enumerate(('dest_lat', 'dest_lon')):
        assert reader[position].name == expected_name

    latitudes = reader[0].data.copy_to_host()
    longitudes = reader[1].data.copy_to_host()
    for host_column in (latitudes, longitudes):
        assert host_column.size == 23

    # Strict bounds: 27 < lat < 42 and -105 < lon < -76.
    np.testing.assert_array_less(latitudes, 42)
    np.testing.assert_array_less(27, latitudes)
    np.testing.assert_array_less(longitudes, -76)
    np.testing.assert_array_less(-105, longitudes)

    as_dict = reader.to_dict()
    np.testing.assert_array_equal(latitudes, as_dict['dest_lat'].to_array())
    np.testing.assert_array_equal(longitudes, as_dict['dest_lon'].to_array())
Пример #4
0
def test_gpu_parse_arrow_timestamps(dtype):
    """Parse a single ``timestamp`` column cast to ``dtype``.

    The column comes from ``cudf.datasets.timeseries`` and is converted to
    Arrow; the first batch is serialized, passed through ``rmm.to_device``
    and parsed, and the result is compared with the source column.
    """
    series_frame = cudf.datasets.timeseries(
        start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={}
    )
    timestamp = series_frame.reset_index()["timestamp"]
    timestamp = timestamp.reset_index(drop=True)

    gdf = cudf.DataFrame({"timestamp": timestamp.astype(dtype)})
    arrow_table = gdf.to_arrow(preserve_index=False)
    serialized_schema = arrow_table.schema.serialize()
    serialized_batch = arrow_table.to_batches()[0].serialize()

    # np.frombuffer is used on purpose: for OmniSci compatibility these
    # arrays should be read-only, which is how numpy arrays created from
    # foreign memory buffers will be set.
    host_schema = np.frombuffer(serialized_schema, dtype=np.uint8)
    host_batch = np.frombuffer(serialized_batch, dtype=np.uint8)
    device_batch = rmm.to_device(host_batch)
    del host_batch

    # test reader
    reader = GpuArrowReader(host_schema, device_batch)
    assert reader[0].name == "timestamp"
    parsed_timestamps = reader[0].data.copy_to_host()
    np.testing.assert_array_equal(
        parsed_timestamps, gdf["timestamp"].to_array()
    )
    as_dict = reader.to_dict()
    np.testing.assert_array_equal(
        parsed_timestamps, as_dict["timestamp"].to_array()
    )
Пример #5
0
def test_gpu_parse_arrow_cats():
    """Parse a batch with ``idx``, ``name`` (category) and ``weight``.

    Checks the dtype of each parsed column, the set of category values and
    every row against ``get_expected_values()``.
    """
    batch = make_gpu_parse_arrow_cats_batch()

    def as_byte_array(raw):
        # View a fresh bytearray copy of ``raw`` as a 1-D np.byte array.
        return np.ndarray(shape=len(raw), dtype=np.byte,
                          buffer=bytearray(raw))

    schema = as_byte_array(batch.schema.serialize().to_pybytes())
    host_batch = as_byte_array(batch.serialize().to_pybytes())
    device_batch = rmm.to_device(host_batch)

    reader = GpuArrowReader(schema, device_batch)
    parsed = reader.to_dict()

    idx_col = parsed['idx']
    name_col = parsed['name']
    weight_col = parsed['weight']

    assert idx_col.dtype == np.int32
    assert name_col.dtype == 'category'
    assert weight_col.dtype == np.double
    assert set(name_col) == {'apple', 'pear', 'orange', 'grape'}

    expected = get_expected_values()
    for row in range(len(idx_col)):
        row_idx = idx_col[row]
        row_name = name_col[row]
        row_weight = weight_col[row]

        # The serialized rows are not in order, so look the expected row
        # up by the parsed index value rather than by position.
        want_idx, want_name, want_weight = expected[row_idx]

        assert row_idx == want_idx
        assert row_name == want_name
        np.testing.assert_almost_equal(row_weight, want_weight)
Пример #6
0
def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema():
    """Parse lat/lon data when the schema travels inside the device buffer.

    The serialized schema and record batch are concatenated into a single
    device array, while ``GpuArrowReader`` is given ``b""`` as its host
    schema argument. The parsed columns are checked as in the plain
    lat/lon test.
    """
    batch = make_gpu_parse_arrow_data_batch()
    serialized_schema = batch.schema.serialize()
    serialized_batch = batch.serialize()

    # np.frombuffer is used on purpose: for OmniSci compatibility these
    # arrays should be read-only, which is how numpy arrays created from
    # foreign memory buffers will be set.
    host_schema = np.frombuffer(serialized_schema, dtype=np.uint8)
    host_batch = np.frombuffer(serialized_batch, dtype=np.uint8)
    # Schema first, then the record batch, in one device buffer.
    combined = np.concatenate([host_schema, host_batch])
    device_data = cuda.to_device(combined)
    del host_batch
    del host_schema

    # test reader
    reader = GpuArrowReader(b"", device_data)
    for position, expected_name in enumerate(("dest_lat", "dest_lon")):
        assert reader[position].name == expected_name

    latitudes = reader[0].data.copy_to_host()
    longitudes = reader[1].data.copy_to_host()
    for host_column in (latitudes, longitudes):
        assert host_column.size == 23

    # Strict bounds: 27 < lat < 42 and -105 < lon < -76.
    np.testing.assert_array_less(latitudes, 42)
    np.testing.assert_array_less(27, latitudes)
    np.testing.assert_array_less(longitudes, -76)
    np.testing.assert_array_less(-105, longitudes)

    as_dict = reader.to_dict()
    np.testing.assert_array_equal(latitudes, as_dict["dest_lat"].to_array())
    np.testing.assert_array_equal(longitudes, as_dict["dest_lon"].to_array())
Пример #7
0
def test_reading_arrow_sparse_data():
    """Load the sparse dataset and run a small preprocessing pipeline.

    Feature columns are split into categorical and numeric by their number
    of unique values; numeric columns are mean-filled and scaled,
    categorical ones are one-hot encoded. The resulting frame must be all
    float64 with values spanning exactly 0 to 1.
    """
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)

    df = DataFrame(reader.to_dict().items())

    # preprocessing
    numeric = set()
    categorical = set()
    response_set = {'INCEARN '}
    feature_names = set(df.columns) - response_set

    # Classify each feature column by how many distinct values it has.
    uniques = {}
    for col in feature_names:
        try:
            distinct = df[col].unique()
            uniques[col] = distinct
        except ValueError:
            # Columns for which unique() raises are treated as numeric.
            numeric.add(col)
            continue

        n_distinct = len(distinct)
        if n_distinct < 2:
            # A column with fewer than two distinct values is dropped.
            del df[col]
        elif n_distinct < 1000:
            categorical.add(col)
        else:
            numeric.add(col)

    # Fix numeric columns: fill nulls with the mean, then scale.
    for col in (numeric - response_set):
        df[col] = df[col].fillna(df[col].mean())
        assert df[col].null_count == 0
        spread = df[col].std()
        if np.isfinite(spread) and spread >= 1e-4:
            df[col] = df[col].scale()
        else:
            # drop near constant columns
            del df[col]
            print('drop near constant', col)

    # Expand categorical columns into one-hot columns.
    for col in categorical:
        levels = uniques[col][1:]  # drop first
        df = df.one_hot_encoding(col, prefix=col, cats=levels)
        del df[col]

    # Every remaining column must be float64.
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
Пример #8
0
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    The device data is opened through the CUDA IPC handle in
    ``tdf.df_handle`` / ``tdf.df_size``; the Arrow schema is read from the
    shared-memory buffer identified by ``tdf.sm_handle`` / ``tdf.sm_size``.
    The shared-memory segment is detached before returning, including when
    parsing fails.

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """

    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.dataframe import DataFrame
    from numba import cuda
    from numba.cuda.cudadrv import drvapi

    ipc_handle = drvapi.cu_ipc_mem_handle(*tdf.df_handle)
    ipch = cuda.driver.IpcHandle(None, ipc_handle, size=tdf.df_size)
    ctx = cuda.current_context()
    dptr = ipch.open(ctx)

    schema_buffer = load_buffer(tdf.sm_handle, tdf.sm_size)

    # Keep the pointer value: ``schema_buffer`` is rebound below to the
    # numpy copy, and the pointer is still needed to detach the segment.
    ptr = schema_buffer[1]

    try:
        # TODO: extra copy.
        schema_buffer = np.frombuffer(schema_buffer[0].to_pybytes(),
                                      dtype=np.uint8)

        # Expose the IPC device memory as a flat byte array for the reader.
        dtype = np.dtype(np.byte)
        darr = cuda.devicearray.DeviceNDArray(shape=dptr.size,
                                              strides=dtype.itemsize,
                                              dtype=dtype,
                                              gpu_data=dptr)
        reader = GpuArrowReader(schema_buffer, darr)
        df = DataFrame()
        df.set_tdf = MethodType(set_tdf, df)
        df.get_tdf = MethodType(get_tdf, df)

        for k, v in reader.to_dict().items():
            df[k] = v

        df.set_tdf(tdf)
    finally:
        # free shared memory from Python, whether or not parsing succeeded,
        # so that a failure above does not leave the segment attached
        # https://github.com/omnisci/pymapd/issues/46
        # https://github.com/omnisci/pymapd/issues/31
        shmdt(ctypes.cast(ptr, ctypes.c_void_p))

    return df
Пример #9
0
def test_gpu_parse_arrow_cats():
    """Parse a categorical batch written through a record-batch stream.

    The batch is written with ``pa.RecordBatchStreamWriter``; the bytes
    after the serialized schema's length are treated as the record-batch
    payload. Column dtypes, category values and each row are then checked
    against ``get_expected_values()``.
    """
    batch = make_gpu_parse_arrow_cats_batch()

    sink = pa.BufferOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    stream_writer.write_batch(batch)
    stream_writer.close()

    schema_bytes = batch.schema.serialize().to_pybytes()
    # Skip the leading schema-sized prefix of the stream.
    stream_bytes = sink.getvalue().to_pybytes()
    payload_bytes = stream_bytes[len(schema_bytes):]

    def as_byte_array(raw):
        # View a fresh bytearray copy of ``raw`` as a 1-D np.byte array.
        return np.ndarray(shape=len(raw),
                          dtype=np.byte,
                          buffer=bytearray(raw))

    schema = as_byte_array(schema_bytes)
    host_batch = as_byte_array(payload_bytes)
    device_batch = rmm.to_device(host_batch)

    reader = GpuArrowReader(schema, device_batch)
    parsed = reader.to_dict()

    idx_col = parsed["idx"]
    name_col = parsed["name"]
    weight_col = parsed["weight"]

    assert idx_col.dtype == np.int32
    assert name_col.dtype == "category"
    assert weight_col.dtype == np.double
    assert set(name_col) == {"apple", "pear", "orange", "grape"}

    expected = get_expected_values()
    for row in range(len(idx_col)):
        row_idx = idx_col[row]
        row_name = name_col[row]
        row_weight = weight_col[row]

        # The serialized rows are not in order, so look the expected row
        # up by the parsed index value rather than by position.
        want_idx, want_name, want_weight = expected[row_idx]

        assert row_idx == want_idx
        assert row_name == want_name
        np.testing.assert_almost_equal(row_weight, want_weight)
Пример #10
0
def test_gpu_parse_arrow_int16():
    """Parse the int16 ``depdelay``/``arrdelay`` batch.

    The batch from ``make_gpu_parse_arrow_int16_batch()`` is serialized,
    passed through ``rmm.to_device`` and parsed; the ``depdelay`` column
    must be int16 with the expected values.
    """
    batch = make_gpu_parse_arrow_int16_batch()

    def as_byte_array(raw):
        # View a fresh bytearray copy of ``raw`` as a 1-D np.byte array.
        return np.ndarray(shape=len(raw), dtype=np.byte,
                          buffer=bytearray(raw))

    schema = as_byte_array(batch.schema.serialize().to_pybytes())
    host_batch = as_byte_array(batch.serialize().to_pybytes())

    device_batch = rmm.to_device(host_batch)
    reader = GpuArrowReader(schema, device_batch)
    parsed = reader.to_dict()

    expected_depdelay = [0, 0, -3, -2, 11, 6, -7, -4, 4, -3]
    assert parsed['depdelay'].dtype == np.int16
    assert set(parsed) == {"depdelay", "arrdelay"}
    assert list(parsed['depdelay']) == expected_depdelay
Пример #11
0
def test_fillna():
    """Fill the nulls produced by ``nans_to_nulls`` on reader column 8.

    After ``fillna(123)`` every value must equal 123, the length must be
    unchanged and no nulls may remain.
    """
    _, schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    series = Series(data=column.data)

    with_nulls = series.nans_to_nulls()
    filled = with_nulls.fillna(123)

    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(series)
    assert filled.null_count == 0
Пример #12
0
def test_fillna():
    """Fill the nulls of masked reader column 8 with a constant.

    The column must report a non-zero null count; after ``fillna(123)``
    every value must equal 123, the length must be unchanged and no nulls
    may remain.
    """
    schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    column = reader[8]
    assert column.null_count

    series = Series.from_masked_array(
        data=column.data,
        mask=column.null,
        null_count=column.null_count,
    )
    filled = series.fillna(123)

    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(series)
    assert filled.null_count == 0
Пример #13
0
def test_reading_arrow_sparse_data():
    """Check the frame built from ``GpuArrowReader`` against the reference.

    ``read_data()`` supplies the reference frame together with the schema
    and device array; the parsed columns must compare equal to it.
    """
    pdf, schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    parsed_columns = reader.to_dict()
    gdf = DataFrame(parsed_columns.items())
    assert_eq(pdf, gdf)
Пример #14
0
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    The device data is opened through the IPC memory handle in
    ``tdf.df_handle``; the Arrow schema (and any dictionary batches) are
    read from the shared-memory buffer identified by ``tdf.sm_handle`` /
    ``tdf.sm_size``. The shared-memory segment is detached before
    returning, including when parsing fails.

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """

    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    try:
        buffer = pa.BufferReader(schema_buffer)
        schema = pa.read_schema(buffer)

        # Dictionary Memo functionality used to
        # deserialize on the C++ side is not
        # exposed on the pyarrow side, so we need to
        # handle this on our own.
        dict_memo = {}

        try:
            dict_batch_reader = pa.RecordBatchStreamReader(buffer)
            updated_fields = []

            for f in schema:
                if pa.types.is_dictionary(f.type):
                    # Keep the dictionary values aside and describe the
                    # column to the reader by its index type only.
                    msg = dict_batch_reader.read_next_batch()
                    dict_memo[f.name] = msg.column(0)
                    updated_fields.append(
                        pa.field(f.name, f.type.index_type)
                    )
                else:
                    updated_fields.append(pa.field(f.name, f.type))

            schema = pa.schema(updated_fields)
        except pa.ArrowInvalid:
            # This message does not have any dictionary encoded
            # columns
            pass

        # Expose the IPC device memory as a flat byte array for the reader.
        dtype = np.dtype(np.byte)
        darr = cuda.devicearray.DeviceNDArray(
            shape=ipc_buf.size,
            strides=dtype.itemsize,
            dtype=dtype,
            gpu_data=ipc_buf.to_numba(),
        )

        reader = GpuArrowReader(schema, darr)
        df = DataFrame()
        df.set_tdf = MethodType(set_tdf, df)
        df.get_tdf = MethodType(get_tdf, df)

        for k, v in reader.to_dict().items():
            if k in dict_memo:
                # Re-attach the dictionary values saved above.
                df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
            else:
                df[k] = v

        df.set_tdf(tdf)
    finally:
        # free shared memory from Python, whether or not parsing succeeded,
        # so that a failure above does not leave the segment attached
        # https://github.com/omnisci/pymapd/issues/46
        # https://github.com/omnisci/pymapd/issues/31
        shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))

    return df