def buffers_from_pyarrow(pa_arr, dtype=None):
    """Extract the mask and data of a pyarrow array as cudf Buffers.

    Parameters
    ----------
    pa_arr : pyarrow.Array
        Source array. For a ``pa.DictionaryArray`` the element dtype is
        derived from the type of its indices.
    dtype : optional
        Dtype used to view the data buffer. When ``None`` the dtype is
        derived from the arrow type via ``to_pandas_dtype()``.

    Returns
    -------
    tuple
        ``(pamask, padata)`` where ``pamask`` is a ``Buffer`` built from
        ``pa_arr.buffers()[0]`` (``None`` when that buffer is falsy) and
        ``padata`` is a ``Buffer`` over the ``len(pa_arr)`` elements of
        ``pa_arr.buffers()[1]`` starting at ``pa_arr.offset`` (an empty
        buffer of the resolved dtype when that buffer is falsy).
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()
    length = len(pa_arr)

    if buffers[0]:
        # NOTE(review): the mask buffer is copied from its start without
        # taking pa_arr.offset into account -- confirm this is intended
        # for sliced arrays.
        mask_dev_array = make_mask(length)
        arrow_dev_array = rmm.to_device(np.array(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    # Compare against None explicitly: an np.dtype instance can evaluate
    # as falsy (len() of a non-structured dtype is 0), which would make a
    # plain truthiness test silently discard the caller's dtype.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        start = pa_arr.offset
        values = np.array(buffers[1]).view(new_dtype)
        padata = Buffer(values[start:start + length])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))
    return (pamask, padata)
def copy(self):
    """Return a deep copy of this buffer.

    The memory held in ``self.mem`` is duplicated with
    ``cudautils.copy_array``; ``size`` and ``capacity`` are carried over
    unchanged into the new ``Buffer``.
    """
    duplicated_mem = cudautils.copy_array(self.mem)
    return Buffer(
        mem=duplicated_mem,
        size=self.size,
        capacity=self.capacity,
    )
def buffers_from_pyarrow(pa_arr, dtype=None):
    """
    Given a pyarrow array returns a 5 length tuple of:
        - size
        - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters

    Parameters
    ----------
    pa_arr : pyarrow.Array
        Source array.
    dtype : optional
        Dtype used to view the data buffer. When ``None`` it is
        ``np.int32`` for a ``pa.StringArray``, the indices' pandas dtype
        for a ``pa.DictionaryArray``, and the array's pandas dtype
        otherwise.

    Returns
    -------
    tuple
        ``(size, offset, pamask, padata, pastrs)``:

        - ``size`` is ``len(pa_arr)`` and ``offset`` is ``pa_arr.offset``.
        - ``pamask`` is ``None`` when ``pa_arr.null_count`` is zero.
        - ``padata`` wraps the whole of ``pa_arr.buffers()[1]`` viewed as
          the resolved dtype; it is NOT sliced here, so ``offset`` and
          ``size`` are returned for the caller to apply. It is
          ``Buffer.empty(0)`` when that buffer is falsy.
        - ``pastrs`` wraps ``pa_arr.buffers()[2]`` viewed as ``np.int8``
          for a ``pa.StringArray`` and is ``None`` otherwise.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()
    size = len(pa_arr)
    offset = pa_arr.offset
    is_string = isinstance(pa_arr, pa.StringArray)

    if pa_arr.null_count:
        mask_dev_array = make_mask(size)
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    # Compare against None explicitly: an np.dtype instance can evaluate
    # as falsy (len() of a non-structured dtype is 0), which would make a
    # plain truthiness test silently discard the caller's dtype.
    if dtype is not None:
        data_dtype = dtype
    elif is_string:
        data_dtype = np.int32
    elif isinstance(pa_arr, pa.DictionaryArray):
        data_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        padata = Buffer(np.asarray(buffers[1]).view(data_dtype))
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if is_string:
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (size, offset, pamask, padata, pastrs)
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Extract mask, data and string characters of a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow.Array
        Source array.
    dtype : optional
        Dtype used to view the data buffer. When ``None`` it is
        ``np.int32`` for a ``pa.StringArray``, the indices' pandas dtype
        for a ``pa.DictionaryArray``, and the array's pandas dtype
        otherwise.

    Returns
    -------
    tuple
        ``(pamask, padata, pastrs)``:

        - ``pamask`` is ``None`` when ``pa_arr.null_count`` is zero.
        - ``padata`` wraps the elements of ``pa_arr.buffers()[1]`` that
          start at ``pa_arr.offset``: ``len(pa_arr)`` of them, plus one
          extra when the dtype was inferred for a ``pa.StringArray``.
          It is ``Buffer.empty(0)`` when that buffer is falsy.
        - ``pastrs`` wraps ``pa_arr.buffers()[2]`` viewed as ``np.int8``
          for a ``pa.StringArray`` and is ``None`` otherwise.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()
    is_string = isinstance(pa_arr, pa.StringArray)

    if pa_arr.null_count:
        # NOTE(review): the mask buffer is copied from its start without
        # taking pa_arr.offset into account -- confirm this is intended
        # for sliced arrays.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    offset = pa_arr.offset
    # Number of elements to take from the data buffer. The previous code
    # stored ``offset + len(pa_arr)`` here and then sliced
    # ``[offset:offset + size]``, counting the offset twice and pulling in
    # trailing elements that do not belong to a sliced array.
    size = len(pa_arr)

    # Compare against None explicitly: an np.dtype instance can evaluate
    # as falsy (len() of a non-structured dtype is 0), which would make a
    # plain truthiness test silently discard the caller's dtype.
    if dtype is not None:
        data_dtype = dtype
    elif is_string:
        data_dtype = np.int32
        size += 1  # extra element holds number of bytes
    elif isinstance(pa_arr, pa.DictionaryArray):
        data_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        values = np.asarray(buffers[1]).view(data_dtype)
        padata = Buffer(values[offset:offset + size])
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if is_string:
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (pamask, padata, pastrs)