示例#1
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Build cudf buffers for the mask and data of a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array. Its ``buffers()``, ``offset``, ``type`` and length
        are read.
    dtype : optional
        dtype used to view the data buffer. When ``None`` the dtype is
        taken from ``pa_arr.type.to_pandas_dtype()``, or from the type of
        the indices when ``pa_arr`` is a ``pa.DictionaryArray``.

    Returns
    -------
    tuple
        ``(pamask, padata)``. ``pamask`` is a ``Buffer`` wrapping a device
        copy of the first arrow buffer, or ``None`` when that buffer is
        missing/empty. ``padata`` is a ``Buffer`` over the ``len(pa_arr)``
        elements of the second arrow buffer starting at ``pa_arr.offset``,
        or a ``Buffer`` over an empty array when there is no data buffer.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    if buffers[0]:
        # NOTE(review): the mask bytes are copied as-is; pa_arr.offset is
        # not applied here -- confirm callers never pass sliced arrays
        # that carry nulls.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.array(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    # Compare against None explicitly: the truth value of a dtype object is
    # not a reliable "was it passed" signal.
    if dtype is not None:
        new_dtype = dtype
    elif isinstance(pa_arr, pa.DictionaryArray):
        new_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        new_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        start = pa_arr.offset
        stop = start + len(pa_arr)
        padata = Buffer(np.array(buffers[1]).view(new_dtype)[start:stop])
    else:
        padata = Buffer(np.empty(0, dtype=new_dtype))
    return (pamask, padata)
示例#2
0
 def copy(self):
     """Return a new Buffer backed by a copy of this buffer's memory.

     The memory is duplicated with ``cudautils.copy_array``; ``size`` and
     ``capacity`` are carried over unchanged.
     """
     duplicated_mem = cudautils.copy_array(self.mem)
     return Buffer(
         mem=duplicated_mem, size=self.size, capacity=self.capacity
     )
示例#3
0
def buffers_from_pyarrow(pa_arr, dtype=None):
    """
    Given a pyarrow array returns a 5 length tuple of:
        - size
        - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array. Its ``buffers()``, ``null_count``, ``offset``,
        ``type`` and length are read.
    dtype : optional
        dtype used to view the second arrow buffer. When ``None`` it is
        ``np.int32`` for a ``pa.StringArray``, the indices dtype for a
        ``pa.DictionaryArray``, and ``pa_arr.type.to_pandas_dtype()``
        otherwise.

    Returns
    -------
    tuple
        ``(size, offset, pamask, padata, pastrs)``. ``size`` is
        ``len(pa_arr)`` and ``offset`` is ``pa_arr.offset``; the data
        buffer is NOT sliced here. ``pamask`` is ``None`` when
        ``pa_arr.null_count`` is zero. ``padata`` is ``Buffer.empty(0)``
        when the second arrow buffer is missing/empty. ``pastrs`` is
        ``None`` unless ``pa_arr`` is a ``pa.StringArray``.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()
    is_string = isinstance(pa_arr, pa.StringArray)

    if pa_arr.null_count:
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    offset = pa_arr.offset
    size = len(pa_arr)

    # Compare against None explicitly: the truth value of a dtype object is
    # not a reliable "was it passed" signal.
    if dtype is not None:
        data_dtype = dtype
    elif is_string:
        # For strings the second buffer is viewed as 32-bit integers.
        data_dtype = np.int32
    elif isinstance(pa_arr, pa.DictionaryArray):
        data_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        padata = Buffer(np.asarray(buffers[1]).view(data_dtype))
    else:
        padata = Buffer.empty(0)

    # Only string arrays get a third buffer wrapped (viewed as int8).
    pastrs = None
    if is_string:
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (size, offset, pamask, padata, pastrs)
示例#4
0
文件: utils.py 项目: sriramch/cudf
def buffers_from_pyarrow(pa_arr, dtype=None):
    """Build cudf buffers for the mask, data and characters of a pyarrow array.

    Parameters
    ----------
    pa_arr : pyarrow array
        Source array. Its ``buffers()``, ``null_count``, ``offset``,
        ``type`` and length are read.
    dtype : optional
        dtype used to view the second arrow buffer. When ``None`` it is
        ``np.int32`` for a ``pa.StringArray``, the indices dtype for a
        ``pa.DictionaryArray``, and ``pa_arr.type.to_pandas_dtype()``
        otherwise.

    Returns
    -------
    tuple
        ``(pamask, padata, pastrs)``. ``pamask`` is ``None`` when
        ``pa_arr.null_count`` is zero. ``padata`` wraps the elements of the
        second arrow buffer that belong to ``pa_arr`` (starting at
        ``pa_arr.offset``), or is ``Buffer.empty(0)`` when that buffer is
        missing/empty. ``pastrs`` wraps the whole third arrow buffer viewed
        as ``int8`` for a ``pa.StringArray`` and is ``None`` otherwise.
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()
    is_string = isinstance(pa_arr, pa.StringArray)

    if pa_arr.null_count:
        # NOTE(review): the mask bytes are copied as-is; pa_arr.offset is
        # not applied here -- confirm callers handle sliced arrays.
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    offset = pa_arr.offset
    # Number of elements to take starting at ``offset``. The offset itself
    # must not be included here, otherwise the slice below would extend
    # ``offset`` elements past the end of this array.
    size = len(pa_arr)

    # Compare against None explicitly: the truth value of a dtype object is
    # not a reliable "was it passed" signal.
    if dtype is not None:
        data_dtype = dtype
    elif is_string:
        data_dtype = np.int32
        size += 1  # extra element holds number of bytes
    elif isinstance(pa_arr, pa.DictionaryArray):
        data_dtype = pa_arr.indices.type.to_pandas_dtype()
    else:
        data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        stop = offset + size
        padata = Buffer(np.asarray(buffers[1]).view(data_dtype)[offset:stop])
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if is_string:
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (pamask, padata, pastrs)