示例#1
0
文件: string.py 项目: zeichuan/cudf
    def as_numerical_column(self, dtype, **kwargs):
        """Parse the strings of this column into a numerical column.

        Parameters
        ----------
        dtype : dtype-like
            Target dtype; anything accepted by ``np.dtype``.
        **kwargs
            Forwarded to the string-to-number conversion routine looked up
            in ``_str_to_numeric_typecast_functions``. ``devptr`` is always
            overwritten, and ``units`` is set for datetime64 targets.

        Returns
        -------
        The converted column, cast to ``dtype``.
        """
        requested = np.dtype(dtype)
        # ``device_dtype`` is the dtype of the device array receiving the
        # parsed values; ``parser_key`` selects the conversion routine.
        device_dtype = requested
        parser_key = requested

        if requested.type is np.datetime64:
            # Datetimes are written as int64 in the unit of the target dtype.
            kwargs.update(units=np.datetime_data(requested)[0])
            device_dtype = np.dtype(np.int64)
        elif requested.type in (np.int8, np.int16):
            # Small integers are parsed as int32; the final ``astype``
            # narrows them to the requested width.
            device_dtype = parser_key = np.dtype(np.int32)

        parsed = rmm.device_array(shape=len(self), dtype=device_dtype)
        parsed_ptr = get_ctype_ptr(parsed)
        kwargs["devptr"] = parsed_ptr

        convert = _str_to_numeric_typecast_functions[parser_key]
        convert(self.str(), **kwargs)

        result = columnops.as_column(parsed)

        if self.null_count > 0:
            # Carry the null bitmask of the strings over to the result.
            n_mask_elems = utils.calc_chunk_size(
                len(self.data), utils.mask_bitsize
            )
            bitmask = rmm.device_array(n_mask_elems, dtype="int8")
            bitmask_ptr = get_ctype_ptr(bitmask)
            self.data.set_null_bitmask(bitmask_ptr, bdevmem=True)
            result = result.set_mask(Buffer(bitmask))

        return result.astype(requested)
示例#2
0
文件: string.py 项目: yutiansut/cudf
    def __init__(self, data, null_count=None, **kwargs):
        """
        Build the column from an nvstrings object or a host sequence.

        Parameters
        ----------
        data : nvstrings.nvstrings or Sequence
            The nvstrings object. Any ``collections.abc.Sequence`` is first
            converted with ``nvstrings.to_device``.
        null_count : int; optional
            The number of null values in the mask. Queried from ``data``
            when omitted.
        **kwargs
            Accepted but not used by this constructor.
        """
        from collections.abc import Sequence

        strings = (
            nvstrings.to_device(data) if isinstance(data, Sequence) else data
        )
        assert isinstance(strings, nvstrings.nvstrings)
        self._data = strings
        self._dtype = np.dtype("object")

        self._null_count = (
            strings.null_count() if null_count is None else null_count
        )
        self._mask = None
        if self._null_count > 0:
            # Materialise the null bitmask of the strings on the device.
            n_mask_elems = utils.calc_chunk_size(
                len(self.data), utils.mask_bitsize
            )
            bitmask = rmm.device_array(n_mask_elems, dtype='int8')
            bitmask_ptr = get_ctype_ptr(bitmask)
            self.data.set_null_bitmask(bitmask_ptr, bdevmem=True)
            self._mask = Buffer(bitmask)
        self._nvcategory = None
        self._indices = None
示例#3
0
def _mask_from_cuda_array_interface_desc(obj):
    """Build a mask ``Buffer`` from the ``mask`` entry of a CUDA array interface.

    Parameters
    ----------
    obj : object
        Object exposing ``__cuda_array_interface__``.

    Returns
    -------
    Buffer or None
        ``None`` when the interface has no ``mask`` entry (or it is None).
        For typecode ``"t"`` the mask memory is wrapped in place with
        ``obj`` as owner; for typecode ``"b"`` the mask is first passed
        through ``compact_mask_bytes``.

    Raises
    ------
    NotImplementedError
        If the mask typestr has any other typecode.
    """
    from cudf.utils.utils import calc_chunk_size, mask_dtype, mask_bitsize
    from cudf.utils.cudautils import compact_mask_bytes

    mask = obj.__cuda_array_interface__.get("mask", None)
    if mask is None:
        return mask

    iface = mask.__cuda_array_interface__
    address = iface["data"][0]
    count = iface["shape"][0]
    typestr = iface["typestr"]
    typecode = typestr[1]

    if typecode == "t":
        # ``count`` is converted to a number of mask elements.
        n_chunks = calc_chunk_size(count, mask_bitsize)
        return Buffer(
            data=address, size=n_chunks * mask_dtype.itemsize, owner=obj
        )

    if typecode == "b":
        wide_mask = rmm.device_array_from_ptr(
            address, nelem=count, dtype=np.dtype(typestr), finalizer=None
        )
        return Buffer(compact_mask_bytes(wide_mask))

    raise NotImplementedError(
        f"Cannot infer mask from typestr {typestr}"
    )
示例#4
0
文件: string.py 项目: zeichuan/cudf
    def serialize(self):
        """Serialize the column into a header dict and a list of device frames.

        Returns
        -------
        tuple
            ``(header, frames)``. ``frames`` holds three device arrays in
            the order ``[nbuf, sbuf, obuf]`` as filled by ``to_offsets``;
            ``header["subheaders"]`` carries one entry per frame, namely its
            ``__cuda_array_interface__`` plus a ``"dtype"`` string.
        """
        header = {
            "null_count": self._null_count,
            "type": pickle.dumps(type(self)),
        }

        n_strings = len(self._data)
        chars = rmm.device_array(self._data.byte_count(), dtype="int8")
        offsets = rmm.device_array(n_strings + 1, dtype="int32")
        nulls = rmm.device_array(
            utils.calc_chunk_size(n_strings, utils.mask_bitsize),
            dtype="int8",
        )
        self.data.to_offsets(
            get_ctype_ptr(chars),
            get_ctype_ptr(offsets),
            nbuf=get_ctype_ptr(nulls),
            bdevmem=True,
        )

        frames = [nulls, chars, offsets]
        sub_headers = [
            {**frame.__cuda_array_interface__, "dtype": frame.dtype.str}
            for frame in frames
        ]

        header["nvstrings"] = n_strings
        header["subheaders"] = sub_headers
        return header, frames
示例#5
0
def make_device_arrays(array):
    """Replace the buffers of an Arrow array with typed GPU views.

    Parameters
    ----------
    array : pyarrow.Array
        Array whose ``buffers()`` can be wrapped by
        ``CudaBuffer.from_buffer``.

    Returns
    -------
    list
        The list returned by ``array.buffers()``, with every non-None entry
        replaced by the result of ``gpu_view_as``. Buffer 0 is viewed as
        int8; the dtype of the remaining buffers depends on the Arrow type.
    """
    buffers = array.buffers()
    arrow_type = array.type

    # Pick the view dtype for buffer 1 and, where applicable, buffer 2.
    if pa.types.is_list(arrow_type):
        second, third = np.dtype(np.int32), None
    elif pa.types.is_string(arrow_type) or pa.types.is_binary(arrow_type):
        second, third = np.dtype(np.int32), np.dtype(np.int8)
    elif pa.types.is_dictionary(arrow_type):
        second, third = arrow_to_pandas_dtype(arrow_type.index_type), None
    else:
        second, third = arrow_to_pandas_dtype(arrow_type), None
    dtypes = [np.dtype(np.int8), second, third]

    n_rows = len(array)

    first = buffers[0]
    if first is not None:
        device_buf = CudaBuffer.from_buffer(first)
        # Never view more bytes than the mask chunk count for ``n_rows``.
        mask_bytes = min(
            device_buf.size, calc_chunk_size(n_rows, mask_bitsize)
        )
        buffers[0] = gpu_view_as(mask_bytes, device_buf, dtypes[0])

    for position in range(1, len(buffers)):
        raw = buffers[position]
        if raw is None:
            continue
        view_dtype = dtypes[position]
        device_buf = CudaBuffer.from_buffer(raw)
        view_bytes = min(device_buf.size, n_rows * view_dtype.itemsize)
        buffers[position] = gpu_view_as(view_bytes, device_buf, view_dtype)

    return buffers
示例#6
0
文件: string.py 项目: sriramch/cudf
    def to_arrow(self):
        """Convert this string column into a pyarrow array on the host.

        Returns
        -------
        pyarrow.NullArray
            When every element is null (``null_count == len(self)``).
        pyarrow.StringArray
            Otherwise, built from the buffers filled by ``to_offsets``.
        """
        strings = self.nvstrings
        n_strings = len(strings)

        chars = np.empty(strings.byte_count(), dtype="int8")
        offsets = np.empty(n_strings + 1, dtype="int32")
        validity = np.empty(
            utils.calc_chunk_size(n_strings, utils.mask_bitsize),
            dtype="int8",
        )

        # Fill the three host arrays from the string data.
        self.str().to_offsets(chars, offsets, nbuf=validity)
        chars_buf, offsets_buf, validity_buf = (
            pa.py_buffer(host) for host in (chars, offsets, validity)
        )

        if self.null_count != len(self):
            return pa.StringArray.from_buffers(
                n_strings,
                offsets_buf,
                chars_buf,
                validity_buf,
                strings.null_count(),
            )

        return pa.NullArray.from_buffers(
            pa.null(), len(self), [pa.py_buffer((b""))], self.null_count
        )
示例#7
0
文件: column.py 项目: harrism/cudf
 def allocate_mask(self, all_valid=True):
     """Allocate a fresh mask buffer and return ``set_mask`` applied to it.

     With ``all_valid=True`` every mask element is filled with ``0xff`` and
     the null count is 0; with ``all_valid=False`` the mask is filled with
     0 and the null count equals the column length.
     """
     n_rows = len(self)
     n_mask_elems = utils.calc_chunk_size(n_rows, utils.mask_bitsize)
     fresh_mask = cuda.device_array(n_mask_elems, dtype=utils.mask_dtype)

     if all_valid:
         fill, nulls = 0xff, 0
     else:
         fill, nulls = 0, n_rows

     cudautils.fill_value(fresh_mask, fill)
     return self.set_mask(mask=fresh_mask, null_count=nulls)
示例#8
0
文件: utils.py 项目: trevorsm7/cudf
def random_bitmask(size):
    """
    Generate a random host-side bitmask.

    Parameters
    ----------
    size : int
        number of bits

    Returns
    -------
    numpy.ndarray
        Random values in ``[0, 255]`` cast to ``utils.mask_dtype``, one per
        mask element as computed by ``utils.calc_chunk_size``.
    """
    n_mask_elems = utils.calc_chunk_size(size, utils.mask_bitsize)
    random_values = np.random.randint(0, 256, size=n_mask_elems)
    return random_values.astype(utils.mask_dtype)
示例#9
0
 def mask_array_view(self):
     """
     Return the mask as an int8 device array that aliases ``self.mask``.
     """
     n_mask_elems = calc_chunk_size(len(self), mask_bitsize)
     view = rmm.device_array_from_ptr(
         ptr=self.mask.ptr, nelem=n_mask_elems, dtype=np.int8
     )
     # Attach this column to the view; presumably this keeps the column
     # (which owns the memory) alive while the view exists -- TODO confirm.
     view.gpu_data._obj = self
     return view
示例#10
0
文件: _gdf.py 项目: xincui-math/cudf
def cffi_view_to_column_mem(cffi_view):
    """Wrap the memory referenced by a cffi column view.

    Parameters
    ----------
    cffi_view : cffi struct
        Column view exposing ``data``, ``valid``, ``size``, ``dtype`` and,
        for string-category columns, ``dtype_info.category``.

    Returns
    -------
    tuple
        ``(nvstrings, mask)`` when the dtype is
        ``libgdf.GDF_STRING_CATEGORY``, otherwise ``(data, mask)``.
        ``mask`` is None when ``cffi_view.valid`` is falsy.
    """

    def _adopt(pointer, count, np_dtype):
        # Wrap ``pointer`` in a device array that carries an rmm finalizer.
        address = int(ffi.cast("uintptr_t", pointer))
        return rmm.device_array_from_ptr(
            address,
            nelem=count,
            dtype=np_dtype,
            finalizer=rmm._make_finalizer(address, 0),
        )

    def _adopt_mask():
        # Wrap the validity pointer, or return None if there is none.
        if not cffi_view.valid:
            return None
        return _adopt(
            cffi_view.valid,
            calc_chunk_size(cffi_view.size, mask_bitsize),
            mask_dtype,
        )

    if cffi_view.dtype == libgdf.GDF_STRING_CATEGORY:
        # We need to create this just to make sure the memory is properly
        # freed; the local is intentionally unused but kept alive until the
        # function returns.
        data = _adopt(cffi_view.data, cffi_view.size, 'int32')  # noqa: F841
        category_address = int(
            ffi.cast("uintptr_t", cffi_view.dtype_info.category)
        )
        category = nvcategory.bind_cpointer(category_address)
        strings = category.to_strings()
        return strings, _adopt_mask()

    data = _adopt(
        cffi_view.data, cffi_view.size, gdf_to_np_dtype(cffi_view.dtype)
    )
    return data, _adopt_mask()
示例#11
0
文件: string.py 项目: yutiansut/cudf
    def to_arrow(self):
        """Convert this string column into a pyarrow array on the host.

        Returns
        -------
        pyarrow.NullArray
            When every element is null (``null_count == len(self)``).
        pyarrow.StringArray
            Otherwise, built from the buffers filled by ``to_offsets``.
        """
        sbuf = np.empty(self._data.byte_count(), dtype='int8')
        obuf = np.empty(len(self._data) + 1, dtype='int32')

        mask_size = utils.calc_chunk_size(len(self._data), utils.mask_bitsize)
        nbuf = np.empty(mask_size, dtype='int8')

        # Fill the three host arrays from the string data.
        self.str().to_offsets(sbuf, obuf, nbuf=nbuf)
        sbuf = pa.py_buffer(sbuf)
        obuf = pa.py_buffer(obuf)
        nbuf = pa.py_buffer(nbuf)
        if self.null_count == len(self):
            # ``from_buffers`` expects a list of pyarrow buffers. The
            # previous ``np.empty(0)`` was an empty ndarray, i.e. zero
            # buffers, whereas the null type needs one entry.
            return pa.NullArray.from_buffers(pa.null(), len(self),
                                             [pa.py_buffer(b"")],
                                             self.null_count)
        else:
            return pa.StringArray.from_buffers(len(self._data), obuf, sbuf,
                                               nbuf, self._data.null_count())
示例#12
0
文件: _gdf.py 项目: raydouglass/cudf
def cffi_view_to_column_mem(cffi_view):
    """Wrap the data and validity pointers of a cffi column view.

    Returns
    -------
    tuple
        ``(data, mask)`` device arrays, each created with an rmm finalizer
        for its pointer. ``mask`` is None when ``cffi_view.valid`` is falsy.
    """
    data_address = int(ffi.cast("uintptr_t", cffi_view.data))
    data = rmm.device_array_from_ptr(
        data_address,
        nelem=cffi_view.size,
        dtype=gdf_to_np_dtype(cffi_view.dtype),
        finalizer=rmm._make_finalizer(data_address, 0),
    )

    if not cffi_view.valid:
        return data, None

    mask_address = int(ffi.cast("uintptr_t", cffi_view.valid))
    mask = rmm.device_array_from_ptr(
        mask_address,
        nelem=calc_chunk_size(cffi_view.size, mask_bitsize),
        dtype=mask_dtype,
        finalizer=rmm._make_finalizer(mask_address, 0),
    )
    return data, mask
示例#13
0
文件: string.py 项目: sriramch/cudf
    def as_numerical_column(self, dtype, **kwargs):
        """Parse the strings of this column into a numerical column.

        Parameters
        ----------
        dtype : dtype-like
            Target dtype; anything accepted by ``np.dtype``.
        **kwargs
            Forwarded to the conversion routine looked up in
            ``_str_to_numeric_typecast_functions``. ``devptr`` is always
            overwritten. For datetime64 targets ``units`` is set, and
            ``format`` is guessed on the host when the caller did not pass
            one and the column is non-empty.

        Returns
        -------
        The converted column, cast to ``dtype``.
        """
        mem_dtype = np.dtype(dtype)
        str_dtype = mem_dtype
        out_dtype = mem_dtype

        if mem_dtype.type in (np.int8, np.int16):
            # Small integers are parsed as int32 and narrowed by the final
            # ``astype``.
            mem_dtype = np.dtype(np.int32)
            str_dtype = mem_dtype
        elif mem_dtype.type is np.datetime64:
            kwargs.update(units=np.datetime_data(mem_dtype)[0])
            mem_dtype = np.dtype(np.int64)
            if "format" not in kwargs and len(self.nvstrings) > 0:
                # infer on host from the first not na element
                # NOTE(review): if every element is null the selection below
                # is empty; behaviour of ``[0]`` in that case is not visible
                # here -- confirm.
                fmt = pd.core.tools.datetimes._guess_datetime_format(
                    self[self.notna()][0]
                )
                kwargs.update(format=fmt)

        out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
        kwargs.update({"devptr": out_ptr})

        _str_to_numeric_typecast_functions[str_dtype](self.nvstrings, **kwargs)

        out_col = column.as_column(out_arr)

        if self.has_nulls:
            # Carry the null bitmask of the strings over to the result.
            mask_size = utils.calc_chunk_size(
                len(self.nvstrings), utils.mask_bitsize
            )
            out_mask = column.column_empty(
                mask_size, dtype="int8", masked=False
            ).data
            out_mask_ptr = out_mask.ptr
            self.nvstrings.set_null_bitmask(out_mask_ptr, bdevmem=True)
            out_col.mask = out_mask

        return out_col.astype(out_dtype)
示例#14
0
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool,optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Raises
    ------
    MemoryError
        If an ``nvstrings`` input holds more than
        ``libcudfxx.MAX_STRING_COLUMN_BYTES`` bytes.
    TypeError
        If `arbitrary` is a ``Buffer`` and `dtype` is None.
    NotImplementedError
        If `arbitrary` is a ``pyarrow.Date64Array``.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__``(e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        # Reuse the byte count queried above instead of asking again.
        sbuf = Buffer.empty(byte_count)
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            # Replace NaT by a null so that the resulting column carries a
            # mask marking those entries.
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")], dtype=data.dtype)),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)

        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )

        elif isinstance(arbitrary, pa.NullArray):
            # Resolve the target dtype. The "no dtype" cases are checked
            # first because ``pandas_dtype("empty")`` raises TypeError.
            if dtype is None or (isinstance(dtype, str) and dtype == "empty"):
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )
            else:
                new_dtype = pd.api.types.pandas_dtype(dtype)

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            # Date64 input is not supported; the conversion code that used
            # to follow this raise was unreachable and has been removed.
            raise NotImplementedError
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # ``np.bool_`` replaces the ``np.bool`` alias, which NumPy
            # removed in 1.24.
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert every chunk on its own, then concatenate the columns.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.asarray(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        # Last resort: try the buffer protocol first, then pyarrow, then
        # pandas / numpy conversion of the raw object.
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Handled by the categorical fallback below.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
示例#15
0
文件: _gdf.py 项目: zhouyonglong/cudf
def libgdf_join(col_lhs, col_rhs, on, how, method='sort'):
    """Join two collections of columns through the libgdf join API.

    Parameters
    ----------
    col_lhs, col_rhs : mapping of name -> column wrapper
        Left and right inputs; each value exposes ``_column.cffi_view`` and
        ``_column.dtype``.
    on : sequence of str
        Names of the key columns; each must be present in both inputs.
    how : str
        One of ``'left'``, ``'inner'`` or ``'outer'``.
    method : str
        Key into ``_join_method_api`` (default ``'sort'``).

    Returns
    -------
    tuple of list
        ``(res, valids)``: one data device array and one validity device
        array per result column. Result columns are ordered as the
        non-key left columns, then the key columns, then the non-key right
        columns.

    Raises
    ------
    KeyError
        If ``how`` or ``method`` is not a key of the respective API table.
    ValueError
        If ``how`` is not one of the supported join kinds, or a name in
        ``on`` is missing from one of the inputs.
    """
    joiner = _join_how_api[how]
    method_api = _join_method_api[method]

    # Validate before allocating any native state.
    if how not in ['left', 'inner', 'outer']:
        msg = "new join api only supports left, inner or outer"
        raise ValueError(msg)

    gdf_context = ffi.new('gdf_context*')
    libgdf.gdf_context_view(gdf_context, 0, method_api, 0, 0, 0)

    list_lhs = []
    list_rhs = []
    result_cols = []

    left_idx = []
    right_idx = []

    # Positions of the key columns are looked up in these lists; build them
    # once instead of once per key.
    lhs_names = list(col_lhs.keys())
    rhs_names = list(col_rhs.keys())

    for name, col in col_lhs.items():
        list_lhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))

    for name in on:
        result_cols.append(columnview(0, None,
                                      dtype=col_lhs[name]._column.dtype))
        left_idx.append(lhs_names.index(name))
        right_idx.append(rhs_names.index(name))

    for name, col in col_rhs.items():
        list_rhs.append(col._column.cffi_view)
        if name not in on:
            result_cols.append(columnview(0, None, dtype=col._column.dtype))

    num_cols_to_join = len(on)
    result_num_cols = len(list_lhs) + len(list_rhs) - num_cols_to_join

    joiner(list_lhs,
           len(list_lhs),
           left_idx,
           list_rhs,
           len(list_rhs),
           right_idx,
           num_cols_to_join,
           result_num_cols,
           result_cols,
           ffi.NULL,
           ffi.NULL,
           gdf_context)

    res = []
    valids = []

    # Wrap the output pointers of every result column in device arrays
    # that carry an rmm finalizer.
    for col in result_cols:

        intaddr = int(ffi.cast("uintptr_t", col.data))
        res.append(rmm.device_array_from_ptr(ptr=intaddr,
                                             nelem=col.size,
                                             dtype=gdf_to_np_dtype(col.dtype),
                                             finalizer=rmm._make_finalizer(
                                                 intaddr, 0)))
        intaddr = int(ffi.cast("uintptr_t", col.valid))
        valids.append(rmm.device_array_from_ptr(ptr=intaddr,
                                                nelem=calc_chunk_size(
                                                    col.size, mask_bitsize),
                                                dtype=mask_dtype,
                                                finalizer=rmm._make_finalizer(
                                                    intaddr, 0)))

    return res, valids