Example #1
def column_empty(row_count, dtype="object", masked=False):
    """Allocate a new column like the given row_count and dtype.
    """
    dtype = pd.api.types.pandas_dtype(dtype)
    children = ()

    if is_categorical_dtype(dtype):
        data = None
        children = (build_column(
            data=Buffer.empty(row_count * np.dtype("int32").itemsize),
            dtype="int32",
        ), )
    elif dtype.kind in "OU":
        data = None
        children = (
            build_column(
                data=Buffer.empty(
                    (row_count + 1) * np.dtype("int32").itemsize),
                dtype="int32",
            ),
            build_column(
                data=Buffer.empty(row_count * np.dtype("int8").itemsize),
                dtype="int8",
            ),
        )
    else:
        data = Buffer.empty(row_count * dtype.itemsize)

    if masked:
        mask = Buffer(cudautils.make_empty_mask(row_count))
    else:
        mask = None

    return build_column(data, dtype, mask=mask, children=children)
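A minimal usage sketch of the factory above (hedged: column_empty is a cudf-internal helper, importable in releases of this era as cudf.core.column.column_empty; the exact path may differ between versions):

    from cudf.core.column import column_empty  # internal API; path may vary

    # Uninitialized 5-row int32 column with an all-null validity mask.
    col = column_empty(5, dtype="int32", masked=True)
    assert len(col) == 5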
Example #2
File: column.py Project: zivzone/cudf
def column_empty(row_count, dtype="object", masked=False):
    """Allocate a new column like the given row_count and dtype.
    """
    dtype = pd.api.types.pandas_dtype(dtype)
    children = ()

    if is_categorical_dtype(dtype):
        data = None
        children = (build_column(
            data=Buffer.empty(row_count * np.dtype("int32").itemsize),
            dtype="int32",
        ), )
    elif dtype.kind in "OU":
        data = None
        children = (
            build_column(
                data=Buffer(cupy.zeros(row_count + 1, dtype="int32")),
                dtype="int32",
            ),
            build_column(
                data=Buffer.empty(row_count * np.dtype("int8").itemsize),
                dtype="int8",
            ),
        )
    else:
        data = Buffer.empty(row_count * dtype.itemsize)

    if masked:
        mask = create_null_mask(row_count, state=MaskState.ALL_NULL)
    else:
        mask = None

    return build_column(data, dtype, mask=mask, children=children)
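Unlike Example #1, this variant zero-fills the offsets child (so the resulting strings are well-formed empties rather than uninitialized garbage) and asks libcudf for an all-null mask. A cudf-free sketch of why zeroed offsets matter:

    import numpy as np

    row_count = 3
    # n strings are described by n + 1 offsets; zeroed offsets unambiguously
    # encode row_count empty strings, whereas uninitialized offsets may
    # describe arbitrary extents.
    offsets = np.zeros(row_count + 1, dtype="int32")
    lengths = np.diff(offsets)  # array([0, 0, 0]) -> every string is empty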
Example #3
    def __init__(self, dtype, mask=None, offset=0, children=()):
        """
        Parameters
        ----------
        dtype : CategoricalDtype
        mask : Buffer
            The validity mask
        offset : int
            Data offset
        children : Tuple[Column]
            A single non-null column containing the categorical codes
            (the categories are carried by the dtype)
        """
        data = Buffer.empty(0)
        size = children[0].size
        if isinstance(dtype, pd.api.types.CategoricalDtype):
            dtype = CategoricalDtype.from_pandas(dtype)
        if not isinstance(dtype, CategoricalDtype):
            raise ValueError("dtype must be instance of CategoricalDtype")
        super().__init__(
            data,
            size=size,
            dtype=dtype,
            mask=mask,
            offset=offset,
            children=children,
        )
        self._codes = None
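The dtype-normalization step above converts a pandas categorical dtype into cudf's own. A hedged sketch (assuming cudf.core.dtypes.CategoricalDtype, the internal home of the class in releases of this era):

    import pandas as pd
    from cudf.core.dtypes import CategoricalDtype  # internal path; may vary

    pd_dtype = pd.CategoricalDtype(categories=["a", "b"], ordered=True)
    cudf_dtype = CategoricalDtype.from_pandas(pd_dtype)
    assert cudf_dtype.ordered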
Example #4
File: utils.py Project: zivzone/cudf
def buffers_from_pyarrow(pa_arr, dtype=None):
    """
    Given a pyarrow array, returns a 5-length tuple of:
        - size
        - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters
    """
    from cudf._libxx.null_mask import bitmask_allocation_size_bytes

    buffers = pa_arr.buffers()

    if pa_arr.null_count:
        mask_size = bitmask_allocation_size_bytes(len(pa_arr))
        pamask = pyarrow_buffer_to_cudf_buffer(buffers[0], mask_size=mask_size)
    else:
        pamask = None

    offset = pa_arr.offset
    size = len(pa_arr)

    if buffers[1]:
        padata = pyarrow_buffer_to_cudf_buffer(buffers[1])
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if isinstance(pa_arr, pa.StringArray):
        pastrs = pyarrow_buffer_to_cudf_buffer(buffers[2])
    return (size, offset, pamask, padata, pastrs)
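A hedged usage sketch, unpacking the 5-tuple for a string array (the helper itself is cudf-internal, so only the pyarrow side is guaranteed API):

    import pyarrow as pa

    arr = pa.array(["ab", None, "c"])
    size, offset, mask, data, chars = buffers_from_pyarrow(arr)
    # size == 3; mask is a cudf.Buffer because arr has a null;
    # chars is populated because arr is a pa.StringArray.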
Example #5
File: string.py Project: sriramch/cudf
    def __init__(self, mask=None, offset=0, children=()):
        """
        Parameters
        ----------
        mask : Buffer
            The validity mask
        offset : int
            Data offset
        children : Tuple[Column]
            Two non-null columns containing the string offsets and data
            respectively
        """

        data = Buffer.empty(0)
        dtype = np.dtype("object")

        if children[0].size == 0:
            size = 0
        else:
            # one less because the last element of offsets is the number of
            # bytes in the data buffer
            size = children[0].size - 1

        super().__init__(data, size, dtype, mask=mask, children=children)

        self._nvstrings = None
        self._nvcategory = None
        self._indices = None
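The size computation above follows the standard offsets layout: n strings are described by n + 1 offsets, the last of which points one past the final byte. An illustrative, cudf-free sketch:

    import numpy as np

    offsets = np.array([0, 2, 3], dtype="int32")  # encodes "ab", "c"
    chars = np.frombuffer(b"abc", dtype="int8")
    n_strings = offsets.size - 1  # 2, i.e. children[0].size - 1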
Example #6
def buffers_from_pyarrow(pa_arr, dtype=None):
    """
    Given a pyarrow array, returns a 5-length tuple of:
        - size
        - offset
        - cudf.Buffer --> mask
        - cudf.Buffer --> data
        - cudf.Buffer --> string characters
    """
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    if pa_arr.null_count:
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    offset = pa_arr.offset
    size = len(pa_arr)

    if dtype:
        data_dtype = dtype
    elif isinstance(pa_arr, pa.StringArray):
        data_dtype = np.int32
    else:
        if isinstance(pa_arr, pa.DictionaryArray):
            data_dtype = pa_arr.indices.type.to_pandas_dtype()
        else:
            data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        padata = Buffer(np.asarray(buffers[1]).view(data_dtype))
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if isinstance(pa_arr, pa.StringArray):
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (size, offset, pamask, padata, pastrs)
Example #7
    def write(self, content):
        from cudf.core.buffer import Buffer
        from cupy.cuda import MemoryPointer
        from cupy.cuda.memory import UnownedMemory

        if not self._has_write_headers:
            self._headers = headers = pickle.loads(content)
            buffer_types = headers["buffer_types"]
            for buffer_type, size in buffer_types:
                if buffer_type == "cuda":
                    self._buffers.append(Buffer.empty(size))
                else:
                    self._buffers.append(BytesIO())
            self._has_write_headers = True
            return

        cur_buf = self._buffers[self._cur_buffer_index]
        cur_buf_size = self._headers["buffer_types"][self._cur_buffer_index][1]
        if isinstance(cur_buf, Buffer):
            cur_cupy_memory = UnownedMemory(cur_buf.ptr, len(cur_buf), cur_buf)
            cupy_pointer = MemoryPointer(cur_cupy_memory, self._offset)

            if isinstance(content, bytes):
                content_length = len(content)
                source_mem = np.frombuffer(
                    content, dtype="uint8").ctypes.data_as(ctypes.c_void_p)
            else:
                source_mem = MemoryPointer(
                    UnownedMemory(content.ptr, len(content), content), 0)
                content_length = source_mem.mem.size
            cupy_pointer.copy_from(source_mem, content_length)
        else:
            content_length = len(content)
            cur_buf.write(content)
        if content_length + self._offset >= cur_buf_size:
            if isinstance(cur_buf, BytesIO):
                self._buffers[self._cur_buffer_index] = cur_buf.getvalue()
            self._cur_buffer_index += 1
            self._offset = 0
        else:
            self._offset += content_length
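The device-copy step above can be reproduced in isolation with public CuPy APIs; a hedged sketch in which a CuPy array stands in for the cudf Buffer destination:

    import ctypes
    import numpy as np
    import cupy
    from cupy.cuda import MemoryPointer
    from cupy.cuda.memory import UnownedMemory

    dst = cupy.zeros(16, dtype="uint8")                 # stand-in destination
    mem = UnownedMemory(dst.data.ptr, dst.nbytes, dst)  # wrap, without owning
    payload = b"hello"
    host = np.frombuffer(payload, dtype="uint8")
    src = host.ctypes.data_as(ctypes.c_void_p)
    MemoryPointer(mem, 0).copy_from(src, len(payload))  # host -> device copy
    assert bytes(dst[:5].get()) == payload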
Example #8
File: utils.py Project: wjsi/mars
async def read_buffers(header: Dict, reader: StreamReader):
    if cupy is not None and cudf is not None:
        from cudf.core.buffer import Buffer as CPBuffer
        from cupy.cuda.memory import (
            UnownedMemory as CPUnownedMemory,
            MemoryPointer as CPMemoryPointer,
        )
    else:
        CPBuffer = CPUnownedMemory = CPMemoryPointer = None

    # construct an empty cuda buffer and copy from host
    is_cuda_buffers = header.get("is_cuda_buffers")
    buffer_sizes = header.pop(BUFFER_SIZES_NAME)

    buffers = []
    for is_cuda_buffer, buf_size in zip(is_cuda_buffers, buffer_sizes):
        if is_cuda_buffer:  # pragma: no cover
            if buf_size == 0:
                content = await reader.readexactly(buf_size)
                buffers.append(content)
            else:
                cuda_buffer = CPBuffer.empty(buf_size)
                cupy_memory = CPUnownedMemory(cuda_buffer.ptr, buf_size,
                                              cuda_buffer)
                offset = 0
                chunk_size = CUDA_CHUNK_SIZE
                while offset < buf_size:
                    read_size = (chunk_size if (offset + chunk_size) < buf_size
                                 else buf_size - offset)
                    content = await reader.readexactly(read_size)
                    source_mem = np.frombuffer(
                        content, dtype="uint8").ctypes.data_as(ctypes.c_void_p)
                    cupy_pointer = CPMemoryPointer(cupy_memory, offset)
                    cupy_pointer.copy_from(source_mem, len(content))
                    offset += read_size
                buffers.append(cuda_buffer)
        else:
            buffers.append(await reader.readexactly(buf_size))
    return buffers
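Each iteration of the loop above reads a remainder-capped chunk; a tiny self-contained sketch of that arithmetic:

    def chunk_sizes(buf_size, chunk_size):
        # Mirrors the loop above: full chunks, then whatever remains.
        offset = 0
        while offset < buf_size:
            read_size = min(chunk_size, buf_size - offset)
            yield read_size
            offset += read_size

    assert list(chunk_sizes(10, 4)) == [4, 4, 2]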
Example #9
    def read(self, size: int):
        # we read the cuda_header first and then the cuda buffers one by one;
        # the return value's size may not exactly match the requested size.
        from cudf.core.buffer import Buffer
        from cupy.cuda import MemoryPointer
        from cupy.cuda.memory import UnownedMemory

        if not self._has_read_headers:
            self._has_read_headers = True
            return pickle.dumps(self._headers)
        if len(self._buffers) == 0:
            return ""
        cur_buf = self._buffers[0]
        # current buf read to end
        if cur_buf is None:
            # empty cuda buffer
            content = Buffer.empty(0)
            self._offset = 0
            self._buffers.pop(0)
            return content
        elif size >= cur_buf.size - self._offset:
            if isinstance(cur_buf, UnownedMemory):
                cupy_pointer = MemoryPointer(cur_buf, self._offset)
                content = Buffer(cupy_pointer.ptr,
                                 size=cur_buf.size - self._offset)
            else:
                content = cur_buf[self._offset:self._offset + size]
            self._offset = 0
            self._buffers.pop(0)
            return content
        else:
            if isinstance(cur_buf, UnownedMemory):
                cupy_pointer = MemoryPointer(cur_buf, self._offset)
                self._offset += size
                return Buffer(cupy_pointer.ptr, size=size)
            else:
                content = cur_buf[self._offset:self._offset + size]
                self._offset += size
                return content
Example #10
File: string.py Project: trevorsm7/cudf
    def as_numerical_column(self, dtype, **kwargs):

        mem_dtype = np.dtype(dtype)
        str_dtype = mem_dtype
        out_dtype = mem_dtype

        if mem_dtype.type in (np.int8, np.int16):
            mem_dtype = np.dtype(np.int32)
            str_dtype = mem_dtype
        elif mem_dtype.type is np.datetime64:
            kwargs.update(units=np.datetime_data(mem_dtype)[0])
            mem_dtype = np.dtype(np.int64)
            if "format" not in kwargs:
                if len(self.nvstrings) > 0:
                    # infer on host from the first not na element
                    fmt = pd.core.tools.datetimes._guess_datetime_format(
                        self[self.notna()][0])
                    kwargs.update(format=fmt)
            else:
                fmt = None

        out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
        kwargs.update({"devptr": out_ptr})

        _str_to_numeric_typecast_functions[str_dtype](self.nvstrings, **kwargs)

        out_col = column.as_column(out_arr)

        if self.has_nulls:
            mask_size = utils.calc_chunk_size(len(self.nvstrings),
                                              utils.mask_bitsize)
            out_mask = Buffer.empty(mask_size)
            out_mask_ptr = out_mask.ptr
            self.nvstrings.set_null_bitmask(out_mask_ptr, bdevmem=True)
            out_col = out_col.set_mask(out_mask)

        return out_col.astype(out_dtype)
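The mask allocated above holds one validity bit per string, rounded up to whole bytes. A hedged sketch of that arithmetic (this calc_chunk_size is a simplified stand-in for cudf.utils.utils.calc_chunk_size, which may additionally pad for alignment):

    def calc_chunk_size(size, bitsize=8):
        # Bytes needed to hold `size` validity bits.
        return (size + bitsize - 1) // bitsize

    assert calc_chunk_size(10) == 2  # 10 bits -> 2 bytes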
Example #11
File: utils.py Project: sriramch/cudf
def buffers_from_pyarrow(pa_arr, dtype=None):
    from cudf.core.buffer import Buffer
    from cudf.utils.cudautils import copy_array

    buffers = pa_arr.buffers()

    if pa_arr.null_count:
        mask_dev_array = make_mask(len(pa_arr))
        arrow_dev_array = rmm.to_device(np.asarray(buffers[0]).view("int8"))
        copy_array(arrow_dev_array, mask_dev_array)
        pamask = Buffer(mask_dev_array)
    else:
        pamask = None

    offset = pa_arr.offset
    size = pa_arr.offset + len(pa_arr)

    if dtype:
        data_dtype = dtype
    elif isinstance(pa_arr, pa.StringArray):
        data_dtype = np.int32
        size = size + 1  # extra element holds number of bytes
    else:
        if isinstance(pa_arr, pa.DictionaryArray):
            data_dtype = pa_arr.indices.type.to_pandas_dtype()
        else:
            data_dtype = pa_arr.type.to_pandas_dtype()

    if buffers[1]:
        padata = Buffer(
            np.asarray(buffers[1]).view(data_dtype)[offset:offset + size])
    else:
        padata = Buffer.empty(0)

    pastrs = None
    if isinstance(pa_arr, pa.StringArray):
        pastrs = Buffer(np.asarray(buffers[2]).view(np.int8))
    return (pamask, padata, pastrs)
Example #12
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError(f"dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")], dtype=data.dtype)),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)

        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )

        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            # Date64 is not yet supported; the construction below the raise
            # is unreachable.
            raise NotImplementedError
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.asarray(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
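A hedged usage sketch of the factory above (as_column is internal; in releases of this era it is importable as cudf.core.column.as_column):

    import numpy as np
    from cudf.core.column import as_column  # internal API; path may vary

    col = as_column(np.array([1.0, np.nan, 3.0]))       # NaN -> null by default
    scalar_col = as_column(7, length=4, dtype="int64")  # broadcast to 4 rows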
Example #13
    def _concat(cls, objs, dtype=None):
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Note that pure-null columns can always be cast
        not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
        if len(not_null_cols) > 0 and (
            len(
                [
                    o
                    for o in not_null_cols
                    if not isinstance(o, NumericalColumn)
                    or np.issubdtype(o.dtype, np.datetime64)
                ]
            )
            == 0
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            for i in range(len(objs)):
                objs[i] = objs[i].astype(common_dtype)

        # Find the first non-null column:
        head = objs[0]
        for i, obj in enumerate(objs):
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.core.column import column_empty_like

                    objs[i] = column_empty_like(
                        head, dtype=head.dtype, masked=True, newsize=len(obj)
                    )

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (
                Series(ColumnBase._concat([o.categories for o in objs]))
                .drop_duplicates()
                ._column
            )
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError(
                "Result of concat cannot have "
                "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
            )

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                    )
                )
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            data_dtype = head.codes.dtype
            data = None
            children = (column_empty(newsize, dtype=head.codes.dtype),)
        else:
            data_dtype = head.dtype
            data = Buffer.empty(size=newsize * data_dtype.itemsize)
            children = ()

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(
            data=data, dtype=head.dtype, mask=mask, children=children
        )

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
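The common-dtype step above leans on NumPy's promotion rules; a one-liner showing the behaviour it relies on:

    import numpy as np

    # int32 and float32 have no exact common type, so NumPy widens to float64.
    assert np.find_common_type([np.dtype("int32"), np.dtype("float32")], []) == np.float64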