示例#1
0
    def __init__(self, dtype, mask=None, size=None, offset=0, children=()):
        """
        Initialise the column from a categorical dtype and child columns.

        Parameters
        ----------
        dtype : CategoricalDtype
            A pandas ``CategoricalDtype`` is also accepted; it is converted
            with ``CategoricalDtype.from_pandas`` before validation.
        mask : Buffer
            The validity mask
        size : int, optional
            Number of rows. When omitted it is computed as the size of the
            first child minus ``offset``.
        offset : int
            Data offset
        children : Tuple[Column]
            Child columns; the first one is consulted to derive ``size``
            when ``size`` is not given.

        Raises
        ------
        ValueError
            If ``dtype`` is not a ``CategoricalDtype`` after conversion.
        """
        if size is None:
            # Derive the logical length from the first child column.
            size = children[0].size - offset

        if isinstance(dtype, pd.api.types.CategoricalDtype):
            dtype = CategoricalDtype.from_pandas(dtype)
        if not isinstance(dtype, CategoricalDtype):
            raise ValueError("dtype must be instance of CategoricalDtype")

        # A categorical column carries no data buffer of its own.
        base_kwargs = {
            "data": None,
            "size": size,
            "dtype": dtype,
            "mask": mask,
            "offset": offset,
            "children": children,
        }
        super().__init__(**base_kwargs)

        self._codes = None
示例#2
0
    def set_categories(self, new_categories, **kwargs):
        """Return a Series whose categories are *new_categories*.

        Keyword arguments read here are ``rename``, ``inplace`` and
        ``ordered``. ``rename`` is consumed; the remaining keyword
        arguments are forwarded to the helpers that are called.

        Raises
        ------
        ValueError
            If ``rename`` is truthy and *new_categories* does not have as
            many items as the current categories.
        """
        result = self._parent
        new_categories = column.as_column(new_categories)
        rename = kwargs.pop("rename", False)

        if rename:
            # pandas semantics for rename=True: the existing category
            # values are replaced one-for-one, so the lengths must match.
            if len(new_categories) != len(result.categories):
                raise ValueError("new_categories must have the same "
                                 "number of items as old categories")
            if kwargs.get("inplace", False):
                # Mutate the parent column directly.
                result.categories = new_categories
                ordered = kwargs.get("ordered", self.ordered)
                result._dtype = CategoricalDtype(
                    categories=column.as_column(new_categories),
                    ordered=ordered,
                )
            else:
                # Not in place: work on a replaced copy instead.
                result = result.replace(categories=new_categories, **kwargs)
        elif not self._categories_equal(new_categories, **kwargs):
            result = self._set_categories(new_categories, **kwargs)

        # A None result (the recoding helper may return one) means there
        # is nothing to wrap.
        if result is None:
            return None

        from cudf import Series

        return Series(data=result)
示例#3
0
def build_categorical_column(
    categories, codes, mask=None, size=None, offset=0, ordered=None
):
    """
    Assemble a categorical column from its categories and codes.

    Parameters
    ----------
    categories : Column
        Column of categories; stored on the resulting dtype
    codes : Column
        Column of codes; passed as the single child of the result
    mask : Buffer
        Null mask
    size : int, optional
        Forwarded to ``build_column``
    offset : int, optional
        Forwarded to ``build_column``
    ordered : bool
        Indicates whether the categories are ordered

    Returns
    -------
    The column produced by ``build_column`` for the categorical dtype.
    """
    cat_dtype = CategoricalDtype(
        categories=as_column(categories), ordered=ordered
    )
    codes_column = as_column(codes)
    # The codes travel as the only child; there is no data buffer.
    return build_column(
        data=None,
        dtype=cat_dtype,
        mask=mask,
        size=size,
        offset=offset,
        children=(codes_column,),
    )
示例#4
0
    def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
        """Rebuild a categorical column from a header and its frames.

        ``frames`` is laid out as: the dtype frames, then the data (codes)
        frames, then one mask frame when ``"mask"`` is present in
        ``header``. The frame counts are read from ``header``.
        """
        dtype_end = header["dtype_frames_count"]
        cat_dtype = CategoricalDtype.deserialize(header["dtype"],
                                                 frames[:dtype_end])
        data_end = dtype_end + header["data_frames_count"]

        # NOTE(review): pickle.loads executes arbitrary code if the header
        # comes from an untrusted source -- confirm headers are trusted.
        codes_header = header["data"]
        codes_type = pickle.loads(codes_header["type-serialized"])
        codes = codes_type.deserialize(codes_header,
                                       frames[dtype_end:data_end])

        if "mask" in header:
            # The mask frame directly follows the data frames.
            mask = Buffer.deserialize(header["mask"], [frames[data_end]])
        else:
            mask = None

        rebuilt = column.build_column(
            data=None,
            dtype=cat_dtype,
            mask=mask,
            children=(column.as_column(codes.base_data,
                                       dtype=codes.dtype), ),
        )
        return cast(CategoricalColumn, rebuilt)
示例#5
0
    def as_categorical_column(self, dtype, **kwargs):
        """Cast this column to the categorical ``dtype``.

        Returns ``self`` unchanged when ``dtype`` is the string
        ``"category"`` or a categorical dtype with neither categories nor
        ordering set. Otherwise the categories are replaced through
        ``set_categories``; if the current and requested categories are
        backed by different column types, the result of
        ``_create_empty_categorical_column`` is returned instead.
        """
        # The explicit str check matters: dtype objects may define their
        # own equality against strings.
        if isinstance(dtype, str) and dtype == "category":
            return self

        categorical_types = (
            cudf.core.dtypes.CategoricalDtype,
            pd.CategoricalDtype,
        )
        if isinstance(dtype, categorical_types):
            if dtype.categories is None and dtype.ordered is None:
                # Nothing specified to change.
                return self

        if isinstance(dtype, pd.CategoricalDtype):
            dtype = CategoricalDtype(
                categories=dtype.categories, ordered=dtype.ordered
            )

        if isinstance(self.categories, type(dtype.categories._values)):
            return self.cat().set_categories(
                new_categories=dtype.categories, ordered=dtype.ordered
            )

        # Category columns of different Column types cannot be matched,
        # so delegate to the helper that builds the null-filled result.
        return _create_empty_categorical_column(self, dtype)
示例#6
0
    def __init__(self, **kwargs):
        """
        Initialise the column from keyword arguments.

        Parameters
        ----------
        ordered : bool
            Required. Whether the categorical has a logical ordering
            (e.g. less than); coerced with ``bool``.
        categories : iterable, optional
            The categories. Defaults to an empty list, in which case an
            empty object-dtype column is used.

        All remaining keyword arguments are forwarded to the parent
        initialiser together with the computed ``dtype`` (presumably
        ``data``, ``mask`` and ``null_count`` -- confirm against the
        parent class).
        """
        ordered = bool(kwargs.pop("ordered"))
        raw_categories = kwargs.pop("categories", [])

        # Default to String dtype if there are no categories, like pandas
        if len(raw_categories) > 0:
            categories = column.as_column(raw_categories)
        else:
            categories = column.column_empty(
                0, np.dtype("object"), masked=False
            )

        kwargs["dtype"] = CategoricalDtype(
            categories=column.as_column(categories), ordered=ordered
        )
        super(CategoricalColumn, self).__init__(**kwargs)

        self._categories = categories
        self._ordered = ordered
示例#7
0
    def _set_categories(self, new_categories, **kwargs):
        """Recode the parent column against *new_categories*.

        Keyword arguments read here: ``is_unique`` (skip de-duplication of
        the new categories), ``ordered`` (defaults to ``self.ordered``)
        and ``inplace``.

        Returns
        -------
        A new column built from the recoded codes, or ``None`` when
        ``inplace`` is truthy (the parent column is updated instead).

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        old_cats = self._parent.categories
        target_cats = column.as_column(new_categories)

        # The new categories must be unique. drop_duplicates() is used
        # rather than unique() because it preserves order.
        if not kwargs.get("is_unique", False) and not target_cats.is_unique:
            target_cats = Series(target_cats).drop_duplicates()._column

        codes_now = self.codes
        row_order = cudautils.arange(len(codes_now))
        old_code_values = cudautils.arange(
            len(old_cats), dtype=codes_now.dtype
        )
        new_code_values = cudautils.arange(
            len(target_cats), dtype=codes_now.dtype
        )

        new_lookup = DataFrame(
            {"new_codes": new_code_values, "cats": target_cats}
        )
        old_lookup = DataFrame(
            {"old_codes": old_code_values, "cats": old_cats}
        )
        rows = DataFrame({"old_codes": codes_now, "order": row_order})

        # Left-join old categories to new ones: old categories missing
        # from the new set get no matching new code.
        code_map = old_lookup.merge(new_lookup, on="cats", how="left")
        # Left-join the current codes to that map to recode every row,
        # then restore the original row order.
        recoded = rows.merge(code_map, on="old_codes", how="left")
        recoded = recoded.sort_values(by="order").reset_index(True)

        ordered = kwargs.get("ordered", self.ordered)
        recoded_codes = recoded["new_codes"]._column
        new_dtype = CategoricalDtype(categories=target_cats, ordered=ordered)

        if not kwargs.get("inplace", False):
            return column.build_column(
                data=None,
                dtype=new_dtype,
                mask=recoded_codes.mask,
                children=(recoded_codes, ),
            )

        # In place: overwrite the parent's buffers and dtype.
        self._parent.data = None
        self._parent.mask = recoded_codes.mask
        self._parent.dtype = new_dtype
        self._parent.children = (recoded_codes, )
        return None
示例#8
0
    def deserialize(cls, header, frames):
        """Rebuild a categorical column from a header and its frames.

        ``frames`` holds the dtype frames first, then the data frames. A
        mask frame is taken from the next position when
        ``header["frame_count"]`` exceeds the number of dtype and data
        frames.
        """
        dtype_end = header["dtype_frames_count"]
        dtype = CategoricalDtype.deserialize(header["dtype"],
                                             frames[:dtype_end])
        data_end = dtype_end + header["data_frames_count"]

        # NOTE(review): pickle.loads executes arbitrary code if the header
        # comes from an untrusted source -- confirm headers are trusted.
        data_header = header["data"]
        data_type = pickle.loads(data_header["type"])
        codes = data_type.deserialize(data_header,
                                      frames[dtype_end:data_end])

        # Any frame beyond dtype + data is the mask.
        if header["frame_count"] > data_end:
            mask = Buffer(frames[data_end])
        else:
            mask = None

        return column.build_column(
            data=None,
            dtype=dtype,
            mask=mask,
            children=(codes, ),
        )
示例#9
0
 def categories(self, value):
     """Replace the categories on ``self.dtype``, keeping its ordering."""
     current_ordered = self.dtype.ordered
     self.dtype = CategoricalDtype(
         categories=value,
         ordered=current_ordered,
     )
示例#10
0
    def set_categories(
        self,
        new_categories,
        ordered=None,
        rename=False,
        inplace=False,
    ):
        """
        Set the categories to the specified new_categories.

        `new_categories` can include new categories (which
        will result in unused categories) or remove old categories
        (which results in values set to null). If `rename==True`,
        the current categories are simply replaced by
        `new_categories`, which must then have the same number of
        items as the current categories.

        This method can be used to perform more than one action
        of adding, removing, and reordering simultaneously and
        is therefore faster than performing the individual steps
        via the more specialised methods.

        On the other hand this method does not do checks
        (e.g., whether the old categories are included in the
        new categories on a reorder), which can result in
        surprising changes.

        Parameters
        ----------

        new_categories : list-like
            The categories in new order. If this converts to a
            categorical column, its categories are used.

        ordered : bool, default None
            Whether or not the categorical is treated as
            an ordered categorical. If not given, do
            not change the ordered information.

        rename : bool, default False
            Whether or not the `new_categories` should be
            considered as a rename of the old categories
            or as reordered categories.

        inplace : bool, default False
            Whether or not to reorder the categories in-place
            or return a copy of this categorical with
            reordered categories.

        Returns
        -------
        cat
            Categorical with reordered categories
            or None if inplace.

        Raises
        ------
        ValueError
            If `rename` is True and `new_categories` does not have
            the same number of items as the current categories.

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category')
        >>> s
        0     1
        1     1
        2     2
        3    10
        4     2
        5    10
        dtype: category
        Categories (3, int64): [1, 2, 10]
        >>> s.cat.set_categories([1, 10])
        0      1
        1      1
        2   null
        3     10
        4   null
        5     10
        dtype: category
        Categories (2, int64): [1, 10]
        >>> s.cat.set_categories([1, 10], inplace=True)
        >>> s
        0      1
        1      1
        2   null
        3     10
        4   null
        5     10
        dtype: category
        Categories (2, int64): [1, 10]
        """
        if ordered is None:
            ordered = self.ordered

        new_categories = column.as_column(new_categories)
        if isinstance(new_categories, CategoricalColumn):
            new_categories = new_categories.categories

        if rename:
            # pandas semantics for rename=True: the category values are
            # replaced one-for-one, so the lengths must match.
            if len(new_categories) != len(self._column.categories):
                raise ValueError("new_categories must have the same "
                                 "number of items as old categories")

            # Reuse the existing codes and mask under the new categories.
            out_col = column.build_categorical_column(
                categories=new_categories,
                codes=self._column.base_children[0],
                mask=self._column.base_mask,
                size=self._column.size,
                offset=self._column.offset,
                ordered=ordered,
            )
        elif type(self._column.categories) is not type(new_categories):
            # If both categories are of different Column types,
            # return a column full of Nulls.
            out_col = _create_empty_categorical_column(
                self._column,
                CategoricalDtype(categories=new_categories,
                                 ordered=ordered),
            )
        elif not (self._categories_equal(new_categories, ordered=ordered)
                  and self.ordered == ordered):
            # Either the categories or the ordering flag changed.
            out_col = self._set_categories(
                self._column.categories,
                new_categories,
                ordered=ordered,
            )
        else:
            # Nothing to change: hand the current column through.
            out_col = self._column

        return self._return_or_inplace(out_col, inplace=inplace)
示例#11
0
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given
        dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of
        the given length.

    Returns
    -------
    A Column of the appropriate type and size.

    Raises
    ------
    MemoryError
        If an nvstrings input holds more bytes than
        ``libcudfxx.MAX_STRING_COLUMN_BYTES``.
    TypeError
        If `arbitrary` is a ``Buffer`` and `dtype` is None.
    NotImplementedError
        If `arbitrary` is a ``pyarrow.Date64Array``.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays)
    * Objects exposing ``__array_interface__``(e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """

    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        # Already a column: at most a cast is needed.
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        # Character buffer and int32 offsets buffer (one more entry than
        # there are strings).
        sbuf = Buffer.empty(byte_count)
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)

        elif data.dtype.kind == "M":
            # Replace NaT with a null to obtain the validity mask for the
            # datetime column.
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(np.array([np.datetime64("NaT")], dtype=data.dtype)),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        # The dtype comes from the interface description; the `dtype`
        # argument is not consulted on this path.
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)

        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )

        elif isinstance(arbitrary, pa.NullArray):
            # Check for the "no dtype requested" sentinels before asking
            # pandas to parse `dtype`: "empty" is not a dtype pandas
            # understands and would raise TypeError.
            if dtype is None or (isinstance(dtype, str) and dtype == "empty"):
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )
            else:
                new_dtype = pd.api.types.pandas_dtype(dtype)

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            # Date64 input is not supported.
            raise NotImplementedError
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert each chunk, then concatenate them into one column.
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.asarray(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        # Last resort: try the buffer protocol first, then pyarrow, then
        # pandas / numpy conversions.
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Handled by the pandas fallback below.
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
示例#12
0
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in ``objs`` into a single column.

        Parameters
        ----------
        objs : list of columns
            Columns to concatenate. The list is not modified.
        dtype : optional
            Only consulted when ``objs`` is empty, to pick the dtype of
            the empty result.

        Returns
        -------
        The concatenated column.

        Raises
        ------
        ValueError
            If the columns do not share a dtype after all-null columns
            have been re-typed.
        MemoryError
            If the result would exceed ``libcudfxx.MAX_COLUMN_SIZE`` rows
            or, for string columns,
            ``libcudfxx.MAX_STRING_COLUMN_BYTES`` bytes.
        """
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
            column_empty_like,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # Work on a private copy: entries are replaced below and the
        # caller's list must not be altered as a side effect.
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if len(o) != o.null_count]
        if len(not_null_cols) > 0 and all(
            isinstance(o, NumericalColumn)
            and not np.issubdtype(o.dtype, np.datetime64)
            for o in not_null_cols
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            # NOTE(review): np.find_common_type is deprecated in recent
            # NumPy releases; kept to preserve promotion semantics.
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column (fall back to the first column):
        head = objs[0]
        for obj in objs:
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    objs[i] = column_empty_like(
                        head, dtype=head.dtype, masked=True, newsize=len(obj)
                    )

        # Handle categories for categoricals: recode every column against
        # the de-duplicated union of all categories.
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (
                Series(ColumnBase._concat([o.categories for o in objs]))
                .drop_duplicates()
                ._column
            )
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError(
                "Result of concat cannot have "
                "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
            )

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                    )
                )
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            # Categoricals have no data buffer; the codes are a child.
            data_dtype = head.codes.dtype
            data = None
            children = (column_empty(newsize, dtype=head.codes.dtype),)
        else:
            data_dtype = head.dtype
            data = Buffer.empty(size=newsize * data_dtype.itemsize)
            children = ()

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(
            data=data, dtype=head.dtype, mask=mask, children=children
        )

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
示例#13
0
文件: column.py 项目: zivzone/cudf
    def _concat(cls, objs, dtype=None):
        """Concatenate the columns in ``objs`` into a single column.

        Parameters
        ----------
        objs : list of columns
            Columns to concatenate. The list is not modified.
        dtype : optional
            Only consulted when ``objs`` is empty, to pick the dtype of
            the empty result.

        Returns
        -------
        The concatenated column. When every input is categorical the
        result is rebuilt as a categorical column over the combined,
        de-duplicated categories.

        Raises
        ------
        ValueError
            If a column holding valid values has a dtype different from
            the first column holding valid values.
        MemoryError
            If the result would exceed ``libcudfxx.MAX_COLUMN_SIZE`` rows.
        """

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # Work on a private copy: entries are replaced below and the
        # caller's list must not be altered as a side effect.
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if o.valid_count > 0]
        if len(not_null_cols) > 0 and all(
            is_numerical_dtype(o.dtype)
            and not np.issubdtype(o.dtype, np.datetime64)
            for o in not_null_cols
        ):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            # NOTE(review): np.find_common_type is deprecated in recent
            # NumPy releases; kept to preserve promotion semantics.
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column (fall back to the first column):
        head = objs[0]
        for obj in objs:
            if obj.valid_count > 0:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if obj.valid_count == 0:
                    objs[i] = column_empty_like(head,
                                                dtype=head.dtype,
                                                masked=True,
                                                newsize=len(obj))
                else:
                    raise ValueError("All columns must be the same type")

        cats = None
        is_categorical = all(is_categorical_dtype(o.dtype) for o in objs)

        # Combine CategoricalColumn categories
        if is_categorical:
            # Combine and de-dupe the categories
            cats = (cudf.concat([o.cat().categories for o in objs
                                 ]).to_series().drop_duplicates()._column)
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]
            # Map `objs` into a list of the codes until we port Categorical to
            # use the libcudf++ Category data type.
            objs = [o.cat().codes._column for o in objs]
            head = head.cat().codes._column

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError("Result of concat cannot have "
                              "size > {}".format(
                                  libcudfxx.MAX_COLUMN_SIZE_STR))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudfxx.concat.concat_columns(objs)
        else:
            col = column_empty(0, head.dtype, masked=True)

        if is_categorical:
            # Wrap the concatenated codes back into a categorical column.
            col = build_categorical_column(
                categories=cats,
                codes=as_column(col.base_data, dtype=col.dtype),
                mask=col.base_mask,
                size=col.size,
                offset=col.offset,
            )

        return col