예제 #1
0
파일: column.py 프로젝트: zhouyonglong/cudf
    def _concat(cls, objs):
        """Concatenate a sequence of columns into a single column.

        Parameters
        ----------
        objs : sequence of columns
            Columns to join, in order.  Every entry must be type-equivalent
            (per ``is_type_equivalent``) to the first entry.

        Returns
        -------
        The column produced by ``_gdf._column_concat`` when the combined
        length is positive; otherwise the freshly allocated, zero-length
        column derived from the first input via ``replace``.

        Raises
        ------
        ValueError
            If any column is not type-equivalent to the first column.
        """
        first = objs[0]
        if any(not o.is_type_equivalent(first) for o in objs):
            raise ValueError("All series must be of same type")

        # Zero-length inputs contribute nothing, so drop them up front
        non_empty = [o for o in objs if len(o) > 0]
        has_nulls = any(o.null_count for o in non_empty)
        total_len = sum(len(o) for o in non_empty)

        out_mem = rmm.device_array(shape=total_len, dtype=first.data.dtype)
        out_data = Buffer.from_empty(out_mem, size=total_len)

        # A mask is only allocated when at least one input reports nulls
        out_mask = Buffer(utils.make_mask(total_len)) if has_nulls else None

        # null_count is deliberately not passed: libcudf calculates it
        result = first.replace(data=out_data, mask=out_mask)

        # Perform the actual concatenation (skipped when there is no data)
        if total_len > 0:
            return _gdf._column_concat(non_empty, result)
        return result
예제 #2
0
def column_empty_like(column, dtype, masked):
    """Allocate a new column with the same length as *column*.

    Parameters
    ----------
    column : column-like
        Only its length is used, to size the new allocation.
    dtype : dtype-like
        Element type of the newly allocated device array.
    masked : bool
        When true, a mask built by ``utils.make_mask`` is attached and
        ``null_count`` is set to the full size of the column.

    Returns
    -------
    Column
        A column wrapping the newly allocated (uninitialised) device array.
    """
    device_mem = rmm.device_array(shape=len(column), dtype=dtype)
    kwargs = {"data": Buffer(device_mem)}
    if masked:
        bitmask = utils.make_mask(device_mem.size)
        kwargs["mask"] = Buffer(bitmask)
        # Every entry is counted as null in the masked case
        kwargs["null_count"] = device_mem.size
    return Column(**kwargs)
예제 #3
0
def compact_mask_bytes(boolbytes):
    """Convert booleans (stored one per byte) into a bitmask.

    A mask sized for ``boolbytes.size`` entries is allocated with
    ``make_mask``, filled with zeros, and then populated by the
    ``gpu_compact_mask_bytes`` kernel.  An empty mask is returned as-is
    without launching any kernel.
    """
    packed = make_mask(boolbytes.size)
    if packed.size <= 0:
        return packed

    # Clear the mask first, then compact the boolean bytes into it
    gpu_fill_value.forall(packed.size)(packed, 0)
    gpu_compact_mask_bytes.forall(packed.size)(boolbytes, packed)
    return packed
예제 #4
0
    def _concat(cls, objs, dtype=None):
        """Concatenate a sequence of columns into a single column.

        Parameters
        ----------
        objs : sequence of columns
            Columns to join, in order.
        dtype : dtype-like, optional
            Only consulted when ``objs`` is empty, to choose the kind of
            empty column that is returned.

        Returns
        -------
        An empty column of the requested dtype when ``objs`` is empty, a
        ``StringColumn`` when every input is a ``StringColumn``, and
        otherwise the result of ``_gdf._column_concat`` (or the freshly
        allocated column when the combined length is zero).

        Raises
        ------
        ValueError
            If any column is not type-equivalent to the first column.
        """
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn

        # No inputs: hand back an empty column matching the requested dtype
        if len(objs) == 0:
            if pd.api.types.is_categorical_dtype(dtype):
                empty_codes = Column(Buffer.null(np.dtype('int8')))
                return CategoricalColumn(data=empty_codes,
                                         null_count=0,
                                         ordered=False)
            if dtype == np.dtype('object'):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            return Column(Buffer.null(np.dtype(dtype)))

        # String columns are joined through nvstrings directly
        if all(isinstance(o, StringColumn) for o in objs):
            string_data = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*string_data))

        # Categoricals are first re-coded against one shared category set
        if all(isinstance(o, CategoricalColumn) for o in objs):
            merged_cats = tuple({val for o in objs for val in o})
            objs = [o.cat()._set_categories(merged_cats) for o in objs]

        first = objs[0]
        if any(not o.is_type_equivalent(first) for o in objs):
            raise ValueError("All series must be of same type")

        # Zero-length inputs contribute nothing, so drop them up front
        non_empty = [o for o in objs if len(o) > 0]
        null_total = sum(o.null_count for o in non_empty)
        total_len = sum(len(o) for o in non_empty)

        out_mem = rmm.device_array(shape=total_len, dtype=first.data.dtype)
        out_data = Buffer.from_empty(out_mem, size=total_len)

        # A mask is only allocated when at least one input reports nulls
        out_mask = Buffer(utils.make_mask(total_len)) if null_total else None

        result = first.replace(data=out_data, mask=out_mask,
                               null_count=null_total)

        # Perform the actual concatenation (skipped when there is no data)
        if total_len > 0:
            return _gdf._column_concat(non_empty, result)
        return result
예제 #5
0
def make_empty_mask(size):
    """Allocate a mask for *size* entries and fill it with zeros.

    The mask comes from ``make_mask``; when it has no storage at all the
    fill kernel is skipped and the mask is returned untouched.
    """
    mask = make_mask(size)
    if mask.size <= 0:
        return mask

    # Clear every word of the mask on the device
    gpu_fill_value.forall(mask.size)(mask, 0)
    return mask
예제 #6
0
def mask_from_devary(ary):
    """Build a mask from the device array *ary*.

    A mask sized for ``len(ary)`` entries is allocated with ``make_mask``,
    zero-filled, and then populated by the ``gpu_mask_from_devary`` kernel.
    An empty mask is returned without launching any kernel.
    """
    mask = make_mask(len(ary))
    if mask.size <= 0:
        return mask

    # Clear the mask before the kernel writes into it
    gpu_fill_value.forall(mask.size)(mask, 0)
    # NOTE(review): which entries of ``ary`` set a bit is decided by the
    # kernel, which is not visible here - confirm against its definition
    gpu_mask_from_devary.forall(mask.size)(ary, mask)
    return mask
예제 #7
0
    def _concat(cls, objs, dtype=None):
        """Concatenate a sequence of columns into a single column.

        Parameters
        ----------
        objs : sequence of columns
            Columns to join, in order.  The sequence passed in is never
            modified: a private list copy is taken before any casting, so
            tuples and other immutable sequences are accepted too.
        dtype : dtype-like, optional
            Only consulted when ``objs`` is empty, to choose the dtype of
            the empty result.

        Returns
        -------
        An empty masked column when ``objs`` is empty, the result of
        ``as_column(nvstrings.from_strings(...))`` when every input is a
        ``StringColumn``, and otherwise the result of
        ``libcudf.concat._column_concat`` (or the freshly built column when
        the combined length is zero).

        Raises
        ------
        ValueError
            If, after the casting steps, the columns do not all share the
            dtype of the first column.
        MemoryError
            If the combined length exceeds ``libcudfxx.MAX_COLUMN_SIZE``
            or, for string columns, the combined byte count exceeds
            ``libcudfxx.MAX_STRING_COLUMN_BYTES``.
        """
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # Work on a private list: the casting steps below replace entries
        # by index, which would otherwise overwrite the caller's list and
        # raise TypeError outright when a tuple is passed in.
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if len(o) != o.null_count]
        non_castable = [
            o
            for o in not_null_cols
            if not isinstance(o, NumericalColumn)
            or np.issubdtype(o.dtype, np.datetime64)
        ]
        if len(not_null_cols) > 0 and len(non_castable) == 0:
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column (fall back to the first column):
        head = objs[0]
        for obj in objs:
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(obj.dtype, head.dtype):
                # if all null, replace with an empty masked column of the
                # appropriate dtype and the same length
                if len(obj) == obj.null_count:
                    from cudf.core.column import column_empty_like

                    objs[i] = column_empty_like(
                        head, dtype=head.dtype, masked=True, newsize=len(obj)
                    )

        # Handle categories for categoricals: re-code every input against
        # the de-duplicated union of all categories
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (
                Series(ColumnBase._concat([o.categories for o in objs]))
                .drop_duplicates()
                ._column
            )
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        # Any mismatch that survived the casting steps is an error
        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError(
                "Result of concat cannot have "
                "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
            )

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                    )
                )
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            # Categorical output has no data buffer of its own; the codes
            # are supplied as a child column instead
            data = None
            children = (column_empty(newsize, dtype=head.codes.dtype),)
        else:
            data = Buffer.empty(size=newsize * head.dtype.itemsize)
            children = ()

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(
            data=data, dtype=head.dtype, mask=mask, children=children
        )

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
예제 #8
0
    def _concat(cls, objs, dtype=None):
        """Concatenate a sequence of columns into a single column.

        Parameters
        ----------
        objs : sequence of columns
            Columns to join, in order.  The sequence passed in is never
            modified: a private list copy is taken before any casting, so
            tuples and other immutable sequences are accepted too.
        dtype : dtype-like, optional
            Only consulted when ``objs`` is empty, to choose the kind of
            empty column that is returned.

        Returns
        -------
        An empty column of the requested dtype when ``objs`` is empty, a
        ``StringColumn`` when every input is a ``StringColumn``, and
        otherwise the result of ``_column_concat`` (or the freshly
        allocated column when the combined length is zero).

        Raises
        ------
        ValueError
            If, after the casting steps, the columns are not all
            type-equivalent to the first column.
        """
        from cudf.dataframe.series import Series
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn
        from cudf.dataframe.numerical import NumericalColumn

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if dtype.type in (np.object_, np.str_):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            elif is_categorical_dtype(dtype):
                return CategoricalColumn(
                    data=Column(Buffer.null(np.dtype("int8"))),
                    null_count=0,
                    ordered=False,
                )
            else:
                return Column(Buffer.null(dtype))

        # Work on a private list: the casting steps below replace entries
        # by index, which would otherwise overwrite the caller's list and
        # raise TypeError outright when a tuple is passed in.
        objs = list(objs)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Notice, we can always cast pure null columns
        not_null_cols = [o for o in objs if len(o) != o.null_count]
        non_castable = [
            o for o in not_null_cols
            if not isinstance(o, NumericalColumn)
            or np.issubdtype(o.dtype, np.datetime64)
        ]
        if len(not_null_cols) > 0 and len(non_castable) == 0:
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            objs = [o.astype(common_dtype) for o in objs]

        # Find the first non-null column (fall back to the first column):
        head = objs[0]
        for obj in objs:
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not obj.is_type_equivalent(head):
                # if all null, replace with an empty masked column of the
                # appropriate dtype and the same length
                if len(obj) == obj.null_count:
                    from cudf.dataframe.columnops import column_empty_like

                    objs[i] = column_empty_like(head,
                                                dtype=head.dtype,
                                                masked=True,
                                                newsize=len(obj))

        # Handle categories for categoricals: re-code every input against
        # the de-duplicated union of all categories
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (Series(Column._concat([o.categories for o in objs
                                           ])).drop_duplicates()._column)
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        # Any mismatch that survived the casting steps is an error
        head = objs[0]
        for obj in objs:
            if not (obj.is_type_equivalent(head)):
                raise ValueError("All series must be of same type")

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            objs = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = sum(o.null_count for o in objs)
        newsize = sum(map(len, objs))
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # Allocate output mask only if there's nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation
        if newsize > 0:
            col = _column_concat(objs, col)

        return col