Example #1
    def __init__(self, values, name=None):
        if isinstance(values, pd.Series) and \
                pd.api.types.is_categorical_dtype(values.dtype):
            values = CategoricalColumn(
                data=Buffer(values.cat.codes.values),
                categories=values.cat.categories.tolist(),
                ordered=values.cat.ordered)
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(data=Buffer(values.codes),
                                       categories=values.categories.tolist(),
                                       ordered=values.ordered)

        self._values = values
        self.name = name
        self.names = [name]
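
The branch above leans entirely on pandas' categorical accessors. A minimal pandas-only sketch of the three attributes it extracts (no cuDF required):

import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")
codes = s.cat.codes.values              # int8 ndarray of codes: [0, 1, 0]
categories = s.cat.categories.tolist()  # ['a', 'b']
ordered = s.cat.ordered                 # False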
Example #2
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            nelem = abs(key_stop - key_start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                key = columnops.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                data = rmm.device_array(nelem, dtype="int8")
                fill_value(data, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(data),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)

        if isinstance(key, slice):
            out = cpp_copying.apply_copy_range(self, value, key_start,
                                               key_stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
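
A rough usage sketch of the three paths this method handles, assuming a cuDF build contemporaneous with this snippet (the 0.x `cudf.dataframe` layout); the comments map each statement to the branch it exercises:

import cudf

s = cudf.Series([1, 2, 3, 4])
s[1:3] = 0     # slice path: apply_copy_range over [start, stop)
s[s == 0] = 9  # boolean-mask path: mask becomes gather indices, then scatter
s[0] = None    # scalar None: broadcast as a masked (all-null) column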
Example #3
    def __init__(self, values, name=None):
        if isinstance(values, pd.Series) and \
                pd.api.types.is_categorical_dtype(values.dtype):
            values = CategoricalColumn(
                data=Buffer(values.cat.codes.values),
                categories=values.cat.categories.tolist(),
                ordered=values.cat.ordered)
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(data=Buffer(values.codes),
                                       categories=values.categories.tolist(),
                                       ordered=values.ordered)
        elif isinstance(values, (list, tuple)):
            values = columnops.as_column(
                pd.Categorical(values, categories=values))

        assert values.null_count == 0
        self._values = values
        self.name = name
        self.names = [name]
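
The list/tuple branch builds the categories from the input itself via pd.Categorical(values, categories=values); note that pandas requires categories to be unique, so duplicate input values raise a ValueError. A pandas-only illustration:

import pandas as pd

vals = ["x", "y", "z"]
cat = pd.Categorical(vals, categories=vals)  # categories keep input order
print(cat.codes)  # [0 1 2]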
Example #4
File: index.py Project: zeichuan/cudf
    def __init__(self, values, **kwargs):
        kwargs = _setdefault_name(values, kwargs)
        if isinstance(values, CategoricalColumn):
            values = values
        elif isinstance(values, pd.Series) and (is_categorical_dtype(
                values.dtype)):
            values = CategoricalColumn(
                data=Buffer(values.cat.codes.values),
                categories=values.cat.categories,
                ordered=values.cat.ordered,
            )
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(
                data=Buffer(values.codes),
                categories=values.categories,
                ordered=values.ordered,
            )
        elif isinstance(values, (list, tuple)):
            values = columnops.as_column(
                pd.Categorical(values, categories=values))
        super(CategoricalIndex, self).__init__(values, **kwargs)
        assert self._values.null_count == 0
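
This variant passes the pandas attributes through without tolist(); the trio it consumes is the same, e.g. for a pd.CategoricalIndex:

import pandas as pd

idx = pd.CategoricalIndex(["a", "b", "a"], ordered=True)
print(idx.codes)       # [0 1 0]
print(idx.categories)  # Index(['a', 'b'], dtype='object')
print(idx.ordered)     # True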
Example #5
    def _concat(cls, objs, dtype=None):
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn

        if len(objs) == 0:
            if pd.api.types.is_categorical_dtype(dtype):
                return CategoricalColumn(
                    data=Column(Buffer.null(np.dtype('int8'))),
                    null_count=0,
                    ordered=False)
            elif dtype == np.dtype('object'):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            else:
                dtype = np.dtype(dtype)
                return Column(Buffer.null(dtype))

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            objs = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*objs))

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            new_cats = tuple(set([val for o in objs for val in o]))
            objs = [o.cat()._set_categories(new_cats) for o in objs]

        head = objs[0]
        for o in objs:
            if not o.is_type_equivalent(head):
                raise ValueError("All series must be of same type")
        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = sum(o.null_count for o in objs)
        newsize = sum(map(len, objs))
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # Allocate the output mask only if there are nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation
        if newsize > 0:
            col = _gdf._column_concat(objs, col)

        return col
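
The category-handling step unions categories across all inputs before concatenating. pandas exposes the analogous operation, which is a handy reference for the expected result:

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["x", "y"])
b = pd.Categorical(["y", "z"])
print(union_categoricals([a, b]).categories.tolist())  # ['x', 'y', 'z']

Note that the snippet's tuple(set(...)) makes the category order nondeterministic, whereas union_categoricals preserves order of first appearance.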
Example #6
File: general.py Project: Quansight/pygdf
def melt(frame, id_vars=None, value_vars=None, var_name='variable',
         value_name='value'):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: frame.columns.name or 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have a multi-index

    Examples
    --------

    .. code-block:: python

        import cudf
        import numpy as np

        df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
                             'B': {0: 1, 1: 3, 2: 6},
                             'C': {0: 1.0, 1: np.nan, 2: 4.0},
                             'D': {0: 2.0, 1: 5.0, 2: 6.0}})
        df2 = cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
        print(df2)

    Output:

    .. code-block:: python

             A    B variable value
        0    1    1        C   1.0
        1    1    3        C
        2    5    6        C   4.0
        3    1    1        D   2.0
        4    1    3        D   5.0
        5    5    6        D   6.0
    """

    # Arg cleaning
    import collections
    # id_vars
    if id_vars is not None:
        if not isinstance(id_vars, collections.abc.Sequence):
            id_vars = [id_vars]
        id_vars = list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'id_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing)))
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        if not isinstance(value_vars, collections.abc.Sequence):
            value_vars = [value_vars]
        value_vars = list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError(
                "The following 'value_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing)))
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(pd.api.types.is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError('Categorical columns are not yet '
                                  'supported for this function')

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError('all cols in value_vars must have the same dtype')

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(Series(Buffer(
            cudautils.full(size=N, value=i, dtype=np.int8))))
    temp = Series._concat(objs=var_cols, index=None)
    mdata[var_name] = Series(CategoricalColumn(
        categories=tuple(value_vars), data=temp._column.data, ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars],
        index=None)

    return DataFrame(mdata)
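
The three steps reduce to tile-and-repeat. A NumPy sketch of the shapes involved, with N rows and K value_vars; Step 3 is then a plain concatenation of the value_vars columns:

import numpy as np

N, K = 3, 2
A = np.array([1, 1, 5])                            # one id_var column
tiled = np.tile(A, K)                              # Step 1: [1 1 5 1 1 5]
codes = np.repeat(np.arange(K, dtype=np.int8), N)  # Step 2: [0 0 0 1 1 1]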
Example #7
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Indicates whether this is the first aggregation being run, in
            which case the additional (grouped-by) columns' values should
            also be added to the result.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of *str*
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of *str*
            The list of column names that the aggregation results should be
            output into.
        """
        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        need_to_index = False

        col_count = 0
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # assuming here that, if there are multiple aggregations, the
            # aggregated results will be in the same order for the GDF_SORT
            # method
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(rmm.device_array(col_agg.size, dtype=np.int32)))
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            out_col_values_series = [Series(Buffer(rmm.device_array(
                col_agg.size,
                dtype=self._df[self._by[i]]._column.data.dtype)))
                for i in range(0, ncols)]
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(rmm.device_array(col_agg.size, dtype=np.int64)))
            else:
                out_col_agg_series = Series(Buffer(rmm.device_array(
                    col_agg.size, dtype=self._df[val_col]._column.data.dtype)))

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")
            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if err is not None:
                print(err)
                raise RuntimeError(err)

            num_row_results = out_col_agg.size

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered)

            out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index()

            result[val_columns_out[col_count]] = \
                out_col_agg_series[:num_row_results]

            first_run = False
            col_count = col_count + 1

        return result
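
For orientation, a sketch of how this internal path is reached from user code, assuming the groupby API of the same era (the dispatch through _apply_agg is an inference from the snippet, not shown here):

import cudf

df = cudf.DataFrame({'k': [0, 0, 1], 'v': [1.0, 2.0, 3.0]})
out = df.groupby('k').mean()  # aggregation funnels into _apply_agg('mean', ...)
print(out)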
Example #8
    def _apply_agg(self, agg_type, result, add_col_values,
                   ctx, val_columns, val_columns_out, sort_result=True):
        """
        Parameters
        ----------
        agg_type : str
            The aggregation function to run.
        result : DataFrame
            The DataFrame to store the result of the aggregation into.
        add_col_values : bool
            Indicates whether this is the first aggregation being run, in
            which case the additional (grouped-by) columns' values should
            also be added to the result.
        ctx : gdf_context cffi object
            Context object to pass information such as if the dataframe
            is sorted and/or which method to use for grouping.
        val_columns : list of *str*
            The list of column names that the aggregation should be performed
            on.
        val_columns_out : list of *str*
            The list of column names that the aggregation results should be
            output into.
        """

        if sort_result:
            ctx.flag_sort_result = 1

        ncols = len(self._by)
        cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

        first_run = add_col_values
        need_to_index = self._as_index

        col_count = 0
        if isinstance(val_columns, (str, Number)):
            val_columns = [val_columns]
        for val_col in val_columns:
            col_agg = self._df[val_col]._column.cffi_view

            # assuming here that, if there are multiple aggregations, the
            # aggregated results will be in the same order for the GDF_SORT
            # method
            if need_to_index:
                out_col_indices_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int32
                        )
                    )
                )
                out_col_indices = out_col_indices_series._column.cffi_view
            else:
                out_col_indices = ffi.NULL

            out_col_values_series = []
            for i in range(0, ncols):
                if self._df[self._by[i]].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    col = Series([''], dtype='str')[gather_map]\
                        .reset_index(drop=True)
                else:
                    col = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[self._by[i]]._column.data.dtype
                            )
                        )
                    )
                out_col_values_series.append(col)
            out_col_values = [
                out_col_values_series[i]._column.cffi_view
                for i in range(0, ncols)]

            if agg_type == "count":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.int64
                        )
                    )
                )
            elif agg_type == "mean":
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=np.float64
                        )
                    )
                )
            else:
                if self._df[val_col].dtype == np.dtype('object'):
                    # This isn't ideal, but no better way to create an
                    # nvstrings object of correct size
                    gather_map = zeros(col_agg.size, dtype='int32')
                    out_col_agg_series = Series(
                        [''],
                        dtype='str'
                    )[gather_map].reset_index(drop=True)
                else:
                    out_col_agg_series = Series(
                        Buffer(
                            rmm.device_array(
                                col_agg.size,
                                dtype=self._df[val_col]._column.data.dtype
                            )
                        )
                    )

            out_col_agg = out_col_agg_series._column.cffi_view

            agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
            if agg_func is None:
                raise RuntimeError(
                    "ERROR: this aggregator has not been implemented yet")

            err = agg_func(
                ncols,
                cols,
                col_agg,
                out_col_indices,
                out_col_values,
                out_col_agg,
                ctx)

            if err is not None:
                raise RuntimeError(err)

            num_row_results = out_col_agg.size

            # NVStrings columns are not the same going in as coming out, but
            # we can't create entire CFFI views; otherwise multiple objects
            # would try to free the memory
            for i, col in enumerate(out_col_values_series):
                if col.dtype == np.dtype("object") and len(col) > 0:
                    import nvcategory
                    nvcat_ptr = int(
                        ffi.cast(
                            "uintptr_t",
                            out_col_values[i].dtype_info.category
                        )
                    )
                    nvcat_obj = None
                    if nvcat_ptr:
                        nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                        nvstr_obj = nvcat_obj.to_strings()
                    else:
                        import nvstrings
                        nvstr_obj = nvstrings.to_device([])
                    out_col_values_series[i]._column._data = nvstr_obj
                    out_col_values_series[i]._column._nvcategory = nvcat_obj
            if out_col_agg_series.dtype == np.dtype("object") and \
                    len(out_col_agg_series) > 0:
                import nvcategory
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_agg.dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_agg_series._column._data = nvstr_obj
                out_col_agg_series._column._nvcategory = nvcat_obj

            if first_run:
                for i, thisBy in enumerate(self._by):
                    result[thisBy] = out_col_values_series[i][
                        :num_row_results]

                    if is_categorical_dtype(self._df[thisBy].dtype):
                        result[thisBy] = CategoricalColumn(
                            data=result[thisBy].data,
                            categories=self._df[thisBy].cat.categories,
                            ordered=self._df[thisBy].cat.ordered
                        )

            if out_col_agg_series.dtype != np.dtype("object"):
                out_col_agg_series.data.size = num_row_results
            out_col_agg_series = out_col_agg_series.reset_index(drop=True)

            if isinstance(val_columns_out, (str, Number)):
                result[val_columns_out] = out_col_agg_series[:num_row_results]
            else:
                result[val_columns_out[col_count]] = \
                    out_col_agg_series[:num_row_results]

            first_run = False
            col_count = col_count + 1

        return result
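
The zero-filled gather_map trick above is worth isolating: nvstrings buffers cannot be pre-allocated directly, so gathering a one-element string Series with an all-zero index materializes a string column of the required length. A minimal sketch, assuming the same-era indexing behavior:

import numpy as np
import cudf

size = 5
gather_map = np.zeros(size, dtype='int32')
placeholder = cudf.Series([''], dtype='str')[gather_map].reset_index(drop=True)
assert len(placeholder) == size  # five copies of the empty string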
Example #9
    def _concat(cls, objs, dtype=None):
        from cudf.dataframe.series import Series
        from cudf.dataframe.string import StringColumn
        from cudf.dataframe.categorical import CategoricalColumn
        from cudf.dataframe.numerical import NumericalColumn

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if dtype.type in (np.object_, np.str_):
                return StringColumn(data=nvstrings.to_device([]), null_count=0)
            elif is_categorical_dtype(dtype):
                return CategoricalColumn(
                    data=Column(Buffer.null(np.dtype("int8"))),
                    null_count=0,
                    ordered=False,
                )
            else:
                return Column(Buffer.null(dtype))

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Note that we can always cast pure-null columns.
        not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
        if len(not_null_cols) > 0 and all(
                isinstance(o, NumericalColumn)
                and not np.issubdtype(o.dtype, np.datetime64)
                for o in not_null_cols):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            for i in range(len(objs)):
                objs[i] = objs[i].astype(common_dtype)

        # Find the first non-null column:
        head = objs[0]
        for i, obj in enumerate(objs):
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not objs[i].is_type_equivalent(head):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.dataframe.columnops import column_empty_like

                    objs[i] = column_empty_like(head,
                                                dtype=head.dtype,
                                                masked=True,
                                                newsize=len(obj))

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = Series(
                Column._concat([o.categories for o in objs])
            ).drop_duplicates()._column
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.is_type_equivalent(head)):
                raise ValueError("All series must be of same type")

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            objs = [o._data for o in objs]
            return StringColumn(data=nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = sum(o.null_count for o in objs)
        newsize = sum(map(len, objs))
        mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
        data = Buffer.from_empty(mem, size=newsize)

        # Allocate the output mask only if there are nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = head.replace(data=data, mask=mask, null_count=nulls)

        # Perform the actual concatenation
        if newsize > 0:
            col = _column_concat(objs, col)

        return col
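
The common-dtype step defers to NumPy's promotion rules, for example:

import numpy as np

print(np.find_common_type([np.dtype('int32'), np.dtype('float32')], []))
# float64

(np.find_common_type has since been deprecated in favor of np.result_type, but it is what this code calls.)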