Example #1
    def _create_categorical(cls, data, dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data):
            data = data.values

        if not isinstance(data, ABCCategorical):
            return Categorical(data, dtype=dtype)

        if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)
        return data
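
For reference, a minimal usage sketch (public pandas API only, illustrative values) of the constructor path this helper falls back to:

import pandas as pd

# Minimal sketch: build a Categorical with an explicit CategoricalDtype, the
# same kind of dtype object _create_categorical forwards via
# Categorical(data, dtype=dtype).
dtype = pd.CategoricalDtype(categories=["low", "high"], ordered=True)
cat = pd.Categorical(["low", "high", "low"], dtype=dtype)
print(cat.codes)    # [0 1 0]
print(cat.ordered)  # True
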
Example #2
    def _create_from_codes(self, codes, categories=None, ordered=None,
                           name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """

        if categories is None:
            categories = self.categories
        if ordered is None:
            ordered = self.ordered
        if name is None:
            name = self.name
        cat = Categorical.from_codes(codes, categories=categories,
                                     ordered=ordered)
        return CategoricalIndex(cat, name=name)
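
A short usage sketch of the public Categorical.from_codes call this helper wraps (illustrative values; a code of -1 marks a missing entry):

import pandas as pd

# Illustrative sketch: build the Categorical from integer codes, then wrap it
# in a CategoricalIndex just as _create_from_codes does.
cat = pd.Categorical.from_codes([0, 1, 1, -1], categories=["a", "b"], ordered=True)
idx = pd.CategoricalIndex(cat, name="letters")
print(idx)
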
Example #3
    def _shallow_copy(self, values=None, name: Label = no_default):
        name = self.name if name is no_default else name

        if values is not None:
            values = Categorical(values, dtype=self.dtype)

        return super()._shallow_copy(values=values, name=name)
Example #4
    def _is_dtype_compat(self, other) -> bool:
        """
        *this is an internal non-public method*

        provide a comparison between the dtype of self and other (coercing if
        needed)

        Raises
        ------
        TypeError if the dtypes are not compatible
        """
        if is_categorical_dtype(other):
            if isinstance(other, CategoricalIndex):
                other = other._values
            if not other.is_dtype_equal(self):
                raise TypeError(
                    "categories must match existing categories when appending")
        else:
            values = other
            if not is_list_like(values):
                values = [values]
            cat = Categorical(other, dtype=self.dtype)
            other = CategoricalIndex(cat)
            if not other.isin(values).all():
                raise TypeError(
                    "cannot append a non-category item to a CategoricalIndex")

        return other
Example #5
    def where(self, cond, other=None):
        if other is None:
            other = self._na_value
        values = np.where(cond, self.values, other)

        cat = Categorical(values, dtype=self.dtype)
        return self._shallow_copy(cat, **self._get_attributes_dict())
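
A hedged sketch of the user-facing behaviour this override provides; with no `other`, positions where the condition is False become NA (illustrative data):

import numpy as np
import pandas as pd

# Illustrative sketch: element 1 fails the condition and falls back to NaN,
# the categorical NA value.
idx = pd.CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"])
print(idx.where(np.array([True, False, True])))
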
Example #6
    def __new__(
        cls,
        data=None,
        categories=None,
        ordered=None,
        dtype: Dtype | None = None,
        copy: bool = False,
        name: Hashable = None,
    ) -> CategoricalIndex:

        name = maybe_extract_name(name, data, cls)

        if data is None:
            # GH#38944
            warnings.warn(
                "Constructing a CategoricalIndex without passing data is "
                "deprecated and will raise in a future version. "
                "Use CategoricalIndex([], ...) instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            data = []

        if is_scalar(data):
            raise cls._scalar_data_error(data)

        data = Categorical(data,
                           categories=categories,
                           ordered=ordered,
                           dtype=dtype,
                           copy=copy)

        return cls._simple_new(data, name=name)
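
For context, a small sketch of the public constructor call that ends up in this __new__ (illustrative values):

import pandas as pd

# Illustrative sketch: categories may include values absent from the data;
# ordered=True makes comparisons between categories meaningful.
idx = pd.CategoricalIndex(["b", "a", "b"], categories=["a", "b", "c"],
                          ordered=True, name="grade")
print(idx.codes)       # [1 0 1]
print(idx.categories)  # Index(['a', 'b', 'c'], dtype='object')
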
Example #7
    def _reindex_non_unique(  # type: ignore[override]
            self, target: Index) -> tuple[Index, np.ndarray | None, np.ndarray
                                          | None]:
        """
        reindex from a non-unique Index, which CategoricalIndexes almost
        always are
        """
        # TODO: rule out `indexer is None` here to make the signature
        #  match the parent class's signature. This should be equivalent
        #  to ruling out `self.equals(target)`
        new_target, indexer = self.reindex(target)
        new_indexer = None

        check = indexer == -1
        # error: Item "bool" of "Union[Any, bool]" has no attribute "any"
        if check.any():  # type: ignore[union-attr]
            new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp)
            new_indexer[check] = -1

        cats = self.categories.get_indexer(target)
        if not (cats == -1).any():
            # .reindex returns normal Index. Revert to CategoricalIndex if
            # all targets are included in my categories
            cat = Categorical(new_target, dtype=self.dtype)
            new_target = type(self)._simple_new(cat, name=self.name)

        return new_target, indexer, new_indexer
Example #8
def my_qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
    x_is_series, series_index, name, x = pandas.core.reshape.tile._preprocess_for_cut(
        x)

    x, dtype = pandas.core.reshape.tile._coerce_to_type(x)

    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    bins = quantile(x, quantiles)

    labels = pandas.core.reshape.tile._format_labels(bins,
                                                     precision,
                                                     right=False,
                                                     dtype=dtype)
    # widen the last edge slightly so the maximum value falls inside the bins
    bins[-1] += (bins[-1] - bins[0]) * 0.01
    # parse the interval labels out of their string repr and rebuild them as
    # left-closed strings, closing only the last bin on the right
    t = str(labels).split("\n")[0].split("(")[1][1:-2].split("),")
    for i in range(len(t) - 1):
        t[i] += ")"
    t[-1] = t[-1].replace(")", "]")
    labels = Categorical(t)

    fac, bins = pandas.core.reshape.tile._bins_to_cuts(x,
                                                       bins,
                                                       labels=labels,
                                                       right=False,
                                                       precision=precision,
                                                       include_lowest=True,
                                                       dtype=dtype,
                                                       duplicates=duplicates)

    return pandas.core.reshape.tile._postprocess_for_cut(
        fac, bins, retbins, x_is_series, series_index, name, dtype)
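
For comparison, a hedged sketch of the stock pd.qcut call that my_qcut re-implements with left-closed labels (made-up data):

import numpy as np
import pandas as pd

# Illustrative sketch: quartile binning with the public API; the result is a
# Series of category dtype whose categories are right-closed intervals.
x = pd.Series(np.arange(10, dtype=float))
binned = pd.qcut(x, q=4, precision=3)
print(binned.cat.categories)
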
Example #9
    def _create_from_codes(self,
                           codes,
                           categories=None,
                           ordered=None,
                           name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """

        if categories is None:
            categories = self.categories
        if ordered is None:
            ordered = self.ordered
        if name is None:
            name = self.name
        cat = Categorical.from_codes(codes,
                                     categories=categories,
                                     ordered=ordered)
        return CategoricalIndex(cat, name=name)
Example #10
    def _shallow_copy(self, values=None, **kwargs):
        if values is None:
            values = self.values

        cat = Categorical(values, dtype=self.dtype)

        name = kwargs.get("name", self.name)
        return type(self)._simple_new(cat, name=name)
Example #11
    def _shallow_copy(self, values=None, name: Label = no_default):
        name = self.name if name is no_default else name

        if values is None:
            values = self.values

        cat = Categorical(values, dtype=self.dtype)

        return type(self)._simple_new(cat, name=name)
Example #12
    def astype(self, dtype, copy=True):
        """
        Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.

        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ExtensionArray or ndarray
            ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
        """
        from pandas import Index
        from pandas.core.arrays.string_ import StringDtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        if is_interval_dtype(dtype):
            if dtype == self.dtype:
                return self.copy() if copy else self

            # need to cast to different subtype
            try:
                # We need to use Index rules for astype to prevent casting
                #  np.nan entries to int subtypes
                new_left = Index(self._left, copy=False).astype(dtype.subtype)
                new_right = Index(self._right,
                                  copy=False).astype(dtype.subtype)
            except TypeError as err:
                msg = (
                    f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible"
                )
                raise TypeError(msg) from err
            # TODO: do astype directly on self._combined
            combined = _get_combined_data(new_left, new_right)
            return type(self)._simple_new(combined, closed=self.closed)
        elif is_categorical_dtype(dtype):
            return Categorical(np.asarray(self))
        elif isinstance(dtype, StringDtype):
            return dtype.construct_array_type()._from_sequence(self,
                                                               copy=False)

        # TODO: This try/except will be repeated.
        try:
            return np.asarray(self).astype(dtype, copy=copy)
        except (TypeError, ValueError) as err:
            msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
            raise TypeError(msg) from err
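
A hedged usage sketch of the categorical branch above, exercised through the public IntervalIndex API (the array-level astype behaves analogously):

import pandas as pd

# Illustrative sketch: casting interval data to "category" yields categorical
# data whose categories are the original intervals.
ii = pd.interval_range(start=0, end=3)
cat_idx = ii.astype("category")
print(cat_idx)             # CategoricalIndex of the original intervals
print(cat_idx.categories)  # IntervalIndex([(0, 1], (1, 2], (2, 3]], ...)
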
Example #13
 def where(self, cond, other=None):
     # TODO: Investigate an alternative implementation with
     # 1. copy the underlying Categorical
     # 2. setitem with `cond` and `other`
     # 3. Rebuild CategoricalIndex.
     if other is None:
         other = self._na_value
     values = np.where(cond, self.values, other)
     cat = Categorical(values, dtype=self.dtype)
     return self._shallow_copy(cat, **self._get_attributes_dict())
Example #14
 def where(self, cond, other=None):
     # TODO: Investigate an alternative implementation with
     # 1. copy the underlying Categorical
     # 2. setitem with `cond` and `other`
     # 3. Rebuild CategoricalIndex.
     if other is None:
         other = self._na_value
     values = np.where(cond, self._values, other)
     cat = Categorical(values, dtype=self.dtype)
     return type(self)._simple_new(cat, name=self.name)
Example #15
    def _create_categorical(self,
                            data,
                            categories=None,
                            ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        if (isinstance(data, (ABCSeries, type(self)))
                and is_categorical_dtype(data)):
            data = data.values

        if not isinstance(data, ABCCategorical):
            if ordered is None and dtype is None:
                ordered = False
            data = Categorical(data,
                               categories=categories,
                               ordered=ordered,
                               dtype=dtype)
        else:
            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype):
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
        return data
Example #16
    def _shallow_copy(
        self, values: Optional[Categorical] = None, name: Label = no_default
    ):
        name = self.name if name is no_default else name

        if values is not None:
            # In tests we only get here with Categorical objects that
            #  have matching .ordered, and values.categories a subset of
            #  our own.  However we do _not_ have a dtype match in general.
            values = Categorical(values, dtype=self.dtype)

        return super()._shallow_copy(values=values, name=name)
Example #17
    def __new__(
        cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
    ):

        dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype)

        name = maybe_extract_name(name, data, cls)

        if not is_categorical_dtype(data):
            # don't allow scalars
            # if data is None, then categories must be provided
            if is_scalar(data):
                if data is not None or categories is None:
                    raise cls._scalar_data_error(data)
                data = []

        assert isinstance(dtype, CategoricalDtype), dtype
        if not isinstance(data, Categorical) or data.dtype != dtype:
            data = Categorical(data, dtype=dtype)

        data = data.copy() if copy else data

        return cls._simple_new(data, name=name)
Example #18
    def __new__(
        cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
    ):

        name = maybe_extract_name(name, data, cls)

        if is_scalar(data):
            raise cls._scalar_data_error(data)

        data = Categorical(
            data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
        )

        return cls._simple_new(data, name=name)
Example #19
    def _create_categorical(cls, data, categories=None, ordered=None,
                            dtype=None):
        """
        *this is an internal non-public method*

        create the correct categorical from data and the properties

        Parameters
        ----------
        data : data for new Categorical
        categories : optional categories, defaults to existing
        ordered : optional ordered attribute, defaults to existing
        dtype : CategoricalDtype, defaults to existing

        Returns
        -------
        Categorical
        """
        if (isinstance(data, (cls, ABCSeries)) and
                is_categorical_dtype(data)):
            data = data.values

        if not isinstance(data, ABCCategorical):
            if ordered is None and dtype is None:
                ordered = False
            data = Categorical(data, categories=categories, ordered=ordered,
                               dtype=dtype)
        else:
            if categories is not None:
                data = data.set_categories(categories, ordered=ordered)
            elif ordered is not None and ordered != data.ordered:
                data = data.set_ordered(ordered)
            if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
                # we want to silently ignore dtype='category'
                data = data._set_dtype(dtype)
        return data
Example #20
    def _is_dtype_compat(self, other) -> Categorical:
        """
        *this is an internal non-public method*

        provide a comparison between the dtype of self and other (coercing if
        needed)

        Parameters
        ----------
        other : Index

        Returns
        -------
        Categorical

        Raises
        ------
        TypeError if the dtypes are not compatible
        """
        if is_categorical_dtype(other):
            other = extract_array(other)
            if not other._categories_match_up_to_permutation(self):
                raise TypeError(
                    "categories must match existing categories when appending"
                )

        elif other._is_multi:
            # preempt raising NotImplementedError in isna call
            raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
        else:
            values = other

            cat = Categorical(other, dtype=self.dtype)
            other = CategoricalIndex(cat)
            if not other.isin(values).all():
                raise TypeError(
                    "cannot append a non-category item to a CategoricalIndex"
                )
            other = other._values

            if not ((other == values) | (isna(other) & isna(values))).all():
                # GH#37667 see test_equals_non_category
                raise TypeError(
                    "categories must match existing categories when appending"
                )

        return other
Example #21
    def astype(self, dtype, copy=True):
        """
        Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.

        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ExtensionArray or ndarray
            ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
        """
        dtype = pandas_dtype(dtype)
        if is_interval_dtype(dtype):
            if dtype == self.dtype:
                return self.copy() if copy else self

            # need to cast to different subtype
            try:
                new_left = self.left.astype(dtype.subtype)
                new_right = self.right.astype(dtype.subtype)
            except TypeError:
                msg = (
                    "Cannot convert {dtype} to {new_dtype}; subtypes are "
                    "incompatible"
                )
                raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
            return self._shallow_copy(new_left, new_right)
        elif is_categorical_dtype(dtype):
            return Categorical(np.asarray(self))
        # TODO: This try/except will be repeated.
        try:
            return np.asarray(self).astype(dtype, copy=copy)
        except (TypeError, ValueError):
            msg = "Cannot cast {name} to dtype {dtype}"
            raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
Example #22
    def _reindex_non_unique(self, target):
        """
        reindex from a non-unique Index, which CategoricalIndexes almost
        always are
        """
        new_target, indexer = self.reindex(target)
        new_indexer = None

        check = indexer == -1
        if check.any():
            new_indexer = np.arange(len(self.take(indexer)))
            new_indexer[check] = -1

        cats = self.categories.get_indexer(target)
        if not (cats == -1).any():
            # .reindex returns normal Index. Revert to CategoricalIndex if
            # all targets are included in my categories
            new_target = Categorical(new_target, dtype=self.dtype)
            new_target = self._shallow_copy(new_target)

        return new_target, indexer, new_indexer
Example #23
    def _create_from_codes(self, codes, dtype=None, name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        dtype: CategoricalDtype, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """
        if dtype is None:
            dtype = self.dtype
        if name is None:
            name = self.name
        cat = Categorical.from_codes(codes, dtype=dtype)
        return CategoricalIndex(cat, name=name)
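
A brief sketch of the dtype-based from_codes variant used above (public API, illustrative values):

import pandas as pd

# Illustrative sketch: from_codes also accepts a full CategoricalDtype instead
# of separate categories/ordered arguments.
dtype = pd.CategoricalDtype(categories=["x", "y"], ordered=False)
cat = pd.Categorical.from_codes([1, 0, 1], dtype=dtype)
print(cat)
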
Example #24
    def _create_from_codes(self, codes, dtype=None, name=None):
        """
        *this is an internal non-public method*

        create the correct categorical from codes

        Parameters
        ----------
        codes : new codes
        dtype: CategoricalDtype, defaults to existing
        name : optional name attribute, defaults to existing

        Returns
        -------
        CategoricalIndex
        """

        if dtype is None:
            dtype = self.dtype
        if name is None:
            name = self.name
        cat = Categorical.from_codes(codes, dtype=dtype)
        return CategoricalIndex(cat, name=name)
Example #25
    def __new__(cls,
                data=None,
                categories=None,
                ordered=None,
                dtype=None,
                copy=False,
                name=None):

        name = maybe_extract_name(name, data, cls)

        if is_scalar(data):
            # don't allow scalars
            # if data is None, then categories must be provided
            if data is not None or categories is None:
                raise cls._scalar_data_error(data)
            data = []

        data = Categorical(data,
                           categories=categories,
                           ordered=ordered,
                           dtype=dtype,
                           copy=copy)

        return cls._simple_new(data, name=name)
Example #26
    def reindex(
        self, target, method=None, level=None, limit=None, tolerance=None
    ) -> tuple[Index, npt.NDArray[np.intp] | None]:
        """
        Create index with target's values (move/add/delete values as necessary)

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray[np.intp] or None
            Indices of output values in original index

        """
        if method is not None:
            raise NotImplementedError(
                "argument method is not implemented for CategoricalIndex.reindex"
            )
        if level is not None:
            raise NotImplementedError(
                "argument level is not implemented for CategoricalIndex.reindex"
            )
        if limit is not None:
            raise NotImplementedError(
                "argument limit is not implemented for CategoricalIndex.reindex"
            )

        target = ibase.ensure_index(target)

        if self.equals(target):
            indexer = None
            missing = np.array([], dtype=np.intp)
        else:
            indexer, missing = self.get_indexer_non_unique(target)
            if not self.is_unique:
                # GH#42568
                warnings.warn(
                    "reindexing with a non-unique Index is deprecated and will "
                    "raise in a future version.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        if len(self) and indexer is not None:
            new_target = self.take(indexer)
        else:
            new_target = target

        # filling in missing if needed
        if len(missing):
            cats = self.categories.get_indexer(target)

            if not isinstance(target, CategoricalIndex) or (cats == -1).any():
                new_target, indexer, _ = super()._reindex_non_unique(target)
            else:

                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                cat = self._data._from_backing_data(codes)
                new_target = type(self)._simple_new(cat, name=self.name)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        if is_categorical_dtype(target):
            cat = Categorical(new_target, dtype=target.dtype)
            new_target = type(self)._simple_new(cat, name=self.name)
        else:
            # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
            new_target = np.asarray(new_target)
            new_target = Index._with_infer(new_target, name=self.name)

        return new_target, indexer
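
A hedged sketch of the public reindex call this method implements (illustrative data):

import pandas as pd

# Illustrative sketch: reindex returns the new index together with an integer
# indexer mapping target positions back into the original index.
idx = pd.CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"])
target = pd.CategoricalIndex(["b", "c"], categories=["a", "b", "c"])
new_idx, indexer = idx.reindex(target)
print(new_idx)
print(indexer)  # [1 2]
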
Example #27
    def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
        """
        Create index with target's values (move/add/delete values as necessary)

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray or None
            Indices of output values in original index

        """
        if method is not None:
            raise NotImplementedError(
                "argument method is not implemented for CategoricalIndex.reindex"
            )
        if level is not None:
            raise NotImplementedError(
                "argument level is not implemented for CategoricalIndex.reindex"
            )
        if limit is not None:
            raise NotImplementedError(
                "argument limit is not implemented for CategoricalIndex.reindex"
            )

        target = ibase.ensure_index(target)

        missing: List[int]
        if self.equals(target):
            indexer = None
            missing = []
        else:
            indexer, missing = self.get_indexer_non_unique(np.array(target))

        if len(self.codes) and indexer is not None:
            new_target = self.take(indexer)
        else:
            new_target = target

        # filling in missing if needed
        if len(missing):
            cats = self.categories.get_indexer(target)

            if (cats == -1).any():
                # coerce to a regular index here!
                result = Index(np.array(self), name=self.name)
                new_target, indexer, _ = result._reindex_non_unique(np.array(target))
            else:

                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                cat = self._data._from_backing_data(codes)
                new_target = type(self)._simple_new(cat, name=self.name)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        new_target = np.asarray(new_target)
        if is_categorical_dtype(target):
            new_target = Categorical(new_target, dtype=target.dtype)
            new_target = target._shallow_copy(new_target, name=self.name)
        else:
            new_target = Index(new_target, name=self.name)

        return new_target, indexer
Example #28
def my_cut(x,
           bins,
           right=True,
           labels=None,
           retbins=False,
           precision=3,
           include_lowest=False,
           duplicates='raise'):

    x_is_series, series_index, name, x = pandas.core.reshape.tile._preprocess_for_cut(
        x)
    x, dtype = pandas.core.reshape.tile._coerce_to_type(x)

    if not np.iterable(bins):
        if pd._libs.lib.is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size

        if sz == 0:
            raise ValueError('Cannot cut empty array')
        nanmin = pd.core.nanops._nanminmax('min', fill_value_typ='+inf')
        nanmax = pd.core.nanops._nanminmax('max', fill_value_typ='-inf')
        rng = (nanmin(x), nanmax(x))
        mn, mx = [mi + 0.0 for mi in rng]

        if mn == mx:  # adjust end points before binning
            mn -= .001 * abs(mn) if mn != 0 else .001
            mx += .001 * abs(mx) if mx != 0 else .001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            bins2 = bins
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            # keep an unadjusted copy of the edges for label formatting below
            bins2 = copy.deepcopy(bins)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj
    elif isinstance(bins, pd.IntervalIndex):
        if bins.is_overlapping:
            raise ValueError('Overlapping IntervalIndex is not accepted.')
        bins2 = bins
    else:
        if pd.core.dtypes.common.is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=None)
        else:
            bins = np.asarray(bins)
        bins = pandas.core.reshape.tile._convert_bin_to_numeric_type(
            bins, dtype)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
        bins2 = bins

    labels = pandas.core.reshape.tile._format_labels(bins2,
                                                     precision,
                                                     right=False,
                                                     dtype=dtype)
    # parse the interval labels out of their string repr and rebuild them as
    # left-closed strings, closing only the last bin on the right
    t = str(labels).split("\n")[0].split("(")[1][1:-2].split("),")
    for i in range(len(t) - 1):
        t[i] += ")"
    t[-1] = t[-1].replace(")", "]")
    labels = Categorical(t)

    fac, bins = pandas.core.reshape.tile._bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates)
    return pandas.core.reshape.tile._postprocess_for_cut(
        fac, bins, retbins, x_is_series, series_index, name, dtype)
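
For comparison, a hedged sketch of the stock pd.cut call that my_cut mirrors (made-up data):

import numpy as np
import pandas as pd

# Illustrative sketch: three equal-width, right-closed bins via the public API;
# the result is a Categorical whose categories are intervals.
values = np.array([1, 7, 5, 4, 6, 3])
binned = pd.cut(values, bins=3, right=True, precision=3)
print(binned.categories)
print(binned.codes)
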
Example #29
def recode_for_groupby(c, sort, observed):
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    New Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """

    # we only care about observed values
    if observed:
        unique_codes = unique1d(c.codes)

        take_codes = unique_codes[unique_codes != -1]
        if c.ordered:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = _recode_for_categories(c.codes, c.categories, categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical(codes, dtype=dtype, fastpath=True), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # sort=False should order groups in as-encountered order (GH-8868)
    cat = c.unique()

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179), which .unique()
    # above dropped
    cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])

    return c.reorder_categories(cat.categories), None
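
The user-visible effect this helper supports can be sketched with the public groupby API (hedged, illustrative data):

import pandas as pd

# Illustrative sketch: with observed=True only categories present in the data
# appear in the result; with observed=False the unused category "c" shows up
# as an (empty) group as well.
df = pd.DataFrame({
    "key": pd.Categorical(["b", "a", "b"], categories=["a", "b", "c"]),
    "val": [1, 2, 3],
})
print(df.groupby("key", observed=True)["val"].sum())
print(df.groupby("key", observed=False)["val"].sum())
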