예제 #1
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Build the codes, categories, ordered flag and name from the given input.

        Parameters
        ----------
        values : list-like
            The data to categorize. It may already be categorical (a Series or a
            Categorical), an Index, a numpy array or any other list-like.
            With ``fastpath=True`` it is stored unchanged as the codes.
        categories : list-like, optional
            The categories to use. If omitted, they are taken from a categorical
            `values` or, failing that, inferred by factorizing `values`.
        ordered : bool, optional
            If None, the flag is taken from a categorical `values`, set to True when
            categories are available up front or when the inferred categories could
            be sorted, and stored as False otherwise.
        name : object, optional
            If None (and not on the fast path), the ``name`` attribute of `values`
            is used when it exists.
        fastpath : bool, default False
            If True, the arguments are assigned to the attributes as they are and
            every sanitizing/validation step is skipped.
        levels : list-like, optional
            Deprecated spelling of `categories`; emits a FutureWarning.

        Raises
        ------
        ValueError
            If both `categories` and the deprecated `levels` are passed.
        TypeError
            If the categories have to be inferred, `values` cannot be sorted and a
            truthy `ordered` was requested.
        """

        if fastpath:
            # fast path: the caller vouches for the arguments, store them untouched
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            # inherit the name from named inputs (anything exposing a `name` attribute)
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # explicitly passed arguments win over what the input categorical carries
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            # continue with the plain values obtained through the array protocol
            values = values.__array__()

        elif isinstance(values, Index):
            # an Index is used as it is
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                # sorting failed: keep the categories in the order factorize returns them
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # integer values of which not a single one was matched (all codes are -1)
            if com.is_integer_dtype(values) and (codes == -1).all():
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        # an `ordered` that is still undecided (unsortable inferred categories) becomes False
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name
예제 #2
0
    def __init__(self,
                 values,
                 levels=None,
                 ordered=None,
                 name=None,
                 fastpath=False,
                 compat=False):
        """
        Build the codes, levels, ordered flag and name from the given input.

        Parameters
        ----------
        values : list-like
            The data to categorize. It may already be categorical (a Series or a
            Categorical), an Index, a numpy array or any other list-like.
            With ``fastpath=True`` it is stored unchanged as the codes.
        levels : list-like, optional
            The levels to use. If omitted, they are taken from a categorical
            `values` or, failing that, inferred by factorizing `values`.
        ordered : bool, optional
            If None, the flag is taken from a categorical `values`, set to True when
            the inferred levels could be sorted or when the codes are computed from
            `values` and the available levels, and stored as False otherwise (which
            includes the `compat` path).
        name : object, optional
            If None (and not on the fast path), the ``name`` attribute of `values`
            is used when it exists.
        fastpath : bool, default False
            If True, the arguments are assigned to the attributes as they are and
            every sanitizing/validation step is skipped.
        compat : bool, default False
            Deprecated. If True, levels are available and `values` is a non-empty
            integer array whose entries all lie in ``[-1, len(levels))``, `values`
            is used directly as the codes and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If the levels have to be inferred, `values` cannot be sorted and a
            truthy `ordered` was requested.
        """

        if fastpath:
            # fast path: the caller vouches for the arguments, store them untouched
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            # inherit the name from named inputs (anything exposing a `name` attribute)
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # explicitly passed arguments win over what the input categorical carries
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            # continue with the plain values obtained through the array protocol
            values = values.__array__()

        elif isinstance(values, Index):
            # an Index is used as it is
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values,
                                                         convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                # sorting failed: keep the levels in the order factorize returns them
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError(
                        "'values' is not ordered, please explicitly specify the level "
                        "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            # The length check comes first because np.min/np.max raise ValueError on an
            # empty array; empty input has nothing to reinterpret as codes, so it takes
            # the regular branch below.
            if compat and (com.is_integer_dtype(values) and
                           len(values) > 0 and
                           (np.min(values) >= -1) and
                           (np.max(values) < len(levels))):
                warn(
                    "Using 'values' as codes is deprecated.\n"
                    "'Categorical(... , compat=True)' is only there for historical reasons and "
                    "should not be used in new code!\n"
                    "See https://github.com/pydata/pandas/pull/7217",
                    FutureWarning)
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        # an `ordered` that is still undecided becomes False
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name
예제 #3
0
    def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
        """
        Build the codes, levels, ordered flag and name from the given input.

        Parameters
        ----------
        values : list-like
            The data to categorize. It may already be categorical (a Series or a
            Categorical), an Index, a numpy array or any other list-like.
            With ``fastpath=True`` it is stored unchanged as the codes.
        levels : list-like, optional
            The levels to use. If omitted, they are taken from a categorical
            `values` or, failing that, inferred by factorizing `values`.
        ordered : bool, optional
            If None, the flag is taken from a categorical `values`, set to True when
            the inferred levels could be sorted or when the codes are computed from
            `values` and the available levels, and stored as False otherwise (which
            includes the `compat` path).
        name : object, optional
            If None (and not on the fast path), the ``name`` attribute of `values`
            is used when it exists.
        fastpath : bool, default False
            If True, the arguments are assigned to the attributes as they are and
            every sanitizing/validation step is skipped.
        compat : bool, default False
            Deprecated. If True, levels are available and `values` is a non-empty
            integer array whose entries all lie in ``[-1, len(levels))``, `values`
            is used directly as the codes and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If the levels have to be inferred, `values` cannot be sorted and a
            truthy `ordered` was requested.
        """

        if fastpath:
            # fast path: the caller vouches for the arguments, store them untouched
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            # inherit the name from named inputs (anything exposing a `name` attribute)
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # explicitly passed arguments win over what the input categorical carries
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            # continue with the plain values obtained through the array protocol
            values = values.__array__()

        elif isinstance(values, Index):
            # an Index is used as it is
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if com.isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                # sorting failed: keep the levels in the order factorize returns them
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError("'values' is not ordered, please explicitly specify the level "
                                    "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            # The length check comes first because np.min/np.max raise ValueError on an
            # empty array; empty input has nothing to reinterpret as codes, so it takes
            # the regular branch below.
            if compat and (com.is_integer_dtype(values) and len(values) > 0 and
                           (np.min(values) >= -1) and (np.max(values) < len(levels))):
                warn("Using 'values' as codes is deprecated.\n"
                     "'Categorical(... , compat=True)' is only there for historical reasons and "
                     "should not be used in new code!\n"
                     "See https://github.com/pydata/pandas/pull/7217", FutureWarning)
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        # an `ordered` that is still undecided becomes False
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name
예제 #4
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Build the codes, categories, ordered flag and name from the given input.

        Parameters
        ----------
        values : list-like
            The data to categorize. It may already be categorical (a Series or a
            Categorical), an Index, a numpy array or any other list-like.
            With ``fastpath=True`` it is stored unchanged as the codes.
        categories : list-like, optional
            The categories to use. If omitted, they are taken from a categorical
            `values` or, failing that, inferred by factorizing `values`.
        ordered : bool, optional
            If None, the flag is taken from a categorical `values`, set to True when
            categories are available up front or when the inferred categories could
            be sorted, and stored as False otherwise.
        name : object, optional
            If None (and not on the fast path), the ``name`` attribute of `values`
            is used when it exists.
        fastpath : bool, default False
            If True, the arguments are assigned to the attributes as they are and
            every sanitizing/validation step is skipped.
        levels : list-like, optional
            Deprecated spelling of `categories`; emits a FutureWarning.

        Raises
        ------
        ValueError
            If both `categories` and the deprecated `levels` are passed.
        TypeError
            If the categories have to be inferred, `values` cannot be sorted and a
            truthy `ordered` was requested.
        """

        if fastpath:
            # fast path: the caller vouches for the arguments, store them untouched
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            # inherit the name from named inputs (anything exposing a `name` attribute)
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            # explicitly passed arguments win over what the input categorical carries
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            # continue with the plain values obtained through the array protocol
            values = values.__array__()

        elif isinstance(values, Index):
            # an Index is used as it is
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                # sorting failed: keep the categories in the order factorize returns them
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # integer values of which not a single one was matched (all codes are -1)
            if com.is_integer_dtype(values) and (codes == -1).all():
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        # an `ordered` that is still undecided (unsortable inferred categories) becomes False
        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name