Exemplo n.º 1
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Encode ``values`` as integer codes pointing into ``categories``.

        Parameters
        ----------
        values : list-like
            Data to encode. A categorical (or a Series wrapping one) donates its
            ``categories`` / ``ordered`` when those are not passed; an ``Index``
            is used as is; anything else is converted to an ndarray first.
        categories : list-like, optional
            Categories to look the values up in. When omitted (and not taken from
            a categorical input) they are inferred by factorizing ``values``.
        ordered : bool, optional
            When None it defaults to True if ``categories`` are available or the
            values could be factorized in sorted order, and to False if the
            values could not be sorted.
        name : object, optional
            Defaults to ``values.name`` when ``values`` has such an attribute.
        fastpath : bool, default False
            If True, ``values`` is stored directly as the codes and ``name``,
            ``categories`` and ``ordered`` are assigned as passed; none of the
            sanitizing below is run.
        levels : list-like, optional
            Deprecated spelling of ``categories``; emits a FutureWarning.

        Raises
        ------
        ValueError
            If both ``categories`` and ``levels`` are passed.
        TypeError
            If ``ordered`` is truthy but the values cannot be sorted and no
            categories were given.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            # stacklevel=2 attributes the warning to the code calling the constructor
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning, stacklevel=2)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning,
                     stacklevel=2)

            # ``.all()`` of an empty array is True, so empty input has to be excluded
            # explicitly or it would always trigger this warning
            if (com.is_integer_dtype(values) and len(values) > 0 and
                    (codes == -1).all()):
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning,
                     stacklevel=2)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name
Exemplo n.º 2
0
    def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
        """
        Encode ``values`` as integer codes pointing into ``levels``.

        Parameters
        ----------
        values : list-like
            Data to encode. A categorical (or a Series wrapping one) donates its
            ``levels`` / ``ordered`` when those are not passed; an ``Index`` is
            used as is; anything else is converted to an ndarray first.
        levels : list-like, optional
            Levels to look the values up in. When omitted (and not taken from a
            categorical input) they are inferred by factorizing ``values``.
        ordered : bool, optional
            When None it defaults to True if the values could be factorized in
            sorted order or were looked up in given ``levels``, and to False
            otherwise (including when ``values`` were taken as codes via
            ``compat``).
        name : object, optional
            Defaults to ``values.name`` when ``values`` has such an attribute.
        fastpath : bool, default False
            If True, ``values`` is stored directly as the codes and ``name``,
            ``levels`` and ``ordered`` are assigned as passed; none of the
            sanitizing below is run.
        compat : bool, default False
            Deprecated. If True, ``levels`` are given and ``values`` are integers
            that all lie in ``[-1, len(levels))``, the values are used as codes
            directly and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If ``ordered`` is truthy but the values cannot be sorted and no
            levels were given.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if com.isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError("'values' is not ordered, please explicitly specify the level "
                                    "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            values_are_codes = False
            if compat and com.is_integer_dtype(values):
                # np.min/np.max raise ValueError on a zero-size array; an empty array
                # contains no out-of-range code, so it is accepted as codes too
                values_are_codes = (len(values) == 0 or
                                    ((np.min(values) >= -1) and
                                     (np.max(values) < len(levels))))

            if values_are_codes:
                # stacklevel=2 attributes the warning to the code calling the constructor
                warn("Using 'values' as codes is deprecated.\n"
                     "'Categorical(... , compat=True)' is only there for historical reasons and "
                     "should not be used in new code!\n"
                     "See https://github.com/pydata/pandas/pull/7217", FutureWarning,
                     stacklevel=2)
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name
Exemplo n.º 3
0
    def __init__(self,
                 values,
                 levels=None,
                 ordered=None,
                 name=None,
                 fastpath=False,
                 compat=False):
        """
        Encode ``values`` as integer codes pointing into ``levels``.

        Parameters
        ----------
        values : list-like
            Data to encode. A categorical (or a Series wrapping one) donates its
            ``levels`` / ``ordered`` when those are not passed; an ``Index`` is
            used as is; anything else is converted to an ndarray first.
        levels : list-like, optional
            Levels to look the values up in. When omitted (and not taken from a
            categorical input) they are inferred by factorizing ``values``.
        ordered : bool, optional
            When None it defaults to True if the values could be factorized in
            sorted order or were looked up in given ``levels``, and to False
            otherwise (including when ``values`` were taken as codes via
            ``compat``).
        name : object, optional
            Defaults to ``values.name`` when ``values`` has such an attribute.
        fastpath : bool, default False
            If True, ``values`` is stored directly as the codes and ``name``,
            ``levels`` and ``ordered`` are assigned as passed; none of the
            sanitizing below is run.
        compat : bool, default False
            Deprecated. If True, ``levels`` are given and ``values`` are integers
            that all lie in ``[-1, len(levels))``, the values are used as codes
            directly and a FutureWarning is emitted.

        Raises
        ------
        TypeError
            If ``ordered`` is truthy but the values cannot be sorted and no
            levels were given.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.levels = levels
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            if levels is None:
                levels = cat.levels
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values,
                                                         convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the level
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if levels is None:
            try:
                codes, levels = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, levels = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying levels
                    raise TypeError(
                        "'values' is not ordered, please explicitly specify the level "
                        "order by passing in a level argument.")
        else:
            # there are two ways if levels are present
            # the old one, where each value is a int pointer to the levels array
            # the new one, where each value is also in the level array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            levels = self._validate_levels(levels)

            # There can be two ways: the old which passed in codes and levels directly
            # and values have to be inferred and the new one, which passes in values and levels
            # and _codes have to be inferred.

            # min and max can be higher and lower if not all levels are in the values
            values_are_codes = False
            if compat and com.is_integer_dtype(values):
                # np.min/np.max raise ValueError on a zero-size array; an empty array
                # contains no out-of-range code, so it is accepted as codes too
                values_are_codes = (len(values) == 0 or
                                    ((np.min(values) >= -1) and
                                     (np.max(values) < len(levels))))

            if values_are_codes:
                # stacklevel=2 attributes the warning to the code calling the constructor
                warn(
                    "Using 'values' as codes is deprecated.\n"
                    "'Categorical(... , compat=True)' is only there for historical reasons and "
                    "should not be used in new code!\n"
                    "See https://github.com/pydata/pandas/pull/7217",
                    FutureWarning,
                    stacklevel=2)
                codes = values
            else:
                codes = _get_codes_for_values(values, levels)

                # if we got levels, we can assume that the order is intended
                # if ordered is unspecified
                if ordered is None:
                    ordered = True

        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.levels = levels
        self.name = name
Exemplo n.º 4
0
    def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False,
                 levels=None):
        """
        Encode ``values`` as integer codes pointing into ``categories``.

        Parameters
        ----------
        values : list-like
            Data to encode. A categorical (or a Series wrapping one) donates its
            ``categories`` / ``ordered`` when those are not passed; an ``Index``
            is used as is; anything else is converted to an ndarray first.
        categories : list-like, optional
            Categories to look the values up in. When omitted (and not taken from
            a categorical input) they are inferred by factorizing ``values``.
        ordered : bool, optional
            When None it defaults to True if ``categories`` are available or the
            values could be factorized in sorted order, and to False if the
            values could not be sorted.
        name : object, optional
            Defaults to ``values.name`` when ``values`` has such an attribute.
        fastpath : bool, default False
            If True, ``values`` is stored directly as the codes and ``name``,
            ``categories`` and ``ordered`` are assigned as passed; none of the
            sanitizing below is run.
        levels : list-like, optional
            Deprecated spelling of ``categories``; emits a FutureWarning.

        Raises
        ------
        ValueError
            If both ``categories`` and ``levels`` are passed.
        TypeError
            If ``ordered`` is truthy but the values cannot be sorted and no
            categories were given.
        """

        if fastpath:
            # fast path
            self._codes = values
            self.name = name
            self.categories = categories
            self.ordered = ordered
            return

        if name is None:
            name = getattr(values, 'name', None)

        # TODO: Remove after deprecation period in 2017/ after 0.18
        if levels is not None:
            # stacklevel=2 attributes the warning to the code calling the constructor
            warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
                 FutureWarning, stacklevel=2)
            if categories is None:
                categories = levels
            else:
                raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                                 "use only 'categories'")

        # sanitize input
        if com.is_categorical_dtype(values):

            # we are either a Series or a Categorical
            cat = values
            if isinstance(values, com.ABCSeries):
                cat = values.values
            if categories is None:
                categories = cat.categories
            if ordered is None:
                ordered = cat.ordered
            values = values.__array__()

        elif isinstance(values, Index):
            pass

        else:

            # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array
            # which is fine, but since factorize does this correctly no need here
            # this is an issue because _sanitize_array also coerces np.nan to a string
            # under certain versions of numpy as well
            values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, np.ndarray):
                values = _convert_to_list_like(values)
                from pandas.core.series import _sanitize_array
                # On list with NaNs, int values will be converted to float. Use "object" dtype
                # to prevent this. In the end objects will be casted to int/... in the category
                # assignment step.
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)

        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
                # If the underlying data structure was sortable, and the user doesn't want to
                # "forget" this order, the categorical also is sorted/ordered
                if ordered is None:
                    ordered = True
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
                    # raise, as we don't have a sortable data structure and so the user should
                    # give us one by specifying categories
                    raise TypeError("'values' is not ordered, please explicitly specify the "
                                    "categories order by passing in a categories argument.")
        else:
            # there were two ways if categories are present
            # - the old one, where each value is a int pointer to the levels array -> not anymore
            #   possible, but code outside of pandas could call us like that, so make some checks
            # - the new one, where each value is also in the categories array (or np.nan)

            # make sure that we always have the same type here, no matter what we get passed in
            categories = self._validate_categories(categories)

            codes = _get_codes_for_values(values, categories)

            # TODO: check for old style usage. These warnings should be removed after 0.18/ in 2016
            if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
                warn("Values and categories have different dtypes. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning,
                     stacklevel=2)

            # ``.all()`` of an empty array is True, so empty input has to be excluded
            # explicitly or it would always trigger this warning
            if (com.is_integer_dtype(values) and len(values) > 0 and
                    (codes == -1).all()):
                warn("None of the categories were found in values. Did you mean to use\n"
                     "'Categorical.from_codes(codes, categories)'?", RuntimeWarning,
                     stacklevel=2)

            # if we got categories, we can assume that the order is intended
            # if ordered is unspecified
            if ordered is None:
                ordered = True

        self.ordered = False if ordered is None else ordered
        self._codes = codes
        self.categories = categories
        self.name = name
Exemplo n.º 5
0
    def __init__(self, data, sparse_index=None, index=None, fill_value=None,
                 kind='integer', dtype=None, copy=False):
        """
        Derive the sparse index, sparse values and dtype from ``data``.

        Parameters
        ----------
        data : scalar, array-like, sparse array/series or SingleBlockManager
            Input to store. A ``SingleBlockManager`` is unwrapped through
            ``internal_values()``. Sparse input supplies its ``sp_index``,
            ``fill_value`` and ``dtype`` for whichever of those were not
            passed, and only its ``sp_values`` are kept as data. A scalar is
            expanded into a 1-d array.
        sparse_index : optional
            Existing sparse index. When one is available ``data`` is taken as
            the sparse values and must hold ``sparse_index.npoints`` items.
        index : optional
            Only accepted together with scalar ``data``; its length is the
            length of the expanded array.
        fill_value : scalar, optional
            When still unset after looking at ``dtype`` and ``data``, it is
            the NA value for the resulting dtype.
        kind : default 'integer'
            Passed through to ``make_sparse``.
        dtype : str, dtype or SparseDtype, optional
            A string is tried as a sparse dtype first and as a regular dtype
            second. A ``SparseDtype`` is split into its ``fill_value`` and
            ``subtype``. For scalar ``data`` the dtype is re-inferred from
            the scalar.
        copy : bool, default False
            Whether to call ``data.copy()`` before building the sparse values.

        Raises
        ------
        Exception
            If ``index`` is passed with non-scalar ``data``.
        ValueError
            Propagated from ``_sanitize_array`` when ``dtype`` is set.
        AssertionError
            If the values' length does not match ``sparse_index.npoints``.
        """
        from pandas.core.internals import SingleBlockManager

        if isinstance(data, SingleBlockManager):
            data = data.internal_values()

        if isinstance(dtype, SparseDtype) and fill_value is None:
            fill_value = dtype.fill_value

        if isinstance(data, (type(self), ABCSparseSeries)):
            # sparse input: explicit arguments win, the rest is copied from it
            sparse_index = data.sp_index if sparse_index is None else sparse_index
            fill_value = data.fill_value if fill_value is None else fill_value
            dtype = data.dtype if dtype is None else dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # a string dtype is either sparse ('Sparse[int]') or plain numpy ('int')
        if isinstance(dtype, compat.string_types):
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value if fill_value is None else fill_value
            dtype = dtype.subtype

        scalar_input = is_scalar(data)
        if index is not None and not scalar_input:
            raise Exception("must only pass scalars with an index ")

        if scalar_input:
            # work out how many times the scalar has to be repeated
            if index is not None:
                if data is None:
                    data = np.nan
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            dtype = infer_dtype_from_scalar(data)[0]
            data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # XXX: What should the empty dtype be? Object or float?
            data = np.array([], dtype=dtype)

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series
                from pandas.core.series import _sanitize_array
                data = _sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []];
                # only retry (as object) when the caller did not ask for a dtype
                if dtype is not None:
                    raise
                dtype = object
                data = np.atleast_1d(np.asarray(data, dtype=dtype))

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            effective_dtype = dtype if dtype is not None else data.dtype
            fill_value = (np.nan if effective_dtype is None
                          else na_value_for_dtype(effective_dtype))

        if sparse_index is None:
            if isinstance(data, type(self)):
                sparse_index = data._sparse_index
                sparse_values = np.asarray(data.sp_values, dtype=dtype)
            else:
                sparse_values, sparse_index, fill_value = make_sparse(
                    data, kind=kind, fill_value=fill_value, dtype=dtype)
        else:
            sparse_values = np.asarray(data, dtype=dtype)
            if len(sparse_values) != sparse_index.npoints:
                message = ("Non array-like type {type} must "
                           "have the same length as the index")
                raise AssertionError(message.format(type=type(sparse_values)))

        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
Exemplo n.º 6
0
    def __init__(self,
                 data,
                 sparse_index=None,
                 index=None,
                 fill_value=None,
                 kind='integer',
                 dtype=None,
                 copy=False):
        """
        Derive the sparse index, sparse values and dtype from ``data``.

        Parameters
        ----------
        data : scalar, array-like, sparse array/series or SingleBlockManager
            Input to store. A ``SingleBlockManager`` is unwrapped through
            ``internal_values()``. Sparse input supplies its ``sp_index``,
            ``fill_value`` and ``dtype`` for whichever of those were not
            passed, and only its ``sp_values`` are kept as data. A scalar is
            expanded into a 1-d array.
        sparse_index : optional
            Existing sparse index. When one is available ``data`` is taken as
            the sparse values and must hold ``sparse_index.npoints`` items.
        index : optional
            Only accepted together with scalar ``data``; its length is the
            length of the expanded array.
        fill_value : scalar, optional
            When still unset after looking at ``dtype`` and ``data``, it is
            the NA value for the resulting dtype.
        kind : default 'integer'
            Passed through to ``make_sparse``.
        dtype : str, dtype or SparseDtype, optional
            A string is tried as a sparse dtype first and as a regular dtype
            second. A ``SparseDtype`` is split into its ``fill_value`` and
            ``subtype``. For scalar ``data`` the dtype is re-inferred from
            the scalar.
        copy : bool, default False
            Whether to call ``data.copy()`` before building the sparse values.

        Raises
        ------
        Exception
            If ``index`` is passed with non-scalar ``data``.
        ValueError
            Propagated from ``_sanitize_array`` when ``dtype`` is set.
        AssertionError
            If the values' length does not match ``sparse_index.npoints``.
        """
        from pandas.core.internals import SingleBlockManager

        if isinstance(data, SingleBlockManager):
            data = data.internal_values()

        if isinstance(dtype, SparseDtype) and fill_value is None:
            fill_value = dtype.fill_value

        if isinstance(data, (type(self), ABCSparseSeries)):
            # sparse input: explicit arguments win, the rest is copied from it
            sparse_index = data.sp_index if sparse_index is None else sparse_index
            fill_value = data.fill_value if fill_value is None else fill_value
            dtype = data.dtype if dtype is None else dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # a string dtype is either sparse ('Sparse[int]') or plain numpy ('int')
        if isinstance(dtype, compat.string_types):
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value if fill_value is None else fill_value
            dtype = dtype.subtype

        scalar_input = is_scalar(data)
        if index is not None and not scalar_input:
            raise Exception("must only pass scalars with an index ")

        if scalar_input:
            # work out how many times the scalar has to be repeated
            if index is not None:
                if data is None:
                    data = np.nan
                npoints = len(index)
            elif sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            dtype = infer_dtype_from_scalar(data)[0]
            data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # XXX: What should the empty dtype be? Object or float?
            data = np.array([], dtype=dtype)

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series
                from pandas.core.series import _sanitize_array
                data = _sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []];
                # only retry (as object) when the caller did not ask for a dtype
                if dtype is not None:
                    raise
                dtype = object
                data = np.atleast_1d(np.asarray(data, dtype=dtype))

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            effective_dtype = dtype if dtype is not None else data.dtype
            fill_value = (np.nan if effective_dtype is None
                          else na_value_for_dtype(effective_dtype))

        if sparse_index is None:
            if isinstance(data, type(self)):
                sparse_index = data._sparse_index
                sparse_values = np.asarray(data.sp_values, dtype=dtype)
            else:
                sparse_values, sparse_index, fill_value = make_sparse(
                    data, kind=kind, fill_value=fill_value, dtype=dtype)
        else:
            sparse_values = np.asarray(data, dtype=dtype)
            if len(sparse_values) != sparse_index.npoints:
                message = ("Non array-like type {type} must "
                           "have the same length as the index")
                raise AssertionError(message.format(type=type(sparse_values)))

        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)