def __init__(self, values, categories=None, ordered=None, name=None, fastpath=False, levels=None):
    """Initialize a Categorical from `values`.

    Parameters
    ----------
    values : list-like, Series, Categorical or Index
        The values of the categorical.
    categories : list-like, optional
        Allowed categories; inferred via ``factorize`` when omitted.
    ordered : bool, optional
        Whether the categorical is treated as ordered. When None it is
        inferred (True if the data/categories imply an order).
    name : str, optional
        Name; taken from ``values.name`` when omitted.
    fastpath : bool, default False
        If True, ``values`` are pre-computed codes and are stored without
        any validation or inference.
    levels : list-like, optional
        Deprecated alias for ``categories``.

    Raises
    ------
    ValueError
        If both ``categories`` and the deprecated ``levels`` are passed.
    TypeError
        If ``ordered=True`` but ``values`` cannot be sorted and no explicit
        ``categories`` are given.
    """
    if fastpath:
        # fast path: values are already codes; store everything untouched
        self._codes = values
        self.name = name
        self.categories = categories
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # TODO: Remove after deprecation period in 2017/ after 0.18
    # FIX: idiomatic `is not None` (was the non-PEP8 `not levels is None`)
    if levels is not None:
        warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead",
             FutureWarning)
        if categories is None:
            categories = levels
        else:
            raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', "
                             "use only 'categories'")

    # sanitize input
    if com.is_categorical_dtype(values):
        # we are either a Series or a Categorical: inherit metadata the
        # caller did not specify
        cat = values
        if isinstance(values, com.ABCSeries):
            cat = values.values
        if categories is None:
            categories = cat.categories
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # on numpy < 1.6 datetimelike get inferred to all i8 by
        # _sanitize_array, which is fine, but since factorize does this
        # correctly no need here. This is an issue because _sanitize_array
        # also coerces np.nan to a string under certain versions of numpy.
        values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # On a list with NaNs, int values would be converted to float.
            # Use "object" dtype to prevent this; objects are cast back to
            # int/... in the category assignment step.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if categories is None:
        try:
            codes, categories = factorize(values, sort=True)
            # If the underlying data structure was sortable, and the user
            # doesn't want to "forget" this order, the categorical also is
            # sorted/ordered
            if ordered is None:
                ordered = True
        except TypeError:
            codes, categories = factorize(values, sort=False)
            if ordered:
                # raise, as we don't have a sortable data structure and so
                # the user should give us one by specifying categories
                raise TypeError("'values' is not ordered, please explicitly specify the "
                                "categories order by passing in a categories argument.")
    else:
        # there were two ways if categories are present
        # - the old one, where each value is an int pointer to the levels
        #   array -> not possible anymore, but code outside of pandas could
        #   call us like that, so make some checks
        # - the new one, where each value is also in the categories array
        #   (or np.nan)

        # make sure that we always have the same type here, no matter what
        # we get passed in
        categories = self._validate_categories(categories)
        codes = _get_codes_for_values(values, categories)

        # TODO: check for old style usage. These warnings should be removed
        # after 0.18 / in 2016
        if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
            warn("Values and categories have different dtypes. Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)
        if com.is_integer_dtype(values) and (codes == -1).all():
            warn("None of the categories were found in values. Did you mean to use\n"
                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning)

        # if we got categories, we can assume that the order is intended
        # if ordered is unspecified
        if ordered is None:
            ordered = True

    self.ordered = False if ordered is None else ordered
    self._codes = codes
    self.categories = categories
    self.name = name
def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
    """Build a Categorical from `values`, optionally with explicit `levels`.

    When `fastpath` is set, `values` are taken to be pre-computed codes and
    stored without any validation or inference. `compat` keeps the
    historical calling convention where integer `values` index into
    `levels` directly.
    """
    # Fast path: trust the caller completely and store everything as-is.
    if fastpath:
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    name = getattr(values, 'name', None) if name is None else name

    # --- sanitize input ---------------------------------------------------
    if com.is_categorical_dtype(values):
        # A Series holding a Categorical, or a Categorical itself: borrow
        # its metadata for anything the caller left unspecified.
        cat = values.values if isinstance(values, com.ABCSeries) else values
        levels = cat.levels if levels is None else levels
        ordered = cat.ordered if ordered is None else ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # factorize handles datetimelike inference itself; on numpy < 1.6
        # _sanitize_array would collapse these to i8 (and can coerce np.nan
        # to a string on some numpy versions), so pre-infer here.
        values = com._possibly_infer_to_datetimelike(values)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # A list containing NaN would turn ints into floats; sanitize as
            # object and let the level-assignment step cast back later.
            if com.isnull(values).any():
                dtype = 'object'
            else:
                dtype = None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        # Infer the levels (and codes) by factorizing; sortable input
        # defaults to an ordered categorical unless the caller said no.
        try:
            codes, levels = factorize(values, sort=True)
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # Unsortable data cannot honor ordered=True without an
                # explicit level order from the user.
                raise TypeError("'values' is not ordered, please explicitly specify the level "
                                "order by passing in a level argument.")
        else:
            if ordered is None:
                ordered = True
    else:
        # Explicit levels. Two historical conventions exist:
        #   old — each value is an integer pointer into the levels array
        #   new — each value is itself a member of `levels` (or np.nan)
        levels = self._validate_levels(levels)

        # `compat` keeps the old codes-as-values convention alive; the
        # bounds check guards against plainly invalid codes.
        if compat and (com.is_integer_dtype(values)
                       and (np.min(values) >= -1)
                       and (np.max(values) < len(levels))):
            warn("Using 'values' as codes is deprecated.\n"
                 "'Categorical(... , compat=True)' is only there for historical reasons and "
                 "should not be used in new code!\n"
                 "See https://github.com/pydata/pandas/pull/7217",
                 FutureWarning)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # Explicit levels imply an intended order unless stated otherwise.
        if ordered is None:
            ordered = True

    self.ordered = ordered if ordered is not None else False
    self._codes = codes
    self.levels = levels
    self.name = name
def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False):
    """Construct a Categorical; `levels` may be given or inferred from data.

    `fastpath` short-circuits all inference and stores `values` as codes.
    `compat` preserves the deprecated convention of passing codes in
    `values` together with explicit `levels`.
    """
    if fastpath:
        # Caller guarantees values are valid codes — store verbatim.
        self._codes = values
        self.name = name
        self.levels = levels
        self.ordered = ordered
        return

    if name is None:
        name = getattr(values, 'name', None)

    # Normalize the incoming container.
    if com.is_categorical_dtype(values):
        # Series-of-Categorical or plain Categorical: inherit its
        # levels/ordered for anything the caller left as None.
        if isinstance(values, com.ABCSeries):
            cat = values.values
        else:
            cat = values
        if levels is None:
            levels = cat.levels
        if ordered is None:
            ordered = cat.ordered
        values = values.__array__()
    elif isinstance(values, Index):
        pass
    else:
        # Pre-infer datetimelikes: factorize handles them correctly, while
        # _sanitize_array would collapse them to i8 on numpy < 1.6 (and can
        # coerce np.nan to a string on some numpy versions).
        values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
        if not isinstance(values, np.ndarray):
            values = _convert_to_list_like(values)
            from pandas.core.series import _sanitize_array
            # NaNs in a list would float-ify ints; sanitize as object and
            # let the level-assignment step cast back.
            dtype = 'object' if isnull(values).any() else None
            values = _sanitize_array(values, None, dtype=dtype)

    if levels is None:
        # No levels supplied: derive both codes and levels by factorizing.
        try:
            codes, levels = factorize(values, sort=True)
        except TypeError:
            codes, levels = factorize(values, sort=False)
            if ordered:
                # Data is not sortable, so an ordered categorical needs an
                # explicit level order from the caller.
                raise TypeError(
                    "'values' is not ordered, please explicitly specify the level "
                    "order by passing in a level argument.")
        else:
            # Sortable input defaults to ordered unless the caller said no.
            if ordered is None:
                ordered = True
    else:
        # Explicit levels. Two historical conventions:
        #   old — integer values index directly into `levels`
        #   new — values are themselves members of `levels` (or np.nan)
        levels = self._validate_levels(levels)

        # compat=True keeps the old codes-as-values convention; the bounds
        # test guards against codes outside the valid range.
        if compat and (com.is_integer_dtype(values)
                       and (np.min(values) >= -1)
                       and (np.max(values) < len(levels))):
            warn(
                "Using 'values' as codes is deprecated.\n"
                "'Categorical(... , compat=True)' is only there for historical reasons and "
                "should not be used in new code!\n"
                "See https://github.com/pydata/pandas/pull/7217",
                FutureWarning)
            codes = values
        else:
            codes = _get_codes_for_values(values, levels)

        # Given explicit levels, assume the order is intended.
        if ordered is None:
            ordered = True

    self.ordered = ordered if ordered is not None else False
    self._codes = codes
    self.levels = levels
    self.name = name
def __init__(self, data, sparse_index=None, index=None, fill_value=None,
             kind='integer', dtype=None, copy=False):
    """Initialize a SparseArray from ``data``.

    Parameters
    ----------
    data : array-like, scalar, SparseArray-like or SingleBlockManager
        The values; a scalar is broadcast against ``index`` or
        ``sparse_index``. Existing sparse containers contribute their
        index, fill value and dtype for any argument left as None.
    sparse_index : SparseIndex, optional
        Pre-computed index of the non-fill locations.
    index : Index, optional
        Only valid together with scalar ``data``.
    fill_value : scalar, optional
        The "background" value; inferred from ``dtype``/``data`` if omitted.
    kind : str, default 'integer'
        Kind of SparseIndex to build when one must be inferred.
    dtype : str, dtype or SparseDtype, optional
        Target dtype; 'Sparse[int]'-style strings are accepted.
    copy : bool, default False
        Copy the input data.

    Raises
    ------
    TypeError
        If ``index`` is passed together with non-scalar ``data``.
    AssertionError
        If an explicit ``sparse_index`` disagrees with the data length.
    """
    from pandas.core.internals import SingleBlockManager
    if isinstance(data, SingleBlockManager):
        data = data.internal_values()

    if fill_value is None and isinstance(dtype, SparseDtype):
        fill_value = dtype.fill_value

    if isinstance(data, (type(self), ABCSparseSeries)):
        # disable normal inference on dtype, sparse_index, & fill_value
        if sparse_index is None:
            sparse_index = data.sp_index
        if fill_value is None:
            fill_value = data.fill_value
        if dtype is None:
            dtype = data.dtype
        # TODO: make kind=None, and use data.kind?
        data = data.sp_values

    # Handle user-provided dtype
    if isinstance(dtype, compat.string_types):
        # Two options: dtype='int', regular numpy dtype
        # or dtype='Sparse[int]', a sparse dtype
        try:
            dtype = SparseDtype.construct_from_string(dtype)
        except TypeError:
            dtype = pandas_dtype(dtype)

    if isinstance(dtype, SparseDtype):
        if fill_value is None:
            fill_value = dtype.fill_value
        dtype = dtype.subtype

    if index is not None and not is_scalar(data):
        # FIX: raise TypeError instead of bare Exception for this usage
        # error; TypeError is a subclass of Exception, so any existing
        # `except Exception` handlers keep working.
        raise TypeError("must only pass scalars with an index ")

    if is_scalar(data):
        if index is not None:
            if data is None:
                data = np.nan

        if index is not None:
            npoints = len(index)
        elif sparse_index is None:
            npoints = 1
        else:
            npoints = sparse_index.length

        dtype = infer_dtype_from_scalar(data)[0]
        data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

    if dtype is not None:
        dtype = pandas_dtype(dtype)

    # TODO: disentangle the fill_value dtype inference from dtype inference
    if data is None:
        # XXX: What should the empty dtype be? Object or float?
        data = np.array([], dtype=dtype)

    if not is_array_like(data):
        try:
            # probably shared code in sanitize_series
            from pandas.core.series import _sanitize_array
            data = _sanitize_array(data, index=None)
        except ValueError:
            # NumPy may raise a ValueError on data like [1, []];
            # we retry with object dtype here.
            if dtype is None:
                dtype = object
                data = np.atleast_1d(np.asarray(data, dtype=dtype))
            else:
                raise

    if copy:
        # TODO: avoid double copy when dtype forces cast.
        data = data.copy()

    if fill_value is None:
        fill_value_dtype = data.dtype if dtype is None else dtype
        if fill_value_dtype is None:
            fill_value = np.nan
        else:
            fill_value = na_value_for_dtype(fill_value_dtype)

    if isinstance(data, type(self)) and sparse_index is None:
        sparse_index = data._sparse_index
        sparse_values = np.asarray(data.sp_values, dtype=dtype)
    elif sparse_index is None:
        sparse_values, sparse_index, fill_value = make_sparse(
            data, kind=kind, fill_value=fill_value, dtype=dtype
        )
    else:
        sparse_values = np.asarray(data, dtype=dtype)
        if len(sparse_values) != sparse_index.npoints:
            raise AssertionError("Non array-like type {type} must "
                                 "have the same length as the index"
                                 .format(type=type(sparse_values)))
    self._sparse_index = sparse_index
    self._sparse_values = sparse_values
    self._dtype = SparseDtype(sparse_values.dtype, fill_value)
def __init__(self, data, sparse_index=None, index=None, fill_value=None,
             kind='integer', dtype=None, copy=False):
    """Build the sparse array, inferring sparse structure where needed.

    Scalar ``data`` is broadcast to the length implied by ``index`` or
    ``sparse_index``; an already-sparse input contributes its index, fill
    value and dtype for any argument the caller left as None.
    """
    from pandas.core.internals import SingleBlockManager

    if isinstance(data, SingleBlockManager):
        data = data.internal_values()

    if fill_value is None and isinstance(dtype, SparseDtype):
        fill_value = dtype.fill_value

    if isinstance(data, (type(self), ABCSparseSeries)):
        # Already sparse: take its structure verbatim rather than re-infer.
        if sparse_index is None:
            sparse_index = data.sp_index
        if fill_value is None:
            fill_value = data.fill_value
        if dtype is None:
            dtype = data.dtype
        # TODO: make kind=None, and use data.kind?
        data = data.sp_values

    # A string dtype may be either a plain numpy name ('int') or the sparse
    # spelling ('Sparse[int]').
    if isinstance(dtype, compat.string_types):
        try:
            dtype = SparseDtype.construct_from_string(dtype)
        except TypeError:
            dtype = pandas_dtype(dtype)

    if isinstance(dtype, SparseDtype):
        if fill_value is None:
            fill_value = dtype.fill_value
        dtype = dtype.subtype

    if index is not None and not is_scalar(data):
        raise Exception("must only pass scalars with an index ")

    if is_scalar(data):
        if index is not None and data is None:
            data = np.nan

        if index is not None:
            npoints = len(index)
        elif sparse_index is None:
            npoints = 1
        else:
            npoints = sparse_index.length

        dtype = infer_dtype_from_scalar(data)[0]
        data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

    if dtype is not None:
        dtype = pandas_dtype(dtype)

    # TODO: disentangle the fill_value dtype inference from dtype inference
    if data is None:
        # XXX: What should the empty dtype be? Object or float?
        data = np.array([], dtype=dtype)

    if not is_array_like(data):
        try:
            # probably shared code in sanitize_series
            from pandas.core.series import _sanitize_array
            data = _sanitize_array(data, index=None)
        except ValueError:
            # NumPy may raise ValueError on data like [1, []]; retry with
            # object dtype unless the caller pinned a dtype.
            if dtype is not None:
                raise
            dtype = object
            data = np.atleast_1d(np.asarray(data, dtype=dtype))

    if copy:
        # TODO: avoid double copy when dtype forces cast.
        data = data.copy()

    if fill_value is None:
        inferred = dtype if dtype is not None else data.dtype
        fill_value = np.nan if inferred is None else na_value_for_dtype(inferred)

    if isinstance(data, type(self)) and sparse_index is None:
        sparse_index = data._sparse_index
        sparse_values = np.asarray(data.sp_values, dtype=dtype)
    elif sparse_index is None:
        sparse_values, sparse_index, fill_value = make_sparse(
            data, kind=kind, fill_value=fill_value, dtype=dtype)
    else:
        sparse_values = np.asarray(data, dtype=dtype)
        if len(sparse_values) != sparse_index.npoints:
            raise AssertionError(
                "Non array-like type {type} must "
                "have the same length as the index".format(
                    type=type(sparse_values)))

    self._sparse_index = sparse_index
    self._sparse_values = sparse_values
    self._dtype = SparseDtype(sparse_values.dtype, fill_value)