def test_empty_print(self):
    """Repr of an empty Categorical shows its name (if any) and levels."""
    factor = Categorical([], ["a", "b", "c"], name="cat")
    expected = ("Categorical([], Name: cat, Levels (3): "
                "Index([a, b, c], dtype=object)")
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    # raw strings so the regex escapes are not mis-read as string escapes
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)
    self.assertEqual(actual, expected)

    # unnamed empty categorical omits the "Name:" part
    factor = Categorical([], ["a", "b", "c"])
    expected = ("Categorical([], Levels (3): "
                "Index([a, b, c], dtype=object)")
    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)
    self.assertEqual(actual, expected)

    # fully empty: no values and no levels
    factor = Categorical([], [])
    expected = ("Categorical([], Levels (0): "
                "Index([], dtype=object)")
    self.assertEqual(repr(factor), expected)
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this is
        always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """
    from pandas.core.categorical import Categorical

    def convert_categorical(x):
        # coerce to object dtype
        if com.is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    # anything beyond object/category in the mix means we cannot stay
    # categorical; fall back to an object-dtype concat
    if get_dtype_kinds(to_concat) - set(['object', 'category']):
        # convert to object type and perform a regular concat
        return _concat_compat([np.array(x, copy=False, dtype=object)
                               for x in to_concat], axis=0)

    # we could have object blocks and categoricals here
    # if we only have a single categoricals then combine everything
    # else its a non-compat categorical
    categoricals = [x for x in to_concat
                    if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    categories = categoricals[0]
    rawcats = categories.categories
    for x in categoricals[1:]:
        if not categories.is_dtype_equal(x):
            raise ValueError("incompatible categories in categorical concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concating numeric types is much faster than concating object types
        # and fastpath takes a shorter path through the constructor
        return Categorical(np.concatenate([x.codes for x in to_concat],
                                          axis=0),
                           rawcats,
                           ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        # mixed categorical/object inputs: coerce everything to values and
        # rebuild the Categorical from scratch against the shared categories
        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
                                   axis=0)
        return Categorical(concatted, rawcats)
def melt_stub(df, stub, i, j, value_vars, sep):
    """Melt one stub group of wide columns to long form, indexed by i + [j].

    The suffix column ``j`` keeps only the part of each column name that
    follows ``stub + sep``.
    """
    value_name = stub.rstrip(sep)
    molten = melt(df, id_vars=i, value_vars=value_vars,
                  value_name=value_name, var_name=j)

    suffix_pattern = re.escape(stub + sep)
    molten[j] = Categorical(molten[j])
    molten[j] = molten[j].str.replace(suffix_pattern, "")

    return molten.set_index(i + [j])
def get_result(self):
    """Assemble the reshaped DataFrame from new values, columns and index.

    Drops columns whose level values were never observed, and re-wraps
    columns as Categorical when the original values were categorical.
    """
    # TODO: find a better way than this masking business

    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only columns that received at least one real value
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        # rebuild each column as a Categorical with the original dtype
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def _create_categorical(self, data, categories=None, ordered=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing

    Returns
    -------
    Categorical
    """
    if isinstance(data, ABCCategorical):
        # already a Categorical: adjust only the properties actually given
        if categories is not None:
            data = data.set_categories(categories)
        if ordered is not None:
            data = data.set_ordered(ordered)
        return data

    # not categorical yet: build one; an unspecified ``ordered``
    # defaults to False for brand-new categoricals
    from pandas.core.categorical import Categorical
    if ordered is None:
        ordered = False
    return Categorical(data, categories=categories, ordered=ordered)
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Assign each value of ``x`` to one of the intervals defined by ``bins``.

    Parameters
    ----------
    x : array-like (a Series contributes its name)
    bins : ndarray of sorted bin edges
    right : bool, close the right edge of each interval
    labels : False for raw integer codes, None to auto-format interval
        labels, or an explicit sequence of len(bins) - 1 labels
    retbins : bool, also return the bins used
    precision : starting float precision for auto-formatted labels
    name : optional name for the resulting Categorical
    include_lowest : bool, make the very first edge inclusive

    Returns
    -------
    Categorical (or ndarray when labels is False); with ``retbins``,
    a ``(fac, bins)`` tuple.

    Raises
    ------
    ValueError
        If bin edges are not unique, or labels has the wrong length.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        # was a bare ``Exception``; ValueError is the conventional (and
        # backward-compatible) type for a bad argument value
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # out-of-range values land at 0 or len(bins); treat them (and NaN) as NA
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until the formatted interval labels are unique
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        # ids are 1-based; shift to 0-based codes (NA positions become -1)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):
    """Assign each value of ``x`` to one of the intervals defined by ``bins``.

    Returns a ``(fac, bins)`` tuple where ``fac`` is an ordered Categorical
    of interval labels (or raw integer codes when ``labels is False``).
    ``duplicates='drop'`` silently de-duplicates the bin edges instead of
    raising.
    """
    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # out-of-range values land at 0 or len(bins); treat them (and NaN) as NA
    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until the formatted interval labels are unique
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        # ids are 1-based; shift to 0-based codes (NA positions become -1)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
def sparse_dummies(df, column):
    """Returns sparse OHE matrix for the column of the dataframe.

    Parameters
    ----------
    df : DataFrame
    column : column label to one-hot encode

    Returns
    -------
    (csr_matrix, ndarray of str)
        An N x n_categories one-hot matrix and the generated column names
        ``"<column>_<i>"`` (one per category, in category order).
    """
    categories = Categorical(df[column])
    column_names = np.array(
        [f"{column}_{str(i)}" for i in range(len(categories.categories))])
    N = len(categories)
    # np.int was removed in NumPy >= 1.20; np.intp is the platform index type
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # categories.codes gives each row's category index -> one 1 per row
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    """Assign each value of ``x`` to one of the intervals defined by ``bins``.

    Interval labels are formatted inline as ``(a, b]`` / ``[a, b)`` strings.
    Returns a Categorical (or raw codes when ``labels is False``); with
    ``retbins``, a ``(fac, bins)`` tuple.
    """
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # out-of-range values land at 0 or len(bins); treat them (and NaN) as NA
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            if right:
                # right-closed intervals: (a, b]
                levels = ['(%s, %s]' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
                if include_lowest:
                    # close the very first interval on the left: [a, b]
                    levels[0] = '[' + levels[0][1:]
            else:
                # left-closed intervals: [a, b)
                levels = ['[%s, %s)' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        # ids are 1-based; shift to 0-based codes (NA positions become -1)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
def melt_stub(df, stub, i, j, value_vars, sep):
    """Melt one stub group of wide columns to long form, indexed by i + [j].

    The suffix column ``j`` keeps only the part of each column name that
    follows ``stub + sep``; numeric suffixes are cast to int/float.
    """
    long_df = melt(df, id_vars=i, value_vars=value_vars,
                   value_name=stub.rstrip(sep), var_name=j)

    stub_pattern = re.escape(stub + sep)
    long_df[j] = Categorical(long_df[j])
    long_df[j] = long_df[j].str.replace(stub_pattern, "")

    # GH17627 Cast numerics suffixes to int/float
    long_df[j] = to_numeric(long_df[j], errors='ignore')

    return long_df.set_index(i + [j])
def where(self, cond, other=None):
    """Keep entries where ``cond`` holds; elsewhere substitute ``other``.

    ``other`` defaults to this index's NA value. The result keeps the
    same categories and orderedness as ``self``.
    """
    if other is None:
        other = self._na_value

    from pandas.core.categorical import Categorical

    selected = np.where(cond, self.values, other)
    wrapped = Categorical(selected,
                          categories=self.categories,
                          ordered=self.ordered)
    return self._shallow_copy(wrapped, **self._get_attributes_dict())
def test_na_flags_int_levels(self):
    """Codes of -1 are reported as null even with integer levels."""
    # #1457
    levels = range(10)
    labels = np.random.randint(0, 10, 20)
    labels[::5] = -1

    cat = Categorical(labels, levels)
    repr(cat)

    # ``assert_`` is a long-deprecated unittest alias; use assertTrue
    self.assertTrue(np.array_equal(com.isnull(cat), labels == -1))
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe.

    Parameters
    ----------
    df : DataFrame
    column : column label to one-hot encode

    Returns
    -------
    (csr_matrix, ndarray of str)
        An N x n_categories one-hot matrix and the generated column names
        ``"<column>_<i>"`` (one per category, in category order).
    '''
    categories = Categorical(df[column])
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int was removed in NumPy >= 1.20; np.intp is the platform index type
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # categories.codes gives each row's category index -> one 1 per row
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def _indicator_post_merge(self, result):
    """Populate the merge-indicator column and drop the temp indicators.

    The per-side indicator columns hold 1 (left) and 2 (right), so their
    sum encodes provenance: 1 -> left_only, 2 -> right_only, 3 -> both.
    """
    # rows absent from a side have NaN in that side's indicator; treat as 0
    result['_left_indicator'] = result['_left_indicator'].fillna(0)
    result['_right_indicator'] = result['_right_indicator'].fillna(0)

    result[self.indicator_name] = Categorical(
        (result['_left_indicator'] + result['_right_indicator']),
        categories=[1, 2, 3])
    # rename the numeric provenance codes to the user-facing labels
    result[self.indicator_name] = (
        result[self.indicator_name]
        .cat.rename_categories(['left_only', 'right_only', 'both']))

    result = result.drop(labels=['_left_indicator', '_right_indicator'],
                         axis=1)
    return result
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not have the same
        dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(com.is_dtype_equal(c.categories.dtype,
                                  first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    cats = first.categories
    # union of all categories, keeping first-appearance order
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    new_codes = []
    for c in to_union:
        # remap each input's codes onto the combined category index
        indexer = categories.get_indexer(c.categories)
        new_codes.append(indexer.take(c.codes))
    codes = np.concatenate(new_codes)
    return Categorical(codes, categories=categories, ordered=False,
                       fastpath=True)
def test_big_print(self):
    """Long Categoricals truncate the repr and append a length footer."""
    factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'],
                         name='cat')
    expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c",
                " a", " b", " c", " a",
                "...",
                " c", " a", " b", " c", " a", " b", " c", " a", " b",
                " c", " a", " b", " c",
                "Levels (3): Index([a, b, c], dtype=object)",
                "Name: cat, Length: 600"]
    expected = "\n".join(expected)

    # hack because array_repr changed in numpy > 1.6.x
    actual = repr(factor)
    # raw strings so the regex escapes are not mis-read as string escapes
    pat = r"Index\(\['a', 'b', 'c']"
    sub = "Index([a, b, c]"
    actual = re.sub(pat, sub, actual)

    self.assertEqual(actual, expected)
def test_describe(self):
    """describe() tabulates counts and relative frequencies per level."""
    # string type
    result = self.factor.describe()
    expected = DataFrame.from_dict(
        dict(counts=[3, 2, 3],
             freqs=[3 / 8., 2 / 8., 3 / 8.],
             levels=['a', 'b', 'c'])).set_index('levels')
    tm.assert_frame_equal(result, expected)

    # check an integer one
    result = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe()
    expected = DataFrame.from_dict(
        dict(counts=[5, 3, 3],
             freqs=[5 / 11., 3 / 11., 3 / 11.],
             levels=[1, 2, 3])).set_index('levels')
    tm.assert_frame_equal(result, expected)
def get_result(self):
    """Assemble the reshaped frame from new values, columns and index.

    Columns are re-wrapped as Categorical when the original values were
    categorical, preserving the source categories and orderedness.
    """
    values, _ = self.get_new_values()
    new_columns = self.get_new_columns()
    new_index = self.get_new_index()

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        cats = self.is_categorical.categories
        is_ordered = self.is_categorical.ordered
        values = [Categorical(values[:, col], categories=cats,
                              ordered=is_ordered)
                  for col in range(values.shape[-1])]

    return self.constructor(values, index=new_index, columns=new_columns)
def sparse_dummies(df, column):
    '''Returns sparse OHE matrix for the column of the dataframe.

    Parameters
    ----------
    df : DataFrame
    column : column label to one-hot encode

    Returns
    -------
    (csr_matrix, ndarray of str)
        An N x n_categories one-hot matrix and the generated column names
        ``"<column>_<i>"`` (one per category, in category order).
    '''
    # (leftover debug ``print`` calls removed)
    categories = Categorical(df[column])
    column_names = np.array([
        "{}_{}".format(column, str(i))
        for i in range(len(categories.categories))
    ])
    N = len(categories)
    # np.int was removed in NumPy >= 1.20; np.intp is the platform index type
    row_numbers = np.arange(N, dtype=np.intp)
    ones = np.ones((N, ))
    # categories.codes encode the string as its category index;
    # place a 1 at (row, row's category)
    return csr_matrix((ones, (row_numbers, categories.codes))), column_names
def lexsort_indexer(keys, orders=None, na_position='last'):
    """Return an indexer that lexicographically sorts by the given keys.

    Each key is coerced to an ordered Categorical and its codes are
    transformed per the requested sort direction and NA placement, then
    combined via ``indexer_from_factorized``.
    """
    from pandas.core.categorical import Categorical

    labels = []
    shape = []
    # a single bool applies to every key; None means ascending everywhere
    if isinstance(orders, bool):
        orders = [orders] * len(keys)
    elif orders is None:
        orders = [True] * len(keys)

    for key, order in zip(keys, orders):

        # we are already a Categorical
        if is_categorical_dtype(key):
            c = key

        # create the Categorical
        else:
            c = Categorical(key, ordered=True)

        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        n = len(c.categories)
        codes = c.codes.copy()

        # code -1 marks NA; remap codes so NAs sort to the requested end
        mask = (c.codes == -1)
        if order:  # ascending
            if na_position == 'last':
                codes = np.where(mask, n, codes)
            elif na_position == 'first':
                codes += 1
        else:  # not order means descending
            if na_position == 'last':
                codes = np.where(mask, n, n - codes - 1)
            elif na_position == 'first':
                codes = np.where(mask, 0, n - codes)
        if mask.any():
            # NAs occupy an extra slot in this key's code space
            n += 1

        shape.append(n)
        labels.append(codes)

    return indexer_from_factorized(labels, shape)
def _create_categorical(self, data, categories=None, ordered=None,
                        dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    if (isinstance(data, (ABCSeries, type(self))) and
            is_categorical_dtype(data)):
        # unwrap a Series / same-type index down to the raw Categorical
        data = data.values

    if not isinstance(data, ABCCategorical):
        # a brand-new Categorical defaults to unordered unless a dtype
        # carries the ordered flag
        if ordered is None and dtype is None:
            ordered = False
        from pandas.core.categorical import Categorical
        data = Categorical(data, categories=categories,
                           ordered=ordered, dtype=dtype)
    else:
        from pandas.core.dtypes.dtypes import CategoricalDtype

        if categories is not None:
            data = data.set_categories(categories, ordered=ordered)
        elif ordered is not None and ordered != data.ordered:
            data = data.set_ordered(ordered)
        if isinstance(dtype, CategoricalDtype):
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)
    return data
def where(self, cond, other=None):
    """
    .. versionadded:: 0.19.0

    Return an Index of same shape as self and whose corresponding
    entries are from self where cond is True and otherwise are from
    other.

    Parameters
    ----------
    cond : boolean same length as self
    other : scalar, or array-like

    Returns
    -------
    Index
        Same categories and orderedness as ``self``; positions where
        ``cond`` is False come from ``other`` (NA by default).
    """
    if other is None:
        # default replacement is this index's NA sentinel
        other = self._na_value
    values = np.where(cond, self.values, other)

    from pandas.core.categorical import Categorical
    cat = Categorical(values,
                      categories=self.categories,
                      ordered=self.ordered)
    return self._shallow_copy(cat, **self._get_attributes_dict())
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array, categories will be ordered as they appear in the list

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical dtypes (categories + ordered): just concatenate codes
        return Categorical(np.concatenate([c.codes for c in to_union]),
                           categories=first.categories,
                           ordered=first.ordered,
                           fastpath=True)
    elif all(not c.ordered for c in to_union):
        # not ordered
        pass
    else:
        # to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    cats = first.categories
    # union of all categories, keeping first-appearance order
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    new_codes = []
    for c in to_union:
        if len(c.categories) > 0:
            # remap each input's codes onto the combined category index
            indexer = categories.get_indexer(c.categories)
            new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
        else:
            # must be all NaN
            new_codes.append(c.codes)
    new_codes = np.concatenate(new_codes)
    return Categorical(new_codes, categories=categories, ordered=False,
                       fastpath=True)
def union_categoricals(to_union, sort_categories=False):
    """
    Combine list-like of Categoricals, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals
    sort_categories : boolean, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    if not all(is_dtype_equal(other.categories.dtype,
                              first.categories.dtype)
               for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and ordered:
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # recode against the sorted category order
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories
                                        for c in to_union[1:]])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                # remap each input's codes onto the combined category index
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
def empty(types, size, cats=None, cols=None, index_type=None,
          index_name=None):
    """Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string of None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo]}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integers, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, will assume 16-bit integers (a reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    index_type: dtype, 'category', None or False
        If set, also allocate an index of this type (with a view into it).
    index_name: label
        Required when ``index_type`` is given; name of the index.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of numpy views, keyed by column, of the columns of the
      dataframe. Assign to this.
    """
    df = DataFrame()
    views = {}

    if isinstance(types, str):
        types = types.split(',')
    # BUG FIX: the previous default was ``range(cols)``, which raises
    # TypeError when cols is None; default to one positional label per dtype
    cols = cols if cols is not None else range(len(types))

    for t, col in zip(types, cols):
        if str(t) == 'category':
            if cats is None or col not in cats:
                # unknown labels: reserve 16-bit worth of temporary codes
                df[str(col)] = Categorical(
                    [], categories=RangeIndex(0, 2**14),
                    fastpath=True)
            elif isinstance(cats[col], int):
                df[str(col)] = Categorical(
                    [], categories=RangeIndex(0, cats[col]),
                    fastpath=True)
            else:  # explicit labels list
                df[str(col)] = Categorical([], categories=cats[col],
                                           fastpath=True)
        else:
            df[str(col)] = np.empty(0, dtype=t)

    if index_type is not None and index_type is not False:
        if index_name is None:
            raise ValueError('If using an index, must give an index name')
        if str(index_type) == 'category':
            if cats is None or index_name not in cats:
                c = Categorical([], categories=RangeIndex(0, 2**14),
                                fastpath=True)
            elif isinstance(cats[index_name], int):
                c = Categorical([],
                                categories=RangeIndex(0, cats[index_name]),
                                fastpath=True)
            else:  # explicit labels list
                c = Categorical([], categories=cats[index_name],
                                fastpath=True)
            # (leftover debug ``print`` removed here)
            vals = np.empty(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # point the index's codes at our freshly allocated buffer so
            # callers can fill it in place via the returned view
            index._data._codes = vals
            views[index_name] = vals
        else:
            index = np.empty(size, dtype=index_type)
            views[index_name] = index
        axes = [df._data.axes[0], index]
    else:
        axes = [df._data.axes[0], RangeIndex(size)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)

        new_block = block.make_block_same_class(values=values)
        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if str(dtype) == 'category':
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            else:
                views[col] = block.values[i]

    if index_name is not None and index_name is not False:
        df.index.name = index_name
        if str(index_type) == 'category':
            views[index_name + '-catdef'] = df._data.axes[1].values

    return df, views
def _bins_to_cuts_new(
    x,
    bins,
    right=True,
    labels=None,
    retbins=False,
    precision=3,
    name=None,
    include_lowest=False,
):
    """Assign each value of ``x`` to one of the intervals defined by ``bins``.

    Variant that de-duplicates and sorts the bin edges up front, and
    re-wraps the result as a Series (with the original index) when the
    input was a Series.
    """
    x_is_series = isinstance(x, Series)
    series_index = None

    # Added this line to the original code
    bins = np.array(sorted(list(set(bins))))

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = "left" if right else "right"
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        # values equal to the lowest edge belong in the first bin
        ids[x == bins[0]] = 1

    # out-of-range values land at 0 or len(bins); treat them (and NaN) as NA
    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            # bump precision until the formatted interval labels are unique
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than "
                                 "the number of bin edges")
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        # ids are 1-based; shift to 0-based codes (NA positions become -1)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        fac = Series(fac, index=series_index, name=name)

    if not retbins:
        return fac

    return fac, bins
def factorize(index):
    """Return ``(categories, codes)`` for an index.

    A unique index is its own category set with positional codes;
    otherwise build an ordered Categorical and use its codes.
    """
    if index.is_unique:
        # every position is its own code
        return index, np.arange(len(index))

    as_cat = Categorical(index, ordered=True)
    return as_cat.categories, as_cat.codes
def test_levels_none(self):
    """Omitting levels infers them from the data, matching the fixture."""
    factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    # ``assert_`` is a long-deprecated unittest alias; use assertTrue
    self.assertTrue(factor.equals(self.factor))