def guess_categorical(data):
    """Guess whether ``data`` should be treated as categorical.

    Pandas categoricals and ``_CategoricalBox`` instances are always
    categorical. Anything else is converted with ``np.asarray`` and is
    considered categorical unless the resulting dtype is numeric.

    Returns True for categorical data, False otherwise.
    """
    if safe_is_pandas_categorical(data) or isinstance(data, _CategoricalBox):
        return True
    as_array = np.asarray(data)
    # Numeric dtypes are the only thing we treat as non-categorical.
    return not safe_issubdtype(as_array.dtype, np.number)
def sniff(self, data):
    """Record the categorical levels found in one chunk of ``data``.

    Returns a bool: are we confident that we found all the levels?
    True means no further data needs to be examined; False means the
    caller should keep feeding data.

    Raises PatsyError if an item cannot be added to the level set because
    it is not hashable.
    """
    if hasattr(data, "contrast"):
        self._contrast = data.contrast

    if isinstance(data, _CategoricalBox):
        if data.levels is not None:
            # The box carries explicit levels, so there is nothing to infer.
            self._levels = tuple(data.levels)
            return True
        # No explicit levels: unbox and fall through to the generic logic.
        data = data.data

    if safe_is_pandas_categorical(data):
        # pandas.Categorical has its own NA detection, so don't try to
        # second-guess it.
        self._levels = tuple(pandas_Categorical_categories(data))
        return True

    # Fastpath to avoid doing an item-by-item iteration over boolean
    # arrays, as requested by #44.
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        self._level_set = set([True, False])
        return True

    data = _categorical_shape_fix(data)

    for item in data:
        if self._NA_action.is_categorical_NA(item):
            continue
        if item is True or item is False:
            # Seeing either boolean implies both levels exist.
            self._level_set.update([True, False])
            continue
        try:
            self._level_set.add(item)
        except TypeError:
            raise PatsyError("Error interpreting categorical data: "
                             "all items must be hashable",
                             self._origin)

    # If everything we've seen is boolean, assume that everything else
    # would be too. Otherwise we need to keep looking.
    return self._level_set == set([True, False])
def sniff(self, data):
    """Inspect a chunk of ``data`` and accumulate its categorical levels.

    The return value is a bool answering "are we confident that we found
    all the levels?". When it is False, more data has to be sniffed
    before the levels can be considered complete.

    Raises PatsyError when a value is unhashable and therefore cannot be
    stored in the level set.
    """
    if hasattr(data, "contrast"):
        self._contrast = data.contrast

    if isinstance(data, _CategoricalBox):
        if data.levels is None:
            # Nothing declared up front; unbox and inspect the raw data.
            data = data.data
        else:
            self._levels = tuple(data.levels)
            return True

    if safe_is_pandas_categorical(data):
        # pandas.Categorical has its own NA detection, so don't try to
        # second-guess it.
        self._levels = tuple(pandas_Categorical_categories(data))
        return True

    # Boolean arrays get a fastpath so we avoid an item-by-item
    # iteration over them (requested by #44).
    has_bool_dtype = (hasattr(data, "dtype")
                      and safe_issubdtype(data.dtype, np.bool_))
    if has_bool_dtype:
        self._level_set = set([True, False])
        return True

    data = _categorical_shape_fix(data)

    for observed in data:
        if self._NA_action.is_categorical_NA(observed):
            # Missing values never contribute a level.
            continue
        if not (observed is True or observed is False):
            try:
                self._level_set.add(observed)
            except TypeError:
                raise PatsyError(
                    "Error interpreting categorical data: "
                    "all items must be hashable", self._origin)
        else:
            # One boolean observed means both booleans are levels.
            self._level_set.update([True, False])

    # If everything we've seen is boolean, assume that everything else
    # would be too. Otherwise we need to keep looking.
    only_booleans = set([True, False])
    return self._level_set == only_booleans
def categorical_to_int(data, levels, NA_action, origin=None):
    """Convert categorical ``data`` into integer codes indexing ``levels``.

    :arg data: The categorical data. May be a pandas categorical, a
      ``_CategoricalBox``, or any iterable of hashable values.
    :arg levels: Tuple of expected levels; the code assigned to a value is
      its position in this tuple.
    :arg NA_action: Object whose ``is_categorical_NA(value)`` method decides
      whether a value counts as missing.
    :arg origin: Passed through to any ``PatsyError`` that is raised.
    :returns: For pandas categoricals, the categorical's own codes. For
      everything else, an integer array with one code per observation
      (re-wrapped as a ``pandas.Series`` sharing the input's index when the
      input was a Series). Missing values are always mapped to -1.
    :raises PatsyError: If the levels attached to ``data`` do not match
      ``levels``, if a level or value is unhashable, or if a value is not
      one of the expected levels.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        # Use .get() rather than indexing: ``levels`` is not guaranteed to
        # contain both booleans. If it doesn't, fall through to the
        # per-item path below, which either succeeds or reports the
        # mismatch as a proper PatsyError instead of leaking a KeyError.
        if level_to_int.get(False) == 0 and level_to_int.get(True) == 1:
            return data.astype(np.int_)

    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Build an abbreviated listing of the expected levels: all
                # of them when there are few, otherwise the first and last
                # couple separated by "...".
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)

    # Preserve the caller's index when the input was a pandas Series.
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out
def categorical_to_int(data, levels, NA_action, origin=None):
    """Map each observation in ``data`` to the integer index of its level.

    :arg data: The categorical data. May be a pandas categorical, a
      ``_CategoricalBox``, or any iterable of hashable values.
    :arg levels: Tuple of expected levels; a value's code is its position
      in this tuple.
    :arg NA_action: Object whose ``is_categorical_NA(value)`` method decides
      whether a value counts as missing.
    :arg origin: Passed through to any ``PatsyError`` that is raised.
    :returns: For pandas categoricals, the categorical's own codes. For
      everything else, an integer array with one code per observation
      (re-wrapped as a ``pandas.Series`` sharing the input's index when the
      input was a Series). Missing values are always mapped to -1.
    :raises PatsyError: If the levels attached to ``data`` do not match
      ``levels``, if a level or value is unhashable, or if a value is not
      one of the expected levels.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r" %
                (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError(
            "Error interpreting categorical data: "
            "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        # ``levels`` may lack True and/or False, so look them up with
        # .get() instead of indexing. A missing boolean simply disables the
        # fastpath; the per-item loop below then either converts the data
        # or raises a descriptive PatsyError rather than a bare KeyError.
        if level_to_int.get(False) == 0 and level_to_int.get(True) == 1:
            return data.astype(np.int_)

    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Show every expected level when there are only a few;
                # otherwise show the leading and trailing ones around "...".
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [
                        repr(level) for level in levels[:SHOW_LEVELS // 2]
                    ]
                    level_strs.append("...")
                    level_strs += [
                        repr(level) for level in levels[-SHOW_LEVELS // 2:]
                    ]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError(
                    "Error converting data to categorical: "
                    "observation with value %r does not match "
                    "any of the expected levels (expected: %s)" %
                    (value, level_str), origin)
            except TypeError:
                raise PatsyError(
                    "Error converting data to categorical: "
                    "encountered unhashable value %r" % (value, ), origin)

    # Keep the original index when the input was a pandas Series.
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out