def __new__(cls, input_array, design_info=None, default_column_prefix="column"):
    """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix.

    A call like::

      DesignMatrix(my_array)

    converts an arbitrary array_like object into a DesignMatrix. The
    return value is guaranteed to be a two-dimensional ndarray with a
    real-valued floating point dtype, and a ``.design_info`` attribute
    which matches its shape.

    If the `design_info` argument is not given, then one is created via
    :meth:`DesignInfo.from_array` using the given `default_column_prefix`.

    Depending on the input array, it is possible this will pass through
    its input unchanged, or create a view.
    """
    # Existing DesignMatrix objects pass straight through -- but only if
    # they actually carry a design_info.  numpy cannot be stopped from
    # producing bare DesignMatrix instances (e.g. my_dm.diagonal()
    # returns a DesignMatrix without a design_info attribute).
    if isinstance(input_array, DesignMatrix) and hasattr(input_array, "design_info"):
        return input_array
    matrix = atleast_2d_column_default(input_array).view(cls)
    # Integer data is safe to upcast to floating point.
    if safe_issubdtype(matrix.dtype, np.integer):
        matrix = np.asarray(matrix, dtype=float).view(cls)
    if matrix.ndim > 2:
        raise ValueError("DesignMatrix must be 2d")
    assert matrix.ndim == 2
    if design_info is None:
        design_info = DesignInfo.from_array(matrix, default_column_prefix)
    if len(design_info.column_names) != matrix.shape[1]:
        raise ValueError("wrong number of column names for design matrix "
                         "(got %s, wanted %s)"
                         % (len(design_info.column_names), matrix.shape[1]))
    matrix.design_info = design_info
    # Anything still not floating point at this stage is rejected.
    if not safe_issubdtype(matrix.dtype, np.floating):
        raise ValueError("design matrix must be real-valued floating point")
    return matrix
def _eval_factor(factor_info, data, NA_action):
    """Evaluate one factor against `data`.

    Returns a ``(result, is_NA)`` pair. For numerical factors, `result`
    is a 2d ndarray or DataFrame; for categorical factors it is a 1d
    integer code array (or pandas.Series) in which -1 marks missing
    values, and `is_NA` is the corresponding boolean mask.
    """
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            # BUG FIX: the format arguments were swapped -- the column
            # count actually *got* is result.shape[1]; the count
            # *expected* is factor_info.num_columns.
            raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (factor.name(), result.shape[1],
                                factor_info.num_columns),
                             factor)
        if not safe_issubdtype(np.asarray(result).dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result.dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # returns either a 1d ndarray or a pandas.Series, plus is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        return result, np.asarray(result == -1)
def from_array(cls, array_like, default_column_prefix="column"):
    """Find or construct a DesignInfo appropriate for a given array_like.

    If `array_like` already carries a ``.design_info`` attribute of the
    right type, that object is returned unchanged. Otherwise a fresh
    DesignInfo is built, with column names taken from the array itself
    (e.g., a pandas DataFrame with named columns) or invented using
    `default_column_prefix`.

    This is how :func:`dmatrix` (for example) creates a DesignInfo object
    if an arbitrary matrix is passed in.

    :arg array_like: An ndarray or pandas container.
    :arg default_column_prefix: If it's necessary to invent column names,
      then this will be used to construct them.
    :returns: a DesignInfo object
    """
    existing = getattr(array_like, "design_info", None)
    if isinstance(existing, cls):
        return existing
    arr = atleast_2d_column_default(array_like, preserve_pandas=True)
    if arr.ndim > 2:
        raise ValueError("design matrix can't have >2 dimensions")
    columns = getattr(arr, "columns", range(arr.shape[1]))
    if hasattr(columns, "dtype") and not safe_issubdtype(columns.dtype, np.integer):
        # Non-integer pandas column labels: use their string forms.
        names = [str(obj) for obj in columns]
    else:
        names = ["%s%s" % (default_column_prefix, i) for i in columns]
    return DesignInfo(names)
def __new__(cls, input_array, design_info=None, default_column_prefix="column"): """Create a DesignMatrix, or cast an existing matrix to a DesignMatrix. A call like:: DesignMatrix(my_array) will convert an arbitrary array_like object into a DesignMatrix. The return from this function is guaranteed to be a two-dimensional ndarray with a real-valued floating point dtype, and a ``.design_info`` attribute which matches its shape. If the `design_info` argument is not given, then one is created via :meth:`DesignInfo.from_array` using the given `default_column_prefix`. Depending on the input array, it is possible this will pass through its input unchanged, or create a view. """ # Pass through existing DesignMatrixes. The design_info check is # necessary because numpy is sort of annoying and cannot be stopped # from turning non-design-matrix arrays into DesignMatrix # instances. (E.g., my_dm.diagonal() will return a DesignMatrix # object, but one without a design_info attribute.) if (isinstance(input_array, DesignMatrix) and hasattr(input_array, "design_info")): return input_array self = atleast_2d_column_default(input_array).view(cls) # Upcast integer to floating point if safe_issubdtype(self.dtype, np.integer): self = np.asarray(self, dtype=float).view(cls) if self.ndim > 2: raise ValueError("DesignMatrix must be 2d") assert self.ndim == 2 if design_info is None: design_info = DesignInfo.from_array(self, default_column_prefix) if len(design_info.column_names) != self.shape[1]: raise ValueError("wrong number of column names for design matrix " "(got %s, wanted %s)" % (len(design_info.column_names), self.shape[1])) self.design_info = design_info if not safe_issubdtype(self.dtype, np.floating): raise ValueError("design matrix must be real-valued floating point") return self
def guess_categorical(data):
    """Heuristically decide whether `data` should be treated as categorical."""
    # Pandas categoricals and explicit _CategoricalBox wrappers are
    # categorical by definition.
    if safe_is_pandas_categorical(data) or isinstance(data, _CategoricalBox):
        return True
    # Numeric dtypes are treated as numerical; everything else
    # (strings, objects, booleans, ...) as categorical.
    return not safe_issubdtype(np.asarray(data).dtype, np.number)
def transform(self, x):
    """Return `x` centered by the mean ``self._sum / self._count``."""
    x = asarray_or_pandas(x)
    # No copy happens here unless x is a DataFrame with heterogeneous
    # types -- and in that case we're munging the types anyway, so a
    # copy is no big deal.
    x_arr = np.asarray(x)
    # Integer input must be upcast so the float mean isn't truncated.
    dt = float if safe_issubdtype(x_arr.dtype, np.integer) else x_arr.dtype
    mean_val = np.asarray(self._sum / self._count, dtype=dt)
    centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
    return pandas_friendly_reshape(centered, x.shape)
def transform(self, x): x = asarray_or_pandas(x) # This doesn't copy data unless our input is a DataFrame that has # heterogeneous types. And in that case we're going to be munging the # types anyway, so copying isn't a big deal. x_arr = np.asarray(x) if safe_issubdtype(x_arr.dtype, np.integer): dt = float else: dt = x_arr.dtype mean_val = np.asarray(self._sum / self._count, dtype=dt) centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val return pandas_friendly_reshape(centered, x.shape)
def code_contrast_matrix(intercept, levels, contrast, default=None):
    """Resolve `contrast` (or `default`) into a ContrastMatrix for `levels`."""
    coded = contrast if contrast is not None else default
    # A contrast class (or factory) is instantiated first.
    if callable(coded):
        coded = coded()
    # Pre-built ContrastMatrix objects pass straight through.
    if isinstance(coded, ContrastMatrix):
        return coded
    arr = np.asarray(coded)
    if safe_issubdtype(arr.dtype, np.number):
        # A raw numeric array becomes a "custom" contrast matrix.
        names = _name_levels("custom", range(arr.shape[1]))
        return ContrastMatrix(arr, names)
    # Otherwise assume an object implementing the contrast-coding API.
    method = coded.code_with_intercept if intercept else coded.code_without_intercept
    return method(levels)
def eval(self, data, NA_action):
    """Evaluate this numeric factor against `data`.

    Returns a ``(result, is_NA)`` pair, where `result` is a 2d ndarray
    or DataFrame with exactly ``self._expected_columns`` columns and
    `is_NA` is the missing-data mask from `NA_action`.
    """
    result = self.factor.eval(self._state, data)
    result = atleast_2d_column_default(result, preserve_pandas=True)
    _max_allowed_dim(2, result, self.factor)
    if result.shape[1] != self._expected_columns:
        # BUG FIX: the format arguments were swapped -- the column count
        # actually *got* is result.shape[1]; the count *expected* is
        # self._expected_columns.
        raise PatsyError("when evaluating factor %s, I got %s columns "
                         "instead of the %s I was expecting"
                         % (self.factor.name(), result.shape[1],
                            self._expected_columns),
                         self.factor)
    if not safe_issubdtype(np.asarray(result).dtype, np.number):
        raise PatsyError("when evaluating numeric factor %s, "
                         "I got non-numeric data of type '%s'"
                         % (self.factor.name(), result.dtype),
                         self.factor)
    return result, NA_action.is_numerical_NA(result)
def sniff(self, data):
    """Inspect one chunk of data, accumulating the observed levels.

    Returns a bool: True when we are confident all levels have been
    found (no further sniffing needed), False otherwise.
    """
    if hasattr(data, "contrast"):
        self._contrast = data.contrast
    if isinstance(data, _CategoricalBox):
        if data.levels is not None:
            # Explicit levels: nothing more to learn.
            self._levels = tuple(data.levels)
            return True
        else:
            # unbox and fall through
            data = data.data
    if safe_is_pandas_categorical(data):
        # pandas.Categorical has its own NA detection, so don't try to
        # second-guess it.
        self._levels = tuple(pandas_Categorical_categories(data))
        return True
    # fastpath to avoid doing an item-by-item iteration over boolean
    # arrays, as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        self._level_set = {True, False}
        return True
    data = _categorical_shape_fix(data)
    for value in data:
        if self._NA_action.is_categorical_NA(value):
            continue
        if value is True or value is False:
            self._level_set.update([True, False])
        else:
            try:
                self._level_set.add(value)
            except TypeError as err:
                # Chain the original TypeError so the unhashable value's
                # type remains visible in the traceback.
                raise PatsyError("Error interpreting categorical data: "
                                 "all items must be hashable",
                                 self._origin) from err
    # If everything we've seen is boolean, assume that everything else
    # would be too. Otherwise we need to keep looking.
    return self._level_set == {True, False}
def sniff(self, data):
    # Inspect one chunk of data, accumulating observed levels into
    # self._level_set (or recording explicit levels in self._levels).
    if hasattr(data, "contrast"):
        self._contrast = data.contrast
    # returns a bool: are we confident that we found all the levels?
    if isinstance(data, _CategoricalBox):
        if data.levels is not None:
            # Explicit levels were supplied: nothing more to learn.
            self._levels = tuple(data.levels)
            return True
        else:
            # unbox and fall through
            data = data.data
    if safe_is_pandas_categorical(data):
        # pandas.Categorical has its own NA detection, so don't try to
        # second-guess it.
        self._levels = tuple(pandas_Categorical_categories(data))
        return True
    # fastpath to avoid doing an item-by-item iteration over boolean
    # arrays, as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        self._level_set = set([True, False])
        return True
    data = _categorical_shape_fix(data)
    for value in data:
        if self._NA_action.is_categorical_NA(value):
            # Missing values don't contribute a level.
            continue
        if value is True or value is False:
            # Treat any boolean as implying both levels exist.
            self._level_set.update([True, False])
        else:
            try:
                self._level_set.add(value)
            except TypeError:
                raise PatsyError(
                    "Error interpreting categorical data: "
                    "all items must be hashable", self._origin)
    # If everything we've seen is boolean, assume that everything else
    # would be too. Otherwise we need to keep looking.
    return self._level_set == set([True, False])
def categorical_to_int(data, levels, NA_action, origin=None):
    """Encode categorical `data` as integer codes indexing into `levels`.

    Each value is mapped to its position in `levels`; missing values map
    to -1. Returns a 1d integer ndarray, or a pandas.Series carrying
    `data`'s index when `data` is a Series.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1
    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)
    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        # Unbox and fall through to the generic encoding below.
        data = data.data
    data = _categorical_shape_fix(data)
    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        # Unhashable levels cannot serve as dict keys.
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)
    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if level_to_int[False] == 0 and level_to_int[True] == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Build an abbreviated level listing for the error
                # message, eliding the middle when there are many levels.
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if have_pandas and isinstance(data, pandas.Series):
        # Preserve the caller's index.
        out = pandas.Series(out, index=data.index)
    return out
def categorical_to_int(data, levels, NA_action, origin=None):
    """Encode categorical `data` as integer codes indexing into `levels`.

    Each value maps to its position in `levels`; missing values map to
    -1. Returns a 1d integer ndarray, or a pandas.Series carrying
    `data`'s index when `data` is a Series.
    """
    assert isinstance(levels, tuple)
    # Missing values are always mapped to -1 in this function.
    if safe_is_pandas_categorical(data):
        observed = tuple(pandas_Categorical_categories(data))
        if observed != levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r"
                % (levels, observed), origin)
        # pandas.Categorical also encodes NA as -1, and we don't
        # second-guess its NA detection, so its codes pass through.
        return pandas_Categorical_codes(data)
    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError(
                "mismatching levels: expected %r, got %r"
                % (levels, tuple(data.levels)), origin)
        # Unbox and fall through to the generic encoding below.
        data = data.data
    data = _categorical_shape_fix(data)
    try:
        code_for_level = {level: code for code, level in enumerate(levels)}
    except TypeError:
        # Unhashable levels cannot serve as dict keys.
        raise PatsyError(
            "Error interpreting categorical data: "
            "all items must be hashable", origin)
    # Fastpath: skip the item-by-item loop for boolean arrays (see #44).
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        if code_for_level[False] == 0 and code_for_level[True] == 1:
            return data.astype(np.int_)
    codes = np.empty(len(data), dtype=int)
    for idx, item in enumerate(data):
        if NA_action.is_categorical_NA(item):
            codes[idx] = -1
            continue
        try:
            codes[idx] = code_for_level[item]
        except KeyError:
            # Build an abbreviated level listing for the error message,
            # eliding the middle when there are many levels.
            SHOW_LEVELS = 4
            if len(levels) <= SHOW_LEVELS:
                shown = [repr(level) for level in levels]
            else:
                shown = [repr(level) for level in levels[:SHOW_LEVELS // 2]]
                shown.append("...")
                shown += [repr(level) for level in levels[-SHOW_LEVELS // 2:]]
            level_str = "[%s]" % (", ".join(shown))
            raise PatsyError(
                "Error converting data to categorical: "
                "observation with value %r does not match "
                "any of the expected levels (expected: %s)"
                % (item, level_str), origin)
        except TypeError:
            raise PatsyError(
                "Error converting data to categorical: "
                "encountered unhashable value %r" % (item,), origin)
    if have_pandas and isinstance(data, pandas.Series):
        # Preserve the caller's index.
        codes = pandas.Series(codes, index=data.index)
    return codes