def test_nanvar_with_ddof(self):
    """Sparse nanvar with a non-zero ddof must match np.nanvar on dense data."""
    x = np.random.uniform(0, 10, (20, 100))
    np.fill_diagonal(x, np.nan)
    # Check the global reduction as well as both axes.
    for ax in (None, 0, 1):
        dense_result = np.nanvar(x, axis=ax, ddof=10)
        sparse_result = nanvar(csr_matrix(x), axis=ax, ddof=10)
        np.testing.assert_almost_equal(dense_result, sparse_result)
def __compute_statistics(self):
    """Compute per-variable summary statistics and cache them on `self`.

    Fills `_variable_types`, `_variable_names`, `_min`, `_max`,
    `_dispersion`, `_missing` and `_center`, each with one entry per
    variable in `self.variables`.
    """
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    # Column-wise minima over every numeric variable type.
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    # Dispersion: entropy for discrete variables, coefficient of
    # variation (std / mean) for continuous ones.
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
            x, axis=0),
    )
    # Missing-value counts; string columns count `StringVariable.Unknown`.
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    # Column-wise maxima over every numeric variable type.
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(
            x, *args, **kwargs)[0]  # Temporary replacement for scipy < 1.2.0

    # Center: column mode for discrete, column mean for continuous/time.
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def __compute_statistics(self):
    """Compute per-variable summary statistics and cache them on `self`.

    Fills `_variable_types`, `_variable_names`, `_min`, `_max`,
    `_dispersion`, `_missing` and `_center`, each with one entry per
    variable in `self.variables`.
    """
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    # Column-wise minima over every numeric variable type.
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    # Dispersion: entropy for discrete variables, coefficient of
    # variation (std / mean) for continuous ones.
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
            x, axis=0),
    )
    # Missing-value counts; string columns count `StringVariable.Unknown`.
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    # Column-wise maxima over every numeric variable type.
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(
            x, *args, **kwargs)[0]  # Temporary replacement for scipy

    # Center: column mode for discrete, column mean for continuous/time.
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def get_continuous_stats(self, column):
    """
    Return the mean, the variance and the distance between pairs of
    missing values for the given column, or None for a constant column
    when normalization is requested.

    The method is called by inherited `fit_rows` to construct a
    row-distance model.
    """
    mean = util.nanmean(column)
    var = util.nanvar(column)
    if not self.normalize:
        dist_missing2_cont = 2 * var
        # An all-missing column yields a nan variance; treat it as zero.
        if np.isnan(dist_missing2_cont):
            dist_missing2_cont = 0
    elif var == 0:
        # Constant columns cannot be normalized.
        return None
    else:
        dist_missing2_cont = 1
    return mean, var, dist_missing2_cont
def __compute_statistics(self):
    """Compute per-variable summary statistics and cache them on `self`.

    Fills `_variable_types`, `_variable_names`, `_min`, `_max`,
    `_dispersion`, `_missing` and `_center`, each with one entry per
    variable in `self.variables`.
    """
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    # Column-wise minima over every numeric variable type.
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    # Dispersion: entropy for discrete variables, coefficient of
    # variation (std / mean) for continuous ones.
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
            x, axis=0),
    )
    # Missing-value counts; string columns count `StringVariable.Unknown`.
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    # Column-wise maxima over every numeric variable type.
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )
    # Center: column mode for discrete, column mean for continuous/time.
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def __compute_statistics(self):
    """Recompute and cache the per-variable summary statistics.

    Fills `_variable_types`, `_variable_names`, `_min`, `_max`,
    `_missing`, `_dispersion` and `_center`, one entry per variable.
    """
    # Data matrices can be of mixed sparsity, so each part (attributes,
    # class variables, metas) is processed separately; empty parts are
    # dropped up front.
    parts = [self.__attributes, self.__class_vars, self.__metas]
    parts = [pair for pair in parts if pair[1].size]

    self._variable_types = np.array([type(v) for v in self.variables])
    self._variable_names = np.array(
        [v.name.lower() for v in self.variables])

    def _colwise(func):
        # Wrap a reduction so it is applied along columns (axis 0).
        return lambda x: func(x, axis=0)

    self._min = self.__compute_stat(
        parts,
        discrete_f=_colwise(ut.nanmin),
        continuous_f=_colwise(ut.nanmin),
        time_f=_colwise(ut.nanmin),
    )
    self._max = self.__compute_stat(
        parts,
        discrete_f=_colwise(ut.nanmax),
        continuous_f=_colwise(ut.nanmax),
        time_f=_colwise(ut.nanmax),
    )
    # Missing values: nan counts for numeric columns, explicit Unknown
    # markers for string columns.
    self._missing = self.__compute_stat(
        parts,
        discrete_f=_colwise(ut.countnans),
        continuous_f=_colwise(ut.countnans),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=_colwise(ut.countnans),
    )
    # Dispersion: entropy for discrete variables, coefficient of
    # variation (std / mean) for continuous ones.
    self._dispersion = self.__compute_stat(
        parts,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0))
        / ut.nanmean(x, axis=0),
    )
    # Center: column mode for discrete, column mean for continuous/time.
    self._center = self.__compute_stat(
        parts,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=_colwise(ut.nanmean),
        time_f=_colwise(ut.nanmean),
    )
def test_nanvar(self, array):
    """Sparse nanvar must agree exactly with np.nanvar on the dense data."""
    for dense in self.data:
        sparse = array(dense)
        np.testing.assert_array_equal(nanvar(sparse), np.nanvar(dense))
def coefficient_of_variation(x: np.ndarray) -> np.ndarray:
    """Return the column-wise coefficient of variation (std / mean) of `x`.

    Columns whose mean is (numerically) zero get `inf` instead of a
    division-by-zero result.
    """
    means = ut.nanmean(x, axis=0)
    stds = np.sqrt(ut.nanvar(x, axis=0))
    nonzero = ~np.isclose(means, 0, atol=1e-12)
    out = np.full_like(means, fill_value=np.inf)
    out[nonzero] = stds[nonzero] / means[nonzero]
    return out
def test_nanvar(self, array):
    """Sparse nanvar must agree exactly with np.nanvar on the dense data."""
    for mat in self.data:
        result = nanvar(array(mat))
        expected = np.nanvar(mat)
        np.testing.assert_array_equal(result, expected)
def __compute_statistics(self):
    """Compute per-variable summary statistics and cache them on `self`.

    Fills `_variable_types`, `_variable_names`, `_center`, `_dispersion`,
    `_max`, `_min` and `_missing`, one entry per variable.
    """
    # We will compute statistics over all data at once
    matrices = [self._data.X, self._data._Y, self._data.metas]
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = zip([
        self._domain.attributes, self._domain.class_vars, self._domain.metas
    ], matrices)
    # Filter out any matrices with size 0, filter the zipped matrices to
    # eliminate variables in a single swoop
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                        time_f=None, string_f=None, default_val=np.nan):
        """Apply functions to variable types e.g. discrete_f to discrete
        variables. Default value is returned if there is no function
        defined for specific variable types."""
        attrs, x = attrs_x_pair
        result = np.full(len(attrs), default_val)
        disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = \
            self._attr_indices(attrs)
        if discrete_f and x[:, disc_var_idx].size:
            result[disc_var_idx] = discrete_f(
                x[:, disc_var_idx].astype(np.float64))
        if continuous_f and x[:, cont_var_idx].size:
            result[cont_var_idx] = continuous_f(
                x[:, cont_var_idx].astype(np.float64))
        if time_f and x[:, time_var_idx].size:
            result[time_var_idx] = time_f(
                x[:, time_var_idx].astype(np.float64))
        if string_f and x[:, str_var_idx].size:
            # `np.object` was removed in NumPy 1.24; the builtin `object`
            # is the exact equivalent dtype.
            result[str_var_idx] = string_f(
                x[:, str_var_idx].astype(object))
        return result

    self._variable_types = [type(var) for var in self._attributes]
    self._variable_names = [var.name.lower() for var in self._attributes]

    # Compute the center: mode for discrete, mean for continuous columns
    _center = partial(
        _apply_to_types,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._center = np.hstack(map(_center, matrices))

    # Compute the dispersion: entropy for discrete, variance for
    # continuous columns
    def _entropy(x):
        p = [ut.bincount(row)[0] for row in x.T]
        p = [pk / np.sum(pk) for pk in p]
        return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)

    _dispersion = partial(
        _apply_to_types,
        discrete_f=_entropy,
        continuous_f=lambda x: ut.nanvar(x, axis=0),
    )
    self._dispersion = np.hstack(map(_dispersion, matrices))

    # Compute maximum values
    _max = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._max = np.hstack(map(_max, matrices))

    # Compute minimum values
    _min = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._min = np.hstack(map(_min, matrices))

    # Compute # of missing values
    _missing = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._missing = np.hstack(map(_missing, matrices))
def test_nanvar(self, array):
    """Sparse nanvar must match np.nanvar up to floating-point noise."""
    for dense in self.data:
        # np.nanvar and bn.nanvar differ slightly, hence decimal=14
        np.testing.assert_almost_equal(
            nanvar(array(dense)), np.nanvar(dense), decimal=14)