def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])

    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
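# Illustrative sketch (not part of the widget): the continuous dispersion
# above is the coefficient of variation, sqrt(var) / mean, computed with
# NaN-aware reductions. Plain numpy reproduces the same quantity on a toy
# column; ut.nanvar / ut.nanmean are assumed to behave like their numpy
# counterparts here.
import numpy as np

col = np.array([2.0, 4.0, np.nan, 6.0])
dispersion = np.sqrt(np.nanvar(col)) / np.nanmean(col)
print(dispersion)  # ~0.408: std of [2, 4, 6] divided by their mean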
def test_nanmin_nanmax(self):
    warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
    for X in self.data:
        X_sparse = csr_matrix(X)
        for axis in [None, 0, 1]:
            np.testing.assert_array_equal(nanmin(X, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(nanmin(X_sparse, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(nanmax(X, axis=axis), np.nanmax(X, axis=axis))
            np.testing.assert_array_equal(nanmax(X_sparse, axis=axis), np.nanmax(X, axis=axis))
def update_sel_range(self, y_data):
    if y_data is None:
        curve1 = curve2 = pg.PlotDataItem(x=self.x_data, y=self.__mean)
    else:
        curve1 = pg.PlotDataItem(x=self.x_data, y=nanmin(y_data, axis=0))
        curve2 = pg.PlotDataItem(x=self.x_data, y=nanmax(y_data, axis=0))
    self.sel_range.setCurves(curve1, curve2)
def _get_histogram_edges(self):
    """Get the edges in the histogram based on the attribute type.

    In case of a continuous variable, we split the variable range into
    n bins. In case of a discrete variable, bins don't make sense, so we
    just return the attribute values.

    This will return the starting and ending edge, not just the edges in
    between (in the case of a continuous variable).

    Returns
    -------
    np.ndarray

    """
    if self.attribute.is_discrete:
        return np.array([self.attribute.to_val(v) for v in self.attribute.values])
    else:
        edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x), self.n_bins)
        edge_diff = edges[1] - edges[0]
        edges = np.hstack((edges, [edges[-1] + edge_diff]))

        # If the variable takes on a single value, we still need to spit
        # out some reasonable bin edges
        if np.all(edges == edges[0]):
            edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

        return edges
def _get_histogram_edges(self):
    """Get the edges in the histogram based on the attribute type.

    In case of a continuous variable, we split the variable range into
    n bins. In case of a discrete variable, bins don't make sense, so we
    just return the attribute values.

    This will return the starting and ending edge, not just the edges in
    between (in the case of a continuous variable).

    Returns
    -------
    np.ndarray

    """
    if self.attribute.is_discrete:
        return np.array(
            [self.attribute.to_val(v) for v in self.attribute.values])
    else:
        edges = np.linspace(ut.nanmin(self.x), ut.nanmax(self.x), self.n_bins)
        edge_diff = edges[1] - edges[0]
        edges = np.hstack((edges, [edges[-1] + edge_diff]))

        # If the variable takes on a single value, we still need to spit
        # out some reasonable bin edges
        if np.all(edges == edges[0]):
            edges = np.array([edges[0] - 1, edges[0], edges[0] + 1])

        return edges
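# Illustrative sketch of the continuous branch above: n_bins points are
# spread over [nanmin, nanmax] and one extra edge is appended, so the
# returned array contains both the starting and the ending edge. Toy
# numbers only; `x` and `n_bins` stand in for the instance attributes.
import numpy as np

x, n_bins = np.array([0.0, 2.0, np.nan, 10.0]), 5
edges = np.linspace(np.nanmin(x), np.nanmax(x), n_bins)   # [0. 2.5 5. 7.5 10.]
edges = np.hstack((edges, [edges[-1] + (edges[1] - edges[0])]))
print(edges)  # [ 0.   2.5  5.   7.5  10.  12.5]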
def _get_range_curve(self):
    color = QColor(self.color)
    color.setAlpha(LinePlotStyle.RANGE_ALPHA)
    bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
    return pg.FillBetweenItem(
        pg.PlotDataItem(x=self.x_data, y=bottom),
        pg.PlotDataItem(x=self.x_data, y=top),
        brush=color
    )
def _get_range_curve(self):
    color = QColor(self.color)
    color.setAlpha(self.graph.range_settings[Updater.ALPHA_LABEL])
    bottom, top = nanmin(self.y_data, axis=0), nanmax(self.y_data, axis=0)
    return pg.FillBetweenItem(
        pg.PlotDataItem(x=self.x_data, y=bottom),
        pg.PlotDataItem(x=self.x_data, y=top),
        brush=color
    )
def test_nanmin_nanmax(self):
    for X in self.data:
        X_sparse = csr_matrix(X)
        for axis in [None, 0, 1]:
            np.testing.assert_array_equal(
                nanmin(X, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(
                nanmin(X_sparse, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(
                nanmax(X, axis=axis), np.nanmax(X, axis=axis))
            np.testing.assert_array_equal(
                nanmax(X_sparse, axis=axis), np.nanmax(X, axis=axis))
def test_nanmin_nanmax(self):
    warnings.filterwarnings("ignore", r".*All-NaN slice encountered.*")
    for X in self.data:
        X_sparse = csr_matrix(X)
        for axis in [None, 0, 1]:
            np.testing.assert_array_equal(
                nanmin(X, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(
                nanmin(X_sparse, axis=axis), np.nanmin(X, axis=axis))
            np.testing.assert_array_equal(
                nanmax(X, axis=axis), np.nanmax(X, axis=axis))
            np.testing.assert_array_equal(
                nanmax(X_sparse, axis=axis), np.nanmax(X, axis=axis))
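# Illustrative sketch of the semantics these tests assert, not the library's
# actual implementation: a NaN-aware column minimum for a sparse matrix must
# ignore NaNs but still treat implicitly stored zeros as real zeros, so the
# result matches np.nanmin on the dense array. `sparse_nanmin_axis0` is a
# hypothetical helper written only for this demonstration.
import numpy as np
from scipy.sparse import csr_matrix

def sparse_nanmin_axis0(m):
    """Column-wise minimum of a sparse matrix, ignoring NaNs and
    accounting for implicit zeros."""
    m = m.tocsc()
    n_rows, n_cols = m.shape
    out = np.full(n_cols, np.nan)
    for j in range(n_cols):
        col = m.data[m.indptr[j]:m.indptr[j + 1]]
        vals = col[~np.isnan(col)]
        # Columns with fewer stored entries than rows contain implicit zeros
        if m.indptr[j + 1] - m.indptr[j] < n_rows:
            vals = np.append(vals, 0.0)
        if vals.size:
            out[j] = vals.min()
    return out

X = np.array([[1.0, np.nan, -2.0],
              [0.0, 5.0, np.nan],
              [3.0, 0.0, 4.0]])
assert np.allclose(sparse_nanmin_axis0(csr_matrix(X)), np.nanmin(X, axis=0))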
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])

    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
            x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array([var.name.lower() for var in self.variables])

    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def __call__(self, data: Table, attribute):
    values, _ = data.get_column_view(attribute)
    points = []
    if values.size:
        mn, mx = ut.nanmin(values), ut.nanmax(values)
        if not np.isnan(mn):
            minf = int(1 + np.floor(mn / self.width))
            maxf = int(1 + np.floor(mx / self.width))
            if maxf - minf - 1 >= 100:
                raise TooManyIntervals
            points = [i * self.width for i in range(minf, maxf)]
    return Discretizer.create_discretized_var(
        data.domain[attribute], points, ndigits=self.digits)
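# Worked example of the cut-point arithmetic above, with illustrative
# numbers: every multiple of the width that falls strictly inside the
# data range becomes a cut point.
import numpy as np

width, mn, mx = 0.5, 0.2, 2.3
minf, maxf = int(1 + np.floor(mn / width)), int(1 + np.floor(mx / width))
print([i * width for i in range(minf, maxf)])  # [0.5, 1.0, 1.5, 2.0]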
def __call__(self, data, attribute, fixed=None):
    if fixed:
        min, max = fixed[attribute.name]
        points = self._split_eq_width(min, max)
    else:
        if type(data) == SqlTable:
            stats = BasicStats(data, attribute)
            points = self._split_eq_width(stats.min, stats.max)
        else:
            values = data[:, attribute]
            values = values.X if values.X.size else values.Y
            min, max = ut.nanmin(values), ut.nanmax(values)
            points = self._split_eq_width(min, max)
    return Discretizer.create_discretized_var(
        data.domain[attribute], points)
def __call__(self, data: Table, attribute, fixed=None):
    if fixed:
        mn, mx = fixed[attribute.name]
        points = self._split_eq_width(mn, mx)
    else:
        if type(data) == SqlTable:
            stats = BasicStats(data, attribute)
            points = self._split_eq_width(stats.min, stats.max)
        else:
            values, _ = data.get_column_view(attribute)
            if values.size:
                mn, mx = ut.nanmin(values), ut.nanmax(values)
                points = self._split_eq_width(mn, mx)
            else:
                points = []
    return Discretizer.create_discretized_var(data.domain[attribute], points)
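# A minimal sketch of what a `_split_eq_width(mn, mx)` helper is assumed to
# return in the calls above: the interior thresholds that cut [mn, mx] into
# `n` equal-width intervals. This is an assumption for illustration, not the
# library's actual implementation; `_split_eq_width_sketch` and its `n`
# parameter are hypothetical names.
def _split_eq_width_sketch(mn, mx, n=4):
    if mn == mx:
        # Degenerate range: no thresholds can be placed
        return []
    step = (mx - mn) / n
    return [mn + (i + 1) * step for i in range(n - 1)]

print(_split_eq_width_sketch(0.0, 8.0))  # [2.0, 4.0, 6.0]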
def calculate_log_reg_coefficients(self):
    self.log_reg_coeffs = []
    self.log_reg_cont_data_extremes = []
    self.b0 = None
    if self.classifier is None or self.domain is None:
        return
    if not isinstance(self.classifier, LogisticRegressionClassifier):
        return

    self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                          self.domain)
    self.data = Table.from_table(self.domain, self.classifier.original_data)
    attrs, ranges, start = self.domain.attributes, [], 0
    for attr in attrs:
        stop = start + len(attr.values) if attr.is_discrete else start + 1
        ranges.append(slice(start, stop))
        start = stop

    self.b0 = self.classifier.intercept
    coeffs = self.classifier.coefficients
    if len(self.domain.class_var.values) == 2:
        self.b0 = np.hstack((self.b0 * (-1), self.b0))
        coeffs = np.vstack((coeffs * (-1), coeffs))
    self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
    self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

    min_values = nanmin(self.data.X, axis=0)
    max_values = nanmax(self.data.X, axis=0)

    for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)),
                               min_values, max_values):
        if self.log_reg_coeffs[i].shape[1] == 1:
            coef = self.log_reg_coeffs[i]
            self.log_reg_coeffs[i] = np.hstack(
                (coef * min_t, coef * max_t))
            self.log_reg_cont_data_extremes.append(
                [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
        else:
            self.log_reg_cont_data_extremes.append([None])
def calculate_log_reg_coefficients(self):
    self.log_reg_coeffs = []
    self.log_reg_cont_data_extremes = []
    self.b0 = None
    if self.classifier is None or self.domain is None:
        return
    if not isinstance(self.classifier, LogisticRegressionClassifier):
        return

    self.domain = self.reconstruct_domain(self.classifier.original_domain,
                                          self.domain)
    self.data = self.classifier.original_data.transform(self.domain)
    attrs, ranges, start = self.domain.attributes, [], 0
    for attr in attrs:
        stop = start + len(attr.values) if attr.is_discrete else start + 1
        ranges.append(slice(start, stop))
        start = stop

    self.b0 = self.classifier.intercept
    coeffs = self.classifier.coefficients
    if len(self.domain.class_var.values) == 2:
        self.b0 = np.hstack((self.b0 * (-1), self.b0))
        coeffs = np.vstack((coeffs * (-1), coeffs))
    self.log_reg_coeffs = [coeffs[:, ranges[i]] for i in range(len(attrs))]
    self.log_reg_coeffs_orig = self.log_reg_coeffs.copy()

    min_values = nanmin(self.data.X, axis=0)
    max_values = nanmax(self.data.X, axis=0)

    for i, min_t, max_t in zip(range(len(self.log_reg_coeffs)),
                               min_values, max_values):
        if self.log_reg_coeffs[i].shape[1] == 1:
            coef = self.log_reg_coeffs[i]
            self.log_reg_coeffs[i] = np.hstack((coef * min_t, coef * max_t))
            self.log_reg_cont_data_extremes.append(
                [sorted([min_t, max_t], reverse=(c < 0)) for c in coef])
        else:
            self.log_reg_cont_data_extremes.append([None])
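# Illustrative sketch of the binary-class branch above: a scikit-learn style
# logistic regression keeps a single coefficient row (and intercept) for the
# positive class, so negating it yields the row for the other class and the
# stacked arrays have one row per class value. Toy numbers only.
import numpy as np

b0 = np.array([0.3])
coeffs = np.array([[0.8, -1.2]])
b0 = np.hstack((b0 * (-1), b0))              # [-0.3  0.3]
coeffs = np.vstack((coeffs * (-1), coeffs))  # [[-0.8  1.2], [ 0.8 -1.2]]
print(b0, coeffs)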
def __call__(self, data: Table, attribute):
    fmt = [
        "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
        "%H:%M:%S"
    ][self.unit]
    values, _ = data.get_column_view(attribute)
    times = []
    if values.size:
        mn, mx = ut.nanmin(values), ut.nanmax(values)
        if not np.isnan(mn):
            mn = utc_from_timestamp(mn).timetuple()
            mx = utc_from_timestamp(mx).timetuple()
            times = _time_range(mn, mx, self.unit, self.width, 0, 100)
            if times is None:
                raise TooManyIntervals
    times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1]
    points = np.array([calendar.timegm(t) for t in times])
    values = [time.strftime(fmt, t) for t in times]
    values = _simplified_time_intervals(values)
    var = data.domain[attribute]
    return DiscreteVariable(name=var.name, values=values,
                            compute_value=Discretizer(var, points),
                            sparse=var.sparse)
def __compute_statistics(self):
    # We will compute statistics over all data at once
    matrices = [self._data.X, self._data._Y, self._data.metas]
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = zip([
        self._domain.attributes, self._domain.class_vars, self._domain.metas
    ], matrices)
    # Filter out any matrices with size 0; filtering the zipped pairs also
    # eliminates the matching variables in a single swoop
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                        time_f=None, string_f=None, default_val=np.nan):
        """Apply functions to variable types, e.g. `discrete_f` to discrete
        variables. The default value is returned when no function is defined
        for a variable type."""
        attrs, x = attrs_x_pair
        result = np.full(len(attrs), default_val)
        disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = \
            self._attr_indices(attrs)
        if discrete_f and x[:, disc_var_idx].size:
            result[disc_var_idx] = discrete_f(
                x[:, disc_var_idx].astype(np.float64))
        if continuous_f and x[:, cont_var_idx].size:
            result[cont_var_idx] = continuous_f(
                x[:, cont_var_idx].astype(np.float64))
        if time_f and x[:, time_var_idx].size:
            result[time_var_idx] = time_f(
                x[:, time_var_idx].astype(np.float64))
        if string_f and x[:, str_var_idx].size:
            result[str_var_idx] = string_f(
                x[:, str_var_idx].astype(np.object))
        return result

    self._variable_types = [type(var) for var in self._attributes]
    self._variable_names = [var.name.lower() for var in self._attributes]

    # Compute the center
    _center = partial(
        _apply_to_types,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._center = np.hstack(map(_center, matrices))

    # Compute the dispersion
    def _entropy(x):
        p = [ut.bincount(row)[0] for row in x.T]
        p = [pk / np.sum(pk) for pk in p]
        return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)

    _dispersion = partial(
        _apply_to_types,
        discrete_f=lambda x: _entropy(x),
        continuous_f=lambda x: ut.nanvar(x, axis=0),
    )
    self._dispersion = np.hstack(map(_dispersion, matrices))

    # Compute the maximum values
    _max = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._max = np.hstack(map(_max, matrices))

    # Compute the minimum values
    _min = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._min = np.hstack(map(_min, matrices))

    # Compute the number of missing values
    _missing = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._missing = np.hstack(map(_missing, matrices))
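# Illustrative sketch of the discrete dispersion above: value counts are
# turned into probabilities and fed to scipy's entropy (in nats). np.bincount
# stands in for ut.bincount here, which the code above assumes returns the
# counts as its first element; toy column only.
import numpy as np
import scipy.stats as ss

col = np.array([0, 0, 1, 2, 2, 2])
p = np.bincount(col)
p = p / p.sum()
print(ss.entropy(p))  # ~1.01 nats for counts [2, 1, 3]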