def test_dtype(self, array):
    x = array([0, np.nan, 2, 3])
    w = np.array([0, 1.5, 0, 0])

    self.assertIsInstance(countnans(x, w, dtype=np.int32), np.int32)
    self.assertEqual(countnans(x, w, dtype=np.int32), 1)
    self.assertIsInstance(countnans(x, w, dtype=np.float64), np.float64)
    self.assertEqual(countnans(x, w, dtype=np.float64), 1.5)
def test_2d_weights(self, array):
    # pylint: disable=bad-whitespace
    x = array([[np.nan, np.nan, 1,      1],
               [     0, np.nan, 2, np.nan]])
    w = np.array([[1, 2, 3, 4],
                  [5, 6, 7, 8]])

    np.testing.assert_equal(countnans(x, w), 17)
    np.testing.assert_equal(countnans(x, w, axis=0), [1, 8, 0, 8])
    np.testing.assert_equal(countnans(x, w, axis=1), [3, 14])
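# The weighted tests above assert that, when a `weights` argument is given,
# countnans returns the sum of the weights at NaN positions rather than a
# plain count. The helper below is only an illustrative sketch of that
# behaviour (the name and implementation are assumptions, not Orange's actual
# countnans), covering the dense cases exercised by these tests.
import numpy as np

def weighted_countnans_sketch(x, weights=None, axis=None, dtype=None):
    nans = np.isnan(x)
    if weights is not None:
        if weights.ndim == 1 and x.ndim == 2 and axis == 1:
            # 1-D weights are broadcast across rows when reducing along axis=1
            nans = nans * weights[:, np.newaxis]
        else:
            nans = nans * weights
    return np.sum(nans, axis=axis, dtype=dtype)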
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=coefficient_of_variation,
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
        default_val=len(matrices[0]) if matrices else 0,
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        # Temporary replacement for scipy
        return ut.nanmode(x, *args, **kwargs)[0]

    self._center = self.__compute_stat(
        matrices,
        discrete_f=None,
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._median = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmedian(x, axis=0),
        time_f=lambda x: ut.nanmedian(x, axis=0),
    )
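# `coefficient_of_variation`, used above, is not shown in this excerpt.
# Judging by the inline lambdas in the other __compute_statistics variants
# below (np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0)), it is
# presumably a column-wise std/mean ratio; the sketch below is an assumption,
# not the actual helper.
import numpy as np
import Orange.statistics.util as ut

def coefficient_of_variation_sketch(x):
    mean = ut.nanmean(x, axis=0)
    std = np.sqrt(ut.nanvar(x, axis=0))
    # Zero-mean columns would divide by zero; silence the warning and let
    # inf/nan propagate into the result.
    with np.errstate(divide="ignore", invalid="ignore"):
        return std / mean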
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array([var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )

    # Since scipy apparently can't do mode on sparse matrices, cast it to
    # dense. This can be very inefficient for large matrices, and should
    # be changed
    def __mode(x, *args, **kwargs):
        if sp.issparse(x):
            x = x.todense(order="C")
        # return ss.mode(x, *args, **kwargs)[0]
        return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: __mode(x, axis=0),
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def test_shape_matches_dense_and_sparse(self, array):
    x = array([
        [0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
        [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3],
    ])
    expected = 4

    self.assertEqual(countnans(x), expected)
def test_shape_matches_dense_and_sparse_with_axis_1(self, array):
    x = array([
        [0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
        [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3],
    ])
    expected = [2, 2]

    np.testing.assert_equal(countnans(x, axis=1), expected)
def __compute_statistics(self):
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = [self.__attributes, self.__class_vars, self.__metas]
    # Filter out any matrices with size 0
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    self._variable_types = np.array([type(var) for var in self.variables])
    self._variable_names = np.array(
        [var.name.lower() for var in self.variables])
    self._min = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
        time_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._dispersion = self.__compute_stat(
        matrices,
        discrete_f=_categorical_entropy,
        continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
    )
    self._missing = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._max = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
        time_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._center = self.__compute_stat(
        matrices,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
        time_f=lambda x: ut.nanmean(x, axis=0),
    )
def check_data(self):
    def error(err):
        err()
        self.data = None

    self.clear_messages()
    if self.data is not None:
        self.graph_variables = [var for var in self.data.domain.attributes
                                if var.is_continuous]
        # Rows containing at least one NaN are marked as invalid
        self.valid_data = ~countnans(self.data.X, axis=1).astype(bool)
        if len(self.graph_variables) < 1:
            error(self.Error.not_enough_attrs)
        elif not np.sum(self.valid_data):
            error(self.Error.no_valid_data)
        else:
            if not np.all(self.valid_data):
                self.Information.hidden_instances()
            if len(self.graph_variables) > MAX_FEATURES:
                self.Information.too_many_features()
                self.graph_variables = self.graph_variables[:MAX_FEATURES]
def commit(self):
    self.Error.clear()
    # Kill any running jobs
    self.cancel()
    assert self.__state == self.State.Pending

    if self.data is None:
        return

    # Make sure the dataset is ok
    if ut.countnans(self.data.X) > 0:
        self.Error.data_has_nans()
        return
    if len(self.data.domain.attributes) < 1:
        self.Error.empty_dataset()
        return

    # Prepare the tasks to run
    queue = TaskQueue(parent=self)

    if self.pca_projection is None and self.apply_pca:
        queue.push(namespace(task=self._compute_pca_projection))

    if self.graph is None:
        queue.push(namespace(task=self._compute_graph, progress_callback=True))

    if self.partition is None:
        queue.push(namespace(task=self._compute_partition))

    # Prepare callbacks
    queue.on_progress.connect(lambda val: self.progressBarSet(100 * val))
    queue.on_complete.connect(self._processing_complete)
    queue.on_complete.connect(self._send_data)
    queue.on_exception.connect(self._handle_exceptions)

    # Run the task queue
    self.progressBarInit()
    self.setBlocking(True)
    self.__future = self.__executor.submit(queue.start)
    self.__state = self.State.Running
def check_data(self):
    def error(err):
        err()
        self.data = None

    self.clear_messages()
    if self.data is not None:
        self.infoLabel.setText("%i instances on input\n%i features" % (
            len(self.data), len(self.data.domain.attributes)))
        self.graph_variables = [var for var in self.data.domain.attributes
                                if var.is_continuous]
        self.valid_data = ~countnans(self.data.X, axis=1).astype(bool)
        if len(self.graph_variables) < 1:
            error(self.Error.not_enough_attrs)
        elif not np.sum(self.valid_data):
            error(self.Error.no_valid_data)
        else:
            if not np.all(self.valid_data):
                self.Information.hidden_instances()
            if len(self.graph_variables) > MAX_FEATURES:
                self.Information.too_many_features()
                self.graph_variables = self.graph_variables[:MAX_FEATURES]
def test_on_columns(self, array):
    x = array([[1, np.nan, 1, 2],
               [2, np.nan, 2, 3]])
    expected = [0, 2, 0, 0]

    np.testing.assert_equal(countnans(x, axis=0), expected)
def test_2d_matrix(self, array):
    x = array([[1, np.nan, 1, 2],
               [2, np.nan, 2, 3]])
    expected = 2

    self.assertEqual(countnans(x), expected)
def test_1d_array_with_axis_1_raises_exception(self, array):
    with self.assertRaises(ValueError):
        countnans(array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1]), axis=1)
def test_1d_array_with_axis_0(self, array):
    x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1])
    expected = 2

    self.assertEqual(countnans(x, axis=0), expected)
def test_1d_array(self, array):
    x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1])
    self.assertEqual(countnans(x), 2)
def __compute_statistics(self):
    # We will compute statistics over all data at once
    matrices = [self._data.X, self._data._Y, self._data.metas]
    # Since data matrices can be of mixed sparsity, we need to compute
    # attributes separately for each of them.
    matrices = zip([
        self._domain.attributes, self._domain.class_vars, self._domain.metas
    ], matrices)
    # Filter out any matrices with size 0; filtering the zipped pairs
    # eliminates the corresponding variables in a single swoop
    matrices = list(filter(lambda tup: tup[1].size, matrices))

    def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                        time_f=None, string_f=None, default_val=np.nan):
        """Apply functions to variable types e.g. discrete_f to discrete
        variables. Default value is returned if there is no function defined
        for specific variable types."""
        attrs, x = attrs_x_pair
        result = np.full(len(attrs), default_val)
        disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = \
            self._attr_indices(attrs)

        if discrete_f and x[:, disc_var_idx].size:
            result[disc_var_idx] = discrete_f(x[:, disc_var_idx].astype(np.float64))
        if continuous_f and x[:, cont_var_idx].size:
            result[cont_var_idx] = continuous_f(x[:, cont_var_idx].astype(np.float64))
        if time_f and x[:, time_var_idx].size:
            result[time_var_idx] = time_f(x[:, time_var_idx].astype(np.float64))
        if string_f and x[:, str_var_idx].size:
            result[str_var_idx] = string_f(x[:, str_var_idx].astype(np.object))

        return result

    self._variable_types = [type(var) for var in self._attributes]
    self._variable_names = [var.name.lower() for var in self._attributes]

    # Compute the center
    _center = partial(
        _apply_to_types,
        discrete_f=lambda x: ss.mode(x)[0],
        continuous_f=lambda x: ut.nanmean(x, axis=0),
    )
    self._center = np.hstack(map(_center, matrices))

    # Compute the dispersion
    def _entropy(x):
        p = [ut.bincount(row)[0] for row in x.T]
        p = [pk / np.sum(pk) for pk in p]
        return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)

    _dispersion = partial(
        _apply_to_types,
        discrete_f=lambda x: _entropy(x),
        continuous_f=lambda x: ut.nanvar(x, axis=0),
    )
    self._dispersion = np.hstack(map(_dispersion, matrices))

    # Compute maximum values
    _max = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmax(x, axis=0),
        continuous_f=lambda x: ut.nanmax(x, axis=0),
    )
    self._max = np.hstack(map(_max, matrices))

    # Compute minimum values
    _min = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.nanmin(x, axis=0),
        continuous_f=lambda x: ut.nanmin(x, axis=0),
    )
    self._min = np.hstack(map(_min, matrices))

    # Compute the number of missing values
    _missing = partial(
        _apply_to_types,
        discrete_f=lambda x: ut.countnans(x, axis=0),
        continuous_f=lambda x: ut.countnans(x, axis=0),
        string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
        time_f=lambda x: ut.countnans(x, axis=0),
    )
    self._missing = np.hstack(map(_missing, matrices))
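# `self._attr_indices` is not part of this excerpt. From its use above it
# returns four column-index arrays (discrete, continuous, time, string) for
# the given variables. A hypothetical standalone sketch of that behaviour,
# assuming Orange's variable classes:
import numpy as np
from Orange.data import (
    ContinuousVariable, DiscreteVariable, StringVariable, TimeVariable)

def attr_indices_sketch(attrs):
    # TimeVariable is checked first because it subclasses ContinuousVariable
    def kind(var):
        if isinstance(var, TimeVariable):
            return "time"
        if isinstance(var, DiscreteVariable):
            return "disc"
        if isinstance(var, ContinuousVariable):
            return "cont"
        return "str"

    kinds = np.array([kind(var) for var in attrs])
    return tuple(np.flatnonzero(kinds == k)
                 for k in ("disc", "cont", "time", "str"))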
def test_countnans(self):
    np.testing.assert_equal(
        countnans([[1, np.nan], [2, np.nan]], axis=0), [0, 2])
def test_1d_weights_with_axis_1(self, array):
    x = array([[1, 1, np.nan, 1],
               [np.nan, 1, 1, 1]])
    w = np.array([0.5, 1])

    np.testing.assert_equal(countnans(x, w, axis=1), [.5, 1])
def test_on_rows(self, array):
    x = array([[1, np.nan, 1, 2],
               [2, np.nan, 2, 3]])
    expected = [1, 1]

    np.testing.assert_equal(countnans(x, axis=1), expected)