Example #1
    def test_dtype(self, array):
        x = array([0, np.nan, 2, 3])
        w = np.array([0, 1.5, 0, 0])

        self.assertIsInstance(countnans(x, w, dtype=np.int32), np.int32)
        self.assertEqual(countnans(x, w, dtype=np.int32), 1)
        self.assertIsInstance(countnans(x, w, dtype=np.float64), np.float64)
        self.assertEqual(countnans(x, w, dtype=np.float64), 1.5)
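The assertions above fix two behaviours: the weighted count is the sum of the weights at NaN positions, and dtype controls the type of the returned scalar, so casting 1.5 to np.int32 yields 1. As a rough plain-NumPy sketch of those expectations (a hypothetical helper, not the library's countnans):

import numpy as np

def weighted_nan_count(x, weights=None, dtype=np.float64):
    # Hypothetical stand-in: sum the weights (or just count) where x is NaN.
    mask = np.isnan(np.asarray(x, dtype=float))
    total = mask.sum() if weights is None else (mask * np.asarray(weights)).sum()
    return dtype(total)  # e.g. np.int32(1.5) -> 1, np.float64(1.5) -> 1.5

x = np.array([0, np.nan, 2, 3])
w = np.array([0, 1.5, 0, 0])
print(weighted_nan_count(x, w, dtype=np.int32))    # 1
print(weighted_nan_count(x, w, dtype=np.float64))  # 1.5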
Example #2
    def test_2d_weights(self, array):
        # pylint: disable=bad-whitespace
        x = array([[np.nan, np.nan, 1, 1], [0, np.nan, 2, np.nan]])
        w = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

        np.testing.assert_equal(countnans(x, w), 17)
        np.testing.assert_equal(countnans(x, w, axis=0), [1, 8, 0, 8])
        np.testing.assert_equal(countnans(x, w, axis=1), [3, 14])
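With a full 2-D weight matrix the pattern is the same: sum the weights wherever the data is NaN, either over the whole matrix or along the given axis. A minimal plain-NumPy sketch, not the library code, that reproduces the expected 17, [1, 8, 0, 8] and [3, 14]:

import numpy as np

x = np.array([[np.nan, np.nan, 1, 1],
              [0, np.nan, 2, np.nan]])
w = np.array([[1, 2, 3, 4],
              [5, 6, 7, 8]])

nan_weights = np.where(np.isnan(x), w, 0)  # keep weights only at NaN positions
print(nan_weights.sum())        # 17
print(nan_weights.sum(axis=0))  # [1 8 0 8]
print(nan_weights.sum(axis=1))  # [ 3 14]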
Example #4
    def __compute_statistics(self):
        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=coefficient_of_variation,
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
            default_val=len(matrices[0]) if matrices else 0)
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            # Temporary replacement for scipy
            return ut.nanmode(x, *args, **kwargs)[0]

        self._center = self.__compute_stat(
            matrices,
            discrete_f=None,
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )

        self._median = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmedian(x, axis=0),
            time_f=lambda x: ut.nanmedian(x, axis=0),
        )
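In this code, ut.countnans(x, axis=0) provides the per-column missing-value counts behind the _missing statistic, with string columns handled separately via StringVariable.Unknown. For a small dense matrix the per-column count boils down to the following; this is only a simplified illustration, not the sparse-aware ut.countnans:

import numpy as np

x = np.array([[1.0, np.nan, 3.0],
              [np.nan, np.nan, 6.0]])
missing_per_column = np.isnan(x).sum(axis=0)
print(missing_per_column)  # [1 2 0]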
Example #5
    def test_2d_weights(self, array):
        # pylint: disable=bad-whitespace
        x = array([[np.nan, np.nan, 1,      1 ],
                   [     0, np.nan, 2, np.nan ]])
        w = np.array([[1, 2, 3, 4],
                      [5, 6, 7, 8]])

        np.testing.assert_equal(countnans(x, w), 17)
        np.testing.assert_equal(countnans(x, w, axis=0), [1, 8, 0, 8])
        np.testing.assert_equal(countnans(x, w, axis=1), [3, 14])
Example #6
    def __compute_statistics(self):
        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )

        # Since scipy apparently can't do mode on sparse matrices, cast it to
        # dense. This can be very inefficient for large matrices, and should
        # be changed
        def __mode(x, *args, **kwargs):
            if sp.issparse(x):
                x = x.todense(order="C")
            # return ss.mode(x, *args, **kwargs)[0]
            return ut.nanmode(x, *args, **kwargs)[0]  # Temporary replacement for scipy

        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: __mode(x, axis=0),
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Example #7
    def test_shape_matches_dense_and_sparse(self, array):
        x = array([
            [0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
            [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3],
        ])
        expected = 4

        self.assertEqual(countnans(x), expected)
Example #8
    def test_shape_matches_dense_and_sparse_with_axis_1(self, array):
        x = array([
            [0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
            [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3],
        ])
        expected = [2, 2]

        np.testing.assert_equal(countnans(x, axis=1), expected)
Example #9
    def __compute_statistics(self):
        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array(
            [var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(
                x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Example #10
    def __compute_statistics(self):
        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = [self.__attributes, self.__class_vars, self.__metas]
        # Filter out any matrices with size 0
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        self._variable_types = np.array([type(var) for var in self.variables])
        self._variable_names = np.array([var.name.lower() for var in self.variables])
        self._min = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
            time_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._dispersion = self.__compute_stat(
            matrices,
            discrete_f=_categorical_entropy,
            continuous_f=lambda x: np.sqrt(ut.nanvar(x, axis=0)) / ut.nanmean(x, axis=0),
        )
        self._missing = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._max = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
            time_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._center = self.__compute_stat(
            matrices,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
            time_f=lambda x: ut.nanmean(x, axis=0),
        )
Example #11
    def check_data(self):
        def error(err):
            err()
            self.data = None

        self.clear_messages()
        if self.data is not None:
            self.graph_variables = [var for var in self.data.domain.attributes
                                    if var.is_continuous]
            self.valid_data = ~countnans(self.data.X, axis=1).astype(bool)
            if len(self.graph_variables) < 1:
                error(self.Error.not_enough_attrs)
            elif not np.sum(self.valid_data):
                error(self.Error.no_valid_data)
            else:
                if not np.all(self.valid_data):
                    self.Information.hidden_instances()
                if len(self.graph_variables) > MAX_FEATURES:
                    self.Information.too_many_features()
                    self.graph_variables = self.graph_variables[:MAX_FEATURES]
Example #12
    def commit(self):
        self.Error.clear()
        # Kill any running jobs
        self.cancel()
        assert self.__state == self.State.Pending

        if self.data is None:
            return

        # Make sure the dataset is ok
        if ut.countnans(self.data.X) > 0:
            self.Error.data_has_nans()
            return

        if len(self.data.domain.attributes) < 1:
            self.Error.empty_dataset()
            return

        # Prepare the tasks to run
        queue = TaskQueue(parent=self)

        if self.pca_projection is None and self.apply_pca:
            queue.push(namespace(task=self._compute_pca_projection))

        if self.graph is None:
            queue.push(namespace(task=self._compute_graph, progress_callback=True))

        if self.partition is None:
            queue.push(namespace(task=self._compute_partition))

        # Prepare callbacks
        queue.on_progress.connect(lambda val: self.progressBarSet(100 * val))
        queue.on_complete.connect(self._processing_complete)
        queue.on_complete.connect(self._send_data)
        queue.on_exception.connect(self._handle_exceptions)

        # Run the task queue
        self.progressBarInit()
        self.setBlocking(True)
        self.__future = self.__executor.submit(queue.start)
        self.__state = self.State.Running
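The guard ut.countnans(self.data.X) > 0 just asks whether the data matrix contains any missing value before the tasks are queued. Assuming a dense array, the check is equivalent to this one-liner, shown only to make the intent explicit:

import numpy as np

X = np.array([[1.0, np.nan],
              [2.0, 3.0]])
has_missing = np.isnan(X).any()
print(has_missing)  # True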
Example #13
    def check_data(self):
        def error(err):
            err()
            self.data = None

        self.clear_messages()
        if self.data is not None:
            self.infoLabel.setText("%i instances on input\n%i features" % (
                len(self.data), len(self.data.domain.attributes)))
            self.graph_variables = [var for var in self.data.domain.attributes
                                    if var.is_continuous]
            self.valid_data = ~countnans(self.data.X, axis=1).astype(bool)
            if len(self.graph_variables) < 1:
                error(self.Error.not_enough_attrs)
            elif not np.sum(self.valid_data):
                error(self.Error.no_valid_data)
            else:
                if not np.all(self.valid_data):
                    self.Information.hidden_instances()
                if len(self.graph_variables) > MAX_FEATURES:
                    self.Information.too_many_features()
                    self.graph_variables = self.graph_variables[:MAX_FEATURES]
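Here ~countnans(self.data.X, axis=1).astype(bool) builds a per-row mask that is True only for rows with no missing values. A dense-array sketch of the same mask, again not the sparse-aware implementation:

import numpy as np

X = np.array([[1.0, 2.0, 3.0],
              [np.nan, 2.0, 3.0]])
valid_rows = ~np.isnan(X).any(axis=1)
print(valid_rows)  # [ True False]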
Example #14
    def test_on_columns(self, array):
        x = array([[1, np.nan, 1, 2], [2, np.nan, 2, 3]])
        expected = [0, 2, 0, 0]

        np.testing.assert_equal(countnans(x, axis=0), expected)
Example #15
    def test_2d_matrix(self, array):
        x = array([[1, np.nan, 1, 2], [2, np.nan, 2, 3]])
        expected = 2

        self.assertEqual(countnans(x), expected)
Example #16
    def test_1d_array_with_axis_1_raises_exception(self, array):
        with self.assertRaises(ValueError):
            countnans(array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1]), axis=1)
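Requesting axis=1 on a 1-D input is meaningless, and the test only requires that a ValueError is raised. Plain NumPy reductions fail the same way, since numpy's AxisError derives from ValueError; a small sketch of that analogue (not what countnans does internally):

import numpy as np

x = np.array([0, 1, np.nan])
try:
    np.isnan(x).sum(axis=1)  # a 1-D array has no axis 1
except ValueError as exc:
    print(type(exc).__name__)  # AxisError (a ValueError subclass)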
Example #17
    def test_1d_array_with_axis_0(self, array):
        x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1])
        expected = 2

        self.assertEqual(countnans(x, axis=0), expected)
Example #18
    def test_1d_array(self, array):
        x = array([0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1])
        self.assertEqual(countnans(x), 2)
Example #19
    def test_shape_matches_dense_and_sparse(self, array):
        x = array([[0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
                   [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3]])
        expected = 4

        self.assertEqual(countnans(x), expected)
Example #20
    def __compute_statistics(self):
        # We will compute statistics over all data at once
        matrices = [self._data.X, self._data._Y, self._data.metas]

        # Since data matrices can be of mixed sparsity, we need to compute
        # attributes separately for each of them.
        matrices = zip([
            self._domain.attributes, self._domain.class_vars, self._domain.metas
        ], matrices)
        # Filter out any matrices with size 0, filter the zipped matrices to 
        # eliminate variables in a single swoop
        matrices = list(filter(lambda tup: tup[1].size, matrices))

        def _apply_to_types(attrs_x_pair, discrete_f=None, continuous_f=None,
                            time_f=None, string_f=None, default_val=np.nan):
            """Apply functions to variable types e.g. discrete_f to discrete 
            variables. Default value is returned if there is no function 
            defined for specific variable types."""
            attrs, x = attrs_x_pair
            result = np.full(len(attrs), default_val)
            disc_var_idx, cont_var_idx, time_var_idx, str_var_idx = self._attr_indices(attrs)
            if discrete_f and x[:, disc_var_idx].size:
                result[disc_var_idx] = discrete_f(x[:, disc_var_idx].astype(np.float64))
            if continuous_f and x[:, cont_var_idx].size:
                result[cont_var_idx] = continuous_f(x[:, cont_var_idx].astype(np.float64))
            if time_f and x[:, time_var_idx].size:
                result[time_var_idx] = time_f(x[:, time_var_idx].astype(np.float64))
            if string_f and x[:, str_var_idx].size:
                result[str_var_idx] = string_f(x[:, str_var_idx].astype(np.object))
            return result

        self._variable_types = [type(var) for var in self._attributes]
        self._variable_names = [var.name.lower() for var in self._attributes]

        # Compute the center
        _center = partial(
            _apply_to_types,
            discrete_f=lambda x: ss.mode(x)[0],
            continuous_f=lambda x: ut.nanmean(x, axis=0),
        )
        self._center = np.hstack(map(_center, matrices))

        # Compute the dispersion
        def _entropy(x):
            p = [ut.bincount(row)[0] for row in x.T]
            p = [pk / np.sum(pk) for pk in p]
            return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
        _dispersion = partial(
            _apply_to_types,
            discrete_f=lambda x: _entropy(x),
            continuous_f=lambda x: ut.nanvar(x, axis=0),
        )
        self._dispersion = np.hstack(map(_dispersion, matrices))

        # Compute minimum values
        _max = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmax(x, axis=0),
            continuous_f=lambda x: ut.nanmax(x, axis=0),
        )
        self._max = np.hstack(map(_max, matrices))

        # Compute maximum values
        _min = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.nanmin(x, axis=0),
            continuous_f=lambda x: ut.nanmin(x, axis=0),
        )
        self._min = np.hstack(map(_min, matrices))

        # Compute # of missing values
        _missing = partial(
            _apply_to_types,
            discrete_f=lambda x: ut.countnans(x, axis=0),
            continuous_f=lambda x: ut.countnans(x, axis=0),
            string_f=lambda x: (x == StringVariable.Unknown).sum(axis=0),
            time_f=lambda x: ut.countnans(x, axis=0),
        )
        self._missing = np.hstack(map(_missing, matrices))
Example #21
    def test_countnans(self):
        np.testing.assert_equal(countnans([[1, np.nan], [2, np.nan]], axis=0),
                                [0, 2])
Example #23
    def test_1d_weights_with_axis_1(self, array):
        x = array([[1, 1, np.nan, 1],
                   [np.nan, 1, 1, 1]])
        w = np.array([0.5, 1])

        np.testing.assert_equal(countnans(x, w, axis=1), [.5, 1])
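With a 1-D weight vector whose length matches the number of rows, each row's NaNs appear to be counted with that row's weight, which is how the expected [0.5, 1] arises. A plain-NumPy reproduction of that expectation (the broadcasting below is my reading of the test, not a documented contract):

import numpy as np

x = np.array([[1, 1, np.nan, 1],
              [np.nan, 1, 1, 1]], dtype=float)
w = np.array([0.5, 1])
print((np.isnan(x) * w[:, None]).sum(axis=1))  # [0.5 1. ]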
Example #24
    def test_on_rows(self, array):
        x = array([[1, np.nan, 1, 2],
                   [2, np.nan, 2, 3]])
        expected = [1, 1]

        np.testing.assert_equal(countnans(x, axis=1), expected)
Example #26
    def test_2d_matrix(self, array):
        x = array([[1, np.nan, 1, 2],
                   [2, np.nan, 2, 3]])
        expected = 2

        self.assertEqual(countnans(x), expected)
Example #27
    def test_shape_matches_dense_and_sparse_with_axis_1(self, array):
        x = array([[0, 1, 0, 2, 2, np.nan, 1, np.nan, 0, 1],
                   [1, 2, 2, 1, np.nan, 1, 2, 3, np.nan, 3]])
        expected = [2, 2]

        np.testing.assert_equal(countnans(x, axis=1), expected)
Example #28
    def test_on_rows(self, array):
        x = array([[1, np.nan, 1, 2], [2, np.nan, 2, 3]])
        expected = [1, 1]

        np.testing.assert_equal(countnans(x, axis=1), expected)
Example #29
    def test_1d_weights_with_axis_1(self, array):
        x = array([[1, 1, np.nan, 1], [np.nan, 1, 1, 1]])
        w = np.array([0.5, 1])

        np.testing.assert_equal(countnans(x, w, axis=1), [.5, 1])
Example #30
    def test_countnans(self):
        np.testing.assert_equal(countnans([[1, np.nan],
                                           [2, np.nan]], axis=0), [0, 2])
Example #31
    def test_on_columns(self, array):
        x = array([[1, np.nan, 1, 2],
                   [2, np.nan, 2, 3]])
        expected = [0, 2, 0, 0]

        np.testing.assert_equal(countnans(x, axis=0), expected)