def test_eigenvals_only(self):
    data = self.generate_normed_data()

    out = covariance_eig(data, norm=1, eigvals_only=True)
    self.assertNotIsInstance(out, tuple)

    out = covariance_eig(data, norm=1, eigvals_only=False)
    self.assertIsInstance(out, tuple)
def test_bad_norm(self):
    d, n = 3, 10
    data = self.generate_normed_data(d, n)
    data *= 2

    # An unspecified norm falls back to computing it from the data
    with self.assertWarns(PrivacyLeakWarning):
        covariance_eig(data, epsilon=float("inf"), norm=None)

    # A declared norm smaller than the data's true norm is rejected
    with self.assertRaises(ValueError):
        covariance_eig(data, epsilon=float("inf"), norm=1)
def test_large_dims(self):
    n, d = 10, 3
    data = self.generate_normed_data(d, n)

    # dims larger than the data dimensionality d is capped at d
    out = covariance_eig(data, norm=1, dims=50)
    self.assertIsNotNone(out)
    self.assertEqual(out[0].size, 3)
    self.assertEqual(out[1].size, 3 * 3)
def test_svd(self):
    data = self.generate_normed_data(5, 10)
    u, s, v = np.linalg.svd(data.T.dot(data))

    vals, vecs = covariance_eig(data, norm=1, epsilon=float("inf"))

    # data.T.dot(data) is symmetric positive semi-definite, so its SVD
    # coincides with its eigendecomposition: eigenvalues equal singular
    # values, and eigenvectors match singular vectors up to sign
    self.assertTrue(np.allclose(vals, s))
    self.assertTrue(np.allclose(abs(vecs.T.dot(u)), np.eye(5)))
    self.assertTrue(np.allclose(abs(vecs.T.dot(v.T)), np.eye(5)))
def test_inf_epsilon(self):
    d, n = 3, 50
    data = self.generate_normed_data(d, n)

    # With epsilon=inf no noise is added, so the output should match the
    # non-private eigendecomposition up to ordering and sign
    dp_vals, dp_vecs = covariance_eig(data, epsilon=float("inf"), norm=1)
    vals, vecs = np.linalg.eig(data.T.dot(data))

    self.assertTrue(np.allclose(vals[vals.argsort()], dp_vals[dp_vals.argsort()]))
    self.assertTrue(np.allclose(abs(dp_vecs.T.dot(vecs).sum(axis=1)), 1))
    self.assertTrue(np.allclose(abs(dp_vecs.T.dot(vecs).sum(axis=0)), 1))
def test_simple(self):
    d = 5
    data = self.generate_normed_data(d)
    vals, vecs = covariance_eig(data, norm=1)

    self.assertIsNotNone(vals)
    self.assertIsNotNone(vecs)
    self.assertEqual(d, vals.size)
    self.assertEqual(d, vecs.shape[0])

    # Unitary matrix output
    self.assertTrue(np.allclose(vecs.dot(vecs.T), np.eye(d)))
    self.assertTrue(np.all(vals >= 0))
def test_dims(self):
    d, n = 5, 10
    data = self.generate_normed_data(d, n)

    vals, vecs = covariance_eig(data, norm=1, dims=3)
    self.assertEqual(vecs.shape, (5, 3))
    self.assertEqual(vals.shape, (5,))

    vals, vecs = covariance_eig(data, norm=1, dims=10)
    self.assertEqual(vecs.shape, (5, 5))
    self.assertEqual(vals.shape, (5,))

    vals, vecs = covariance_eig(data, norm=1, dims=0)
    self.assertEqual(vecs.shape, (5, 0))
    self.assertEqual(vals.shape, (5,))

    with self.assertRaises(ValueError):
        covariance_eig(data, dims=-5, norm=1)

    with self.assertRaises(TypeError):
        covariance_eig(data, dims=0.5, norm=1)
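# A minimal usage sketch of `covariance_eig`, mirroring the calls exercised in
# the tests above. The import path and parameter values here are illustrative
# assumptions, not confirmed by this file:
#
#   import numpy as np
#   from diffprivlib.models import covariance_eig  # import path assumed
#
#   rng = np.random.default_rng(42)
#   X = rng.normal(size=(100, 5))
#   X /= np.linalg.norm(X, axis=1).max()  # scale rows so each has norm <= 1
#
#   # Private eigendecomposition of X^T X, keeping the top 2 eigenvectors
#   vals, vecs = covariance_eig(X, epsilon=1.0, norm=1, dims=2)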
def _fit_full(self, X, n_components):
    self.accountant.check(self.epsilon, 0)

    n_samples, n_features = X.shape

    if self.centered:
        self.mean_ = np.zeros_like(np.mean(X, axis=0))
    else:
        if self.bounds is None:
            warnings.warn(
                "Bounds parameter hasn't been specified, so falling back to determining range from the data.\n"
                "This will result in additional privacy leakage. To ensure differential privacy with no "
                "additional privacy loss, specify `bounds` for each value returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_features)
        # Half the privacy budget is spent on the mean, the other half on the eigendecomposition below
        self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds, axis=0, accountant=BudgetAccountant())

    X -= self.mean_

    if self.data_norm is None:
        warnings.warn("Data norm has not been specified and will be calculated on the data provided. This will "
                      "result in additional privacy leakage. To ensure differential privacy and no additional "
                      "privacy leakage, specify `data_norm` at initialisation.", PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    s, u = covariance_eig(X, epsilon=self.epsilon if self.centered else self.epsilon / 2, norm=self.data_norm,
                          dims=n_components if isinstance(n_components, Integral) else None)
    u, _ = svd_flip(u, np.zeros_like(u).T)
    s = np.sqrt(s)

    components_ = u.T

    # Get variance explained by singular values
    explained_variance_ = np.sort((s ** 2) / (n_samples - 1))[::-1]
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = s.copy()  # Store the singular values.

    # Post-process the number of components required
    if n_components == 'mle':
        # TODO: Update when sklearn requirement changes to >= 0.23, removing try...except
        try:
            n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
        except AttributeError:
            n_components = sk_pca._infer_dimension_(explained_variance_, n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    self.accountant.spend(self.epsilon, 0)

    return u, s[:n_components], u.T
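# `_fit_full` is the private fitting core of the differentially private PCA
# model. A hedged end-to-end sketch, assuming this method belongs to
# diffprivlib's `PCA` estimator (parameter values are illustrative only):
#
#   import numpy as np
#   from diffprivlib.models import PCA
#
#   rng = np.random.default_rng(0)
#   X = rng.normal(size=(200, 5))
#
#   # Supplying `bounds` and `data_norm` up front avoids the two
#   # PrivacyLeakWarning branches in `_fit_full`. In a real deployment these
#   # should come from domain knowledge, not be computed from the data itself.
#   clf = PCA(n_components=2, epsilon=1.0, data_norm=3.0,
#             bounds=(-np.ones(5), np.ones(5)))
#   clf.fit(X)
#   X_reduced = clf.transform(X)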