def test_infer_dim_bad_spec():
    # Test a spectrum that drops to near zero for PR #16224
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
    n_samples = 10
    n_features = 5
    ret = _infer_dimension(spectrum, n_samples, n_features)
    assert ret == 0
def _fit_full_daal4py(self, X, n_components):
    n_samples, n_features = X.shape

    # Due to the need to flip components, a full decomposition is required.
    self._fit_daal4py(X, min(n_samples, n_features))
    U = self._transform_daal4py(X, whiten=True, check_X=False,
                                scale_eigenvalues=True)
    V = self.components_
    U, V = svd_flip(U, V)
    U = U.copy()
    V = V.copy()
    S = self.singular_values_.copy()

    if n_components == 'mle':
        n_components = _infer_dimension(self.explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(
            self.explained_variance_ratio_, n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = self.explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = self.components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = self.explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        self.explained_variance_ratio_[:n_components]
    self.singular_values_ = self.singular_values_[:n_components]
    return U, S, V
def _fit_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    _validate_n_components(n_components, n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)
    centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                  doScale=False)
    pca_alg = daal4py.pca(
        fptype=fpType,
        method='svdDense',
        normalization=centering_algo,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=daal_n_components)
    pca_res = pca_alg.compute(X)

    self.mean_ = pca_res.means.ravel()
    variances_ = pca_res.variances.ravel()
    components_ = pca_res.eigenvectors
    explained_variance_ = pca_res.eigenvalues.ravel()
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        n_components = _infer_dimension(explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(
            explained_variance_ratio_, n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
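Both daal4py paths above call `_n_components_from_fraction`, which is not defined in these snippets. Judging from the equivalent inline logic in the variants further down (`stable_cumsum` followed by `np.searchsorted` with `side='right'`), a plausible sketch of the helper is the following; the actual implementation may differ.

def _n_components_from_fraction(explained_variance_ratio, frac):
    # Smallest number of components whose cumulative explained-variance
    # ratio strictly exceeds the requested fraction (cf. issue #15669).
    return np.searchsorted(stable_cumsum(explained_variance_ratio),
                           frac, side='right') + 1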
def test_small_eigenvalues_mle():
    # Test that ranks associated with tiny eigenvalues are given a
    # log-likelihood of -inf. The inferred rank will be 1.
    spectrum = np.array([1, 1e-30, 1e-30, 1e-30])

    assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf

    for rank in (2, 3):
        assert _assess_dimension(spectrum, rank, 10) == -np.inf

    assert _infer_dimension(spectrum, 10) == 1
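The tests above pin down the contract between `_assess_dimension` and `_infer_dimension`: each candidate rank is scored by the Probabilistic PCA log-likelihood of the spectrum, ranks with degenerate (near-zero) trailing eigenvalues score -inf, and the best-scoring rank wins. A minimal sketch of that contract, assuming the two-argument signature used in most snippets here (`_infer_dimension_sketch` is a hypothetical name, not the exact scikit-learn source):

def _infer_dimension_sketch(spectrum, n_samples):
    # Log-likelihood of each candidate rank under the Probabilistic PCA model.
    ll = np.empty_like(spectrum)
    ll[0] = -np.inf  # never select n_components == 0
    for rank in range(1, spectrum.shape[0]):
        ll[rank] = _assess_dimension(spectrum, rank, n_samples)
    return ll.argmax()

On the spectrum [1, 1e-30, 1e-30, 1e-30] this returns 1, matching the assertions above.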
def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 2
def test_infer_dim_2():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver="full")
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension(spect, n) > 1
def _decompose_full(self, mat):
    if self.n_components != "mle":
        if not 0 <= self.n_components <= self.n_samples_:
            raise ValueError("n_components=%r must be between 1 and "
                             "n_samples=%r with svd_solver='%s'"
                             % (self.n_components, self.n_samples_,
                                self.svd_solver))
        elif self.n_components >= 1:
            if not isinstance(self.n_components, numbers.Integral):
                raise ValueError("n_components=%r must be of type int "
                                 "when greater than or equal to 1, "
                                 "was of type=%r"
                                 % (self.n_components,
                                    type(self.n_components)))

    U, S, Vt = linalg.svd(mat, full_matrices=False)
    U[:, S < self.tol] = 0.0
    Vt[S < self.tol] = 0.0
    S[S < self.tol] = 0.0

    # flip eigenvectors' sign to enforce deterministic output
    U, Vt = svd_flip(U, Vt)

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (self.n_samples_ - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if self.n_components == "mle":
        self.n_components = _infer_dimension(explained_variance_,
                                             self.n_samples_)
    elif 0 < self.n_components < 1.0:
        # Select the number of components whose cumulative explained
        # variance exceeds the requested fraction; side='right' ensures
        # the selected components always explain strictly more variance
        # than the float n_components passed. More discussion in
        # issue #15669.
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        self.n_components = np.searchsorted(ratio_cumsum, self.n_components,
                                            side="right") + 1

    self.n_components_ = self.n_components
    return (
        U[:, :self.n_components],
        S[:self.n_components],
        Vt[:self.n_components],
    )
def _fit_full(self, X, n_components):
    """Fit the model by computing full SVD on X"""
    n_samples, n_features = X.shape

    _validate_n_components(n_components, n_samples, n_features)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    if X.shape[0] > X.shape[1] and (X.dtype == np.float64 or
                                    X.dtype == np.float32):
        U, S, V = _daal4py_svd(X)
    else:
        U, S, V = np.linalg.svd(X, full_matrices=False)

    # flip eigenvectors' sign to enforce deterministic output
    U, V = svd_flip(U, V)

    components_ = V

    # Get variance explained by singular values
    explained_variance_ = (S ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var

    # Postprocess the number of components required
    if n_components == 'mle':
        n_components = _infer_dimension(explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        n_components = _n_components_from_fraction(
            explained_variance_ratio_, n_components)

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = S[:n_components]

    return U, S, V
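`_validate_n_components` is called in the snippets above but defined elsewhere. As a sketch of what such a check plausibly covers, mirroring the constraints the full solver enforces in scikit-learn (an assumption, not the verified daal4py source):

import numbers

def _validate_n_components(n_components, n_samples, n_features):
    if n_components == 'mle':
        if n_samples < n_features:
            raise ValueError("n_components='mle' is only supported "
                             "if n_samples >= n_features")
    elif not 0 <= n_components <= min(n_samples, n_features):
        raise ValueError("n_components=%r must be between 0 and "
                         "min(n_samples, n_features)=%r with "
                         "svd_solver='full'"
                         % (n_components, min(n_samples, n_features)))
    elif n_components >= 1:
        if not isinstance(n_components, numbers.Integral):
            raise ValueError("n_components=%r must be of type int when "
                             "greater than or equal to 1, was of type=%r"
                             % (n_components, type(n_components)))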
def _fit_full(self, X, n_components):
    X = check_array(X, dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape

    self._validate_n_components(n_components, n_samples, n_features)

    self._fit_full_daal4py(X, min(X.shape))

    U = self._transform_daal4py(X, whiten=True, check_X=False,
                                scale_eigenvalues=True)
    V = self.components_
    S = self.singular_values_

    if n_components == 'mle':
        if sklearn_check_version('0.23'):
            n_components = _infer_dimension(self.explained_variance_,
                                            n_samples)
        else:
            n_components = _infer_dimension_(self.explained_variance_,
                                             n_samples, n_features)
    elif 0 < n_components < 1.0:
        ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components,
                                       side='right') + 1

    if n_components < min(n_features, n_samples):
        self.noise_variance_ = self.explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = self.components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = self.explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        self.explained_variance_ratio_[:n_components]
    self.singular_values_ = self.singular_values_[:n_components]

    return U, S, V
def _fit_full(self, X, n_components):
    n_samples, n_features = X.shape

    self._validate_n_components(n_components, n_samples, n_features)

    self._fit_full_daal4py(X, min(X.shape))

    U = None
    V = self.components_
    S = self.singular_values_

    if n_components == 'mle':
        if sklearn_check_version('0.23'):
            n_components = _infer_dimension(self.explained_variance_,
                                            n_samples)
        else:
            n_components = _infer_dimension_(self.explained_variance_,
                                             n_samples, n_features)
    elif 0 < n_components < 1.0:
        ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components,
                                       side='right') + 1

    if n_components < min(n_features, n_samples):
        self.noise_variance_ = self.explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = self.components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = self.explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        self.explained_variance_ratio_[:n_components]
    self.singular_values_ = self.singular_values_[:n_components]

    return U, S, V
def _fit_full_daal4py(self, X, n_components):
    n_samples, n_features = X.shape
    n_sf_min = min(n_samples, n_features)

    if n_components == 'mle':
        daal_n_components = n_features
    elif n_components < 1:
        daal_n_components = n_sf_min
    else:
        daal_n_components = n_components

    fpType = getFPType(X)

    covariance_algo = daal4py.covariance(
        fptype=fpType, outputMatrixType='covarianceMatrix')
    covariance_res = covariance_algo.compute(X)
    self.mean_ = covariance_res.mean.ravel()
    covariance = covariance_res.covariance
    variances_ = np.array([covariance[i, i] for i in range(n_features)])

    pca_alg = daal4py.pca(
        fptype=fpType,
        method='correlationDense',
        resultsToCompute='eigenvalue',
        isDeterministic=True,
        nComponents=daal_n_components)
    pca_res = pca_alg.compute(X, covariance)

    components_ = pca_res.eigenvectors
    explained_variance_ = np.maximum(pca_res.eigenvalues.ravel(), 0)
    tot_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / tot_var

    if n_components == 'mle':
        if sklearn_check_version('0.23'):
            n_components = _infer_dimension(explained_variance_, n_samples)
        else:
            n_components = _infer_dimension_(explained_variance_,
                                             n_samples, n_features)
    elif 0 < n_components < 1.0:
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components,
                                       side='right') + 1

    if n_components < n_sf_min:
        if explained_variance_.shape[0] == n_sf_min:
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            resid_var_ = variances_.sum()
            resid_var_ -= explained_variance_[:n_components].sum()
            self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = np.sqrt(
        (n_samples - 1) * self.explained_variance_)
def _fit_full(self, X, n_components):
    self.accountant.check(self.epsilon, 0)

    n_samples, n_features = X.shape

    if self.centered:
        self.mean_ = np.zeros_like(np.mean(X, axis=0))
    else:
        if self.bounds is None:
            warnings.warn(
                "Bounds parameter hasn't been specified, so falling back to "
                "determining range from the data.\n"
                "This will result in additional privacy leakage. To ensure "
                "differential privacy with no additional privacy loss, "
                "specify `range` for each value returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, n_features)
        self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds,
                          axis=0, accountant=BudgetAccountant())

    X -= self.mean_

    if self.data_norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the "
            "data provided. This will result in additional privacy leakage. "
            "To ensure differential privacy and no additional privacy "
            "leakage, specify `data_norm` at initialisation.",
            PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = clip_to_norm(X, self.data_norm)

    XtX = np.dot(X.T, X)

    mech = (Wishart()
            .set_epsilon(self.epsilon if self.centered else self.epsilon / 2)
            .set_sensitivity(self.data_norm))
    noisy_input = mech.randomise(XtX)

    u, s, v = np.linalg.svd(noisy_input)
    u, v = svd_flip(u, v)
    s = np.sqrt(s)

    components_ = v

    # Get variance explained by singular values
    explained_variance_ = (s ** 2) / (n_samples - 1)
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = s.copy()  # Store the singular values.

    # Post-process the number of components required
    if n_components == 'mle':
        try:
            n_components = sk_pca._infer_dimension(explained_variance_,
                                                   n_samples)
        except AttributeError:
            # Older scikit-learn exposes the three-argument variant instead.
            n_components = sk_pca._infer_dimension_(explained_variance_,
                                                    n_samples, n_features)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    self.accountant.spend(self.epsilon, 0)

    return u, s, v
def _fit_full(self, X, n_components):
    self.accountant.check(self.epsilon, 0)

    n_samples, n_features = X.shape

    if self.centered:
        self.mean_ = np.zeros_like(np.mean(X, axis=0))
    else:
        if self.bounds is None:
            warnings.warn(
                "Bounds parameter hasn't been specified, so falling back to "
                "determining range from the data.\n"
                "This will result in additional privacy leakage. To ensure "
                "differential privacy with no additional privacy loss, "
                "specify `range` for each value returned by np.mean().",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = self._check_bounds(self.bounds, n_features)
        self.mean_ = mean(X, epsilon=self.epsilon / 2, bounds=self.bounds,
                          axis=0, accountant=BudgetAccountant())

    X -= self.mean_

    if self.data_norm is None:
        warnings.warn(
            "Data norm has not been specified and will be calculated on the "
            "data provided. This will result in additional privacy leakage. "
            "To ensure differential privacy and no additional privacy "
            "leakage, specify `data_norm` at initialisation.",
            PrivacyLeakWarning)
        self.data_norm = np.linalg.norm(X, axis=1).max()

    X = self._clip_to_norm(X, self.data_norm)

    sigma_vec, u_mtx = covariance_eig(
        X, epsilon=self.epsilon if self.centered else self.epsilon / 2,
        norm=self.data_norm,
        dims=n_components if isinstance(n_components, Integral) else None)
    u_mtx, _ = svd_flip(u_mtx, np.zeros_like(u_mtx).T)
    sigma_vec = np.sqrt(sigma_vec)

    components_ = u_mtx.T

    # Get variance explained by singular values
    explained_variance_ = np.sort((sigma_vec ** 2) / (n_samples - 1))[::-1]
    total_var = explained_variance_.sum()
    explained_variance_ratio_ = explained_variance_ / total_var
    singular_values_ = sigma_vec.copy()  # Store the singular values.

    # Post-process the number of components required
    if n_components == 'mle':
        n_components = sk_pca._infer_dimension(explained_variance_, n_samples)
    elif 0 < n_components < 1.0:
        # number of components for which the cumulated explained
        # variance percentage is superior to the desired threshold
        ratio_cumsum = stable_cumsum(explained_variance_ratio_)
        n_components = np.searchsorted(ratio_cumsum, n_components) + 1

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        self.noise_variance_ = explained_variance_[n_components:].mean()
    else:
        self.noise_variance_ = 0.

    self.n_samples_, self.n_features_ = n_samples, n_features
    self.components_ = components_[:n_components]
    self.n_components_ = n_components
    self.explained_variance_ = explained_variance_[:n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[:n_components]
    self.singular_values_ = singular_values_[:n_components]

    self.accountant.spend(self.epsilon, 0)

    return u_mtx, sigma_vec[:n_components], u_mtx.T
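A short end-to-end check of the 'mle' path, borrowing the data from test_infer_dim_2 above. This is an illustrative sketch against a recent scikit-learn PCA, where _infer_dimension never selects rank 0 and noise_variance_ is the mean of the discarded eigenvalues (the sigma2 ML estimate, cf. eq. 12.46):

rng = np.random.RandomState(0)
X = rng.randn(1000, 5) * 0.1
X[:10] += np.array([3, 4, 5, 1, 2])
pca = PCA(n_components='mle', svd_solver='full').fit(X)
assert pca.n_components_ >= 1     # MLE never keeps zero components
assert pca.noise_variance_ >= 0.  # mean of non-negative eigenvalues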