def test_randomized_pca_inverse(self):
    # Test that randomized PCA is invertible on dense data
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = mt.tensor(rng.randn(n, p))  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver='randomized',
              random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.execute(), Y_inverse.execute(), decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = PCA(n_components=2, whiten=True, svd_solver='randomized',
              random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (mt.abs(X - Y_inverse) / mt.abs(X).mean()).max()
    self.assertLess(relative_max_delta.execute(), 1e-5)

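# The reconstruction check above relies on the identity that, without
# whitening, inverse_transform is the affine map Y @ components_ + mean_.
# A minimal NumPy-only sketch of that identity, assuming the scikit-learn
# PCA API (which the Mars implementation mirrors); the helper name is
# hypothetical and it is not part of this test suite:
def _sketch_inverse_transform_identity(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    X = np.random.RandomState(0).randn(50, 3)
    pca = SkPCA(n_components=2).fit(X)
    Y = pca.transform(X)
    # inverse_transform projects the scores back to the input space with the
    # fitted components and re-adds the mean removed during fit.
    np.testing.assert_allclose(
        pca.inverse_transform(Y), Y @ pca.components_ + pca.mean_)
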
def test_singular_values(self):
    # Check that the PCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = mt.tensor(rng.randn(n_samples, n_features))

    pca = PCA(n_components=2, svd_solver='full',
              random_state=rng).fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=rng).fit(X)
    assert_array_almost_equal(pca.singular_values_.fetch(),
                              rpca.singular_values_.fetch(), 1)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(mt.sum(pca.singular_values_ ** 2.0).execute(),
                              (mt.linalg.norm(X_pca, "fro") ** 2.0).execute(),
                              12)
    assert_array_almost_equal(mt.sum(rpca.singular_values_ ** 2.0).execute(),
                              (mt.linalg.norm(X_rpca, "fro") ** 2.0).execute(),
                              0)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_.fetch(),
                              mt.sqrt(mt.sum(X_pca ** 2.0, axis=0)).execute(),
                              12)
    assert_array_almost_equal(rpca.singular_values_.fetch(),
                              mt.sqrt(mt.sum(X_rpca ** 2.0, axis=0)).execute(),
                              2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = mt.tensor(rng.randn(n_samples, n_features))

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng)
    X_pca = pca.fit_transform(X)

    X_pca /= mt.sqrt(mt.sum(X_pca ** 2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = mt.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    rpca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)

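# The assertions above exercise two standard SVD identities: the scores are
# U * S, so each singular value equals the 2-norm of its score column, and
# the squared singular values sum to the squared Frobenius norm of the
# scores. A minimal NumPy-only sketch, assuming the scikit-learn PCA API;
# the helper name is hypothetical:
def _sketch_singular_value_identities(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    X = np.random.RandomState(0).randn(100, 80)
    pca = SkPCA(n_components=2, svd_solver='full').fit(X)
    X_pca = pca.transform(X)
    # Each singular value is the 2-norm of the corresponding score column.
    np.testing.assert_allclose(
        pca.singular_values_, np.sqrt((X_pca ** 2).sum(axis=0)))
    # The squared singular values sum to ||X_pca||_F ** 2.
    np.testing.assert_allclose(
        (pca.singular_values_ ** 2).sum(),
        np.linalg.norm(X_pca, 'fro') ** 2)
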
def testWhitening(self):
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    X = mt.dot(rng.randn(n_samples, rank),
               mt.dot(mt.diag(mt.linspace(10.0, 1.0, rank)),
                      rng.randn(rank, n_features)))
    # the component-wise variance of the first 50 features is 3 times the
    # mean component-wise variance of the remaining 30 features
    X[:, :50] *= 3

    self.assertEqual(X.shape, (n_samples, n_features))

    # the component-wise variance is thus highly varying:
    self.assertGreater(X.std(axis=0).std().execute(), 43.8)

    for solver, copy in product(self.solver_list, (True, False)):
        # whiten the data while projecting to the lower dim subspace
        X_ = X.copy()  # make sure we keep an original across iterations.
        pca = PCA(n_components=n_components, whiten=True, copy=copy,
                  svd_solver=solver, random_state=0, iterated_power=7)
        # test fit_transform
        X_whitened = pca.fit_transform(X_.copy())
        self.assertEqual(X_whitened.shape, (n_samples, n_components))
        X_whitened2 = pca.transform(X_)
        assert_array_almost_equal(X_whitened.fetch(), X_whitened2.fetch())

        assert_almost_equal(X_whitened.std(ddof=1, axis=0).execute(),
                            np.ones(n_components), decimal=6)
        assert_almost_equal(X_whitened.mean(axis=0).execute(),
                            np.zeros(n_components))

        X_ = X.copy()
        pca = PCA(n_components=n_components, whiten=False, copy=copy,
                  svd_solver=solver).fit(X_)
        X_unwhitened = pca.transform(X_)
        self.assertEqual(X_unwhitened.shape, (n_samples, n_components))

        # in that case the output components still have varying variances
        assert_almost_equal(X_unwhitened.std(axis=0).std().execute(), 74.1, 1)

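# The unit-variance assertion above follows from how whitening is defined:
# the whitened scores are U * sqrt(n_samples - 1), so every score column has
# zero mean and exactly unit ddof=1 variance. A minimal NumPy-only sketch,
# assuming the scikit-learn PCA API; the helper name is hypothetical:
def _sketch_whitening_property(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    X = np.random.RandomState(0).randn(100, 80)
    X_w = SkPCA(n_components=30, whiten=True,
                svd_solver='full').fit_transform(X)
    # Whitening rescales each score column to unit (ddof=1) variance
    # around a zero mean.
    np.testing.assert_allclose(X_w.std(ddof=1, axis=0), np.ones(30))
    np.testing.assert_allclose(X_w.mean(axis=0), np.zeros(30), atol=1e-10)
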
def testExplainedVariance(self):
    # Check that PCA output has the correct explained variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = mt.tensor(rng.randn(n_samples, n_features))

    pca = PCA(n_components=2, svd_solver='full').fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=42).fit(X)
    assert_array_almost_equal(pca.explained_variance_.execute(),
                              rpca.explained_variance_.execute(), 1)
    assert_array_almost_equal(pca.explained_variance_ratio_.execute(),
                              rpca.explained_variance_ratio_.execute(), 1)

    # compare to empirical variances
    expected_result = np.linalg.eig(np.cov(X.execute(), rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]

    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_.execute(),
                              mt.var(X_pca, ddof=1, axis=0).execute())
    assert_array_almost_equal(pca.explained_variance_.execute(),
                              expected_result)

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_.execute(),
                              mt.var(X_rpca, ddof=1, axis=0).execute(),
                              decimal=1)
    assert_array_almost_equal(rpca.explained_variance_.execute(),
                              expected_result, decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features - 2,
                                     random_state=rng)[0]
    X = mt.tensor(X)

    pca = PCA(n_components=2).fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_.execute(),
                              rpca.explained_variance_ratio_.execute(), 5)

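# The "compare to empirical variances" step above uses the fact that
# explained_variance_ equals S**2 / (n_samples - 1), i.e. the leading
# eigenvalues of the ddof=1 sample covariance matrix. A minimal NumPy-only
# sketch of that identity, assuming the scikit-learn PCA API; the helper
# name is hypothetical:
def _sketch_explained_variance_identity(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    X = np.random.RandomState(0).randn(100, 80)
    pca = SkPCA(n_components=2, svd_solver='full').fit(X)
    # eigvalsh returns ascending eigenvalues of the symmetric covariance
    # matrix; reverse to descending and keep the top n_components.
    eigenvalues = np.linalg.eigvalsh(np.cov(X, rowvar=False))[::-1]
    np.testing.assert_allclose(pca.explained_variance_, eigenvalues[:2])
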
def _check_pca_int_dtype_upcast_to_double(self, svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = mt.tensor(np.random.RandomState(0).randint(0, 1000, (1000, 4)))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    self.assertEqual(pca_64.components_.dtype, np.float64)
    self.assertEqual(pca_32.components_.dtype, np.float64)
    self.assertEqual(pca_64.transform(X_i64).dtype, np.float64)
    self.assertEqual(pca_32.transform(X_i32).dtype, np.float64)

    assert_array_almost_equal(pca_64.components_.execute(),
                              pca_32.components_.execute(), decimal=5)

def _check_pca_float_dtype_preservation(self, svd_solver):
    # Ensure that PCA does not upcast the dtype when input is float32
    X_64 = mt.tensor(np.random.RandomState(0).rand(1000, 4)
                     .astype(np.float64, copy=False))
    X_32 = X_64.astype(np.float32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_32)

    self.assertEqual(pca_64.components_.dtype, np.float64)
    self.assertEqual(pca_32.components_.dtype, np.float32)
    self.assertEqual(pca_64.transform(X_64).dtype, np.float64)
    self.assertEqual(pca_32.transform(X_32).dtype, np.float32)

    # decimal=5 fails on mac with scipy = 1.1.0
    assert_array_almost_equal(pca_64.components_.execute(),
                              pca_32.components_.execute(), decimal=4)

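# Together, the two dtype checks above pin down the expected rule: integer
# input is upcast to float64, while float32 input is preserved end to end.
# A minimal NumPy-only sketch of that rule, assuming the scikit-learn PCA
# API; the helper name is hypothetical:
def _sketch_dtype_handling(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    rng = np.random.RandomState(0)
    X_int = rng.randint(0, 1000, (1000, 4)).astype(np.int32)
    X_f32 = rng.rand(1000, 4).astype(np.float32)
    # Integer input: everything is computed in (and returned as) float64.
    assert SkPCA(n_components=3).fit(X_int).components_.dtype == np.float64
    # float32 input: the fitted attributes stay float32.
    assert SkPCA(n_components=3).fit(X_f32).components_.dtype == np.float32
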
def test_pca_inverse(self):
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = mt.tensor(rng.randn(n, p))  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.execute(), Y_inverse.execute(), decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in self.solver_list:
        pca = PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(X)
        Y = pca.transform(X)
        Y_inverse = pca.inverse_transform(Y)
        assert_almost_equal(X.execute(), Y_inverse.execute(), decimal=3)

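# The whitened branch above relies on the matching inverse map: with
# whiten=True, inverse_transform rescales the scores by
# sqrt(explained_variance_) before projecting back through components_.
# A minimal NumPy-only sketch of that identity, again assuming the
# scikit-learn PCA API; the helper name is hypothetical:
def _sketch_whitened_inverse_identity(self):
    import numpy as np
    from sklearn.decomposition import PCA as SkPCA

    X = np.random.RandomState(0).randn(50, 3)
    pca = SkPCA(n_components=2, whiten=True).fit(X)
    Y = pca.transform(X)
    # Undo the whitening rescaling, then apply the usual affine inverse.
    np.testing.assert_allclose(
        pca.inverse_transform(Y),
        (Y * np.sqrt(pca.explained_variance_)) @ pca.components_ + pca.mean_,
        atol=1e-12)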