def testPCA(self):
    """PCA with the full SVD solver: output shape, fit/transform
    consistency, and covariance/precision inversion on the iris data."""
    X = self.iris
    for n_comp in np.arange(X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='full')

        X_r = pca.fit(X).transform(X).fetch()
        np.testing.assert_equal(X_r.shape[1], n_comp)

        # fit(X).transform(X) and fit_transform(X) must agree
        X_r2 = pca.fit_transform(X).fetch()
        assert_array_almost_equal(X_r, X_r2)
        X_r = pca.transform(X).fetch()
        X_r2 = pca.fit_transform(X).fetch()
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(mt.dot(cov, precision).execute(),
                                  np.eye(X.shape[1]), 12)

    # test explained_variance_ratio_ == 1 with all components
    pca = PCA(svd_solver='full')
    pca.fit(X)
    # BUG FIX: the third positional argument of assert_allclose is
    # ``rtol``; passing 3 made this check vacuous (anything within a
    # 300% relative tolerance passed).  Use an explicit tight tolerance.
    np.testing.assert_allclose(
        pca.explained_variance_ratio_.sum().execute(), 1.0, rtol=1e-3)
def test_n_components_none(self):
    """n_components=None must resolve to min(n_samples, n_features)."""
    X = self.iris
    for solver in self.solver_list:
        # Run on both X and X.T so the check is invariant to axis.
        for data in (X, X.T):
            estimator = PCA(svd_solver=solver)
            estimator.fit(data)
            self.assertEqual(estimator.n_components_, min(data.shape))
def test_singular_values(self):
    """The singular values reported by PCA must match the data:
    agree between solvers, match the Frobenius norm of the scores,
    and round-trip when the spectrum is set explicitly."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = mt.tensor(rng.randn(n_samples, n_features))

    full_pca = PCA(n_components=2, svd_solver='full',
                   random_state=rng).fit(X)
    rand_pca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
    assert_array_almost_equal(full_pca.singular_values_.fetch(),
                              rand_pca.singular_values_.fetch(), 1)

    # Compare to the Frobenius norm of the projected data.
    X_full = full_pca.transform(X)
    X_rand = rand_pca.transform(X)
    assert_array_almost_equal(
        mt.sum(full_pca.singular_values_ ** 2.0).execute(),
        (mt.linalg.norm(X_full, "fro") ** 2.0).execute(), 12)
    assert_array_almost_equal(
        mt.sum(rand_pca.singular_values_ ** 2.0).execute(),
        (mt.linalg.norm(X_rand, "fro") ** 2.0).execute(), 0)

    # Compare to the 2-norms of the score vectors.
    assert_array_almost_equal(
        full_pca.singular_values_.fetch(),
        mt.sqrt(mt.sum(X_full ** 2.0, axis=0)).execute(), 12)
    assert_array_almost_equal(
        rand_pca.singular_values_.fetch(),
        mt.sqrt(mt.sum(X_rand ** 2.0, axis=0)).execute(), 2)

    # Set the singular values and see what we get back.
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 110
    X = mt.tensor(rng.randn(n_samples, n_features))

    full_pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    rand_pca = PCA(n_components=3, svd_solver='randomized',
                   random_state=rng)
    X_full = full_pca.fit_transform(X)
    # Normalize the scores, then impose a known spectrum.
    X_full /= mt.sqrt(mt.sum(X_full ** 2.0, axis=0))
    X_full[:, 0] *= 3.142
    X_full[:, 1] *= 2.718
    X_hat = mt.dot(X_full, full_pca.components_)
    full_pca.fit(X_hat)
    rand_pca.fit(X_hat)
    assert_array_almost_equal(full_pca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rand_pca.singular_values_.fetch(),
                              [3.142, 2.718, 1.0], 14)
def test_pca_score(self):
    """Probabilistic PCA scoring should yield a reasonable log-likelihood."""
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(n, p) * .1) + mt.array([3, 4, 5])
    # Expected per-sample log-likelihood of an isotropic Gaussian with
    # std 0.1 in each of the p dimensions (loop-invariant, hoisted).
    h = -0.5 * mt.log(2 * mt.pi * mt.exp(1) * 0.1 ** 2) * p
    for solver in self.solver_list:
        estimator = PCA(n_components=2, svd_solver=solver)
        estimator.fit(X)
        ll = estimator.score(X)
        np.testing.assert_almost_equal((ll / h).execute(), 1, 0)
def test_infer_dim_3(self):
    """With three planted clusters, the inferred dimension exceeds 2."""
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(n, p) * .1)
    # Shift three disjoint sample groups to plant structure.
    X[:10] += mt.array([3, 4, 5, 1, 2])
    X[10:20] += mt.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * mt.array([-1, 1, -1, 1, -1])
    estimator = PCA(n_components=p, svd_solver='full')
    estimator.fit(X)
    spectrum = estimator.explained_variance_
    self.assertGreater(_infer_dimension_(spectrum, n, p).execute(), 2)
def test_infer_dim_2(self):
    """With two planted clusters, the inferred dimension exceeds 1."""
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(n, p) * .1)
    # Shift two disjoint sample groups to plant structure.
    X[:10] += mt.array([3, 4, 5, 1, 2])
    X[10:20] += mt.array([6, 0, 7, 2, -1])
    estimator = PCA(n_components=p, svd_solver='full')
    estimator.fit(X)
    spectrum = estimator.explained_variance_
    self.assertGreater(_infer_dimension_(spectrum, n, p).execute(), 1)
def test_infer_dim_1(self):
    """Rank-1 signal: dimension 1 should score near the maximum."""
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (mt.tensor(rng.randn(n, p)) * .1 +
         mt.tensor(rng.randn(n, 1)) * mt.array([3, 4, 5, 1, 2]) +
         mt.array([1, 0, 7, 4, 6]))
    estimator = PCA(n_components=p, svd_solver='full')
    estimator.fit(X)
    spectrum = estimator.explained_variance_
    ll = mt.array([_assess_dimension_(spectrum, rank, n, p)
                   for rank in range(p)]).execute()
    self.assertGreater(ll[1], ll.max() - .01 * n)
def test_pca_score3(self):
    """Probabilistic PCA model selection should pick the rank-1 model."""
    n, p = 200, 3
    rng = np.random.RandomState(0)
    # Train and test sets drawn from the same rank-1-plus-noise model.
    Xl = mt.tensor(rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
                   np.array([1, 0, 7]))
    Xt = mt.tensor(rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
                   np.array([1, 0, 7]))

    ll = mt.zeros(p)
    for k in range(p):
        estimator = PCA(n_components=k, svd_solver='full')
        estimator.fit(Xl)
        ll[k] = estimator.score(Xt)

    assert ll.argmax().execute() == 1
def test_pca_score2(self):
    """Probabilistic PCA must score in-distribution data higher, and
    whitening must change the score."""
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(n, p) * .1) + mt.array([3, 4, 5])
    for solver in self.solver_list:
        pca = PCA(n_components=2, svd_solver=solver)
        pca.fit(X)
        ll1 = pca.score(X)
        # Data drawn with a different scale should score lower.
        ll2 = pca.score(mt.tensor(rng.randn(n, p) * .2) + mt.array([3, 4, 5]))
        self.assertGreater(ll1.fetch(), ll2.fetch())

        # Test that it gives different scores if whiten=True
        pca = PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(X)
        ll2 = pca.score(X)
        # Consistency fix: use assertGreater (as above) instead of a bare
        # ``assert`` so failures report both compared values.
        self.assertGreater(ll1.fetch(), ll2.fetch())
def test_pca_zero_noise_variance_edge_cases(self):
    """noise_variance_ must be 0 when n_components equals
    min(n_samples, n_features)."""
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(n, p) * .1) + mt.array([3, 4, 5])
    # arpack raises ValueError for n_components == min(n_samples,
    # n_features), so only 'full' and 'randomized' are exercised here.
    for svd_solver in ('full', 'randomized'):
        estimator = PCA(svd_solver=svd_solver, n_components=p)
        estimator.fit(X)
        self.assertEqual(estimator.noise_variance_, 0)
        estimator.fit(X.T)
        self.assertEqual(estimator.noise_variance_, 0)
def test_infer_dim_by_explained_variance(self):
    """A fractional n_components selects the number of components
    needed to reach that explained-variance ratio."""
    X = self.iris
    for ratio, expected in ((0.95, 2), (0.01, 1)):
        estimator = PCA(n_components=ratio, svd_solver='full')
        estimator.fit(X)
        self.assertEqual(estimator.n_components, ratio)
        self.assertEqual(estimator.n_components_, expected)

    rng = np.random.RandomState(0)
    # more features than samples
    X = mt.tensor(rng.rand(5, 20))
    estimator = PCA(n_components=.5, svd_solver='full').fit(X)
    self.assertEqual(estimator.n_components, 0.5)
    self.assertEqual(estimator.n_components_, 2)
def test_n_components_mle(self):
    """n_components='mle' works for the auto/full solvers and raises a
    ValueError for the arpack/randomized solvers."""
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.randn(600, 10))
    n_components_dict = {}
    for solver in self.solver_list:
        estimator = PCA(n_components='mle', svd_solver=solver)
        if solver in ('auto', 'full'):
            estimator.fit(X)
            n_components_dict[solver] = estimator.n_components_
        else:
            # arpack/randomized solvers reject 'mle'
            error_message = ("n_components='mle' cannot be a string with "
                             "svd_solver='{}'".format(solver))
            assert_raise_message(ValueError, error_message,
                                 estimator.fit, X)
    # Both accepting solvers must infer the same dimensionality.
    self.assertEqual(n_components_dict['auto'], n_components_dict['full'])
def testPCARandomizedSolver(self):
    """PCA with the randomized solver on dense arrays: shapes,
    fit/transform consistency, covariance, and n_components=0 rejection."""
    # PCA on dense arrays
    X = self.iris

    # Loop excluding 0, which is invalid for the randomized solver.
    for n_comp in np.arange(1, X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='randomized',
                  random_state=0)

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        X_r = pca.transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(mt.dot(cov, precision).execute(),
                                  mt.eye(X.shape[1]).execute(), 12)

    # n_components == 0 must be rejected.  (The original test repeated
    # this construct/assertRaises pair twice verbatim; the copy-paste
    # duplicate has been removed.)
    pca = PCA(n_components=0, svd_solver='randomized', random_state=0)
    with self.assertRaises(ValueError):
        pca.fit(X)

    # Check internal state: the failed fit must leave the constructor
    # parameters untouched.
    self.assertEqual(
        pca.n_components,
        PCA(n_components=0, svd_solver='randomized',
            random_state=0).n_components)
    self.assertEqual(
        pca.svd_solver,
        PCA(n_components=0, svd_solver='randomized',
            random_state=0).svd_solver)
def test_pca_inverse(self):
    """inverse_transform should approximately recover the original data
    when it is almost of rank n_components."""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = mt.tensor(rng.randn(n, p))  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # Signal is nearly rank-2, so projecting to 2 components and back
    # should reproduce the data.
    estimator = PCA(n_components=2, svd_solver='full').fit(X)
    recovered = estimator.inverse_transform(estimator.transform(X))
    assert_almost_equal(X.execute(), recovered.execute(), decimal=3)

    # Same check with whitening (approximate reconstruction).
    for solver in self.solver_list:
        estimator = PCA(n_components=2, whiten=True, svd_solver=solver)
        estimator.fit(X)
        recovered = estimator.inverse_transform(estimator.transform(X))
        assert_almost_equal(X.execute(), recovered.execute(), decimal=3)
def test_pca_bad_solver(self):
    """An unknown svd_solver must raise ValueError at fit time."""
    X = mt.tensor(np.random.RandomState(0).rand(5, 4))
    estimator = PCA(n_components=3, svd_solver='bad_argument')
    with self.assertRaises(ValueError):
        estimator.fit(X)
def test_svd_solver_auto(self):
    """svd_solver='auto' must dispatch to the expected concrete solver
    depending on n_components and the data shape."""
    rng = np.random.RandomState(0)
    X = mt.tensor(rng.uniform(size=(1000, 50)))

    def check_same_components(auto_pca, explicit_pca, data):
        # Fitting with 'auto' and with the expected explicit solver
        # must produce identical components.
        auto_pca.fit(data)
        explicit_pca.fit(data)
        assert_array_almost_equal(auto_pca.components_.execute(),
                                  explicit_pca.components_.execute())

    # case: n_components in (0,1) => 'full'
    check_same_components(PCA(n_components=.5),
                          PCA(n_components=.5, svd_solver='full'), X)

    # case: max(X.shape) <= 500 => 'full'
    Y = X[:10, :]
    check_same_components(
        PCA(n_components=5, random_state=0),
        PCA(n_components=5, svd_solver='full', random_state=0), Y)

    # case: n_components >= .8 * min(X.shape) => 'full'
    check_same_components(PCA(n_components=50),
                          PCA(n_components=50, svd_solver='full'), X)

    # case: 1 <= n_components < .8 * min(X.shape) => 'randomized'
    check_same_components(
        PCA(n_components=10, random_state=0),
        PCA(n_components=10, svd_solver='randomized', random_state=0), X)