Example #1
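All of the snippets below are tests of the PCA implementation in Mars (pymars), which mirrors the scikit-learn API on top of Mars tensors. They share a common preamble; a minimal sketch, assuming PCA lives in mars.learn.decomposition and the iris fixture comes from scikit-learn (the class-based tests also assume a unittest.TestCase whose setUp defines self.iris and self.solver_list, e.g. ['full', 'randomized']):

from itertools import product

import numpy as np
import pytest
from numpy.testing import assert_almost_equal, assert_array_almost_equal
from sklearn import datasets

import mars.tensor as mt
from mars.learn.decomposition import PCA

iris = mt.tensor(datasets.load_iris().data)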
    def test_randomized_pca_inverse(self):
        # Test that randomized PCA is invertible on dense data
        rng = np.random.RandomState(0)
        n, p = 50, 3
        X = mt.tensor(rng.randn(n, p))  # spherical data
        X[:, 1] *= .00001  # make middle component relatively small
        X += [5, 4, 3]  # make a large mean

        # check that we can recover the original data from the transformed
        # signal (since the data is almost of rank n_components)
        pca = PCA(n_components=2, svd_solver='randomized',
                  random_state=0).fit(X)
        Y = pca.transform(X)
        Y_inverse = pca.inverse_transform(Y)
        assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=2)

        # same as above with whitening (approximate reconstruction)
        pca = PCA(n_components=2,
                  whiten=True,
                  svd_solver='randomized',
                  random_state=0).fit(X)
        Y = pca.transform(X)
        Y_inverse = pca.inverse_transform(Y)
        relative_max_delta = (mt.abs(X - Y_inverse) / mt.abs(X).mean()).max()
        self.assertLess(relative_max_delta.to_numpy(), 1e-5)
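For reference, inverse_transform is a linear map back through the fitted components: without whitening it computes Y @ components_ + mean_, and with whitening it first undoes the unit-variance scaling of the scores. A minimal NumPy sketch of that identity, using scikit-learn's PCA (whose attribute conventions the Mars version follows):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(50, 3) * [1.0, 1e-5, 1.0] + [5, 4, 3]

pca = PCA(n_components=2, whiten=True).fit(X)
Y = pca.transform(X)

# with whiten=True, inverse_transform rescales the scores by the
# per-component standard deviations before projecting back and
# re-adding the mean
X_back = Y * np.sqrt(pca.explained_variance_) @ pca.components_ + pca.mean_
np.testing.assert_allclose(X_back, pca.inverse_transform(Y))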
Example #2
    def test_singular_values(self):
        # Check that the PCA output has the correct singular values

        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
        assert_array_almost_equal(pca.singular_values_.fetch(),
                                  rpca.singular_values_.fetch(), 1)

        # Compare to the Frobenius norm
        X_pca = pca.transform(X)
        X_rpca = rpca.transform(X)
        assert_array_almost_equal(
            mt.sum(pca.singular_values_**2.0).to_numpy(),
            (mt.linalg.norm(X_pca, "fro")**2.0).to_numpy(), 12)
        assert_array_almost_equal(
            mt.sum(rpca.singular_values_**2.0).to_numpy(),
            (mt.linalg.norm(X_rpca, "fro")**2.0).to_numpy(), 0)

        # Compare to the 2-norms of the score vectors
        assert_array_almost_equal(
            pca.singular_values_.fetch(),
            mt.sqrt(mt.sum(X_pca**2.0, axis=0)).to_numpy(), 12)
        assert_array_almost_equal(
            rpca.singular_values_.fetch(),
            mt.sqrt(mt.sum(X_rpca**2.0, axis=0)).to_numpy(), 2)

        # Set the singular values and see what we get back
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 110

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=3, svd_solver='full', random_state=rng)
        rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng)
        X_pca = pca.fit_transform(X)

        X_pca /= mt.sqrt(mt.sum(X_pca**2.0, axis=0))
        X_pca[:, 0] *= 3.142
        X_pca[:, 1] *= 2.718

        X_hat = mt.dot(X_pca, pca.components_)
        pca.fit(X_hat)
        rpca.fit(X_hat)
        assert_array_almost_equal(pca.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0], 14)
        assert_array_almost_equal(rpca.singular_values_.fetch(),
                                  [3.142, 2.718, 1.0], 14)
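The identities exercised above follow from the thin SVD X_c = U S V^T of the centered data: the scores are X_pca = U S, so the squared Frobenius norm of the scores is the sum of squared singular values, and each score column's 2-norm is the corresponding singular value. A NumPy sketch of both identities:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 80)
Xc = X - X.mean(axis=0)              # PCA centers the data first

U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
scores = Xc @ Vt.T                   # equivalently U * s

# squared Frobenius norm of the scores equals the sum of squared
# singular values
np.testing.assert_allclose(np.linalg.norm(scores, 'fro') ** 2,
                           np.sum(s ** 2))
# each score column's 2-norm is the corresponding singular value
np.testing.assert_allclose(np.linalg.norm(scores, axis=0), s)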
Example #3
    def testWhitening(self):
        # Check that PCA output has unit-variance
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80
        n_components = 30
        rank = 50

        # some low rank data with correlated features
        X = mt.dot(
            rng.randn(n_samples, rank),
            mt.dot(mt.diag(mt.linspace(10.0, 1.0, rank)),
                   rng.randn(rank, n_features)))
        # scaling the first 50 features by 3 makes their component-wise
        # variance 9 times the mean variance of the remaining 30 features
        X[:, :50] *= 3

        self.assertEqual(X.shape, (n_samples, n_features))

        # the component-wise variance is thus highly varying:
        self.assertGreater(X.std(axis=0).std().to_numpy(), 43.8)

        for solver, copy in product(self.solver_list, (True, False)):
            # whiten the data while projecting to the lower dim subspace
            X_ = X.copy()  # make sure we keep an original across iterations.
            pca = PCA(n_components=n_components,
                      whiten=True,
                      copy=copy,
                      svd_solver=solver,
                      random_state=0,
                      iterated_power=7)
            # test fit_transform
            X_whitened = pca.fit_transform(X_.copy())
            self.assertEqual(X_whitened.shape, (n_samples, n_components))
            X_whitened2 = pca.transform(X_)
            assert_array_almost_equal(X_whitened.fetch(), X_whitened2.fetch())

            assert_almost_equal(X_whitened.std(ddof=1, axis=0).to_numpy(),
                                np.ones(n_components),
                                decimal=6)
            assert_almost_equal(
                X_whitened.mean(axis=0).to_numpy(), np.zeros(n_components))

            X_ = X.copy()
            pca = PCA(n_components=n_components,
                      whiten=False,
                      copy=copy,
                      svd_solver=solver).fit(X_)
            X_unwhitened = pca.transform(X_)
            self.assertEqual(X_unwhitened.shape, (n_samples, n_components))

            # in that case the output components still have varying variances
            assert_almost_equal(
                X_unwhitened.std(axis=0).std().to_numpy(), 74.1, 1)
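Under the hood, whitening divides each score column by its singular value and rescales by sqrt(n_samples - 1), which is exactly why the ddof=1 standard deviations above come out as ones. A NumPy sketch:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 80)
Xc = X - X.mean(axis=0)

U, s, Vt = np.linalg.svd(Xc, full_matrices=False)
k = 30

# whitened scores: the left singular vectors scaled so that every
# retained component has unit sample variance
X_whitened = U[:, :k] * np.sqrt(X.shape[0] - 1)

np.testing.assert_allclose(X_whitened.std(axis=0, ddof=1), np.ones(k))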
Example #4
    def testPCA(self):
        X = self.iris

        for n_comp in np.arange(X.shape[1]):
            pca = PCA(n_components=n_comp, svd_solver='full')

            X_r = pca.fit(X).transform(X).fetch()
            np.testing.assert_equal(X_r.shape[1], n_comp)

            X_r2 = pca.fit_transform(X).fetch()
            assert_array_almost_equal(X_r, X_r2)

            X_r = pca.transform(X).fetch()
            X_r2 = pca.fit_transform(X).fetch()
            assert_array_almost_equal(X_r, X_r2)

            # Test get_covariance and get_precision
            cov = pca.get_covariance()
            precision = pca.get_precision()
            assert_array_almost_equal(
                mt.dot(cov, precision).to_numpy(), np.eye(X.shape[1]), 12)

        # test explained_variance_ratio_ == 1 with all components
        pca = PCA(svd_solver='full')
        pca.fit(X)
        np.testing.assert_allclose(
            pca.explained_variance_ratio_.sum().to_numpy(), 1.0, rtol=1e-3)
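The final assertion holds because explained_variance_ratio_ divides each component's variance by the total variance, so the ratios sum to one when no component is dropped. A NumPy sketch of that bookkeeping:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
Xc = X - X.mean(axis=0)

s = np.linalg.svd(Xc, compute_uv=False)
ratios = s ** 2 / np.sum(s ** 2)  # explained_variance_ratio_, all components
np.testing.assert_allclose(ratios.sum(), 1.0)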
Example #5
def test_pca_randomized_solver(setup):
    # PCA on dense arrays
    X = iris

    # Loop excluding 0, which is invalid for the randomized solver
    for n_comp in np.arange(1, X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='randomized', random_state=0)

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        X_r = pca.transform(X)
        assert_array_almost_equal(X_r.fetch(), X_r2.fetch())

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(mt.dot(cov, precision).to_numpy(),
                                  mt.eye(X.shape[1]).to_numpy(), 12)

    pca = PCA(n_components=0, svd_solver='randomized', random_state=0)
    with pytest.raises(ValueError):
        pca.fit(X)

    # Check internal state
    assert pca.n_components == PCA(n_components=0, svd_solver='randomized',
                                   random_state=0).n_components
    assert pca.svd_solver == PCA(n_components=0, svd_solver='randomized',
                                 random_state=0).svd_solver
Example #6
    def testExplainedVariance(self):
        # Check that explained_variance_ agrees between solvers and matches
        # the empirical variances of the data
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 80

        X = mt.tensor(rng.randn(n_samples, n_features))

        pca = PCA(n_components=2, svd_solver='full').fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=42).fit(X)
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  rpca.explained_variance_.to_numpy(), 1)
        assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(),
                                  rpca.explained_variance_ratio_.to_numpy(), 1)

        # compare to empirical variances
        expected_result = np.linalg.eig(np.cov(X.to_numpy(), rowvar=False))[0]
        expected_result = sorted(expected_result, reverse=True)[:2]

        X_pca = pca.transform(X)
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  mt.var(X_pca, ddof=1, axis=0).to_numpy())
        assert_array_almost_equal(pca.explained_variance_.to_numpy(),
                                  expected_result)

        X_rpca = rpca.transform(X)
        assert_array_almost_equal(rpca.explained_variance_.to_numpy(),
                                  mt.var(X_rpca, ddof=1, axis=0).to_numpy(),
                                  decimal=1)
        assert_array_almost_equal(rpca.explained_variance_.to_numpy(),
                                  expected_result,
                                  decimal=1)

        # Same with correlated data
        X = datasets.make_classification(n_samples,
                                         n_features,
                                         n_informative=n_features - 2,
                                         random_state=rng)[0]
        X = mt.tensor(X)

        pca = PCA(n_components=2).fit(X)
        rpca = PCA(n_components=2, svd_solver='randomized',
                   random_state=rng).fit(X)
        assert_array_almost_equal(pca.explained_variance_ratio_.to_numpy(),
                                  rpca.explained_variance_ratio_.to_numpy(), 5)
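The quantity cross-checked here has a closed form: explained_variance_ is sigma_i ** 2 / (n_samples - 1) for each retained singular value, which equals the corresponding eigenvalue of the sample covariance matrix. A NumPy sketch of the identity:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 80)
Xc = X - X.mean(axis=0)

s = np.linalg.svd(Xc, compute_uv=False)
explained_variance = s ** 2 / (X.shape[0] - 1)

# the same numbers via the eigenvalues of the sample covariance matrix
eigvals = np.sort(np.linalg.eigvalsh(np.cov(X, rowvar=False)))[::-1]
np.testing.assert_allclose(explained_variance, eigvals)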
Example #7
def _check_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = mt.tensor(np.random.RandomState(0).randint(0, 1000, (1000, 4)))
    X_i64 = X_i64.astype(np.int64, copy=False)
    X_i32 = X_i64.astype(np.int32, copy=False)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_array_almost_equal(pca_64.components_.to_numpy(),
                              pca_32.components_.to_numpy(), decimal=5)
Example #8
def _check_pca_float_dtype_preservation(svd_solver):
    # Ensure that PCA does not upcast the dtype when input is float32
    X_64 = mt.tensor(np.random.RandomState(0).rand(1000, 4).astype(np.float64,
                                                                   copy=False))
    X_32 = X_64.astype(np.float32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(X_64).dtype == np.float64
    assert pca_32.transform(X_32).dtype == np.float32

    # decimal=5 fails on mac with scipy = 1.1.0
    assert_array_almost_equal(pca_64.components_.to_numpy(),
                              pca_32.components_.to_numpy(), decimal=4)
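Both dtype checks encode the same contract as scikit-learn, which the Mars implementation follows: floating-point inputs keep their precision, while integer inputs are upcast to float64. A quick sketch of the contract against scikit-learn's PCA:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X64 = rng.rand(1000, 4)

for X, expected in [(X64.astype(np.float32), np.float32),
                    (X64, np.float64),
                    ((X64 * 1000).astype(np.int32), np.float64)]:
    pca = PCA(n_components=3, random_state=0).fit(X)
    assert pca.components_.dtype == expected
    assert pca.transform(X).dtype == expected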
Example #9
    def test_pca_inverse(self):
        # Test that the projection of data can be inverted
        rng = np.random.RandomState(0)
        n, p = 50, 3
        X = mt.tensor(rng.randn(n, p))  # spherical data
        X[:, 1] *= .00001  # make middle component relatively small
        X += [5, 4, 3]  # make a large mean

        # check that we can recover the original data from the transformed
        # signal (since the data is almost of rank n_components)
        pca = PCA(n_components=2, svd_solver='full').fit(X)
        Y = pca.transform(X)
        Y_inverse = pca.inverse_transform(Y)
        assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3)

        # same as above with whitening (approximate reconstruction)
        for solver in self.solver_list:
            pca = PCA(n_components=2, whiten=True, svd_solver=solver)
            pca.fit(X)
            Y = pca.transform(X)
            Y_inverse = pca.inverse_transform(Y)
            assert_almost_equal(X.to_numpy(), Y_inverse.to_numpy(), decimal=3)