示例#1
0
def test_incremental_pca(svd_solver):
    """Fit IncrementalPCA on the module-level ``iris`` data and compare with PCA.

    Three things are checked:

    * the transformed output has one row per sample and two columns;
    * the summed explained-variance ratio agrees with ``PCA`` (rtol=1e-3);
    * for several ``n_components`` values, ``get_covariance()`` times
      ``get_precision()`` is the identity matrix, and the fitted attributes
      of ``PCA`` are instances of the corresponding ``IncrementalPCA``
      attribute types.

    ``svd_solver`` is forwarded to the two-component estimators only; the
    estimators built inside the loop use their default solver.
    """
    data = da.from_array(iris.data, chunks=(3, -1))
    n_samples, n_features = data.shape
    batch_size = n_samples // 3

    incremental = IncrementalPCA(
        n_components=2, batch_size=batch_size, svd_solver=svd_solver
    )
    reference = PCA(n_components=2, svd_solver=svd_solver)
    reference.fit_transform(data)

    projected = incremental.fit_transform(data)
    assert projected.shape == (n_samples, 2)

    np.testing.assert_allclose(
        incremental.explained_variance_ratio_.sum(),
        reference.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    identity = np.eye(n_features)
    fitted_attributes = (
        "singular_values_",
        "mean_",
        "explained_variance_",
        "explained_variance_ratio_",
    )
    for n_components in (1, 2, n_features):
        candidate = IncrementalPCA(n_components, batch_size=batch_size)
        candidate.fit(data)

        # Covariance and precision must be inverses of one another.
        product = np.dot(candidate.get_covariance(), candidate.get_precision())
        np.testing.assert_allclose(product, identity, atol=1e-13)

        # The reference PCA attributes must share the incremental ones' types.
        for attribute in fitted_attributes:
            expected_type = type(getattr(candidate, attribute))
            assert isinstance(getattr(reference, attribute), expected_type)
示例#2
0
def test_compare_with_sklearn(svd_solver, batch_number):
    """Check the dask-backed IncrementalPCA against ``sd.IncrementalPCA``.

    Both estimators are fitted on the module-level ``iris`` data with two
    components and the same batch size; the reference sees the raw array
    while the estimator under test sees it wrapped in a dask array.

    Parameters
    ----------
    svd_solver : str
        Solver forwarded to the estimator under test. ``"randomized"``
        relaxes the noise-variance comparison to one decimal place.
    batch_number : int
        Number of batches; the batch size is ``n_samples // batch_number``.
    """
    X = iris.data
    X_da = da.from_array(X, chunks=(3, -1))
    batch_size = X.shape[0] // batch_number

    ipca = sd.IncrementalPCA(n_components=2, batch_size=batch_size)
    ipca.fit(X)
    ipca_da = IncrementalPCA(
        n_components=2, batch_size=batch_size, svd_solver=svd_solver
    )
    ipca_da.fit(X_da)

    # Each fitted attribute is compared exactly once; err_msg names the
    # attribute so a failure points straight at the offending quantity.
    compared_attributes = (
        "components_",
        "explained_variance_",
        "explained_variance_ratio_",
    )
    for attribute in compared_attributes:
        np.testing.assert_allclose(
            getattr(ipca, attribute),
            getattr(ipca_da, attribute),
            atol=1e-13,
            err_msg=attribute,
        )

    if svd_solver == "randomized":
        # noise variance in randomized solver is probabilistic.
        assert_almost_equal(ipca.noise_variance_, ipca_da.noise_variance_, decimal=1)
    else:
        np.testing.assert_allclose(
            ipca.noise_variance_, ipca_da.noise_variance_, atol=1e-13
        )
示例#3
0
def test_incremental_pca_set_params():
    """``partial_fit`` must reject an ``n_components`` changed via ``set_params``.

    After an initial fit with 20 components, switching to 10 and then to 15
    components must each make ``partial_fit`` raise ``ValueError``. Restoring
    the original value of 20 must let ``partial_fit`` run again.
    """
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20

    # Three independent batches, drawn from the generator in sequence.
    first, second, third = (
        da.from_array(rng.randn(n_samples, n_features), chunks=[4, -1])
        for _ in range(3)
    )

    ipca = IncrementalPCA(n_components=20)
    ipca.fit(first)

    # A decreased (10) and an increased-again (15) component count are both
    # incompatible with the state produced by the 20-component fit.
    for rejected_n_components, batch in ((10, second), (15, third)):
        ipca.set_params(n_components=rejected_n_components)
        with pytest.raises(ValueError):
            ipca.partial_fit(batch)

    # Returning to the original setting is accepted.
    ipca.set_params(n_components=20)
    ipca.partial_fit(first)
示例#4
0
def test_singular_values(svd_solver):
    """Check that the IncrementalPCA output has the correct singular values.

    The test has two parts:

    1. Fit ``PCA`` and ``IncrementalPCA`` on a low-rank matrix and check that
       their singular values agree with each other, with the squared
       Frobenius norm of the transformed data, and with the 2-norms of the
       individual score columns.
    2. Build a matrix whose score columns have prescribed norms
       (3.142, 2.718 and 1.0) and check that both estimators recover exactly
       those singular values.

    ``svd_solver`` is forwarded to every estimator built here; it is
    presumably supplied by a parametrize decorator that is not visible in
    this block.
    """

    # NOTE: the same RandomState is handed to make_low_rank_matrix and to
    # PCA, so the order of these statements may affect the random stream.
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng
    )
    X = da.from_array(X, chunks=[200, -1])

    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100, svd_solver=svd_solver).fit(X)
    # The two estimators are only required to agree to 2 decimal places.
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    # Sum of squared singular values vs. squared Frobenius norm of the
    # scores: 12 decimals for PCA, 2 decimals for IncrementalPCA.
    assert_array_almost_equal(
        np.sum(pca.singular_values_ ** 2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12
    )
    assert_array_almost_equal(
        np.sum(ipca.singular_values_ ** 2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2
    )

    # Compare to the 2-norms of the score vectors
    # (each singular value against the norm of its own score column).
    assert_array_almost_equal(
        pca.singular_values_, np.sqrt(np.sum(X_pca ** 2.0, axis=0)), 12
    )
    assert_array_almost_equal(
        ipca.singular_values_, np.sqrt(np.sum(X_ipca ** 2.0, axis=0)), 2
    )

    # Set the singular values and see what we get back
    # A fresh RandomState(0) is created for this second part.
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng
    )
    X = da.from_array(X, chunks=[4, -1])

    # batch_size equals n_samples (100) in this part.
    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100, svd_solver=svd_solver)

    X_pca = pca.fit_transform(X)
    # Normalise every score column to unit 2-norm, then rescale the first
    # two columns; the third column keeps norm 1.0.
    X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    # Map the rescaled scores back through the fitted components and refit.
    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    # X_hat is wrapped in a dask array only for the IncrementalPCA fit;
    # PCA above receives it exactly as returned by np.dot.
    X_hat = da.from_array(X_hat, chunks=(4, -1))
    ipca.fit(X_hat)
    # Both estimators must recover the prescribed column norms.
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
示例#5
0
def test_incremental_pca_num_features_change():
    """``partial_fit`` must raise when the number of features changes.

    The estimator is first fitted on data with 20 features; a later
    ``partial_fit`` on data with 50 features must raise ``ValueError``.
    """
    rng = np.random.RandomState(1999)
    n_samples = 100

    # Same sample count, different feature counts (20 vs. 50).
    narrow = da.from_array(rng.randn(n_samples, 20), chunks=[4, -1])
    wide = da.from_array(rng.randn(n_samples, 50), chunks=[4, -1])

    ipca = IncrementalPCA(n_components=None)
    ipca.fit(narrow)

    with pytest.raises(ValueError):
        ipca.partial_fit(wide)