示例#1
0
def test_pca_singular_values(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)

    # compare to the Frobenius norm
    assert_allclose(np.sum(pca.singular_values_**2),
                    np.linalg.norm(X_trans, "fro")**2)
    # Compare to the 2-norms of the score vectors
    assert_allclose(pca.singular_values_, np.sqrt(np.sum(X_trans**2, axis=0)))

    # set the singular values and see what er get back
    n_samples, n_features = 100, 110
    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    X_trans = pca.fit_transform(X)
    X_trans /= np.sqrt(np.sum(X_trans**2, axis=0))
    X_trans[:, 0] *= 3.142
    X_trans[:, 1] *= 2.718
    X_hat = np.dot(X_trans, pca.components_)
    pca.fit(X_hat)
    assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])
示例#2
0
def test_pca_score_consistency_solvers(svd_solver):
    # Check the consistency of score between solvers
    X, _ = datasets.load_digits(return_X_y=True)
    pca_full = PCA(n_components=30, svd_solver='full', random_state=0)
    pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca_full.fit(X)
    pca_other.fit(X)
    assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)
示例#3
0
def test_n_components_mle(svd_solver):
    # Ensure that n_components == 'mle' doesn't raise error for auto/full
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    pca.fit(X)
    assert pca.n_components_ == 0
示例#4
0
def test_pca_svd_solver_auto(data, n_components, expected_solver):
    pca_auto = PCA(n_components=n_components, random_state=0)
    pca_test = PCA(n_components=n_components,
                   svd_solver=expected_solver,
                   random_state=0)
    pca_auto.fit(data)
    pca_test.fit(data)
    assert_allclose(pca_auto.components_, pca_test.components_)
示例#5
0
def test_pca_sparse_input(svd_solver):
    X = np.random.RandomState(0).rand(5, 4)
    X = sp.sparse.csr_matrix(X)
    assert sp.sparse.issparse(X)

    pca = PCA(n_components=3, svd_solver=svd_solver)
    with pytest.raises(TypeError):
        pca.fit(X)
示例#6
0
def test_pca_sanity_noise_variance(svd_solver):
    # Sanity check for the noise_variance_. For more details see
    # https://github.com/scikit-learn/scikit-learn/issues/7568
    # https://github.com/scikit-learn/scikit-learn/issues/8541
    # https://github.com/scikit-learn/scikit-learn/issues/8544
    X, _ = datasets.load_digits(return_X_y=True)
    pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
    pca.fit(X)
    assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
示例#7
0
def test_no_empty_slice_warning():
    # test if we avoid numpy warnings for computing over empty arrays
    n_components = 10
    n_features = n_components + 2  # anything > n_comps triggered it in 0.16
    X = np.random.uniform(-1, 1, size=(n_components, n_features))
    pca = PCA(n_components=n_components)
    with pytest.warns(None) as record:
        pca.fit(X)
    assert not record.list
示例#8
0
def test_n_components_mle_error(svd_solver):
    # Ensure that n_components == 'mle' will raise an error for unsupported
    # solvers
    rng = np.random.RandomState(0)
    n_samples, n_features = 600, 10
    X = rng.randn(n_samples, n_features)
    pca = PCA(n_components='mle', svd_solver=svd_solver)
    err_msg = ("n_components='mle' cannot be a string with svd_solver='{}'".
               format(svd_solver))
    with pytest.raises(ValueError, match=err_msg):
        pca.fit(X)
示例#9
0
def test_infer_dim_3():
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 2
示例#10
0
def test_infer_dim_2():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert _infer_dimension_(spect, n, p) > 1
示例#11
0
def test_infer_dim_1():
    # TODO: explain what this is testing
    # Or at least use explicit variable names...
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
         np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)])
    assert ll[1] > ll.max() - .01 * n
示例#12
0
def test_pca_zero_noise_variance_edge_cases(svd_solver):
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])

    pca = PCA(n_components=p, svd_solver=svd_solver)
    pca.fit(X)
    assert pca.noise_variance_ == 0

    pca.fit(X.T)
    assert pca.noise_variance_ == 0
示例#13
0
def test_pca_singular_values_consistency(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.singular_values_,
                    pca_other.singular_values_,
                    rtol=5e-3)
def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=10, random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=3, random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
示例#15
0
def test_pca_score3():
    # Check that probabilistic PCA selects the right model
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    ll = np.zeros(p)
    for k in range(p):
        pca = PCA(n_components=k, svd_solver='full')
        pca.fit(Xl)
        ll[k] = pca.score(Xt)

    assert ll.argmax() == 1
示例#16
0
def test_pca_vs_spca():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
    spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
    pca = PCA(n_components=2)
    pca.fit(Y)
    spca.fit(Y)
    results_test_pca = pca.transform(Z)
    results_test_spca = spca.transform(Z)
    assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
                    np.eye(2),
                    atol=1e-5)
    results_test_pca *= np.sign(results_test_pca[0, :])
    results_test_spca *= np.sign(results_test_spca[0, :])
    assert_allclose(results_test_pca, results_test_spca)
示例#17
0
def test_pca_explained_variance_equivalence_solver(svd_solver):
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    X = rng.randn(n_samples, n_features)

    pca_full = PCA(n_components=2, svd_solver='full')
    pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=0)

    pca_full.fit(X)
    pca_other.fit(X)

    assert_allclose(pca_full.explained_variance_,
                    pca_other.explained_variance_,
                    rtol=5e-2)
    assert_allclose(pca_full.explained_variance_ratio_,
                    pca_other.explained_variance_ratio_,
                    rtol=5e-2)
示例#18
0
def test_pca_score(svd_solver):
    # Test that probabilistic PCA scoring yields a reasonable score
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    pca = PCA(n_components=2, svd_solver=svd_solver)
    pca.fit(X)

    ll1 = pca.score(X)
    h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1**2) * p
    assert_allclose(ll1 / h, 1, rtol=5e-2)

    ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
    assert ll1 > ll2

    pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver)
    pca.fit(X)
    ll2 = pca.score(X)
    assert ll1 > ll2
示例#19
0
def test_pca_validation(svd_solver, data, n_components, err_msg):
    # Ensures that solver-specific extreme inputs for the n_components
    # parameter raise errors
    smallest_d = 2  # The smallest dimension
    lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0}
    pca_fitted = PCA(n_components, svd_solver=svd_solver)

    solver_reported = 'full' if svd_solver == 'auto' else svd_solver
    err_msg = err_msg.format(n_components, lower_limit[svd_solver], smallest_d,
                             solver_reported)
    with pytest.raises(ValueError, match=err_msg):
        pca_fitted.fit(data)

    # Additional case for arpack
    if svd_solver == 'arpack':
        n_components = smallest_d

        err_msg = ("n_components={}L? must be strictly less than "
                   r"min\(n_samples, n_features\)={}L? with "
                   "svd_solver=\'arpack\'".format(n_components, smallest_d))
        with pytest.raises(ValueError, match=err_msg):
            PCA(n_components, svd_solver=svd_solver).fit(data)
示例#20
0
def test_pca(svd_solver, n_components):
    X = iris.data
    pca = PCA(n_components=n_components, svd_solver=svd_solver)

    # check the shape of fit.transform
    X_r = pca.fit(X).transform(X)
    assert X_r.shape[1] == n_components

    # check the equivalence of fit.transform and fit_transform
    X_r2 = pca.fit_transform(X)
    assert_allclose(X_r, X_r2)
    X_r = pca.transform(X)
    assert_allclose(X_r, X_r2)

    # Test get_covariance and get_precision
    cov = pca.get_covariance()
    precision = pca.get_precision()
    assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)
示例#21
0
def test_pca_bad_solver():
    X = np.random.RandomState(0).rand(5, 4)
    pca = PCA(n_components=3, svd_solver='bad_argument')
    with pytest.raises(ValueError):
        pca.fit(X)
示例#22
0
def test_n_components_none(data, solver, n_components_):
    pca = PCA(svd_solver=solver)
    pca.fit(data)
    assert pca.n_components_ == n_components_
示例#23
0
def test_infer_dim_by_explained_variance(X, n_components,
                                         n_components_validated):
    pca = PCA(n_components=n_components, svd_solver='full')
    pca.fit(X)
    assert pca.n_components == pytest.approx(n_components)
    assert pca.n_components_ == n_components_validated