def test_incremental_pca_sparse(matrix_class): # Incremental PCA on sparse arrays. X = iris.data pca = PCA(n_components=2) pca.fit_transform(X) X_sparse = matrix_class(X) batch_size = X_sparse.shape[0] // 3 ipca = IncrementalPCA(n_components=2, batch_size=batch_size) X_transformed = ipca.fit_transform(X_sparse) assert X_transformed.shape == (X_sparse.shape[0], 2) np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(), pca.explained_variance_ratio_.sum(), rtol=1e-3) for n_components in [1, 2, X.shape[1]]: ipca = IncrementalPCA(n_components, batch_size=batch_size) ipca.fit(X_sparse) cov = ipca.get_covariance() precision = ipca.get_precision() np.testing.assert_allclose(np.dot(cov, precision), np.eye(X_sparse.shape[1]), atol=1e-13) with pytest.raises( TypeError, match="IncrementalPCA.partial_fit does not support " "sparse input. Either convert data to dense " "or use IncrementalPCA.fit to do so in batches."): ipca.partial_fit(X_sparse)
def test_feature_union_weights(): # test feature union with transformer weights X = iris.data y = iris.target pca = PCA(n_components=2, svd_solver='randomized', random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert X_fit_transformed_wo_method.shape == (X.shape[0], 7)
def test_pca_explained_variance_empirical(X, svd_solver): pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0) X_pca = pca.fit_transform(X) assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0)) expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0] expected_result = sorted(expected_result, reverse=True)[:2] assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)
def test_pca_check_projection_list(svd_solver): # Test that the projection of data is correct X = [[1.0, 0.0], [0.0, 1.0]] pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0) X_trans = pca.fit_transform(X) assert X_trans.shape, (2, 1) assert_allclose(X_trans.mean(), 0.00, atol=1e-12) assert_allclose(X_trans.std(), 0.71, rtol=5e-3)
def test_pca_score_consistency_solvers(svd_solver): # Check the consistency of score between solvers X, _ = datasets.load_digits(return_X_y=True) pca_full = PCA(n_components=30, svd_solver='full', random_state=0) pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0) pca_full.fit(X) pca_other.fit(X) assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)
def test_pca_sparse_input(svd_solver): X = np.random.RandomState(0).rand(5, 4) X = sp.sparse.csr_matrix(X) assert sp.sparse.issparse(X) pca = PCA(n_components=3, svd_solver=svd_solver) with pytest.raises(TypeError): pca.fit(X)
def test_n_components_mle(svd_solver): # Ensure that n_components == 'mle' doesn't raise error for auto/full rng = np.random.RandomState(0) n_samples, n_features = 600, 10 X = rng.randn(n_samples, n_features) pca = PCA(n_components='mle', svd_solver=svd_solver) pca.fit(X) assert pca.n_components_ == 0
def test_no_empty_slice_warning(): # test if we avoid numpy warnings for computing over empty arrays n_components = 10 n_features = n_components + 2 # anything > n_comps triggered it in 0.16 X = np.random.uniform(-1, 1, size=(n_components, n_features)) pca = PCA(n_components=n_components) with pytest.warns(None) as record: pca.fit(X) assert not record.list
def test_pca_sanity_noise_variance(svd_solver): # Sanity check for the noise_variance_. For more details see # https://github.com/scikit-learn/scikit-learn/issues/7568 # https://github.com/scikit-learn/scikit-learn/issues/8541 # https://github.com/scikit-learn/scikit-learn/issues/8544 X, _ = datasets.load_digits(return_X_y=True) pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0) pca.fit(X) assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
def test_pca_deterministic_output(svd_solver): rng = np.random.RandomState(0) X = rng.rand(10, 10) transformed_X = np.zeros((20, 2)) for i in range(20): pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) transformed_X[i, :] = pca.fit_transform(X)[0] assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))
def test_infer_dim_3(): n, p = 100, 5 rng = np.random.RandomState(0) X = rng.randn(n, p) * .1 X[:10] += np.array([3, 4, 5, 1, 2]) X[10:20] += np.array([6, 0, 7, 2, -1]) X[30:40] += 2 * np.array([-1, 1, -1, 1, -1]) pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ assert _infer_dimension_(spect, n, p) > 2
def test_n_components_mle_error(svd_solver): # Ensure that n_components == 'mle' will raise an error for unsupported # solvers rng = np.random.RandomState(0) n_samples, n_features = 600, 10 X = rng.randn(n_samples, n_features) pca = PCA(n_components='mle', svd_solver=svd_solver) err_msg = ("n_components='mle' cannot be a string with svd_solver='{}'". format(svd_solver)) with pytest.raises(ValueError, match=err_msg): pca.fit(X)
def test_infer_dim_2(): # TODO: explain what this is testing # Or at least use explicit variable names... n, p = 1000, 5 rng = np.random.RandomState(0) X = rng.randn(n, p) * .1 X[:10] += np.array([3, 4, 5, 1, 2]) X[10:20] += np.array([6, 0, 7, 2, -1]) pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ assert _infer_dimension_(spect, n, p) > 1
def test_infer_dim_1(): # TODO: explain what this is testing # Or at least use explicit variable names... n, p = 1000, 5 rng = np.random.RandomState(0) X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) + np.array([1, 0, 7, 4, 6])) pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ ll = np.array([_assess_dimension_(spect, k, n, p) for k in range(p)]) assert ll[1] > ll.max() - .01 * n
def test_pca_inverse(svd_solver, whiten): # Test that the projection of data can be inverted rng = np.random.RandomState(0) n, p = 50, 3 X = rng.randn(n, p) # spherical data X[:, 1] *= .00001 # make middle component relatively small X += [5, 4, 3] # make a large mean # same check that we can find the original data from the transformed # signal (since the data is almost of rank n_components) pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X) Y = pca.transform(X) Y_inverse = pca.inverse_transform(Y) assert_allclose(X, Y_inverse, rtol=5e-6)
def test_pipeline_score_samples_pca_lof(): X = iris.data # Test that the score_samples method is implemented on a pipeline. # Test that the score_samples method on pipeline yields same results as # applying transform and score_samples steps separately. pca = PCA(svd_solver='full', n_components='mle', whiten=True) lof = LocalOutlierFactor(novelty=True) pipe = Pipeline([('pca', pca), ('lof', lof)]) pipe.fit(X) # Check the shapes assert pipe.score_samples(X).shape == (X.shape[0], ) # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
def test_pca_singular_values(svd_solver): rng = np.random.RandomState(0) n_samples, n_features = 100, 80 X = rng.randn(n_samples, n_features) pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng) X_trans = pca.fit_transform(X) # compare to the Frobenius norm assert_allclose(np.sum(pca.singular_values_**2), np.linalg.norm(X_trans, "fro")**2) # Compare to the 2-norms of the score vectors assert_allclose(pca.singular_values_, np.sqrt(np.sum(X_trans**2, axis=0))) # set the singular values and see what er get back n_samples, n_features = 100, 110 X = rng.randn(n_samples, n_features) pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng) X_trans = pca.fit_transform(X) X_trans /= np.sqrt(np.sum(X_trans**2, axis=0)) X_trans[:, 0] *= 3.142 X_trans[:, 1] *= 2.718 X_hat = np.dot(X_trans, pca.components_) pca.fit(X_hat) assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])
def test_pca_score3(): # Check that probabilistic PCA selects the right model n, p = 200, 3 rng = np.random.RandomState(0) Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7])) Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) + np.array([1, 0, 7])) ll = np.zeros(p) for k in range(p): pca = PCA(n_components=k, svd_solver='full') pca.fit(Xl) ll[k] = pca.score(Xt) assert ll.argmax() == 1
def test_pca_svd_solver_auto(data, n_components, expected_solver): pca_auto = PCA(n_components=n_components, random_state=0) pca_test = PCA(n_components=n_components, svd_solver=expected_solver, random_state=0) pca_auto.fit(data) pca_test.fit(data) assert_allclose(pca_auto.components_, pca_test.components_)
def test_pca_vs_spca(): rng = np.random.RandomState(0) Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng) Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2) pca = PCA(n_components=2) pca.fit(Y) spca.fit(Y) results_test_pca = pca.transform(Z) results_test_spca = spca.transform(Z) assert_allclose(np.abs(spca.components_.dot(pca.components_.T)), np.eye(2), atol=1e-5) results_test_pca *= np.sign(results_test_pca[0, :]) results_test_spca *= np.sign(results_test_spca[0, :]) assert_allclose(results_test_pca, results_test_spca)
def test_make_union(): pca = PCA(svd_solver='full') mock = Transf() fu = make_union(pca, mock) names, transformers = zip(*fu.transformer_list) assert names == ("pca", "transf") assert transformers == (pca, mock)
def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). X = iris.data y = iris.target n_samples = X.shape[0] n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) pipe.fit(X, y) # check shapes of various prediction functions predict = pipe.predict(X) assert predict.shape == (n_samples, ) proba = pipe.predict_proba(X) assert proba.shape == (n_samples, n_classes) log_proba = pipe.predict_log_proba(X) assert log_proba.shape == (n_samples, n_classes) decision_function = pipe.decision_function(X) assert decision_function.shape == (n_samples, n_classes) pipe.score(X, y)
def test_pipeline_transform(): # Test whether pipeline works with a transformer at the end. # Also test pipeline.transform and pipeline.inverse_transform X = iris.data pca = PCA(n_components=2, svd_solver='full') pipeline = Pipeline([('pca', pca)]) # test transform and fit_transform: X_trans = pipeline.fit(X).transform(X) X_trans2 = pipeline.fit_transform(X) X_trans3 = pca.fit_transform(X) assert_array_almost_equal(X_trans, X_trans2) assert_array_almost_equal(X_trans, X_trans3) X_back = pipeline.inverse_transform(X_trans) X_back2 = pca.inverse_transform(X_trans) assert_array_almost_equal(X_back, X_back2)
def test_incremental_pca_against_pca_iris(): # Test that IncrementalPCA and PCA are approximate (to a sign flip). X = iris.data Y_pca = PCA(n_components=2).fit_transform(X) Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X) assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_fit_predict_on_pipeline_without_fit_predict(): # tests that a pipeline does not have fit_predict method when final # step of pipeline does not have fit_predict defined scaler = StandardScaler() pca = PCA(svd_solver='full') pipe = Pipeline([('scaler', scaler), ('pca', pca)]) assert_raises_regex(AttributeError, "'PCA' object has no attribute 'fit_predict'", getattr, pipe, 'fit_predict')
def test_pca_dim(): # Check automated dimensionality setting rng = np.random.RandomState(0) n, p = 100, 5 X = rng.randn(n, p) * .1 X[:10] += np.array([3, 4, 5, 1, 2]) pca = PCA(n_components='mle', svd_solver='full').fit(X) assert pca.n_components == 'mle' assert pca.n_components_ == 1
def test_truncated_svd_eq_pca(X_sparse): # TruncatedSVD should be equal to PCA on centered data X_dense = X_sparse.toarray() X_c = X_dense - X_dense.mean(axis=0) params = dict(n_components=10, random_state=42) svd = TruncatedSVD(algorithm='arpack', **params) pca = PCA(svd_solver='arpack', **params) Xt_svd = svd.fit_transform(X_c) Xt_pca = pca.fit_transform(X_c) assert_allclose(Xt_svd, Xt_pca, rtol=1e-9) assert_allclose(pca.mean_, 0, atol=1e-9) assert_allclose(svd.components_, pca.components_)
def test_whitening(): # Test that PCA and IncrementalPCA transforms match to sign flip. X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0., effective_rank=2, random_state=1999) prec = 3 n_samples, n_features = X.shape for nc in [None, 9]: pca = PCA(whiten=True, n_components=nc).fit(X) ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250).fit(X) Xt_pca = pca.transform(X) Xt_ipca = ipca.transform(X) assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec) Xinv_ipca = ipca.inverse_transform(Xt_ipca) Xinv_pca = pca.inverse_transform(Xt_pca) assert_almost_equal(X, Xinv_ipca, decimal=prec) assert_almost_equal(X, Xinv_pca, decimal=prec) assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
def test_incremental_pca_against_pca_random_data(): # Test that IncrementalPCA and PCA are approximate (to a sign flip). rng = np.random.RandomState(1999) n_samples = 100 n_features = 3 X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features) Y_pca = PCA(n_components=3).fit_transform(X) Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X) assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_kernel_pca_linear_kernel(): rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) X_pred = rng.random_sample((2, 4)) # for a linear kernel, kernel PCA should find the same projection as PCA # modulo the sign (direction) # fit only the first four components: fifth is near zero eigenvalue, so # can be trimmed due to roundoff error assert_array_almost_equal( np.abs(KernelPCA(4).fit(X_fit).transform(X_pred)), np.abs(PCA(4).fit(X_fit).transform(X_pred)))