def test_solvers(X_sparse, solver, kind):
    # Compare the requested solver against an ARPACK-based reference on the
    # same input, given either as the sparse matrix or as its dense array.
    data = X_sparse.toarray() if kind != 'sparse' else X_sparse

    reference = TruncatedSVD(30, algorithm="arpack")
    candidate = TruncatedSVD(30, algorithm=solver, random_state=42)

    # Only the six leading columns of the embeddings are compared.
    ref_embedding = reference.fit_transform(data)[:, :6]
    cand_embedding = candidate.fit_transform(data)[:, :6]
    assert_allclose(ref_embedding, cand_embedding, rtol=2e-3)

    # Components are compared by magnitude (presumably because the sign of a
    # singular vector is arbitrary -- confirm against the estimator docs).
    ref_components = np.abs(reference.components_)
    cand_components = np.abs(candidate.components_)

    # All elements are equal, but some elements are more equal than others:
    # the first nine rows get a relative tolerance, the rest an absolute one.
    assert_allclose(ref_components[:9], cand_components[:9], rtol=1e-3)
    assert_allclose(ref_components[9:], cand_components[9:], atol=1e-2)
def test_inverse_transform(algo, X_sparse):
    # Round-trip check: fit_transform followed by inverse_transform should
    # approximately give back the dense form of the input.
    #
    # We need a lot of components for the reconstruction to be "almost
    # equal" in all positions. XXX Test means or sums instead?
    model = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
    reduced = model.fit_transform(X_sparse)
    reconstructed = model.inverse_transform(reduced)
    assert_allclose(
        reconstructed,
        X_sparse.toarray(),
        rtol=1e-1,
        atol=2e-1,
    )
def test_sparse_formats(fmt, X_sparse):
    # Check output shapes of fit_transform and transform when the input is
    # supplied in the container named by ``fmt``: "dense" means a dense
    # array, anything else is resolved to the ``X_sparse.to<fmt>()`` method.
    n_components = 11
    expected_shape = (X_sparse.shape[0], n_components)

    if fmt == "dense":
        data = X_sparse.toarray()
    else:
        convert = getattr(X_sparse, "to" + fmt)
        data = convert()

    model = TruncatedSVD(n_components=n_components)

    fitted_output = model.fit_transform(data)
    assert fitted_output.shape == expected_shape

    # Transforming again with the already fitted model keeps the same shape.
    transformed_output = model.transform(data)
    assert transformed_output.shape == expected_shape
def test_feature_union():
    # basic sanity check for feature union
    #
    # Centre the data into a fresh array. Using ``X = iris.data`` followed by
    # an in-place ``X -= ...`` would modify the shared module-level ``iris``
    # dataset and leak the centring into every other test that reads it.
    X = iris.data - iris.data.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # two SVD components plus one selected feature
    assert X_transformed.shape == (X.shape[0], 3)

    # check if it does the expected thing: the leading columns come from the
    # SVD and the last column from the feature selector
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # The same svd and select instances are reused in a new FeatureUnion;
    # svd was created with the integer random_state=0 above.
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone: cloning must not warn and must yield distinct transformers
    fs2 = assert_no_warnings(clone, fs)
    assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1]

    # test setting parameters through the nested ``<name>__<param>`` syntax
    fs.set_params(select__k=2)
    assert fs.fit_transform(X, y).shape == (X.shape[0], 4)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert X_transformed.shape == (X.shape[0], 8)

    # test error if some elements do not support transform
    assert_raises_regex(
        TypeError,
        'All estimators should implement fit and '
        'transform.*\\bNoTrans\\b',
        FeatureUnion,
        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
def test_truncated_svd_eq_pca(X_sparse):
    # TruncatedSVD should be equal to PCA on centered data
    dense = X_sparse.toarray()
    centered = dense - dense.mean(axis=0)

    # Both estimators share the same size and seed and use the ARPACK solver.
    shared_kwargs = {"n_components": 10, "random_state": 42}
    svd = TruncatedSVD(algorithm='arpack', **shared_kwargs)
    pca = PCA(svd_solver='arpack', **shared_kwargs)

    svd_embedding = svd.fit_transform(centered)
    pca_embedding = pca.fit_transform(centered)

    assert_allclose(svd_embedding, pca_embedding, rtol=1e-9)
    # The input was centered beforehand, so PCA's estimated mean must be ~0.
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_)
def test_singular_values_expected(solver):
    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 110
    target_singular_values = [3.142, 2.718, 1.0]

    # The data is drawn before the estimator gets to use the same ``rng``.
    X = rng.randn(n_samples, n_features)

    pca = TruncatedSVD(n_components=3, algorithm=solver, random_state=rng)
    X_pca = pca.fit_transform(X)

    # Scale every projected column to unit Euclidean norm ...
    column_norms = np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca /= column_norms
    # ... then stretch the first two columns to the chosen singular values;
    # the third column keeps norm 1.0.
    for column, value in enumerate(target_singular_values[:2]):
        X_pca[:, column] *= value

    # Map back to feature space and refit on the rebuilt matrix.
    X_hat_pca = np.dot(X_pca, pca.components_)
    pca.fit(X_hat_pca)
    assert_allclose(pca.singular_values_, target_singular_values,
                    rtol=1e-14)
def test_explained_variance(X_sparse, kind, n_components, solver):
    # Validate explained_variance_ratio_ for sparse or dense input.
    if kind == 'sparse':
        data = X_sparse
    else:
        data = X_sparse.toarray()

    svd = TruncatedSVD(n_components, algorithm=solver)
    embedding = svd.fit_transform(data)
    reported_ratio = svd.explained_variance_ratio_

    # Assert that all the values are greater than 0
    assert_array_less(0.0, reported_ratio)

    # Assert that total explained variance is less than 1
    assert_array_less(reported_ratio.sum(), 1.0)

    # Test that explained_variance is correct: per-component variance of the
    # embedding divided by the summed per-feature variance of the dense data.
    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
    expected_ratio = np.var(embedding, axis=0) / total_variance

    assert_allclose(
        svd.explained_variance_ratio_,
        expected_ratio,
    )
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two SVD dimensions
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform: a second, identically configured embedding
    # fitted and applied in two steps must give the same encoding
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    two_step_encoding = hasher.fit(X).transform(X)
    assert_array_equal(two_step_encoding.toarray(), X_transformed.toarray())

    # one leaf active per data point per forest
    n_samples = X.shape[0]
    assert X_transformed.shape[0] == n_samples
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)

    # Reduce the encoding to two dimensions and fit a linear classifier,
    # which is required to score perfectly on the data it was trained on.
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    accuracy = linear_clf.score(X_reduced, y)
    assert accuracy == 1.
def test_integers(X_sparse):
    # Fitting on data cast to int64 must still produce an output with one
    # row per sample and one column per requested component.
    int_data = X_sparse.astype(np.int64)
    model = TruncatedSVD(n_components=6)
    embedding = model.fit_transform(int_data)
    expected_shape = (X_sparse.shape[0], model.n_components)
    assert embedding.shape == expected_shape