def test_make_circles(): factor = 0.3 for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]: # Testing odd and even case, because in the past make_circles always # created an even number of samples. X, y = make_circles(n_samples, shuffle=False, noise=None, factor=factor) assert X.shape == (n_samples, 2), "X shape mismatch" assert y.shape == (n_samples, ), "y shape mismatch" center = [0.0, 0.0] for x, label in zip(X, y): dist_sqr = ((x - center)**2).sum() dist_exp = 1.0 if label == 0 else factor**2 assert_almost_equal(dist_sqr, dist_exp, err_msg="Point is not on expected circle") assert X[y == 0].shape == (n_outer, 2), ( "Samples not correctly distributed across circles.") assert X[y == 1].shape == (n_inner, 2), ( "Samples not correctly distributed across circles.") with pytest.raises(ValueError): make_circles(factor=-0.01) with pytest.raises(ValueError): make_circles(factor=1.)
def test_gridsearch_pipeline(): # Test if we can do a grid-search to find parameters to separate # circles with a perceptron model. X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) kpca = KernelPCA(kernel="rbf", n_components=2) pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))]) param_grid = dict(kernel_pca__gamma=2.**np.arange(-2, 2)) grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid) grid_search.fit(X, y) assert grid_search.best_score_ == 1
def test_random_trees_dense_type(): # Test that the `sparse_output` parameter of RandomTreesEmbedding # works by returning a dense array. # Create the RTE with sparse=False hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False) X, y = datasets.make_circles(factor=0.5) X_transformed = hasher.fit_transform(X) # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix assert type(X_transformed) == np.ndarray
def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(moons) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, moon_labels), 1) circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(circles) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, circle_labels), 1)
def test_random_trees_dense_equal(): # Test that the `sparse_output` parameter of RandomTreesEmbedding # works by returning the same array for both argument values. # Create the RTEs hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, random_state=0) hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, random_state=0) X, y = datasets.make_circles(factor=0.5) X_transformed_dense = hasher_dense.fit_transform(X) X_transformed_sparse = hasher_sparse.fit_transform(X) # Assert that dense and sparse hashers have same array. assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
def test_nested_circles(): # Test the linear separability of the first 2D KPCA transform X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) # 2D nested circles are not linearly separable train_score = Perceptron(max_iter=5).fit(X, y).score(X, y) assert train_score < 0.8 # Project the circles data into the first 2 components of a RBF Kernel # PCA model. # Note that the gamma value is data dependent. If this test breaks # and the gamma value has to be updated, the Kernel PCA example will # have to be updated too. kpca = KernelPCA(kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.) X_kpca = kpca.fit_transform(X) # The data is perfectly linearly separable in that space train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y) assert train_score == 1.0
def test_random_hasher(): # test random forest hashing on circles dataset # make sure that it is linearly separable. # even after projected to two SVD dimensions # Note: Not all random_states produce perfect results. hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) X, y = datasets.make_circles(factor=0.5) X_transformed = hasher.fit_transform(X) # test fit and transform: hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) assert_array_equal( hasher.fit(X).transform(X).toarray(), X_transformed.toarray()) # one leaf active per data point per forest assert X_transformed.shape[0] == X.shape[0] assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators) svd = TruncatedSVD(n_components=2) X_reduced = svd.fit_transform(X_transformed) linear_clf = LinearSVC() linear_clf.fit(X_reduced, y) assert linear_clf.score(X_reduced, y) == 1.