def test_make_circles():
    factor = 0.3

    for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]:
        # Testing odd and even case, because in the past make_circles always
        # created an even number of samples.
        X, y = make_circles(n_samples, shuffle=False, noise=None,
                            factor=factor)
        assert X.shape == (n_samples, 2), "X shape mismatch"
        assert y.shape == (n_samples,), "y shape mismatch"
        center = [0.0, 0.0]
        for x, label in zip(X, y):
            dist_sqr = ((x - center) ** 2).sum()
            dist_exp = 1.0 if label == 0 else factor ** 2
            assert_almost_equal(dist_sqr, dist_exp,
                                err_msg="Point is not on expected circle")

        assert X[y == 0].shape == (n_outer, 2), (
            "Samples not correctly distributed across circles.")
        assert X[y == 1].shape == (n_inner, 2), (
            "Samples not correctly distributed across circles.")

    with pytest.raises(ValueError):
        make_circles(factor=-0.01)
    with pytest.raises(ValueError):
        make_circles(factor=1.)
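A minimal standalone sketch of the property this test encodes, assuming the scikit-learn-compatible make_circles API that mrex mirrors: label 0 marks points on the unit circle, label 1 points on the inner circle of radius `factor`.

import numpy as np
from mrex.datasets import make_circles

X, y = make_circles(n_samples=10, shuffle=False, noise=None, factor=0.3)
radii = np.sqrt((X ** 2).sum(axis=1))
print(radii[y == 0])  # all approximately 1.0: the outer circle
print(radii[y == 1])  # all approximately 0.3: the inner circle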
def test_random_trees_dense_type():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning a dense array.

    # Create the RTE with sparse=False
    hasher = RandomTreesEmbedding(n_estimators=10, sparse_output=False)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # Assert that type is ndarray, not scipy.sparse.csr.csr_matrix
    assert type(X_transformed) == np.ndarray
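For contrast, a hedged usage sketch of the default behavior (assuming the scikit-learn-compatible API): with sparse_output=True, which is the default, fit_transform returns a scipy sparse matrix instead.

import scipy.sparse
from mrex import datasets
from mrex.ensemble import RandomTreesEmbedding

X, y = datasets.make_circles(factor=0.5)
hasher = RandomTreesEmbedding(n_estimators=10)  # sparse_output=True by default
assert scipy.sparse.issparse(hasher.fit_transform(X))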
def test_gridsearch_pipeline():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)
    kpca = KernelPCA(kernel="rbf", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(kernel_pca__gamma=2. ** np.arange(-2, 2))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    grid_search.fit(X, y)
    assert grid_search.best_score_ == 1
def test_single_linkage_clustering():
    # Check that we get the correct result in two emblematic cases
    moons, moon_labels = make_moons(noise=0.05, random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(moons)
    assert_almost_equal(
        normalized_mutual_info_score(clustering.labels_, moon_labels), 1)

    circles, circle_labels = make_circles(factor=0.5, noise=0.025,
                                          random_state=42)
    clustering = AgglomerativeClustering(n_clusters=2, linkage='single')
    clustering.fit(circles)
    assert_almost_equal(
        normalized_mutual_info_score(clustering.labels_, circle_labels), 1)
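Single linkage succeeds here because it chains along each ring. For contrast, a hedged standalone sketch (no asserted value, since the exact score depends on the draw): ward linkage cuts the plane spatially, mixing the two nested circles, so its score is typically well below 1.

from mrex.cluster import AgglomerativeClustering
from mrex.datasets import make_circles
from mrex.metrics import normalized_mutual_info_score

circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42)
ward = AgglomerativeClustering(n_clusters=2, linkage='ward').fit(circles)
print(normalized_mutual_info_score(ward.labels_, circle_labels))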
def test_random_trees_dense_equal():
    # Test that the `sparse_output` parameter of RandomTreesEmbedding
    # works by returning the same array for both argument values.

    # Create the RTEs
    hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False,
                                        random_state=0)
    hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True,
                                         random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed_dense = hasher_dense.fit_transform(X)
    X_transformed_sparse = hasher_sparse.fit_transform(X)

    # Assert that dense and sparse hashers have same array.
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed_dense)
def test_random_hasher():
    # Test random forest hashing on the circles dataset: make sure the
    # embedding is linearly separable, even after being projected down to
    # two SVD dimensions.
    # Note: not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one leaf active per data point per forest
    assert X_transformed.shape[0] == X.shape[0]
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)

    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)

    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert linear_clf.score(X_reduced, y) == 1.
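Why each row of X_transformed sums to n_estimators: every sample falls in exactly one leaf per tree, and the embedding one-hot encodes that leaf. A hedged sketch of the same fact, assuming the forest exposes the scikit-learn-style apply() method:

from mrex import datasets
from mrex.ensemble import RandomTreesEmbedding

X, y = datasets.make_circles(factor=0.5)
hasher = RandomTreesEmbedding(n_estimators=30, random_state=1).fit(X)
leaves = hasher.apply(X)  # one leaf index per (sample, tree)
assert leaves.shape == (X.shape[0], 30)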
def test_nested_circles():
    # Test the linear separability of the first 2D KPCA transform
    X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)

    # 2D nested circles are not linearly separable
    train_score = Perceptron(max_iter=5).fit(X, y).score(X, y)
    assert train_score < 0.8

    # Project the circles data into the first 2 components of a RBF Kernel
    # PCA model.
    # Note that the gamma value is data dependent. If this test breaks
    # and the gamma value has to be updated, the Kernel PCA example will
    # have to be updated too.
    kpca = KernelPCA(kernel="rbf", n_components=2,
                     fit_inverse_transform=True, gamma=2.)
    X_kpca = kpca.fit_transform(X)

    # The data is perfectly linearly separable in that space
    train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y)
    assert train_score == 1.0
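A one-feature intuition for why the RBF KPCA projection works here, as a minimal sketch: the two circles differ only in radius, so the squared norm of each point is by itself a linearly separable 1-D feature, and the kernel map captures roughly this.

import numpy as np
from mrex.datasets import make_circles
from mrex.linear_model import Perceptron

X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)
r2 = (X ** 2).sum(axis=1).reshape(-1, 1)  # squared radius as a 1-D feature
print(Perceptron(max_iter=5).fit(r2, y).score(r2, y))  # expected close to 1.0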
""" print(__doc__) # Authors: Mathieu Blondel # Andreas Mueller # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from mrex.decomposition import PCA, KernelPCA from mrex.datasets import make_circles np.random.seed(0) X, y = make_circles(n_samples=400, factor=.3, noise=.05) kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) X_kpca = kpca.fit_transform(X) X_back = kpca.inverse_transform(X_kpca) pca = PCA() X_pca = pca.fit_transform(X) # Plot results plt.figure() plt.subplot(2, 2, 1, aspect='equal') plt.title("Original space") reds = y == 0 blues = y == 1
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt

from matplotlib.ticker import NullFormatter

from mrex import manifold, datasets
from time import time

n_samples = 300
n_components = 2
(fig, subplots) = plt.subplots(3, 5, figsize=(15, 8))
perplexities = [5, 30, 50, 100]

X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)

red = y == 0
green = y == 1

ax = subplots[0][0]
ax.scatter(X[red, 0], X[red, 1], c="r")
ax.scatter(X[green, 0], X[green, 1], c="g")
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
plt.axis('tight')

for i, perplexity in enumerate(perplexities):
    ax = subplots[0][i + 1]

    t0 = time()
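The loop above is truncated before the actual embedding step. For orientation, a minimal standalone sketch of a single fit, assuming the scikit-learn-compatible manifold.TSNE API:

from mrex import datasets, manifold

X, y = datasets.make_circles(n_samples=300, factor=.5, noise=.05)
tsne = manifold.TSNE(n_components=2, init='random', random_state=0,
                     perplexity=30)
Y = tsne.fit_transform(X)  # 2-D embedding of the circles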
        # some parameter combinations will not converge as can be seen on the
        # plots so they are ignored here
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning,
                                    module="mrex")
            mlp.fit(X, y)

        mlps.append(mlp)
        print("Training set score: %f" % mlp.score(X, y))
        print("Training set loss: %f" % mlp.loss_)
    for mlp, label, args in zip(mlps, labels, plot_args):
        ax.plot(mlp.loss_curve_, label=label, **args)


fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# load / generate some toy datasets
iris = datasets.load_iris()
X_digits, y_digits = datasets.load_digits(return_X_y=True)
data_sets = [(iris.data, iris.target),
             (X_digits, y_digits),
             datasets.make_circles(noise=0.2, factor=0.5, random_state=1),
             datasets.make_moons(noise=0.3, random_state=0)]

for ax, data, name in zip(axes.ravel(), data_sets,
                          ['iris', 'digits', 'circles', 'moons']):
    plot_on_dataset(*data, ax=ax, name=name)

fig.legend(ax.get_lines(), labels, ncol=3, loc="upper center")
plt.show()
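This excerpt starts mid-loop, so for orientation here is a minimal standalone sketch of the pattern it relies on (assuming the scikit-learn-compatible MLPClassifier): after fit, loss_curve_ holds one loss value per completed training iteration, which is what the plot loop above draws.

from mrex.datasets import make_circles
from mrex.neural_network import MLPClassifier

X, y = make_circles(noise=0.2, factor=0.5, random_state=1)
# may emit a ConvergenceWarning at this small max_iter; that is expected
mlp = MLPClassifier(solver='sgd', max_iter=50, random_state=0).fit(X, y)
print(len(mlp.loss_curve_))  # one entry per completed training iteration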
     'linearsvc__C': np.logspace(-2, 7, 10),
     }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
]

names = [get_name(e) for e, g in classifiers]

n_samples = 100
datasets = [
    make_moons(n_samples=n_samples, noise=0.2, random_state=0),
    make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
    make_classification(n_samples=n_samples, n_features=2, n_redundant=0,
                        n_informative=2, random_state=2,
                        n_clusters_per_class=1)
]

fig, axes = plt.subplots(nrows=len(datasets), ncols=len(classifiers) + 1,
                         figsize=(21, 9))

cm = plt.cm.PiYG
cm_bright = ListedColormap(['#b30065', '#178000'])

# iterate over datasets
for ds_cnt, (X, y) in enumerate(datasets):
    print('\ndataset %d\n---------' % ds_cnt)
propagate correctly around the circle.
"""
print(__doc__)

# Authors: Clay Woolam <*****@*****.**>
#          Andreas Mueller <*****@*****.**>
# License: BSD

import numpy as np
import matplotlib.pyplot as plt
from mrex.semi_supervised import label_propagation
from mrex.datasets import make_circles

# generate a ring with an inner circle
n_samples = 200
X, y = make_circles(n_samples=n_samples, shuffle=False)
outer, inner = 0, 1
labels = np.full(n_samples, -1.)
labels[0] = outer
labels[-1] = inner

# #############################################################################
# Learn with LabelSpreading
label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.8)
label_spread.fit(X, labels)

# #############################################################################
# Plot output labels
output_labels = label_spread.transduction_
plt.figure(figsize=(8.5, 4))
plt.subplot(1, 2, 1)
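A short follow-up sketch, hypothetical and not part of the original example: with only the two seed labels, the inferred transduction can be scored against the generating labels of make_circles; if the labels propagate around each ring as intended, the accuracy should be close to 1.0.

import numpy as np
from mrex.datasets import make_circles
from mrex.semi_supervised import label_propagation

X, y = make_circles(n_samples=200, shuffle=False)
labels = np.full(200, -1.)          # -1 marks unlabeled points
labels[0], labels[-1] = 0, 1        # one seed label per circle
model = label_propagation.LabelSpreading(kernel='knn', alpha=0.8).fit(X, labels)
print("transduction accuracy: %.2f" % np.mean(model.transduction_ == y))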
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable
]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
excellent accuracy. For sparse binary data, BernoulliNB is particularly
well-suited. The bottom row compares the decision boundary obtained by
BernoulliNB in the transformed space with an ExtraTreesClassifier forest
learned on the original data.
"""
import numpy as np
import matplotlib.pyplot as plt

from mrex.datasets import make_circles
from mrex.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from mrex.decomposition import TruncatedSVD
from mrex.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)