def test_ovr_partial_fit():
    """Check OneVsRestClassifier.partial_fit against a single full fit."""
    # Two incremental MultinomialNB batches must reproduce the batch model.
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    incremental_pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(X, y).predict(X)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == incremental_pred), 0.65)

    # Same comparison with SGDClassifier, feeding mini-batches that do not
    # each contain every class.
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    incremental = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    incremental.partial_fit(X[:7], y[:7], np.unique(y))
    incremental.partial_fit(X[7:], y[7:])
    incremental_pred = incremental.predict(X)

    batch = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    batch_pred = batch.fit(X, y).predict(X)
    assert_equal(np.mean(incremental_pred == y), np.mean(batch_pred == y))

    # partial_fit must only be exposed when the wrapped estimator has it.
    svc_wrapper = OneVsRestClassifier(SVC())
    assert_false(hasattr(svc_wrapper, "partial_fit"))
def test_k_means_function():
    """Call the functional ``k_means`` API directly and check its outputs."""
    # verbose=True prints to stdout; swap in a throwaway buffer and always
    # restore the real stream afterwards.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                           verbose=True)
    finally:
        sys.stdout = saved_stdout

    assert_equal(centers.shape, (n_clusters, n_features))
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # the label assignment must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # passing explicit centers as init must emit a RuntimeWarning
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # asking for more clusters than samples is an error
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def check_clustering(name, Alg):
    """Fit clusterer ``Alg`` on scaled blobs and sanity-check its labels.

    Parameters
    ----------
    name : str
        Estimator name; selects the AffinityPropagation and
        SpectralClustering special cases below.
    Alg : class
        Clustering estimator class, instantiated without arguments.
    """
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit on the array, then again on plain nested lists
    alg.fit(X)
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)

    # fit another time with ``fit_predict`` and compare results
    # Compare by value: the previous ``name is '...'`` tested object identity,
    # which only holds when the interpreter happens to share the string.
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def test_euclidean_distances():
    """Check the pairwise Euclidean distances computation."""
    expected = [[1., 2.]]
    X = [[0]]
    Y = [[1], [2]]
    assert_array_almost_equal(euclidean_distances(X, Y), expected)

    # the same tiny problem with sparse inputs
    X = csr_matrix(X)
    Y = csr_matrix(Y)
    assert_array_almost_equal(euclidean_distances(X, Y), expected)

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # precomputed squared norms, in any combination, must not change the
    # result
    reference = euclidean_distances(X, Y)
    with_x = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    with_y = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    with_both = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                                    Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(with_x, reference)
    assert_array_almost_equal(with_y, reference)
    assert_array_almost_equal(with_both, reference)

    # wrong squared norms must give a wrong answer
    # NOTE(review): the halved norms are only used for their shape; all-zero
    # norms are what is actually passed in.
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    distorted = euclidean_distances(
        X, Y,
        X_norm_squared=np.zeros_like(X_norm_sq),
        Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(distorted - reference)), .01)
def check_regressors_train(name, Regressor):
    """Fit ``Regressor`` on the Boston subset and check its training score."""
    X, y = _boston_subset()
    # X is already scaled; only the target needs standardizing
    y = StandardScaler().fit_transform(y)
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    set_fast_parameters(regressor)
    # linear regressors need to set alpha, but not generalized CV ones
    if hasattr(regressor, 'alpha') and not hasattr(regressor, 'alphas'):
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # malformed input (X and y of different length) must be rejected
    assert_raises(ValueError, regressor.fit, X, y[:-1])

    if name in CROSS_DECOMPOSITION:
        # these estimators are trained on a two-column target
        stacked = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = stacked.T
    else:
        y_ = y

    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        return
    print(regressor)
    assert_greater(regressor.score(X, y_), 0.5)
def check_min_samples_leaf(name):
    """Check that forest leaves respect ``min_samples_leaf``."""
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    failure_msg = "Failed with {0}".format(name)

    # boundary values: negative and zero are invalid
    for invalid in (-1, 0):
        assert_raises(ValueError,
                      ForestEstimator(min_samples_leaf=invalid).fit, X, y)

    # absolute leaf size
    forest = ForestEstimator(min_samples_leaf=5, n_estimators=1,
                             random_state=0)
    forest.fit(X, y)
    leaf_ids = forest.estimators_[0].tree_.apply(X)
    counts = np.bincount(leaf_ids)
    # drop inner nodes
    leaf_sizes = counts[counts != 0]
    assert_greater(np.min(leaf_sizes), 4, failure_msg)

    # leaf size given as a fraction of the samples
    forest = ForestEstimator(min_samples_leaf=0.25, n_estimators=1,
                             random_state=0)
    forest.fit(X, y)
    leaf_ids = forest.estimators_[0].tree_.apply(X)
    counts = np.bincount(leaf_ids)
    # drop inner nodes
    leaf_sizes = counts[counts != 0]
    assert_greater(np.min(leaf_sizes), len(X) * 0.25 - 1, failure_msg)
def check_class_weight_classifiers(name, Classifier):
    """Check that a heavy weight on class 0 dominates the predictions."""
    # NuSVC: the sparse version has a parameter that doesn't do anything.
    # *NB: NaiveBayes classifiers have a somewhat different interface.
    # FIXME SOON!
    if name == "NuSVC" or name.endswith("NB"):
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.5, random_state=0)
        n_centers = len(np.unique(y_train))

        class_weight = {0: 1000, 1: 0.0001}
        if n_centers != 2:
            class_weight[2] = 0.0001

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
        assert_greater(np.mean(predicted == 0), 0.89)
def test_oneclass_decision_function():
    """Test that OneClassSVM's decision function agrees with ``predict``.

    The model is trained on two tight clusters around (2, 2) and (-2, -2),
    then evaluated on fresh points from the same clusters and on uniformly
    drawn outliers.
    """
    # A default-constructed OneClassSVM used to be built here and thrown
    # away; only the configured model below is ever used.
    rnd = check_random_state(2)

    # Generate train data
    X = 0.3 * rnd.randn(100, 2)
    X_train = np.r_[X + 2, X - 2]

    # Generate some regular novel observations
    X = 0.3 * rnd.randn(20, 2)
    X_test = np.r_[X + 2, X - 2]
    # Generate some abnormal novel observations
    X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2))

    # fit the model
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    # most regular points are inliers (+1), most outliers are flagged (-1)
    y_pred_test = clf.predict(X_test)
    assert_greater(np.mean(y_pred_test == 1), .9)
    y_pred_outliers = clf.predict(X_outliers)
    assert_greater(np.mean(y_pred_outliers == -1), .9)

    # the sign of the decision function must match the predicted label
    dec_func_test = clf.decision_function(X_test)
    assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1)
    dec_func_outliers = clf.decision_function(X_outliers)
    assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1)
def check_min_samples_split(name):
    """Check that split nodes of a forest respect ``min_samples_split``.

    Parameters
    ----------
    name : str
        Key into ``FOREST_ESTIMATORS`` selecting the estimator under test.
    """
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=-1).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=0).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=1.1).fit, X, y)

    # Absolute threshold: every node that was split (has a left child) must
    # hold at least ``min_split`` samples. The bound used to be the
    # fractional one, ``len(X) * 0.5 - 1``, copied from the case below.
    min_split = 10
    est = ForestEstimator(min_samples_split=min_split, n_estimators=1,
                          random_state=0)
    est.fit(X, y)
    node_idx = est.estimators_[0].tree_.children_left != -1
    node_samples = est.estimators_[0].tree_.n_node_samples[node_idx]

    assert_greater(np.min(node_samples), min_split - 1,
                   "Failed with {0}".format(name))

    # Fractional threshold: the bound scales with the number of samples.
    split_fraction = 0.5
    est = ForestEstimator(min_samples_split=split_fraction, n_estimators=1,
                          random_state=0)
    est.fit(X, y)
    node_idx = est.estimators_[0].tree_.children_left != -1
    node_samples = est.estimators_[0].tree_.n_node_samples[node_idx]

    assert_greater(np.min(node_samples), len(X) * split_fraction - 1,
                   "Failed with {0}".format(name))
def test_randomized_svd_low_rank_with_noise():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples, n_features = 100, 500
    rank, k = 5, 10

    # generate a matrix X with structure of approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference singular values from the slow exact method
    _, exact, _ = linalg.svd(X, full_matrices=False)
    top_exact = exact[:k]

    # fast approximate method without the iterated power method:
    # the approximation does not tolerate the noise
    _, approx, _ = randomized_svd(X, k, n_iter=0)
    assert_greater(np.abs(top_exact - approx).max(), 0.05)

    # fast approximate method with the iterated power method:
    # the power iterations help getting rid of the noise
    _, approx_power, _ = randomized_svd(X, k, n_iter=5)
    assert_almost_equal(top_exact, approx_power, decimal=3)
def test_randomized_svd_infinite_rank():
    """Check extmath.randomized_svd on slowly decreasing singular values"""
    n_samples, n_features = 100, 500
    rank, k = 5, 10

    # no 'low_rank component' here: just regularly but slowly decreasing
    # singular values, so the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # reference singular values from the slow exact method
    _, exact, _ = linalg.svd(X, full_matrices=False)
    top_exact = exact[:k]

    # fast approximate method without the iterated power method:
    # the approximation does not tolerate the noise
    _, approx, _ = randomized_svd(X, k, n_iter=0)
    assert_greater(np.abs(top_exact - approx).max(), 0.1)

    # with the iterated power method most of the structure at the requested
    # rank is still recovered
    _, approx_power, _ = randomized_svd(X, k, n_iter=5)
    assert_almost_equal(top_exact, approx_power, decimal=3)
def test_predict_iris():
    """Test logistic regression with the iris dataset."""
    n_samples, _ = iris.data.shape
    target = iris.target_names[iris.target]

    # Both multinomial and OvR solvers must handle multiclass data
    # correctly and reach a good accuracy score (>0.95) on the training data.
    candidates = [
        LogisticRegression(C=len(iris.data)),
        LogisticRegression(C=len(iris.data), solver='lbfgs',
                           multi_class='multinomial'),
        LogisticRegression(C=len(iris.data), solver='newton-cg',
                           multi_class='multinomial'),
        LogisticRegression(C=len(iris.data), solver='sag', tol=1e-2,
                           multi_class='ovr', random_state=42),
        LogisticRegression(C=len(iris.data), solver='saga', tol=1e-2,
                           multi_class='ovr', random_state=42),
    ]
    for clf in candidates:
        clf.fit(iris.data, target)
        assert_array_equal(np.unique(target), clf.classes_)

        hard_pred = clf.predict(iris.data)
        assert_greater(np.mean(hard_pred == target), .95)

        # probabilities sum to one per sample and their argmax is accurate
        proba = clf.predict_proba(iris.data)
        assert_array_almost_equal(proba.sum(axis=1), np.ones(n_samples))

        argmax_pred = iris.target_names[proba.argmax(axis=1)]
        assert_greater(np.mean(argmax_pred == target), .95)
def test_spectral_amg_mode():
    """Test the amg mode of SpectralClustering."""
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    distances = pairwise_distances(X)
    # turn distances into similarities, stored sparsely
    S = sparse.coo_matrix(np.max(distances) - distances)

    # the import is only a probe for whether pyamg is installed
    try:
        from pyamg import smoothed_aggregation_solver
        have_pyamg = True
    except ImportError:
        have_pyamg = False

    if not have_pyamg:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
    else:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
def test_ovo_ties():
    """Check that OvO ties are broken by the decision function.

    A tie must not simply default to the smallest label.
    """
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recompute the votes by hand to make sure there really is a tie
    binary_preds = np.vstack([est.predict(X)
                              for est in multi_clf.estimators_])
    binary_scores = np.vstack([est.decision_function(X)
                               for est in multi_clf.estimators_])

    # classifiers are in order 0-1, 0-2, 1-2; map each binary output back
    # to the class index it votes for
    rows = np.arange(4)
    votes = np.zeros((4, 3))
    votes[rows, binary_preds[0]] += 1
    votes[rows, 2 * binary_preds[1]] += 1
    votes[rows, 1 + binary_preds[2]] += 1

    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 1)

    # score for one is greater than score for zero
    assert_greater(binary_scores[2, 0] - binary_scores[0, 0],
                   binary_scores[0, 0] + binary_scores[1, 0])
    # score for one is greater than score for two
    assert_greater(binary_scores[2, 0] - binary_scores[0, 0],
                   -binary_scores[1, 0] - binary_scores[2, 0])
def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    """Check the effect of ``warm_start`` on a 1-iteration refit.

    A 1-iteration second fit on the same data should give almost the same
    coefficients with warm starting, and quite different ones without.
    Warm starting does not work with the liblinear solver.
    """
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start, solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        first_coef = clf.coef_

        # refit, allowing a single iteration only
        clf.max_iter = 1
        clf.fit(X, y)

    cum_diff = np.sum(np.abs(first_coef - clf.coef_))
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, str(fit_intercept), str(warm_start)))
    if not warm_start:
        assert_greater(cum_diff, 2.0, msg)
    else:
        assert_greater(2.0, cum_diff, msg)
def test_enet_path():
    """Check the alpha and l1_ratio selected by ElasticNetCV."""
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    with warnings.catch_warnings():
        # Here we have a small number of iterations, and thus the
        # ElasticNet might not converge. This is to speed up tests
        warnings.simplefilter("ignore", UserWarning)
        clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                           max_iter=max_iter)
        clf.fit(X, y)
        # Well-conditioned settings, we should have selected our
        # smallest penalty
        assert_almost_equal(clf.alpha_, min(clf.alphas_))
        # Non-sparse ground truth: we should have selected an elastic-net
        # that is closer to ridge than to lasso
        assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

        # same configuration, with a precomputed Gram matrix
        clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                           max_iter=max_iter, precompute=True)
        clf.fit(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
def test_rfe_estimator_tags():
    """Check that RFE around a linear SVC reports itself as a classifier."""
    selector = RFE(SVC(kernel='linear'))
    assert_equal(selector._estimator_type, "classifier")

    # make sure that cross-validation is stratified
    iris = load_iris()
    fold_scores = cross_val_score(selector, iris.data, iris.target)
    assert_greater(fold_scores.min(), .7)
def check_regressors_train(name, Regressor, X, y):
    """Fit ``Regressor`` on ``(X, y)`` and check its training score."""
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    # linear regressors need to set alpha, but not generalized CV ones
    if hasattr(regressor, 'alpha') and not hasattr(regressor, 'alphas'):
        regressor.alpha = 0.01

    # malformed input (X and y of different length) must be rejected
    assert_raises(ValueError, regressor.fit, X, y[:-1])

    if name in ('PLSCanonical', 'PLSRegression', 'CCA'):
        # these estimators are trained on a two-column target
        stacked = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = stacked.T
    else:
        y_ = y

    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        return
    assert_greater(regressor.score(X, y_), 0.5)
def test_k_means_function():
    """Call the functional ``k_means`` API directly and check its outputs."""
    # verbose=True prints to stdout; swap in a throwaway buffer and always
    # restore the real stream afterwards.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                           sample_weight=None, verbose=True)
    finally:
        sys.stdout = saved_stdout

    assert_equal(centers.shape, (n_clusters, n_features))
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # the label assignment must be perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # passing explicit centers as init must emit a RuntimeWarning
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # asking for more clusters than samples is an error
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
def test_lasso_cv():
    """Check LassoCV's selected alpha, with and without precompute."""
    X, y, X_test, y_test = build_dataset()
    max_iter = 150

    clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.056, 2)

    clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter, precompute=True)
    clf.fit(X, y)
    assert_almost_equal(clf.alpha_, 0.056, 2)

    # Check that the lars and the coordinate descent implementation
    # select a similar alpha
    lars = LassoLarsCV(normalize=False, max_iter=30).fit(X, y)
    # the two selected alphas must not fall further than one position
    # apart in the grid of clf.alphas_
    lars_slot = np.searchsorted(clf.alphas_[::-1], lars.alpha_)
    cd_slot = np.searchsorted(clf.alphas_[::-1], clf.alpha_)
    assert_true(np.abs(lars_slot - cd_slot) <= 1)

    # check that they also give a similar MSE
    mse_lars = interpolate.interp1d(lars.cv_alphas_, lars.cv_mse_path_.T)
    np.testing.assert_approx_equal(mse_lars(clf.alphas_[5]).mean(),
                                   clf.mse_path_[5].mean(), significant=2)

    # test set
    assert_greater(clf.score(X_test, y_test), 0.99)
def check_classifiers_classes(name, Classifier, X, y, y_names):
    """Check training-set predictions and ``classes_`` of a classifier.

    Parameters
    ----------
    name : str
        Estimator name, used to pick the target and in failure messages.
    Classifier : class
        Classifier class, instantiated without arguments.
    X : array-like
        Training data, also used for prediction.
    y : array-like
        Targets used for LabelPropagation / LabelSpreading.
    y_names : array-like
        Targets used for every other classifier.

    Raises
    ------
    Exception
        Whatever ``Classifier.fit`` raises now propagates. It used to be
        printed and swallowed, so a failed fit went on to ``predict``.
    """
    if name in ["LabelPropagation", "LabelSpreading"]:
        # TODO some complication with -1 label
        y_ = y
    else:
        y_ = y_names

    classes = np.unique(y_)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    # fit; a failure here is the real error and must not be masked
    classifier.fit(X, y_)

    y_pred = classifier.predict(X)
    # training set performance
    assert_array_equal(np.unique(y_), np.unique(y_pred))
    accuracy = accuracy_score(y_, y_pred)
    assert_greater(accuracy, 0.78,
                   "accuracy %f of %s not greater than 0.78"
                   % (accuracy, name))
    # TODO: turn this report into an assert_array_equal on classes_.
    if np.any(classifier.classes_ != classes):
        print("Unexpected classes_ attribute for %r: "
              "expected %s, got %s" %
              (classifier, classes, classifier.classes_))
def test_nmf_decreasing():
    """Test that the objective function is decreasing at each iteration."""
    n_samples, n_features, n_components = 20, 15, 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    # in-place absolute value: the second argument is the output array
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            # only 'mu' is implemented for beta_loss other than 2
            if not (solver == 'mu' or beta_loss == 2):
                continue
            W, H = W0.copy(), H0.copy()
            last_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                current_loss = nmf._beta_divergence(X, W, H, beta_loss)
                if last_loss is not None:
                    assert_greater(last_loss, current_loss)
                last_loss = current_loss
def test_class_weight_classifiers():
    """Test that class_weight works and that the semantics are consistent."""
    candidates = all_estimators(type_filter="classifier")

    # keep only the classifiers that expose a class_weight parameter
    with warnings.catch_warnings(record=True):
        classifiers = [entry for entry in candidates
                       if "class_weight" in entry[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            # NuSVC: the sparse version has a parameter that doesn't do
            # anything.
            # *NB: NaiveBayes classifiers have a somewhat different
            # interface. FIXME SOON!
            if name == "NuSVC" or name.endswith("NB"):
                continue

            class_weight = {0: 1000, 1: 0.0001}
            if n_centers != 2:
                class_weight[2] = 0.0001

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            predicted = classifier.predict(X_test)
            assert_greater(np.mean(predicted == 0), 0.9)
def test_ovo_partial_fit_predict():
    """Check OneVsOneClassifier.partial_fit against a single full fit."""
    # Seed the shuffle so the test is reproducible, as the one-vs-rest
    # partial_fit tests in this file already do.
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    # One-vs-one trains one estimator per pair of classes. The previous
    # check compared against the number of classes (the one-vs-rest count),
    # which only coincides with the pair count when there are 3 classes.
    n_iris_classes = len(np.unique(iris.target))
    assert_equal(len(ovo1.estimators_),
                 n_iris_classes * (n_iris_classes - 1) // 2)
    assert_greater(np.mean(iris.target == pred1), 0.65)
def test_ovr_partial_fit():
    """Test if partial_fit is working as intended."""
    X, y = shuffle(iris.data, iris.target, random_state=0)
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    incremental_pred = incremental.predict(X)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(X, y).predict(X)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == incremental_pred), 0.65)

    # Same comparison on the unshuffled data, where the mini-batches do not
    # each contain every class
    incremental = OneVsRestClassifier(MultinomialNB())
    incremental.partial_fit(iris.data[:60], iris.target[:60],
                            np.unique(iris.target))
    incremental.partial_fit(iris.data[60:], iris.target[60:])
    incremental_pred = incremental.predict(iris.data)

    batch = OneVsRestClassifier(MultinomialNB())
    batch_pred = batch.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(incremental_pred, batch_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == incremental_pred), 0.65)
def test_classifiers_classes():
    """Test if classifiers can cope with non-consecutive classes."""
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # relabel so that the class labels are odd, hence non-consecutive
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        # MultinomialNB / BernoulliNB: TODO also test these!
        if Clf in dont_test or Clf in [MultinomialNB, BernoulliNB]:
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        clf.fit(X, y)
        predicted = clf.predict(X)

        # training set performance
        assert_array_equal(np.unique(y), np.unique(predicted))
        assert_greater(zero_one_score(y, predicted), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
def test_warm_start():
    """Check the effect of ``warm_start`` on a 1-iteration refit.

    A 1-iteration second fit on the same data should give almost the same
    coefficients with warm starting, and quite different ones without.
    Warm starting does not work with the liblinear solver.
    """
    X, y = iris.data, iris.target

    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for warm_start in [True, False]:
        for fit_intercept in [True, False]:
            for solver in solvers:
                for multi_class in ['ovr', 'multinomial']:
                    clf = LogisticRegression(tol=1e-4,
                                             multi_class=multi_class,
                                             warm_start=warm_start,
                                             solver=solver,
                                             random_state=42, max_iter=100,
                                             fit_intercept=fit_intercept)
                    clf.fit(X, y)
                    first_coef = clf.coef_

                    # refit, allowing a single iteration only
                    clf.max_iter = 1
                    with ignore_warnings():
                        clf.fit(X, y)

                    cum_diff = np.sum(np.abs(first_coef - clf.coef_))
                    msg = ("Warm starting issue with %s solver in %s mode "
                           "with fit_intercept=%s and warm_start=%s"
                           % (solver, multi_class, str(fit_intercept),
                              str(warm_start)))
                    if not warm_start:
                        assert_greater(cum_diff, 2.0, msg)
                    else:
                        assert_greater(2.0, cum_diff, msg)
def test_classifiers_classes():
    """Test if classifiers can cope with non-consecutive classes."""
    classifiers = [(name, E) for name, E in all_estimators()
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = shuffle(iris.data, iris.target, random_state=7)
    X = StandardScaler().fit_transform(X)
    # relabel so that the class labels are odd, hence non-consecutive
    y = 2 * y + 1
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        clf.fit(X, y)
        predicted = clf.predict(X)

        # training set performance
        assert_array_equal(np.unique(y), np.unique(predicted))
        assert_greater(zero_one_score(y, predicted), 0.78)
def test_lml_improving():
    """ Test that hyperparameter-tuning improves log-marginal likelihood. """
    for kernel in kernels:
        # fixed_kernel is excluded from this check
        if kernel == fixed_kernel:
            continue
        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
        # likelihood at the fitted kernel's theta vs. at the initial theta
        tuned_lml = gpr.log_marginal_likelihood(gpr.kernel_.theta)
        initial_lml = gpr.log_marginal_likelihood(kernel.theta)
        assert_greater(tuned_lml, initial_lml)
def test_fit_linear_multi():
    """Check LinearSVC training accuracy on mult_dense and mult_sparse."""
    for data in (mult_dense, mult_sparse):
        clf = LinearSVC(random_state=0)
        clf.fit(data, mult_target)
        predicted = clf.predict(data)
        accuracy = np.mean(predicted == mult_target)
        assert_greater(accuracy, 0.85)
def test_enet_path():
    """Check ElasticNetCV and MultiTaskElasticNetCV model selection."""
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # same configuration, with a precomputed Gram matrix
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter,
                       precompute=True)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
    assert_equal(clf.coef_.shape, (3, 10))

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    single_task = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    single_task.fit(X, y)
    multi_task = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3,
                                       l1_ratio=[0.5, 0.7])
    multi_task.fit(X, y[:, np.newaxis])
    assert_almost_equal(single_task.l1_ratio_, multi_task.l1_ratio_)
    assert_almost_equal(single_task.alpha_, multi_task.alpha_)
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    """Check that unshuffled k-fold exposes sample dependence on digits."""
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the groups segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling
    # case wrongly makes the IID assumption and is therefore too optimistic:
    # it estimates a much higher accuracy (around 0.96) than the non
    # shuffling variant (around 0.86).
    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    unshuffled_cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=unshuffled_cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with regards to the writing style of the
    # authors by yielding a seriously overestimated score:
    for seed in (0, 1):
        shuffled_cv = cval.KFold(n, 5, shuffle=True, random_state=seed)
        mean_score = cval.cross_val_score(model, X, y, cv=shuffled_cv).mean()
        assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold
    stratified_cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=stratified_cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)
def test_ovr_fit_predict_svc():
    """Check one-vs-rest around SVC on iris: 3 estimators, score > .9."""
    classifier = OneVsRestClassifier(svm.SVC())
    classifier.fit(iris.data, iris.target)
    assert_equal(len(classifier.estimators_), 3)
    training_score = classifier.score(iris.data, iris.target)
    assert_greater(training_score, .9)
def test_permutation_score():
    # Exercise permutation_test_score on iris with a linear SVC: a plain call,
    # a call with constant groups, sparse input, a custom scorer and finally
    # labels that carry no information.
    iris = load_iris()
    features = iris.data
    features_sparse = coo_matrix(features)
    target = iris.target
    estimator = SVC(kernel='linear')
    splitter = StratifiedKFold(2)

    base_score, _, base_pvalue = permutation_test_score(
        estimator, features, target, n_permutations=30, cv=splitter,
        scoring="accuracy")
    assert_greater(base_score, 0.9)
    assert_almost_equal(base_pvalue, 0.0, 1)

    # A single constant group must reproduce the ungrouped result.
    grouped_score, _, grouped_pvalue = permutation_test_score(
        estimator, features, target, n_permutations=30, cv=splitter,
        scoring="accuracy", groups=np.ones(target.size), random_state=0)
    assert_true(grouped_score == base_score)
    assert_true(grouped_pvalue == base_pvalue)

    # check that we obtain the same results with a sparse representation
    estimator_sparse = SVC(kernel='linear')
    splitter_sparse = StratifiedKFold(2)
    grouped_score, _, grouped_pvalue = permutation_test_score(
        estimator_sparse, features_sparse, target, n_permutations=30,
        cv=splitter_sparse, scoring="accuracy",
        groups=np.ones(target.size), random_state=0)
    assert_true(grouped_score == base_score)
    assert_true(grouped_pvalue == base_pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        # (number of hits - number of misses) / number of samples
        n_hits = (y_true == y_pred).sum()
        n_misses = (y_true != y_pred).sum()
        return (n_hits - n_misses) / y_true.shape[0]

    custom_scorer = make_scorer(custom_score)
    custom_result, _, custom_pvalue = permutation_test_score(
        estimator, features, target, n_permutations=100,
        scoring=custom_scorer, cv=splitter, random_state=0)
    assert_almost_equal(custom_result, .93, 2)
    assert_almost_equal(custom_pvalue, 0.01, 3)

    # set random y
    cyclic_target = np.mod(np.arange(len(target)), 3)
    random_score, _, random_pvalue = permutation_test_score(
        estimator, features, cyclic_target, n_permutations=30, cv=splitter,
        scoring="accuracy")
    assert_less(random_score, 0.5)
    assert_greater(random_pvalue, 0.2)
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.
    #
    # The test builds a column made of two very different constants and
    # compares three variance estimators against a stable reference: a naive
    # one-pass formula, a naive online update, and
    # _incremental_mean_and_var. Only the last one must stay within `tol`.

    def np_var(A):
        # Per-column variance as computed by numpy itself.
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        # E[x^2] - E[x]^2, computed per column.
        n = X.shape[0]
        exp_x2 = (X ** 2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        # Mean of squared deviations from the column mean.
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean)**2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks for size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        # Fold one new sample `x` into the running mean / variance / count
        # and return the three updated values.
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case where one_pass_var and the naive online update
    # have an error > tol while _incremental_mean_and_var has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    # First half of the rows is filled with x1, second half with x2.
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old version, np.var is not stable
    # Use np.var as the reference only when it agrees with the two-pass one.
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0
    # (A0 is constant, so its mean is its first row and its variance is 0).

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    # Here the sample count is tracked per feature as an int32 array, and
    # each sample is fed as a (1, n_features) chunk.
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results"""
    # Start both runs from the same perturbed copy of the module-level
    # `centers`.
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    # Separate working copies: `old_centers` is kept untouched so that it
    # can be compared with the results afterwards.
    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = csr_row_norm_l2(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts,
        buffer, 1)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms,
                                          new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers)**2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr,
                                                  x_mb_squared_norms_csr,
                                                  new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers)**2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def test_score():
    # KMeans fitted with more iterations must reach a higher score on the
    # training data than the same model stopped after one iteration.
    def score_after(n_iterations):
        model = KMeans(n_clusters=n_clusters, max_iter=n_iterations,
                       random_state=42)
        return model.fit(X).score(X)

    score_one_iter = score_after(1)
    score_ten_iter = score_after(10)
    assert_greater(score_ten_iter, score_one_iter)
def test_assert_greater():
    # assert_greater passes silently when the first argument is larger and
    # raises AssertionError otherwise.
    larger, smaller = 1, 0
    assert_greater(larger, smaller)
    assert_raises(AssertionError, assert_greater, smaller, larger)
def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds():
    """Compare BinnedStratifiedKFold with KFold on random targets.

    Over 100 random trials, the between-fold spread of the test-fold mean and
    of the test-fold standard deviation must be smaller for
    BinnedStratifiedKFold than for shuffled KFold in more than half of the
    trials.
    """
    std_wins = []
    mean_wins = []

    for _trial in range(100):
        n_folds = 2 + int(10 * np.random.rand())
        y = np.random.randn(30)
        np.random.shuffle(y)

        binned_splits = BinnedStratifiedKFold(
            n_folds=n_folds, shuffle=False, random_state=None).split(y)
        regular_splits = KFold(
            n_folds=n_folds, shuffle=True, random_state=None).split(y)

        bins = np.array([np.percentile(y, q) for q in range(n_folds)])

        binned_means = []
        binned_stds = []
        for _train, test_index in binned_splits:
            fold_values = y[test_index]
            binned_means.append(fold_values.mean())
            binned_stds.append(fold_values.std())
            # every bin of the fold histogram must stay within one count of
            # the average bin count
            counts, _edges = np.histogram(fold_values, bins=bins)
            assert_true(all(abs(counts - np.mean(counts)) <= 1),
                        msg="too ragged bins")

        regular_folds = [y[test_index]
                         for _train, test_index in regular_splits]
        regular_means = [fold.mean() for fold in regular_folds]
        regular_stds = [fold.std() for fold in regular_folds]

        std_wins.append(np.std(regular_stds) > np.std(binned_stds))
        mean_wins.append(np.std(regular_means) > np.std(binned_means))

    std_win_fraction = np.mean(std_wins)
    mean_win_fraction = np.mean(mean_wins)

    assert_greater(std_win_fraction, 0.5)
    assert_greater(mean_win_fraction, 0.5)

    print(" std(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % (100.0 * std_win_fraction))
    print("mean(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % (100.0 * mean_win_fraction))
def test_permutation_score():
    # Exercise cval.permutation_test_score on iris with a linear SVC: plain
    # call, constant labels, a custom scorer, sparse input, uninformative
    # targets and the deprecated score_func interface.
    iris = load_iris()
    features = iris.data
    features_sparse = coo_matrix(features)
    target = iris.target
    estimator = SVC(kernel='linear')
    splitter = cval.StratifiedKFold(target, 2)

    base_score, _, base_pvalue = cval.permutation_test_score(
        estimator, features, target, cv=splitter, scoring="accuracy")
    assert_greater(base_score, 0.9)
    assert_almost_equal(base_pvalue, 0.0, 1)

    # Constant labels must reproduce the unlabelled result.
    labelled_score, _, labelled_pvalue = cval.permutation_test_score(
        estimator, features, target, cv=splitter, scoring="accuracy",
        labels=np.ones(target.size), random_state=0)
    assert_true(labelled_score == base_score)
    assert_true(labelled_pvalue == base_pvalue)

    # test with custom scoring object
    f2_scorer = make_scorer(fbeta_score, beta=2)
    labelled_score, _, labelled_pvalue = cval.permutation_test_score(
        estimator, features, target, scoring=f2_scorer, cv=splitter,
        labels=np.ones(target.size), random_state=0)
    assert_almost_equal(labelled_score, .97, 2)
    assert_almost_equal(labelled_pvalue, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    estimator_sparse = SVC(kernel='linear')
    splitter_sparse = cval.StratifiedKFold(target, 2)
    labelled_score, _, labelled_pvalue = cval.permutation_test_score(
        estimator_sparse, features_sparse, target, cv=splitter_sparse,
        scoring="accuracy", labels=np.ones(target.size), random_state=0)
    assert_true(labelled_score == base_score)
    assert_true(labelled_pvalue == base_pvalue)

    # set random y
    cyclic_target = np.mod(np.arange(len(target)), 3)
    random_score, _, random_pvalue = cval.permutation_test_score(
        estimator, features, cyclic_target, cv=splitter, scoring="accuracy")
    assert_less(random_score, 0.5)
    assert_greater(random_pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        random_score, _, random_pvalue = cval.permutation_test_score(
            estimator, features, cyclic_target, score_func=accuracy_score,
            cv=splitter)
    assert_less(random_score, 0.5)
    assert_greater(random_pvalue, 0.2)
def test_all_estimators():
    # Meta sanity-check: the estimator introspection, meta-estimators
    # included, must discover at least one estimator.
    discovered = all_estimators(include_meta_estimators=True)
    n_discovered = len(discovered)
    assert_greater(n_discovered, 0)
def test_radius_neighbors():
    # Checks whether Returned distances are less than `radius`
    # At least one point should be returned when the `radius` is set
    # to mean distance from the considering point to other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    # Instantiate with DeprecationWarning silenced.
    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        # One query -> a length-1 object array holding the neighbor indices.
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    # No radius is passed here, so the estimator's own default is used.
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    # NOTE(review): np.less_equal compares element-wise, so this presumably
    # relies on both result arrays being broadcast-compatible - confirm.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def check_classifiers_train(name, Classifier):
    """Check that a classifier behaves sensibly on its own training set.

    The check is run twice: on a three-class blobs problem and on the binary
    problem obtained by dropping class 2. For each problem it verifies that
    malformed input raises ValueError, that predictions have the right shape
    and (except for the naive Bayes models listed below) a training accuracy
    above 0.85, and that ``decision_function`` / ``predict_proba``, when
    present, agree with ``predict``.

    Parameters
    ----------
    name : str
        Estimator name; used to special-case 'BernoulliNB' and
        'MultinomialNB'.
    Classifier : class
        Classifier class, instantiated without arguments.
    """
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        n_classes = len(np.unique(y))
        n_samples, _ = X.shape
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name in ['BernoulliNB', 'MultinomialNB']:
            # shift the data in place so that it is non-negative
            X -= X.min()
        set_fast_parameters(classifier)

        # raises error on malformed input for fit
        assert_raises(ValueError, classifier.fit, X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert_true(hasattr(classifier, "classes_"))
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples,))
        # training set performance
        if name not in ['BernoulliNB', 'MultinomialNB']:
            assert_greater(accuracy_score(y, y_pred), 0.85)

        # raises error on malformed input for predict
        assert_raises(ValueError, classifier.predict, X.T)

        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = classifier.decision_function(X)
                # Compare by value: `is` on integers tests object identity,
                # which only works by accident of small-int caching.
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples,))
                    # builtin int replaces the removed np.int alias
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                if (n_classes == 3
                        and not isinstance(classifier, BaseLibSVM)):
                    # 1on1 of LibSVM works differently
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                assert_raises(ValueError,
                              classifier.decision_function, X.T)
            except NotImplementedError:
                pass

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict:
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            assert_raises(ValueError, classifier.predict_proba, X.T)
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    #
    # Every classifier returned by all_estimators() (minus the skip lists) is
    # fitted on scaled, shuffled iris, once as a three-class problem and once
    # as the binary problem obtained by dropping class 2.
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X_m, y_m = iris.data, iris.target
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = Scaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        n_labels = len(np.unique(y))
        n_samples, _ = X.shape
        for name, Clf in classifiers:
            if Clf in dont_test or Clf in meta_estimators:
                continue
            if Clf in [MultinomialNB, BernoulliNB]:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                clf = Clf()
            # raises error on malformed input for fit
            assert_raises(ValueError, clf.fit, X, y[:-1])

            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(zero_one_score(y, y_pred), 0.78)

            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)

            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    # Compare by value: `is` on integers tests object
                    # identity, which only works by accident of small-int
                    # caching.
                    if n_labels == 2:
                        assert_equal(decision.ravel().shape, (n_samples,))
                        # builtin int replaces the removed np.int alias
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if n_labels == 3 and not isinstance(clf, BaseLibSVM):
                        # 1on1 of LibSVM works differently
                        assert_equal(decision.shape, (n_samples, n_labels))
                        assert_array_equal(np.argmax(decision, axis=1),
                                           y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass

            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_labels))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    # An unknown svd_method must be rejected both at construction time and
    # when it is set on an existing instance before fitting.
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)

    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # Every step of the log-likelihood trace must be strictly positive.
        # Testing the truthiness of the raw differences would only verify
        # that they are non-zero and would accept a decreasing trace.
        diff = np.all(np.diff(fa.loglike_) > 0.)
        assert_greater(diff, 0., 'Log likelihood dif not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)

        # noise_variance_init sized for n_features must be rejected when the
        # data has fewer columns. This instance is also the one reused by the
        # covariance / precision checks at the end of the test.
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    def abs_attr(estimator, attr_name):
        # sign will not be equal
        return np.abs(getattr(estimator, attr_name))

    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(abs_attr(fa1, attr), abs_attr(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    # Twice n_samples rows are generated: first half train, second half test.
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # More folds than samples must be rejected.
    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    # (run once on dense arrays and once on their CSR counterparts)
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                # swapping the classes must mirror the probabilities
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(
                    brier_score_loss(y_test, prob_pos_clf),
                    brier_score_loss((y_test + 1) % 2,
                                     prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    features, target = digits.data, digits.target
    is_3_or_8 = np.logical_or(digits.target == 3, digits.target == 8)
    features_3v8, target_3v8 = features[is_3_or_8], target[is_3_or_8]

    # (estimator, data, labels, minimum mean 10-fold score), evaluated in
    # this order: Multinomial NB, Bernoulli NB (on data thresholded at 4),
    # Gaussian NB - each on all digits and then on the 3-vs-8 subset.
    cases = [
        (MultinomialNB(alpha=10), features, target, 0.86),
        (MultinomialNB(alpha=10), features_3v8, target_3v8, 0.94),
        (BernoulliNB(alpha=10), features > 4, target, 0.83),
        (BernoulliNB(alpha=10), features_3v8 > 4, target_3v8, 0.92),
        (GaussianNB(), features, target, 0.77),
        (GaussianNB(), features_3v8, target_3v8, 0.86),
    ]
    for estimator, data, labels, floor in cases:
        fold_scores = cross_val_score(estimator, data, labels, cv=10)
        assert_greater(fold_scores.mean(), floor)
# Fragment of a timing comparison: the enclosing definition and the names
# exact_spent_time, sigma, gamma, number_of_features_to_generate, X and Y are
# defined outside this excerpt.
print("Timimg exact rbf: \t\t", exact_spent_time)
rbf_transform = Fastfood(
    sigma=sigma,
    n_components=number_of_features_to_generate,
    tradeoff_mem_accuracy="mem",
    random_state=42,
)
# Fitting happens before the clock starts, so only transform() is timed.
_ = rbf_transform.fit(X)
fastfood_fast_vec_start = datetime.datetime.utcnow()
# Fastfood: approximate kernel mapping
_ = rbf_transform.transform(X)
_ = rbf_transform.transform(Y)
fastfood_fast_vec_end = datetime.datetime.utcnow()
fastfood_fast_vec_spent_time = fastfood_fast_vec_end - fastfood_fast_vec_start
print("Timimg fastfood fast vectorized: \t\t", fastfood_fast_vec_spent_time)
rks_rbf_transform = RBFSampler(
    gamma=gamma, n_components=number_of_features_to_generate, random_state=42
)
# Same protocol for the baseline: fit untimed, then time the two transforms.
_ = rks_rbf_transform.fit(X)
rks_start = datetime.datetime.utcnow()
# Random Kitchens Sinks: approximate kernel mapping
_ = rks_rbf_transform.transform(X)
_ = rks_rbf_transform.transform(Y)
rks_end = datetime.datetime.utcnow()
rks_spent_time = rks_end - rks_start
print("Timimg rks: \t\t\t", rks_spent_time)
# NOTE(review): this asserts on wall-clock durations from a single run, which
# presumably makes the check sensitive to machine load - confirm whether a
# monotonic timer and repeated runs are wanted.
assert_greater(rks_spent_time, fastfood_fast_vec_spent_time)
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    #
    # Every classifier returned by all_estimators (minus the skip lists) is
    # fitted on scaled, shuffled blobs, once as a three-class problem and
    # once as the binary problem obtained by dropping class 2.
    classifiers = all_estimators(type_filter='classifier')
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        n_classes = len(np.unique(y))
        n_samples, _ = X.shape
        for name, Classifier in classifiers:
            if name in dont_test:
                continue
            if name in ['MultinomialNB', 'BernoulliNB']:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            # raises error on malformed input for fit
            assert_raises(ValueError, classifier.fit, X, y[:-1])

            # fit
            classifier.fit(X, y)
            assert_true(hasattr(classifier, "classes_"))
            y_pred = classifier.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(accuracy_score(y, y_pred), 0.85)

            # raises error on malformed input for predict
            assert_raises(ValueError, classifier.predict, X.T)

            if hasattr(classifier, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = classifier.decision_function(X)
                    # Compare by value: `is` on integers tests object
                    # identity, which only works by accident of small-int
                    # caching.
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples,))
                        # builtin int replaces the removed np.int alias
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if (n_classes == 3
                            and not isinstance(classifier, BaseLibSVM)):
                        # 1on1 of LibSVM works differently
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1),
                                           y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError,
                                  classifier.decision_function, X.T)
                except NotImplementedError:
                    pass

            if hasattr(classifier, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = classifier.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(np.sum(y_prob, axis=1),
                                              np.ones(n_samples))
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError,
                                  classifier.predict_proba, X.T)
                except NotImplementedError:
                    pass
def test_explained_variance():
    # Check explained_variance_ratio_ of TruncatedSVD for both solvers
    # ("arpack", "randomized"), two ranks (10, 20) and both sparse and dense
    # versions of the module-level matrix X.

    # Test sparse data
    svd_a_10_sp = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_sp = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_sp = svd_a_10_sp.fit_transform(X)
    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
    X_trans_a_20_sp = svd_a_20_sp.fit_transform(X)
    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)

    # Test dense data
    svd_a_10_de = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_de = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray())
    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
    X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray())
    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())

    # helper arrays for tests below
    # all fitted estimators
    svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de,
            svd_r_10_de, svd_a_20_de, svd_r_20_de)
    # (estimator, its transformed output)
    svds_trans = (
        (svd_a_10_sp, X_trans_a_10_sp),
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_a_20_sp, X_trans_a_20_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_a_10_de, X_trans_a_10_de),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_a_20_de, X_trans_a_20_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    # (rank-10 estimator, rank-20 estimator) with the same solver and input
    svds_10_v_20 = (
        (svd_a_10_sp, svd_a_20_sp),
        (svd_r_10_sp, svd_r_20_sp),
        (svd_a_10_de, svd_a_20_de),
        (svd_r_10_de, svd_r_20_de),
    )
    # (sparse-input estimator, dense-input estimator) with the same settings
    svds_sparse_v_dense = (
        (svd_a_10_sp, svd_a_10_de),
        (svd_a_20_sp, svd_a_20_de),
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert that the ratios of the rank-10 fit equal the first 10 ratios of
    # the rank-20 fit
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_,
            svd_20.explained_variance_ratio_[:10],
            decimal=5,
        )

    # Assert that 20 components has higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        assert_greater(
            svd_20.explained_variance_ratio_.sum(),
            svd_10.explained_variance_ratio_.sum(),
        )

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(svd_sparse.explained_variance_ratio_,
                                  svd_dense.explained_variance_ratio_)

    # Test that explained_variance is correct
    # (per-component variance of the transformed data divided by the total
    # per-feature variance of the dense input)
    for svd, transformed in svds_trans:
        total_variance = np.var(X.toarray(), axis=0).sum()
        variances = np.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance

        assert_array_almost_equal(
            svd.explained_variance_ratio_,
            true_explained_variance_ratio,
        )
def test_linear_svr_fit_intercept():
    # LinearSVR with an intercept must score above 0.99 on the dense
    # regression fixture it was fitted on.
    regressor = LinearSVR(random_state=0, fit_intercept=True)
    regressor.fit(reg_dense, reg_target)
    training_score = regressor.score(reg_dense, reg_target)
    assert_greater(training_score, 0.99)
def test_prediction_scores(self):
    # Score the held-out samples with the fitted detector/classifier.
    scores = self.clf.decision_function(self.X_test)

    # check score shapes: one score per test sample
    n_scored = scores.shape[0]
    n_expected = self.X_test.shape[0]
    assert_equal(n_scored, n_expected)

    # check performance: ROC AUC must exceed the configured floor
    auc = roc_auc_score(self.y_test, scores)
    assert_greater(auc, self.roc_floor)
def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
    """Check that explicitly stored zeros in sparse input do not change a tree.

    A random square matrix whose stored entries include zeros is built once
    and interpreted both as CSC (training) and CSR (test) data. The tree
    fitted on the sparse matrix must equal the one fitted on its dense copy,
    and ``apply``, ``decision_path``, ``predict`` (and ``predict_proba`` for
    classifiers) must agree for every dense/sparse combination of test input.

    Parameters
    ----------
    tree : key of ``ALL_TREES``
        Selects the estimator class under test.
    max_depth : int, default=3
        Passed to the estimator constructor.
    n_features : int, default=10
        Number of features; the number of samples is set to the same value.
    """
    TreeEstimator = ALL_TREES[tree]

    # A square matrix lets the same (data, indices, indptr) triple be read
    # both as a CSC matrix and as a CSR matrix.
    n_samples = n_features
    sample_ids = np.arange(n_samples)

    # Generate X, y
    rng = check_random_state(0)
    index_chunks = []
    value_chunks = []
    indptr = [0]
    for _ in range(n_features):
        nnz = rng.binomial(n_samples, 0.5)
        index_chunks.append(rng.permutation(sample_ids)[:nnz])
        # binomial(3, 0.5) - 1 can be 0, which yields explicitly stored zeros
        value_chunks.append(rng.binomial(3, 0.5, size=(nnz, )) - 1)
        indptr.append(indptr[-1] + nnz)

    indices = np.concatenate(index_chunks)
    data = np.array(np.concatenate(value_chunks), dtype=np.float32)
    shape = (n_samples, n_features)

    X_sparse = csc_matrix((data, indices, indptr), shape=shape)
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr), shape=shape)
    X_test = X_sparse_test.toarray()
    y = rng.randint(0, 3, size=(n_samples, ))

    # Make sure the test matrix owns its data, indices and indptr arrays.
    X_sparse_test = X_sparse_test.copy()

    # Both matrices must really contain explicit zeros.
    for matrix in (X_sparse, X_sparse_test):
        assert_greater((matrix.data == 0.).sum(), 0)

    # Fit on dense and on sparse input and compare the resulting trees.
    dense_fit = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    sparse_fit = TreeEstimator(random_state=0,
                               max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(
        dense_fit.tree_, sparse_fit.tree_,
        "{0} with dense and sparse format gave different "
        "trees".format(tree))

    test_inputs = (X_test, X_sparse_test)
    for X1, X2 in product(test_inputs, test_inputs):
        # leaf assignment
        assert_array_almost_equal(sparse_fit.tree_.apply(X1),
                                  dense_fit.tree_.apply(X2))
        assert_array_almost_equal(sparse_fit.apply(X1), dense_fit.apply(X2))
        assert_array_almost_equal(sparse_fit.apply(X1),
                                  sparse_fit.tree_.apply(X1))

        # decision paths
        assert_array_almost_equal(
            sparse_fit.tree_.decision_path(X1).toarray(),
            dense_fit.tree_.decision_path(X2).toarray())
        assert_array_almost_equal(
            sparse_fit.decision_path(X1).toarray(),
            dense_fit.decision_path(X2).toarray())
        assert_array_almost_equal(
            sparse_fit.decision_path(X1).toarray(),
            sparse_fit.tree_.decision_path(X1).toarray())

        # predictions
        assert_array_almost_equal(sparse_fit.predict(X1),
                                  dense_fit.predict(X2))
        if tree in CLF_TREES:
            assert_array_almost_equal(sparse_fit.predict_proba(X1),
                                      dense_fit.predict_proba(X2))
def test_multiclass_sgd():
    """SGDClassifier fits the multiclass data and exposes classes 0, 1 and 2."""
    classifier = SGDClassifier(random_state=0)
    classifier.fit(mult_dense, mult_target)

    training_score = classifier.score(mult_dense, mult_target)
    assert_greater(training_score, 0.80)

    learned_classes = list(classifier.classes_)
    assert_equal(learned_classes, [0, 1, 2])
def test_lgmlvq_iris():
    """Exercise LgmlvqModel on iris: estimator checks, accuracy, validation.

    After the generic estimator check and a default fit that must score above
    0.95, a series of invalid constructor settings is checked for the
    ``ValueError`` message raised by ``fit``; finally a few valid
    configurations are fitted to make sure they run.
    """
    check_estimator(LgmlvqModel)

    model = LgmlvqModel()
    model.fit(iris.data, iris.target)
    assert_greater(model.score(iris.data, iris.target), 0.95)

    # (expected error message, constructor arguments), checked in this order
    invalid_settings = [
        ('regularization must be a positive float',
         {'regularization': -1.0}),
        ('length of regularization'
         ' must be number of prototypes',
         {'regularization': [-1.0]}),
        ('length of regularization must be number of classes',
         {'regularization': [-1.0], 'classwise': True}),
        ('initial matrices must be a list',
         {'initial_matrices': np.array([[1, 2], [3, 4], [5, 6]])}),
        ('length of matrices wrong',
         {'initial_matrices': [[[1, 2], [3, 4], [5, 6]]]}),
        ('each matrix should have',
         {'initial_matrices': [[[1]], [[1]], [[1]]]}),
        ('length of matrices wrong',
         {'initial_matrices': [[[1, 2, 3]]], 'classwise': True}),
        ('each matrix should have',
         {'initial_matrices': [[[1]], [[1]], [[1]]], 'classwise': True}),
        ('classwise must be a boolean',
         {'classwise': "a"}),
        ('dim must be a list of positive ints',
         {'dim': [-1]}),
        ('dim length must be number of prototypes',
         {'dim': [1, 1]}),
        ('dim length must be number of classes',
         {'dim': [1, 1], 'classwise': True}),
    ]
    for expected_message, params in invalid_settings:
        assert_raise_message(ValueError, expected_message,
                             LgmlvqModel(**params).fit,
                             iris.data, iris.target)

    # Valid configurations: these only need to fit without raising.
    LgmlvqModel(classwise=True, dim=[1], prototypes_per_class=2).fit(
        iris.data, iris.target)

    model = LgmlvqModel(regularization=0.1)
    model.fit(iris.data, iris.target)

    model = LgmlvqModel(
        initial_prototypes=[[0, 2, 1], [1, 6, 2]],
        initial_matrices=[np.ones([2, 2]), np.ones([2, 2])],
        dim=[2, 2])
    x = np.array([[0, 0], [0, 4], [1, 4], [1, 8]])
    y = np.array([1, 1, 2, 2])
    model.fit(x, y)
def test_linear_svr():
    """LinearSVR with default settings scores above 0.99 on its training data."""
    model = LinearSVR(random_state=0)
    model.fit(reg_dense, reg_target)

    training_score = model.score(reg_dense, reg_target)
    assert_greater(training_score, 0.99)
def test_perceptron_accuracy():
    """Perceptron scores above 0.7 on its training data for both ``X`` and ``X_csr``."""
    for features in (X, X_csr):
        perceptron = Perceptron(max_iter=100, tol=None, shuffle=False)
        perceptron.fit(features, y)
        assert_greater(perceptron.score(features, y), 0.7)
def test_lml_improving(kernel):
    """Test that hyperparameter-tuning improves log-marginal likelihood.

    The likelihood at the fitted kernel's ``theta`` must be greater than the
    likelihood at the ``theta`` of the kernel passed in.
    """
    fitted = GaussianProcessRegressor(kernel=kernel).fit(X, y)

    tuned_lml = fitted.log_marginal_likelihood(fitted.kernel_.theta)
    initial_lml = fitted.log_marginal_likelihood(kernel.theta)
    assert_greater(tuned_lml, initial_lml)
def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
    """Check fit/predict behaviour of a classifier on a binary blobs problem.

    The classifier is cloned, fitted on arrays and on lists, and then checked
    for: a ``classes_`` attribute, prediction shape, training accuracy above
    0.83, ``ValueError`` on malformed input, and agreement of
    ``decision_function`` / ``predict_proba`` / ``predict_log_proba`` with
    ``predict`` where those methods exist.

    Parameters
    ----------
    name : str
        Estimator name, used only to build failure messages.
    classifier_orig : estimator
        Classifier to check; it is cloned before being fitted.
    readonly_memmap : bool, default=False
        When True, the data is passed through ``create_memmap_backed_data``
        before being used.
    """
    X_m, y_m = make_blobs(n_samples=300, random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]

    if readonly_memmap:
        X_b, y_b = create_memmap_backed_data([X_b, y_b])

    for (X, y) in [(X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, _ = X.shape
        classifier = clone(classifier_orig)
        X = pairwise_estimator_convert_X(X, classifier)
        y = multioutput_estimator_convert_y_2d(classifier, y)

        set_random_state(classifier)
        # raises error on malformed input for fit
        with assert_raises(ValueError, msg="The classifier {} does not "
                           "raise an error when incorrect/malformed input "
                           "data for fit is passed. The number of training "
                           "examples is not the same as the number of labels. "
                           "Perhaps use check_X_y in fit.".format(name)):
            classifier.fit(X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert hasattr(classifier, "classes_")
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(accuracy_score(y, y_pred), 0.83)

        # raises error on malformed input for predict
        msg = ("The classifier {} does not raise an error when the number of "
               "features in {} is different from the number of features in "
               "fit.")
        with assert_raises(ValueError, msg=msg.format(name, "predict")):
            classifier.predict(X.T)

        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples, 1))
                    # ``np.int`` was only an alias of the builtin ``int`` and
                    # no longer exists in NumPy >= 1.24; use ``int`` directly.
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                else:
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                with assert_raises(ValueError,
                                   msg=msg.format(name, "decision_function")):
                    classifier.decision_function(X.T)
            except NotImplementedError:
                # estimators may declare decision_function but not support it
                pass

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            with assert_raises(ValueError,
                               msg=msg.format(name, "predict_proba")):
                classifier.predict_proba(X.T)

            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                # NOTE(review): if this is numpy.testing.assert_allclose, the
                # positional 8 is ``rtol`` (a relative tolerance of 8), which
                # makes this comparison very loose; it looks like
                # ``decimal=8`` was intended. Left unchanged -- confirm
                # before tightening.
                assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob),
                                   np.argsort(y_prob))
def test_whitening():
    """Check that whitened PCA output has unit variance per component.

    Low-rank data with strongly varying feature scales is projected with
    ``dd.PCA`` for every solver in ``solver_list`` and both ``copy`` settings:
    with ``whiten=True`` each output component must have zero mean and unit
    standard deviation; with ``whiten=False`` the spread of the component
    standard deviations must remain large.
    """
    rng = np.random.RandomState(0)
    n_samples, n_features = 100, 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    left_factor = rng.randn(n_samples, rank)
    spectrum = np.diag(np.linspace(10.0, 1.0, rank))
    right_factor = rng.randn(rank, n_features)
    X = np.dot(left_factor, np.dot(spectrum, right_factor))

    # scale up the first 50 features relative to the remaining 30
    X[:, :50] *= 3
    assert_equal(X.shape, (n_samples, n_features))

    # the per-feature standard deviation is thus highly varying
    assert_greater(X.std(axis=0).std(), 43.8)

    dX = da.from_array(X, chunks=(50, n_features))
    for solver, copy in product(solver_list, (True, False)):
        # Whiten while projecting to the lower dimensional subspace; work on
        # a copy so the original array survives across iterations.
        X_ = dX.copy()
        whitening_pca = dd.PCA(
            n_components=n_components,
            whiten=True,
            copy=copy,
            svd_solver=solver,
            random_state=0,
            iterated_power=4,
        )

        # test fit_transform
        X_whitened = whitening_pca.fit_transform(X_.copy())
        assert_equal(X_whitened.shape, (n_samples, n_components))
        # XXX: transform() after fit_transform() gives different values for
        # the randomized solver, so the two are not compared here.

        assert_almost_equal(X_whitened.std(ddof=1, axis=0),
                            np.ones(n_components), decimal=6)
        assert_almost_equal(X_whitened.mean(axis=0),
                            np.zeros(n_components))

        # Same projection without whitening.
        X_ = dX.copy()
        plain_pca = dd.PCA(
            n_components=n_components,
            whiten=False,
            copy=copy,
            svd_solver=solver,
            random_state=0,
        ).fit(X_)
        X_unwhitened = plain_pca.transform(X_)
        assert_equal(X_unwhitened.shape, (n_samples, n_components))

        # in that case the output components still have varying variances
        assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)