def test_parameters_sampler_replacement(): # raise error if n_iter too large params = {'first': [0, 1], 'second': ['a', 'b', 'c']} sampler = ParameterSampler(params, n_iter=7) assert_raises(ValueError, list, sampler) # degenerates to GridSearchCV if n_iter the same as grid_size sampler = ParameterSampler(params, n_iter=6) samples = list(sampler) assert_equal(len(samples), 6) for values in ParameterGrid(params): assert_true(values in samples) # test sampling without replacement in a large grid params = {'a': range(10), 'b': range(10), 'c': range(10)} sampler = ParameterSampler(params, n_iter=99, random_state=42) samples = list(sampler) assert_equal(len(samples), 99) hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c']) for p in samples] assert_equal(len(set(hashable_samples)), 99) # doesn't go into infinite loops params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']} sampler = ParameterSampler(params_distribution, n_iter=7) samples = list(sampler) assert_equal(len(samples), 7)
def test_radius_neighbors_classifier_when_no_neighbors(): """ Test radius-based classifier when no neighbors found. In this case it should rise an informative exception """ X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.01, 1.01], [1.4, 1.4]]) # one outlier weight_func = _weight_func for outlier_label in [0, -1, None]: for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: rnc = neighbors.RadiusNeighborsClassifier clf = rnc(radius=radius, weights=weights, algorithm=algorithm, outlier_label=outlier_label) clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: assert_raises(ValueError, clf.predict, z2) elif False: assert_array_equal(np.array([1, outlier_label]), clf.predict(z2))
def test_grid_search_precomputed_kernel(): """Test that grid search works when the input features are given in the form of a precomputed kernel matrix """ X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) # compute the training kernel matrix corresponding to the linear kernel K_train = np.dot(X_[:180], X_[:180].T) y_train = y_[:180] clf = SVC(kernel='precomputed') cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(K_train, y_train) assert_true(cv.best_score_ >= 0) # compute the test kernel matrix K_test = np.dot(X_[180:], X_[:180].T) y_test = y_[180:] y_pred = cv.predict(K_test) assert_true(np.mean(y_pred == y_test) >= 0) # test error is raised when the precomputed kernel is not array-like # or sparse assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
def test_k_means_function(): # test calling the k_means function directly # catch output old_stdout = sys.stdout sys.stdout = StringIO() try: cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, sample_weight=None, verbose=True) finally: sys.stdout = old_stdout centers = cluster_centers assert_equal(centers.shape, (n_clusters, n_features)) labels = labels assert_equal(np.unique(labels).shape[0], n_clusters) # check that the labels assignment are perfect (up to a permutation) assert_equal(v_measure_score(true_labels, labels), 1.0) assert_greater(inertia, 0.0) # check warning when centers are passed assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters, sample_weight=None, init=centers) # to many clusters desired assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1, sample_weight=None) # kmeans for algorithm='elkan' raises TypeError on sparse matrix assert_raise_message(TypeError, "algorithm='elkan' not supported for " "sparse input X", k_means, X=X_csr, n_clusters=2, sample_weight=None, algorithm="elkan")
def test_grid_search_precomputed_kernel_error_kernel_function(): """Test that grid search returns an error when using a kernel_function""" X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) kernel_function = lambda x1, x2: np.dot(x1, x2.T) clf = SVC(kernel=kernel_function) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) assert_raises(ValueError, cv.fit, X_, y_)
def test_ovr_multilabel_predict_proba(): base_clf = MultinomialNB(alpha=1) for au in (False, True): X, Y = datasets.make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=au, return_indicator=True, random_state=0) X_train, Y_train = X[:80], Y[:80] X_test, Y_test = X[80:], Y[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # decision function only estimator. Fails in current implementation. decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) assert_raises(AttributeError, decision_only.predict_proba, X_test) # Estimator with predict_proba disabled, depending on parameters. decision_only = OneVsRestClassifier(svm.SVC(probability=False)) decision_only.fit(X_train, Y_train) assert_raises(AttributeError, decision_only.predict_proba, X_test) Y_pred = clf.predict(X_test) Y_proba = clf.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. pred = Y_proba > .5 assert_array_equal(pred, Y_pred)
def test_k_means_function(): # test calling the k_means function directly # catch output old_stdout = sys.stdout sys.stdout = StringIO() try: cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters, verbose=True) finally: sys.stdout = old_stdout centers = cluster_centers assert_equal(centers.shape, (n_clusters, n_features)) labels = labels assert_equal(np.unique(labels).shape[0], n_clusters) # check that the labels assignment are perfect (up to a permutation) assert_equal(v_measure_score(true_labels, labels), 1.0) assert_greater(inertia, 0.0) # check warning when centers are passed assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters, init=centers) # to many clusters desired assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def test_score_memmap(): # Ensure a scalar score of memmap type is accepted iris = load_iris() X, y = iris.data, iris.target clf = MockClassifier() tf = tempfile.NamedTemporaryFile(mode='wb', delete=False) tf.write(b'Hello world!!!!!') tf.close() scores = np.memmap(tf.name, dtype=np.float64) score = np.memmap(tf.name, shape=(), mode='r', dtype=np.float64) try: cross_val_score(clf, X, y, scoring=lambda est, X, y: score) # non-scalar should still fail assert_raises(ValueError, cross_val_score, clf, X, y, scoring=lambda est, X, y: scores) finally: # Best effort to release the mmap file handles before deleting the # backing file under Windows scores, score = None, None for _ in range(3): try: os.unlink(tf.name) break except WindowsError: sleep(1.)
def test_skewed_chi2_sampler(): """test that RBFSampler approximates kernel on random data""" # compute exact kernel c = 0.03 # appreviations for easier formular X_c = (X + c)[:, np.newaxis, :] Y_c = (Y + c)[np.newaxis, :, :] # we do it in log-space in the hope that it's more stable # this array is n_samples_x x n_samples_y big x n_features log_kernel = ((np.log(X_c) / 2.) + (np.log(Y_c) / 2.) + np.log(2.) - np.log(X_c + Y_c)) # reduce to n_samples_x x n_samples_y by summing over features in log-space kernel = np.exp(log_kernel.sum(axis=2)) # approximate kernel mapping transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42) X_trans = transform.fit_transform(X) Y_trans = transform.transform(Y) kernel_approx = np.dot(X_trans, Y_trans.T) assert_array_almost_equal(kernel, kernel_approx, 1) # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 assert_raises(ValueError, transform.transform, Y_neg)
def check_transformers_unfitted(name, Transformer): X, y = _boston_subset() with warnings.catch_warnings(record=True): transformer = Transformer() assert_raises(NotFittedError, transformer.transform, X)
def check_regressors_train(name, Regressor): X, y = _boston_subset() y = StandardScaler().fit_transform(y) # X is already scaled y = multioutput_estimator_convert_y_2d(name, y) rnd = np.random.RandomState(0) # catch deprecation warnings with warnings.catch_warnings(record=True): regressor = Regressor() set_fast_parameters(regressor) if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'): # linear regressors need to set alpha, but not generalized CV ones regressor.alpha = 0.01 if name == 'PassiveAggressiveRegressor': regressor.C = 0.01 # raises error on malformed input for fit assert_raises(ValueError, regressor.fit, X, y[:-1]) # fit if name in CROSS_DECOMPOSITION: y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) y_ = y_.T else: y_ = y set_random_state(regressor) regressor.fit(X, y_) regressor.fit(X.tolist(), y_.tolist()) regressor.predict(X) # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'): print(regressor) assert_greater(regressor.score(X, y_), 0.5)
def check_min_samples_leaf(name): X, y = hastie_X, hastie_y # Test if leaves contain more than leaf_count training examples ForestEstimator = FOREST_ESTIMATORS[name] # test boundary value assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y) assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y) est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name)) est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), len(X) * 0.25 - 1, "Failed with {0}".format(name))
def test_set_params(): # test nested estimator parameter setting clf = Pipeline([("svc", SVC())]) # non-existing parameter in svc assert_raises(ValueError, clf.set_params, svc__stupid_param=True) # non-existing parameter of pipeline assert_raises(ValueError, clf.set_params, svm__stupid_param=True)
def test_multioutput_number_of_output_differ(): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0], [1, 0], [0, 0]]) for name in MULTIOUTPUT_METRICS: metric = ALL_METRICS[name] assert_raises(ValueError, metric, y_true, y_pred)
def test_scikit_vs_scipy(): """Test scikit linkage with full connectivity (i.e. unstructured) vs scipy """ n, p, k = 10, 5, 3 rng = np.random.RandomState(0) # Not using a lil_matrix here, just to check that non sparse # matrices are well handled connectivity = np.ones((n, n)) for linkage in _TREE_BUILDERS.keys(): for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out = hierarchy.linkage(X, method=linkage) children_ = out[:, :2].astype(np.int) children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity) cut = _hc_cut(k, children, n_leaves) cut_ = _hc_cut(k, children_, n_leaves) assess_same_labelling(cut, cut_) # Test error management in _hc_cut assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_constant_size_multioutput_regressor(): random_state = np.random.RandomState(seed=1) X = random_state.randn(10, 10) y = random_state.randn(10, 5) est = DummyRegressor(strategy='constant', constant=[1, 2, 3, 4]) assert_raises(ValueError, est.fit, X, y)
def test_cross_val_score(): clf = MockClassifier() for a in range(-10, 10): clf.a = a # Smoke test scores = cval.cross_val_score(clf, X, y) assert_array_equal(scores, clf.score(X, y)) # test with multioutput y scores = cval.cross_val_score(clf, X_sparse, X) assert_array_equal(scores, clf.score(X_sparse, X)) scores = cval.cross_val_score(clf, X_sparse, y) assert_array_equal(scores, clf.score(X_sparse, y)) # test with multioutput y scores = cval.cross_val_score(clf, X_sparse, X) assert_array_equal(scores, clf.score(X_sparse, X)) # test with X as list clf = MockListClassifier() scores = cval.cross_val_score(clf, X.tolist(), y) assert_raises(ValueError, cval.cross_val_score, clf, X, y, scoring="sklearn")
def test_nan(): # Test proper NaN handling. # Regression test for Issue #252: fit used to go into an infinite loop. Xnan = np.array(X, dtype=np.float64) Xnan[0, 1] = np.nan logistic = LogisticRegression(random_state=0) assert_raises(ValueError, logistic.fit, Xnan, Y1)
def test_spectral_amg_mode(): # Test the amg mode of SpectralClustering centers = np.array([ [0., 0., 0.], [10., 10., 10.], [20., 20., 20.], ]) X, true_labels = make_blobs(n_samples=100, centers=centers, cluster_std=1., random_state=42) D = pairwise_distances(X) # Distance matrix S = np.max(D) - D # Similarity matrix S = sparse.coo_matrix(S) try: from pyamg import smoothed_aggregation_solver amg_loaded = True except ImportError: amg_loaded = False if amg_loaded: labels = spectral_clustering(S, n_clusters=len(centers), random_state=0, mode="amg") # We don't care too much that it's good, just that it *worked*. # There does have to be some lower limit on the performance though. assert_greater(np.mean(labels == true_labels), .3) else: assert_raises(ValueError, spectral_embedding, S, n_components=len(centers), random_state=0, mode="amg")
def test_warm_start_smaller_n_estimators(Cls): # Test if warm start with smaller n_estimators raises error X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) est = Cls(n_estimators=100, max_depth=1, warm_start=True) est.fit(X, y) est.set_params(n_estimators=99) assert_raises(ValueError, est.fit, X, y)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass. However it can handle # binary problems. clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=auto X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='auto') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='auto') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def check_boston(presort, loss, subsample): # Check consistency on dataset boston house prices with least squares # and least absolute deviation. ones = np.ones(len(boston.target)) last_y_pred = None for sample_weight in None, ones, 2 * ones: clf = GradientBoostingRegressor(n_estimators=100, loss=loss, max_depth=4, subsample=subsample, min_samples_split=2, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, boston.data) clf.fit(boston.data, boston.target, sample_weight=sample_weight) leaves = clf.apply(boston.data) assert_equal(leaves.shape, (506, 100)) y_pred = clf.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_less(mse, 6.0) if last_y_pred is not None: assert_array_almost_equal(last_y_pred, y_pred) last_y_pred = y_pred
def test_multilabel_binarizer_non_integer_labels(): tuple_classes = np.empty(3, dtype=object) tuple_classes[:] = [(1,), (2,), (3,)] inputs = [ ([('2', '3'), ('1',), ('1', '2')], ['1', '2', '3']), ([('b', 'c'), ('a',), ('a', 'b')], ['a', 'b', 'c']), ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes), ] indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) for inp, classes in inputs: # fit_transform() mlb = MultiLabelBinarizer() assert_array_equal(mlb.fit_transform(inp), indicator_mat) assert_array_equal(mlb.classes_, classes) assert_array_equal(mlb.inverse_transform(indicator_mat), inp) # fit().transform() mlb = MultiLabelBinarizer() assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat) assert_array_equal(mlb.classes_, classes) assert_array_equal(mlb.inverse_transform(indicator_mat), inp) mlb = MultiLabelBinarizer() assert_raises(TypeError, mlb.fit_transform, [({}), ({}, {'a': 'b'})])
def test_thresholded_scorers(): """Test scorers that take thresholds.""" X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LogisticRegression(random_state=0) clf.fit(X_train, y_train) score1 = SCORERS['roc_auc'](clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.decision_function(X_test)) score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) assert_almost_equal(score1, score2) assert_almost_equal(score1, score3) logscore = SCORERS['log_loss'](clf, X_test, y_test) logloss = log_loss(y_test, clf.predict_proba(X_test)) assert_almost_equal(-logscore, logloss) # same for an estimator without decision_function clf = DecisionTreeClassifier() clf.fit(X_train, y_train) score1 = SCORERS['roc_auc'](clf, X_test, y_test) score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) assert_almost_equal(score1, score2) # Test that an exception is raised on more than two classes X, y = make_blobs(random_state=0, centers=3) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf.fit(X_train, y_train) assert_raises(ValueError, SCORERS['roc_auc'], clf, X_test, y_test)
def test_partial_fit(): # Checks whether inserting array is consistent with fitted data. # `partial_fit` method should set all attribute values correctly. n_samples = 12 n_samples_partial_fit = 3 n_features = 2 rng = np.random.RandomState(42) X = rng.rand(n_samples, n_features) X_partial_fit = rng.rand(n_samples_partial_fit, n_features) lshf = ignore_warnings(LSHForest, category=DeprecationWarning)() # Test unfitted estimator ignore_warnings(lshf.partial_fit)(X) assert_array_equal(X, lshf._fit_X) ignore_warnings(lshf.fit)(X) # Insert wrong dimension assert_raises(ValueError, lshf.partial_fit, np.random.randn(n_samples_partial_fit, n_features - 1)) ignore_warnings(lshf.partial_fit)(X_partial_fit) # size of _input_array = samples + 1 after insertion assert_equal(lshf._fit_X.shape[0], n_samples + n_samples_partial_fit) # size of original_indices_[1] = samples + 1 assert_equal(len(lshf.original_indices_[0]), n_samples + n_samples_partial_fit) # size of trees_[1] = samples + 1 assert_equal(len(lshf.trees_[1]), n_samples + n_samples_partial_fit)
def test_kernel_density_sampling(n_samples=100, n_features=3): rng = np.random.RandomState(0) X = rng.randn(n_samples, n_features) bandwidth = 0.2 for kernel in ['gaussian', 'tophat']: # draw a tophat sample kde = KernelDensity(bandwidth, kernel=kernel).fit(X) samp = kde.sample(100) assert_equal(X.shape, samp.shape) # check that samples are in the right range nbrs = NearestNeighbors(n_neighbors=1).fit(X) dist, ind = nbrs.kneighbors(X, return_distance=True) if kernel == 'tophat': assert np.all(dist < bandwidth) elif kernel == 'gaussian': # 5 standard deviations is safe for 100 samples, but there's a # very small chance this test could fail. assert np.all(dist < 5 * bandwidth) # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: kde = KernelDensity(bandwidth, kernel=kernel).fit(X) assert_raises(NotImplementedError, kde.sample, 100) # non-regression test: used to return a scalar X = rng.randn(4, 1) kde = KernelDensity(kernel="gaussian").fit(X) assert_equal(kde.sample().shape, (1, 1))
def test_pca_randomized_solver(): # PCA on dense arrays X = iris.data # Loop excluding the 0, invalid for randomized for n_comp in np.arange(1, X.shape[1]): pca = PCA(n_components=n_comp, svd_solver='randomized', random_state=0) X_r = pca.fit(X).transform(X) np.testing.assert_equal(X_r.shape[1], n_comp) X_r2 = pca.fit_transform(X) assert_array_almost_equal(X_r, X_r2) X_r = pca.transform(X) assert_array_almost_equal(X_r, X_r2) # Test get_covariance and get_precision cov = pca.get_covariance() precision = pca.get_precision() assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12) pca = PCA(n_components=0, svd_solver='randomized', random_state=0) assert_raises(ValueError, pca.fit, X) pca = PCA(n_components=0, svd_solver='randomized', random_state=0) assert_raises(ValueError, pca.fit, X) # Check internal state assert_equal(pca.n_components, PCA(n_components=0, svd_solver='randomized', random_state=0).n_components) assert_equal(pca.svd_solver, PCA(n_components=0, svd_solver='randomized', random_state=0).svd_solver)
def test_zero_estimator_clf(): # Test if ZeroEstimator works for classification. X = iris.data y = np.array(iris.target) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) # binary clf mask = y != 0 y[mask] = 1 y[~mask] = 0 est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(X, y) assert_greater(est.score(X, y), 0.96) est = GradientBoostingClassifier(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, X, y)
def test_ward_clustering(): """ Check that we obtain the correct number of clusters with Ward clustering. """ rnd = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) X = rnd.randn(100, 50) connectivity = grid_to_graph(*mask.shape) clustering = Ward(n_clusters=10, connectivity=connectivity) clustering.fit(X) # test caching clustering = Ward(n_clusters=10, connectivity=connectivity, memory=mkdtemp()) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) # Turn caching off now clustering = Ward(n_clusters=10, connectivity=connectivity) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) np.testing.assert_array_equal(clustering.labels_, labels) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = Ward(n_clusters=10, connectivity=connectivity.todense()) assert_raises(TypeError, clustering.fit, X) clustering = Ward(n_clusters=10, connectivity=sparse.lil_matrix( connectivity.todense()[:10, :10])) assert_raises(ValueError, clustering.fit, X)
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises(NotFittedError, lambda X: np.fromiter( clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
def test_bad_input(): # Test that it gives proper exception on deficient input # impossible value of C assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y) # impossible value of nu clf = svm.NuSVC(nu=0.0) assert_raises(ValueError, clf.fit, X, Y) Y2 = Y[:-1] # wrong dimensions for labels assert_raises(ValueError, clf.fit, X, Y2) # Test with arrays that are non-contiguous. for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): Xf = np.asfortranarray(X) assert not Xf.flags['C_CONTIGUOUS'] yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) yf = yf[:, -1] assert not yf.flags['F_CONTIGUOUS'] assert not yf.flags['C_CONTIGUOUS'] clf.fit(Xf, yf) assert_array_equal(clf.predict(T), true_result) # error for precomputed kernelsx clf = svm.SVC(kernel='precomputed') assert_raises(ValueError, clf.fit, X, Y) # sample_weight bad dimensions clf = svm.SVC() assert_raises(ValueError, clf.fit, X, Y, sample_weight=range(len(X) - 1)) # predict with sparse input when trained with dense clf = svm.SVC().fit(X, Y) assert_raises(ValueError, clf.predict, sparse.lil_matrix(X)) Xt = np.array(X).T clf.fit(np.dot(X, Xt), Y) assert_raises(ValueError, clf.predict, X) clf = svm.SVC() clf.fit(X, Y) assert_raises(ValueError, clf.predict, Xt)
def test_stratified_shuffle_split_init(): y = np.asarray([0, 1, 1, 1, 2, 2, 2]) # Check that error is raised if there is a class with only one sample assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.2) # Check that error is raised if the test set size is smaller than n_classes assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 2) # Check that error is raised if the train set size is smaller than # n_classes assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 3, 2) y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]) # Check that errors are raised if there is not enough samples assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.5, 0.6) assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 8, 0.6) assert_raises(ValueError, cval.StratifiedShuffleSplit, y, 3, 0.6, 8) # Train size or test size too small assert_raises(ValueError, cval.StratifiedShuffleSplit, y, train_size=2) assert_raises(ValueError, cval.StratifiedShuffleSplit, y, test_size=2)
def check_label_kfold(shuffle): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) folds = cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, random_state=rng).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, random_state=rng): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = [ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ] labels = np.asarray(labels, dtype=object) n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed folds = cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, random_state=rng).idxs ideal_n_labels_per_fold = n_samples // n_folds # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split for train, test in cval.LabelKFold(labels, n_folds=n_folds, shuffle=shuffle, random_state=rng): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) assert_raises(ValueError, cval.LabelKFold, labels, n_folds=3)
def test_kfold_valueerrors(): # Check that errors are raised if there is not enough samples assert_raises(ValueError, cval.KFold, 3, 4) # Check that a warning is raised if the least populated class has too few # members. y = [3, 3, -1, -1, 2] cv = assert_warns_message(Warning, "The least populated class", cval.StratifiedKFold, y, 3) # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y)) # Error when number of folds is <= 1 assert_raises(ValueError, cval.KFold, 2, 0) assert_raises(ValueError, cval.KFold, 2, 1) assert_raises(ValueError, cval.StratifiedKFold, y, 0) assert_raises(ValueError, cval.StratifiedKFold, y, 1) # When n is not integer: assert_raises(ValueError, cval.KFold, 2.5, 2) # When n_folds is not integer: assert_raises(ValueError, cval.KFold, 5, 1.5) assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
def test_shufflesplit_errors(): assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=2.0) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=1.0) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=0.1, train_size=0.95) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=11) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=10) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=8, train_size=3) assert_raises(ValueError, cval.ShuffleSplit, 10, train_size=1j) assert_raises(ValueError, cval.ShuffleSplit, 10, test_size=None, train_size=None)
def test_precomputed(): # SVC with a precomputed kernel. # We test it with a toy dataset and with iris. clf = svm.SVC(kernel='precomputed') # Gram matrix for train data (square matrix) # (we use just a linear kernel) K = np.dot(X, np.array(X).T) clf.fit(K, Y) # Gram matrix for test data (rectangular matrix) KT = np.dot(T, np.array(X).T) pred = clf.predict(KT) assert_raises(ValueError, clf.predict, KT.T) assert_array_equal(clf.dual_coef_, [[-0.25, .25]]) assert_array_equal(clf.support_, [1, 3]) assert_array_equal(clf.intercept_, [0]) assert_array_almost_equal(clf.support_, [1, 3]) assert_array_equal(pred, true_result) # Gram matrix for test data but compute KT[i,j] # for support vectors j only. KT = np.zeros_like(KT) for i in range(len(T)): for j in clf.support_: KT[i, j] = np.dot(T[i], X[j]) pred = clf.predict(KT) assert_array_equal(pred, true_result) # same as before, but using a callable function instead of the kernel # matrix. kernel is just a linear kernel kfunc = lambda x, y: np.dot(x, y.T) clf = svm.SVC(kernel=kfunc) clf.fit(X, Y) pred = clf.predict(T) assert_array_equal(clf.dual_coef_, [[-0.25, .25]]) assert_array_equal(clf.intercept_, [0]) assert_array_almost_equal(clf.support_, [1, 3]) assert_array_equal(pred, true_result) # test a precomputed kernel with the iris dataset # and check parameters against a linear SVC clf = svm.SVC(kernel='precomputed') clf2 = svm.SVC(kernel='linear') K = np.dot(iris.data, iris.data.T) clf.fit(K, iris.target) clf2.fit(iris.data, iris.target) pred = clf.predict(K) assert_array_almost_equal(clf.support_, clf2.support_) assert_array_almost_equal(clf.dual_coef_, clf2.dual_coef_) assert_array_almost_equal(clf.intercept_, clf2.intercept_) assert_almost_equal(np.mean(pred == iris.target), .99, decimal=2) # Gram matrix for test data but compute KT[i,j] # for support vectors j only. K = np.zeros_like(K) for i in range(len(iris.data)): for j in clf.support_: K[i, j] = np.dot(iris.data[i], iris.data[j]) pred = clf.predict(K) assert_almost_equal(np.mean(pred == iris.target), .99, decimal=2) clf = svm.SVC(kernel=kfunc) clf.fit(iris.data, iris.target) assert_almost_equal(np.mean(pred == iris.target), .99, decimal=2)
def test_svc_bad_kernel(): svc = svm.SVC(kernel=lambda x, y: x) assert_raises(ValueError, svc.fit, X, Y)
def check_unfitted_feature_importances(name): assert_raises(ValueError, getattr, FOREST_ESTIMATORS[name](random_state=0), "feature_importances_")
def test_prediction_proba_parameter(self): with assert_raises(ValueError): self.clf.predict_proba(self.X_test, method='something')
def test_check_array(): # accept_sparse == False # raise error on sparse inputs X = [[1, 2], [3, 4]] X_csr = sp.csr_matrix(X) assert_raises(TypeError, check_array, X_csr) # ensure_2d=False X_array = check_array([0, 1, 2], ensure_2d=False) assert X_array.ndim == 1 # ensure_2d=True with 1d array assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', check_array, [0, 1, 2], ensure_2d=True) # ensure_2d=True with scalar array assert_raise_message(ValueError, 'Expected 2D array, got scalar array instead', check_array, 10, ensure_2d=True) # don't allow ndim > 3 X_ndim = np.arange(8).reshape(2, 2, 2) assert_raises(ValueError, check_array, X_ndim) check_array(X_ndim, allow_nd=True) # doesn't raise # dtype and order enforcement. X_C = np.arange(4).reshape(2, 2).copy("C") X_F = X_C.copy("F") X_int = X_C.astype(np.int) X_float = X_C.astype(np.float) Xs = [X_C, X_F, X_int, X_float] dtypes = [np.int32, np.int, np.float, np.float32, None, np.bool, object] orders = ['C', 'F', None] copys = [True, False] for X, dtype, order, copy in product(Xs, dtypes, orders, copys): X_checked = check_array(X, dtype=dtype, order=order, copy=copy) if dtype is not None: assert X_checked.dtype == dtype else: assert X_checked.dtype == X.dtype if order == 'C': assert X_checked.flags['C_CONTIGUOUS'] assert not X_checked.flags['F_CONTIGUOUS'] elif order == 'F': assert X_checked.flags['F_CONTIGUOUS'] assert not X_checked.flags['C_CONTIGUOUS'] if copy: assert X is not X_checked else: # doesn't copy if it was already good if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS'] and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']): assert X is X_checked # allowed sparse != None X_csc = sp.csc_matrix(X_C) X_coo = X_csc.tocoo() X_dok = X_csc.todok() X_int = X_csc.astype(np.int) X_float = X_csc.astype(np.float) Xs = [X_csc, X_coo, X_dok, X_int, X_float] accept_sparses = [['csr', 'coo'], ['coo', 'dok']] for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses, copys): with warnings.catch_warnings(record=True) as w: X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse, copy=copy) if (dtype is object or sp.isspmatrix_dok(X)) and len(w): message = str(w[0].message) messages = [ "object dtype is not supported by sparse matrices", "Can't check dok sparse matrix for nan or inf." ] assert message in messages else: assert len(w) == 0 if dtype is not None: assert X_checked.dtype == dtype else: assert X_checked.dtype == X.dtype if X.format in accept_sparse: # no change if allowed assert X.format == X_checked.format else: # got converted assert X_checked.format == accept_sparse[0] if copy: assert X is not X_checked else: # doesn't copy if it was already good if X.dtype == X_checked.dtype and X.format == X_checked.format: assert X is X_checked # other input formats # convert lists to arrays X_dense = check_array([[1, 2], [3, 4]]) assert isinstance(X_dense, np.ndarray) # raise on too deep lists assert_raises(ValueError, check_array, X_ndim.tolist()) check_array(X_ndim.tolist(), allow_nd=True) # doesn't raise # convert weird stuff to arrays X_no_array = _NotAnArray(X_dense) result = check_array(X_no_array) assert isinstance(result, np.ndarray) # deprecation warning if string-like array with dtype="numeric" expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" X_str = [['11', '12'], ['13', 'xx']] for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: with pytest.warns(FutureWarning, match=expected_warn_regex): check_array(X, dtype="numeric") # deprecation warning if byte-like array with dtype="numeric" X_bytes = [[b'a', b'b'], [b'c', b'd']] for X in [X_bytes, np.array(X_bytes, dtype='V1')]: with pytest.warns(FutureWarning, match=expected_warn_regex): check_array(X, dtype="numeric")
def test_pipeline_init(): # Test the various init parameters of the pipeline. assert_raises(TypeError, Pipeline) # Check that we can't instantiate pipelines with objects without fit # method assert_raises_regex( TypeError, 'Last step of Pipeline should implement fit. ' '.*NoFit.*', Pipeline, [('clf', NoFit())]) # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert_equal( pipe.get_params(deep=True), dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) assert_equal(clf.a, 0.1) assert_equal(clf.b, None) # Smoke test the repr: repr(pipe) # Test with two objects clf = SVC() filter1 = SelectKBest(f_classif) pipe = Pipeline([('anova', filter1), ('svc', clf)]) # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform assert_raises_regex( TypeError, 'All intermediate steps should be transformers' '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)]) # Check that params are set pipe.set_params(svc__C=0.1) assert_equal(clf.C, 0.1) # Smoke test the repr: repr(pipe) # Check that params are not set when naming them wrong assert_raises(ValueError, pipe.set_params, anova__C=0.1) # Test clone pipe2 = clone(pipe) assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) params2 = pipe2.get_params(deep=True) for x in pipe.get_params(deep=False): params.pop(x) for x in pipe2.get_params(deep=False): params2.pop(x) # Remove estimators that where copied params.pop('svc') params.pop('anova') params2.pop('svc') params2.pop('anova') assert_equal(params, params2)
def test_iforest_error(): """Test that it gives proper exception on deficient input.""" X = iris.data # Test max_samples assert_raises(ValueError, IsolationForest(max_samples=-1).fit, X) assert_raises(ValueError, IsolationForest(max_samples=0.0).fit, X) assert_raises(ValueError, IsolationForest(max_samples=2.0).fit, X) # The dataset has less than 256 samples, explicitly setting # max_samples > n_samples should result in a warning. If not set # explicitly there should be no warning assert_warns_message( UserWarning, "max_samples will be set to n_samples for estimation", IsolationForest(max_samples=1000).fit, X) assert_no_warnings(IsolationForest(max_samples='auto').fit, X) assert_no_warnings(IsolationForest(max_samples=np.int64(2)).fit, X) assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X) assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X) # test X_test n_features match X_train one: assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])
def test_neighbors_badargs(): """Test bad argument values: these should all raise ValueErrors""" assert_raises(ValueError, neighbors.NearestNeighbors, algorithm='blah') X = rng.random_sample((10, 2)) Xsparse = csr_matrix(X) y = np.ones(10) for cls in (neighbors.KNeighborsClassifier, neighbors.RadiusNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.RadiusNeighborsRegressor): assert_raises(ValueError, cls, weights='blah') assert_raises(ValueError, cls, p=-1) assert_raises(ValueError, cls, algorithm='blah') nbrs = cls(algorithm='ball_tree', metric='haversine') assert_raises(ValueError, nbrs.predict, X) assert_raises(ValueError, ignore_warnings(nbrs.fit), Xsparse, y) nbrs = cls() assert_raises(ValueError, nbrs.fit, np.ones((0, 2)), np.ones(0)) assert_raises(ValueError, nbrs.fit, X[:, :, None], y) nbrs.fit(X, y) assert_raises(ValueError, nbrs.predict, []) nbrs = neighbors.NearestNeighbors().fit(X) assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah') assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah')
def test_weighted_dbscan(): # ensure sample_weight is validated assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2]) assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4]) # ensure sample_weight has an effect assert_array_equal([], dbscan([[0], [1]], sample_weight=None, min_samples=6)[0]) assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5], min_samples=6)[0]) assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5], min_samples=6)[0]) assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6], min_samples=6)[0]) # points within eps of each other: assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5, sample_weight=[5, 1], min_samples=6)[0]) # and effect of non-positive and non-integer sample_weight: assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0], eps=1.5, min_samples=6)[0]) assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1], eps=1.5, min_samples=6)[0]) assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0], eps=1.5, min_samples=6)[0]) assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1], eps=1.5, min_samples=6)[0]) # for non-negative sample_weight, cores should be identical to repetition rng = np.random.RandomState(42) sample_weight = rng.randint(0, 5, X.shape[0]) core1, label1 = dbscan(X, sample_weight=sample_weight) assert_equal(len(label1), len(X)) X_repeated = np.repeat(X, sample_weight, axis=0) core_repeated, label_repeated = dbscan(X_repeated) core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool) core_repeated_mask[core_repeated] = True core_mask = np.zeros(X.shape[0], dtype=bool) core_mask[core1] = True assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask) # sample_weight should work with precomputed distance matrix D = pairwise_distances(X) core3, label3 = dbscan(D, sample_weight=sample_weight, metric='precomputed') assert_array_equal(core1, core3) assert_array_equal(label1, label3) # sample_weight should work with estimator est = DBSCAN().fit(X, sample_weight=sample_weight) core4 = est.core_sample_indices_ label4 = est.labels_ assert_array_equal(core1, core4) assert_array_equal(label1, label4) est = DBSCAN() label5 = est.fit_predict(X, sample_weight=sample_weight) core5 = est.core_sample_indices_ assert_array_equal(core1, core5) assert_array_equal(label1, label5) assert_array_equal(label1, est.labels_)
def test_classifiers_train(): # test if classifiers do something sensible on training set # also test all shapes / shape errors classifiers = all_estimators(type_filter='classifier') X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # do it once with binary, once with multiclass classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape for name, Classifier in classifiers: if name in dont_test: continue if name in ['MultinomialNB', 'BernoulliNB']: # TODO also test these! continue # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples, )) # training set performance assert_greater(accuracy_score(y, y_pred), 0.85) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict: decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.ravel().shape, (n_samples, )) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if (n_classes is 3 and not isinstance(classifier, BaseLibSVM)): # 1on1 of LibSVM works differently assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): try: # predict_proba agrees with predict: y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T) except NotImplementedError: pass
def test_multioutput_enetcv_error(): X = np.random.randn(10, 2) y = np.random.randn(10, 2) clf = ElasticNetCV() assert_raises(ValueError, clf.fit, X, y)
def test_hdbscan_no_centroid_medoid_for_noise(): clusterer = HDBSCAN().fit(X) assert_raises(ValueError, clusterer.weighted_cluster_centroid, -1) assert_raises(ValueError, clusterer.weighted_cluster_medoid, -1)
def test_fast_dot(): # Check fast dot blas wrapper function if fast_dot is np.dot: return rng = np.random.RandomState(42) A = rng.random_sample([2, 10]) B = rng.random_sample([2, 10]) try: linalg.get_blas_funcs(['gemm'])[0] has_blas = True except (AttributeError, ValueError): has_blas = False if has_blas: # Test _fast_dot for invalid input. # Maltyped data. for dt1, dt2 in [['f8', 'f4'], ['i4', 'i4']]: assert_raises(ValueError, _fast_dot, A.astype(dt1), B.astype(dt2).T) # Malformed data. # ndim == 0 E = np.empty(0) assert_raises(ValueError, _fast_dot, E, E) # ndim == 1 assert_raises(ValueError, _fast_dot, A, A[0]) # ndim > 2 assert_raises(ValueError, _fast_dot, A.T, np.array([A, A])) # min(shape) == 1 assert_raises(ValueError, _fast_dot, A, A[0, :][None, :]) # test for matrix mismatch error assert_raises(ValueError, _fast_dot, A, A) # Test cov-like use case + dtypes. for dtype in ['f8', 'f4']: A = A.astype(dtype) B = B.astype(dtype) # col < row C = np.dot(A.T, A) C_ = fast_dot(A.T, A) assert_almost_equal(C, C_, decimal=5) C = np.dot(A.T, B) C_ = fast_dot(A.T, B) assert_almost_equal(C, C_, decimal=5) C = np.dot(A, B.T) C_ = fast_dot(A, B.T) assert_almost_equal(C, C_, decimal=5) # Test square matrix * rectangular use case. A = rng.random_sample([2, 2]) for dtype in ['f8', 'f4']: A = A.astype(dtype) B = B.astype(dtype) C = np.dot(A, B) C_ = fast_dot(A, B) assert_almost_equal(C, C_, decimal=5) C = np.dot(A.T, B) C_ = fast_dot(A.T, B) assert_almost_equal(C, C_, decimal=5) if has_blas: for x in [np.array([[d] * 10] * 2) for d in [np.inf, np.nan]]: assert_raises(ValueError, _fast_dot, x, x.T)
def test_save_filepath_condition(): r = BaseRecommender() invalid_filepath = 'no suffix' assert_raises(ValueError, r.save, invalid_filepath)
def test_transformers(): # test if transformers do something sensible on training set # also test all shapes / shape errors transformers = all_estimators(type_filter='transformer') X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) n_samples, n_features = X.shape X = StandardScaler().fit_transform(X) X -= X.min() succeeded = True for name, Transformer in transformers: if name in dont_test: continue # these don't actually fit the data: if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']: continue # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if hasattr(transformer, 'compute_importances'): transformer.compute_importances = True if name == 'SelectKBest': # SelectKBest has a default of k=10 # which is more feature than we have. transformer.k = 1 elif name in ['GaussianRandomProjection', 'SparseRandomProjection']: # Due to the jl lemma and very few samples, the number # of components of the random matrix projection will be greater # than the number of features. # So we impose a smaller number (avoid "auto" mode) transformer.n_components = 1 elif name == "MiniBatchDictionaryLearning": transformer.set_params(n_iter=5) # default = 1000 elif name == "KernelPCA": transformer.remove_zero_eig = False # fit if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'): random_state = np.random.RandomState(seed=12345) y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))]) y_ = y_.T else: y_ = y try: transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) except Exception as e: print(transformer) print(e) print() succeeded = False continue if hasattr(transformer, 'transform'): if name in ('_PLS', 'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'): X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform not correct in %s" % Transformer) assert_array_almost_equal( x_pred3, x_pred2, 2, "fit_transform not correct in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform not correct in %s" % Transformer) assert_array_almost_equal( X_pred3, X_pred2, 2, "fit_transform not correct in %s" % Transformer) # raises error on malformed input for transform assert_raises(ValueError, transformer.transform, X.T) assert_true(succeeded)
def test_agglomerative_clustering(): """ Check that we obtain the correct number of clusters with agglomerative clustering. """ rnd = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rnd.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, memory=mkdtemp(), linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) np.testing.assert_array_equal(clustering.labels_, labels) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering(n_clusters=10, connectivity=np.ones( (n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1)
def test_hdbscan_badargs(): assert_raises(ValueError, hdbscan, X='fail') assert_raises(ValueError, hdbscan, X=None) assert_raises(ValueError, hdbscan, X, min_cluster_size='fail') assert_raises(ValueError, hdbscan, X, min_samples='fail') assert_raises(ValueError, hdbscan, X, min_samples=-1) assert_raises(ValueError, hdbscan, X, metric='imperial') assert_raises(ValueError, hdbscan, X, metric=None) assert_raises(ValueError, hdbscan, X, metric='minkowski', p=-1) assert_raises(ValueError, hdbscan, X, metric='minkowski', p=-1, algorithm='prims_kdtree') assert_raises(ValueError, hdbscan, X, metric='minkowski', p=-1, algorithm='prims_balltree') assert_raises(ValueError, hdbscan, X, metric='minkowski', p=-1, algorithm='boruvka_balltree') assert_raises(ValueError, hdbscan, X, metric='precomputed', algorithm='boruvka_kdtree') assert_raises(ValueError, hdbscan, X, metric='precomputed', algorithm='prims_kdtree') assert_raises(ValueError, hdbscan, X, metric='precomputed', algorithm='prims_balltree') assert_raises(ValueError, hdbscan, X, metric='precomputed', algorithm='boruvka_balltree') assert_raises(ValueError, hdbscan, X, alpha=-1) assert_raises(ValueError, hdbscan, X, alpha='fail') assert_raises(Exception, hdbscan, X, algorithm='something_else') assert_raises(TypeError, hdbscan, X, metric='minkowski', p=None) assert_raises(ValueError, hdbscan, X, leaf_size=0)
def test_format_invariance_with_1d_vectors(name): random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) y1_list = list(y1) y2_list = list(y2) y1_1d, y2_1d = np.array(y1), np.array(y2) assert_array_equal(y1_1d.ndim, 1) assert_array_equal(y2_1d.ndim, 1) y1_column = np.reshape(y1_1d, (-1, 1)) y2_column = np.reshape(y2_1d, (-1, 1)) y1_row = np.reshape(y1_1d, (1, -1)) y2_row = np.reshape(y2_1d, (1, -1)) with ignore_warnings(): metric = ALL_METRICS[name] measure = metric(y1, y2) assert_allclose(metric(y1_list, y2_list), measure, err_msg="%s is not representation invariant with list" "" % name) assert_allclose(metric(y1_1d, y2_1d), measure, err_msg="%s is not representation invariant with " "np-array-1d" % name) assert_allclose(metric(y1_column, y2_column), measure, err_msg="%s is not representation invariant with " "np-array-column" % name) # Mix format support assert_allclose(metric(y1_1d, y2_list), measure, err_msg="%s is not representation invariant with mix " "np-array-1d and list" % name) assert_allclose(metric(y1_list, y2_1d), measure, err_msg="%s is not representation invariant with mix " "np-array-1d and list" % name) assert_allclose(metric(y1_1d, y2_column), measure, err_msg="%s is not representation invariant with mix " "np-array-1d and np-array-column" % name) assert_allclose(metric(y1_column, y2_1d), measure, err_msg="%s is not representation invariant with mix " "np-array-1d and np-array-column" % name) assert_allclose(metric(y1_list, y2_column), measure, err_msg="%s is not representation invariant with mix " "list and np-array-column" % name) assert_allclose(metric(y1_column, y2_list), measure, err_msg="%s is not representation invariant with mix " "list and np-array-column" % name) # These mix representations aren't allowed assert_raises(ValueError, metric, y1_1d, y2_row) assert_raises(ValueError, metric, y1_row, y2_1d) assert_raises(ValueError, metric, y1_list, y2_row) assert_raises(ValueError, metric, y1_row, y2_list) assert_raises(ValueError, metric, y1_column, y2_row) assert_raises(ValueError, metric, y1_row, y2_column) # NB: We do not test for y1_row, y2_row as these may be # interpreted as multilabel or multioutput data. if (name not in (MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS)): assert_raises(ValueError, metric, y1_row, y2_row)
def test_multioutput_enetcv_error(): rng = np.random.RandomState(0) X = rng.randn(10, 2) y = rng.randn(10, 2) clf = ElasticNetCV() assert_raises(ValueError, clf.fit, X, y)
def test_validate_function(): """Check that valid functions are accepted & invalid ones raise error""" # Check arity tests fun = make_function(function=_protected_sqrt, name='sqrt', arity=1) # non-integer arity assert_raises(ValueError, make_function, _protected_sqrt, 'sqrt', '1') assert_raises(ValueError, make_function, _protected_sqrt, 'sqrt', 1.0) # non-matching arity assert_raises(ValueError, make_function, _protected_sqrt, 'sqrt', 2) assert_raises(ValueError, make_function, maximum, 'max', 1) # Check name test assert_raises(ValueError, make_function, _protected_sqrt, 2, 1) # Check return type tests def bad_fun1(x1, x2): return 'ni' assert_raises(ValueError, make_function, bad_fun1, 'ni', 2) # Check return shape tests def bad_fun2(x1): return np.ones((2, 1)) assert_raises(ValueError, make_function, bad_fun2, 'ni', 1) # Check closure for negatives test def _unprotected_sqrt(x1): with np.errstate(divide='ignore', invalid='ignore'): return np.sqrt(x1) assert_raises(ValueError, make_function, _unprotected_sqrt, 'sqrt', 1) # Check closure for zeros test def _unprotected_div(x1, x2): with np.errstate(divide='ignore', invalid='ignore'): return np.divide(x1, x2) assert_raises(ValueError, make_function, _unprotected_div, 'div', 2)
def test_multioutput_number_of_output_differ(name): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0], [1, 0], [0, 0]]) metric = ALL_METRICS[name] assert_raises(ValueError, metric, y_true, y_pred)
def test_unknown_strategey_regressor(): X = [[0]] * 5 y = [1, 2, 4, 6, 8] est = DummyRegressor(strategy='gona') assert_raises(ValueError, est.fit, X, y)
def check_sample_weight_invariance(name, metric, y1, y2): rng = np.random.RandomState(0) sample_weight = rng.randint(1, 10, size=len(y1)) # check that unit weights gives the same score as no weight unweighted_score = metric(y1, y2, sample_weight=None) assert_allclose(unweighted_score, metric(y1, y2, sample_weight=np.ones(shape=len(y1))), err_msg="For %s sample_weight=None is not equivalent to " "sample_weight=ones" % name) # check that the weighted and unweighted scores are unequal weighted_score = metric(y1, y2, sample_weight=sample_weight) # use context manager to supply custom error message with assert_raises(AssertionError) as cm: assert_allclose(unweighted_score, weighted_score) cm.msg = ("Unweighted and weighted scores are unexpectedly almost " "equal (%s) and (%s) for %s" % (unweighted_score, weighted_score, name)) # check that sample_weight can be a list weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist()) assert_allclose( weighted_score, weighted_score_list, err_msg=("Weighted scores for array and list " "sample_weight input are not equal (%s != %s) for %s") % (weighted_score, weighted_score_list, name)) # check that integer weights is the same as repeated samples repeat_weighted_score = metric(np.repeat(y1, sample_weight, axis=0), np.repeat(y2, sample_weight, axis=0), sample_weight=None) assert_allclose(weighted_score, repeat_weighted_score, err_msg="Weighting %s is not equal to repeating samples" % name) # check that ignoring a fraction of the samples is equivalent to setting # the corresponding weights to zero sample_weight_subset = sample_weight[1::2] sample_weight_zeroed = np.copy(sample_weight) sample_weight_zeroed[::2] = 0 y1_subset = y1[1::2] y2_subset = y2[1::2] weighted_score_subset = metric(y1_subset, y2_subset, sample_weight=sample_weight_subset) weighted_score_zeroed = metric(y1, y2, sample_weight=sample_weight_zeroed) assert_allclose( weighted_score_subset, weighted_score_zeroed, err_msg=("Zeroing weights does not give the same result as " "removing the corresponding samples (%s != %s) for %s" % (weighted_score_zeroed, weighted_score_subset, name))) if not name.startswith('unnormalized'): # check that the score is invariant under scaling of the weights by a # common factor for scaling in [2, 0.3]: assert_allclose(weighted_score, metric(y1, y2, sample_weight=sample_weight * scaling), err_msg="%s sample_weight is not invariant " "under scaling" % name) # Check that if number of samples in y_true and sample_weight are not # equal, meaningful error is raised. error_message = ("Found input variables with inconsistent numbers of " "samples: [{}, {}, {}]".format( _num_samples(y1), _num_samples(y2), _num_samples(sample_weight) * 2)) assert_raise_message(ValueError, error_message, metric, y1, y2, sample_weight=np.hstack( [sample_weight, sample_weight]))
def test_quantile_strategy_empty_train(): est = DummyRegressor(strategy="quantile", quantile=0.4) assert_raises(ValueError, est.fit, [], [])
def test_constants_not_specified_regressor(): X = [[0]] * 5 y = [1, 2, 4, 6, 8] est = DummyRegressor(strategy='constant') assert_raises(TypeError, est.fit, X, y)