def test_linearsvc_parameters(): # Test possible parameter combinations in LinearSVC # Generate list of possible parameter combinations losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo'] penalties, duals = ['l1', 'l2', 'bar'], [True, False] X, y = make_classification(n_samples=5, n_features=5) for loss, penalty, dual in itertools.product(losses, penalties, duals): clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual) if ((loss, penalty) == ('hinge', 'l1') or (loss, penalty, dual) == ('hinge', 'l2', False) or (penalty, dual) == ('l1', True) or loss == 'foo' or penalty == 'bar'): assert_raises_regexp(ValueError, "Unsupported set of arguments.*penalty='%s.*" "loss='%s.*dual=%s" % (penalty, loss, dual), clf.fit, X, y) else: clf.fit(X, y) # Incorrect loss value - test if explicit error message is raised assert_raises_regexp(ValueError, ".*loss='l3' is not supported.*", svm.LinearSVC(loss="l3").fit, X, y)
def test_non_positive_precomputed_distances(): # Precomputed distance matrices must be positive. bad_dist = np.array([[0., -1.], [1., 0.]]) for method in ['barnes_hut', 'exact']: tsne = TSNE(metric="precomputed", method=method) assert_raises_regexp(ValueError, "All distances .*precomputed.*", tsne.fit_transform, bad_dist)
def test_check_scoring(): # Test all branches of check_scoring estimator = EstimatorWithoutFit() pattern = (r"estimator should a be an estimator implementing 'fit' method," r" .* was passed") assert_raises_regexp(TypeError, pattern, check_scoring, estimator) estimator = EstimatorWithFitAndScore() estimator.fit([[1]], [1]) scorer = check_scoring(estimator) assert_true(scorer is _passthrough_scorer) assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFitAndPredict() estimator.fit([[1]], [1]) pattern = (r"If no scoring is specified, the estimator passed should have" r" a 'score' method\. The estimator .* does not\.") assert_raises_regexp(TypeError, pattern, check_scoring, estimator) scorer = check_scoring(estimator, "accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() scorer = check_scoring(estimator, "accuracy") assert_true(isinstance(scorer, _PredictScorer)) estimator = EstimatorWithFit() scorer = check_scoring(estimator, allow_none=True) assert_true(scorer is None)
def check_scoring_validator_for_single_metric_usecases(scoring_validator): # Test all branches of single metric usecases estimator = EstimatorWithoutFit() pattern = (r"estimator should be an estimator implementing 'fit' method," r" .* was passed") assert_raises_regexp(TypeError, pattern, scoring_validator, estimator) estimator = EstimatorWithFitAndScore() estimator.fit([[1]], [1]) scorer = scoring_validator(estimator) assert_true(scorer is _passthrough_scorer) assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFitAndPredict() estimator.fit([[1]], [1]) pattern = (r"If no scoring is specified, the estimator passed should have" r" a 'score' method\. The estimator .* does not\.") assert_raises_regexp(TypeError, pattern, scoring_validator, estimator) scorer = scoring_validator(estimator, "accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() scorer = scoring_validator(estimator, "accuracy") assert_true(isinstance(scorer, _PredictScorer)) # Test the allow_none parameter for check_scoring alone if scoring_validator is check_scoring: estimator = EstimatorWithFit() scorer = scoring_validator(estimator, allow_none=True) assert_true(scorer is None)
def test_ransac_resid_thresh_no_inliers(): # When residual_threshold=0.0 there are no inliers and a # ValueError with a message should be raised base_estimator = LinearRegression() ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=0.0, random_state=0) assert_raises_regexp(ValueError, "No inliers.*residual_threshold.*0\.0", ransac_estimator.fit, X, y)
def test_base_estimator(): # Test different base estimators. from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVC # XXX doesn't work with y_class because RF doesn't support classes_ # Shouldn't AdaBoost run a LabelBinarizer? clf = AdaBoostClassifier(RandomForestClassifier()) clf.fit(X, y_regr) clf = AdaBoostClassifier(SVC(), algorithm="SAMME") clf.fit(X, y_class) from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) clf.fit(X, y_regr) clf = AdaBoostRegressor(SVR(), random_state=0) clf.fit(X, y_regr) # Check that an empty discrete ensemble fails in fit, not predict. X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] y_fail = ["foo", "bar", 1, 2] clf = AdaBoostClassifier(SVC(), algorithm="SAMME") assert_raises_regexp(ValueError, "worse than random", clf.fit, X_fail, y_fail)
def test_underflow_or_overlow(): with np.errstate(all="raise"): # Generate some weird data with hugely unscaled features rng = np.random.RandomState(0) n_samples = 100 n_features = 10 X = rng.normal(size=(n_samples, n_features)) X[:, :2] *= 1e300 assert_true(np.isfinite(X).all()) # Use MinMaxScaler to scale the data without introducing a numerical # instability (computing the standard deviation naively is not possible # on this data) X_scaled = MinMaxScaler().fit_transform(X) assert_true(np.isfinite(X_scaled).all()) # Define a ground truth on the scaled data ground_truth = rng.normal(size=n_features) y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32) assert_array_equal(np.unique(y), [0, 1]) model = SGDClassifier(alpha=0.1, loss="squared_hinge", n_iter=500) # smoke test: model is stable on scaled data model.fit(X_scaled, y) assert_true(np.isfinite(model.coef_).all()) # model is numerically unstable on unscaled data msg_regxp = ( r"Floating-point under-/overflow occurred at epoch #.*" " Scaling input data with StandardScaler or MinMaxScaler" " might help." ) assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)
def test_lda_no_component_error(): # test `perplexity` before `fit` rng = np.random.RandomState(0) X = rng.randint(4, size=(20, 10)) lda = LatentDirichletAllocation() regex = ("This LatentDirichletAllocation instance is not fitted yet. " "Call 'fit' with appropriate arguments before using this method.") assert_raises_regexp(NotFittedError, regex, lda.perplexity, X)
def test_no_sparse_on_barnes_hut(): # No sparse matrices allowed on Barnes-Hut. random_state = check_random_state(0) X = random_state.randn(100, 2) X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0 X_csr = sp.csr_matrix(X) tsne = TSNE(n_iter=199, method="barnes_hut") assert_raises_regexp(TypeError, "A sparse matrix was.*", tsne.fit_transform, X_csr)
def test_unfitted(): X = "foo!" # input validation not required when SVM not fitted clf = svm.SVC() assert_raises_regexp(Exception, r".*\bSVC\b.*\bnot\b.*\bfitted\b", clf.predict, X) clf = svm.NuSVR() assert_raises_regexp(Exception, r".*\bNuSVR\b.*\bnot\b.*\bfitted\b", clf.predict, X)
def test_k_means_n_init(): rnd = np.random.RandomState(0) X = rnd.normal(size=(40, 2)) # two regression tests on bad n_init argument # previous bug: n_init <= 0 threw non-informative TypeError (#3858) assert_raises_regexp(ValueError, "n_init", KMeans(n_init=0).fit, X) assert_raises_regexp(ValueError, "n_init", KMeans(n_init=-1).fit, X)
def test_lda_no_component_error(): # test `transform` and `perplexity` before `fit` rng = np.random.RandomState(0) X = rng.randint(4, size=(20, 10)) lda = LatentDirichletAllocation() regex = r"^no 'components_' attribute" assert_raises_regexp(NotFittedError, regex, lda.transform, X) assert_raises_regexp(NotFittedError, regex, lda.perplexity, X)
def test_distance_not_available(): # 'metric' must be valid. tsne = TSNE(metric="not available", method='exact') assert_raises_regexp(ValueError, "Unknown metric not available.*", tsne.fit_transform, np.array([[0.0], [1.0]])) tsne = TSNE(metric="not available", method='barnes_hut') assert_raises_regexp(ValueError, "Metric 'not available' not valid.*", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_non_positive_computed_distances(): # Computed distance matrices must be positive. def metric(x, y): return -1 tsne = TSNE(metric=metric, method='exact') X = np.array([[0.0, 0.0], [1.0, 1.0]]) assert_raises_regexp(ValueError, "All distances .*metric given.*", tsne.fit_transform, X)
def test_pca_initialization_not_compatible_with_precomputed_kernel(): # Precomputed distance matrices must be square matrices. tsne = TSNE(metric="precomputed", init="pca") assert_raises_regexp( ValueError, 'The parameter init="pca" cannot be ' 'used with metric="precomputed".', tsne.fit_transform, np.array([[0.0], [1.0]]), )
def test_lda_transform_mismatch(): # test `n_features` mismatch in partial_fit and transform rng = np.random.RandomState(0) X = rng.randint(4, size=(20, 10)) X_2 = rng.randint(4, size=(10, 8)) n_topics = rng.randint(3, 6) lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng) lda.partial_fit(X) assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
def test_ovr_partial_fit_exceptions(): ovr = OneVsRestClassifier(MultinomialNB()) X = np.abs(np.random.randn(14, 2)) y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3] ovr.partial_fit(X[:7], y[:7], np.unique(y)) # A new class value which was not in the first call of partial_fit # It should raise ValueError y1 = [5] + y[7:-1] assert_raises_regexp(ValueError, r"Mini-batch contains \[.+\] while " r"classes must be subset of \[.+\]", ovr.partial_fit, X=X[7:], y=y1)
def test_lda_partial_fit_dim_mismatch(): # test `n_features` mismatch in `partial_fit` rng = np.random.RandomState(0) n_topics = rng.randint(3, 6) n_col = rng.randint(6, 10) X_1 = np.random.randint(4, size=(10, n_col)) X_2 = np.random.randint(4, size=(10, n_col + 1)) lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5., total_samples=20, random_state=rng) lda.partial_fit(X_1) assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
def test_invalid_params(): # test `_check_params` method X = np.ones((5, 10)) invalid_models = ( ('n_topics', LatentDirichletAllocation(n_topics=0)), ('learning_method', LatentDirichletAllocation(learning_method='unknown')), ('total_samples', LatentDirichletAllocation(total_samples=0)), ('learning_offset', LatentDirichletAllocation(learning_offset=-1)), ) for param, model in invalid_models: regex = r"^Invalid %r parameter" % param assert_raises_regexp(ValueError, regex, model.fit, X)
def test_ransac_resid_thresh_no_inliers(): # When residual_threshold=0.0 there are no inliers and a # ValueError with a message should be raised base_estimator = LinearRegression() ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=0.0, random_state=0, max_trials=5) msg = ("RANSAC could not find a valid consensus set") assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) assert_equal(ransac_estimator.n_skips_no_inliers_, 5) assert_equal(ransac_estimator.n_skips_invalid_data_, 0) assert_equal(ransac_estimator.n_skips_invalid_model_, 0)
def test_partial_fit_weight_class_auto(self): # partial_fit with class_weight='auto' not supported assert_raises_regexp(ValueError, "class_weight 'auto' is not supported for " "partial_fit. In order to use 'auto' weights, " "use compute_class_weight\('auto', classes, y\). " "In place of y you can us a large enough sample " "of the full training set target to properly " "estimate the class frequency distributions. " "Pass the resulting weights as the class_weight " "parameter.", self.factory(class_weight='auto').partial_fit, X, Y, classes=np.unique(Y))
def test_ransac_no_valid_model(): def is_model_valid(estimator, X, y): return False base_estimator = LinearRegression() ransac_estimator = RANSACRegressor(base_estimator, is_model_valid=is_model_valid, max_trials=5) msg = ("RANSAC could not find a valid consensus set") assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) assert_equal(ransac_estimator.n_skips_no_inliers_, 0) assert_equal(ransac_estimator.n_skips_invalid_data_, 0) assert_equal(ransac_estimator.n_skips_invalid_model_, 5)
def test_ovo_partial_fit_predict(): temp = datasets.load_iris() X, y = temp.data, temp.target ovo1 = OneVsOneClassifier(MultinomialNB()) ovo1.partial_fit(X[:100], y[:100], np.unique(y)) ovo1.partial_fit(X[100:], y[100:]) pred1 = ovo1.predict(X) ovo2 = OneVsOneClassifier(MultinomialNB()) ovo2.fit(X, y) pred2 = ovo2.predict(X) assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2) assert_greater(np.mean(y == pred1), 0.65) assert_almost_equal(pred1, pred2) # Test when mini-batches have binary target classes ovo1 = OneVsOneClassifier(MultinomialNB()) ovo1.partial_fit(X[:60], y[:60], np.unique(y)) ovo1.partial_fit(X[60:], y[60:]) pred1 = ovo1.predict(X) ovo2 = OneVsOneClassifier(MultinomialNB()) pred2 = ovo2.fit(X, y).predict(X) assert_almost_equal(pred1, pred2) assert_equal(len(ovo1.estimators_), len(np.unique(y))) assert_greater(np.mean(y == pred1), 0.65) ovo = OneVsOneClassifier(MultinomialNB()) X = np.random.rand(14, 2) y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2] ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4]) ovo.partial_fit(X[7:], y[7:]) pred = ovo.predict(X) ovo2 = OneVsOneClassifier(MultinomialNB()) pred2 = ovo2.fit(X, y).predict(X) assert_almost_equal(pred, pred2) # raises error when mini-batch does not have classes from all_classes ovo = OneVsOneClassifier(MultinomialNB()) error_y = [0, 1, 2, 3, 4, 5, 2] message_re = escape("Mini-batch contains {0} while " "it must be subset of {1}".format(np.unique(error_y), np.unique(y))) assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7], error_y, np.unique(y)) # test partial_fit only exists if estimator has it: ovr = OneVsOneClassifier(SVC()) assert_false(hasattr(ovr, "partial_fit"))
def test_ransac_exceed_max_skips(): def is_data_valid(X, y): return False base_estimator = LinearRegression() ransac_estimator = RANSACRegressor(base_estimator, is_data_valid=is_data_valid, max_trials=5, max_skips=3) msg = ("RANSAC skipped more iterations than `max_skips`") assert_raises_regexp(ValueError, msg, ransac_estimator.fit, X, y) assert_equal(ransac_estimator.n_skips_no_inliers_, 0) assert_equal(ransac_estimator.n_skips_invalid_data_, 4) assert_equal(ransac_estimator.n_skips_invalid_model_, 0)
def test_lda_preplexity_mismatch(): # test dimension mismatch in `perplexity` method rng = np.random.RandomState(0) n_topics = rng.randint(3, 6) n_samples = rng.randint(6, 10) X = np.random.randint(4, size=(n_samples, 10)) lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5., total_samples=20, random_state=rng) lda.fit(X) # invalid samples invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics)) assert_raises_regexp(ValueError, r'Number of samples', lda.perplexity, X, invalid_n_samples) # invalid topic number invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1)) assert_raises_regexp(ValueError, r'Number of topics', lda.perplexity, X, invalid_n_topics)
def test_scoring_is_not_metric(): assert_raises_regexp(ValueError, 'make_scorer', check_scoring, LogisticRegression(), f1_score) assert_raises_regexp(ValueError, 'make_scorer', check_scoring, LogisticRegression(), roc_auc_score) assert_raises_regexp(ValueError, 'make_scorer', check_scoring, Ridge(), r2_score) assert_raises_regexp(ValueError, 'make_scorer', check_scoring, KMeans(), cluster_module.adjusted_rand_score)
def test_check_scoring_and_check_multimetric_scoring(): check_scoring_validator_for_single_metric_usecases(check_scoring) # To make sure the check_scoring is correctly applied to the constituent # scorers check_scoring_validator_for_single_metric_usecases( check_multimetric_scoring_single_metric_wrapper) # For multiple metric use cases # Make sure it works for the valid cases for scoring in (('accuracy',), ['precision'], {'acc': 'accuracy', 'precision': 'precision'}, ('accuracy', 'precision'), ['precision', 'accuracy'], {'accuracy': make_scorer(accuracy_score), 'precision': make_scorer(precision_score)}): estimator = LinearSVC(random_state=0) estimator.fit([[1], [2], [3]], [1, 1, 0]) scorers, is_multi = _check_multimetric_scoring(estimator, scoring) assert_true(is_multi) assert_true(isinstance(scorers, dict)) assert_equal(sorted(scorers.keys()), sorted(list(scoring))) assert_true(all([isinstance(scorer, _PredictScorer) for scorer in list(scorers.values())])) if 'acc' in scoring: assert_almost_equal(scorers['acc']( estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.) if 'accuracy' in scoring: assert_almost_equal(scorers['accuracy']( estimator, [[1], [2], [3]], [1, 0, 0]), 2. / 3.) if 'precision' in scoring: assert_almost_equal(scorers['precision']( estimator, [[1], [2], [3]], [1, 0, 0]), 0.5) estimator = EstimatorWithFitAndPredict() estimator.fit([[1]], [1]) # Make sure it raises errors when scoring parameter is not valid. # More weird corner cases are tested at test_validation.py error_message_regexp = ".*must be unique strings.*" for scoring in ((make_scorer(precision_score), # Tuple of callables make_scorer(accuracy_score)), [5], (make_scorer(precision_score),), (), ('f1', 'f1')): assert_raises_regexp(ValueError, error_message_regexp, _check_multimetric_scoring, estimator, scoring=scoring)
def test_correct_labelsize(): # Assert 1 < n_labels < n_samples dataset = datasets.load_iris() X = dataset.data # n_labels = n_samples y = np.arange(X.shape[0]) assert_raises_regexp(ValueError, 'Number of labels is %d\. Valid values are 2 ' 'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), silhouette_score, X, y) # n_labels = 1 y = np.zeros(X.shape[0]) assert_raises_regexp(ValueError, 'Number of labels is %d\. Valid values are 2 ' 'to n_samples - 1 \(inclusive\)' % len(np.unique(y)), silhouette_score, X, y)
def test_correct_labelsize(): """ Assert 2 <= n_labels <= nsample -1 """ dataset = datasets.load_iris() X = dataset.data # n_labels = n_samples y = np.arange(X.shape[0]) assert_raises_regexp(ValueError, "Number of labels is %d " "but should be more than 2" "and less than n_samples - 1" % len(np.unique(y)), silhouette_score, X, y) # n_labels = 1 y = np.zeros(X.shape[0]) assert_raises_regexp(ValueError, "Number of labels is %d " "but should be more than 2" "and less than n_samples - 1" % len(np.unique(y)), silhouette_score, X, y)
def test_pairwise_precomputed(func): # Test correct shape assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), metric='precomputed') # with two args assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), np.zeros((4, 4)), metric='precomputed') # even if shape[1] agrees (although thus second arg is spurious) assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), np.zeros((4, 3)), metric='precomputed') # Test not copied (if appropriate dtype) S = np.zeros((5, 5)) S2 = func(S, metric="precomputed") assert S is S2 # with two args S = np.zeros((5, 3)) S2 = func(S, np.zeros((3, 3)), metric="precomputed") assert S is S2 # Test always returns float dtype S = func(np.array([[1]], dtype='int'), metric='precomputed') assert_equal('f', S.dtype.kind) # Test converts list to array-like S = func([[1.]], metric='precomputed') assert isinstance(S, np.ndarray)
def test_pairwise_precomputed(): for func in [pairwise_distances, pairwise_kernels]: # Test correct shape assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), metric='precomputed') # with two args assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), np.zeros((4, 4)), metric='precomputed') # even if shape[1] agrees (although thus second arg is spurious) assert_raises_regexp(ValueError, '.* shape .*', func, np.zeros((5, 3)), np.zeros((4, 3)), metric='precomputed') # Test not copied (if appropriate dtype) S = np.zeros((5, 5)) S2 = func(S, metric="precomputed") assert_true(S is S2) # with two args S = np.zeros((5, 3)) S2 = func(S, np.zeros((3, 3)), metric="precomputed") assert_true(S is S2) # Test always returns float dtype S = func(np.array([[1]], dtype='int'), metric='precomputed') assert_equal('f', S.dtype.kind) # Test converts list to array-like S = func([[1]], metric='precomputed') assert_true(isinstance(S, np.ndarray))
def test_init_not_available(): # 'init' must be 'pca', 'random', or numpy array. tsne = TSNE(init="not available") m = "'init' must be 'pca', 'random', or a numpy array" assert_raises_regexp(ValueError, m, tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_group_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_groups = 15 n_samples = 1000 n_splits = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed groups = rng.randint(0, n_groups, n_samples) ideal_n_groups_per_fold = n_samples // n_splits len(np.unique(groups)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = GroupKFold(n_splits=n_splits) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # Construct the test data groups = np.array([ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ]) n_groups = len(np.unique(groups)) n_samples = len(groups) n_splits = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_groups_per_fold = n_samples // n_splits X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, groups)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(groups)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_groups_per_fold)) # Check that each group appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for group in np.unique(groups): assert_equal(len(np.unique(folds[groups == group])), 1) # Check that no group is on both sides of the split groups = np.asarray(groups, dtype=object) for train, test in lkf.split(X, y, groups): assert_equal(len(np.intersect1d(groups[train], groups[test])), 0) # groups can also be a list cv_iter = list(lkf.split(X, y, groups.tolist())) for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups), cv_iter): assert_array_equal(train1, train2) assert_array_equal(test1, test2) # Should fail if there are more folds than groups groups = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(groups)) assert_raises_regexp(ValueError, "Cannot have number of splits.*greater", next, GroupKFold(n_splits=3).split(X, y, groups))
def test_init_not_available(): # 'init' must be 'pca' or 'random'. assert_raises_regexp(ValueError, "'init' must be either 'pca' or 'random'", TSNE, init="not available")
def test_non_square_precomputed_distances(): # Precomputed distance matrices must be square matrices. tsne = TSNE(metric="precomputed") assert_raises_regexp(ValueError, ".* square distance matrix", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_too_few_iterations(): # Number of gradient descent iterations must be at least 200. tsne = TSNE(n_iter=199) assert_raises_regexp(ValueError, "n_iter .*", tsne.fit_transform, np.array([[0.0], [0.0]]))
def test_pairwise_precomputed_non_negative(): # Test non-negative values assert_raises_regexp(ValueError, '.* non-negative values.*', pairwise_distances, np.full((5, 5), -1), metric='precomputed')
def test_distance_not_available(): # 'metric' must be valid. tsne = TSNE(metric="not available") assert_raises_regexp(ValueError, "Unknown metric not available.*", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_init_not_available(): # 'init' must be 'pca' or 'random'. m = "'init' must be 'pca', 'random' or a NumPy array" assert_raises_regexp(ValueError, m, TSNE, init="not available")
def test_pairwise_distances_chunked_reduce_invalid(bad_reduce, err_type, message): X = np.arange(10).reshape(-1, 1) S_chunks = pairwise_distances_chunked(X, None, reduce_func=bad_reduce, working_memory=64) assert_raises_regexp(err_type, message, next, S_chunks)
def test_label_kfold(): rng = np.random.RandomState(0) # Parameters of the test n_labels = 15 n_samples = 1000 n_folds = 5 X = y = np.ones(n_samples) # Construct the test data tolerance = 0.05 * n_samples # 5 percent error allowed labels = rng.randint(0, n_labels, n_samples) ideal_n_labels_per_fold = n_samples // n_folds len(np.unique(labels)) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) lkf = LabelKFold(n_folds=n_folds) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Construct the test data labels = [ 'Albert', 'Jean', 'Bertrand', 'Michel', 'Jean', 'Francis', 'Robert', 'Michel', 'Rachel', 'Lois', 'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean', 'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix', 'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky', 'Madmood', 'Cary', 'Mary', 'Alexandre', 'David', 'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia' ] n_labels = len(np.unique(labels)) n_samples = len(labels) n_folds = 5 tolerance = 0.05 * n_samples # 5 percent error allowed ideal_n_labels_per_fold = n_samples // n_folds X = y = np.ones(n_samples) # Get the test fold indices from the test set indices of each fold folds = np.zeros(n_samples) for i, (_, test) in enumerate(lkf.split(X, y, labels)): folds[test] = i # Check that folds have approximately the same size assert_equal(len(folds), len(labels)) for i in np.unique(folds): assert_greater_equal(tolerance, abs(sum(folds == i) - ideal_n_labels_per_fold)) # Check that each label appears only in 1 fold with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) for label in np.unique(labels): assert_equal(len(np.unique(folds[labels == label])), 1) # Check that no label is on both sides of the split labels = np.asarray(labels, dtype=object) for train, test in lkf.split(X, y, labels): assert_equal(len(np.intersect1d(labels[train], labels[test])), 0) # Should fail if there are more folds than labels labels = np.array([1, 1, 1, 2, 2]) X = y = np.ones(len(labels)) assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", next, LabelKFold(n_folds=3).split(X, y, labels))
def test_pca_initialization_not_compatible_with_precomputed_kernel(): # Precomputed distance matrices must be square matrices. tsne = TSNE(metric="precomputed", init="pca") assert_raises_regexp(ValueError, "The parameter init=\"pca\" cannot be " "used with metric=\"precomputed\".", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_method_not_available(): # 'nethod' must be 'barnes_hut' or 'exact' tsne = TSNE(method='not available') assert_raises_regexp(ValueError, "'method' must be 'barnes_hut' or ", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_early_exaggeration_too_small(): # Early exaggeration factor must be >= 1. tsne = TSNE(early_exaggeration=0.99) assert_raises_regexp(ValueError, "early_exaggeration .*", tsne.fit_transform, np.array([[0.0], [0.0]]))
def test_angle_out_of_range_checks(): # check the angle parameter range for angle in [-1, -1e-6, 1 + 1e-6, 2]: tsne = TSNE(angle=angle) assert_raises_regexp(ValueError, "'angle' must be between 0.0 - 1.0", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_lda_negative_input(): # test pass dense matrix with sparse negative input. X = -np.ones((5, 10)) lda = LatentDirichletAllocation() regex = r"^Negative values in data passed" assert_raises_regexp(ValueError, regex, lda.fit, X)
def test_n_components_range(): # barnes_hut method should only be used with n_components <= 3 tsne = TSNE(n_components=4, method="barnes_hut") assert_raises_regexp(ValueError, "'n_components' should be .*", tsne.fit_transform, np.array([[0.0], [1.0]]))
def test_error(): # Test for appropriate exception on errors msg = "Penalty term must be positive" assert_raises_regexp(ValueError, msg, LogisticRegression(C=-1).fit, X, Y1) assert_raises_regexp(ValueError, msg, LogisticRegression(C="test").fit, X, Y1) msg = "Tolerance for stopping criteria must be positive" assert_raises_regexp(ValueError, msg, LogisticRegression(tol=-1).fit, X, Y1) assert_raises_regexp(ValueError, msg, LogisticRegression(tol="test").fit, X, Y1) msg = "Maximum number of iteration must be positive" assert_raises_regexp(ValueError, msg, LogisticRegression(max_iter=-1).fit, X, Y1) assert_raises_regexp(ValueError, msg, LogisticRegression(max_iter="test").fit, X, Y1)