def test_group_shuffle_split(): for groups_i in test_groups: X = y = np.ones(len(groups_i)) n_splits = 6 test_size = 1./3 slo = GroupShuffleSplit(n_splits, test_size=test_size, random_state=0) # Make sure the repr works repr(slo) # Test that the length is correct assert_equal(slo.get_n_splits(X, y, groups=groups_i), n_splits) l_unique = np.unique(groups_i) l = np.asarray(groups_i) for train, test in slo.split(X, y, groups=groups_i): # First test: no train group is in the test set and vice versa l_train_unique = np.unique(l[train]) l_test_unique = np.unique(l[test]) assert_false(np.any(np.in1d(l[train], l_test_unique))) assert_false(np.any(np.in1d(l[test], l_train_unique))) # Second test: train and test add up to all the data assert_equal(l[train].size + l[test].size, l.size) # Third test: train and test are disjoint assert_array_equal(np.intersect1d(train, test), []) # Fourth test: # unique train and test groups are correct, +- 1 for rounding error assert_true(abs(len(l_test_unique) - round(test_size * len(l_unique))) <= 1) assert_true(abs(len(l_train_unique) - round((1.0 - test_size) * len(l_unique))) <= 1)
def test_predict_proba_disabled(): """Test predict_proba when disabled on estimator.""" X = np.arange(20).reshape(5, -1) y = [0, 0, 1, 1, 1] clf = SVC(probability=False) gs = GridSearchCV(clf, {}, cv=2).fit(X, y) assert_false(hasattr(gs, "predict_proba"))
def fit(self, X_subset, y_subset): assert_false( hasattr(self, 'fit_called_'), 'fit is called the second time' ) self.fit_called_ = True return super(type(self), self).fit(X_subset, y_subset)
def test_lda_store_covariance(): # Test for slover 'lsqr' and 'eigen' # 'store_covariance' has no effect on 'lsqr' and 'eigen' solvers for solver in ('lsqr', 'eigen'): clf = LinearDiscriminantAnalysis(solver=solver).fit(X6, y6) assert_true(hasattr(clf, 'covariance_')) # Test the actual attribute: clf = LinearDiscriminantAnalysis(solver=solver, store_covariance=True).fit(X6, y6) assert_true(hasattr(clf, 'covariance_')) assert_array_almost_equal( clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]]) ) # Test for SVD slover, the default is to not set the covariances_ attribute clf = LinearDiscriminantAnalysis(solver='svd').fit(X6, y6) assert_false(hasattr(clf, 'covariance_')) # Test the actual attribute: clf = LinearDiscriminantAnalysis(solver=solver, store_covariance=True).fit(X6, y6) assert_true(hasattr(clf, 'covariance_')) assert_array_almost_equal( clf.covariance_, np.array([[0.422222, 0.088889], [0.088889, 0.533333]]) )
def test_as_float_array(): # Test function for as_float_array X = np.ones((3, 10), dtype=np.int32) X = X + np.arange(10, dtype=np.int32) # Checks that the return type is ok X2 = as_float_array(X, copy=False) np.testing.assert_equal(X2.dtype, np.float32) # Another test X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten assert_true(as_float_array(X, False) is not X) # Checking that the new type is ok np.testing.assert_equal(X2.dtype, np.float64) # Here, X is of the right type, it shouldn't be modified X = np.ones((3, 2), dtype=np.float32) assert_true(as_float_array(X, copy=False) is X) # Test that if X is fortran ordered it stays X = np.asfortranarray(X) assert_true(np.isfortran(as_float_array(X, copy=True))) # Test the copy parameter with some matrices matrices = [ np.matrix(np.arange(5)), sp.csc_matrix(np.arange(5)).toarray(), sparse_random_matrix(10, 10, density=0.10).toarray() ] for M in matrices: N = as_float_array(M, copy=True) N[0, 0] = np.nan assert_false(np.isnan(M).any())
def test_unsorted_indices(): # test that the result with sorted and unsorted indices in csr is the same # we use a subset of digits as iris, blobs or make_classification didn't # show the problem digits = load_digits() X, y = digits.data[:50], digits.target[:50] X_test = sparse.csr_matrix(digits.data[50:100]) X_sparse = sparse.csr_matrix(X) coef_dense = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X, y).coef_ sparse_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse, y) coef_sorted = sparse_svc.coef_ # make sure dense and sparse SVM give the same result assert_array_almost_equal(coef_dense, coef_sorted.toarray()) X_sparse_unsorted = X_sparse[np.arange(X.shape[0])] X_test_unsorted = X_test[np.arange(X_test.shape[0])] # make sure we scramble the indices assert_false(X_sparse_unsorted.has_sorted_indices) assert_false(X_test_unsorted.has_sorted_indices) unsorted_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse_unsorted, y) coef_unsorted = unsorted_svc.coef_ # make sure unsorted indices give same result assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray()) assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted), sparse_svc.predict_proba(X_test))
def test_scale_function_without_centering(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_scaled = scale(X, with_mean=False) assert_false(np.any(np.isnan(X_scaled))) X_csr_scaled = scale(X_csr, with_mean=False) assert_false(np.any(np.isnan(X_csr_scaled.data))) # test csc has same outcome X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) # raises value error on axis != 0 assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1) assert_array_almost_equal(X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is not X) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
def test_pickle_version_warning(): # check that warnings are raised when unpickling in a different version # first, check no warning when in the same version: iris = datasets.load_iris() tree = DecisionTreeClassifier().fit(iris.data, iris.target) tree_pickle = pickle.dumps(tree) assert_true(b"version" in tree_pickle) assert_no_warnings(pickle.loads, tree_pickle) # check that warning is raised on different version tree_pickle_other = tree_pickle.replace(sklearn.__version__.encode(), b"something") message = ("Trying to unpickle estimator DecisionTreeClassifier from " "version {0} when using version {1}. This might lead to " "breaking code or invalid results. " "Use at your own risk.".format("something", sklearn.__version__)) assert_warns_message(UserWarning, message, pickle.loads, tree_pickle_other) # check that not including any version also works: # TreeNoVersion has no getstate, like pre-0.18 tree = TreeNoVersion().fit(iris.data, iris.target) tree_pickle_noversion = pickle.dumps(tree) assert_false(b"version" in tree_pickle_noversion) message = message.replace("something", "pre-0.18") message = message.replace("DecisionTreeClassifier", "TreeNoVersion") # check we got the warning about using pre-0.18 pickle assert_warns_message(UserWarning, message, pickle.loads, tree_pickle_noversion) # check that no warning is raised for external estimators TreeNoVersion.__module__ = "notsklearn" assert_no_warnings(pickle.loads, tree_pickle_noversion)
def test_sgd_optimizer_trigger_stopping(): params = [np.zeros(shape) for shape in shapes] lr = 2e-6 optimizer = SGDOptimizer(params, lr, lr_schedule='adaptive') assert_false(optimizer.trigger_stopping('', False)) assert_equal(lr / 5, optimizer.learning_rate) assert_true(optimizer.trigger_stopping('', False))
def test_ovo_partial_fit_predict(): X, y = shuffle(iris.data, iris.target) ovo1 = OneVsOneClassifier(MultinomialNB()) ovo1.partial_fit(X[:100], y[:100], np.unique(y)) ovo1.partial_fit(X[100:], y[100:]) pred1 = ovo1.predict(X) ovo2 = OneVsOneClassifier(MultinomialNB()) ovo2.fit(X, y) pred2 = ovo2.predict(X) assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2) assert_greater(np.mean(y == pred1), 0.65) assert_almost_equal(pred1, pred2) # Test when mini-batches don't have all target classes ovo1 = OneVsOneClassifier(MultinomialNB()) ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target)) ovo1.partial_fit(iris.data[60:], iris.target[60:]) pred1 = ovo1.predict(iris.data) ovo2 = OneVsOneClassifier(MultinomialNB()) pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data) assert_almost_equal(pred1, pred2) assert_equal(len(ovo1.estimators_), len(np.unique(iris.target))) assert_greater(np.mean(iris.target == pred1), 0.65) # test partial_fit only exists if estimator has it: ovr = OneVsOneClassifier(SVC()) assert_false(hasattr(ovr, "partial_fit"))
def test_error(): # Test that it gives proper exception on deficient input. X, y = iris.data, iris.target base = DecisionTreeClassifier() # Test max_samples assert_raises(ValueError, BaggingClassifier(base, max_samples=-1).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_samples=0.0).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_samples=2.0).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_samples=1000).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_samples="foobar").fit, X, y) # Test max_features assert_raises(ValueError, BaggingClassifier(base, max_features=-1).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_features=0.0).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_features=2.0).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_features=5).fit, X, y) assert_raises(ValueError, BaggingClassifier(base, max_features="foobar").fit, X, y) # Test support of decision_function assert_false(hasattr(BaggingClassifier(base).fit(X, y), 'decision_function'))
def test_feature_names(): cv = CountVectorizer(max_df=0.5) # test for Value error on unfitted/empty vocabulary assert_raises(ValueError, cv.get_feature_names) assert_false(cv.fixed_vocabulary_) # test for vocabulary learned from data X = cv.fit_transform(ALL_FOOD_DOCS) n_samples, n_features = X.shape assert_equal(len(cv.vocabulary_), n_features) feature_names = cv.get_feature_names() assert_equal(len(feature_names), n_features) assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water'], feature_names) for idx, name in enumerate(feature_names): assert_equal(idx, cv.vocabulary_.get(name)) # test for custom vocabulary vocab = ['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water'] cv = CountVectorizer(vocabulary=vocab) feature_names = cv.get_feature_names() assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad', 'sparkling', 'tomato', 'water'], feature_names) assert_true(cv.fixed_vocabulary_) for idx, name in enumerate(feature_names): assert_equal(idx, cv.vocabulary_.get(name))
def test_nmf_fit_nn_output(): # Test that the decomposition does not contain negative values A = np.c_[5 * np.ones(5) - np.arange(1, 6), 5 * np.ones(5) + np.arange(1, 6)] for init in (None, "nndsvd", "nndsvda", "nndsvdar"): model = NMF(n_components=2, init=init, random_state=0) transf = model.fit_transform(A) assert_false((model.components_ < 0).any() or (transf < 0).any())
def test_set_params(): """set_params should be able to set estimators""" clf1 = LogisticRegression(random_state=123, C=1.0) clf2 = RandomForestClassifier(random_state=123, max_depth=None) clf3 = GaussianNB() eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft', weights=[1, 2]) assert_true('lr' in eclf1.named_estimators) assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1]) assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr']) eclf1.fit(X, y) assert_true('lr' in eclf1.named_estimators_) assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0]) assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']) eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft', weights=[1, 2]) eclf2.set_params(nb=clf2).fit(X, y) assert_false(hasattr(eclf2, 'nb')) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params()) assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params()) eclf1.set_params(lr__C=10.0) eclf2.set_params(nb__max_depth=5) assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0) assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5) assert_equal(eclf1.get_params()["lr__C"], eclf1.get_params()["lr"].get_params()['C'])
def test_column_transformer_get_set_params(): ct = ColumnTransformer([('trans1', StandardScaler(), [0]), ('trans2', StandardScaler(), [1])]) exp = {'n_jobs': 1, 'remainder': 'drop', 'trans1': ct.transformers[0][1], 'trans1__copy': True, 'trans1__with_mean': True, 'trans1__with_std': True, 'trans2': ct.transformers[1][1], 'trans2__copy': True, 'trans2__with_mean': True, 'trans2__with_std': True, 'transformers': ct.transformers, 'transformer_weights': None} assert_dict_equal(ct.get_params(), exp) ct.set_params(trans1__with_mean=False) assert_false(ct.get_params()['trans1__with_mean']) ct.set_params(trans1='passthrough') exp = {'n_jobs': 1, 'remainder': 'drop', 'trans1': 'passthrough', 'trans2': ct.transformers[1][1], 'trans2__copy': True, 'trans2__with_mean': True, 'trans2__with_std': True, 'transformers': ct.transformers, 'transformer_weights': None} assert_dict_equal(ct.get_params(), exp)
def check_estimators_overwrite_params(name, Estimator, X, y): with warnings.catch_warnings(record=True): # catch deprecation warnings estimator = Estimator() if hasattr(estimator, 'batch_size'): # FIXME # for MiniBatchDictLearning estimator.batch_size = 1 if name in ['GaussianRandomProjection', 'SparseRandomProjection']: # Due to the jl lemma and very few samples, the number # of components of the random matrix projection will be # greater # than the number of features. # So we impose a smaller number (avoid "auto" mode) estimator = Estimator(n_components=1) elif name == "SelectKBest": estimator = Estimator(k=1) set_random_state(estimator) params = estimator.get_params() estimator.fit(X, y) new_params = estimator.get_params() for k, v in params.items(): assert_false(np.any(new_params[k] != v), "Estimator %s changes its parameter %s" " from %s to %s during fit." % (name, k, v, new_params[k]))
def test_check_increasing_down(): x = [0, 1, 2, 3, 4, 5] y = [0, -1.5, -2.77, -8.99, -8.99, -50] # Check that we got increasing=False and no warnings is_increasing = assert_no_warnings(check_increasing, x, y) assert_false(is_increasing)
def test_initialize_nn_output(): # Test that initialization does not return negative values rng = np.random.mtrand.RandomState(42) data = np.abs(rng.randn(10, 10)) for init in ('random', 'nndsvd', 'nndsvda', 'nndsvdar'): W, H = nmf._initialize_nmf(data, 10, init=init, random_state=0) assert_false((W < 0).any() or (H < 0).any())
def test_optimizer_copy(acq_func): # Checks that the base estimator, the objective and target values # are copied correctly. base_estimator = ExtraTreesRegressor(random_state=2) opt = Optimizer([(-2.0, 2.0)], base_estimator, acq_func=acq_func, n_initial_points=1, acq_optimizer="sampling") # run three iterations so that we have some points and objective values if "ps" in acq_func: opt.run(bench1_with_time, n_iter=3) else: opt.run(bench1, n_iter=3) opt_copy = opt.copy() copied_estimator = opt_copy.base_estimator_ if "ps" in acq_func: assert_true(isinstance(copied_estimator, MultiOutputRegressor)) # check that the base_estimator is not wrapped multiple times is_multi = isinstance(copied_estimator.estimator, MultiOutputRegressor) assert_false(is_multi) else: assert_false(isinstance(copied_estimator, MultiOutputRegressor)) assert_array_equal(opt_copy.Xi, opt.Xi) assert_array_equal(opt_copy.yi, opt.yi)
def test_pipeline_memory(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: memory = Memory(cachedir=cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma='scale', probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the transformer in the cached pipeline ts = cached_pipe.named_steps['transf'].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert_false(hasattr(transf, 'means_')) # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_) # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma='scale', probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_) finally: shutil.rmtree(cachedir)
def test_check_is_permutation(): p = np.arange(100) assert_true(_check_is_permutation(p, 100)) assert_false(_check_is_permutation(np.delete(p, 23), 100)) p[0] = 23 assert_false(_check_is_permutation(p, 100))
def test_check_is_partition(): p = np.arange(100) assert_true(cval._check_is_partition(p, 100)) assert_false(cval._check_is_partition(np.delete(p, 23), 100)) p[0] = 23 assert_false(cval._check_is_partition(p, 100))
def test_randomized_svd_sign_flip_with_transpose(): # Check if the randomized_svd sign flipping is always done based on u # irrespective of transpose. # See https://github.com/scikit-learn/scikit-learn/issues/5608 # for more details. def max_loading_is_positive(u, v): """ returns bool tuple indicating if the values maximising np.abs are positive across all rows for u and across all columns for v. """ u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all() v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all() return u_based, v_based mat = np.arange(10 * 8).reshape(10, -1) # Without transpose u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True) u_based, v_based = max_loading_is_positive(u_flipped, v_flipped) assert_true(u_based) assert_false(v_based) # With transpose u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd( mat, 3, flip_sign=True, transpose=True) u_based, v_based = max_loading_is_positive( u_flipped_with_transpose, v_flipped_with_transpose) assert_true(u_based) assert_false(v_based)
def test_mutual_info_options(): X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=float) y = np.array([0, 1, 2, 2, 1], dtype=float) X_csr = csr_matrix(X) for mutual_info in (mutual_info_regression, mutual_info_classif): assert_raises(ValueError, mutual_info_regression, X_csr, y, discrete_features=False) mi_1 = mutual_info(X, y, discrete_features='auto', random_state=0) mi_2 = mutual_info(X, y, discrete_features=False, random_state=0) mi_3 = mutual_info(X_csr, y, discrete_features='auto', random_state=0) mi_4 = mutual_info(X_csr, y, discrete_features=True, random_state=0) assert_array_equal(mi_1, mi_2) assert_array_equal(mi_3, mi_4) assert_false(np.allclose(mi_1, mi_3))
def test_ovr_partial_fit(): # Test if partial_fit is working as intended X, y = shuffle(iris.data, iris.target, random_state=0) ovr = OneVsRestClassifier(MultinomialNB()) ovr.partial_fit(X[:100], y[:100], np.unique(y)) ovr.partial_fit(X[100:], y[100:]) pred = ovr.predict(X) ovr2 = OneVsRestClassifier(MultinomialNB()) pred2 = ovr2.fit(X, y).predict(X) assert_almost_equal(pred, pred2) assert_equal(len(ovr.estimators_), len(np.unique(y))) assert_greater(np.mean(y == pred), 0.65) # Test when mini batches doesn't have all classes # with SGDClassifier X = np.abs(np.random.randn(14, 2)) y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3] ovr = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)) ovr.partial_fit(X[:7], y[:7], np.unique(y)) ovr.partial_fit(X[7:], y[7:]) pred = ovr.predict(X) ovr1 = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0)) pred1 = ovr1.fit(X, y).predict(X) assert_equal(np.mean(pred == y), np.mean(pred1 == y)) # test partial_fit only exists if estimator has it: ovr = OneVsRestClassifier(SVC()) assert_false(hasattr(ovr, "partial_fit"))
def test_check_increasing_down_extreme(): x = [0, 1, 2, 3, 4, 5] y = [0, -1, -2, -3, -4, -5] # Check that we got increasing=False and no warnings is_increasing = assert_no_warnings(check_increasing, x, y) assert_false(is_increasing)
def check_estimators_overwrite_params(name, Estimator): X, y = make_blobs(random_state=0, n_samples=9) y = multioutput_estimator_convert_y_2d(name, y) # some want non-negative input X -= X.min() with warnings.catch_warnings(record=True): # catch deprecation warnings estimator = Estimator() if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA': # FIXME # for MiniBatchDictLearning and MiniBatchSparsePCA estimator.batch_size = 1 set_fast_parameters(estimator) set_random_state(estimator) params = estimator.get_params() estimator.fit(X, y) new_params = estimator.get_params() for k, v in params.items(): assert_false(np.any(new_params[k] != v), "Estimator %s changes its parameter %s" " from %s to %s during fit." % (name, k, v, new_params[k]))
def test_pipeline_init(self): # Test the various init parameters of the pipeline. assert_raises(TypeError, SparkPipeline) # Check that we can't instantiate pipelines with objects without fit # method pipe = assert_raises(TypeError, SparkPipeline, [('svc', IncorrectT)]) # Smoke test with only an estimator clf = T() pipe = SparkPipeline([('svc', clf)]) assert_equal(pipe.get_params(deep=True), dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False) )) # Check that params are set pipe.set_params(svc__a=0.1) assert_equal(clf.a, 0.1) assert_equal(clf.b, None) # Smoke test the repr: repr(pipe) # Test with two objects vect = SparkCountVectorizer() filter = SparkVarianceThreshold() pipe = SparkPipeline([('vect', vect), ('filter', filter)]) # Check that we can't use the same stage name twice assert_raises(ValueError, SparkPipeline, [('vect', vect), ('vect', vect)]) # Check that params are set pipe.set_params(vect__min_df=0.1) assert_equal(vect.min_df, 0.1) # Smoke test the repr: repr(pipe) # Check that params are not set when naming them wrong assert_raises(ValueError, pipe.set_params, filter__min_df=0.1) # Test clone pipe2 = clone(pipe) assert_false(pipe.named_steps['vect'] is pipe2.named_steps['vect']) # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) params2 = pipe2.get_params(deep=True) for x in pipe.get_params(deep=False): params.pop(x) for x in pipe2.get_params(deep=False): params2.pop(x) # Remove estimators that where copied params.pop('vect') params.pop('filter') params2.pop('vect') params2.pop('filter') assert_equal(params, params2)
def test_transform_nan(): # Test that SparsePCA won't return NaN when there is 0 feature in all # samples. rng = np.random.RandomState(0) Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array Y[:, 0] = 0 estimator = SparsePCA(n_components=8) assert_false(np.any(np.isnan(estimator.fit_transform(Y))))
def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = Imputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_false(np.all(X == Xt)) # copy=True, sparse csr => copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = Imputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_true(np.all(X == Xt)) # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_true(np.all(X.data == Xt.data)) # copy=False, sparse csc, axis=0 => no copy X = X_orig.copy().tocsc() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_true(np.all(X.data == Xt.data)) # copy=False, sparse csr, axis=0 => copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=0) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, sparse csc, axis=1 => copy X = X_orig.copy().tocsc() imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, sparse csr, axis=1, missing_values=0 => copy X = X_orig.copy() imputer = Imputer(missing_values=0, strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) assert_false(sparse.issparse(Xt))
def check_oob_score_raise_error(name): ForestEstimator = FOREST_ESTIMATORS[name] if name in FOREST_TRANSFORMERS: for oob_score in [True, False]: assert_raises(TypeError, ForestEstimator, oob_score=oob_score) assert_raises(NotImplementedError, ForestEstimator()._set_oob_score, X, y) else: # Unfitted / no bootstrap / no oob_score for oob_score, bootstrap in [(True, False), (False, True), (False, False)]: est = ForestEstimator(oob_score=oob_score, bootstrap=bootstrap, random_state=0) assert_false(hasattr(est, "oob_score_")) # No bootstrap assert_raises(ValueError, ForestEstimator(oob_score=True, bootstrap=False).fit, X, y)
def test_plain_has_no_average_attr(self): clf = self.factory(average=True, eta0=.01) clf.fit(X, Y) assert_true(hasattr(clf, 'average_coef_')) assert_true(hasattr(clf, 'average_intercept_')) assert_true(hasattr(clf, 'standard_intercept_')) assert_true(hasattr(clf, 'standard_coef_')) clf = self.factory() clf.fit(X, Y) assert_false(hasattr(clf, 'average_coef_')) assert_false(hasattr(clf, 'average_intercept_')) assert_false(hasattr(clf, 'standard_intercept_')) assert_false(hasattr(clf, 'standard_coef_'))
def test_vectorizer_pipeline_grid_selection(): # raw documents data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS # label junk food as -1, the others as +1 target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS) # split the dataset for model development and final evaluation train_data, test_data, target_train, target_test = train_test_split( data, target, test_size=.1, random_state=0) pipeline = Pipeline([('vect', TfidfVectorizer()), ('svc', LinearSVC())]) parameters = { 'vect__ngram_range': [(1, 1), (1, 2)], 'vect__norm': ('l1', 'l2'), 'svc__loss': ('hinge', 'squared_hinge'), } # find the best parameters for both the feature extraction and the # classifier grid_search = GridSearchCV(pipeline, parameters, n_jobs=1) # Check that the best model found by grid search is 100% correct on the # held out evaluation set. pred = grid_search.fit(train_data, target_train).predict(test_data) assert_array_equal(pred, target_test) # on this toy dataset bigram representation which is used in the last of # the grid_search is considered the best estimator since they all converge # to 100% accuracy models assert_equal(grid_search.best_score_, 1.0) best_vectorizer = grid_search.best_estimator_.named_steps['vect'] assert_equal(best_vectorizer.ngram_range, (1, 1)) assert_equal(best_vectorizer.norm, 'l2') assert_false(best_vectorizer.fixed_vocabulary_)
def test_cv_iterable_wrapper(): y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) with warnings.catch_warnings(record=True): from sklearn.cross_validation import StratifiedKFold as OldSKF cv = OldSKF(y_multiclass, n_folds=3) wrapped_old_skf = _CVIterableWrapper(cv) # Check if split works correctly np.testing.assert_equal(list(cv), list(wrapped_old_skf.split())) # Check if get_n_splits works correctly assert_equal(len(cv), wrapped_old_skf.get_n_splits()) kf_iter = KFold(n_splits=5).split(X, y) kf_iter_wrapped = check_cv(kf_iter) # Since the wrapped iterable is enlisted and stored, # split can be called any number of times to produce # consistent results. np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)), list(kf_iter_wrapped.split(X, y))) # If the splits are randomized, successive calls to split yields different # results kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)), list(kf_randomized_iter_wrapped.split(X, y))) try: np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)), list(kf_randomized_iter_wrapped.split(X, y))) splits_are_equal = True except AssertionError: splits_are_equal = False assert_false(splits_are_equal, "If the splits are randomized, " "successive calls to split should yield different results")
def test_imputation_copy(): """Test imputation with copy=True.""" l = 5 # Test default behaviour and with copy=True for params in [{}, {'copy': True}]: X = sparse_random_matrix(l, l, density=0.75, random_state=0) # Dense imputer = Imputer(missing_values=0, strategy="mean", **params) Xt = imputer.fit(X).transform(X) Xt[0, 0] = np.nan # Check that the objects are different and that they don't use # the same buffer assert_false(np.all(X.todense() == Xt)) # Sparse imputer = Imputer(missing_values=0, strategy="mean", **params) X = X.todense() Xt = imputer.fit(X).transform(X) Xt[0, 0] = np.nan # Check that the objects are different and that they don't use # the same buffer assert_false(np.all(X == Xt))
def test_label_binarizer_set_label_encoding(): lb = LabelBinarizer(neg_label=-2, pos_label=0) # two-class case with pos_label=0 inp = np.array([0, 1, 1, 0]) expected = np.array([[-2, 0, 0, -2]]).T got = lb.fit_transform(inp) assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_")) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp) lb = LabelBinarizer(neg_label=-2, pos_label=2) # multi-class case inp = np.array([3, 2, 1, 2, 0]) expected = np.array([[-2, -2, -2, +2], [-2, -2, +2, -2], [-2, +2, -2, -2], [-2, -2, +2, -2], [+2, -2, -2, -2]]) got = lb.fit_transform(inp) assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_")) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp)
def test_random_search_results(): # Make a dataset with a lot of noise to get various kind of prediction # errors across CV folds and parameter settings X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0) # scipy.stats dists now supports `seed` but we still support scipy 0.12 # which doesn't support the seed. Hence the assertions in the test for # random_search alone should not depend on randomization. n_folds = 3 n_search_iter = 30 params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, iid=False, param_distributions=params) random_search.fit(X, y) random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, iid=True, param_distributions=params) random_search_iid.fit(X, y) param_keys = ('param_C', 'param_gamma') score_keys = ('test_mean_score', 'test_rank_score', 'test_split0_score', 'test_split1_score', 'test_split2_score', 'test_std_score') n_cand = n_search_iter for search, iid in zip((random_search, random_search_iid), (False, True)): assert_equal(iid, search.iid) results = search.results_ # Check results structure check_results_array_types(results, param_keys, score_keys) check_results_keys(results, param_keys, score_keys, n_cand) # For random_search, all the param array vals should be unmasked assert_false(any(results['param_C'].mask) or any(results['param_gamma'].mask)) check_results_grid_scores_consistency(search)
def test_column_transformer_get_set_params(): ct = ColumnTransformer([('trans1', StandardScaler(), [0]), ('trans2', StandardScaler(), [1])]) exp = {'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'trans1': ct.transformers[0][1], 'trans1__copy': True, 'trans1__with_mean': True, 'trans1__with_std': True, 'trans2': ct.transformers[1][1], 'trans2__copy': True, 'trans2__with_mean': True, 'trans2__with_std': True, 'transformers': ct.transformers, 'transformer_weights': None} assert_dict_equal(ct.get_params(), exp) ct.set_params(trans1__with_mean=False) assert_false(ct.get_params()['trans1__with_mean']) ct.set_params(trans1='passthrough') exp = {'n_jobs': None, 'remainder': 'drop', 'sparse_threshold': 0.3, 'trans1': 'passthrough', 'trans2': ct.transformers[1][1], 'trans2__copy': True, 'trans2__with_mean': True, 'trans2__with_std': True, 'transformers': ct.transformers, 'transformer_weights': None} assert_dict_equal(ct.get_params(), exp)
def test_equal_similarities_and_preferences(): # Unequal distances X = np.array([[0, 0], [1, 1], [-2, -2]]) S = -euclidean_distances(X, squared=True) assert_false(_equal_similarities_and_preferences(S, np.array(0))) assert_false(_equal_similarities_and_preferences(S, np.array([0, 0]))) assert_false(_equal_similarities_and_preferences(S, np.array([0, 1]))) # Equal distances X = np.array([[0, 0], [1, 1]]) S = -euclidean_distances(X, squared=True) # Different preferences assert_false(_equal_similarities_and_preferences(S, np.array([0, 1]))) # Same preferences assert _equal_similarities_and_preferences(S, np.array([0, 0])) assert _equal_similarities_and_preferences(S, np.array(0))
def test_check_is_permutation(): rng = np.random.RandomState(0) p = np.arange(100) rng.shuffle(p) assert_true(_check_is_permutation(p, 100)) assert_false(_check_is_permutation(np.delete(p, 23), 100)) p[0] = 23 assert_false(_check_is_permutation(p, 100)) # Check if the additional duplicate indices are caught assert_false(_check_is_permutation(np.hstack((p, 0)), 100))
def test_forest_attributes(): mr = MondrianForestRegressor(n_estimators=5, random_state=0) mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2]) assert_false(hasattr(mr, "classes_")) assert_false(hasattr(mr, "n_classes_")) mr.partial_fit([[1, 2, 3], [4, 5, 6]], [1, 2]) assert_false(hasattr(mr, "classes_")) assert_false(hasattr(mr, "n_classes_")) mr = MondrianForestClassifier(n_estimators=5, random_state=0) mr.fit([[1, 2, 3], [4, 5, 6]], [1, 2]) assert_true(hasattr(mr, "classes_")) assert_true(hasattr(mr, "n_classes_")) mr = MondrianForestClassifier(n_estimators=5, random_state=0) mr.partial_fit([[1, 2, 3], [4, 5, 6]], [1, 2]) assert_true(hasattr(mr, "classes_")) assert_true(hasattr(mr, "n_classes_"))
def test_single_validate_constraint(): # Test categorical cons = Single(0,1.0,'categorical') assert_true(cons._validate_constraint(1.0)) assert_false(cons._validate_constraint(1.1)) cons = Single(0,'a','categorical') assert_true(cons._validate_constraint('a')) assert_false(cons._validate_constraint('b')) # Test real cons = Single(0,1.0,'real') assert_true(cons._validate_constraint(1.0)) assert_false(cons._validate_constraint(1.1)) # Test integer cons = Single(0,1,'integer') assert_true(cons._validate_constraint(1)) assert_false(cons._validate_constraint(2))
def test_ovr_multilabel_predict_proba(): base_clf = MultinomialNB(alpha=1) for au in (False, True): X, Y = datasets.make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=au, random_state=0) X_train, Y_train = X[:80], Y[:80] X_test = X[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # Decision function only estimator. decision_only = OneVsRestClassifier(svm.SVR(gamma='scale') ).fit(X_train, Y_train) assert_false(hasattr(decision_only, 'predict_proba')) # Estimator with predict_proba disabled, depending on parameters. decision_only = OneVsRestClassifier(svm.SVC(gamma='scale', probability=False)) assert_false(hasattr(decision_only, 'predict_proba')) decision_only.fit(X_train, Y_train) assert_false(hasattr(decision_only, 'predict_proba')) assert_true(hasattr(decision_only, 'decision_function')) # Estimator which can get predict_proba enabled after fitting gs = GridSearchCV(svm.SVC(gamma='scale', probability=False), param_grid={'probability': [True]}) proba_after_fit = OneVsRestClassifier(gs) assert_false(hasattr(proba_after_fit, 'predict_proba')) proba_after_fit.fit(X_train, Y_train) assert_true(hasattr(proba_after_fit, 'predict_proba')) Y_pred = clf.predict(X_test) Y_proba = clf.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. pred = Y_proba > .5 assert_array_equal(pred, Y_pred)
def test_scaler_without_centering(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal( X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_false(np.all(X == Xt)) # copy=True, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 assert_array_almost_equal(X, Xt) # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_array_almost_equal(X.data, Xt.data) # copy=False, sparse csc => no copy X = X_orig.copy().tocsc() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_array_almost_equal(X.data, Xt.data) # copy=False, sparse csr => copy X = X_orig.copy() imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data))
def test_label_binarizer_column_y(): # first for binary classification vs multi-label with 1 possible class # lists are multi-label, array is multi-class :-/ inp_list = [[1], [2], [1]] inp_array = np.array(inp_list) multilabel_indicator = np.array([[1, 0], [0, 1], [1, 0]]) binaryclass_array = np.array([[0], [1], [0]]) lb_1 = LabelBinarizer() out_1 = lb_1.fit_transform(inp_list) lb_2 = LabelBinarizer() out_2 = lb_2.fit_transform(inp_array) assert_array_equal(out_1, multilabel_indicator) assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_")) assert_false(assert_warns(DeprecationWarning, getattr, lb_1, "indicator_matrix_")) assert_array_equal(out_2, binaryclass_array) assert_false(assert_warns(DeprecationWarning, getattr, lb_2, "multilabel_")) # second for multiclass classification vs multi-label with multiple # classes inp_list = [[1], [2], [1], [3]] inp_array = np.array(inp_list) # the indicator matrix output is the same in this case indicator = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]]) lb_1 = LabelBinarizer() out_1 = lb_1.fit_transform(inp_list) lb_2 = LabelBinarizer() out_2 = lb_2.fit_transform(inp_array) assert_array_equal(out_1, out_2) assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_")) assert_array_equal(out_2, indicator) assert_false(assert_warns(DeprecationWarning, getattr, lb_2, "multilabel_"))
def test_bad_input(): # Test that it gives proper exception on deficient input # impossible value of C assert_raises(ValueError, svm.SVC(C=-1).fit, X, Y) # impossible value of nu clf = svm.NuSVC(nu=0.0) assert_raises(ValueError, clf.fit, X, Y) Y2 = Y[:-1] # wrong dimensions for labels assert_raises(ValueError, clf.fit, X, Y2) # Test with arrays that are non-contiguous. for clf in (svm.SVC(), svm.LinearSVC(random_state=0)): Xf = np.asfortranarray(X) assert_false(Xf.flags['C_CONTIGUOUS']) yf = np.ascontiguousarray(np.tile(Y, (2, 1)).T) yf = yf[:, -1] assert_false(yf.flags['F_CONTIGUOUS']) assert_false(yf.flags['C_CONTIGUOUS']) clf.fit(Xf, yf) assert_array_equal(clf.predict(T), true_result) # error for precomputed kernelsx clf = svm.SVC(kernel='precomputed') assert_raises(ValueError, clf.fit, X, Y) # sample_weight bad dimensions clf = svm.SVC() assert_raises(ValueError, clf.fit, X, Y, sample_weight=range(len(X) - 1)) # predict with sparse input when trained with dense clf = svm.SVC().fit(X, Y) assert_raises(ValueError, clf.predict, sparse.lil_matrix(X)) Xt = np.array(X).T clf.fit(np.dot(X, Xt), Y) assert_raises(ValueError, clf.predict, X) clf = svm.SVC() clf.fit(X, Y) assert_raises(ValueError, clf.predict, Xt)
def test_astype_copy_memory(): a_int32 = np.ones(3, np.int32) # Check that dtype conversion works b_float32 = astype(a_int32, dtype=np.float32, copy=False) assert_equal(b_float32.dtype, np.float32) # Changing dtype forces a copy even if copy=False assert_false(np.may_share_memory(b_float32, a_int32)) # Check that copy can be skipped if requested dtype match c_int32 = astype(a_int32, dtype=np.int32, copy=False) assert_true(c_int32 is a_int32) # Check that copy can be forced, and is the case by default: d_int32 = astype(a_int32, dtype=np.int32, copy=True) assert_false(np.may_share_memory(d_int32, a_int32)) e_int32 = astype(a_int32, dtype=np.int32) assert_false(np.may_share_memory(e_int32, a_int32))
def test_label_binarizer(): lb = LabelBinarizer() # one-class case defaults to negative label inp = ["pos", "pos", "pos", "pos"] expected = np.array([[0, 0, 0, 0]]).T got = lb.fit_transform(inp) assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_")) assert_array_equal(lb.classes_, ["pos"]) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp) # two-class case inp = ["neg", "pos", "pos", "neg"] expected = np.array([[0, 1, 1, 0]]).T got = lb.fit_transform(inp) assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_")) assert_array_equal(lb.classes_, ["neg", "pos"]) assert_array_equal(expected, got) to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]]) assert_array_equal(lb.inverse_transform(to_invert), inp) # multi-class case inp = ["spam", "ham", "eggs", "ham", "0"] expected = np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]) got = lb.fit_transform(inp) assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam']) assert_false(assert_warns(DeprecationWarning, getattr, lb, "multilabel_")) assert_array_equal(expected, got) assert_array_equal(lb.inverse_transform(got), inp)
def test_metaestimator_delegation(): # Ensures specified metaestimators have methods iff subestimator does def hides(method): @property def wrapper(obj): if obj.hidden_method == method.__name__: raise AttributeError('%r is hidden' % obj.hidden_method) return functools.partial(method, obj) return wrapper class SubEstimator(BaseEstimator): def __init__(self, param=1, hidden_method=None): self.param = param self.hidden_method = hidden_method def fit(self, X, y=None, *args, **kwargs): self.coef_ = np.arange(X.shape[1]) return True def _check_fit(self): check_is_fitted(self, 'coef_') @hides def inverse_transform(self, X, *args, **kwargs): self._check_fit() return X @hides def transform(self, X, *args, **kwargs): self._check_fit() return X @hides def predict(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def predict_proba(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def predict_log_proba(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def decision_function(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def score(self, X, *args, **kwargs): self._check_fit() return 1.0 methods = [ k for k in iterkeys(SubEstimator.__dict__) if not k.startswith('_') and not k.startswith('fit') ] methods.sort() for delegator_data in DELEGATING_METAESTIMATORS: delegate = SubEstimator() delegator = delegator_data.construct(delegate) for method in methods: if method in delegator_data.skip_methods: continue assert_true(hasattr(delegate, method)) assert_true( hasattr(delegator, method), msg="%s does not have method %r when its delegate does" % (delegator_data.name, method)) # delegation before fit raises a NotFittedError assert_raises(NotFittedError, getattr(delegator, method), delegator_data.fit_args[0]) delegator.fit(*delegator_data.fit_args) for method in methods: if method in delegator_data.skip_methods: continue # smoke test delegation getattr(delegator, method)(delegator_data.fit_args[0]) for method in methods: if method in delegator_data.skip_methods: continue delegate = SubEstimator(hidden_method=method) delegator = delegator_data.construct(delegate) assert_false(hasattr(delegate, method)) assert_false(hasattr(delegator, method), msg="%s has method %r when its delegate does not" % (delegator_data.name, method))
def test_all_estimator_no_base_class(): # test that all_estimators doesn't find abstract classes. for name, Estimator in all_estimators(): msg = ("Base estimators such as {0} should not be included" " in all_estimators").format(name) assert_false(name.lower().startswith('base'), msg=msg)
def test_unfitted(): """Non-regression: before fit, there should be not fitted attributes.""" ms = MeanShift() assert_false(hasattr(ms, "cluster_centers_")) assert_false(hasattr(ms, "labels_"))
def test_search_iid_param(): # Test the IID parameter # noise-free simple 2d-data X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]], random_state=0, cluster_std=0.1, shuffle=False, n_samples=80) # split dataset into two folds that are not iid # first one contains data of all 4 blobs, second only from two. mask = np.ones(X.shape[0], dtype=np.bool) mask[np.where(y == 1)[0][::2]] = 0 mask[np.where(y == 2)[0][::2]] = 0 # this leads to perfect classification on one fold and a score of 1/3 on # the other # create "cv" for splits cv = [[mask, ~mask], [~mask, mask]] # once with iid=True (default) grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, cv=cv) random_search = RandomizedSearchCV(SVC(), n_iter=2, param_distributions={'C': [1, 10]}, cv=cv) for search in (grid_search, random_search): search.fit(X, y) assert_true(search.iid) # Test the first candidate cv_scores = np.array( list(search.cv_results_['split%d_test_score' % s][0] for s in range(search.n_splits_))) mean = search.cv_results_['mean_test_score'][0] std = search.cv_results_['std_test_score'][0] assert_equal(search.cv_results_['param_C'][0], 1) assert_array_almost_equal(cv_scores, [1, 1. / 3.]) # for first split, 1/4 of dataset is in test, for second 3/4. # take weighted average and weighted std expected_mean = 1 * 1. / 4. + 1. / 3. * 3. / 4. expected_std = np.sqrt(1. / 4 * (expected_mean - 1)**2 + 3. / 4 * (expected_mean - 1. / 3.)**2) assert_almost_equal(mean, expected_mean) assert_almost_equal(std, expected_std) # once with iid=False grid_search = GridSearchCV(SVC(), param_grid={'C': [1, 10]}, cv=cv, iid=False) random_search = RandomizedSearchCV(SVC(), n_iter=2, param_distributions={'C': [1, 10]}, cv=cv, iid=False) for search in (grid_search, random_search): search.fit(X, y) assert_false(search.iid) cv_scores = np.array( list(search.cv_results_['split%d_test_score' % s][0] for s in range(search.n_splits_))) mean = search.cv_results_['mean_test_score'][0] std = search.cv_results_['std_test_score'][0] assert_equal(search.cv_results_['param_C'][0], 1) # scores are the same as above assert_array_almost_equal(cv_scores, [1, 1. / 3.]) # Unweighted mean/std is used assert_almost_equal(mean, np.mean(cv_scores)) assert_almost_equal(std, np.std(cv_scores))
def test_pipeline_ducktyping(): pipeline = make_pipeline(Mult(5)) pipeline.predict pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf()) assert_false(hasattr(pipeline, 'predict')) pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(None) assert_false(hasattr(pipeline, 'predict')) pipeline.transform pipeline.inverse_transform pipeline = make_pipeline(Transf(), NoInvTransf()) assert_false(hasattr(pipeline, 'predict')) pipeline.transform assert_false(hasattr(pipeline, 'inverse_transform')) pipeline = make_pipeline(NoInvTransf(), Transf()) assert_false(hasattr(pipeline, 'predict')) pipeline.transform assert_false(hasattr(pipeline, 'inverse_transform'))
def test_pipeline_init(): # Test the various init parameters of the pipeline. assert_raises(TypeError, Pipeline) # Check that we can't instantiate pipelines with objects without fit # method assert_raises_regex( TypeError, 'Last step of Pipeline should implement fit. ' '.*NoFit.*', Pipeline, [('clf', NoFit())]) # Smoke test with only an estimator clf = NoTrans() pipe = Pipeline([('svc', clf)]) assert_equal( pipe.get_params(deep=True), dict(svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False))) # Check that params are set pipe.set_params(svc__a=0.1) assert_equal(clf.a, 0.1) assert_equal(clf.b, None) # Smoke test the repr: repr(pipe) # Test with two objects clf = SVC() filter1 = SelectKBest(f_classif) pipe = Pipeline([('anova', filter1), ('svc', clf)]) # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform assert_raises_regex( TypeError, 'All intermediate steps should be transformers' '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)]) # Check that params are set pipe.set_params(svc__C=0.1) assert_equal(clf.C, 0.1) # Smoke test the repr: repr(pipe) # Check that params are not set when naming them wrong assert_raises(ValueError, pipe.set_params, anova__C=0.1) # Test clone pipe2 = assert_no_warnings(clone, pipe) assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) # Check that apart from estimators, the parameters are the same params = pipe.get_params(deep=True) params2 = pipe2.get_params(deep=True) for x in pipe.get_params(deep=False): params.pop(x) for x in pipe2.get_params(deep=False): params2.pop(x) # Remove estimators that where copied params.pop('svc') params.pop('anova') params2.pop('svc') params2.pop('anova') assert_equal(params, params2)
def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([[-1, -1]]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([[3, 2]]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([[-1, -1]]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([[-1, -1]]) d = clf.decision_function([[-1, -1]]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([[3, 2]]) p = clf.predict_proba([[3, 2]]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([[-1, -1]]) p = clf.predict_proba([[-1, -1]]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) assert_array_almost_equal(p[0], [1 / 3.] * 3)
def test_scaler_int(): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_csc = sparse.csc_matrix(X) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_csr) assert_array_equal(X_null.data, X_csr.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_csr.data) with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) with warnings.catch_warnings(record=True): scaler_csr = StandardScaler(with_mean=False).fit(X_csr) X_csr_scaled = scaler_csr.transform(X_csr, copy=True) assert_false(np.any(np.isnan(X_csr_scaled.data))) with warnings.catch_warnings(record=True): scaler_csc = StandardScaler(with_mean=False).fit(X_csc) X_csc_scaled = scaler_csr.transform(X_csc, copy=True) assert_false(np.any(np.isnan(X_csc_scaled.data))) assert_equal(scaler.mean_, scaler_csr.mean_) assert_array_almost_equal(scaler.std_, scaler_csr.std_) assert_equal(scaler.mean_, scaler_csc.mean_) assert_array_almost_equal(scaler.std_, scaler_csc.std_) assert_array_almost_equal(X_scaled.mean(axis=0), [0., 1.109, 1.856, 21., 1.559], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0( X_csr_scaled.astype(np.float)) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert_true(X_scaled is not X) assert_true(X_csr_scaled is not X_csr) X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled) assert_true(X_csr_scaled_back is not X_csr) assert_true(X_csr_scaled_back is not X_csr_scaled) assert_array_almost_equal(X_csr_scaled_back.toarray(), X) X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc()) assert_true(X_csc_scaled_back is not X_csc) assert_true(X_csc_scaled_back is not X_csc_scaled) assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def test_initialize_nn_output(): """Test that NNDSVD does not return negative values""" data = np.abs(random_state.randn(10, 10)) for var in (None, 'a', 'ar'): W, H = nmf._initialize_nmf(data, 10) assert_false((W < 0).any() or (H < 0).any())
def test_nls_nn_output(): """Test that NLS solver doesn't return negative values""" A = np.atleast_2d(range(1, 5)) Ap, _, _ = nmf._nls_subproblem(np.dot(A.T, -A), A.T, A, 0.001, 100) assert_false((Ap < 0).any())