def test_calibration_multiclass():
    """Test calibration for multiclass."""
    # test multi-class setting with classifier that implements
    # only decision function
    clf = LinearSVC()
    X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42,
                          centers=3, cluster_std=3.0)

    # Use categorical labels to check that CalibratedClassifierCV supports
    # them correctly
    target_names = np.array(['a', 'b', 'c'])
    y = target_names[y_idx]

    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)
    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=2)
        cal_clf.fit(X_train, y_train)
        probas = cal_clf.predict_proba(X_test)
        assert_array_almost_equal(np.sum(probas, axis=1),
                                  np.ones(len(X_test)))

        # Check that log-loss of calibrated classifier is smaller than
        # log-loss of naively turned OvR decision function to probabilities
        # via softmax
        def softmax(y_pred):
            e = np.exp(y_pred)
            return e / e.sum(axis=1).reshape(-1, 1)

        uncalibrated_log_loss = \
            log_loss(y_test, softmax(clf.decision_function(X_test)))
        calibrated_log_loss = log_loss(y_test, probas)
        assert uncalibrated_log_loss >= calibrated_log_loss

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    X, y = make_blobs(n_samples=100, n_features=2, random_state=42,
                      cluster_std=3.0)
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    loss = log_loss(y_test, clf_probs)

    for method in ['isotonic', 'sigmoid']:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=3)
        cal_clf.fit(X_train, y_train)
        cal_clf_probs = cal_clf.predict_proba(X_test)
        cal_loss = log_loss(y_test, cal_clf_probs)
        assert loss > cal_loss
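# Editorial sketch (not part of the original tests): the naive softmax used
# above can overflow for large decision values. A numerically stable variant
# shifts by the row-wise maximum first, which leaves the ratios unchanged.
# Assumes numpy is imported as np at module level.
def _stable_softmax(y_pred):
    # subtracting the per-row max bounds the exponent by 0
    shifted = y_pred - y_pred.max(axis=1, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)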
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError,
                       match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError,
                       match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError,
                       match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def test_decision_function_shape():
    # check that decision_function_shape='ovr' gives
    # correct shape and is consistent with predict
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(iris.data, iris.target)
    dec = clf.decision_function(iris.data)
    assert dec.shape == (len(iris.data), 3)
    assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1))

    # with five classes:
    X, y = make_blobs(n_samples=80, centers=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovr').fit(X_train, y_train)
    dec = clf.decision_function(X_test)
    assert dec.shape == (len(X_test), 5)
    assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1))

    # check shape of decision_function_shape='ovo'
    clf = svm.SVC(kernel='linear', C=0.1,
                  decision_function_shape='ovo').fit(X_train, y_train)
    dec = clf.decision_function(X_train)
    assert dec.shape == (len(X_train), 10)
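# Note: test_svc_ovr_tie_breaking below receives the SVC class under test as
# a parameter; it is presumably driven by a pytest parametrization along
# these lines (an assumption -- the decorator is not part of this extract):
# @pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC])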
def test_svc_ovr_tie_breaking(SVCClass):
    """Test if predict breaks ties in OVR mode.

    Related issue: https://github.com/scikit-learn/scikit-learn/issues/8277
    """
    X, y = make_blobs(random_state=27)

    xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 1000)
    ys = np.linspace(X[:, 1].min(), X[:, 1].max(), 1000)
    xx, yy = np.meshgrid(xs, ys)

    svm = SVCClass(kernel="linear", decision_function_shape='ovr',
                   break_ties=False, random_state=42).fit(X, y)
    pred = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
    assert not np.all(pred == np.argmax(dv, axis=1))

    svm = SVCClass(kernel="linear", decision_function_shape='ovr',
                   break_ties=True, random_state=42).fit(X, y)
    pred = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    dv = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
    assert np.all(pred == np.argmax(dv, axis=1))
def test_make_blobs_n_samples_list():
    n_samples = [50, 30, 20]
    X, y = make_blobs(n_samples=n_samples, n_features=2, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
        "Incorrect number of samples per blob"
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant wrt
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balanced class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
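# Editorial note on why the invariance above holds: with
# class_weight="balanced", per-class weights are computed as
# n_samples / (n_classes * np.bincount(y)), so (assuming numpy as np):
#
#   y_dup = np.array([0] * 50 + [1] * 150)
#   len(y_dup) / (2 * np.bincount(y_dup))  # -> array([2., 0.666...])
#
# Each class then contributes the same total weight (50 * 2 == 150 * 2 / 3),
# which is what makes the three fits agree.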
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert len(brc1.subcluster_centers_) > 10
    assert len(np.unique(brc1.labels_)) == 10

    # Test that passing an AgglomerativeClustering instance as n_clusters
    # gives the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    with pytest.raises(ValueError):
        brc3.fit(X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X)
def test_decision_function_shape_two_class():
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for estimator in [svm.SVC, svm.NuSVC]:
            clf = OneVsRestClassifier(
                estimator(decision_function_shape="ovr")).fit(X, y)
            assert len(clf.predict(X)) == len(y)
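# Note: test_make_blobs_n_samples_centers_none below receives n_samples as a
# parameter; a plausible pytest parametrization (an assumption -- the
# decorator is not shown in this extract) would cover list, array and tuple
# inputs, e.g.:
# @pytest.mark.parametrize("n_samples", [[5, 3, 0],
#                                        np.array([5, 3, 0]),
#                                        tuple([5, 3, 0])])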
def test_make_blobs_n_samples_centers_none(n_samples):
    centers = None
    X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
        "Incorrect number of samples per blob"
def test_binary_classifier_class_weight():
    """Test the binary classifier with class weights for each class."""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    class_weight = {1: .45, -1: .55}
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              multi_class='ovr',
                              class_weight=class_weight)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    spweights, spintercept = sag_sparse(X, y, step_size, alpha,
                                        n_iter=n_iter, dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter, dloss=log_dloss,
                                          sparse=True,
                                          sample_weight=sample_weight,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
def test_init_transformation():
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    # Start learning from scratch
    nca = NeighborhoodComponentsAnalysis(init='identity')
    nca.fit(X, y)

    # Initialize with random
    nca_random = NeighborhoodComponentsAnalysis(init='random')
    nca_random.fit(X, y)

    # Initialize with auto
    nca_auto = NeighborhoodComponentsAnalysis(init='auto')
    nca_auto.fit(X, y)

    # Initialize with PCA
    nca_pca = NeighborhoodComponentsAnalysis(init='pca')
    nca_pca.fit(X, y)

    # Initialize with LDA
    nca_lda = NeighborhoodComponentsAnalysis(init='lda')
    nca_lda.fit(X, y)

    init = rng.rand(X.shape[1], X.shape[1])
    nca = NeighborhoodComponentsAnalysis(init=init)
    nca.fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(X.shape[1], X.shape[1] + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    assert_raise_message(ValueError,
                         'The input dimensionality ({}) of the given '
                         'linear transformation `init` must match the '
                         'dimensionality of the given inputs `X` ({}).'
                         .format(init.shape[1], X.shape[1]),
                         nca.fit, X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(X.shape[1] + 1, X.shape[1])
    nca = NeighborhoodComponentsAnalysis(init=init)
    assert_raise_message(ValueError,
                         'The output dimensionality ({}) of the given '
                         'linear transformation `init` cannot be '
                         'greater than its input dimensionality ({}).'
                         .format(init.shape[0], init.shape[1]),
                         nca.fit, X, y)

    # init.shape[0] must match n_components
    init = rng.rand(X.shape[1], X.shape[1])
    n_components = X.shape[1] - 2
    nca = NeighborhoodComponentsAnalysis(init=init,
                                         n_components=n_components)
    assert_raise_message(ValueError,
                         'The preferred dimensionality of the '
                         'projected space `n_components` ({}) does not match '
                         'the output dimensionality of the given '
                         'linear transformation `init` ({})!'
                         .format(n_components, init.shape[0]),
                         nca.fit, X, y)
def test_pipeline():
    # check that Isomap works fine as a transformer in a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('isomap', manifold.Isomap()),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert .9 < clf.score(X, y)
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
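# Note: like test_svc_ovr_tie_breaking, the test below expects the SVC class
# as a parameter, e.g. via a decorator along these lines (assumed, not shown
# in this extract):
# @pytest.mark.parametrize("SVCClass", [svm.SVC, svm.NuSVC])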
def test_svc_invalid_break_ties_param(SVCClass):
    X, y = make_blobs(random_state=42)

    svm = SVCClass(kernel="linear", decision_function_shape='ovo',
                   break_ties=True, random_state=42).fit(X, y)

    with pytest.raises(ValueError, match="break_ties must be False"):
        svm.predict(X)
def test_threshold():
    # Test that the radius of each leaf subcluster stays below the threshold
    X, y = make_blobs(n_samples=80, centers=4)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.)
def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params)
    search.fit(X)
    assert search.best_params_['kerneldensity__bandwidth'] == .1
def test_n_samples_leaves_roots():
    # Sanity check for the number of samples in leaves and roots
    X, y = make_blobs(n_samples=10)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
                            for sc in leaf.subclusters_])
    assert n_samples_leaves == X.shape[0]
    assert n_samples_root == X.shape[0]
def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a transformer
    # in a Pipeline; only checks that no error is raised.
    # TODO check that it actually does something useful
    from mrex import pipeline, datasets
    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [('filter', manifold.LocallyLinearEmbedding(random_state=0)),
         ('clf', neighbors.KNeighborsClassifier())])
    clf.fit(X, y)
    assert .9 < clf.score(X, y)
def test_covariance():
    x, y = make_blobs(n_samples=100, n_features=5, centers=1,
                      random_state=42)

    # make features correlated
    x = np.dot(x, np.arange(x.shape[1] ** 2).reshape(x.shape[1],
                                                     x.shape[1]))

    c_e = _cov(x, 'empirical')
    assert_almost_equal(c_e, c_e.T)

    c_s = _cov(x, 'auto')
    assert_almost_equal(c_s, c_s.T)
def test_check_is_fitted():
    # Check that a TypeError is raised when a non-estimator instance is
    # passed
    assert_raises(TypeError, check_is_fitted, ARDRegression)
    assert_raises(TypeError, check_is_fitted, "SVR")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard)
        assert_raises(NotFittedError, check_is_fitted, svr)
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError
    try:
        check_is_fitted(ard, msg="Random message %(name)s, %(name)s")
    except ValueError as e:
        assert str(e) == "Random message ARDRegression, ARDRegression"

    try:
        check_is_fitted(svr, msg="Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert str(e) == "Another message SVR, SVR"

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    assert check_is_fitted(ard) is None
    assert check_is_fitted(svr) is None

    # to be removed in 0.23
    assert_warns_message(
        DeprecationWarning,
        "Passing attributes to check_is_fitted is deprecated",
        check_is_fitted, ard, ['coef_'])
    assert_warns_message(
        DeprecationWarning,
        "Passing all_or_any to check_is_fitted is deprecated",
        check_is_fitted, ard, all_or_any=any)
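# Note: test_sparse_oneclasssvm below indexes into its local `datasets` list
# and takes a kernel name, so it is presumably parametrized over both; a
# hypothetical pairing (the actual decorators are not part of this extract)
# could look like:
# @pytest.mark.parametrize("datasets_index", range(4))
# @pytest.mark.parametrize("kernel", ["linear", "poly", "rbf", "sigmoid"])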
def test_sparse_oneclasssvm(datasets_index, kernel):
    # Check that sparse OneClassSVM gives the same result as dense
    # OneClassSVM
    # many class dataset:
    X_blobs, _ = make_blobs(n_samples=100, centers=10, random_state=0)
    X_blobs = sparse.csr_matrix(X_blobs)
    datasets = [[X_sp, None, T], [X2_sp, None, T2],
                [X_blobs[:80], None, X_blobs[80:]],
                [iris.data, None, iris.data]]
    dataset = datasets[datasets_index]
    clf = svm.OneClassSVM(gamma=1, kernel=kernel)
    sp_clf = svm.OneClassSVM(gamma=1, kernel=kernel)
    check_svm_model_equal(clf, sp_clf, *dataset)
def test_sag_classifier_computed_correctly():
    """Test that the binary classifier is computed correctly."""
    alpha = .1
    n_samples = 50
    n_iter = 50
    tol = .00001
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)
    y_tmp = np.ones(n_samples)
    y_tmp[y != classes[1]] = -1
    y = y_tmp

    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=n_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              multi_class='ovr')
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights, spintercept = sag_sparse(X, y, step_size, alpha,
                                        n_iter=n_iter, dloss=log_dloss,
                                        fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=n_iter, dloss=log_dloss,
                                          sparse=True,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(), spweights.ravel(),
                              decimal=2)
    assert_almost_equal(clf1.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(clf2.coef_.ravel(), spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(clf2.intercept_, spintercept2, decimal=1)
def test_sparse_X():
    # Test that sparse and dense data give same results
    X, y = make_blobs(n_samples=100, centers=10)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = sparse.csr_matrix(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_array_almost_equal(brc.subcluster_centers_,
                              brc_sparse.subcluster_centers_)
def test_classifier_matching():
    n_samples = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)
    y[y == 0] = -1
    alpha = 1.1
    fit_intercept = True
    step_size = get_step_size(X, alpha, fit_intercept)
    for solver in ['sag', 'saga']:
        if solver == 'sag':
            n_iter = 80
        else:
            # SAGA variance w.r.t. stream order is higher
            n_iter = 300
        clf = LogisticRegression(solver=solver,
                                 fit_intercept=fit_intercept,
                                 tol=1e-11, C=1. / alpha / n_samples,
                                 max_iter=n_iter, random_state=10,
                                 multi_class='ovr')
        clf.fit(X, y)

        weights, intercept = sag_sparse(X, y, step_size, alpha,
                                        n_iter=n_iter,
                                        dloss=log_dloss,
                                        fit_intercept=fit_intercept,
                                        saga=solver == 'saga')
        weights2, intercept2 = sag(X, y, step_size, alpha,
                                   n_iter=n_iter,
                                   dloss=log_dloss,
                                   fit_intercept=fit_intercept,
                                   saga=solver == 'saga')
        weights = np.atleast_2d(weights)
        intercept = np.atleast_1d(intercept)
        weights2 = np.atleast_2d(weights2)
        intercept2 = np.atleast_1d(intercept2)

        assert_array_almost_equal(weights, clf.coef_, decimal=9)
        assert_array_almost_equal(intercept, clf.intercept_, decimal=9)
        assert_array_almost_equal(weights2, clf.coef_, decimal=9)
        assert_array_almost_equal(intercept2, clf.intercept_, decimal=9)
def test_make_blobs():
    cluster_stds = np.array([0.05, 0.2, 0.4])
    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    X, y = make_blobs(random_state=0, n_samples=50, n_features=2,
                      centers=cluster_centers, cluster_std=cluster_stds)

    assert X.shape == (50, 2), "X shape mismatch"
    assert y.shape == (50,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of blobs"
    for i, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)):
        assert_almost_equal((X[y == i] - ctr).std(), std, 1,
                            "Unexpected std")
def test_make_blobs_n_samples_list_with_centers():
    n_samples = [20, 20, 20]
    centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
    cluster_stds = np.array([0.05, 0.2, 0.4])
    X, y = make_blobs(n_samples=n_samples, centers=centers,
                      cluster_std=cluster_stds, random_state=0)

    assert X.shape == (sum(n_samples), 2), "X shape mismatch"
    assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \
        "Incorrect number of samples per blob"
    for i, (ctr, std) in enumerate(zip(centers, cluster_stds)):
        assert_almost_equal((X[y == i] - ctr).std(), std, 1,
                            "Unexpected std")
def test_classification_scores():
    # Test classification scorers.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    for prefix, metric in [('f1', f1_score),
                           ('precision', precision_score),
                           ('recall', recall_score),
                           ('jaccard', jaccard_score)]:

        score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='weighted')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='macro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='micro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=1)
        assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = make_scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)
def test_raises_on_score_list():
    # Test that when a list of scores is returned, we raise proper errors.
    X, y = make_blobs(random_state=0)
    f1_scorer_no_average = make_scorer(f1_score, average=None)
    clf = DecisionTreeClassifier()
    assert_raises(ValueError, cross_val_score, clf, X, y,
                  scoring=f1_scorer_no_average)
    grid_search = GridSearchCV(clf, scoring=f1_scorer_no_average,
                               param_grid={'max_depth': [1, 2]})
    assert_raises(ValueError, grid_search.fit, X, y)
def generate_data(n_samples, n_features):
    """Generate random blob-ish data with noisy features.

    This returns an array of input data with shape
    `(n_samples, n_features)` and an array of `n_samples` target labels.

    Only one feature contains discriminative information, the other
    features contain only noise.
    """
    X, y = make_blobs(n_samples=n_samples, n_features=1,
                      centers=[[-2], [2]])

    # add non-discriminative features
    if n_features > 1:
        X = np.hstack([X, np.random.randn(n_samples, n_features - 1)])
    return X, y
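# Minimal usage sketch for the helper above (editorial; guarded so it only
# runs when the file is executed directly, and assuming the module-level
# numpy/make_blobs imports):
if __name__ == "__main__":
    X_demo, y_demo = generate_data(n_samples=200, n_features=20)
    assert X_demo.shape == (200, 20)
    # only column 0 separates the two blobs; the rest is standard normal
    # noise, so the class means should differ by roughly 4 in that column
    print(np.abs(X_demo[y_demo == 0, 0].mean()
                 - X_demo[y_demo == 1, 0].mean()))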
def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that the same global labels are obtained after calling
    # partial_fit with X=None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_,
                       brc.subcluster_labels_)