def check_classifiers_classes(name, Classifier): X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) # We need to make sure that we have non negative data, for things # like NMF X -= X.min() - .1 y_names = np.array(["one", "two", "three"])[y] for y_names in [y_names, y_names.astype('O')]: if name in ["LabelPropagation", "LabelSpreading"]: # TODO some complication with -1 label y_ = y else: y_ = y_names classes = np.unique(y_) # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() if name == 'BernoulliNB': classifier.set_params(binarize=X.mean()) set_fast_parameters(classifier) # fit classifier.fit(X, y_) y_pred = classifier.predict(X) # training set performance assert_array_equal(np.unique(y_), np.unique(y_pred)) if np.any(classifier.classes_ != classes): print("Unexpected classes_ attribute for %r: " "expected %s, got %s" % (classifier, classes, classifier.classes_))
def test_multilabel_binarizer_empty_sample(): mlb = MultiLabelBinarizer() y = [[1, 2], [1], []] Y = np.array([[1, 1], [1, 0], [0, 0]]) assert_array_equal(mlb.fit_transform(y), Y)
def test_kneighbors_graph(): """Test kneighbors_graph to build the k-Nearest Neighbor graph.""" X = np.array([[0, 1], [1.01, 1.], [2, 0]]) # n_neighbors = 1 A = neighbors.kneighbors_graph(X, 1, mode='connectivity') assert_array_equal(A.toarray(), np.eye(A.shape[0])) A = neighbors.kneighbors_graph(X, 1, mode='distance') assert_array_almost_equal( A.toarray(), [[0.00, 1.01, 0.], [1.01, 0., 0.], [0.00, 1.40716026, 0.]]) # n_neighbors = 2 A = neighbors.kneighbors_graph(X, 2, mode='connectivity') assert_array_equal( A.toarray(), [[1., 1., 0.], [1., 1., 0.], [0., 1., 1.]]) A = neighbors.kneighbors_graph(X, 2, mode='distance') assert_array_almost_equal( A.toarray(), [[0., 1.01, 2.23606798], [1.01, 0., 1.40716026], [2.23606798, 1.40716026, 0.]]) # n_neighbors = 3 A = neighbors.kneighbors_graph(X, 3, mode='connectivity') assert_array_almost_equal( A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
def test_radius_neighbors_classifier(n_samples=40, n_features=5, n_test_pts=10, radius=0.5, random_state=0): """Test radius-based classification""" rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = ((X ** 2).sum(axis=1) < .5).astype(np.int) y_str = y.astype(str) weight_func = _weight_func for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: neigh = neighbors.RadiusNeighborsClassifier(radius=radius, weights=weights, algorithm=algorithm) neigh.fit(X, y) epsilon = 1e-5 * (2 * rng.rand(1, n_features) - 1) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y[:n_test_pts]) neigh.fit(X, y_str) y_pred = neigh.predict(X[:n_test_pts] + epsilon) assert_array_equal(y_pred, y_str[:n_test_pts])
def test_precision_recall_f1_score_binary(): """Test Precision Recall and F1 Score for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.73, 0.85], 2) assert_array_almost_equal(r, [0.88, 0.68], 2) assert_array_almost_equal(f, [0.80, 0.76], 2) assert_array_equal(s, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1) ps = precision_score(y_true, y_pred) assert_array_almost_equal(ps, 0.85, 2) rs = recall_score(y_true, y_pred) assert_array_almost_equal(rs, 0.68, 2) fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.76, 2) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2), (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
def test_radius_neighbors_classifier_when_no_neighbors(): """ Test radius-based classifier when no neighbors found. In this case it should rise an informative exception """ X = np.array([[1.0, 1.0], [2.0, 2.0]]) y = np.array([1, 2]) radius = 0.1 z1 = np.array([[1.01, 1.01], [2.01, 2.01]]) # no outliers z2 = np.array([[1.01, 1.01], [1.4, 1.4]]) # one outlier weight_func = _weight_func for outlier_label in [0, -1, None]: for algorithm in ALGORITHMS: for weights in ['uniform', 'distance', weight_func]: rnc = neighbors.RadiusNeighborsClassifier clf = rnc(radius=radius, weights=weights, algorithm=algorithm, outlier_label=outlier_label) clf.fit(X, y) assert_array_equal(np.array([1, 2]), clf.predict(z1)) if outlier_label is None: assert_raises(ValueError, clf.predict, z2) elif False: assert_array_equal(np.array([1, outlier_label]), clf.predict(z2))
def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in grid: # Trained on sparse format sparse_classifier = IsolationForest( n_estimators=10, random_state=1, **params).fit(X_train_sparse) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_classifier = IsolationForest( n_estimators=10, random_state=1, **params).fit(X_train) dense_results = dense_classifier.predict(X_test) assert_array_equal(sparse_results, dense_results) assert_array_equal(sparse_results, dense_results)
def test_kneighbors_classifier_predict_proba(): """Test KNeighborsClassifier.predict_proba() method""" X = np.array([[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]]) y = np.array([4, 4, 5, 5, 1, 1]) cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist cls.fit(X, y) y_prob = cls.predict_proba(X) real_prob = np.array([[0, 2. / 3, 1. / 3], [1. / 3, 2. / 3, 0], [1. / 3, 0, 2. / 3], [0, 1. / 3, 2. / 3], [2. / 3, 1. / 3, 0], [2. / 3, 1. / 3, 0]]) assert_array_equal(real_prob, y_prob) # Check that it also works with non integer labels cls.fit(X, y.astype(str)) y_prob = cls.predict_proba(X) assert_array_equal(real_prob, y_prob) # Check that it works with weights='distance' cls = neighbors.KNeighborsClassifier( n_neighbors=2, p=1, weights='distance') cls.fit(X, y) y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) assert_array_almost_equal(real_prob, y_prob)
def test_grid_search_sparse_scoring(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred = cv.predict(X_[180:]) C = cv.best_estimator_.C X_ = sp.csr_matrix(X_) clf = LinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred2 = cv.predict(X_[180:]) C2 = cv.best_estimator_.C assert_array_equal(y_pred, y_pred2) assert_equal(C, C2) # Smoke test the score #np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), # cv.score(X_[:180], y[:180])) # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) F1Loss = make_scorer(f1_loss, greater_is_better=False) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) cv.fit(X_[:180], y_[:180]) y_pred3 = cv.predict(X_[180:]) C3 = cv.best_estimator_.C assert_equal(C, C3) assert_array_equal(y_pred, y_pred3)
def test_deprecated_score_func(): # test that old deprecated way of passing a score / loss function is still # supported X, y = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X[:180], y[:180]) y_pred = cv.predict(X[180:]) C = cv.best_estimator_.C clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score) with warnings.catch_warnings(record=True): # catch deprecation warning cv.fit(X[:180], y[:180]) y_pred_func = cv.predict(X[180:]) C_func = cv.best_estimator_.C assert_array_equal(y_pred, y_pred_func) assert_equal(C, C_func) # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss) with warnings.catch_warnings(record=True): # catch deprecation warning cv.fit(X[:180], y[:180]) y_pred_loss = cv.predict(X[180:]) C_loss = cv.best_estimator_.C assert_array_equal(y_pred, y_pred_loss) assert_equal(C, C_loss)
def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold # implementation rng = np.random.RandomState(42) noisy_centers = centers + rng.normal(size=centers.shape) labels_gold = - np.ones(n_samples, dtype=np.int) mindist = np.empty(n_samples) mindist.fill(np.infty) for center_id in range(n_clusters): dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1) labels_gold[dist < mindist] = center_id mindist = np.minimum(dist, mindist) inertia_gold = mindist.sum() assert_true((mindist >= 0.0).all()) assert_true((labels_gold != -1).all()) # perform label assignment using the dense array input x_squared_norms = (X ** 2).sum(axis=1) labels_array, inertia_array = _labels_inertia( X, x_squared_norms, noisy_centers) assert_array_almost_equal(inertia_array, inertia_gold) assert_array_equal(labels_array, labels_gold) # perform label assignment using the sparse CSR input x_squared_norms_from_csr = row_norms(X_csr, squared=True) labels_csr, inertia_csr = _labels_inertia( X_csr, x_squared_norms_from_csr, noisy_centers) assert_array_almost_equal(inertia_csr, inertia_gold) assert_array_equal(labels_csr, labels_gold)
def test_int_input(): X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]] for dtype in [np.int32, np.int64]: X_int = np.array(X_list, dtype=dtype) X_int_csr = sp.csr_matrix(X_int) init_int = X_int[:2] fitted_models = [ KMeans(n_clusters=2).fit(X_int), KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int), # mini batch kmeans is very unstable on such a small dataset hence # we use many inits MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int), MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int_csr), MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int, n_init=1).fit(X_int), MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int, n_init=1).fit(X_int_csr), ] for km in fitted_models: assert_equal(km.cluster_centers_.dtype, np.float64) expected_labels = [0, 1, 1, 0, 0, 1] scores = np.array([v_measure_score(expected_labels, km.labels_) for km in fitted_models]) assert_array_equal(scores, np.ones(scores.shape[0]))
def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"], ["ham", "eggs"], ["ham"]] #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]]) classes = set("ham eggs spam".split()) for base_clf in (MultinomialNB(), LinearSVC(random_state=0), LinearRegression(), Ridge(), ElasticNet(), Lasso(alpha=0.5)): # test input as lists of tuples clf = assert_warns(DeprecationWarning, OneVsRestClassifier(base_clf).fit, X, y) assert_equal(set(clf.classes_), classes) y_pred = clf.predict([[0, 4, 4]])[0] assert_equal(set(y_pred), set(["spam", "eggs"])) assert_true(clf.multilabel_) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) y_pred = clf.predict([[0, 4, 4]])[0] assert_array_equal(y_pred, [0, 1, 1]) assert_true(clf.multilabel_)
def test_sparse_fit_params(): iris = load_iris() X, y = iris.data, iris.target clf = MockClassifier() fit_params = {'sparse_sample_weight': coo_matrix(np.eye(X.shape[0]))} a = cval.cross_val_score(clf, X, y, fit_params=fit_params) assert_array_equal(a, np.ones(3))
def test_ovr_multilabel_predict_proba(): base_clf = MultinomialNB(alpha=1) for au in (False, True): X, Y = datasets.make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=3, length=50, allow_unlabeled=au, return_indicator=True, random_state=0) X_train, Y_train = X[:80], Y[:80] X_test, Y_test = X[80:], Y[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) # decision function only estimator. Fails in current implementation. decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train) assert_raises(AttributeError, decision_only.predict_proba, X_test) # Estimator with predict_proba disabled, depending on parameters. decision_only = OneVsRestClassifier(svm.SVC(probability=False)) decision_only.fit(X_train, Y_train) assert_raises(AttributeError, decision_only.predict_proba, X_test) Y_pred = clf.predict(X_test) Y_proba = clf.predict_proba(X_test) # predict assigns a label if the probability that the # sample has the label is greater than 0.5. pred = Y_proba > .5 assert_array_equal(pred, Y_pred)
def test_ovo_ties(): # test that ties are broken using the decision function, not defaulting to # the smallest label X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]]) y = np.array([2, 0, 1, 2]) multi_clf = OneVsOneClassifier(Perceptron()) ovo_prediction = multi_clf.fit(X, y).predict(X) # recalculate votes to make sure we have a tie predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_]) scores = np.vstack([clf.decision_function(X) for clf in multi_clf.estimators_]) # classifiers are in order 0-1, 0-2, 1-2 # aggregate votes: votes = np.zeros((4, 3)) votes[np.arange(4), predictions[0]] += 1 votes[np.arange(4), 2 * predictions[1]] += 1 votes[np.arange(4), 1 + predictions[2]] += 1 # for the first point, there is one vote per class assert_array_equal(votes[0, :], 1) # for the rest, there is no tie and the prediction is the argmax assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:]) # for the tie, the prediction is the class with the highest score assert_equal(ovo_prediction[0], 0) # in the zero-one classifier, the score for 0 is greater than the score for # one. assert_greater(scores[0][0], scores[0][1]) # score for one is greater than score for zero assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0]) # score for one is greater than score for two assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
def check_classifiers_input_shapes(name, Classifier): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=1) X = StandardScaler().fit_transform(X) # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() set_fast_parameters(classifier) set_random_state(classifier) # fit classifier.fit(X, y) y_pred = classifier.predict(X) set_random_state(classifier) # Check that when a 2D y is given, a DataConversionWarning is # raised with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DataConversionWarning) warnings.simplefilter("ignore", RuntimeWarning) classifier.fit(X, y[:, np.newaxis]) msg = "expected 1 DataConversionWarning, got: %s" % ( ", ".join([str(w_x) for w_x in w])) assert_equal(len(w), 1, msg) assert_array_equal(y_pred, classifier.predict(X))
def test_make_swiss_roll(): X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0) assert_equal(X.shape, (5, 3), "X shape mismatch") assert_equal(t.shape, (5,), "t shape mismatch") assert_array_equal(X[:, 0], t * np.cos(t)) assert_array_equal(X[:, 2], t * np.sin(t))
def check_clustering(name, Alg): X, y = make_blobs(n_samples=50, random_state=1) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) n_samples, n_features = X.shape # catch deprecation and neighbors warnings with warnings.catch_warnings(record=True): alg = Alg() set_fast_parameters(alg) if hasattr(alg, "n_clusters"): alg.set_params(n_clusters=3) set_random_state(alg) if name == 'AffinityPropagation': alg.set_params(preference=-100) alg.set_params(max_iter=100) # fit alg.fit(X) # with lists alg.fit(X.tolist()) assert_equal(alg.labels_.shape, (n_samples,)) pred = alg.labels_ assert_greater(adjusted_rand_score(pred, y), 0.4) # fit another time with ``fit_predict`` and compare results if name is 'SpectralClustering': # there is no way to make Spectral clustering deterministic :( return set_random_state(alg) with warnings.catch_warnings(record=True): pred2 = alg.fit_predict(X) assert_array_equal(pred, pred2)
def test_make_s_curve(): X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0) assert_equal(X.shape, (5, 3), "X shape mismatch") assert_equal(t.shape, (5,), "t shape mismatch") assert_array_equal(X[:, 0], np.sin(t)) assert_array_equal(X[:, 2], np.sign(t) * (np.cos(t) - 1))
def check_warm_start(name, random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert_equal(len(clf_ws), n_estimators) clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) assert_equal(set([tree.random_state for tree in clf_ws]), set([tree.random_state for tree in clf_no_ws])) assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), err_msg="Failed with {0}".format(name))
def test_Y_is_not_None(): rng = np.random.RandomState(0) hm = HammingKernel() X = rng.randint(0, 4, (5, 3)) hm = HammingKernel(length_scale=[1.0, 1.0, 1.0]) assert_array_equal(hm(X), hm(X, X))
def test_spectral_clustering(): S = np.array([[1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0], [0.2, 0.2, 0.2, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]]) for eigen_solver in ('arpack', 'lobpcg'): for assign_labels in ('kmeans', 'discretize'): for mat in (S, sparse.csr_matrix(S)): model = SpectralClustering(random_state=0, n_clusters=2, affinity='precomputed', eigen_solver=eigen_solver, assign_labels=assign_labels ).fit(mat) labels = model.labels_ if labels[0] == 0: labels = 1 - labels assert_array_equal(labels, [1, 1, 1, 0, 0, 0, 0]) model_copy = loads(dumps(model)) assert_equal(model_copy.n_clusters, model.n_clusters) assert_equal(model_copy.eigen_solver, model.eigen_solver) assert_array_equal(model_copy.labels_, model.labels_)
def test_predict_iris(): # Test logistic regression with the iris dataset n_samples, n_features = iris.data.shape target = iris.target_names[iris.target] # Test that both multinomial and OvR solvers handle # multiclass data correctly and give good accuracy # score (>0.95) for the training data. for clf in [LogisticRegression(C=len(iris.data)), LogisticRegression(C=len(iris.data), solver='lbfgs', multi_class='multinomial'), LogisticRegression(C=len(iris.data), solver='newton-cg', multi_class='multinomial'), LogisticRegression(C=len(iris.data), solver='sag', tol=1e-2, multi_class='ovr', random_state=42), LogisticRegression(C=len(iris.data), solver='saga', tol=1e-2, multi_class='ovr', random_state=42) ]: clf.fit(iris.data, target) assert_array_equal(np.unique(target), clf.classes_) pred = clf.predict(iris.data) assert_greater(np.mean(pred == target), .95) probabilities = clf.predict_proba(iris.data) assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples)) pred = iris.target_names[probabilities.argmax(axis=1)] assert_greater(np.mean(pred == target), .95)
def check_classifiers_one_label(name, Classifier): error_string_fit = "Classifier can't train when only one class is present." error_string_predict = ("Classifier can't predict when only one class is " "present.") rnd = np.random.RandomState(0) X_train = rnd.uniform(size=(10, 3)) X_test = rnd.uniform(size=(10, 3)) y = np.ones(10) # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() set_fast_parameters(classifier) # try to fit try: classifier.fit(X_train, y) except ValueError as e: if 'class' not in repr(e): print(error_string_fit, Classifier, e) traceback.print_exc(file=sys.stdout) raise e else: return except Exception as exc: print(error_string_fit, Classifier, exc) traceback.print_exc(file=sys.stdout) raise exc # predict try: assert_array_equal(classifier.predict(X_test), y) except Exception as exc: print(error_string_predict, Classifier, exc) raise exc
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass. However it can handle # binary problems. clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=auto X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='auto') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='auto') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises(NotFittedError, lambda X: np.fromiter( clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target svd = TruncatedSVD(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("svd", svd), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different svd object to control the random_state stream fs = FeatureUnion([("svd", svd), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8))
def test_cross_val_generator_mask_indices_same(): # Test that the cross validation generators return the same results when # indices=True and when indices=False y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]) labels = np.array([1, 1, 2, 3, 3, 3, 4]) loo_mask = cval.LeaveOneOut(5, indices=False) loo_ind = cval.LeaveOneOut(5, indices=True) lpo_mask = cval.LeavePOut(10, 2, indices=False) lpo_ind = cval.LeavePOut(10, 2, indices=True) kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1) kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1) skf_mask = cval.StratifiedKFold(y, 3, indices=False) skf_ind = cval.StratifiedKFold(y, 3, indices=True) lolo_mask = cval.LeaveOneLabelOut(labels, indices=False) lolo_ind = cval.LeaveOneLabelOut(labels, indices=True) lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False) lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True) for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind), (kf_mask, kf_ind), (skf_mask, skf_ind), (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]: for (train_mask, test_mask), (train_ind, test_ind) in \ zip(cv_mask, cv_ind): assert_array_equal(np.where(train_mask)[0], train_ind) assert_array_equal(np.where(test_mask)[0], test_ind)
def test_ward_linkage_tree_return_distance(): """Test return_distance option on linkage and ward trees""" # test that return_distance when set true, gives same # output on both structured and unstructured clustering. n, p = 10, 5 rng = np.random.RandomState(0) connectivity = np.ones((n, n)) for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out_unstructured = ward_tree(X, return_distance=True) out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) # get children children_unstructured = out_unstructured[0] children_structured = out_structured[0] # check if we got the same clusters assert_array_equal(children_unstructured, children_structured) # check if the distances are the same dist_unstructured = out_unstructured[-1] dist_structured = out_structured[-1] assert_array_almost_equal(dist_unstructured, dist_structured) for linkage in ['average', 'complete']: structured_items = linkage_tree(X, connectivity=connectivity, linkage=linkage, return_distance=True)[-1] unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[-1] structured_dist = structured_items[-1] unstructured_dist = unstructured_items[-1] structured_children = structured_items[0] unstructured_children = unstructured_items[0] assert_array_almost_equal(structured_dist, unstructured_dist) assert_array_almost_equal(structured_children, unstructured_children) # test on the following dataset where we know the truth # taken from scipy/cluster/tests/hierarchy_test_data.py X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382], [2.87137846, -9.68248579], [7.87974764, -6.05485803], [8.24018364, -6.09495602], [7.39020262, 8.54004355]]) # truth linkage_X_ward = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 9.10208346, 4.], [7., 9., 24.7784379, 6.]]) linkage_X_complete = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.96742194, 4.], [7., 9., 18.77445997, 6.]]) linkage_X_average = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.55832839, 4.], [7., 9., 15.44089605, 6.]]) n_samples, n_features = np.shape(X) connectivity_X = np.ones((n_samples, n_samples)) out_X_unstructured = ward_tree(X, return_distance=True) out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) # check that the labels are the same assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) linkage_options = ['complete', 'average'] X_linkage_truth = [linkage_X_complete, linkage_X_average] for (linkage, X_truth) in zip(linkage_options, X_linkage_truth): out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) out_X_structured = linkage_tree(X, connectivity=connectivity_X, linkage=linkage, return_distance=True) # check that the labels are the same assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) assert_array_equal(X_truth[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_init_ndarray(): # Initialize TSNE with ndarray and test fit tsne = TSNE(init=np.zeros((100, 2))) X_embedded = tsne.fit_transform(np.ones((100, 5))) assert_array_equal(np.zeros((100, 2)), X_embedded)
def test_fit_transform(): X1 = KMeans(n_clusters=3, random_state=51).fit(X).transform(X) X2 = KMeans(n_clusters=3, random_state=51).fit_transform(X) assert_array_equal(X1, X2)
def test_isotonic_regression(): y = np.array([3, 7, 5, 9, 8, 7, 10]) y_ = np.array([3, 6, 6, 8, 8, 8, 10]) assert_array_equal(y_, isotonic_regression(y)) y = np.array([10, 0, 2]) y_ = np.array([4, 4, 4]) assert_array_equal(y_, isotonic_regression(y)) x = np.arange(len(y)) ir = IsotonicRegression(y_min=0., y_max=1.) ir.fit(x, y) assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y)) assert_array_equal(ir.transform(x), ir.predict(x)) # check that it is immune to permutation perm = np.random.permutation(len(y)) ir = IsotonicRegression(y_min=0., y_max=1.) assert_array_equal(ir.fit_transform(x[perm], y[perm]), ir.fit_transform(x, y)[perm]) assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm]) # check we don't crash when all x are equal: ir = IsotonicRegression() assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
def test_isotonic_regression_reversed(): y = np.array([10, 9, 10, 7, 6, 6.1, 5]) y_ = IsotonicRegression(increasing=False).fit_transform( np.arange(len(y)), y) assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
def test_predict_consistent(kernel): # Check binary predict decision has also predicted probability above 0.5. gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) old_centers = centers + rng.normal(size=centers.shape) new_centers = old_centers.copy() new_centers_csr = old_centers.copy() counts = np.zeros(new_centers.shape[0], dtype=np.int32) counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32) x_squared_norms = (X**2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) buffer = np.zeros(centers.shape[1], dtype=np.double) buffer_csr = np.zeros(centers.shape[1], dtype=np.double) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step(X_mb, x_mb_squared_norms, new_centers, counts, buffer, 1, None, random_reassign=False) assert_greater(old_inertia, 0.0) # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms, new_centers) assert_greater(new_inertia, 0.0) assert_less(new_inertia, old_inertia) # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers - old_centers)**2) assert_almost_equal(incremental_diff, effective_diff) # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr, buffer_csr, 1, None, random_reassign=False) assert_greater(old_inertia_csr, 0.0) # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr, x_mb_squared_norms_csr, new_centers_csr) assert_greater(new_inertia_csr, 0.0) assert_less(new_inertia_csr, old_inertia_csr) # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers_csr - old_centers)**2) assert_almost_equal(incremental_diff_csr, effective_diff) # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_array_almost_equal(new_centers, new_centers_csr) assert_almost_equal(incremental_diff, incremental_diff_csr) assert_almost_equal(old_inertia, old_inertia_csr) assert_almost_equal(new_inertia, new_inertia_csr)
def test_fetch_multiple_column(tmpdata): _urlopen_ref = datasets.mldata.urlopen try: # create fake data set in cache x = sp.arange(6).reshape(2, 3) y = sp.array([1, -1]) z = sp.arange(12).reshape(4, 3) # by default dataname = 'threecol-default' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ( { 'label': y, 'data': x, 'z': z, }, ['z', 'data', 'label'], ), }) dset = fetch_mldata(dataname, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by order dataname = 'threecol-order' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ({'y': y, 'x': x, 'z': z}, ['y', 'x', 'z']), }) dset = fetch_mldata(dataname, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "z"]: assert_in(n, dset) assert_not_in("x", dset) assert_not_in("y", dset) assert_array_equal(dset.data, x) assert_array_equal(dset.target, y) assert_array_equal(dset.z, z.T) # by number dataname = 'threecol-number' datasets.mldata.urlopen = mock_mldata_urlopen({ dataname: ({'y': y, 'x': x, 'z': z}, ['z', 'x', 'y']), }) dset = fetch_mldata(dataname, target_name=2, data_name=0, data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) assert_array_equal(dset.data, z) assert_array_equal(dset.target, y) # by name dset = fetch_mldata(dataname, target_name='y', data_name='z', data_home=tmpdata) for n in ["COL_NAMES", "DESCR", "target", "data", "x"]: assert_in(n, dset) assert_not_in("y", dset) assert_not_in("z", dset) finally: datasets.mldata.urlopen = _urlopen_ref
def test_set_feature_union_step_drop(drop): mult2 = Mult(2) mult2.get_feature_names = lambda: ['x2'] mult3 = Mult(3) mult3.get_feature_names = lambda: ['x3'] X = np.asarray([[1]]) ft = FeatureUnion([('m2', mult2), ('m3', mult3)]) assert_array_equal([[2, 3]], ft.fit(X).transform(X)) assert_array_equal([[2, 3]], ft.fit_transform(X)) assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names()) ft.set_params(m2=drop) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) assert_equal(['m3__x3'], ft.get_feature_names()) ft.set_params(m3=drop) assert_array_equal([[]], ft.fit(X).transform(X)) assert_array_equal([[]], ft.fit_transform(X)) assert_equal([], ft.get_feature_names()) # check we can change back ft.set_params(m3=mult3) assert_array_equal([[3]], ft.fit(X).transform(X)) # Check 'drop' step at construction time ft = FeatureUnion([('m2', drop), ('m3', mult3)]) assert_array_equal([[3]], ft.fit(X).transform(X)) assert_array_equal([[3]], ft.fit_transform(X)) assert_equal(['m3__x3'], ft.get_feature_names())
def test_pipeline_memory(): iris = load_iris() X = iris.data y = iris.target cachedir = mkdtemp() try: if LooseVersion(joblib_version) < LooseVersion('0.12'): # Deal with change of API in joblib memory = Memory(cachedir=cachedir, verbose=10) else: memory = Memory(location=cachedir, verbose=10) # Test with Transformer + SVC clf = SVC(gamma='scale', probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], memory=memory) # Memoize the transformer at the first fit cached_pipe.fit(X, y) pipe.fit(X, y) # Get the time stamp of the transformer in the cached pipeline ts = cached_pipe.named_steps['transf'].timestamp_ # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert not hasattr(transf, 'means_') # Check that we are reading the cache while fitting # a second time cached_pipe.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe.named_steps['transf'].means_) assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_) # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit clf_2 = SVC(gamma='scale', probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) cached_pipe_2.fit(X, y) # Check that cached_pipe and pipe yield identical results assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) assert_array_equal(pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) assert_array_equal(pipe.named_steps['transf'].means_, cached_pipe_2.named_steps['transf_2'].means_) assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_) finally: shutil.rmtree(cachedir)
def assert_best_scores_kept(score_filter): scores = score_filter.scores_ support = score_filter.get_support() assert_array_equal(np.sort(scores[support]), np.sort(scores)[-support.sum():])
def test_set_pipeline_step_passthrough(passthrough): X = np.array([[1]]) y = np.array([1]) mult2 = Mult(mult=2) mult3 = Mult(mult=3) mult5 = Mult(mult=5) def make(): return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)]) pipeline = make() exp = 2 * 3 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline.set_params(m3=passthrough) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_dict_equal(pipeline.get_params(deep=True), {'steps': pipeline.steps, 'm2': mult2, 'm3': passthrough, 'last': mult5, 'memory': None, 'm2__mult': 2, 'last__mult': 5, }) pipeline.set_params(m2=passthrough) exp = 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) # for other methods, ensure no AttributeErrors on None: other_methods = ['predict_proba', 'predict_log_proba', 'decision_function', 'transform', 'score'] for method in other_methods: getattr(pipeline, method)(X) pipeline.set_params(m2=mult2) exp = 2 * 5 assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) pipeline = make() pipeline.set_params(last=passthrough) # mult2 and mult3 are active exp = 6 assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) assert_raise_message(AttributeError, "'str' object has no attribute 'predict'", getattr, pipeline, 'predict') # Check 'passthrough' step at construction time exp = 2 * 5 pipeline = Pipeline( [('m2', mult2), ('m3', passthrough), ('last', mult5)]) assert_array_equal([[exp]], pipeline.fit_transform(X, y)) assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]]))
def test_one_hot_encoder_sparse(): # Test OneHotEncoder's fit and transform. X = [[3, 2, 1], [0, 1, 1]] enc = OneHotEncoder() with ignore_warnings(category=(DeprecationWarning, FutureWarning)): # discover max values automatically X_trans = enc.fit_transform(X).toarray() assert_equal(X_trans.shape, (2, 5)) assert_array_equal(enc.active_features_, np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0]) assert_array_equal(enc.feature_indices_, [0, 4, 7, 9]) # check outcome assert_array_equal(X_trans, [[0., 1., 0., 1., 1.], [1., 0., 1., 0., 1.]]) # max value given as 3 # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4) enc = OneHotEncoder(n_values=4) with ignore_warnings(category=DeprecationWarning): X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 4 * 3)) assert_array_equal(enc.feature_indices_, [0, 4, 8, 12]) # max value given per feature # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2]) enc = OneHotEncoder(n_values=[3, 2, 2]) with ignore_warnings(category=DeprecationWarning): X = [[1, 0, 1], [0, 1, 1]] X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (2, 3 + 2 + 2)) assert_array_equal(enc.n_values_, [3, 2, 2]) # check that testing with larger feature works: X = np.array([[2, 0, 1], [0, 1, 1]]) enc.transform(X) # test that an error is raised when out of bounds: X_too_large = [[0, 2, 1], [0, 1, 1]] assert_raises(ValueError, enc.transform, X_too_large) error_msg = r"unknown categorical feature present \[2\] during transform" assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large) with ignore_warnings(category=DeprecationWarning): assert_raises( ValueError, OneHotEncoder(n_values=2).fit_transform, X) # test that error is raised when wrong number of features assert_raises(ValueError, enc.transform, X[:, :-1]) # test that error is raised when wrong number of features in fit # with prespecified n_values with ignore_warnings(category=DeprecationWarning): assert_raises(ValueError, enc.fit, X[:, :-1]) # test exception on wrong init param with ignore_warnings(category=DeprecationWarning): assert_raises( TypeError, OneHotEncoder(n_values=np.int).fit, X) enc = OneHotEncoder() # test negative input to fit with ignore_warnings(category=FutureWarning): assert_raises(ValueError, enc.fit, [[0], [-1]]) # test negative input to transform with ignore_warnings(category=FutureWarning): enc.fit([[0], [1]]) assert_raises(ValueError, enc.transform, [[0], [-1]])
def test_shuffle_split(): ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0) ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0) ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0) for typ in six.integer_types: ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0) for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): assert_array_equal(t1[0], t2[0]) assert_array_equal(t2[0], t3[0]) assert_array_equal(t3[0], t4[0]) assert_array_equal(t1[1], t2[1]) assert_array_equal(t2[1], t3[1]) assert_array_equal(t3[1], t4[1])
def test_kfold_no_shuffle(): # Manually check that KFold preserves the data ordering on toy datasets splits = iter(cval.KFold(4, 2)) train, test = next(splits) assert_array_equal(test, [0, 1]) assert_array_equal(train, [2, 3]) train, test = next(splits) assert_array_equal(test, [2, 3]) assert_array_equal(train, [0, 1]) splits = iter(cval.KFold(5, 2)) train, test = next(splits) assert_array_equal(test, [0, 1, 2]) assert_array_equal(train, [3, 4]) train, test = next(splits) assert_array_equal(test, [3, 4]) assert_array_equal(train, [0, 1, 2])
def test_agglomerative_clustering(): """ Check that we obtain the correct number of clusters with agglomerative clustering. """ rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage) assert_raises(ValueError, clustering.fit, X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") assert_raises(ValueError, clustering.fit, X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering(n_clusters=10, connectivity=np.ones( (n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
def test_set_estimator_none(): """VotingClassifier set_params should be able to set estimators as None""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) clf3 = GaussianNB() eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 0, 0.5]).fit(X, y) eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) eclf2.set_params(rf=None).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_true(dict(eclf2.estimators)["rf"] is None) assert_true(len(eclf2.estimators_) == 2) assert_true(all([not isinstance(est, RandomForestClassifier) for est in eclf2.estimators_])) assert_true(eclf2.get_params()["rf"] is None) eclf1.set_params(voting='soft').fit(X, y) eclf2.set_params(voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) msg = ('All estimators are None. At least one is required' ' to be a classifier!') assert_raise_message( ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y) # Test soft voting transform X1 = np.array([[1], [2]]) y1 = np.array([1, 2]) eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[0, 0.5]).fit(X1, y1) eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5]) eclf2.set_params(rf=None).fit(X1, y1) assert_array_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) assert_array_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]])) eclf1.set_params(voting='hard') eclf2.set_params(voting='hard') assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]])) assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
def test_stratified_kfold_no_shuffle(): # Manually check that StratifiedKFold preserves the data ordering as much # as possible on toy datasets in order to avoid hiding sample dependencies # when possible splits = iter(cval.StratifiedKFold([1, 1, 0, 0], 2)) train, test = next(splits) assert_array_equal(test, [0, 2]) assert_array_equal(train, [1, 3]) train, test = next(splits) assert_array_equal(test, [1, 3]) assert_array_equal(train, [0, 2]) splits = iter(cval.StratifiedKFold([1, 1, 1, 0, 0, 0, 0], 2)) train, test = next(splits) assert_array_equal(test, [0, 1, 3, 4]) assert_array_equal(train, [2, 5, 6]) train, test = next(splits) assert_array_equal(test, [2, 5, 6]) assert_array_equal(train, [0, 1, 3, 4])
def test_kfold_no_shuffle(): # Manually check that KFold preserves the data ordering on toy datasets X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] splits = KFold(2).split(X2[:-1]) train, test = next(splits) assert_array_equal(test, [0, 1]) assert_array_equal(train, [2, 3]) train, test = next(splits) assert_array_equal(test, [2, 3]) assert_array_equal(train, [0, 1]) splits = KFold(2).split(X2) train, test = next(splits) assert_array_equal(test, [0, 1, 2]) assert_array_equal(train, [3, 4]) train, test = next(splits) assert_array_equal(test, [3, 4]) assert_array_equal(train, [0, 1, 2])
def test_shufflesplit_reproducible(): # Check that iterating twice on the ShuffleSplit gives the same # sequence of train-test when the random_state is given ss = cval.ShuffleSplit(10, random_state=21) assert_array_equal(list(a for a, b in ss), list(a for a, b in ss))
def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X, Y) p = clf.predict_proba([3, 2]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([-1, -1]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([3, 2]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([-1, -1]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([-1, -1]) d = clf.decision_function([-1, -1]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([3, 2]) p = clf.predict_proba([3, 2]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([-1, -1]) p = clf.predict_proba([-1, -1]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X2, Y2) d = clf.decision_function([3, 2]) p = clf.predict_proba([3, 2]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function(x) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba(x) assert_array_almost_equal(p[0], [1 / 3.] * 3)
def test_stratified_kfold_no_shuffle(): # Manually check that StratifiedKFold preserves the data ordering as much # as possible on toy datasets in order to avoid hiding sample dependencies # when possible X, y = np.ones(4), [1, 1, 0, 0] splits = StratifiedKFold(2).split(X, y) train, test = next(splits) assert_array_equal(test, [0, 2]) assert_array_equal(train, [1, 3]) train, test = next(splits) assert_array_equal(test, [1, 3]) assert_array_equal(train, [0, 2]) X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] splits = StratifiedKFold(2).split(X, y) train, test = next(splits) assert_array_equal(test, [0, 1, 3, 4]) assert_array_equal(train, [2, 5, 6]) train, test = next(splits) assert_array_equal(test, [2, 5, 6]) assert_array_equal(train, [0, 1, 3, 4]) # Check if get_n_splits returns the number of folds assert_equal(5, StratifiedKFold(5).get_n_splits(X, y))
def test_invariance_string_vs_numbers_labels(): # Ensure that classification metrics with string labels random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) y1_str = np.array(["eggs", "spam"])[y1] y2_str = np.array(["eggs", "spam"])[y2] pos_label_str = "spam" labels_str = ["eggs", "spam"] for name, metric in CLASSIFICATION_METRICS.items(): if name in METRIC_UNDEFINED_BINARY_MULTICLASS: continue measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label metric_str = metric if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) measure_with_str = metric_str(y1_str, y2_str) assert_array_equal(measure_with_number, measure_with_str, err_msg="{0} failed string vs number invariance " "test".format(name)) measure_with_strobj = metric_str(y1_str.astype('O'), y2_str.astype('O')) assert_array_equal(measure_with_number, measure_with_strobj, err_msg="{0} failed string object vs number " "invariance test".format(name)) if name in METRICS_WITH_LABELS: metric_str = partial(metric_str, labels=labels_str) measure_with_str = metric_str(y1_str, y2_str) assert_array_equal(measure_with_number, measure_with_str, err_msg="{0} failed string vs number " "invariance test".format(name)) measure_with_strobj = metric_str(y1_str.astype('O'), y2_str.astype('O')) assert_array_equal(measure_with_number, measure_with_strobj, err_msg="{0} failed string vs number " "invariance test".format(name)) for name, metric in THRESHOLDED_METRICS.items(): if name in ("log_loss", "hinge_loss", "unnormalized_log_loss", "brier_score_loss"): # Ugly, but handle case with a pos_label and label metric_str = metric if name in METRICS_WITH_POS_LABEL: metric_str = partial(metric_str, pos_label=pos_label_str) measure_with_number = metric(y1, y2) measure_with_str = metric_str(y1_str, y2) assert_array_equal(measure_with_number, measure_with_str, err_msg="{0} failed string vs number " "invariance test".format(name)) measure_with_strobj = metric(y1_str.astype('O'), y2) assert_array_equal(measure_with_number, measure_with_strobj, err_msg="{0} failed string object vs number " "invariance test".format(name)) else: # TODO those metrics doesn't support string label yet assert_raises(ValueError, metric, y1_str, y2) assert_raises(ValueError, metric, y1_str.astype('O'), y2)
def test_time_series_cv(): X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]] # Should fail if there are more folds than samples assert_raises_regexp(ValueError, "Cannot have number of folds.*greater", next, TimeSeriesSplit(n_splits=7).split(X)) tscv = TimeSeriesSplit(2) # Manually check that Time Series CV preserves the data # ordering on toy datasets splits = tscv.split(X[:-1]) train, test = next(splits) assert_array_equal(train, [0, 1]) assert_array_equal(test, [2, 3]) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3]) assert_array_equal(test, [4, 5]) splits = TimeSeriesSplit(2).split(X) train, test = next(splits) assert_array_equal(train, [0, 1, 2]) assert_array_equal(test, [3, 4]) train, test = next(splits) assert_array_equal(train, [0, 1, 2, 3, 4]) assert_array_equal(test, [5, 6]) # Check get_n_splits returns the correct number of splits splits = TimeSeriesSplit(2).split(X) n_splits_actual = len(list(splits)) assert_equal(n_splits_actual, tscv.get_n_splits()) assert_equal(n_splits_actual, 2)
def test_safe_indexing_pandas_series(idx, asarray): pd = pytest.importorskip("pandas") idx = np.asarray(idx) if asarray else idx serie = pd.Series(np.arange(3)) assert_array_equal(safe_indexing(serie, idx).values, [0, 1])
def check_memory_layout(name, dtype): # Check that it works no matter the memory layout est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # C-order X = np.asarray(iris.data, order="C", dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # F-order X = np.asarray(iris.data, order="F", dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # Contiguous X = np.ascontiguousarray(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) if est.base_estimator.splitter in SPARSE_SPLITTERS: # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # csc_matrix X = csc_matrix(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # coo_matrix X = coo_matrix(iris.data, dtype=dtype) y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) # Strided X = np.asarray(iris.data[::3], dtype=dtype) y = iris.target[::3] assert_array_equal(est.fit(X, y).predict(X), y)
def _check_transform_selected(X, X_expected, dtype, sel): for M in (X, sparse.csr_matrix(X)): Xtr = _transform_selected(M, Binarizer().transform, dtype, sel) assert_array_equal(toarray(Xtr), X_expected)
def test_predict_equal_labels(): km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1) km.fit(X) assert_array_equal(km.predict(X), km.labels_)
def test_random_hasher_sparse_data(): X, y = datasets.make_multilabel_classification(random_state=0) hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) X_transformed = hasher.fit_transform(X) X_transformed_sparse = hasher.fit_transform(csc_matrix(X)) assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray())
def _check_path(in_path): path = sorted(set(in_path), reverse=True) assert_array_equal(path, in_path)