def test_make_classification_weights_array_or_list_ok(kwargs):
    """Passing class weights as a list or as an ndarray must be equivalent."""
    as_list = make_classification(weights=[.1, .9], random_state=0, **kwargs)
    as_array = make_classification(weights=np.array([.1, .9]), random_state=0,
                                   **kwargs)
    # Same seed, same weights -> identical X and y regardless of container.
    assert_almost_equal(as_list[0], as_array[0])
    assert_almost_equal(as_list[1], as_array[1])
def test_linearsvc_parameters():
    """Exercise every (loss, penalty, dual) combination in LinearSVC.

    Invalid combinations must raise a ValueError naming the offending
    arguments; valid ones must fit without error.
    """
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties = ['l1', 'l2', 'bar']
    duals = [True, False]
    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        estimator = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        unsupported = (
            (loss, penalty) == ('hinge', 'l1')
            or (loss, penalty, dual) == ('hinge', 'l2', False)
            or (penalty, dual) == ('l1', True)
            or loss == 'foo'
            or penalty == 'bar'
        )
        if unsupported:
            pattern = ("Unsupported set of arguments.*penalty='%s.*loss='%s"
                       ".*dual=%s" % (penalty, loss, dual))
            with pytest.raises(ValueError, match=pattern):
                estimator.fit(X, y)
        else:
            estimator.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    with pytest.raises(ValueError, match=".*loss='l3' is not supported.*"):
        svm.LinearSVC(loss="l3").fit(X, y)
def test_mutual_info_classif():
    """SelectKBest/SelectPercentile with mutual_info_classif must agree with
    the equivalent GenericUnivariateSelect and recover the informative
    features (the first two columns, since shuffle=False)."""
    X, y = make_classification(n_samples=100, n_features=5,
                               n_informative=1, n_redundant=1,
                               n_repeated=0, n_classes=2,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    expected_support = np.zeros(5)
    expected_support[:2] = 1

    # Test in KBest mode.
    kbest = SelectKBest(mutual_info_classif, k=2)
    X_kbest = kbest.fit(X, y).transform(X)
    X_generic = GenericUnivariateSelect(
        mutual_info_classif, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_kbest, X_generic)
    assert_array_equal(kbest.get_support(), expected_support)

    # Test in Percentile mode.
    percentile = SelectPercentile(mutual_info_classif, percentile=40)
    X_pct = percentile.fit(X, y).transform(X)
    X_generic = GenericUnivariateSelect(
        mutual_info_classif, mode='percentile',
        param=40).fit(X, y).transform(X)
    assert_array_equal(X_pct, X_generic)
    assert_array_equal(percentile.get_support(), expected_support)
def test_partial_dependence_unknown_feature_indices(estimator, features):
    """An out-of-range feature index must raise a descriptive ValueError."""
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    expected_msg = 'all features must be in'
    with pytest.raises(ValueError, match=expected_msg):
        partial_dependence(estimator, X, [features])
def test_precision_recall_curve_pipeline(pyplot, clf):
    """Plotting must refuse an unfitted estimator; once fitted, the display
    must be labelled with the estimator's class name."""
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    with pytest.raises(NotFittedError):
        plot_precision_recall_curve(clf, X, y)

    clf.fit(X, y)
    display = plot_precision_recall_curve(clf, X, y)
    assert display.estimator_name == clf.__class__.__name__
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features, random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_informative=n_features, n_redundant=0,
                                   n_repeated=0, random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    # FIX: np.bool was a deprecated alias for the builtin bool (deprecated in
    # NumPy 1.20, removed in 1.24); use the builtin directly.
    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
def test_mean_variance_axis1():
    """mean_variance_axis must reject LIL input and, for CSR/CSC input, match
    dense per-row (axis=1) means/variances with the documented output dtypes.
    """
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    # LIL matrices are not a supported input format.
    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    # (input dtype, expected output dtype): integer inputs are promoted to
    # float64, float inputs keep their precision.
    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            # BUG FIX: this test is about axis=1, but the loop previously
            # computed axis=0 statistics, duplicating the axis-0 coverage
            # and leaving axis=1 untested.
            X_means, X_vars = mean_variance_axis(X_sparse, axis=1)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=1))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=1))
def test_2d_coef():
    """SelectFromModel must reduce a 2d multiclass ``coef_`` with the
    requested vector norm before applying the threshold."""
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    reference = LogisticRegression()
    for threshold, aggregate in (("mean", np.mean), ("median", np.median)):
        for order in (1, 2, np.inf):
            # Select features on a multi-class problem.
            sfm = SelectFromModel(estimator=LogisticRegression(),
                                  threshold=threshold, norm_order=order)
            sfm.fit(X, y)
            assert hasattr(sfm.estimator_, 'coef_')
            X_selected = sfm.transform(X)
            assert X_selected.shape[1] < X.shape[1]

            # Re-derive the expected mask by hand and compare.
            reference.fit(X, y)
            scores = np.linalg.norm(reference.coef_, axis=0, ord=order)
            kept = scores > aggregate(scores)
            assert_array_almost_equal(X_selected, X[:, kept])
def test_label_propagation_closed_form():
    """Compare LabelPropagation's iterative solution against the closed-form
    solution of Zhu et al. (2002) for the unlabelled points."""
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    # Mark every third sample as unlabelled (-1).
    y[::3] = -1
    # One-hot matrix with one extra column: y == -1 indexes the last column,
    # which therefore flags the unlabelled samples.
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    unlabelled_idx = Y[:, (-1,)].nonzero()[0]
    labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0]

    clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    # Sub-blocks of the transition matrix: unlabelled-unlabelled and
    # unlabelled-labelled.
    Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx,
                                  indexing='ij'))]
    Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx,
                                  indexing='ij'))]
    # Drop the "unlabelled" indicator column before solving.
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    # Closed form: Y_u = (I - Tuu)^-1 . Tul . Y_l
    Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    # Row-normalise so each row is a label distribution.
    expected /= expected.sum(axis=1)[:, np.newaxis]

    assert_array_almost_equal(expected, clf.label_distributions_, 4)
def test_select_heuristics_classif():
    """The fdr/fpr/fwe heuristics must all recover the informative features on
    an easy classification problem and agree with SelectFwe."""
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_filter.fit(X, y).transform(X)

    # shuffle=False -> the 5 informative/redundant columns come first.
    expected_support = np.zeros(20)
    expected_support[:5] = 1

    for mode in ['fdr', 'fpr', 'fwe']:
        X_mode = GenericUnivariateSelect(
            f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_fwe, X_mode)
        assert_array_almost_equal(fwe_filter.get_support(), expected_support)
def test_threshold_and_max_features():
    """When both ``max_features`` and ``threshold`` are set, SelectFromModel
    must keep the smaller (more selective) of the two feature sets."""
    X, y = datasets.make_classification(n_samples=1000, n_features=10,
                                        n_informative=3, n_redundant=0,
                                        n_repeated=0, shuffle=False,
                                        random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    # max_features alone (threshold disabled via -inf).
    by_count = SelectFromModel(estimator=est, max_features=3,
                               threshold=-np.inf)
    X_by_count = by_count.fit_transform(X, y)

    # threshold alone.
    by_threshold = SelectFromModel(estimator=est, threshold=0.04)
    X_by_threshold = by_threshold.fit_transform(X, y)

    # Both criteria combined.
    combined = SelectFromModel(estimator=est, max_features=3, threshold=0.04)
    X_combined = combined.fit_transform(X, y)
    assert X_combined.shape[1] == min(X_by_count.shape[1],
                                      X_by_threshold.shape[1])

    # Transforming a row of column indices reveals which columns were kept.
    selected = combined.transform(np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_combined, X[:, selected[0]])
def test_predict_sparse_callable_kernel():
    # This is a non-regression test for #15866: label propagation estimators
    # must accept a callable kernel returning a sparse affinity matrix.

    # Custom sparse kernel (top-K RBF)
    def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
        """RBF kernel sparsified to each point's ``n_neighbors`` nearest
        neighbours."""
        # BUG FIX: ``n_neighbors`` was previously ignored in favour of the
        # hard-coded literal 10 (identical at the default value, wrong for
        # any other).
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean',
                              n_jobs=-1)
        nn.fit(X)
        W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma
        np.exp(W.data, out=W.data)
        assert issparse(W)
        return W.T

    n_classes = 4
    n_samples = 500
    n_test = 10
    X, y = make_classification(n_classes=n_classes,
                               n_samples=n_samples,
                               n_features=20,
                               n_informative=20,
                               n_redundant=0,
                               n_repeated=0,
                               random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=n_test,
                                                        random_state=0)

    model = label_propagation.LabelSpreading(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

    model = label_propagation.LabelPropagation(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9
def test_select_kbest_classif():
    """SelectKBest(k=5) with f_classif must match GenericUnivariateSelect and
    recover the five informative/redundant features (first columns, since
    shuffle=False)."""
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    kbest = SelectKBest(f_classif, k=5)
    X_kbest = kbest.fit(X, y).transform(X)
    X_generic = GenericUnivariateSelect(
        f_classif, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_kbest, X_generic)

    expected_support = np.zeros(20)
    expected_support[:5] = 1
    assert_array_equal(kbest.get_support(), expected_support)
def test_select_percentile_classif_sparse():
    """SelectPercentile with f_classif must handle sparse input, agree with
    GenericUnivariateSelect, and round-trip through inverse_transform."""
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    X = sparse.csr_matrix(X)

    pct_filter = SelectPercentile(f_classif, percentile=25)
    X_pct = pct_filter.fit(X, y).transform(X)
    X_generic = GenericUnivariateSelect(
        f_classif, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_pct.toarray(), X_generic.toarray())

    support = pct_filter.get_support()
    expected_support = np.zeros(20)
    expected_support[:5] = 1
    assert_array_equal(support, expected_support)

    # inverse_transform must stay sparse and restore the selected columns.
    X_inv = pct_filter.inverse_transform(X_generic)
    assert sparse.issparse(X_inv)
    support_mask = safe_mask(X_inv, support)
    assert X_inv.shape == X.shape
    assert_array_equal(X_inv[:, support_mask].toarray(), X_pct.toarray())
    # Check other columns are empty
    assert X_inv.getnnz() == X_pct.getnnz()
def test_valid_alpha():
    """LabelSpreading must reject alpha values outside the open interval
    (0, 1), including None."""
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    for bad_alpha in (-0.1, 0, 1, 1.1, None):
        with pytest.raises(ValueError):
            label_propagation.LabelSpreading(alpha=bad_alpha).fit(X, y)
def test_warm_start_validation():
    """Warm-starting NCA on data whose dimensionality differs from the learnt
    transformation must raise an informative ValueError."""
    X, y = make_classification(n_samples=30, n_features=5, n_classes=4,
                               n_redundant=0, n_informative=5, random_state=0)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    # Same problem, but with one fewer feature.
    X_fewer, y = make_classification(n_samples=30, n_features=4,
                                     n_classes=4, n_redundant=0,
                                     n_informative=4, random_state=0)
    expected_msg = ('The new inputs dimensionality ({}) does not '
                    'match the input dimensionality of the '
                    'previously learned transformation ({}).'
                    .format(X_fewer.shape[1], nca.components_.shape[1]))
    assert_raise_message(ValueError, expected_msg, nca.fit, X_fewer, y)
def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
    """Probability-based ROC scorers must fail loudly on estimators that do
    not implement predict_proba (Perceptron has none)."""
    scorer = get_scorer(scorer_name)
    X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
                               random_state=0)
    clf = Perceptron().fit(X, y)

    expected_msg = "'Perceptron' object has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=expected_msg):
        scorer(clf, X, y)
def test_ovr_single_label_decision_function():
    """For a binary problem, the sign of OvR's decision function must agree
    with predict."""
    X, Y = datasets.make_classification(n_samples=100, n_features=20,
                                        random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]

    ovr = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    decisions = ovr.decision_function(X_test)
    assert_array_equal(decisions.ravel() > 0, ovr.predict(X_test))
def test_partial_dependence_slice_error(with_dataframe, err_msg):
    """Passing a slice as ``features`` must raise a TypeError, for both
    ndarray and DataFrame inputs."""
    X, y = make_classification(random_state=0)
    if with_dataframe:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X)

    model = LogisticRegression().fit(X, y)
    with pytest.raises(TypeError, match=err_msg):
        partial_dependence(model, X, features=slice(0, 2, 1))
def test_forest_feature_importances_sum():
    """Feature importances of a fitted forest must sum to one."""
    X, y = make_classification(n_samples=15, n_informative=3, random_state=1,
                               n_classes=3)
    forest = RandomForestClassifier(min_samples_leaf=5, random_state=42,
                                    n_estimators=200).fit(X, y)
    total = forest.feature_importances_.sum()
    assert math.isclose(1, total, abs_tol=1e-7)
def test_crammer_singer_binary():
    """The Crammer-Singer multiclass formulation must also fit the binary
    case accurately, with and without an intercept."""
    X, y = make_classification(n_classes=2, random_state=0)
    for fit_intercept in (True, False):
        clf = svm.LinearSVC(fit_intercept=fit_intercept,
                            multi_class="crammer_singer",
                            random_state=0).fit(X, y)
        assert clf.score(X, y) > 0.9
def test_multiclass_roc_proba_scorer(scorer_name, metric):
    """A probability ROC scorer must equal the metric computed directly on
    predict_proba output."""
    scorer = get_scorer(scorer_name)
    X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
                               random_state=0)
    model = LogisticRegression(multi_class="multinomial").fit(X, y)
    expected = metric(y, model.predict_proba(X))

    assert scorer(model, X, y) == pytest.approx(expected)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification:
    # sklearn's HistGradientBoostingClassifier and LightGBM must make
    # (almost) the same predictions on the same data.
    # FIX: removed the dead no-op self-assignment ``n_samples = n_samples``.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_select_kbest_all():
    """SelectKBest(k='all') must be the identity on the feature set."""
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)
    selector = SelectKBest(f_classif, k='all')
    assert_array_equal(X, selector.fit(X, y).transform(X))
def test_partial_dependence_unknown_feature_string(estimator):
    """A feature name absent from the DataFrame columns must raise a
    descriptive ValueError."""
    pd = pytest.importorskip("pandas")
    X, y = make_classification(random_state=0)
    df = pd.DataFrame(X)
    estimator.fit(df, y)

    expected_msg = 'A given column is not a column of the dataframe'
    with pytest.raises(ValueError, match=expected_msg):
        partial_dependence(estimator, df, ['random'])
def setup_module():
    """Create memory-mapped classification data shared by this module's tests.

    NOTE(review): TEMP_FOLDER is presumably removed by a teardown_module
    elsewhere in this file — confirm.
    """
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    data_path = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    joblib.dump((X, y, y_ml), data_path)
    # Reload memory-mapped so the tests exercise read-only mmap'ed arrays.
    X_mm, y_mm, y_ml_mm = joblib.load(data_path, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    # FIX: pin random_state so the test is deterministic — without a seed an
    # unlucky 10-sample draw could make this test flaky.
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes, random_state=0)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
def test_calibration_nan_imputer():
    """Calibration must accept NaN in X when the base pipeline imputes
    missing values."""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan

    pipeline = Pipeline([('imputer', SimpleImputer()),
                         ('rf', RandomForestClassifier(n_estimators=1))])
    calibrated = CalibratedClassifierCV(pipeline, cv=2, method='isotonic')
    calibrated.fit(X, y)
    calibrated.predict(X)
def test_make_classification():
    """make_classification must honour shapes and class weights (without
    mutating the weights argument) and produce all-distinct samples."""
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=weights,
                               random_state=0)

    # The caller's weights list must not be mutated.
    assert weights == [0.1, 0.25]
    assert X.shape == (100, 20), "X shape mismatch"
    assert y.shape == (100, ), "y shape mismatch"
    assert np.unique(y).shape == (3, ), "Unexpected number of classes"
    # The unspecified last class absorbs the remaining 65% of samples.
    assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
    assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
    assert sum(y == 2) == 65, "Unexpected number of samples in class #2"

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000, n_features=31, n_informative=31,
                               n_redundant=0, n_repeated=0, hypercube=True,
                               scale=0.5, random_state=0)
    assert X.shape == (2000, 31), "X shape mismatch"
    assert y.shape == (2000, ), "y shape mismatch"
    # View rows as structured records to count unique rows.
    unique_rows = np.unique(
        X.view([('', X.dtype)] * X.shape[1])).view(X.dtype).reshape(
            -1, X.shape[1])
    assert unique_rows.shape[0] == 2000, ("Unexpected number of unique rows")
def test_error_bad_response(pyplot, response_method, msg):
    """Requesting a response method the estimator does not provide must raise
    a ValueError with the parametrized message."""
    X, y = make_classification(n_classes=2, n_samples=50, random_state=0)

    class MyClassifier(BaseEstimator, ClassifierMixin):
        # Minimal classifier exposing neither predict_proba nor
        # decision_function.
        def fit(self, X, y):
            self.fitted_ = True
            self.classes_ = [0, 1]
            return self

    clf = MyClassifier().fit(X, y)
    with pytest.raises(ValueError, match=msg):
        plot_precision_recall_curve(clf, X, y,
                                    response_method=response_method)