def test_permutation_importance_correlated_feature_regression_pandas(n_jobs):
    """Check that a target-correlated DataFrame column ranks highest.

    The iris data is wrapped in a pandas DataFrame and extended with one
    extra column holding the target plus a small amount of Gaussian noise.
    The permutation importance of that column must exceed the importance
    of every other column.
    """
    pd = pytest.importorskip("pandas")

    n_repeats = 5
    rng = np.random.RandomState(42)

    iris = load_iris()
    features, target = iris.data, iris.target

    # Near-copy of the target, shaped as a single column.
    noise = rng.normal(scale=0.001, size=target.shape[0])
    noisy_target = (target + noise).reshape(-1, 1)

    # The correlated feature is appended as the last column of the frame.
    frame = pd.DataFrame(features, columns=iris.feature_names)
    frame['correlated_feature'] = noisy_target

    forest = RandomForestClassifier(n_estimators=10, random_state=42)
    forest.fit(frame, target)

    result = permutation_importance(
        forest,
        frame,
        target,
        n_repeats=n_repeats,
        random_state=rng,
        n_jobs=n_jobs,
    )

    expected_shape = (frame.shape[1], n_repeats)
    assert result.importances.shape == expected_shape

    # Last column is the correlated one, so it should dominate all others.
    mean_importances = result.importances_mean
    assert np.all(mean_importances[-1] > mean_importances[:-1])
def test_threshold_string():
    """Check that the string threshold "0.5*mean" matches a manual cut-off.

    The columns kept by ``SelectFromModel`` are compared with the columns
    whose importance, taken from the same estimator fitted directly, is
    strictly greater than half the mean importance.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=0)

    selector = SelectFromModel(forest, threshold="0.5*mean")
    selector.fit(data, y)
    selected = selector.transform(data)

    # Recompute the cut-off by hand from the estimator fitted on its own.
    forest.fit(data, y)
    importances = forest.feature_importances_
    cutoff = 0.5 * np.mean(importances)
    keep = importances > cutoff

    assert_array_almost_equal(selected, data[:, keep])
def test_backend_respected():
    """Check which calls go through the active joblib backend.

    ``fit`` is expected to dispatch through the "testing" backend (its
    counter becomes positive), while ``predict_proba`` is expected to
    leave the counter of a fresh "testing" backend at zero.
    """
    forest = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with joblib.parallel_backend("testing") as (fit_backend, _):
        forest.fit(X, y)
    assert fit_backend.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with joblib.parallel_backend("testing") as (predict_backend, _):
        forest.predict_proba(X)
    assert predict_backend.count == 0
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    X = np.array([
        [-1.1, -1.5],
        [-1.2, -1.4],
        [-3.4, -2.2],
        [1.1, 1.2],
        [2.1, 1.4],
        [3.1, 2.3],
    ])
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    named_estimators = [
        ('lr', LogisticRegression(random_state=123)),
        ('rf', RandomForestClassifier(random_state=123)),
        ('gnb', GaussianNB()),
    ]

    # Each base estimator on its own must recover the training labels.
    for _, estimator in named_estimators:
        assert_array_equal(estimator.fit(X, y).predict(X), expected)

    # The ensemble must do the same under both voting schemes.
    for voting in ('hard', 'soft'):
        ensemble = VotingClassifier(
            estimators=named_estimators,
            voting=voting,
            weights=[1, 1, 1],
        )
        assert_array_equal(ensemble.fit(X, y).predict(X), expected)
def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    # Index of the sample the two base estimators are expected to disagree on.
    tie_index = 73

    logistic = LogisticRegression(random_state=123, solver='liblinear')
    forest = RandomForestClassifier(random_state=123)
    ensemble = VotingClassifier(
        estimators=[('lr', logistic), ('rf', forest)],
        voting='hard',
    )

    logistic_label = logistic.fit(X, y).predict(X)[tie_index]
    assert logistic_label == 2

    forest_label = forest.fit(X, y).predict(X)[tie_index]
    assert forest_label == 1

    # One vote each: the hard-voting ensemble must settle on label 1.
    ensemble_label = ensemble.fit(X, y).predict(X)[tie_index]
    assert ensemble_label == 1
def test_dtype_convert(n_classes=15):
    """Check that single-character string labels survive fit and predict.

    An identity matrix gives every sample its own class; the fitted
    ``classes_`` and the predictions on the training data must both equal
    the original list of labels.
    """
    samples = np.eye(n_classes)
    labels = list('ABCDEFGHIJKLMNOPQRSTU'[:n_classes])

    forest = RandomForestClassifier(random_state=0, bootstrap=False)
    predicted = forest.fit(samples, labels).predict(samples)

    assert_array_equal(forest.classes_, labels)
    assert_array_equal(predicted, labels)