def test_score_samples(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(contamination=0.1).fit(X_train) clf2 = IsolationForest().fit(X_train) assert_array_equal(clf1.score_samples([[2., 2.]]), clf1.decision_function([[2., 2.]]) + clf1.offset_) assert_array_equal(clf2.score_samples([[2., 2.]]), clf2.decision_function([[2., 2.]]) + clf2.offset_) assert_array_equal(clf1.score_samples([[2., 2.]]), clf2.score_samples([[2., 2.]]))
def test_max_samples_attribute(): X = iris.data clf = IsolationForest().fit(X) assert clf.max_samples_ == X.shape[0] clf = IsolationForest(max_samples=500) assert_warns_message(UserWarning, "max_samples will be set to n_samples for estimation", clf.fit, X) assert clf.max_samples_ == X.shape[0] clf = IsolationForest(max_samples=0.4).fit(X) assert clf.max_samples_ == 0.4*X.shape[0]
def test_iforest_error(): """Test that it gives proper exception on deficient input.""" X = iris.data # Test max_samples assert_raises(ValueError, IsolationForest(max_samples=-1).fit, X) assert_raises(ValueError, IsolationForest(max_samples=0.0).fit, X) assert_raises(ValueError, IsolationForest(max_samples=2.0).fit, X) # The dataset has less than 256 samples, explicitly setting # max_samples > n_samples should result in a warning. If not set # explicitly there should be no warning assert_warns_message(UserWarning, "max_samples will be set to n_samples for estimation", IsolationForest(max_samples=1000).fit, X) # note that assert_no_warnings does not apply since it enables a # PendingDeprecationWarning triggered by scipy.sparse's use of # np.matrix. See issue #11251. with pytest.warns(None) as record: IsolationForest(max_samples='auto').fit(X) user_warnings = [each for each in record if issubclass(each.category, UserWarning)] assert len(user_warnings) == 0 with pytest.warns(None) as record: IsolationForest(max_samples=np.int64(2)).fit(X) user_warnings = [each for each in record if issubclass(each.category, UserWarning)] assert len(user_warnings) == 0 assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X) assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X) # test X_test n_features match X_train one: assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:]) # test that behaviour='old' will raise an error msg = "The old behaviour of IsolationForest is not implemented anymore." with pytest.raises(NotImplementedError, match=msg): IsolationForest(behaviour='old').fit(X)
def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) for sparse_format in [csc_matrix, csr_matrix]: X_train_sparse = sparse_format(X_train) X_test_sparse = sparse_format(X_test) for params in grid: # Trained on sparse format sparse_classifier = IsolationForest( n_estimators=10, random_state=1, **params).fit(X_train_sparse) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_classifier = IsolationForest( n_estimators=10, random_state=1, **params).fit(X_train) dense_results = dense_classifier.predict(X_test) assert_array_equal(sparse_results, dense_results)
def test_iforest(): """Check Isolation Forest for various parameter settings.""" X_train = np.array([[0, 1], [1, 2]]) X_test = np.array([[2, 1], [1, 1]]) grid = ParameterGrid({"n_estimators": [3], "max_samples": [0.5, 1.0, 3], "bootstrap": [True, False]}) with ignore_warnings(): for params in grid: IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test)
def test_iforest_performance(): """Test Isolation Forest performs well""" # Generate train/test data rng = check_random_state(2) X = 0.3 * rng.randn(120, 2) X_train = np.r_[X + 2, X - 2] X_train = X[:100] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) X_test = np.r_[X[100:], X_outliers] y_test = np.array([0] * 20 + [1] * 20) # fit the model clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train) # predict scores (the lower, the more normal) y_pred = - clf.decision_function(X_test) # check that there is at most 6 errors (false positive or false negative) assert roc_auc_score(y_test, y_pred) > 0.98
def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng) clf = IsolationForest(max_features=0.8) clf.fit(X_train, y_train) clf.predict(X_test)
def test_iforest_warm_start(): """Test iterative addition of iTrees to an iForest """ rng = check_random_state(0) X = rng.randn(20, 2) # fit first 10 trees clf = IsolationForest(n_estimators=10, max_samples=20, random_state=rng, warm_start=True) clf.fit(X) # remember the 1st tree tree_1 = clf.estimators_[0] # fit another 10 trees clf.set_params(n_estimators=20) clf.fit(X) # expecting 20 fitted trees and no overwritten trees assert len(clf.estimators_) == 20 assert clf.estimators_[0] is tree_1
def test_iforest_works(contamination): # toy sample (the last two samples are outliers) X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]] # Test IsolationForest clf = IsolationForest(random_state=rng, contamination=contamination) clf.fit(X) decision_func = -clf.decision_function(X) pred = clf.predict(X) # assert detect outliers: assert np.min(decision_func[-2:]) > np.max(decision_func[:-2]) assert_array_equal(pred, 6 * [1] + 2 * [-1])
def test_iforest_with_uniform_data(): """Test whether iforest predicts inliers when using uniform data""" # 2-d array of all 1s X = np.ones((100, 10)) iforest = IsolationForest() iforest.fit(X) rng = np.random.RandomState(0) assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) assert all(iforest.predict(X + 1) == 1) assert all(iforest.predict(X - 1) == 1) # 2-d array where columns contain the same value across rows X = np.repeat(rng.randn(1, 10), 100, 0) iforest = IsolationForest() iforest.fit(X) assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) assert all(iforest.predict(np.ones((100, 10))) == 1) # Single row X = rng.randn(1, 10) iforest = IsolationForest() iforest.fit(X) assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) assert all(iforest.predict(np.ones((100, 10))) == 1)
def test_iforest_deprecation(): iforest = IsolationForest(behaviour='new') warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24" with pytest.warns(DeprecationWarning, match=warn_msg): iforest.fit(iris.data)
import matplotlib.pyplot as plt from mrex.ensemble import IsolationForest rng = np.random.RandomState(42) # Generate train data X = 0.3 * rng.randn(100, 2) X_train = np.r_[X + 2, X - 2] # Generate some regular novel observations X = 0.3 * rng.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng) ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train) ensemble.set_params(n_jobs=1) y1 = ensemble.predict(X_test) ensemble.set_params(n_jobs=2) y2 = ensemble.predict(X_test) assert_array_almost_equal(y1, y2) ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train) y3 = ensemble.predict(X_test) assert_array_almost_equal(y1, y3)
def test_recalculate_max_depth(): """Check max_depth recalculation when max_samples is reset to n_samples""" X = iris.data clf = IsolationForest().fit(X) for est in clf.estimators_: assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
# Example settings n_samples = 300 outliers_fraction = 0.15 n_outliers = int(outliers_fraction * n_samples) n_inliers = n_samples - n_outliers # define outlier/anomaly detection methods to be compared anomaly_algorithms = [("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)), ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)), ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)), ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))] # Define datasets blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2) datasets = [ make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0], make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0], make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3], **blobs_params)[0], 4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
def test_max_samples_consistency(): # Make sure validated max_samples in iforest and BaseBagging are identical X = iris.data clf = IsolationForest().fit(X) assert clf.max_samples_ == clf._max_samples
if dat in ('http', 'smtp'): y = (y != b'normal.').astype(int) print_outlier_ratio(y) n_samples, n_features = X.shape n_samples_train = n_samples // 2 X = X.astype(float) X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] print('--- Fitting the IsolationForest estimator...') model = IsolationForest(n_jobs=-1, random_state=random_state) tstart = time() model.fit(X_train) fit_time = time() - tstart tstart = time() scoring = -model.decision_function(X_test) # the lower, the more abnormal print("--- Preparing the plot elements...") if with_decision_function_histograms: fig, ax = plt.subplots(3, sharex=True, sharey=True) bins = np.linspace(-0.5, 0.5, 200) ax[0].hist(scoring, bins, color='black') ax[0].set_title('Decision function for %s dataset' % dat) ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data') ax[1].legend(loc="lower right")