def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Only integer sample counts are exercised, both one at a time and as a
    single array call.
    """
    # Reference values for 5 and 999 samples, written out explicitly so the
    # test does not depend on the implementation under test.
    expected_for_5 = 2. * (np.log(4.) + euler_gamma) - 2. * 4. / 5.
    expected_for_999 = 2. * (np.log(998.) + euler_gamma) - 2. * 998. / 999.

    # (sample count, expected average path length)
    cases = [
        (1, 1.),
        (5, expected_for_5),
        (999, expected_for_999),
    ]

    # Scalar inputs, checked one by one.
    for n_samples, expected in cases:
        assert_almost_equal(_average_path_length(n_samples), expected,
                            decimal=10)

    # The same sample counts passed as one array.
    assert_array_almost_equal(
        _average_path_length(np.array([n_samples for n_samples, _ in cases])),
        [expected for _, expected in cases],
        decimal=10,
    )
def test_isolation_forest():
    """Check that SHAP values reproduce ``IsolationForest.score_samples``.

    The per-sample SHAP values plus the explainer's expected value are turned
    into a score via ``-(2 ** (-total / c))``, where ``c`` is the average path
    length for the forest's ``max_samples_``; the result must match
    ``score_samples`` to within ``1e-7``.
    """
    import shap
    import numpy as np
    from sklearn.ensemble import IsolationForest
    from sklearn.ensemble.iforest import _average_path_length

    # Only the feature matrix is needed; the targets are discarded.
    X, _ = shap.datasets.boston()

    forest = IsolationForest(behaviour='new', contamination='auto')
    forest.fit(X)

    explainer = shap.TreeExplainer(forest)
    shap_values = explainer.shap_values(X)

    # Sum of SHAP values over features plus the explainer's base value.
    shap_total = np.sum(shap_values, axis=1) + explainer.expected_value
    normalizer = _average_path_length(np.array([forest.max_samples_]))[0]

    # The leading minus applies to the whole power, i.e. -(2 ** exponent).
    exponent = -shap_total / normalizer
    score_from_shap = -(2 ** exponent)

    assert np.allclose(forest.score_samples(X), score_from_shap, atol=1e-7)
def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Covers the integer case only. Also checks inputs <= 2 (issue #11839) and
    that the values over ``np.arange(5)`` are already in sorted order.
    """
    expected_for_5 = 2. * (np.log(4.) + np.euler_gamma) - 2. * 4. / 5.
    expected_for_999 = 2. * (np.log(998.) + np.euler_gamma) - 2. * 998. / 999.

    # Small inputs (issue #11839): (sample count, expected path length).
    for n_samples, expected in ((0, 0), (1, 0), (2, 1)):
        assert _average_path_length(n_samples) == pytest.approx(expected)

    # Larger scalar inputs against the explicitly computed references.
    assert_allclose(_average_path_length(5), expected_for_5)
    assert_allclose(_average_path_length(999), expected_for_999)

    # Array input.
    sample_counts = np.array([1, 2, 5, 999])
    assert_allclose(_average_path_length(sample_counts),
                    [0., 1., expected_for_5, expected_for_999])

    # _average_path_length is increasing: sorting must leave the values as is.
    path_lengths = _average_path_length(np.arange(5))
    assert_array_equal(path_lengths, np.sort(path_lengths))
def test_iforest_average_path_length():
    """Non-regression test for #8549 (wrong average path length formula).

    Covers the integer case only, always passing array-like input. Also
    checks inputs <= 2 (issue #11839) and that the values over
    ``np.arange(5)`` are already in sorted order.
    """
    expected_for_5 = 2.0 * (np.log(4.0) + np.euler_gamma) - 2.0 * 4.0 / 5.0
    expected_for_999 = (
        2.0 * (np.log(998.0) + np.euler_gamma) - 2.0 * 998.0 / 999.0
    )

    # (sample count, expected average path length), one element at a time.
    single_element_cases = [
        (0, 0.0),
        (1, 0.0),
        (2, 1.0),
        (5, expected_for_5),
        (999, expected_for_999),
    ]
    for n_samples, expected in single_element_cases:
        assert_allclose(_average_path_length([n_samples]), [expected])

    # Several sample counts in a single call.
    sample_counts = np.array([1, 2, 5, 999])
    expected_lengths = [0.0, 1.0, expected_for_5, expected_for_999]
    assert_allclose(_average_path_length(sample_counts), expected_lengths)

    # _average_path_length is increasing: sorting must leave the values as is.
    path_lengths = _average_path_length(np.arange(5))
    assert_array_equal(path_lengths, np.sort(path_lengths))
def test_pyod_isolation_forest():
    """Check that SHAP values reproduce the scores of a pyod ``IForest``.

    Runs once for each ``max_features`` setting (1.0 and 0.75). The
    per-sample SHAP values plus the explainer's expected value are turned
    into a score via ``-(2 ** (-total / c))``, where ``c`` is the average
    path length for the model's ``max_samples_``; the result must match
    ``detector_.score_samples`` to within ``1e-7``.
    """
    import shap
    import numpy as np
    from pyod.models.iforest import IForest
    from sklearn.ensemble.iforest import _average_path_length

    # Only the feature matrix is needed; the targets are discarded.
    X, _ = shap.datasets.boston()

    for feature_fraction in (1.0, 0.75):
        model = IForest(max_features=feature_fraction)
        model.fit(X)

        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        # Sum of SHAP values over features plus the explainer's base value.
        shap_total = np.sum(shap_values, axis=1) + explainer.expected_value
        normalizer = _average_path_length(np.array([model.max_samples_]))[0]

        # The leading minus applies to the whole power: -(2 ** exponent).
        exponent = -shap_total / normalizer
        score_from_shap = -(2 ** exponent)

        # NOTE(review): detector_ is presumably the wrapped scikit-learn
        # estimator -- confirm against pyod.
        reference_scores = model.detector_.score_samples(X)
        assert np.allclose(reference_scores, score_from_shap, atol=1e-7)