def test_klpe_contamination():
    """Check that predict agrees with the contamination parameter.

    For both AverageKLPE and MaxKLPE, the fraction of training samples
    predicted as 1 must be equal to ``1 - contamination``.
    """
    # This requires a certain amount of data samples because the threshold
    # is defined by a quantile.
    # A locally seeded generator keeps the test reproducible from run to run
    # and leaves NumPy's global random state untouched.
    local_rng = np.random.RandomState(42)
    X = local_rng.randn(50, 2)
    contamination = 0.1

    for estimator_class in (AverageKLPE, MaxKLPE):
        clf = estimator_class(k=5, contamination=contamination)
        clf.fit(X)
        predicted_one_fraction = np.mean(clf.predict(X) == 1)
        assert_almost_equal(predicted_one_fraction, 1 - contamination)
def test_score_train_novelty_or_not():
    """Check scores_fit_ is the same with or without novelty=True.

    For both AverageKLPE and MaxKLPE, two estimators are fitted on the same
    data, one built with ``novelty=True`` and one without the argument, and
    their ``scores_fit_`` attributes must be exactly equal.
    """
    # A locally seeded generator keeps the test reproducible from run to run
    # and leaves NumPy's global random state untouched.
    local_rng = np.random.RandomState(0)
    X = local_rng.randn(50, 2)

    for estimator_class in (AverageKLPE, MaxKLPE):
        clf_plain = estimator_class(k=10)
        clf_novelty = estimator_class(k=10, novelty=True)
        clf_plain.fit(X)
        clf_novelty.fit(X)
        assert_array_equal(clf_plain.scores_fit_, clf_novelty.scores_fit_)
def test_compute_volumes():
    """Check _compute_volumes for several masses.

    Each estimator is fitted on the module-level ``X_train`` and its
    ``score_samples`` method is handed to ``_compute_volumes`` together with
    sorted random masses, ``X_test``, ``U`` and ``vol_tot_cube``. The returned
    volumes and offsets are then checked for ordering, for range, and for
    agreement between the empirical mass above each offset and the requested
    mass.
    """
    candidates = [
        AverageKLPE(k=3, novelty=True),
        MaxKLPE(k=3, novelty=True),
        OCSVM(sigma=1.),
        IsolationForest(n_estimators=5, random_state=2),
        KernelSmoothing(),
    ]

    # Five random masses taken in {0.01, ..., 0.99}, in increasing order.
    alphas = np.sort(rng.randint(1, 100, size=5) / 100)

    for estimator in candidates:
        fitted = estimator.fit(X_train)
        test_scores = fitted.score_samples(X_test)
        lowest_score = np.min(test_scores)
        highest_score = np.max(test_scores)

        vols, offsets = _compute_volumes(fitted.score_samples, alphas,
                                         X_test, U, vol_tot_cube)

        # check increasing order of volumes and decreasing order of offsets
        assert_array_equal(vols, np.sort(vols))
        assert_array_equal(offsets, -np.sort(-offsets))

        # check volumes in [0, vol_tot_cube]
        vols_within_cube = (np.all(0 <= vols)
                            and np.all(vols <= vol_tot_cube))
        assert_true(vols_within_cube)

        # check offset values
        offsets_within_scores = (np.all(lowest_score <= offsets)
                                 and np.all(offsets <= highest_score))
        assert_true(offsets_within_scores)

        # One row per offset: which test scores are at or above that offset.
        at_or_above_offset = test_scores >= offsets[:, np.newaxis]
        empirical_masses = np.mean(at_or_above_offset, axis=1)

        # this test requires to have a large number of samples because
        # np.percentile is an empirical quantile which uses interpolation.
        # this is also why we ask the values to be equal only up to the
        # second decimal.
        assert_array_almost_equal(empirical_masses, alphas, decimal=2)
def test_maxklpe():
    """Check MaxKLPE scores and predictions against hand-computed values.

    The estimator is fitted on the module-level ``X_train`` with ``k=2`` and
    ``contamination=0.7``, once without the ``novelty`` argument (evaluated on
    ``X_train``) and once with ``novelty=True`` (evaluated on ``X_test``).
    """
    sqrt_10 = np.sqrt(10)
    expected_train_scores = -np.array([sqrt_10, 2, sqrt_10])
    expected_train_pred = np.array([0, 1, 0])
    expected_test_scores = -np.array([np.sqrt(5), 2])
    expected_test_pred = np.array([1, 1])

    # when novelty=False, i.e. scores and predict on X_train itself
    plain_clf = MaxKLPE(k=2, contamination=0.7)
    plain_clf.fit(X_train)
    assert_equal(plain_clf.algo, 'max')

    assert_array_almost_equal(plain_clf.scores_fit_, expected_train_scores)

    train_scores = plain_clf.score_samples(X_train)
    assert_array_almost_equal(train_scores, expected_train_scores)

    # predict must agree with thresholding the scores at threshold_
    thresholded = (train_scores >= plain_clf.threshold_).astype(int)
    assert_array_equal(thresholded, plain_clf.predict(X_train))
    assert_array_equal(expected_train_pred, plain_clf.predict(X_train))

    # when novelty=True, i.e. scores and predict on X_test
    novelty_clf = MaxKLPE(k=2, contamination=0.7, novelty=True)
    novelty_clf.fit(X_train)

    assert_array_almost_equal(novelty_clf.scores_fit_, expected_train_scores)

    test_scores = novelty_clf.score_samples(X_test)
    assert_array_almost_equal(test_scores, expected_test_scores)
    assert_array_equal(expected_test_pred, novelty_clf.predict(X_test))