def train_detector(data, preprocessor, perc_outlier=5):
    """Fit an ``IForest`` outlier detector and infer its threshold.

    The detector is first fitted on a 30000-sample batch requested with
    zero percent outliers. Its threshold is then inferred from a separate
    1000-sample batch requested with ``perc_outlier`` percent outliers.
    NumPy's global random seed is reset to 0 before each batch is drawn.

    Args:
        data: Object exposing ``data`` and ``target`` attributes, which are
            forwarded to ``create_outlier_batch``.
        preprocessor: Object whose ``transform`` method is applied to each
            batch before it is handed to the detector.
        perc_outlier: Percentage of outliers requested for the threshold
            batch; the threshold percentile used is ``100 - perc_outlier``.

    Returns:
        The fitted detector with its threshold inferred.
    """

    def _sample_features(n_samples, outlier_share):
        # Re-seed before every draw so each batch is reproducible on its own.
        np.random.seed(0)
        batch = create_outlier_batch(
            data.data,
            data.target,
            n_samples=n_samples,
            perc_outlier=outlier_share,
        )
        as_float = batch.data.astype('float')
        return preprocessor.transform(as_float)

    print("Initialize outlier detector.")
    detector = IForest(threshold=None, n_estimators=100)

    print("Training on normal data.")
    detector.fit(_sample_features(30000, 0))

    print("Train on threshold data.")
    detector.infer_threshold(
        _sample_features(1000, perc_outlier),
        threshold_perc=100 - perc_outlier,
    )

    return detector
def test_isolation_forest(iforest_params):
    """Exercise ``IForest`` construction, fitting, thresholding and prediction.

    ``iforest_params`` unpacks into ``(threshold, threshold_perc,
    return_instance_score)``. The detector is fitted on the iris features,
    and the test checks that the inferred threshold puts roughly
    ``threshold_perc`` percent of the scores below it (within 5 points) and
    that ``predict`` agrees with the raw scores.
    """
    threshold, threshold_perc, return_instance_score = iforest_params
    features, _ = load_iris(return_X_y=True)

    # Construction: the threshold is stored and the metadata is populated.
    detector = IForest(threshold)
    assert detector.threshold == threshold
    expected_meta = {
        'name': 'IForest',
        'detector_type': 'offline',
        'data_type': 'tabular'
    }
    assert detector.meta == expected_meta

    detector.fit(features)
    detector.infer_threshold(features, threshold_perc=threshold_perc)

    # Share of scores strictly below the inferred threshold, in percent.
    scores = detector.score(features)
    n_below = (scores < detector.threshold).astype(int).sum()
    perc_below = 100 * n_below / scores.shape[0]
    assert threshold_perc - 5 < perc_below < threshold_perc + 5

    # Predictions must flag exactly the instances scoring above the threshold.
    preds = detector.predict(
        features, return_instance_score=return_instance_score
    )
    assert preds['meta'] == detector.meta
    n_above = (scores > detector.threshold).astype(int).sum()
    assert preds['data']['is_outlier'].sum() == n_above

    if not return_instance_score:
        assert preds['data']['instance_score'] is None