Пример #1
0
def train_detector(data, preprocessor, perc_outlier=5):
    """Fit an IForest detector and calibrate its outlier threshold.

    The detector is first fitted on a 30000-sample batch requested with
    ``perc_outlier=0``; its threshold is then inferred from a separate
    1000-sample batch requested with ``perc_outlier`` percent outliers.
    Both batches are passed through ``preprocessor.transform`` before they
    reach the detector.

    Args:
        data: Object exposing ``.data`` and ``.target``, forwarded to
            ``create_outlier_batch``.
        preprocessor: Object exposing ``transform``.
        perc_outlier: Outlier percentage requested for the threshold batch;
            the threshold percentile used is ``100 - perc_outlier``.

    Returns:
        The fitted ``IForest`` instance after ``infer_threshold`` was called.
    """

    def _draw_features(n_samples, outlier_share):
        # Reseed before every draw so each batch is sampled from the same
        # global NumPy RNG state, exactly as two independent seeded calls.
        np.random.seed(0)
        batch = create_outlier_batch(data.data,
                                     data.target,
                                     n_samples=n_samples,
                                     perc_outlier=outlier_share)
        # Targets of the batch are not needed here; only features are used.
        return preprocessor.transform(batch.data.astype('float'))

    print("Initialize outlier detector.")
    detector = IForest(threshold=None, n_estimators=100)

    print("Training on normal data.")
    detector.fit(_draw_features(30000, 0))

    print("Train on threshold data.")
    detector.infer_threshold(_draw_features(1000, perc_outlier),
                             threshold_perc=100 - perc_outlier)

    return detector
Пример #2
0
def test_isolation_forest(iforest_params):
    """Exercise IForest construction, fitting, thresholding and prediction.

    ``iforest_params`` unpacks into ``(threshold, threshold_perc,
    return_instance_score)``. The detector is fitted on the iris features,
    its threshold is inferred at ``threshold_perc``, and the share of scores
    below that threshold must land within 5 percentage points of
    ``threshold_perc``. The prediction output is then checked for matching
    metadata and outlier count.
    """
    threshold, threshold_perc, return_instance_score = iforest_params
    features, _ = load_iris(return_X_y=True)

    # Construction: the threshold is stored as given and meta is fixed.
    detector = IForest(threshold)
    assert detector.threshold == threshold
    expected_meta = {
        'name': 'IForest',
        'detector_type': 'offline',
        'data_type': 'tabular'
    }
    assert detector.meta == expected_meta

    # Fit, then derive the threshold from the same data.
    detector.fit(features)
    detector.infer_threshold(features, threshold_perc=threshold_perc)

    # Percentage of instances scoring strictly below the inferred threshold.
    scores = detector.score(features)
    n_below = (scores < detector.threshold).astype(int).sum()
    perc_below = 100 * n_below / scores.shape[0]
    assert threshold_perc - 5 < perc_below < threshold_perc + 5

    # Predictions must flag exactly the instances scoring above the threshold.
    preds = detector.predict(features,
                             return_instance_score=return_instance_score)
    assert preds['meta'] == detector.meta
    n_above = (scores > detector.threshold).astype(int).sum()
    assert preds['data']['is_outlier'].sum() == n_above
    if not return_instance_score:
        assert preds['data']['instance_score'] is None