Example #1
from pyod.models.loda import LODA
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train LODA detector
    clf_name = 'LODA'
    clf = LODA()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
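
The pyod example this snippet follows usually finishes by plotting the results; a minimal sketch of that step, assuming the `visualize` helper from `pyod.utils.example` is available:

    # a minimal sketch: plot ground-truth vs. predicted labels for both splits
    from pyod.utils.example import visualize

    visualize(clf_name, X_train, y_train, X_test, y_test,
              y_train_pred, y_test_pred, show_figure=True, save_figure=False)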
Example #2
import unittest

from numpy.testing import assert_equal
from numpy.testing import assert_raises
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

from pyod.models.loda import LODA
from pyod.utils.data import generate_data

class TestLODA(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = LODA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'projections_')
                and self.clf.projections_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
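
To run this suite directly as a script rather than through a test runner, the standard unittest entry point can be appended:

if __name__ == '__main__':
    unittest.main()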
Example #3
iterate_threshold = True

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s P%(process)d %(levelname)s %(message)s",
    )

    # load dataset
    data_dict = load_dataset(
        dataset,
        subdataset,
        "all",
    )

    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    # fit the LODA detector on the training split
    od = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)

    anomaly_label = x_test_labels

    # Make evaluation
    evaluate_all(anomaly_score, anomaly_label)
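
If binary labels are needed in addition to the raw scores, the fitted detector's threshold can be applied via pyod's standard API; a minimal sketch (the `anomaly_pred` name is illustrative):

    # a minimal sketch: derive binary labels from the fitted detector
    # (threshold_ is set during fit() from the contamination parameter)
    anomaly_pred = od.predict(x_test)  # 0 = normal, 1 = anomalous
    print("decision threshold:", od.threshold_)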
Example #4
                      random_state=42)
    # load pretrained models
    prepare_trained_model()

    # recommended models
    selected_models = select_model(X_train, n_selection=100)

    print("Showing the top recommended models...")
    for i, model in enumerate(selected_models):
        print(i, model)

    print()

    model_1 = LODA(n_bins=5, n_random_cuts=100)
    print(
        "1st model Average Precision",
        average_precision_score(y_train,
                                model_1.fit(X_train).decision_scores_))

    model_10 = LODA(n_bins=5, n_random_cuts=20)
    print(
        "10th model Average Precision",
        average_precision_score(y_train,
                                model_10.fit(X_train).decision_scores_))

    model_50 = OCSVM(kernel='sigmoid', nu=0.6)
    print(
        "50th model Average Precision",
        average_precision_score(y_train,
                                model_50.fit(X_train).decision_scores_))
Example #5
        # Grogger (1991) arrest data (Wooldridge CRIME1)
        df['arr86'] = (df['narr86'] >= 1).astype(int)
        Y = df['arr86']
        X = df[[
            'pcnv', 'avgsen', 'tottime', 'ptime86', 'inc86', 'black', 'hispan',
            'born60'
        ]]

    print(i, X.shape, Y.shape)

    if OD_Flag:

        # clf = HBOS(contamination=0.05)
        # clf = IForest(contamination=0.05)
        clf = LODA(contamination=0.05)
        clf.fit(X)

        # remove outliers
        X = X.loc[clf.labels_ == 0]
        Y = Y.loc[clf.labels_ == 0]

    X = sm.add_constant(X)

    # general OLS
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
    # model=sm.OLS(Y, X.astype(float))

    # robust regression
    # https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html
    # model=sm.RLM(Y, X.astype(float))
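
The snippet stops before either model is fitted; a minimal sketch of the usual statsmodels continuation, assuming plain OLS is the intended baseline:

    # a minimal sketch: fit OLS on the (optionally outlier-filtered) data
    model = sm.OLS(Y, X.astype(float))
    results = model.fit()
    print(results.summary())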
Example #6
sklearn_score_anomalies = clf.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_lof_ws = evaluate.AUC(original_paper_score, y_test)

# --- LODA --- #
aucs_loda_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    loda = LODA()
    loda.fit(X_train)
    y_pred_proba_loda = np.zeros(X_test.shape[0])
    for i in tqdm(range(X_test.shape[0])):
        loda.fit(X_test[i, :].reshape(1, -1))
        y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape(
            1, -1))
    aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test)
auc_loda_ws = np.mean(aucs_loda_ws)

# --- HalfSpaceTrees --- #
aucs_hst_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    hst = HalfSpaceTrees(n_features=X_train_hst.shape[1], n_estimators=100)
    hst.fit(X_train_hst, np.zeros(X_train_hst.shape[0]))
    y_pred_proba_hst = np.zeros(X_test_hst.shape[0])
    for i in tqdm(range(X_test_hst.shape[0])):