예제 #1
0
    def setUp(self):
        """Create a synthetic data set and fit a COPOD detector on its training part.

        Stores the data-generation settings, the four arrays returned by
        ``generate_data`` and the fitted detector (``self.clf``) on the test
        case. ``self.roc_floor`` is only stored here; it is not used in this
        method.
        """
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=10,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        # Fit on the training samples only; labels are not passed to fit().
        self.clf = COPOD(contamination=self.contamination)
        self.clf.fit(self.X_train)
예제 #2
0
 def model_init(self, model):
     """Create the detector and the scaler for a single model key.

     The detector type is chosen by ``self.model``; every detector is built
     with ``self.contamination``. Any value of ``self.model`` that is not
     listed below gets an HBOS detector, exactly like the explicit ``'hbos'``
     entry. The new detector is stored in ``self.models[model]`` and a fresh
     ``MinMaxScaler`` in ``self.custom_model_scalers[model]``.
     """
     # Zero-argument builders so only the selected detector is constructed.
     builders = {
         'pca': lambda: PCA(contamination=self.contamination),
         'loda': lambda: LODA(contamination=self.contamination),
         'iforest': lambda: IForest(n_estimators=50,
                                    bootstrap=True,
                                    behaviour='new',
                                    contamination=self.contamination),
         'cblof': lambda: CBLOF(n_clusters=3,
                                contamination=self.contamination),
         'feature_bagging': lambda: FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination),
         'copod': lambda: COPOD(contamination=self.contamination),
         'hbos': lambda: HBOS(contamination=self.contamination),
     }
     # Unknown model kinds fall back to HBOS.
     build_detector = builders.get(self.model, builders['hbos'])

     self.models[model] = build_detector()
     self.custom_model_scalers[model] = MinMaxScaler()
예제 #3
0
    def setUp(self):
        """Load (or generate) data, split it, and fit a SUOD ensemble.

        Tries to read ``data/cardio.mat`` next to this file. If loading raises
        ``TypeError`` or ``IOError`` a message is printed and generated data is
        used instead. The data is split 60/40 with a fixed seed and a SUOD
        model with four base estimators is fitted on the training part.
        """
        here = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'

        try:
            mat = loadmat(path.join(here, 'data', mat_file))
        except (TypeError, IOError):
            # Source data missing/unreadable: fall back to generated data.
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)
        else:
            X, y = check_X_y(mat['X'], mat['y'].ravel())

        split = train_test_split(X, y, test_size=0.4, random_state=42)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
        self.clf = SUOD(base_estimators=self.base_estimators)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.7
예제 #4
0
    def setUp(self):
        """Fit two COPOD detectors on the same generated training data.

        ``self.clf`` is built with ``n_jobs=2``; ``self.clf_`` is built without
        an ``n_jobs`` argument (the original comment calls it the
        single-thread copy). Both use the same contamination rate.
        """
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.n_train, self.n_test = 200, 100

        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
             n_train=self.n_train, n_test=self.n_test, n_features=10,
             contamination=self.contamination, random_state=42)

        # Detector configured with two jobs.
        self.clf = COPOD(contamination=self.contamination, n_jobs=2)
        self.clf.fit(self.X_train)

        # Reference detector with the default n_jobs, fitted on the same data.
        self.clf_ = COPOD(contamination=self.contamination)
        self.clf_.fit(self.X_train)
예제 #5
0
def main():
    """Run ``runByScaler`` once per scaler option, in parallel.

    Builds a dictionary of 30 named detector instances plus a list of model
    names passed on as ``other_models``, then dispatches one ``runByScaler``
    job per entry of ``scalers`` through ``Parallel`` with three workers.
    Each job receives ``CPUS=4`` and the save name ``"30_Models"``.
    """
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start, counts = 0, 90
    outer_jobs, inner_jobs = 3, 4

    # Names forwarded to runByScaler as `other_models`.
    other_model_names = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder',
    ]

    # Insertion order is kept identical to the original definition.
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    # One delayed call per scaler; the generator is consumed by Parallel.
    jobs = (
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=other_model_names,
                             CPUS=inner_jobs,
                             save_name=name)
        for scaler in scalers
    )
    Parallel(n_jobs=outer_jobs)(jobs)
예제 #6
0
def remove_outlier(data, x, row, contamination):
    """Return the rows of one class that COPOD labels as inliers.

    Parameters
    ----------
    data : 2-D numpy array whose column 512 holds the class label.
    x : class label to select (per the original note: 0 for live,
        1 for spoof).
    row : upper bound on the number of selected rows that are kept.
    contamination : contamination rate passed to ``COPOD``.

    Returns
    -------
    numpy array containing the selected rows whose COPOD label is 0, with
    that label appended as one extra last column.
    """
    # Rows of the requested class, truncated to at most `row` entries.
    selected = data[data[:, 512] == x][0:row, :]

    # NOTE(review): 0:511 leaves out column 511 although the label sits in
    # column 512 -- looks like an off-by-one, but it is kept as-is because
    # changing it would alter the fitted model. Confirm with the data layout.
    train_features_x = selected[:, 0:511]

    clf = COPOD(contamination=contamination)
    clf.fit(train_features_x)

    # reshape(-1, 1) instead of reshape(row, 1): fewer than `row` rows may
    # match the class, in which case the fixed shape raised a ValueError.
    flags = np.asarray(clf.labels_).reshape(-1, 1)
    with_flags = np.hstack((selected, flags))

    # The appended flag is always the last column; indexing it as -1 keeps
    # working when `data` has more than 513 columns.
    return with_flags[with_flags[:, -1] == 0]
예제 #7
0
 def models_init(self):
     """Create one detector and one scaler for every model in scope.

     Reads the detector kind from ``self.configuration['model']`` (default
     ``'pca'``) into ``self.model``. ``self.models`` then maps each entry of
     ``self.models_in_scope`` to its own new detector built with
     ``self.contamination``; unrecognised kinds get HBOS.
     ``self.custom_model_scalers`` maps each entry to a new ``MinMaxScaler``.
     """
     self.model = self.configuration.get('model', 'pca')

     # Zero-argument builders: called once per model key further down.
     builders = {
         'pca': lambda: PCA(contamination=self.contamination),
         'loda': lambda: LODA(contamination=self.contamination),
         'iforest': lambda: IForest(n_estimators=50,
                                    bootstrap=True,
                                    behaviour='new',
                                    contamination=self.contamination),
         'cblof': lambda: CBLOF(n_clusters=3,
                                contamination=self.contamination),
         'feature_bagging': lambda: FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination),
         'copod': lambda: COPOD(contamination=self.contamination),
         'hbos': lambda: HBOS(contamination=self.contamination),
     }
     # Same fallback as the original else-branch.
     build_detector = builders.get(self.model, builders['hbos'])

     self.models = {
         model: build_detector() for model in self.models_in_scope
     }
     self.custom_model_scalers = {
         model: MinMaxScaler() for model in self.models_in_scope
     }
예제 #8
0
파일: model.py 프로젝트: esowc/DAAQS
    def pred_COPOD(self, comp_with="openaq"):
        """Label observations with COPOD and map the labels to locations.

        Parameters
        ----------
        comp_with : ``"openaq"`` fits one COPOD model on ``self.X_o`` and uses
            all of its training labels; ``"cams"`` fits one COPOD model per
            element of ``self.X_c`` and keeps only the label of the last row
            of each.

        Returns
        -------
        The three values produced by ``self.pred_location`` for the labels.

        Raises
        ------
        ValueError
            If ``comp_with`` is neither ``"openaq"`` nor ``"cams"``
            (previously this surfaced as an UnboundLocalError on ``pred``).
        """
        self.comp_with = comp_with

        if comp_with == "openaq":
            # len() rather than `== []`: the latter is not a reliable
            # emptiness test when X_o is an array instead of a list.
            if len(self.X_o) == 0:
                pred = []
            else:
                self.clf = COPOD()
                self.clf.fit(self.X_o)
                pred = self.clf.labels_

        elif comp_with == "cams":
            pred = []
            for each_X in self.X_c:
                # A fresh detector per group; only the last row's label is kept.
                self.clf = COPOD()
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        else:
            raise ValueError(
                "comp_with must be 'openaq' or 'cams', got {!r}".format(
                    comp_with))

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
def _match_classifier_name(classifier, names):
    """Return the entry of *names* selected by *classifier*, or '' if none.

    Matching order: exact name, then the abbreviation in the trailing
    parentheses (e.g. ``'KNN'`` -> ``'K Nearest Neighbors (KNN)'``), then the
    first name that contains *classifier* as a substring.
    """
    names = list(names)
    if classifier in names:
        return classifier
    abbreviation = '({})'.format(classifier)
    for name in names:
        if name.endswith(abbreviation):
            return name
    for name in names:
        if classifier in name:
            return name
    return ''


def detect_anomaly(df_floats: pd.DataFrame,
                   train_size: float,
                   outliers_rate: float,
                   classifier: str,
                   plot: bool = False):
    """Return binary outlier labels and raw outlier scores.

    Trains an anomaly detection model on a subset of the (min-max scaled)
    dataset and returns the binary label and decision score for the whole
    dataset.

    Parameters
    ----------
    df_floats: pd.DataFrame with elements as floats.

    train_size: proportion of the dataset used for training. Values greater
        than 1 are interpreted as percentages and divided by 100.

    outliers_rate: proportion of the training set considered outliers.
        Values greater than or equal to 1 are interpreted as percentages.

    classifier: name of the anomaly detection algorithm. The full name, the
        abbreviation in parentheses (e.g. ``'KNN'``) or any substring of a
        name is accepted; exact and abbreviation matches win over substring
        matches.

    plot: plots 2d contourf of anomaly detection scores.

    Returns
    -------
    y_labels: numpy array of the same length as df_floats that assigns 0/1
        (inlier/outlier) to each observation according to the fitted model.
    y_scores: numpy array of the same length as df_floats that assigns
        outlier scores to each observation according to the fitted model.

    Raises
    ------
    Warning
        If the frame has fewer than 8 rows, or if ``classifier == 'all'``.
    NameError
        If no classifier matches ``classifier``.
    """
    # NOTE(review): the check uses 8 rows but the message mentions 10; the
    # message is left untouched because the intended limit is not known here.
    if df_floats.shape[0] < 8:
        raise Warning(
            'Not enough measurements. Please use DataFrame with at last 10 measurements.'
        )
    if train_size > 1:
        train_size = train_size / 100
    # TODO: Find out empirical way to set contamination level - Tukey's method
    if outliers_rate >= 1:
        outliers_rate = outliers_rate / 100

    random_state = np.random.RandomState(42)

    # TODO: Perform scaling of data ONLY for AKNN, CBLOF, HBOS, KNN, OCSVM. Other classifiers are not influenced.
    classifiers = {
        'Average KNN (AKNN)':
        KNN(method='mean', contamination=outliers_rate),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_rate,
              check_estimator=False,
              random_state=random_state),
        'Copula based Outlier Detection (COPOD)':
        COPOD(contamination=outliers_rate),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_rate),
        'Isolation Forest (IForest)':
        IForest(contamination=outliers_rate, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_rate),
        'One-Class SVM (OCSVM)':
        OCSVM(contamination=outliers_rate),
        'Principal component analysis (PCA)':
        PCA(contamination=outliers_rate)
    }

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df_floats)
    df_scaled = pd.DataFrame(scaled,
                             index=df_floats.index,
                             columns=df_floats.columns)

    # Only the training part is used; the split itself is not seeded.
    x_train, _ = train_test_split(df_scaled, train_size=train_size)

    if classifier == 'all':
        # Running every classifier in one call is not implemented.
        raise Warning('This option is currently unsupported.'
                      '\nPlease use one of those classifiers:'
                      '\n{}.'.format(list(classifiers.keys())))

    # A plain first-substring search made 'KNN' resolve to
    # 'Average KNN (AKNN)'; exact/abbreviation matches are tried first.
    clf_name = _match_classifier_name(classifier, classifiers)
    if not clf_name:
        raise NameError('Unknown classifier. '
                        'Please use one of those: {}.'.format(
                            list(classifiers.keys())))

    clf = classifiers[clf_name]
    clf.fit(x_train)
    y_labels = clf.predict(df_scaled)  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_function(df_scaled)  # raw outlier scores

    if plot:
        plot_outlier_detection(df_scaled, y_labels, clf, clf_name, scaler)

    return y_labels, y_scores
예제 #10
0
파일: eval.py 프로젝트: ralampay/pyntrainer
    def execute(self):
        """Train the autoencoder detector, optionally run baselines, and report.

        Steps, in order:

        1. Read ``self.input_file`` (headerless CSV, read in chunks of
           ``self.chunk_size``). The last column is the label: rows with 1 are
           treated as positive, rows with -1 as negative.
        2. Sample 70% of the positive rows for training; the remaining
           positive rows plus the negative rows form the validation set. When
           ``self.neg_cont`` is set and positive, the negative rows are
           limited to a shuffled (seed 200) subset sized from that ratio.
        3. Fit ``Autoencoder`` on the training rows and score the validation
           set with ``performance_metrics`` (reported as "AE-D").
        4. Depending on ``self.eval_cat`` ("linear", "prob", "ensemble",
           "proximity", "nn"), evaluate further detectors through
           ``train_and_evaluate_classifier``.
        5. Print the result tables and, if ``self.printout`` is set, write the
           evaluation rows to that CSV path.
        """
        evaluation_results = []
        result_headers = [
            "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
            "TS", "PT", "ACC", "F1", "MCC"
        ]

        print("Loading training data...")
        chunks = []
        for i, chunk in enumerate(
                pd.read_csv(self.input_file,
                            header=None,
                            chunksize=self.chunk_size)):
            print("Reading chunk: %d" % (i + 1))
            chunks.append(chunk)
        # DataFrame.append was removed in pandas 2.0; concatenating once is
        # also cheaper than growing the frame chunk by chunk.
        data = pd.concat(chunks) if chunks else pd.DataFrame()

        # With header=None the columns are 0..n-1, so this is both the label
        # column's name and the number of feature columns.
        label_col = len(data.columns) - 1
        input_dimensionality = label_col
        print("Input Dimensionality: %d" % (input_dimensionality))

        positive_data = data[data[label_col] == 1].iloc[:, :label_col]
        negative_data = data[data[label_col] == -1].iloc[:, :label_col]

        training_data = positive_data.sample(frac=0.70)
        positive_validation_data = positive_data.drop(training_data.index)

        if self.neg_cont and self.neg_cont > 0:
            print("Negative Contamination: %0.4f" % (self.neg_cont))
            num_negative = math.floor(
                self.neg_cont *
                (len(negative_data) + len(positive_validation_data)))
            # Build the mask from the shuffled frame itself so pandas does not
            # have to re-align a mask that is in a different row order.
            shuffled = data.sample(frac=1, random_state=200)
            negative_data = shuffled[shuffled[label_col] == -1].iloc[
                :num_negative, :label_col]

        negative_validation_data = negative_data.copy()

        temp_positive = positive_validation_data.copy()
        temp_positive[input_dimensionality] = 1

        temp_negative = negative_data.copy()
        temp_negative[input_dimensionality] = -1

        validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                                ignore_index=True)
        validation_data = validation_data_with_labels.iloc[:, :label_col]
        validation_labels = validation_data_with_labels.iloc[:, -1:].values

        # Convert to tensor. (The original also moved the full positive and
        # negative sets to the device but never used them afterwards.)
        training_data = torch.tensor(training_data.values).float()
        validation_data = torch.tensor(validation_data.values).float()

        print("Validation Data:")
        print(validation_data)

        ## AE-D TRAINING ##
        print("Initializing autoencoder...")
        net = Autoencoder(layers=self.layers,
                          device=self.device,
                          add_syn=self.add_syn)
        net.to(self.device)

        print(net)

        print("Training Stochastic Autoencoder...")
        net.fit(training_data,
                epochs=self.epochs,
                lr=self.lr,
                batch_size=self.batch_size)

        predictions = net.predict(validation_data)

        tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
            validation_labels, predictions)

        r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]

        evaluation_results.append(r)

        print("AE-D Results:")
        print(tabulate([r], result_headers, tablefmt="grid"))

        # Convert back to CPU before other methods
        validation_data = validation_data.cpu()

        # Train only linear classifiers
        if self.eval_cat == "linear":
            print("Initiating training for linear detectors...")

            ## MCD ##
            print("Training MCD...")
            result = train_and_evaluate_classifier("MCD", MCD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ROBUST COVARIANCE ##
            print("Training Robust Covariance...")
            result = train_and_evaluate_classifier("ROB-COV",
                                                   EllipticEnvelope(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ONE CLASS SVM TRAINING ##
            print("Training OneClassSVM...")
            result = train_and_evaluate_classifier(
                "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data,
                validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "prob":
            ## ABOD ## (disabled)
            #print("Training ABOD...")
            #result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## SOS ## (disabled)
            #print("Training SOS...")
            #result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## COPOD ##
            print("Training COPOD...")
            result = train_and_evaluate_classifier("COPOD", COPOD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "ensemble":
            ## ISOLATION FOREST TRAINING ##
            print("Training Isolation Forest...")
            result = train_and_evaluate_classifier(
                "ISO-F", IsolationForest(random_state=0), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## LODA ##
            print("Training LODA...")
            result = train_and_evaluate_classifier("LODA", LODA(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## LSCP ## (disabled)
            #print("Training LSCP...")
            #result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
            #evaluation_results.append(result)

        elif self.eval_cat == "proximity":
            ## LOCAL OUTLIER FACTOR ##
            print("Training Local Outlier Factor...")
            result = train_and_evaluate_classifier(
                "LOC-OF", LocalOutlierFactor(novelty=True), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## CBLOF ##
            print("Training CBLOF...")
            result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## HBOS ##
            print("Training HBOS...")
            result = train_and_evaluate_classifier("HBOS", HBOS(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "nn":
            ## VAE ##
            print("Training VAE...")
            # list.reverse() reverses in place and returns None, which handed
            # VAE decoder_neurons=None and a reversed encoder while also
            # mutating self.layers. Pass independent copies instead.
            result = train_and_evaluate_classifier(
                "VAE",
                VAE(encoder_neurons=list(self.layers),
                    decoder_neurons=list(reversed(self.layers))),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## SO_GAAL ##
            print("Training SO_GAAL...")
            result = train_and_evaluate_classifier(
                "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## MO_GAAL ##
            print("Training MO_GAAL...")
            result = train_and_evaluate_classifier(
                "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

        ## EVALUATE RESULTS ##
        if self.eval_cat != "none":
            print("Aggregated Results:")
            print(tabulate(evaluation_results, result_headers,
                           tablefmt="grid"))

        ## DATASET METRICS ##
        len_training_data_points = len(training_data)
        len_positive_validations = len(positive_validation_data)
        len_negative_validations = len(negative_validation_data)
        len_validations = len_positive_validations + len_negative_validations

        metrics_results = [
            ["Training Data Points", len_training_data_points],
            ["# Normal Points", len_positive_validations],
            ["# Anomalies", len_negative_validations],
            [
                "Contamination Percentage",
                math.floor((len_negative_validations / len_validations) * 100)
            ]
        ]

        ## EVALUATE RESULTS ##
        print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

        if self.printout:
            print("Saving results to %s" % (self.printout))
            df = pd.DataFrame(evaluation_results)
            df.to_csv(self.printout, header=None, index=False)
예제 #11
0
class TestCOPOD(unittest.TestCase):
    """Checks of the COPOD detector on generated data."""

    def setUp(self):
        """Generate a train/test split and fit the detector on the train part."""
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=10,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = COPOD(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        """The fitted attributes exist and are not None."""
        for fitted_attr in ('decision_scores_', 'labels_', 'threshold_'):
            assert (hasattr(self.clf, fitted_attr) and
                    getattr(self.clf, fitted_attr) is not None)

    def test_train_scores(self):
        """One training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have one entry per sample and reach the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # shape check
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # performance check
        auc = roc_auc_score(self.y_test, scores)
        assert auc >= self.roc_floor

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true test labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_linear(self):
        """Probabilities from method='linear' stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_unify(self):
        """Probabilities from method='unify' stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """fit_predict_score accepts the known scorings and rejects others."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay within the train size."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # ordering of ranks matches ordering of scores (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # ordering of ranks matches ordering of scores (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    # Disabled test kept from the original:
    # def test_plot(self):
    #     os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
    #     assert_array_less(0, os)

    def test_model_clone(self):
        """Cloning the fitted detector must not raise."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
예제 #12
0
def evaluation_od_train(x,
                        y,
                        data_name,
                        model_name="iforest",
                        chosen_subspace=None):
    """Score every anomaly in every candidate feature subspace.

    An outlier detector is fitted on each subspace; the min-max normalised
    scores of the anomalies are collected and two CSV files are written below
    ``eva_root + "data_od_evaluation/"``: the per-subspace score table and,
    for each anomaly, the subspace in which it scored highest.

    :param x: data matrix
    :param y: class information (entries equal to 1 mark anomalies)
    :param data_name: the data set name, used for naming the output files
    :param model_name: anomaly detector name, one of "iforest", "copod",
        "hbos"; default: "iforest"
    :param chosen_subspace: use this to only evaluate a subset of the power
        set of the full feature space
    :return: (anomaly_score_df, g_truth_df): the score table (column
        "ano_idx" first, then one column per subspace) and the table mapping
        each anomaly index to its highest-scoring subspace.
    :raises ValueError: if ``model_name`` is not supported
    """
    detector_classes = {"iforest": IForest, "copod": COPOD, "hbos": HBOS}
    # Validate once, before any subspace is fitted.
    detector_cls = detector_classes.get(model_name)
    if detector_cls is None:
        raise ValueError("unsupported od model")

    dim = x.shape[1]
    ano_idx = np.where(y == 1)[0]
    n_ano = len(ano_idx)

    # get all the possible feature subset or just use given subset list
    f_subsets = utils.get_subset_candidate(dim, chosen_subspace)

    # score anomalies in each subspace, generate the score matrix
    n_subsets = len(f_subsets)
    score_matrix = np.zeros([n_ano, n_subsets])
    for i in tqdm(range(n_subsets)):
        clf = detector_cls()
        clf.fit(x[:, f_subsets[i]])
        od_score = utils.min_max_norm(clf.decision_scores_)
        score_matrix[:, i] = od_score[ano_idx]

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    out_dir = eva_root + "data_od_evaluation/"
    os.makedirs(out_dir, exist_ok=True)

    # score matrix to df, with the anomaly index as the first column
    anomaly_score_df = pd.DataFrame(data=score_matrix,
                                    columns=[str(s) for s in f_subsets])
    anomaly_score_df.insert(0, "ano_idx", ano_idx)
    path1 = out_dir + data_name + "_score_" + model_name + ".csv"
    anomaly_score_df.to_csv(path1, index=False)

    # ground truth: for each anomaly, the subspace with its highest score
    exp_subspaces = [
        str(f_subsets[int(np.argmax(ano_score))]) for ano_score in score_matrix
    ]
    g_truth_df = pd.DataFrame({
        "ano_idx": ano_idx,
        "exp_subspace": exp_subspaces,
    })
    # astype returns a new frame; the original discarded the result.
    g_truth_df = g_truth_df.astype({"exp_subspace": "object"})

    path2 = out_dir + data_name + "_gt_" + model_name + ".csv"
    g_truth_df.to_csv(path2, index=False)
    return anomaly_score_df, g_truth_df
예제 #13
0
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train SUOD
    clf_name = 'SUOD'

    # initialized a group of outlier detectors for acceleration
    detector_list = [
        LOF(n_neighbors=15),
        LOF(n_neighbors=20),
        LOF(n_neighbors=25),
        LOF(n_neighbors=35),
        COPOD(),
        IForest(n_estimators=100),
        IForest(n_estimators=200)
    ]

    # decide the number of parallel process, and the combination method
    clf = SUOD(base_estimators=detector_list,
               n_jobs=2,
               combination='average',
               verbose=False)

    # or to use the default detectors
    # clf = SUOD(n_jobs=2, combination='average',
    #            verbose=False)
    clf.fit(X_train)
예제 #14
0
    elif model == 'MCD':
        clf = MCD()
    elif model == 'OCSVM':
        clf = OCSVM()
    elif model == 'LOF':
        clf = LOF()
    elif model == 'CBLOF':
        clf = CBLOF()
    elif model == 'HBOS':
        clf = HBOS()
    elif model == 'KNN':
        clf = KNN()
    elif model == 'ABOD':
        clf = ABOD()
    else:
        clf = COPOD()

    # fit the model
    clf.fit(X)
    # get outlier scores
    scores = clf.decision_scores_  # raw outlier scores

# Show the ten highest-scoring rows inside the `col2` context
# (presumably a Streamlit column -- confirm where col2 is created).
with col2:
    st.write('Top 10 anomaly scores for the', model, 'model:')
    # Store the scores as a 'scores' column on df_id (modifies df_id in place).
    df_id.loc[:, 'scores'] = scores
    # Rows with the ten largest values of 'scores'.
    top10 = df_id.nlargest(10, 'scores')
    # Index labels of those rows; not used in the lines visible here.
    top10_list = top10.index.tolist()
    st.write(top10)

# Writes the string '---' (presumably rendered as a separator -- confirm).
st.write('---')
def compare(inputdata, labels, n_clusters, dset_name):
    """Benchmark conventional unsupervised outlier detectors on one dataset.

    Every detector is fitted on ``inputdata``; its training scores are
    cleaned (non-finite values become 0), saved to
    ``{dset_name}/{clf_name}_raw.npy``, scored with ROC AUC against
    ``labels`` and finally handed to ``fetch`` together with the dataset's
    label and attribute files.  AUC and fitting time are printed.

    Args:
        inputdata: input data; ``inputdata.shape[1]`` (the feature count) is
            used to size the AutoEncoder / VAE layers
        labels: ground truth outlier labels
        n_clusters: number of clusters, used by the cluster-based CBLOF
        dset_name: dataset name; used as output directory and to locate
            ``../datasets/{DSET_NAME}_Y.npy``

    Returns:
        None.  Results are printed and written to disk.  ``fetch`` is defined
        elsewhere; presumably it reports the Fgap / Frank scores -- TODO
        confirm against its definition.
    """
    print(
        "Competing with conventional unsupervised outlier detection algorithms..."
    )
    random_state = np.random.RandomState(1)

    # Narrow datasets get smaller networks than wide ones.
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    # NOTE: each key must appear only once.  A repeated key in a dict literal
    # keeps the LAST value, which previously replaced the size-aware
    # AutoEncoder / VAE below with default-configured ones.
    classifiers = {
        'PCA':
        PCA(random_state=random_state),
        'AutoEncoder':
        AutoEncoder(batch_size=100,
                    hidden_neurons=AEneurons,
                    random_state=random_state),
        'VAE':
        VAE(batch_size=100,
            encoder_neurons=VAEneurons[0],
            decoder_neurons=VAEneurons[1],
            random_state=random_state),
        'COPOD':
        COPOD(),
        'Iforest':
        IForest(random_state=random_state),
        'LODA':
        LODA(),
        'OCSVM':
        OCSVM(),
        'ABOD':
        ABOD(n_neighbors=20),
        'Fb':
        FeatureBagging(random_state=random_state),
        'CBLOF':
        CBLOF(n_clusters=n_clusters,
              check_estimator=False,
              random_state=random_state),
        'LOF':
        LOF(),
        'COF':
        COF(),
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix some broken scores----- #
        # NaN and +/-inf would break the AUC computation; zero them in place.
        test_scores[~np.isfinite(test_scores)] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
예제 #16
0
    'AvgKNN': KNN(method='mean'),
    'LargestKNN': KNN(method='largest'),
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    #'MoGaal':MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
예제 #17
0
if __name__ == "__main__":
    # Settings for the synthetic dataset.
    contamination = 0.1  # proportion of outliers
    n_train = 200  # training sample count
    n_test = 100  # test sample count

    # Build a reproducible two-feature toy dataset.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # Fit the COPOD detector on the training features.
    clf_name = 'COPOD'
    clf = COPOD()
    clf.fit(X_train)

    # Training-set outputs stored on the fitted detector.
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

    # Outputs for the unseen test data.
    y_test_scores = clf.decision_function(X_test)  # outlier scores
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)

    # Report the evaluation for both splits, training first.
    for heading, y_true, y_scores in (
            ("\nOn Training Data:", y_train, y_train_scores),
            ("\nOn Test Data:", y_test, y_test_scores)):
        print(heading)
        evaluate_print(clf_name, y_true, y_scores)
예제 #18
0
'''
Author: Christian O'Leary
Email: [email protected]
'''

import numpy as np
from pyod.models.copod import COPOD

from emmv import emmv_scores

# Seeded generator so the demo data is reproducible.
rng = np.random.RandomState(42)

NUM_COLS = 2

# Training set: one Gaussian blob mirrored around +2 and -2.
X = 0.3 * rng.randn(100, NUM_COLS)
X_train = np.concatenate((X + 2, X - 2), axis=0)

# Regular novel observations, drawn the same way as the training set.
X = 0.3 * rng.randn(20, NUM_COLS)
X_regular = np.concatenate((X + 2, X - 2), axis=0)

# Abnormal novel observations, spread uniformly over [-4, 4).
X_outliers = rng.uniform(low=-4, high=4, size=(20, NUM_COLS))

# Fit the COPOD detector on the training set.
model = COPOD()
model.fit(X_train)

# Score the detector with Excess-Mass and Mass-Volume on regular + abnormal
# points stacked into one test set.
X_test = np.r_[X_regular, X_outliers]
test_scores = emmv_scores(model, X_test)
print('Excess Mass score;', test_scores['em'])
print('Mass Volume score:', test_scores['mv'])
예제 #19
0
def main():
    """Run the anomaly-detector benchmark: predict, score, then plot.

    1. Build the dictionary of detectors and let ``ADRanker`` produce
       predictions for the ``"datasets"`` data location.
    2. Compute the ``auc`` and ``ave`` metrics for those predictions.
    3. Draw the basic comparison plot and the critical-difference (CD)
       diagrams from the generated result CSV files.

    Returns:
        None.
    """

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    # paths and scalers are parallel lists: entry i of one labels entry i of
    # the other.
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    # paths, names and titles are parallel lists as well.  The second entry
    # is the min-max scaled average-precision file, matching its name and
    # title; it used to point at the unscaled 'ave/no' file, which is already
    # plotted as the fourth entry.
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
예제 #20
0
    # Load the .mat dataset from the local 'data' directory and split it into
    # the feature matrix X and the flattened label vector y.
    mat = loadmat(os.path.join('data', mat_file))
    X = mat['X']
    y = mat['y'].ravel()

    # Hold out 40% of the samples for testing; the split is seeded.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=1)

    # standardizing data for processing
    # NOTE(review): X_train_norm / X_test_norm are not used in the lines
    # visible here -- the detector below is fitted on the raw X_train.
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # train COPOD detector
    clf_name = 'COPOD'
    clf = COPOD()

    # you could try parallel version as well.
    # clf = COPOD(n_jobs=2)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Print the ground-truth label of the first training sample, then ask
    # COPOD to explain that sample (index 0).
    print('The first sample is an outlier', y_train[0])
    clf.explain_outlier(0)

    # we could see that features 7, 16, and 20 are above the 0.99 cutoff
    # and play a more important role in deciding it is an outlier.
예제 #21
0
    def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(),
            quick_methods = True, slow_methods = False, nn_methods = False,
            contamination = 0.05, use_score_rank = False, random_state = None, verbose = 0):
        """Fit a pool of outlier detectors on ``X`` and combine their scores.

        Each enabled detector is fitted independently.  A detector that
        raises is logged and skipped: its score column stays all-zero and is
        dropped before the scores are combined.

        Parameters
        ----------
        X : array with 2 or 3 dimensions
            Input samples.  A 3-D array is flattened to
            ``(X.shape[0], X.shape[1] * X.shape[2])``.
        shrink_cols : bool
            Drop feature columns that are zero for every sample.
        data_scaler : object with ``fit_transform`` or a falsy value
            Scaler applied to ``X``; a falsy value disables scaling.  The
            default instance is shared between calls but is re-fitted by
            ``fit_transform`` on every call.
        quick_methods : bool
            Run the PCA, COPOD, HBOS, LODA, IForest and CBLOF detectors.
        slow_methods : bool
            Run the LOF detectors (several ``n_neighbors`` values).
        nn_methods : bool
            Run ``self.full_autoencoder`` and VAE with layer widths derived
            from the feature count.  Skipped (with a log message) when no
            candidate width is smaller than the feature count.
        contamination : float
            Passed to every detector that is constructed here.
        use_score_rank : bool
            Apply ``rank_fun`` to every score column before min-max scaling.
        random_state : int, RandomState or None
            Forwarded to the detectors that accept it.
        verbose : int
            Forwarded to the neural-network detectors.

        Returns
        -------
        tuple
            ``(score_by_aom, score_by_moa, score_by_max, score_by_avg,
            all_scores, all_scores_norm)``.  With fewer than 12 score columns
            ``score_by_aom`` is the plain average and ``score_by_moa`` the
            plain maximum.

        Raises
        ------
        ValueError
            If ``X`` has more than 3 dimensions.
        """
        # Reject unsupported ranks first; checking "> 2" before "> 3" made
        # this guard unreachable.
        n_dims = len(X.shape)
        if n_dims > 3:
            raise ValueError("Expected number of dimensions: 2 or 3")
        if n_dims == 3:
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

        if shrink_cols:
            X = X[:, ~np.all(X == 0, axis=0)]
            log.info('zero columns shrinked')
        if data_scaler:
            X = data_scaler.fit_transform(X)
            log.info(f'used {data_scaler} data scaler')

        n_rows = X.shape[0]
        n_features = X.shape[1]
        log.info(f'n_rows = {n_rows}, n_features = {n_features}')

        def _collect_scores(classifiers, score_one, describe=lambda name, entry: name):
            """Return an (n_rows, len(classifiers)) matrix of outlier scores."""
            scores = np.zeros([n_rows, len(classifiers)])
            for i, (clf_name, clf) in enumerate(classifiers.items()):
                log.info(f'{i+1} - fitting {describe(clf_name, clf)}')
                try:
                    scores[:, i] = score_one(clf)
                except Exception:
                    # Best effort: keep going with the remaining detectors.
                    # format_exc() returns the traceback text, whereas
                    # print_exc() returns None (which is what got logged).
                    log.info(traceback.format_exc())
                else:
                    log.info(f'Base detector {i+1}/{len(classifiers)} is fitted for prediction')
            return np.nan_to_num(scores)

        def _fit_and_score(clf):
            """Fit one pyod-style detector and return its training scores."""
            clf.fit(X)
            return clf.decision_scores_

        def _nn_score(entry):
            """Score one neural-network entry (own autoencoder or a detector)."""
            if entry['clf'] == self.full_autoencoder:
                return entry['clf'](X, neurons_list = entry['hidden_layers'], verbose = verbose)
            return _fit_and_score(entry['clf'])

        # Zero-width placeholders so disabled groups concatenate cleanly.
        quick_scores = np.zeros([n_rows, 0])
        slow_scores = np.zeros([n_rows, 0])
        nn_scores = np.zeros([n_rows, 0])

        if quick_methods:
            # Define anomaly detection tools to be compared
            # (MCD variants were tried here and are disabled.)
            quick_classifiers = {
                'PCA_randomized':
                    PCA(contamination=contamination, random_state=random_state,
                        standardization = False, svd_solver = 'randomized'),
                'PCA_full':
                    PCA(contamination=contamination, random_state=random_state,
                        standardization = False, svd_solver = 'full'),
                'COPOD':
                    COPOD(contamination=contamination),
                'HBOS':
                    HBOS(contamination=contamination),
                'HBOS_200':
                    HBOS(contamination=contamination, n_bins = 200),
                'HBOS_300':
                    HBOS(contamination=contamination, n_bins = 300),
                'LODA':
                    LODA(contamination=contamination),
                'LODA_200':
                    LODA(contamination=contamination, n_random_cuts = 200),
                'LODA_300':
                    LODA(contamination=contamination, n_random_cuts = 300),
                'IForest_100':
                    IForest(contamination=contamination, random_state=random_state,
                            n_estimators = 100, bootstrap = False, n_jobs = -1),
                'IForest_200':
                    IForest(contamination=contamination, random_state=random_state,
                            n_estimators = 200, bootstrap = False, n_jobs = -1),
                'IForest_bootstrap':
                    IForest(contamination = contamination, random_state=random_state,
                            n_estimators = 150, bootstrap = True, n_jobs = -1),
                'CBLOF_16':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 16),
                'CBLOF_24':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 24),
                'CBLOF_32':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 32)
            }
            quick_scores = _collect_scores(quick_classifiers, _fit_and_score)

        if slow_methods:
            # Only LOF is active in this group.  ABOD, OCSVM, LSCP,
            # FeatureBagging, SOS, COF, SOD and KNN were tried here and are
            # disabled (noted as too slow or too memory-hungry).
            slow_classifiers = {
                f'LOF_{k}': LOF(n_neighbors=k, contamination=contamination, n_jobs = -1)
                for k in (4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22)
            }
            slow_scores = _collect_scores(slow_classifiers, _fit_and_score)

        if nn_methods:
            nn_classifiers = {}
            n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
            # Index of the widest candidate layer that is still narrower than
            # the input; None when there is no such layer (n_features <= 2),
            # which used to escape as StopIteration.
            n_idx = next((idx for idx, width in enumerate(n_list) if width < n_features), None)
            if n_idx is None:
                log.info(f'n_features = {n_features} is too small for the neural network methods, skipping them')
            else:
                # 2, 3 and 4 encoder layers, each mirrored for the decoder.
                for i in range(3, 6):
                    n_enc = n_list[n_idx:n_idx+i-1]
                    n_dec = n_enc[::-1]
                    n_enc_dec = n_enc + n_dec
                    nn_classifiers[f'FULL_AE_{len(n_enc_dec)}'] = {
                        'clf': self.full_autoencoder,
                        'hidden_layers': n_enc_dec
                    }
                    nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {
                        'clf': VAE(contamination = contamination, random_state = random_state,
                                   encoder_neurons = n_enc, decoder_neurons = n_dec,
                                   preprocessing = False, epochs = 32, verbosity = verbose),
                        'hidden_layers': n_enc + n_dec
                    }
                nn_scores = _collect_scores(
                    nn_classifiers, _nn_score,
                    describe=lambda name, entry: f"{name} with layers {entry['hidden_layers']}")

        all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
        # Drop all-zero columns, i.e. detectors that failed to fit.
        all_scores = all_scores[:, ~np.all(all_scores == 0, axis=0)]
        log.info(f'total scores = {all_scores.shape[1]}')

        all_scores_norm = np.copy(all_scores)
        if use_score_rank:
            all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
            log.info('score rank applied')
        all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)

        score_by_avg = np.mean(all_scores_norm, axis = 1)
        score_by_max = np.max(all_scores_norm, axis = 1)
        if all_scores_norm.shape[1] >= 12:
            # Enough detectors to bucket them (roughly four per bucket).
            n_buckets = round(all_scores_norm.shape[1]/4)
            score_by_aom = aom(all_scores_norm, method = 'dynamic', n_buckets = n_buckets)
            score_by_moa = moa(all_scores_norm, method = 'dynamic', n_buckets = n_buckets)
        else:
            # Too few detectors to bucket: fall back to plain mean / max.
            score_by_aom = score_by_avg
            score_by_moa = score_by_max
        return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
예제 #22
0
        print('\nFinished ' + self.model_name)

        return None


if __name__ == '__main__':
    # Specify the root directory
    rootDir = 'G:/My Drive/Github/ml-group-col/One-Class-models/Anomaly_Datasets_csv/'
    # specify the random state
    rs = 10
    # Save how to run the models
    models = [
        (IsolationForest(random_state=rs), 'ISOF'),
        (EllipticEnvelope(random_state=rs), 'EE'),
        (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'),
        (COPOD(), 'COPOD'),
        (FeatureBagging(combination='average',
                        random_state=rs), 'AVE_Bagging'),  # n_jobs
        (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'),
        (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
        (LODA(), 'LODA'),
        (FeatureBagging(combination='max', n_jobs=-1,
                        random_state=rs), 'MAX_Bagging'),
        (MCD(random_state=rs), 'MCD'),
        (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
        (GaussianMixture(random_state=rs), 'GMM'),
        (LocalOutlierFactor(novelty=True), 'LOF'),
        (KNN(method='median'), 'Median_KNN'),  # n_jobs
        (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
        (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
        (HBOS(), 'HBOS'),