Example #1
 def model_init(self, model):
     """Model initialisation of a single model.
     """
     if self.model == 'pca':
         self.models[model] = PCA(contamination=self.contamination)
     elif self.model == 'loda':
         self.models[model] = LODA(contamination=self.contamination)
     elif self.model == 'iforest':
         self.models[model] = IForest(n_estimators=50,
                                      bootstrap=True,
                                      behaviour='new',
                                      contamination=self.contamination)
     elif self.model == 'cblof':
         self.models[model] = CBLOF(n_clusters=3,
                                    contamination=self.contamination)
     elif self.model == 'feature_bagging':
         self.models[model] = FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination)
     elif self.model == 'copod':
         self.models[model] = COPOD(contamination=self.contamination)
     elif self.model == 'hbos':
         self.models[model] = HBOS(contamination=self.contamination)
     else:
         self.models[model] = HBOS(contamination=self.contamination)
     self.custom_model_scalers[model] = MinMaxScaler()
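
Below is a minimal, self-contained sketch (not part of the original module) of how a detector initialised as above might be fitted and how the per-model MinMaxScaler could be used to normalise its decision scores; the data and the score-scaling step are assumptions.

import numpy as np
from pyod.models.hbos import HBOS
from sklearn.preprocessing import MinMaxScaler

X = np.random.rand(200, 4)  # hypothetical training window
clf = HBOS(contamination=0.01)
clf.fit(X)

scaler = MinMaxScaler()
# scale the raw decision scores into [0, 1], mirroring the scaler created in model_init
scores = scaler.fit_transform(clf.decision_scores_.reshape(-1, 1))
labels = clf.labels_  # 0 = inlier, 1 = outlier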
Example #2
File: model.py  Project: esowc/DAAQS
    def pred_PCA(self, n_comp=3, comp_with='openaq'):

        ## hyperparameters for PCA are tuned here
        # The number of samples must be greater than n_components (3 here);
        # n_components could also be set to a fraction such as 0.3 as a fallback.

        self.comp_with = comp_with

        if comp_with == "openaq":
            if len(self.X_o) == 0:
                pred = []
            elif self.X_o.shape[0] > n_comp:
                self.clf = PCA(n_components=n_comp)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            elif self.X_o.shape[0] > 2:
                # print(f"The value of k is changed from {k} to {self.X_o.shape[0]-1}")
                n_comp = self.X_o.shape[0] - 1
                self.clf = PCA(n_components=n_comp)
                self.clf.fit(self.X_o)
                pred = self.clf.labels_
            else:
                pred = []

        elif comp_with == "cams":
            pred = []
            for each_X in self.X_c:
                self.clf = PCA(n_components=n_comp)
                self.clf.fit(each_X)
                pred.append(self.clf.labels_[-1])

        A_location, B_location, C_location = self.pred_location(pred)

        return A_location, B_location, C_location
Example #3
 def models_init(self):
     """Models initialisation.
     """
     self.model = self.configuration.get('model', 'pca')
     if self.model == 'pca':
         self.models = {
             model: PCA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'loda':
         self.models = {
             model: LODA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'iforest':
         self.models = {
             model: IForest(n_estimators=50,
                            bootstrap=True,
                            behaviour='new',
                            contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'cblof':
         self.models = {
             model: CBLOF(n_clusters=3, contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'feature_bagging':
         self.models = {
             model: FeatureBagging(
                 base_estimator=PCA(contamination=self.contamination),
                 contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'copod':
         self.models = {
             model: COPOD(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'hbos':
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     else:
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     self.custom_model_scalers = {
         model: MinMaxScaler()
         for model in self.models_in_scope
     }
Example #4
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf with x_train

    # Get the outlier labels and outlier scores on the training data x_train
    y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # outlier scores on the training data (the higher, the more abnormal)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Use the trained clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # binary labels on the test data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # outlier scores on the test data (the higher, the more abnormal)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
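
A hedged usage sketch (not from the original project): synthetic data from pyod's generate_data is passed to calculate() for one method; it assumes the function above and its imports (KNN, evaluate_print, roc_auc_score, precision_n_scores, numpy, etc.) are in scope.

from pyod.utils.data import generate_data

x_train, y_train, x_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

total_roc, total_prn = [], []
calculate('KNN', total_roc, total_prn, x_train, x_test, y_train, y_test)
print(total_roc, total_prn)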
Example #5
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
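
A small sketch (hypothetical data, not in the original file) of how the returned dictionary might be used: every detector is fitted on the same data and its flagged outlier count printed. It assumes load_classifiers and its imports are available.

import numpy as np

X = np.random.randn(500, 2)
for name, clf in load_classifiers(0.05).items():
    clf.fit(X)
    print(name, int(clf.labels_.sum()), 'flagged outliers')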
Example #6
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
Example #7
def train():
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,  
                    rp_flag_global=True,  
                    bps_flag=True,  
                    approx_flag_global=False, 
                    contamination=contamination)
        model.fit(dataset)  
        model.approximate(dataset)  
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0]*1000 + [1]*10
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
Example #8
File: ml.py  Project: pedrovhb/tcc
def train_model(station: Station) -> LSCP:
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    observations_select = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    obs_data = []
    for observation in observations_select:
        obs_data.append([
            observation.rms, observation.peak_to_peak, observation.kurtosis,
            observation.crest
        ])

    log.info('Fitting LSCP model')
    lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
    lscp.fit(X=obs_data)
    log.info(f'Trained model in {time.time() - t1}')
    return lscp
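
A reduced, self-contained sketch of the same LSCP ensemble, fitted on hypothetical feature rows instead of database observations (the Station/Observation models are not required here).

import numpy as np
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP

# columns stand in for rms, peak-to-peak, kurtosis and crest
obs_data = np.random.rand(200, 4)
lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
lscp.fit(obs_data)
print(lscp.labels_[:10])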
Example #9
def main(args):
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(data['X'],
                                                    data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)
    data_size = len(trainx[0])
    encoder_neurons = [data_size, data_size // 2, data_size // 4]
    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1],
              epochs=args.epochs,
              contamination=args.contamination,
              gamma=args.gamma,
              capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
Example #10
def pca(X_train, X_test, Y_train, Y_test):
    from pyod.models.pca import PCA
    model = PCA()
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
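
A hedged usage sketch with synthetic pyod data; it assumes the pca() helper above and numpy (as np) are importable.

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=300, n_test=150, contamination=0.1, random_state=0)
print(pca(X_train, X_test, y_train, y_test))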
Example #11
def pca_outlier_detection(X_train, X_test, **kwargs):
    detector = PCA(**kwargs)
    detector.fit(X_train)
    prob = detector.predict_proba(X_test)[:, -1]

    if isinstance(X_test, pd.DataFrame):
        return pd.Series(prob, name='outlier', index=X_test.index)
    return pd.Series(prob, name='outlier')
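
A minimal sketch (hypothetical data) of calling the helper above with DataFrames so that the returned Series keeps the test index; any keyword arguments are forwarded to pyod's PCA.

import numpy as np
import pandas as pd

train = pd.DataFrame(np.random.randn(200, 3), columns=list('abc'))
test = pd.DataFrame(np.random.randn(50, 3), columns=list('abc'))
print(pca_outlier_detection(train, test, contamination=0.05).head())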
Example #12
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Example #13
File: test_pca.py  Project: deltat99/Pyod
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = PCA(contamination=self.contamination)
Example #14
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
Example #15
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # mark rows flagged as anomalous in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values

        # Find the anomalous dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values

        # Save the models
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
Example #16
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """
    Use the pyod library to flag the top `contamination` fraction of outliers in the dataset.
    """
    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination)
    }
    results_list = []
    od_cols = ["id", "name", "result", "label"]

    if id_col is None:
        s_id = df.index
        X_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        for col in trans_cols:
            df[col] = PowerTransformer().fit_transform(df[col].values.reshape(
                -1, 1))

    for clf_name, clf in OD_clfs.items():
        od_result = pd.DataFrame(columns=od_cols)  # create an empty  dataframe

        od_result["id"] = s_id

        od_result['name'] = clf_name
        print(f"{clf_name}, {clf}")

        clf.fit(df[X_cols])

        od_result['result'] = clf.decision_scores_
        od_result['label'] = clf.labels_

        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    job_name = f'{pd.Timestamp.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml',
                         engine,
                         if_exists='append',
                         schema='wh_v1',
                         method=psql_insert_copy)
    print(
        f"OD results {od_results_df.shape} exported to database {engine}, job_name={job_name}"
    )
    return od_results_df
Example #17
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)
Example #18
def getOutlierPCA(dataset):
    '''
    @brief Function that executes PCA algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    pca = PCA()
    # Fits the data and obtains labels
    pca.fit(dataset)
    # Return labels
    return pca.labels_
Example #19
    def fit_transform(self, df_train, df_corrupted):
        pyod_model = PCA(
            contamination=0.25
        )  # n_components = min(n_samples, n_features) default  # n_selected_components = None

        df_outliers_num = self.num_out_detect(df_train, df_corrupted,
                                              pyod_model)
        df_outliers_cat = self.cat_out_detect(df_train, df_corrupted)

        df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

        for col in df_corrupted.columns:
            for i in df_outliers.index:
                if df_outliers.loc[i, col + "_outlier"] == 1:
                    df_outliers.loc[i, col] = np.nan

        return df_outliers, self.predictors
Example #20
def print_accuracy(train_arr,test_arr,trader_id):
    if len(train_arr)==0 or len(test_arr)==0:
        return
    for i in range(len(train_arr)):
        l1=len(train_arr[i])
        l2=len(test_arr[i])
        if l1==0 or l2==0:
            continue
        train_data=np.array([train_arr[i]]).T
        test_data=np.array([test_arr[i]]).T
        # clf=OCSVM(kernel ='rbf',gamma = 0.5)
        print(len(train_arr))
        clf = PCA(n_components=15)
        clf.fit(train_data)
        y_pred = clf.predict(train_data)
        print("TRAINING ACCURACY for TRADER",trader_id,":",100 - (sum(y_pred)*100/l1))
        y_pred=clf.predict(test_data)
        print("TESTING ACCURACY: ",sum(y_pred)*100/l2)
Example #21
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)
            ],
                 random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))

        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')

        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators,
                          n_jobs=2,
                          rp_flag_global=True,
                          bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                          verbose=True)
Example #22
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE':
        AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        VAE(encoder_neurons=nnet[:5],
            decoder_neurons=nnet[4:],
            contamination=0.1,
            epochs=13),
        'ABOD':
        ABOD(),
        'FeatureBagging':
        FeatureBagging(),
        'HBOS':
        HBOS(),
        'IForest':
        IForest(),
        'KNN':
        KNN(),
        'LOF':
        LOF(),
        'OCSVM':
        OCSVM(),
        'PCA':
        PCA(),
        'SOS':
        SOS(),
        'COF':
        COF(),
        'CBLOF':
        CBLOF(),
        'SOD':
        SOD(),
        'LOCI':
        LOCI(),
        'MCD':
        MCD()
    }
    return clfs[model]
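
A hedged usage sketch of the selector above: the dictionary instantiates every detector eagerly, so the deep-learning entries (AE/VAE) must be importable even when another model is requested, and nnet is only consumed by those two entries.

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=500, n_test=200, contamination=0.1, random_state=1)
clf = choose_model('HBOS', nnet=[64, 32, 16, 8, 4, 8, 16, 32, 64])
clf.fit(X_train)
print(clf.predict(X_test)[:10])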
Example #23
def train(doc_list, dataset_name, clf_name):
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if ((i + 1) % 200 == 0):
            print("Results for file " + str(i + 1) + ":")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
Example #24
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
    PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list,
         contamination=outliers_fraction,
         random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)
Example #25
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    '(KNN) K Nearest Neighbors ': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    '(LOF) Local Outlier Factor ':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    '(MCD) Minimum Covariance Determinant ': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    '(PCA) Principal Component Analysis ': PCA(
        contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    '(LSCP) Locally Selective Combination ': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}
st.subheader('SELECT AN ALGORITHM:')

classifier_name = st.selectbox('THE ALGORITHM',[*classifiers])

# Show all detectors
Example #26
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction),
    'Median KNN':
    KNN(method='median', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
    PCA(contamination=outliers_fraction),
}

# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

# Fit the models with the generated data and
# compare model performances
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers
Example #27
            trader_timestamp_dict[(
                time_stamp, trader_id)]['buying']['volume'].append(volume)
            trader_list.append([trader_id, price, volume])
        elif int(direction) == -1 and int(entry_type) == 1:
            trader_timestamp_dict[(
                time_stamp, trader_id)]['selling']['price'].append(price)
            trader_timestamp_dict[(
                time_stamp, trader_id)]['selling']['volume'].append(volume)
            trader_list.append([trader_id, price * -1, volume])
    # print(trader_timestamp_dict)
# traders=list(set(traders))
# print(traders)
keys = list(trader_timestamp_dict.keys())
keys.sort()
# trader_arr = trader_list
clf = PCA()
user_order = []

## Standardize data

# trader_list = [v for v in trader_timestamp_dict.values()]
# for key, value in trader_timestamp_dict.iteritems():
#     temp = [key,value]
#     trader_list.append(temp)
trader_arr = np.asarray(trader_list)
# print(len(traders))
# print(len(set(trader_arr[:,0])))
# print(len(keys))
# malicious_complete_data = np.zeros((len(malicious_keys),16))
# normal_complete_data = np.zeros((len(traders)-len(malicious_keys),16))
malicious_complete_data = []
Example #28
def detect(file, amountanom, realtime):
    """
    Function to apply a very simple anomaly detector
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time (not working)
    """

    # Create a zeek reader on a given log file. Thanks brothon
    reader = bro_log_reader.BroLogReader(file, tail=realtime)
    # Create a Pandas dataframe from reader
    bro_df = pd.DataFrame(reader.readrows())

    # In case you need a label (some models can work in a semi-supervised mode), put it here. For now everything is 'normal', but we are not using it for detection
    bro_df['label'] = 'normal'
    # Change the datetime delta value to seconds. Scikit does not know how to work with timedeltas
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())
    # Replace the rows without data (with '-') with -1. Even though this may add a bias to the algorithms, it is better than dropping the lines.
    bro_df['orig_bytes'] = bro_df['orig_bytes'].replace(to_replace='-',
                                                        value=-1)
    bro_df['resp_bytes'] = bro_df['resp_bytes'].replace(to_replace='-',
                                                        value=-1)
    bro_df['resp_pkts'] = bro_df['resp_pkts'].replace(to_replace='-', value=-1)
    bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].replace(to_replace='-',
                                                              value=-1)
    bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].replace(to_replace='-',
                                                              value=-1)

    # Add the columns from the log file that we know are numbers. This is only for conn.log files.
    X_train = bro_df[[
        'durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes'
    ]]
    # Our y is the label. But we are not using it now.
    y = bro_df.label

    # The X_test is where we are going to search for anomalies. In our case, it is the same data as X_train.
    X_test = X_train

    #################
    # Select a model from below

    # ABOD class for Angle-base Outlier Detection. For an observation, the variance of its weighted cosine scores to all neighbors could be viewed as the outlying score.
    #clf = ABOD()

    # LOF
    #clf = LOF()

    # CBLOF
    #clf = CBLOF()

    # LOCI
    #clf = LOCI()

    # LSCP
    #clf = LSCP()

    # MCD
    #clf = MCD()

    # OCSVM
    #clf = OCSVM()

    # PCA. Good and fast!
    clf = PCA()

    # SOD
    #clf = SOD()

    # SO_GAAL
    #clf = SO_GAAL()

    # SOS
    #clf = SOS()

    # XGBOD
    #clf = XGBOD()

    # KNN
    # Good results but slow
    #clf = KNN()
    #clf = KNN(n_neighbors=10)
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Convert the ndarrays of scores and predictions to  pandas series
    scores_series = pd.Series(y_test_scores)
    pred_series = pd.Series(y_test_pred)

    # Now use the series to add a new column to the X test
    X_test['score'] = scores_series.values
    X_test['pred'] = pred_series.values

    # Add the score to the bro_df also. So we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top10 = X_test_predicted.sort_values(by='score',
                                         ascending=False).iloc[:amountanom]

    ## Print the results
    # Find the predicted anomalies in the original bro dataframe, where the rest of the data is
    df_to_print = bro_df.iloc[top10.index]
    print('\nFlows of the top anomalies')
    # Only print some columns, not all, so it's easier to read.
    df_to_print = df_to_print.drop([
        'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes',
        'ts', 'tunnel_parents', 'uid', 'label'
    ],
                                   axis=1)
    print(df_to_print)
Example #29
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Example #30
File: utility.py  Project: hsw2012s/temp
def get_estimators(contamination):
    """Internal method to create a list of 600 random base outlier detectors.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    Returns
    -------
    base_detectors : list
        A list of initialized random base outlier detectors.

    """
    BASE_ESTIMATORS = [
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
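        # angle-based outlier detectors (ABOD), n_neighbors from 5 to 40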
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        ABOD(n_neighbors=45, contamination=contamination),
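        # one-class SVM detectors (OCSVM), default settings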
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
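        # minimum covariance determinant detectors (MCD), default settings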
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
    ]

    return BASE_ESTIMATORS
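
A pool like the one returned above is not meant to be fitted one detector at a time; it is normally handed to a combination framework that trains every estimator and merges their outlier scores. The snippet below is a minimal, self-contained sketch of that step, assuming a recent PyOD with its SUOD wrapper (and the suod package it depends on) is installed; the small inline pool, the synthetic numpy data, and the variable names are illustrative stand-ins rather than part of the example above.

import numpy as np
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from pyod.models.suod import SUOD

contamination = 0.1

# small stand-in for the full BASE_ESTIMATORS pool built above
base_estimators = [
    LOF(n_neighbors=15, contamination=contamination),
    LOF(n_neighbors=35, contamination=contamination),
    HBOS(contamination=contamination),
    IForest(n_estimators=100, contamination=contamination),
]

# synthetic data: clean training set, test set with 10 shifted points acting as outliers
rng = np.random.RandomState(42)
X_train = rng.randn(500, 6)
X_test = np.vstack([rng.randn(190, 6), rng.randn(10, 6) + 6.0])

# SUOD fits the whole pool (optionally in parallel) and combines their scores
detector = SUOD(base_estimators=base_estimators,
                combination='average',   # 'maximization' keeps per-sample maxima instead
                n_jobs=2,
                verbose=False)
detector.fit(X_train)

test_scores = detector.decision_function(X_test)  # continuous outlier scores
test_labels = detector.predict(X_test)            # 0 = inlier, 1 = outlier

Using the full pool is then just a matter of passing the list returned above as base_estimators.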