def model_init(self, model):
    """Initialise one detector instance (and its score scaler) for `model`.

    The detector family is chosen by `self.model`; unknown family names
    fall back to HBOS, matching models_init().
    """
    contamination = self.contamination
    # Lazy factories so only the selected detector is constructed.
    factories = {
        'pca': lambda: PCA(contamination=contamination),
        'loda': lambda: LODA(contamination=contamination),
        'iforest': lambda: IForest(n_estimators=50, bootstrap=True,
                                   behaviour='new',
                                   contamination=contamination),
        'cblof': lambda: CBLOF(n_clusters=3, contamination=contamination),
        'feature_bagging': lambda: FeatureBagging(
            base_estimator=PCA(contamination=contamination),
            contamination=contamination),
        'copod': lambda: COPOD(contamination=contamination),
        'hbos': lambda: HBOS(contamination=contamination),
    }
    build = factories.get(self.model, factories['hbos'])
    self.models[model] = build()
    self.custom_model_scalers[model] = MinMaxScaler()
def pred_PCA(self, n_comp=3, comp_with='openaq'):
    """Run PyOD PCA outlier detection against the chosen reference data.

    comp_with == "openaq": fit one PCA on self.X_o (shrinking n_comp when
    there are too few samples) and take its per-sample labels.
    comp_with == "cams": fit a fresh PCA per matrix in self.X_c and take
    the label of the last sample of each.
    Returns the (A, B, C) location groupings from self.pred_location(pred).
    """
    ## hyperparameters for KNN is tuned here
    # Number of samples must be greater than the n_components (3 in this case). It can be made 0.3 to make it work
    self.comp_with = comp_with
    if comp_with == "openaq":
        # NOTE(review): `self.X_o == []` only works if X_o is a plain list
        # when empty; on a numpy array this comparison is ambiguous — confirm.
        if self.X_o == []:
            pred = []
        elif self.X_o.shape[0] > n_comp:
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_  # 0 = inlier, 1 = outlier
        elif self.X_o.shape[0] > 2:
            # Too few samples for the requested n_comp: shrink it.
            # print(f"The value of k is changed from {k} to {self.X_o.shape[0]-1}")
            n_comp = self.X_o.shape[0] - 1
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        else:
            pred = []
    elif comp_with == "cams":
        # One detector per CAMS matrix; keep only the newest sample's label.
        # NOTE(review): `pred` is unbound if comp_with is neither value.
        pred = []
        for each_X in self.X_c:
            self.clf = PCA(n_components=n_comp)
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def models_init(self):
    """Initialise one fresh detector per model in scope, plus scalers.

    The detector family comes from the 'model' configuration key
    (default 'pca'); unknown names fall back to HBOS.
    """
    self.model = self.configuration.get('model', 'pca')
    contamination = self.contamination

    def new_detector():
        # One call per model in scope, so every entry is a fresh instance.
        if self.model == 'pca':
            return PCA(contamination=contamination)
        if self.model == 'loda':
            return LODA(contamination=contamination)
        if self.model == 'iforest':
            return IForest(n_estimators=50, bootstrap=True,
                           behaviour='new', contamination=contamination)
        if self.model == 'cblof':
            return CBLOF(n_clusters=3, contamination=contamination)
        if self.model == 'feature_bagging':
            return FeatureBagging(
                base_estimator=PCA(contamination=contamination),
                contamination=contamination)
        if self.model == 'copod':
            return COPOD(contamination=contamination)
        # 'hbos' and any unrecognised name.
        return HBOS(contamination=contamination)

    self.models = {m: new_detector() for m in self.models_in_scope}
    self.custom_model_scalers = {m: MinMaxScaler()
                                 for m in self.models_in_scope}
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    """Fit the named detector and append its test ROC-AUC and
    precision@n to the running totals.

    method: 'KNN', 'CBLOF' or 'PCA'; any other value falls back to IForest.
    total_roc / total_prn: lists mutated in place with rounded scores.
    """
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    # Unsupervised fit on the training split.
    clf.fit(x_train)

    # Raw anomaly scores on the training data (higher = more anomalous).
    y_train_scores = clf.decision_scores_
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # Score the held-out test data with the fitted detector.
    y_test_scores = clf.decision_function(x_test)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # BUG FIX: the original statement ended with a stray trailing comma,
    # which made `roc` a one-element tuple instead of a float.
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
    total_roc.append(roc)
    total_prn.append(prn)
def load_classifiers(outliers_fraction):
    """Return the dictionary of candidate outlier detectors, each
    configured with the (clamped) contamination fraction.

    The fraction is capped at 0.5; a fixed RandomState(42) is shared by
    every detector that accepts one, for reproducibility.
    """
    outliers_fraction = min(0.5, outliers_fraction)
    rng = np.random.RandomState(42)

    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=outliers_fraction)
    classifiers['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        contamination=outliers_fraction, check_estimator=False,
        random_state=rng)
    classifiers['Feature Bagging'] = FeatureBagging(
        LOF(n_neighbors=35), contamination=outliers_fraction,
        random_state=rng)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=outliers_fraction)
    classifiers['Isolation Forest'] = IForest(
        contamination=outliers_fraction, random_state=rng, behaviour="new")
    classifiers['K Nearest Neighbors (KNN)'] = KNN(
        contamination=outliers_fraction)
    classifiers['Average KNN'] = KNN(
        method='mean', contamination=outliers_fraction)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        n_neighbors=35, contamination=outliers_fraction)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=outliers_fraction, random_state=rng)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(
        contamination=outliers_fraction)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=outliers_fraction, random_state=rng)
    return classifiers
def define_classifiers(random_state, outliers_fraction):
    """Return the dictionary of named outlier detectors, all sharing the
    same contamination fraction (and random_state where supported)."""
    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=outliers_fraction)
    classifiers['Cluster-based Local Outlier Factor'] = CBLOF(
        contamination=outliers_fraction, check_estimator=False,
        random_state=random_state)
    classifiers['Feature Bagging'] = FeatureBagging(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=outliers_fraction)
    classifiers['Isolation Forest'] = IForest(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['K Nearest Neighbors (KNN)'] = KNN(
        contamination=outliers_fraction)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        contamination=outliers_fraction)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(
        contamination=outliers_fraction)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=outliers_fraction, random_state=random_state)
    return classifiers
def train():
    """Train a SUOD ensemble on synthetic data, score it, and log the
    model plus its ROC-AUC to MLflow."""
    dataset = get_data(1000, 10, 100)
    contamination = 0.01

    with mlflow.start_run():
        # Heterogeneous base detectors: LOF and KNN at several
        # neighbourhood sizes, plus a PCA detector.
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]

        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)

        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)

        # NOTE(review): assumes get_data(1000, 10, 100) yields 1000 inliers
        # followed by 10 outliers, in that order — confirm with get_data.
        true_labels = [0] * 1000 + [1] * 10

        # BUG FIX: roc_auc_score's signature is (y_true, y_score); the
        # original call passed the predictions as ground truth.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))

        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
def train_model(station: Station) -> LSCP:
    """Fit an LSCP ensemble (5x KNN + 5x LOF + 5x PCA) on the station's
    training observations and return the fitted model."""
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    rows = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)

    # One feature vector per observation: rms, peak-to-peak, kurtosis, crest.
    obs_data = [[row.rms, row.peak_to_peak, row.kurtosis, row.crest]
                for row in rows]

    log.info('Fitting LSCP model')
    # NOTE(review): [KNN()] * 5 repeats the *same* detector instance five
    # times; kept as-is to preserve the original ensemble behaviour.
    lscp = LSCP([KNN()] * 5 + [LOF()] * 5 + [PCA()] * 5, contamination=0.03)
    lscp.fit(X=obs_data)
    log.info(f'Trained model in {time.time() - t1}')
    return lscp
def main(args):
    """Load a .mat dataset, split it, and compare KNN, PCA and VAE
    detectors, printing validation and evaluation metrics for each.

    args: parsed CLI namespace providing filename, train_split,
    components, epochs, contamination, gamma and capacity.
    """
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(
        data['X'], data['y'], test_size=args.train_split, random_state=2)
    # Split the held-out portion 50/50 into validation and evaluation sets.
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)

    data_size = len(trainx[0])
    # BUG FIX: layer sizes must be integers; `/` produced floats, which
    # the VAE's dense layers reject. Use floor division instead.
    encoder_neurons = [data_size, data_size // 2, data_size // 4]

    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    # Decoder mirrors the encoder layer sizes.
    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1],
              epochs=args.epochs,
              contamination=args.contamination,
              gamma=args.gamma,
              capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
def pca(X_train, X_test, Y_train, Y_test):
    """Fit a PyOD PCA detector on X_train and return its label agreement
    with Y_test as a percentage (the raw fraction is also printed)."""
    from pyod.models.pca import PCA
    detector = PCA()
    detector.fit(X_train)
    predictions = detector.predict(X_test)
    # Fraction of test rows whose 0/1 outlier label matches Y_test.
    acc = np.sum(predictions == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
def pca_outlier_detection(X_train, X_test, **kwargs):
    """Fit a PCA detector on X_train and return X_test outlier
    probabilities as a Series named 'outlier'.

    The index is carried over from X_test when it is a DataFrame;
    kwargs are forwarded to the PCA constructor.
    """
    detector = PCA(**kwargs)
    detector.fit(X_train)
    # Last column of predict_proba = probability of the outlier class.
    outlier_prob = detector.predict_proba(X_test)[:, -1]
    index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_prob, name='outlier', index=index)
def main():
    """Evaluate every configured detector over each dataset/scaler
    combination, fanning out one joblib worker per scaler; each worker
    (runByScaler) parallelises across models with CPUS_Models jobs."""
    scalers = ['no', 'std', 'minmax']  # feature-scaling variants to compare
    root = 'Unsupervised_Anamaly_Detection_csv'  # dataset directory
    start = 0    # index of the first dataset to process
    counts = 90  # how many datasets to process
    CPUS = 3         # parallel workers across scalers
    CPUS_Models = 4  # parallel workers across models inside runByScaler
    # Model names routed through the pyod-style execution path in runByScaler.
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    # All detectors are constructed eagerly here and shared by every worker.
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"  # tag used when saving result files
    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name) for scaler in scalers)
def setUp(self):
    """Create a synthetic train/test split and an untrained PCA detector
    for the test case."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination)
    self.X_train, self.y_train, self.X_test, self.y_test = generated
    self.clf = PCA(contamination=self.contamination)
def outlier_detection(x_raw, y_raw):
    """
    Filter all ourlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    # NOTE(review): this dict is reference material only — just the
    # Isolation Forest below is actually fitted.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
            XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    # NOTE(review): 1212 looks like a hard-coded dataset size — confirm it
    # equals x_raw.shape[0], otherwise rows are missed or this raises.
    idx_y_pred = [i for i in range(0, 1212) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def S2(self):
    """Stage-2 statistical anomaly screening on rows that passed S1.

    Fits four unsupervised detectors (IForest, KNN, HBOS, PCA) on the
    standardised sensor columns of S1-clean rows; a row is flagged as a
    statistical anomaly (S2 == 1) only when ALL four label it an outlier
    (product of the 0/1 labels). Per-column Gaussian KDE then names the
    dimension with the lowest density as the likely culprit. The fitted
    scaler and isolation forest are persisted to ./water_model/.
    """
    self.S1()
    water_data = self.water_data
    result = self.result
    # Data preprocessing and model training (translated comment)
    clean_data = water_data[water_data['S1'] == 0]  # rows S1 judged normal
    Y = pd.DataFrame(index=clean_data.index, columns=['S2'])
    X_train = np.array(clean_data.iloc[:, 1:12])  # the 11 sensor columns
    # NOTE(review): `name` is assigned but never used below.
    name = list(clean_data.iloc[:, 1:12].columns.values)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
    clf2 = KNN(contamination=0.05, n_neighbors=100)
    clf3 = HBOS(contamination=0.05, n_bins=10)
    clf4 = PCA(contamination=0.05)
    clf1.fit(X_train)
    clf2.fit(X_train)
    clf3.fit(X_train)
    clf4.fit(X_train)
    # Labels are 0/1, so the product is 1 only on unanimous outlier votes.
    Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
    water_data = pd.concat([water_data, Y], axis=1)
    # water_data.loc[water_data['S2'].isna(),['S2']]=0, would mark rows that
    # were anomalous in S1 as 0 in S2 (translated from the original note);
    result['统计异常'] = water_data['S2'].values
    # Locate the anomalous dimension (translated comment)
    from sklearn.neighbors import KernelDensity
    clean_data = water_data[water_data['S1'] == 0]
    dens = pd.DataFrame(index=clean_data.index, columns=[
        'temperature', 'pH', 'EC', 'ORP', 'DO', 'turbidity',
        'transparency', 'COD', 'P', 'NH3N', 'flux'
    ])
    # Per-column Gaussian KDE; low density = unusual value in that dimension.
    for i in dens.columns:
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
            clean_data[i].values.reshape(-1, 1))
        dens[i] = np.exp(
            kde.score_samples(clean_data[i].values.reshape(-1, 1)))
    dens = dens.iloc[:, 0:11].rank()
    dens['S2_names'] = dens.idxmin(axis=1)  # dimension with lowest density rank
    water_data = pd.concat([water_data, dens['S2_names']], axis=1)
    self.water_data = water_data
    result['统计异常维度'] = water_data['S2_names'].values
    # Persist the fitted scaler and isolation-forest model (translated comment)
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(clf1, "./water_model/S2_Iforest")
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """Use pyod detectors (HBOS, IForest, CBLOF, PCA) to flag ~5% of rows
    as outliers, write the stacked results to the t_ml table, and return
    them as a DataFrame.

    df: input data; id_col: optional identifier column excluded from the
    features; trans_cols: columns to power-transform before detection.
    """
    # BUG FIX: pd.datetime was deprecated and removed in pandas 1.0 —
    # import the real datetime class locally instead.
    from datetime import datetime

    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination)
    }

    results_list = []
    od_cols = ["id", "name", "result", "label"]
    if id_col is None:
        s_id = df.index
        # NOTE(review): overriding od_cols with the data columns looks
        # suspicious — the per-detector frames below still only fill
        # id/name/result/label. Confirm intent before changing.
        od_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        # Gaussianise skewed columns before fitting the detectors.
        for col in trans_cols:
            df[col] = PowerTransformer().fit_transform(
                df[col].values.reshape(-1, 1))

    for clf_name, clf in OD_clfs.items():
        od_result = pd.DataFrame(columns=od_cols)  # create an empty dataframe
        od_result["id"] = s_id
        od_result['name'] = clf_name
        print(f"{clf_name}, {clf}")
        clf.fit(df[X_cols])
        od_result['result'] = clf.decision_scores_  # raw anomaly scores
        od_result['label'] = clf.labels_            # 0 = inlier, 1 = outlier
        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    job_name = f'{datetime.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml', engine, if_exists='append', schema='wh_v1',
                         method=psql_insert_copy)
    print(
        f"OD results {od_results_df.shape}exported to database{engine},job_name={job_name}"
    )
    return od_results_df
def setUp(self):
    """Generate a reproducible train/test split and fit a seeded PCA
    detector on the training data."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.5
    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42)
    self.X_train, self.y_train, self.X_test, self.y_test = generated
    self.clf = PCA(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def getOutlierPCA(dataset):
    '''
    @brief Fit a PyOD PCA detector on the dataset and return its
    per-instance labels
    @param dataset Dataset on which to try the algorithm
    @return List of labels: 0 means inlier, 1 means outlier
    '''
    detector = PCA()
    detector.fit(dataset)
    return detector.labels_
def fit_transform(self, df_train, df_corrupted):
    """Detect numeric and categorical outliers in df_corrupted, blank the
    flagged cells to NaN, and return (df_outliers, self.predictors)."""
    # PCA detector flags the top 25% most anomalous rows.
    # n_components = min(n_samples, n_features) default
    # n_selected_components = None
    pyod_model = PCA(contamination=0.25)

    num_flags = self.num_out_detect(df_train, df_corrupted, pyod_model)
    cat_flags = self.cat_out_detect(df_train, df_corrupted)
    df_outliers = num_flags.join(cat_flags, how='inner')

    # Wherever a "<col>_outlier" flag is 1, blank the value itself.
    for col in df_corrupted.columns:
        flagged = df_outliers[col + "_outlier"] == 1
        df_outliers.loc[flagged, col] = np.nan

    return df_outliers, self.predictors
def print_accuracy(train_arr, test_arr, trader_id):
    """Print per-feature train/test inlier rates for one trader using a
    PyOD PCA detector.

    train_arr / test_arr: parallel lists; element i holds the i-th
    feature's samples. Empty inputs (or empty feature columns) are
    skipped. Output is printed only; nothing is returned.
    """
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for i in range(len(train_arr)):
        l1 = len(train_arr[i])
        l2 = len(test_arr[i])
        if l1 == 0 or l2 == 0:
            continue
        # Column vectors for the i-th feature.
        train_data = np.array([train_arr[i]]).T
        test_data = np.array([test_arr[i]]).T
        # clf=OCSVM(kernel ='rbf',gamma = 0.5)
        print(len(train_arr))
        # BUG FIX: the detector was fitted and scored on the whole ragged
        # `train_arr` while the per-feature `train_data` built above went
        # unused; also cap n_components so it never exceeds the number of
        # available features.
        clf = PCA(n_components=min(15, train_data.shape[1]))
        clf.fit(train_data)
        y_pred = clf.predict(train_data)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(y_pred) * 100 / l1))
        y_pred = clf.predict(test_data)
        print("TESTING ACCURACY: ", sum(y_pred) * 100 / l2)
def setUp(self):
    """Build synthetic data, a heterogeneous base-detector pool, and a
    SUOD model wired to pre-trained cost-forecast files for the tests."""
    self.n_train = 1000
    self.n_test = 500
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.random_state = 42
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=self.random_state)
    # Pool: LOF at several neighbourhood sizes, HBOS, PCA, and a nested
    # LSCP combiner of two LOFs.
    self.base_estimators = [
        LOF(n_neighbors=5, contamination=self.contamination),
        LOF(n_neighbors=15, contamination=self.contamination),
        LOF(n_neighbors=25, contamination=self.contamination),
        LOF(n_neighbors=35, contamination=self.contamination),
        LOF(n_neighbors=45, contamination=self.contamination),
        HBOS(contamination=self.contamination),
        PCA(contamination=self.contamination),
        LSCP(detector_list=[
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination)
        ], random_state=self.random_state)
    ]
    # Cost-forecast models shipped alongside this test file.
    this_directory = os.path.abspath(os.path.dirname(__file__))
    self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                               'bps_train.joblib')
    self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                'bps_prediction.joblib')
    self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                      rp_flag_global=True, bps_flag=True,
                      contamination=self.contamination,
                      approx_flag_global=True,
                      cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                      cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                      verbose=True)
def choose_model(model, nnet):
    """ among implemented in PyOD

    Return an initialised detector selected by name. `nnet` supplies the
    layer sizes for the neural models: AE uses the full list, VAE splits
    it at index 4 (nnet[4] appears in both encoder and decoder).
    NOTE(review): every detector in the dict is constructed eagerly on
    each call; an unknown `model` raises KeyError.
    """
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        # encoder gets nnet[:5], decoder nnet[4:] — they share nnet[4]
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
def train(doc_list, dataset_name, clf_name):
    """Fit the named detector on each of the first 10 CSV files and
    return the mean ROC-AUC and precision@n over those files.

    doc_list: file paths; dataset_name: label used in the summary print;
    clf_name: one of PCA/MCD/LOF/KNN/LODA.
    """
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        # BUG FIX: an unrecognised name previously left `clf` unbound and
        # the loop below crashed with a NameError; fail fast instead.
        raise ValueError("unknown classifier name: {!r}".format(clf_name))
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        # Features = everything except the dropped and ground-truth columns.
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_  # raw anomaly scores on the train set
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        # NOTE(review): with range(10) this progress print never fires;
        # kept as in the original.
        if ((i + 1) % 200 == 0):
            print("第" + str(i + 1) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")
    return model_roc_avg, model_prc_avg
'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state), # 'Stochastic Outlier Selection (SOS)': SOS( # contamination=outliers_fraction), 'Locally Selective Combination (LSCP)': LSCP(detector_list, contamination=outliers_fraction, random_state=random_state), # 'Connectivity-Based Outlier Factor (COF)': # COF(n_neighbors=35, contamination=outliers_fraction), # 'Subspace Outlier Detection (SOD)': # SOD(contamination=outliers_fraction), } # Show all detectors for i, clf in enumerate(classifiers.keys()): print('Model', i + 1, clf)
'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), '(KNN) K Nearest Neighbors ': KNN( contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), '(LOF) Local Outlier Factor ': LOF(n_neighbors=35, contamination=outliers_fraction), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), '(MCD) Minimum Covariance Determinant ': MCD( contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), '(PCA) Principal Component Analysis ': PCA( contamination=outliers_fraction, random_state=random_state), # 'Stochastic Outlier Selection (SOS)': SOS( # contamination=outliers_fraction), '(LSCP) Locally Selective Combination ': LSCP( detector_list, contamination=outliers_fraction, random_state=random_state), # 'Connectivity-Based Outlier Factor (COF)': # COF(n_neighbors=35, contamination=outliers_fraction), # 'Subspace Outlier Detection (SOD)': # SOD(contamination=outliers_fraction), } st.subheader('SELECT AN ALGORITHM:') classifier_name = st.selectbox('THE ALGORITHM',[*classifiers]) # Show all detectors
'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), 'Median KNN': KNN(method='median', contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction), } # Show all detectors for i, clf in enumerate(classifiers.keys()): print('Model', i + 1, clf) # Fit the models with the generated data and # compare model performances for i, offset in enumerate(clusters_separation): np.random.seed(42) # Data generation X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset X = np.r_[X1, X2] # Add outliers
trader_timestamp_dict[( time_stamp, trader_id)]['buying']['volume'].append(volume) trader_list.append([trader_id, price, volume]) elif int(direction) == -1 and int(entry_type) == 1: trader_timestamp_dict[( time_stamp, trader_id)]['selling']['price'].append(price) trader_timestamp_dict[( time_stamp, trader_id)]['selling']['volume'].append(volume) trader_list.append([trader_id, price * -1, volume]) # print(trader_timestamp_dict) # traders=list(set(traders)) # print(traders) keys = list(trader_timestamp_dict.keys()) keys.sort() # trader_arr = trader_list clf = PCA() user_order = [] ## Standardize data # trader_list = [v for v in trader_timestamp_dict.values()] # for key, value in trader_timestamp_dict.iteritems(): # temp = [key,value] # trader_list.append(temp) trader_arr = np.asarray(trader_list) # print(len(traders)) # print(len(set(trader_arr[:,0]))) # print(len(keys)) # malicious_complete_data = np.zeros((len(malicious_keys),16)) # normal_complete_data = np.zeros((len(traders)-len(malicious_keys),16)) malicious_complete_data = []
def detect(file, amountanom, realtime): """ Functon to apply a very simple anomaly detector amountanom: The top number of anomalies we want to print realtime: If we want to read the conn.log file in real time (not working) """ # Create a zeek reader on a given log file. Thanks brothon reader = bro_log_reader.BroLogReader(file, tail=realtime) # Create a Pandas dataframe from reader bro_df = pd.DataFrame(reader.readrows()) # In case you need a label, due to some models being able to work in a semisupervized mode, then put it here. For now everything is 'normal', but we are not using this for detection bro_df['label'] = 'normal' # Change the datetime delta value to seconds. Scikit does not now how to work with timedeltas bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds()) # Replace the rows without data (with '-') with -1. Even though this may add a bias in the algorithms, is better than not using the lines. bro_df['orig_bytes'] = bro_df['orig_bytes'].replace(to_replace='-', value=-1) bro_df['resp_bytes'] = bro_df['resp_bytes'].replace(to_replace='-', value=-1) bro_df['resp_pkts'] = bro_df['resp_pkts'].replace(to_replace='-', value=-1) bro_df['orig_ip_bytes'] = bro_df['orig_ip_bytes'].replace(to_replace='-', value=-1) bro_df['resp_ip_bytes'] = bro_df['resp_ip_bytes'].replace(to_replace='-', value=-1) # Add the columns from the log file that we know are numbers. This is only for conn.log files. X_train = bro_df[[ 'durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes' ]] # Our y is the label. But we are not using it now. y = bro_df.label # The X_test is where we are going to search for anomalies. In our case, its the same set of data than X_train. X_test = X_train ################# # Select a model from below # ABOD class for Angle-base Outlier Detection. For an observation, the variance of its weighted cosine scores to all neighbors could be viewed as the outlying score. 
#clf = ABOD() # LOF #clf = LOF() # CBLOF #clf = CBLOF() # LOCI #clf = LOCI() # LSCP #clf = LSCP() # MCD #clf = MCD() # OCSVM #clf = OCSVM() # PCA. Good and fast! clf = PCA() # SOD #clf = SOD() # SO_GAAL #clf = SO_GALL() # SOS #clf = SOS() # XGBOD #clf = XGBOD() # KNN # Good results but slow #clf = KNN() #clf = KNN(n_neighbors=10) ################# # Fit the model to the train data clf.fit(X_train) # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # Convert the ndarrays of scores and predictions to pandas series scores_series = pd.Series(y_test_scores) pred_series = pd.Series(y_test_pred) # Now use the series to add a new column to the X test X_test['score'] = scores_series.values X_test['pred'] = pred_series.values # Add the score to the bro_df also. So we can show it at the end bro_df['score'] = X_test['score'] # Keep the positive predictions only. That is, keep only what we predict is an anomaly. X_test_predicted = X_test[X_test.pred == 1] # Keep the top X amount of anomalies top10 = X_test_predicted.sort_values(by='score', ascending=False).iloc[:amountanom] ## Print the results # Find the predicted anomalies in the original bro dataframe, where the rest of the data is df_to_print = bro_df.iloc[top10.index] print('\nFlows of the top anomalies') # Only print some columns, not all, so its easier to read. df_to_print = df_to_print.drop([ 'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes', 'ts', 'tunnel_parents', 'uid', 'label' ], axis=1) print(df_to_print)
def main():
    """Run the full benchmark: collect predictions from every classifier,
    score them with several metrics, then render summary and CD plots."""
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()
    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})
    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ], scalers=[
        'Without scaler', 'Min max scaler', 'Standard scaler',
        'Without scaler', 'Min max scaler', 'Standard scaler'
    ])
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
def get_estimators(contamination):
    """Internal method to create a list of 600 random base outlier detectors.

    The original implementation spelled out all 600 constructor calls by
    hand; this version generates the exact same pool (same detectors, same
    hyper-parameters, same order) from small hyper-parameter grids.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    Returns
    -------
    base_detectors : list
        A list of initialized random base outlier detectors.
    """
    # Hyper-parameter grids reused throughout the pool. The repeated 85 in
    # the KNN grid and the repeated (50, 100, 150, 200) run in the IForest
    # grid are intentional: duplicates simply give those configurations more
    # weight in the random pool.
    lof_grid = (5, 10, 15, 25, 35, 45, 50, 55, 60, 65,
                70, 75, 80, 85, 90, 95, 100)
    abod_grid = (5, 10, 15, 20, 25, 30, 35, 40)
    knn_grid = (5, 15, 25, 35, 45, 50, 55, 65, 75, 85, 85, 85, 95, 100)
    iforest_grid = (50, 100, 150, 200, 50, 100, 150, 200)

    # One tiny factory per detector family; each call returns FRESH
    # (unshared) estimator instances.
    def lofs(grid=lof_grid):
        return [LOF(n_neighbors=k, contamination=contamination) for k in grid]

    def abods(grid=abod_grid):
        return [ABOD(n_neighbors=k, contamination=contamination) for k in grid]

    def knns():
        return [KNN(n_neighbors=k, contamination=contamination)
                for k in knn_grid]

    def iforests(grid=iforest_grid):
        return [IForest(n_estimators=n, contamination=contamination)
                for n in grid]

    def hboses(count=20):
        return [HBOS(contamination=contamination) for _ in range(count)]

    def pcas(count=10):
        return [PCA(contamination=contamination) for _ in range(count)]

    base_detectors = (
        # Block 1 (PCA run is 8 long here, 10 everywhere else).
        lofs() + abods() + lofs()
        + hboses() + pcas(8) + knns() + iforests()
        # Block 2.
        + lofs() + lofs() + lofs()
        + hboses() + pcas() + knns() + iforests()
        # Block 3.
        + lofs() + lofs() + lofs()
        + hboses() + pcas() + knns() + iforests()
        # Block 4: has an ABOD run and an IForest run with an extra
        # trailing (150, 200) pair, then the regular tail.
        + lofs() + lofs() + lofs()
        + hboses() + abods() + iforests(iforest_grid + (150, 200))
        + pcas() + knns() + iforests()
        # Block 5.
        + lofs() + abods() + lofs() + lofs()
        + hboses() + pcas() + knns() + iforests()
        # Tail: ABOD grid extended to 45, OCSVM and MCD pools, a short
        # high-k LOF run, and a final ABOD run.
        + lofs() + abods(abod_grid + (45,))
        + [OCSVM(contamination=contamination) for _ in range(10)]
        + [MCD(contamination=contamination) for _ in range(20)]
        + lofs((75, 80, 85, 90, 95, 100)) + abods()
    )
    # 15*17 LOF + 6 LOF + 41 ABOD + 100 HBOS + 48 PCA + 70 KNN
    # + 50 IForest + 10 OCSVM + 20 MCD == 600.
    return base_detectors