def model_init(self, model):
    """Initialise the detector and scaler stored under the key ``model``.

    The detector type is chosen by ``self.model`` (the configured algorithm
    name), not by the ``model`` argument, which is only the dictionary key.
    Any algorithm name without its own entry falls back to HBOS. A fresh
    ``MinMaxScaler`` is always stored in ``self.custom_model_scalers``.
    """
    # Zero-argument builders so that only the selected detector is created.
    builders = {
        'pca': lambda: PCA(contamination=self.contamination),
        'loda': lambda: LODA(contamination=self.contamination),
        'iforest': lambda: IForest(n_estimators=50,
                                   bootstrap=True,
                                   behaviour='new',
                                   contamination=self.contamination),
        'cblof': lambda: CBLOF(n_clusters=3,
                               contamination=self.contamination),
        'feature_bagging': lambda: FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination),
        'copod': lambda: COPOD(contamination=self.contamination),
        'hbos': lambda: HBOS(contamination=self.contamination),
    }
    build = builders.get(self.model, builders['hbos'])
    self.models[model] = build()
    self.custom_model_scalers[model] = MinMaxScaler()
def S2(self):
    """Run the S2 (statistical) anomaly stage on the rows that passed S1.

    Steps, in order:

    1. Call ``self.S1()`` and read ``self.water_data`` / ``self.result``.
    2. Standardise positional columns 1..11 of the rows with ``S1 == 0`` and
       fit four PyOD detectors on them. A row is flagged in the new ``S2``
       column only if all four detectors label it an outlier (the 0/1 labels
       are multiplied). Rows excluded by S1 get no S2 value from this step.
    3. For each of the eleven named features, fit a Gaussian kernel density
       on the clean rows, rank the densities per feature, and record in
       ``S2_names`` the feature in which the row has the lowest rank.
    4. Persist the fitted scaler and the Isolation Forest with ``joblib``.

    Side effects: replaces ``self.water_data`` with the frame extended by
    ``S2`` and ``S2_names``, writes two entries into ``self.result``, and
    writes two files under ``./water_model/``.
    """
    self.S1()
    water_data = self.water_data
    result = self.result

    # Data pre-processing and model training.
    clean_data = water_data[water_data['S1'] == 0]
    X_train = np.array(clean_data.iloc[:, 1:12])
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
    clf2 = KNN(contamination=0.05, n_neighbors=100)
    clf3 = HBOS(contamination=0.05, n_bins=10)
    clf4 = PCA(contamination=0.05)
    for clf in (clf1, clf2, clf3, clf4):
        clf.fit(X_train)

    # Product of 0/1 labels: 1 only when every detector agrees on "outlier".
    Y = pd.DataFrame(index=clean_data.index, columns=['S2'])
    Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
    water_data = pd.concat([water_data, Y], axis=1)
    # Disabled in the original: mark rows that were anomalous in S1 as 0 in
    # S2, i.e. water_data.loc[water_data['S2'].isna(), ['S2']] = 0
    result['统计异常'] = water_data['S2'].values

    # Find the anomalous dimension (feature) for each clean row.
    from sklearn.neighbors import KernelDensity
    clean_data = water_data[water_data['S1'] == 0]
    features = [
        'temperature', 'pH', 'EC', 'ORP', 'DO', 'turbidity', 'transparency',
        'COD', 'P', 'NH3N', 'flux'
    ]
    dens = pd.DataFrame(index=clean_data.index, columns=features)
    for feature in features:
        samples = clean_data[feature].values.reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(samples)
        # score_samples returns log-density; exponentiate to get density.
        dens[feature] = np.exp(kde.score_samples(samples))
    dens = dens.iloc[:, 0:11].rank()
    dens['S2_names'] = dens.idxmin(axis=1)
    water_data = pd.concat([water_data, dens['S2_names']], axis=1)
    self.water_data = water_data
    result['统计异常维度'] = water_data['S2_names'].values

    # Save the models.
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(clf1, "./water_model/S2_Iforest")
def setUp(self):
    """Generate a small synthetic dataset and fit the HBOS detector on it."""
    self.n_train, self.n_test = 100, 50
    self.contamination = 0.1
    self.roc_floor = 0.6

    splits = generate_data(n_train=self.n_train,
                           n_test=self.n_test,
                           contamination=self.contamination,
                           random_state=42)
    self.X_train, self.y_train, self.X_test, self.y_test = splits

    self.clf = HBOS(contamination=self.contamination)
    self.clf.fit(self.X_train)
def models_init(self):
    """Initialise one detector and one scaler per model in scope.

    Reads the algorithm name from ``self.configuration['model']`` (default
    ``'pca'``) into ``self.model``, then fills ``self.models`` with a fresh
    detector of that type for every entry of ``self.models_in_scope``.
    Algorithm names without their own entry fall back to HBOS.
    ``self.custom_model_scalers`` gets a fresh ``MinMaxScaler`` per entry.
    """
    self.model = self.configuration.get('model', 'pca')

    # Zero-argument factories: a detector is only created when called.
    factories = {
        'pca': lambda: PCA(contamination=self.contamination),
        'loda': lambda: LODA(contamination=self.contamination),
        'iforest': lambda: IForest(n_estimators=50,
                                   bootstrap=True,
                                   behaviour='new',
                                   contamination=self.contamination),
        'cblof': lambda: CBLOF(n_clusters=3,
                               contamination=self.contamination),
        'feature_bagging': lambda: FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination),
        'copod': lambda: COPOD(contamination=self.contamination),
        'hbos': lambda: HBOS(contamination=self.contamination),
    }
    make_detector = factories.get(self.model, factories['hbos'])

    self.models = {name: make_detector() for name in self.models_in_scope}
    self.custom_model_scalers = {
        name: MinMaxScaler()
        for name in self.models_in_scope
    }
def getOutlierHBOS(dataset):
    '''
    @brief Fit an HBOS detector with default parameters on the dataset and
    return the labels it assigned to the training instances
    @param dataset Dataset on which the detector is fitted
    @return The detector's labels_ attribute: 0 means inlier, 1 means outlier
    '''
    detector = HBOS()
    detector.fit(dataset)
    return detector.labels_
def define_classifiers(random_state, outliers_fraction):
    """Return ten unfitted PyOD detectors keyed by their display name.

    :param random_state: seed or RandomState passed to the detectors that
        accept a ``random_state`` argument.
    :param outliers_fraction: value passed as ``contamination`` to every
        detector.
    :return: dict mapping display name to detector instance.
    """
    plain = {'contamination': outliers_fraction}
    seeded = {'contamination': outliers_fraction,
              'random_state': random_state}
    return {
        'Angle-based Outlier Detector (ABOD)': ABOD(**plain),
        'Cluster-based Local Outlier Factor':
            CBLOF(check_estimator=False, **seeded),
        'Feature Bagging': FeatureBagging(**seeded),
        'Histogram-base Outlier Detection (HBOS)': HBOS(**plain),
        'Isolation Forest': IForest(**seeded),
        'K Nearest Neighbors (KNN)': KNN(**plain),
        'Local Outlier Factor (LOF)': LOF(**plain),
        'Minimum Covariance Determinant (MCD)': MCD(**seeded),
        'One-class SVM (OCSVM)': OCSVM(**plain),
        'Principal Component Analysis (PCA)': PCA(**seeded),
    }
def load_classifiers(outliers_fraction):
    """Return eleven unfitted PyOD detectors keyed by their display name.

    ``outliers_fraction`` is capped at 0.5 before being used as the
    ``contamination`` of every detector. Detectors that take a
    ``random_state`` share one ``RandomState(42)`` instance.
    """
    contamination = min(0.5, outliers_fraction)
    rng = np.random.RandomState(42)

    # Detectors to be compared.
    detectors = {}
    detectors['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=contamination)
    detectors['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        contamination=contamination, check_estimator=False, random_state=rng)
    detectors['Feature Bagging'] = FeatureBagging(
        LOF(n_neighbors=35), contamination=contamination, random_state=rng)
    detectors['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=contamination)
    detectors['Isolation Forest'] = IForest(
        contamination=contamination, random_state=rng, behaviour="new")
    detectors['K Nearest Neighbors (KNN)'] = KNN(contamination=contamination)
    detectors['Average KNN'] = KNN(method='mean',
                                   contamination=contamination)
    detectors['Local Outlier Factor (LOF)'] = LOF(
        n_neighbors=35, contamination=contamination)
    detectors['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=contamination, random_state=rng)
    detectors['One-class SVM (OCSVM)'] = OCSVM(contamination=contamination)
    detectors['Principal Component Analysis (PCA)'] = PCA(
        contamination=contamination, random_state=rng)
    return detectors
def try_fit(self, df_train, models_to_train=None):
    """Fit each model, falling back to a default HBOS model if a fit fails.

    For every model name, the training columns are those of ``df_train``
    whose name starts with ``'<model>|'``; they are turned into features
    with ``self.make_features``. If fitting raises, the failure is logged,
    the model is replaced by a fresh HBOS detector, and that one is fitted
    instead. ``self.n_fit_success`` / ``self.n_fit_fail`` count outcomes and
    ``self.fitted_at[model]`` records the current ``self.runs_counter``.

    :param df_train <pd.DataFrame>: data to train on.
    :param models_to_train <list>: list of models to train; defaults to all
        keys of ``self.models``.
    """
    if models_to_train is None:
        models_to_train = list(self.models.keys())

    self.n_fit_fail, self.n_fit_success = 0, 0

    for model in models_to_train:
        is_model_col = df_train.columns.str.startswith(f'{model}|')
        model_cols = df_train.columns[is_model_col]
        X_train = self.make_features(df_train[model_cols].values,
                                     train=True,
                                     model=model)
        try:
            self.models[model].fit(X_train)
        except Exception as e:
            self.n_fit_fail += 1
            self.info(e)
            self.info(
                f'training failed for {model} at run_counter {self.runs_counter}, defaulting to hbos model.'
            )
            self.models[model] = HBOS(contamination=self.contamination)
            self.models[model].fit(X_train)
        else:
            self.n_fit_success += 1
        self.fitted_at[model] = self.runs_counter
def __load_classifiers(self):
    """Return nine unfitted PyOD detectors keyed by their display name.

    Every detector uses a contamination of 0.05; those that take a
    ``random_state`` share one ``RandomState(0)`` instance.
    """
    contamination = 0.05
    rng = np.random.RandomState(0)
    plain = {'contamination': contamination}
    seeded = {'contamination': contamination, 'random_state': rng}
    return {
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(check_estimator=False, **seeded),
        'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), **seeded),
        'Histogram-base Outlier Detection (HBOS)': HBOS(**plain),
        'Isolation Forest': IForest(**seeded),
        'K Nearest Neighbors (KNN)': KNN(**plain),
        'Average KNN': KNN(method='mean', **plain),
        'Local Outlier Factor (LOF)': LOF(n_neighbors=35, **plain),
        'Minimum Covariance Determinant (MCD)': MCD(**seeded),
        'One-class SVM (OCSVM)': OCSVM(**plain),
    }
def extract_is_outlier_forall(df: pd.DataFrame,
                              verbose: bool = True,
                              model=None,
                              outliers_fraction: float = 0.05,
                              replace_with=None) -> pd.DataFrame:
    """Add an ``<col>_isoutlier`` flag for every numerical column.

    Works on a deep copy with lower-cased column names. Columns are selected
    as those whose ``make_conf`` entry has ``type == 'numerical'``; each is
    passed through ``extract_is_outlier`` with a shared model (HBOS with
    ``outliers_fraction`` contamination when ``model`` is None). Finally
    ``remove_single_value_features`` is applied with the new flag columns as
    ``include``.

    :param df: the data
    :param verbose: forwarded to the helpers
    :param model: detector to use; a default HBOS is created when None
    :param outliers_fraction: contamination for the default detector
    :param replace_with: forwarded to ``extract_is_outlier``
    :return: the processed copy
    """
    df = df.copy(deep=True)
    df.columns = [name.lower() for name in df.columns]

    conf = make_conf(df)
    if model is None:
        model = HBOS(contamination=outliers_fraction)

    # Keep only the numerical columns.
    numerical = [
        name for name in list(conf.keys())
        if conf[name]['type'] == 'numerical'
    ]

    # The tqdm wrapper is both the iterator and the progress bar handed on.
    progress = tqdm(numerical)
    for name in progress:
        df = extract_is_outlier(df,
                                name,
                                progress,
                                verbose,
                                model=model,
                                outliers_fraction=outliers_fraction,
                                replace_with=replace_with)

    flag_cols = [name for name in df.columns if '_isoutlier' in name]
    return remove_single_value_features(df, verbose, include=flag_cols)
def train():
    """Train the predictor on the data collected for one device/session.

    Reads ``deviceUUID`` and ``datetime`` from the request arguments, loads
    the matching raw data file (comma-separated integers, three per row) and
    the pickled awake features, extracts calibrated features, fits an HBOS
    detector on the stacked features, and pickles both the detector and a
    baseline dict. Returns a JSON response with ``status`` 0 and the elapsed
    time, or ``status`` 1 when there are too few rows to train on.
    """
    start_time = time.time()

    device_uuid = request.args.get('deviceUUID')
    date_time = request.args.get('datetime')

    data_filename = get_data_filename(device_uuid, date_time)
    with open(data_filename, 'r') as f:
        rows = f.readlines()

    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only safe while config.awake_filename is a trusted local file.
    with open(config.awake_filename, 'rb') as f:
        awake_features = pickle.load(f)

    n_rows = len(rows)
    if n_rows < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % n_rows
        })

    raw = np.zeros((n_rows, 3))
    for i, line in enumerate(rows):
        raw[i] = [int(val) for val in line.strip().split(',')]

    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)

    X = np.concatenate((awake_features, norm_features), axis=0)
    # NOTE(review): feature column 1 is overwritten with small positive
    # noise; the reason is not visible in this function.
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))

    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features',
        X.shape[0], X.shape[1])

    clf = HBOS(contamination=0.05)
    clf.fit(X)

    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'wb') as f:
        pickle.dump(clf, f)

    # Baseline: the calibration features plus the lowest training score.
    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}
    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'wb') as f:
        pickle.dump(baseline, f)

    elapsed = time.time() - start_time
    return jsonify({"status": 0, "time": elapsed})
def main():
    """Run the 30-model benchmark once per scaler, in parallel.

    Builds the dictionary of detectors, then dispatches one ``runByScaler``
    job per scaler name through joblib.
    """
    root = 'Unsupervised_Anamaly_Detection_csv'
    name = "30_Models"
    scalers = ['no', 'std', 'minmax']
    start, counts = 0, 90
    outer_jobs = 3  # joblib workers, one per scaler
    inner_jobs = 4  # forwarded to runByScaler as CPUS

    # NOTE(review): 'IForest' is listed here but the models dict below uses
    # the key 'IF'; confirm against how runByScaler uses other_models.
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    jobs = (delayed(runByScaler)(root,
                                 scaler,
                                 models,
                                 start,
                                 counts,
                                 other_models=sklearn_models,
                                 CPUS=inner_jobs,
                                 save_name=name) for scaler in scalers)
    Parallel(n_jobs=outer_jobs)(jobs)
def detect_anomaly(df):
    """Label and score rows by their traded quote-asset volume using HBOS.

    Missing values are replaced with 0 first. A default HBOS detector is
    fitted on the single ``total_traded_quote_asset_volume`` column and the
    returned frame gains four columns:

    * ``label_qav``: HBOS prediction for the volume
    * ``score_qav``: HBOS decision-function score for the volume
    * ``change_qav``: percentage change of the volume versus the previous row
    * ``change_price``: percentage change of ``last_price`` versus the
      previous row

    :param df: frame with ``total_traded_quote_asset_volume`` and
        ``last_price`` columns.
    :return: a new frame (the result of ``fillna``) with the columns added.
    """
    df = df.fillna(0)

    # HBOS expects a 2-D array: one row per observation, one feature.
    volume = df.total_traded_quote_asset_volume.values.reshape(-1, 1)

    clf = HBOS()
    clf.fit(volume)
    df["label_qav"] = clf.predict(volume)
    df["score_qav"] = clf.decision_function(volume)

    df['change_qav'] = (
        df.total_traded_quote_asset_volume.pct_change(periods=1) * 100)
    df['change_price'] = df.last_price.pct_change(periods=1) * 100
    return df
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points using an Isolation Forest.

    The detector is fitted on ``x_raw`` with a contamination of 0.04; every
    row it predicts as an outlier is removed from both ``x_raw`` and
    ``y_raw`` via ``del_rowsorcolumns``.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Isolation Forest is the detector in use. The other PyOD detectors
    # (ABOD, CBLOF, FeatureBagging, HBOS, KNN, LOF, MCD, OCSVM, PCA, XGBOD)
    # take the same ``contamination`` argument and could be swapped in here.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    # Iterate over the actual predictions rather than a fixed row count so
    # the function works for any number of samples.
    idx_y_pred = [i for i, label in enumerate(y_pred) if label == 1]

    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def extract_is_outlier(df: pd.DataFrame,
                       col: str,
                       pbar=None,
                       verbose: bool = True,
                       model=None,
                       outliers_fraction: float = 0.05,
                       replace_with=None) -> pd.DataFrame:
    """
    Create an is_outlier column for one column of the data.

    The column is cast to float32; the model is fitted and evaluated on its
    finite values only. The new ``<col>_isoutlier`` column holds the model's
    prediction for those rows and 0 for rows that were NaN or infinite. When
    ``replace_with`` is given, the values of ``col`` flagged as outliers are
    overwritten with it. The input frame is not modified.

    :param df: the data
    :param col: the column name
    :param pbar: tqdm progress bar; when given, messages go to its
        description instead of being printed
    :param verbose: forwarded to ``print_c`` when no progress bar is given
    :param model: detector with ``fit``/``predict``; a default HBOS with
        ``outliers_fraction`` contamination is created when None
    :param outliers_fraction: contamination for the default detector
    :param replace_with: replacement value for flagged entries, or None
    :return: a deep copy of ``df`` with the flag column added
    """

    def announce(text):
        # Route status messages to the progress bar when there is one.
        if pbar is None:
            print_c(verbose, text)
        else:
            pbar.set_description(text)

    df = df.copy(deep=True)
    announce("Trying to find outliers in " + str(col))

    if model is None:
        model = HBOS(contamination=outliers_fraction)

    values = df[col].astype(np.float32)
    # Finite means neither NaN nor +/- infinity.
    mask = np.isfinite(values)
    usable = values[mask].to_frame()
    model.fit(usable)
    preds = model.predict(values[mask].to_frame())

    flag_col = col + '_' + 'isoutlier'
    df[flag_col] = 0
    df.loc[mask, flag_col] = preds

    if replace_with is not None:
        announce("Replacing outliers in " + str(col) + " with " +
                 str(replace_with))
        df.loc[df[flag_col] == 1, col] = replace_with
    return df
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """
    Use the pyod lib to flag outliers in the dataset and export the results.

    Four detectors (HBOS, IForest, CBLOF, PCA) are fitted on the feature
    columns. For each detector a frame with columns ``id``, ``name``,
    ``result`` (decision score) and ``label`` is produced; the frames are
    stacked, tagged with a ``job_name`` (current time as HHMM), appended to
    the ``wh_v1.t_ml`` table through the module-level ``engine``, and
    returned.

    :param df: input data; it is copied, not modified.
    :param id_col: name of the identifier column. When None the frame's
        index is used as the id and every column is a feature.
    :param contamination: expected outlier fraction passed to every detector
        (default 0.05, i.e. 5%).
    :param trans_cols: optional columns to power-transform before fitting.
    :return: the stacked results frame.
    """
    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination)
    }

    if id_col is None:
        # No id column: identify rows by index and train on all columns.
        s_id = df.index
        X_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        for col in trans_cols:
            transformed = PowerTransformer().fit_transform(
                df[col].values.reshape(-1, 1))
            # fit_transform returns an (n, 1) array; flatten it to a column.
            df[col] = transformed.ravel()

    results_list = []
    for clf_name, clf in OD_clfs.items():
        print(f"{clf_name}, {clf}")
        clf.fit(df[X_cols])
        # Plain arrays keep every per-detector frame on a fresh RangeIndex.
        od_result = pd.DataFrame({
            "id": s_id.values,
            "name": clf_name,
            "result": clf.decision_scores_,
            "label": clf.labels_,
        })
        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    # pd.datetime is gone from current pandas; Timestamp formats the same.
    job_name = f'{pd.Timestamp.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml',
                         engine,
                         if_exists='append',
                         schema='wh_v1',
                         method=psql_insert_copy)
    print(
        f"OD results {od_results_df.shape}exported to database{engine},job_name={job_name}"
    )
    return od_results_df
def ranger(parameter, classifier):
    """Build one detector whose main hyper-parameter is set to ``parameter``.

    ``parameter`` is used as ``n_clusters`` for CBLOF, ``n_bins`` for HBOS
    and ``n_neighbors`` for KNN and LOF. The contamination and random state
    come from the module-level ``outliers_fraction`` and ``random_state``.

    Only the requested detector is constructed.

    :param parameter: value of the swept hyper-parameter.
    :param classifier: one of ``'CBLOF'``, ``'HBOS'``, ``'KNN'``, ``'LOF'``.
    :return: the unfitted detector.
    :raises KeyError: if ``classifier`` is not one of the names above.
    """
    builders = {
        'CBLOF': lambda: CBLOF(contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state,
                               n_clusters=parameter),
        'HBOS': lambda: HBOS(contamination=outliers_fraction,
                             n_bins=parameter),
        'KNN': lambda: KNN(contamination=outliers_fraction,
                           n_neighbors=parameter),
        'LOF': lambda: LOF(n_neighbors=parameter,
                           contamination=outliers_fraction),
    }
    return builders[classifier]()
def _create_model(n_bins=10, alpha=0.1, tol=0.1, contamination=0.1):
    """(Internal helper) Create a HBOS instance.

    ``n_bins`` is coerced to ``int`` before use; the other arguments are
    passed through unchanged. The created model is printed and returned.
    """
    params = {
        'n_bins': int(n_bins),
        'alpha': alpha,
        'tol': tol,
        'contamination': contamination,
    }
    model = HBOS(**params)
    print('Created Model: {}'.format(model))
    return model
def _get_outlier_labels(eigs: ndarray, tol: float) -> List[str]:
    """Identify the outliers of eigs with HBOS.

    Each eigenvalue is paired with its position and the pairs are fed to an
    HBOS detector. The raw flags are then adjusted so that flags are not
    kept in the interior of the sequence, and the result is returned as a
    list of ``"outlier"`` / ``"inlier"`` strings, one per eigenvalue.
    """
    detector = HBOS(tol=tol)
    positions = np.arange(0, len(eigs))
    X = np.vstack([eigs, positions]).T  # data array
    flags = np.array(detector.fit(X).labels_, dtype=bool)  # outliers get "1"

    # Because eigs are sorted, HBOS will *usually* identify outliers at one
    # of the two ends of the eigenvalues, which is what we want. That is not
    # always the case, so stray flags have to be cleared.
    if flags[0]:
        # Clear every flag from the first non-outlier onwards.
        start = find_first(flags, False)
        for idx in range(start, len(flags)):
            flags[idx] = False
    if flags[-1]:
        # Clear every flag before the last non-outlier.
        stop = find_last(flags, False)
        for idx in range(stop):
            flags[idx] = False
    if not (flags[0] or flags[-1]):
        # Neither end is flagged: force a break later.
        flags = np.zeros(flags.shape, dtype=bool)

    return np.where(flags, "outlier", "inlier").tolist()
def test_hbos(self):
    """Fit HBOS and check score/prediction shapes and a minimal ROC AUC."""
    n_train = self.X_train.shape[0]
    n_test = self.X_test.shape[0]

    clf = HBOS(contamination=0.05)
    clf.fit(self.X_train)
    assert_equal(len(clf.decision_scores), n_train)

    pred_scores = clf.decision_function(self.X_test)
    assert_equal(pred_scores.shape[0], n_test)

    pred_labels = clf.predict(self.X_test)
    assert_equal(pred_labels.shape[0], n_test)

    # Better than chance on the held-out data.
    assert_greater(roc_auc_score(self.y_test, pred_scores), 0.5)
def identify_fit(algo_type, outliers_fraction):
    """Return an unfitted PyOD detector for the given algorithm name.

    Despite the name, the detector is only constructed here, not fitted.

    :param algo_type: one of ``'hbos'``, ``'cblof'``, ``'iforest'``,
        ``'knn'`` or ``'aknn'`` (KNN with ``method='mean'``).
    :param outliers_fraction: value passed as ``contamination``.
    :return: the detector instance.
    :raises ValueError: if ``algo_type`` is not one of the names above.
    """
    random_state = np.random.RandomState(42)

    if algo_type == 'hbos':
        return HBOS(contamination=outliers_fraction)
    if algo_type == 'cblof':
        return CBLOF(contamination=outliers_fraction,
                     check_estimator=False,
                     random_state=random_state)
    if algo_type == 'iforest':
        return IForest(contamination=outliers_fraction,
                       random_state=random_state)
    if algo_type == 'knn':
        return KNN(contamination=outliers_fraction)
    if algo_type == 'aknn':
        return KNN(method='mean', contamination=outliers_fraction)

    # Fail with a clear message instead of an unbound-local error.
    raise ValueError(
        "unknown algo_type {!r}; expected one of "
        "'hbos', 'cblof', 'iforest', 'knn', 'aknn'".format(algo_type))
def setUp(self):
    """Generate synthetic data and build the SUOD ensemble under test.

    The ensemble combines five LOF detectors with different neighbourhood
    sizes, HBOS, PCA and a two-LOF LSCP. The cost-forecast model paths point
    at the joblib files next to this test module.
    """
    self.n_train, self.n_test = 1000, 500
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.random_state = 42

    splits = generate_data(n_train=self.n_train,
                           n_test=self.n_test,
                           contamination=self.contamination,
                           random_state=self.random_state)
    self.X_train, self.y_train, self.X_test, self.y_test = splits

    lof_detectors = [
        LOF(n_neighbors=k, contamination=self.contamination)
        for k in (5, 15, 25, 35, 45)
    ]
    lscp = LSCP(detector_list=[
        LOF(n_neighbors=5, contamination=self.contamination),
        LOF(n_neighbors=15, contamination=self.contamination)
    ],
                random_state=self.random_state)
    self.base_estimators = lof_detectors + [
        HBOS(contamination=self.contamination),
        PCA(contamination=self.contamination),
        lscp,
    ]

    here = os.path.abspath(os.path.dirname(__file__))
    self.cost_forecast_loc_fit_ = os.path.join(here, 'bps_train.joblib')
    self.cost_forecast_loc_pred_ = os.path.join(here,
                                                'bps_prediction.joblib')

    self.model = SUOD(base_estimators=self.base_estimators,
                      n_jobs=2,
                      rp_flag_global=True,
                      bps_flag=True,
                      contamination=self.contamination,
                      approx_flag_global=True,
                      cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                      cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                      verbose=True)
def out_lier_score(df, target, num_var):
    """Return the outlier labels of seven PyOD detectors for each row.

    The numeric columns ``num_var`` are scaled to [0, 1] and every detector
    is fitted on the scaled data with a contamination of 0.05.

    :param df: input frame.
    :param target: name of the target column; it is passed to the scaler as
        ``y`` and must exist in ``df``.
    :param num_var: list of numeric feature columns to use.
    :return: DataFrame with one row per input row and one column per
        detector (named after the detector), holding its predicted labels.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(df.loc[:, num_var], df[target])

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05

    # Seven outlier detection tools to be compared.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction)
    }

    # Only the labels are needed, so the raw anomaly scores are not computed.
    df_out_score = []
    for clf in classifiers.values():
        clf.fit(X)
        df_out_score.append(clf.predict(X).tolist())

    # One list per detector -> transpose so each detector becomes a column.
    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
def choose_model(model, nnet):
    """Return one unfitted PyOD detector selected by name.

    Only the requested detector is constructed, so asking for e.g.
    ``'HBOS'`` neither builds the neural models nor touches ``nnet``.

    :param model: detector name: ``'AE'``, ``'VAE'``, ``'ABOD'``,
        ``'FeatureBagging'``, ``'HBOS'``, ``'IForest'``, ``'KNN'``,
        ``'LOF'``, ``'OCSVM'``, ``'PCA'``, ``'SOS'``, ``'COF'``, ``'CBLOF'``,
        ``'SOD'``, ``'LOCI'`` or ``'MCD'``.
    :param nnet: layer sizes for the neural detectors. ``'AE'`` uses the
        whole sequence as hidden layers; ``'VAE'`` uses ``nnet[:5]`` for the
        encoder and ``nnet[4:]`` for the decoder.
    :return: the detector instance.
    :raises KeyError: if ``model`` is not one of the names above.
    """
    # Lambdas keep construction lazy: nothing is built until it is chosen.
    builders = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet,
                                  contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5],
                           decoder_neurons=nnet[4:],
                           contamination=0.1,
                           epochs=13),
        'ABOD': lambda: ABOD(),
        'FeatureBagging': lambda: FeatureBagging(),
        'HBOS': lambda: HBOS(),
        'IForest': lambda: IForest(),
        'KNN': lambda: KNN(),
        'LOF': lambda: LOF(),
        'OCSVM': lambda: OCSVM(),
        'PCA': lambda: PCA(),
        'SOS': lambda: SOS(),
        'COF': lambda: COF(),
        'CBLOF': lambda: CBLOF(),
        'SOD': lambda: SOD(),
        'LOCI': lambda: LOCI(),
        'MCD': lambda: MCD(),
    }
    return builders[model]()
def runMethod(self):
    '''
    @brief This function is the actual implementation of HICS: it obtains
    the high contrast subspaces, scores the dataset projected on each of them
    with the configured outlier scorer and averages the scores. The result is
    stored in self.outlier_score and self.calculations_done is set to True
    @raises ValueError if self.outlier_rank is not one of "lof", "cof",
    "cblof", "loci", "hbos" or "sod"
    '''
    # The scorer type does not depend on the subspace, so resolve it once and
    # reject an unknown name before the expensive subspace search.
    rank = self.outlier_rank
    if rank == "lof":
        scorer_class = LOF
    elif rank == "cof":
        scorer_class = COF
    elif rank == "cblof":
        scorer_class = CBLOF
    elif rank == "loci":
        scorer_class = LOCI
    elif rank == "hbos":
        scorer_class = HBOS
    elif rank == "sod":
        scorer_class = SOD
    else:
        raise ValueError(
            "unknown outlier_rank {!r}; expected one of 'lof', 'cof', "
            "'cblof', 'loci', 'hbos', 'sod'".format(rank))

    if self.verbose:
        print("Calculating the subspaces\n")
    # First we obtain the high contrast subspaces
    subspaces = self.hicsFramework()

    if self.verbose:
        print("Now calculating the scoring\n")
    # We initialize the scores for each instance as 0
    scores = np.zeros(len(self.dataset))

    for sub in subspaces:
        # A fresh scorer is fitted on the dataset projection of each subspace
        scorer = scorer_class()
        scorer.fit(self.dataset[:, sub])
        # Adds the scores obtained to the global ones
        scores = scores + scorer.decision_scores_

    # Compute the average
    self.outlier_score = scores / len(subspaces)
    # Marks the calculations as done
    self.calculations_done = True
def get_HBOS_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    '''Flag outliers in the selected columns with HBOS.

    Takes df, a list of selected column names, outliers_fraction = 0.01
    default.

    When ``standardize`` is true the selected columns of ``dataframe`` are
    min-max scaled to [0, 1] in place. HBOS is fitted on those columns and
    its 0/1 predictions are written to a new ``outlier`` column.

    Returns: None. The frame (the same object as ``dataframe``, now with the
    ``outlier`` column) is stored in ``CheckOutliers.df2``, and the outlier /
    inlier counts are printed.
    '''
    if standardize:
        # standardize selected variables
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Stack the selected columns side by side into one 2-D numpy array.
    columns = [dataframe[name].values.reshape(-1, 1) for name in cols]
    X = np.concatenate(columns, axis=1)

    clf = HBOS(contamination=outliers_fraction)
    clf.fit(X)

    # prediction of a datapoint category outlier or inlier
    # (only labels are used, so the raw anomaly scores are not computed)
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df2 = dataframe
    CheckOutliers.df2['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with HBOS')
class TestHBOS(unittest.TestCase):
    """Unit tests for the HBOS detector on synthetic data."""

    def setUp(self):
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_proba_range(self, proba):
        # Shared check: probabilities must lie within [0, 1].
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        clf = self.clf
        assert (hasattr(clf, 'decision_scores_')
                and clf.decision_scores_ is not None)
        assert (hasattr(clf, 'labels_') and clf.labels_ is not None)
        assert (hasattr(clf, 'threshold_') and clf.threshold_ is not None)
        assert (hasattr(clf, '_mu') and clf._mu is not None)
        assert (hasattr(clf, '_sigma') and clf._sigma is not None)
        assert (hasattr(clf, 'hist_') and clf.hist_ is not None)
        assert (hasattr(clf, 'bin_edges_') and clf.bin_edges_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, scores) >= self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._check_proba_range(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._check_proba_range(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring=scoring)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    # NOTE: a test for ``self.clf.score`` (default, 'roc_auc_score',
    # 'prc_n_score', and NotImplementedError for an unknown scoring) was
    # disabled in the original and remains disabled.

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        # Cloning must simply not raise.
        clone(self.clf)

    def tearDown(self):
        pass
outliers_fraction = 0.05

# Seven outlier detection tools to be compared. Detectors that accept a
# random_state share the ``random_state`` object defined earlier.
_plain = {'contamination': outliers_fraction}
_seeded = {'contamination': outliers_fraction, 'random_state': random_state}
classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(**_plain),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(check_estimator=False, **_seeded),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), check_estimator=False, **_seeded),
    'Histogram-base Outlier Detection (HBOS)': HBOS(**_plain),
    'Isolation Forest': IForest(**_seeded),
    'K Nearest Neighbors (KNN)': KNN(**_plain),
    'Average KNN': KNN(method='mean', **_plain),
}

# 200 x 200 evaluation grid over the unit square.
_axis = np.linspace(0, 1, 200)
xx, yy = np.meshgrid(_axis, np.linspace(0, 1, 200))

for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
    # predict raw anomaly score (sign flipped)
    scores_pred = clf.decision_function(X) * -1
if not len(os.listdir(csv_folder)): convert_mat_to_csv(mat_data) random_state = 42 # Define nine outlier detection tools to be compared classifiers = { '(ABOD) Angle-based Outlier Detector': ABOD(contamination=outliers_fraction), '(CBLOF) Cluster-based Local Outlier Factor ': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, random_state=random_state), '(HBOS) Histogram-base Outlier Detection': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), '(KNN) K Nearest Neighbors ': KNN( contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), '(LOF) Local Outlier Factor ': LOF(n_neighbors=35, contamination=outliers_fraction), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), '(MCD) Minimum Covariance Determinant ': MCD( contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
from matplotlib.lines import Line2D from pyod.data.load_data import generate_data from pyod.utils.utility import precision_n_scores from pyod.models.hbos import HBOS if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 1000 n_test = 500 X_train, y_train, c_train, X_test, y_test, c_test = generate_data( n_train=n_train, n_test=n_test, contamination=contamination) # train a HBOS detector (default version) clf = HBOS() clf.fit(X_train) # get the prediction on the training data y_train_pred = clf.y_pred y_train_score = clf.decision_scores # get the prediction on the test data y_test_pred = clf.predict(X_test) y_test_score = clf.decision_function(X_test) print('Train ROC:{roc}, precision@n:{prn}'.format( roc=roc_auc_score(y_train, y_train_score), prn=precision_n_scores(y_train, y_train_score))) print('Test ROC:{roc}, precision@n:{prn}'.format(
if __name__ == "__main__":
    # Experiment settings.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42)

    # train HBOS detector (default parameters)
    clf_name = 'HBOS'
    clf = HBOS()
    clf.fit(X_train)

    # Training data: labels and scores are stored on the fitted detector.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Test data: labels and scores are computed on demand.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
class TestHBOS(unittest.TestCase):
    """Unit tests for the HBOS detector.

    Each test runs against a detector that ``setUp`` fits on a freshly
    generated training split.
    """

    def setUp(self):
        # Problem size and thresholds shared by every test.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        # NOTE(review): the unpack order assumes generate_data returns
        # (X_train, y_train, X_test, y_test) -- confirm for the PyOD
        # version in use.
        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        detector = HBOS(contamination=self.contamination)
        detector.fit(self.X_train)
        self.clf = detector

    def _assert_unit_interval(self, proba):
        # Shared check: every predicted probability lies within [0, 1].
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'hist_',
            'bin_edges_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # check performance
        test_roc = roc_auc_score(self.y_test, scores)
        assert_greater(test_roc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each explicitly named metric.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        # An unknown metric name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of the ranks must follow the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)

        # ranks are bounded by the training set size (and not below -0.1)
        upper_bound = self.X_train.shape[0] + 1
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of the ranks must follow the ordering of the scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)

        # normalized ranks stay below 1.01 (and not below -0.1)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        # Nothing to release: each test gets a fresh fixture from setUp.
        pass