def feature_bagging(X_train, X_test, Y_train, Y_test):
    import numpy as np  # needed for np.sum below
    from pyod.models.feature_bagging import FeatureBagging

    model = FeatureBagging(random_state=1)
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def setUp(self):
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = FeatureBagging(contamination=self.contamination)
    self.clf.fit(self.X_train)
def getOulierFeatureBagging(dataset):
    '''
    @brief Function that runs the Feature Bagging algorithm on the dataset and
    obtains the labels indicating whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model without verbose output
    fb = FeatureBagging(verbose=0)
    # Fit the data and obtain the labels
    fb.fit(dataset)
    # Return labels
    return fb.labels_
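# A minimal usage sketch of getOulierFeatureBagging, assuming synthetic data from
# pyod.utils.data.generate_data; the variable names below are hypothetical.
import numpy as np
from pyod.utils.data import generate_data

X, y = generate_data(n_train=200, n_features=5, contamination=0.1,
                     train_only=True, random_state=42)
labels = getOulierFeatureBagging(X)
inliers = X[np.array(labels) == 0]      # keep only instances labelled 0 (inliers)
print('outliers detected:', int(np.sum(labels)))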
def get_model_bagging(percentage_of_outliers=0.002, num_estimators=2,
                      combination='max'):
    """Create a Feature Bagging model.

    Args:
        percentage_of_outliers: expected percentage of fraud (outliers) in the data
        num_estimators: number of base estimators in the ensemble
        combination: if 'average', take the average score of all detectors;
            if 'max', take the maximum score of all detectors

    Returns:
        model: Feature Bagging model
    """
    utils.save_log('{0} :: {1}'.format(get_model_bagging.__module__,
                                       get_model_bagging.__name__))

    model = FeatureBagging(contamination=percentage_of_outliers,
                           n_estimators=num_estimators,
                           combination=combination,
                           random_state=config.random_seed,
                           n_jobs=config.num_jobs)
    return model
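# A minimal usage sketch, assuming the project-level config.random_seed,
# config.num_jobs and utils.save_log are available; X_train is a hypothetical
# feature matrix.
model = get_model_bagging(percentage_of_outliers=0.01,
                          num_estimators=10,
                          combination='average')
model.fit(X_train)                          # unsupervised fit on the features
scores = model.decision_function(X_train)   # higher score = more anomalous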
def model_init(self, model):
    """Model initialisation of a single model.
    """
    if self.model == 'pca':
        self.models[model] = PCA(contamination=self.contamination)
    elif self.model == 'loda':
        self.models[model] = LODA(contamination=self.contamination)
    elif self.model == 'iforest':
        self.models[model] = IForest(n_estimators=50, bootstrap=True,
                                     behaviour='new',
                                     contamination=self.contamination)
    elif self.model == 'cblof':
        self.models[model] = CBLOF(n_clusters=3,
                                   contamination=self.contamination)
    elif self.model == 'feature_bagging':
        self.models[model] = FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination)
    elif self.model == 'copod':
        self.models[model] = COPOD(contamination=self.contamination)
    elif self.model == 'hbos':
        self.models[model] = HBOS(contamination=self.contamination)
    else:
        self.models[model] = HBOS(contamination=self.contamination)
    self.custom_model_scalers[model] = MinMaxScaler()
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
def __load_classifiers(self):
    outliers_fraction = 0.05
    random_state = np.random.RandomState(0)

    classifiers = {
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    }
    return classifiers
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)

    # Define eleven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state,
                    behaviour="new"),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
def outlier_detection(x_raw, y_raw):
    """
    Filter out all outlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidates are listed as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
            XGBOD(contamination=outliers_fraction),
    }

    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')

    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(y_pred.shape[0]) if y_pred[i] == 1]

    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])

    return x_clean, y_clean
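# A small sketch of the label-convention difference noted in the comments above,
# assuming plain NumPy only: PyOD returns 1 = outlier / 0 = inlier, while
# scikit-learn detectors return -1 = outlier / 1 = inlier.
import numpy as np

def sklearn_to_pyod_labels(labels):
    """Map sklearn-style labels (-1 outlier / 1 inlier) to PyOD-style (1 / 0)."""
    labels = np.asarray(labels)
    return (labels == -1).astype(int)

# sklearn_to_pyod_labels([1, -1, 1]) -> array([0, 1, 0])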
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = FeatureBagging(contamination=self.contamination)
    self.clf.fit(self.X_train)
def models_init(self):
    """Models initialisation.
    """
    self.model = self.configuration.get('model', 'pca')
    if self.model == 'pca':
        self.models = {
            model: PCA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'loda':
        self.models = {
            model: LODA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'iforest':
        self.models = {
            model: IForest(n_estimators=50, bootstrap=True, behaviour='new',
                           contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'cblof':
        self.models = {
            model: CBLOF(n_clusters=3, contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'feature_bagging':
        self.models = {
            model: FeatureBagging(
                base_estimator=PCA(contamination=self.contamination),
                contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'copod':
        self.models = {
            model: COPOD(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'hbos':
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    else:
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    self.custom_model_scalers = {
        model: MinMaxScaler() for model in self.models_in_scope
    }
def out_lier_score(df, target, num_var):
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df.loc[:, num_var], df[target])  # .to_numpy()
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05
    X = df
    df_out_score = []
    # Define seven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction)
    }

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly scores
        scores_pred = clf.decision_function(X) * -1
        # predict whether each data point is an outlier or an inlier
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
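# A minimal usage sketch for out_lier_score; the DataFrame, target and column
# names here are hypothetical.
labels_df = out_lier_score(df, target='label', num_var=['amount', 'age'])
print(labels_df.head())
# rows flagged by a majority of the seven detectors
consensus_outliers = labels_df.sum(axis=1) >= 4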
def choose_model(model, nnet):
    """Choose a detector among those implemented in PyOD."""
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
def fun(dir_path):
    file_list = []
    total_roc = []
    total_prn = []
    count = 0
    for home, dirs, files in os.walk("./" + dir_path + "/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            file_list.append(fullname)

    for file_csv in file_list:
        # if count == 2:
        #     break
        df = pd.read_csv(file_csv)
        columns = df.columns
        # df = df[columns].fillna('nan')
        data = df.drop(columns=['point.id', 'motherset', 'origin'])
        class_mapping = {"anomaly": 1, "nominal": 0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)
        y = data['ground.truth']
        x = data.drop('ground.truth', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05
        # Define seven outlier detection tools to be compared
        classifiers = {
            'Angle-based Outlier Detector (ABOD)':
                ABOD(contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor (CBLOF)':
                CBLOF(contamination=outliers_fraction, check_estimator=False,
                      random_state=random_state),
            'Feature Bagging':
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)':
                HBOS(contamination=outliers_fraction),
            'Isolation Forest':
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
            'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
            'Average KNN': KNN(method='mean', contamination=outliers_fraction)
        }
        p_prn = []
        p_roc = []
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            try:
                clf.fit(X_train)
                # get the prediction labels and outlier scores of the training data
                y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
                y_train_scores = clf.decision_scores_  # raw outlier scores
                # get the prediction on the test data
                y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
                y_test_scores = clf.decision_function(X_test)  # outlier scores
                # evaluate and print the results
                print(str(count) + " is being analysed")
                print("\nOn Training Data:")
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)
                roc = np.round(roc_auc_score(y_train, y_train_scores), decimals=4)
                prn = np.round(precision_n_scores(y_test, y_test_scores), decimals=4)
                p_prn.append(prn)
                p_roc.append(roc)
            except Exception:
                p_prn.append(-1)
                p_roc.append(-1)
        total_prn.append(p_prn)
        total_roc.append(p_roc)
        count += 1

    total_prn = json.dumps(total_prn)
    total_roc = json.dumps(total_roc)
    a = open(dir_path + "_prn_list.txt", "w", encoding='UTF-8')
    a.write(total_prn)
    a.close()
    a = open(dir_path + "_roc_list.txt", "w", encoding='UTF-8')
    a.write(total_roc)
    a.close()
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, random_state=random_state)

# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)

classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor':
        CBLOF(n_clusters=10, contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),
    'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
class TestFeatureBagging(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'estimators_') and
                self.clf.estimators_ is not None)
        assert (hasattr(self.clf, 'estimators_features_') and
                self.clf.estimators_features_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
# 60% data for training and 40% for testing
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, random_state=random_state)

# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)

if sys.argv[1] == 'abod':
    classifiers = {'Angle-based Outlier Detector (ABOD)':
                       ABOD(contamination=outliers_fraction)}
    classifiers_indices = {'Angle-based Outlier Detector (ABOD)': 0}
elif sys.argv[1] == 'cblof':
    classifiers = {'Cluster-based Local Outlier Factor':
                       CBLOF(contamination=outliers_fraction,
                             check_estimator=False,
                             random_state=random_state)}
    classifiers_indices = {'Cluster-based Local Outlier Factor': 0}
elif sys.argv[1] == 'fb':
    classifiers = {'Feature Bagging':
                       FeatureBagging(contamination=outliers_fraction,
                                      check_estimator=False,
                                      random_state=random_state)}
    classifiers_indices = {'Feature Bagging': 0}
elif sys.argv[1] == 'hbos':
    classifiers = {'Histogram-base Outlier Detection (HBOS)':
                       HBOS(contamination=outliers_fraction)}
    classifiers_indices = {'Histogram-base Outlier Detection (HBOS)': 0}
elif sys.argv[1] == 'iforest':
    classifiers = {'Isolation Forest':
                       IForest(contamination=outliers_fraction,
                               random_state=random_state)}
    classifiers_indices = {'Isolation Forest': 0}
elif sys.argv[1] == 'knn':
    classifiers = {'K Nearest Neighbors (KNN)':
                       KNN(contamination=outliers_fraction)}
    classifiers_indices = {'K Nearest Neighbors (KNN)': 0}
elif sys.argv[1] == 'lof':
    classifiers = {'Local Outlier Factor (LOF)':
                       LOF(contamination=outliers_fraction)}
    classifiers_indices = {'Local Outlier Factor (LOF)': 0}
elif sys.argv[1] == 'mcd':
    classifiers = {'Minimum Covariance Determinant (MCD)':
                       MCD(contamination=outliers_fraction,
                           random_state=random_state)}
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train FeatureBagging detector
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    LOF(n_neighbors=30), LOF(n_neighbors=35), LOF(n_neighbors=40),
    LOF(n_neighbors=45), LOF(n_neighbors=50)
]

random_state = 42
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(check_estimator=False, random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)': HBOS(),
    'Isolation Forest': IForest(random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(),
    'Average KNN': KNN(method='mean'),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)': LOF(n_neighbors=35),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
      'Ground truth shape is {shape}. Outliers are 1 and inliers are 0.\n'.format(
          shape=ground_truth.shape))
print(ground_truth, '\n')

random_state = np.random.RandomState(42)
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction, check_estimator=False,
              random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
def analysis():
    roc_df = pd.DataFrame(columns=df_columns)
    prn_df = pd.DataFrame(columns=df_columns)
    count = 0
    for doc in fileList:
        print(doc)
        df = pd.read_csv(doc, encoding='utf-8')
        # x = df.loc[:, ('V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7')]
        x = df.loc[:, ('R', 'G', 'B')]
        # x = df.iloc[:, 6:57]
        y = df.loc[:, 'original.label']
        roc_list = [count, doc]
        count = count + 1
        roc_mat = np.zeros(6)
        # set the fraction of outliers
        random_state = np.random.RandomState(42)
        outliers_fraction = 0.02
        # define the six outlier detection models used below
        classifiers = {
            "Feature Bagging":
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
            "Isolation Forest":
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
            "KNN": KNN(contamination=outliers_fraction),
            'Local Outlier Factor': LOF(contamination=outliers_fraction),
            'One-class SVM': OCSVM(contamination=outliers_fraction),
            'Principal Component Analysis':
                PCA(contamination=outliers_fraction,
                    random_state=random_state),
        }
        classifiers_indices = {
            'Feature Bagging': 0,
            'Isolation Forest': 1,
            'KNN': 2,
            'Local Outlier Factor': 3,
            'One-class SVM': 4,
            'Principal Component Analysis': 5,
        }
        # 60% data for training and 40% for testing
        X_train, X_test, y_train, y_test = \
            train_test_split(x, y, test_size=0.4, random_state=random_state)
        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            clf.fit(X_train_norm, y_train)
            # predict the outlier scores
            scores_pred = clf.decision_function(X_test_norm)
            try:
                roc = round(roc_auc_score(y_test, scores_pred), ndigits=4)
                roc_mat[classifiers_indices[clf_name]] = roc
            except ValueError:
                continue
        roc_list = roc_list + roc_mat.tolist()
        temp_df = pd.DataFrame(roc_list).transpose()
        temp_df.columns = [
            'Data', 'dir', 'FB', 'IForest', 'Average KNN', 'LOF', 'OCSVM', 'PCA'
        ]
        roc_df = pd.concat([roc_df, temp_df], axis=0)
    roc_df.to_csv("roc.csv", index=False, float_format="%.3f")
mat_file_name = mat_file.replace('.mat', '')
print("\n... Processing", mat_file_name, '...')
mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

X = mat['X']
y = mat['y']
X = StandardScaler().fit_transform(X)

# load the pre-trained model cost predictor
clf = load('rf_predictor.joblib')

classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

clfs = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        size=n_estimators_total)
clfs_real = []

for estimator in clfs:
def bivariate_outliers(df, method, x_col, y_col, outliers_fraction, visualize):
    dfx = df.loc[:, [x_col, y_col]]
    scaler = MinMaxScaler(feature_range=(0, 1))
    dfx.loc[:, [x_col, y_col]] = scaler.fit_transform(dfx.loc[:, [x_col, y_col]])
    X1 = dfx[x_col].values.reshape(-1, 1)
    X2 = dfx[y_col].values.reshape(-1, 1)
    X = np.concatenate((X1, X2), axis=1)
    random_state = np.random.RandomState(42)

    classifiers_name = {
        'IForest': 'Isolation Forest',
        'CBLOF': 'Cluster-based Local Outlier Factor (CBLOF)',
        'ABOD': 'Angle-based Outlier Detector (ABOD)',
        'Feature Bagging': 'Feature Bagging',
        'HBOS': 'Histogram-base Outlier Detection (HBOS)',
        'KNN': 'K Nearest Neighbors (KNN)',
        'AvgKNN': 'Average KNN'
    }

    # Seven outlier detection tools to be used
    classifiers = {
        'Isolation Forest':
            IForest(behaviour='new', contamination=outliers_fraction,
                    random_state=random_state),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction)
    }

    clf = classifiers[classifiers_name[method]]
    clf.fit(X)
    # predict whether each data point is an outlier or an inlier
    y_pred = clf.predict(X)

    if visualize == False:
        df[x_col] = y_pred.tolist()
        return df
    else:
        xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
        # predict raw anomaly scores
        scores_pred = clf.decision_function(X) * -1
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)
        plt.figure(figsize=(16, 8))
        # copy of the dataframe
        dfx['outlier'] = y_pred.tolist()
        # IX1 - inlier feature 1, IX2 - inlier feature 2
        IX1 = np.array(dfx[x_col][dfx['outlier'] == 0]).reshape(-1, 1)
        IX2 = np.array(dfx[y_col][dfx['outlier'] == 0]).reshape(-1, 1)
        # OX1 - outlier feature 1, OX2 - outlier feature 2
        OX1 = dfx[x_col][dfx['outlier'] == 1].values.reshape(-1, 1)
        OX2 = dfx[y_col][dfx['outlier'] == 1].values.reshape(-1, 1)
        print('OUTLIERS: ', n_outliers, ',', 'INLIERS: ', n_inliers, ',',
              'Detection Method:', classifiers_name[method])
        # threshold value to consider a data point an inlier or an outlier
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)
        # the decision function calculates the raw anomaly score for every point
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        # fill the blue colormap from the minimum anomaly score to the threshold value
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
        # draw a red contour line where the anomaly score equals the threshold
        a = plt.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
        # fill orange contour lines where the anomaly score ranges from the threshold to the maximum
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')
        c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')
        plt.axis('tight')
        # loc=2 is used for the top left corner
        plt.legend([a.collections[0], b, c],
                   ['learned decision function', 'inliers', 'outliers'],
                   prop=matplotlib.font_manager.FontProperties(size=16),
                   loc='best')
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.title(method, fontsize=20)
        plt.xlabel(x_col, fontsize=16)
        plt.ylabel(y_col, fontsize=16)
        plt.show()
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train Feature Bagging detector
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
def initialise_pyod_classifiers(self, outlier_fraction):
    # Testing every query against every class and then predicting only if it belongs to the same class
    classifiers = {}
    # Proximity based
    classifiers['K Nearest Neighbors (KNN)'] = []
    classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
    classifiers['Median K Nearest Neighbors (MedKNN)'] = []
    classifiers['Local Outlier Factor (LOF)'] = []
    classifiers['Connectivity-Based Outlier Factor (COF)'] = []
    # classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
    classifiers['LOCI'] = []
    # classifiers['Histogram-based Outlier Score (HBOS)'] = []
    classifiers['Subspace Outlier Detection (SOD)'] = []
    # Linear models
    classifiers['Principal Component Analysis (PCA)'] = []
    # classifiers['Minimum Covariance Determinant (MCD)'] = []  # too slow
    classifiers['One-Class Support Vector Machines (OCSVM)'] = []
    classifiers['Deviation-based Outlier Detection (LMDD)'] = []
    # Probabilistic
    classifiers['Angle-Based Outlier Detection (ABOD)'] = []
    classifiers['Stochastic Outlier Selection (SOS)'] = []
    # Outlier Ensembles
    classifiers['Isolation Forest (IForest)'] = []
    classifiers['Feature Bagging'] = []
    classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

    for i in range(self.k_way):
        classifiers['K Nearest Neighbors (KNN)'].append(
            KNN(method='largest', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
            KNN(method='mean', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Median K Nearest Neighbors (MedKNN)'].append(
            KNN(method='median', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Local Outlier Factor (LOF)'].append(
            LOF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Connectivity-Based Outlier Factor (COF)'].append(
            COF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['LOCI'].append(
            LOCI(contamination=outlier_fraction))
        classifiers['Subspace Outlier Detection (SOD)'].append(
            SOD(n_neighbors=int(self.n_shot / 3) + 2,
                contamination=outlier_fraction,
                ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
        classifiers['Principal Component Analysis (PCA)'].append(
            PCA(contamination=outlier_fraction))
        classifiers['One-Class Support Vector Machines (OCSVM)'].append(
            OCSVM(contamination=outlier_fraction))
        classifiers['Deviation-based Outlier Detection (LMDD)'].append(
            LMDD(contamination=outlier_fraction))
        classifiers['Angle-Based Outlier Detection (ABOD)'].append(
            ABOD(contamination=outlier_fraction))
        classifiers['Stochastic Outlier Selection (SOS)'].append(
            SOS(contamination=outlier_fraction))
        classifiers['Isolation Forest (IForest)'].append(
            IForest(contamination=outlier_fraction))
        classifiers['Feature Bagging'].append(
            FeatureBagging(contamination=outlier_fraction))
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'].append(
            LODA(contamination=outlier_fraction))

    self.num_different_models = len(classifiers)
    return classifiers
class TestFeatureBagging(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)

    # TODO: failed because sklearn uses 2-feature examples.
    # def test_sklearn_estimator(self):
    #     check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_features_') and
                    self.clf.estimators_features_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: the classifier is from sklearn or works like sklearn
    # PY means: the classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can manually add a new metric by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(
        paths=[
            'results/scores/auc/no/results.csv',
            'results/scores/auc/minmax/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        scalers=[
            'Without scaler', 'Min max scaler', 'Standard scaler',
            'Without scaler', 'Min max scaler', 'Standard scaler'
        ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.font_manager as mfm
from sklearn.metrics import accuracy_score, recall_score

# set 10% of the data as outliers
random_state = np.random.RandomState(42)
outliers_fraction = 0.1
classifiers = {
    "FB": FeatureBagging(LOF(n_neighbors=35),
                         contamination=outliers_fraction,
                         check_estimator=False,
                         random_state=random_state),
    "IForest": IForest(contamination=outliers_fraction,
                       random_state=random_state),
    "Average KNN": KNN(contamination=outliers_fraction),
    'LOF': LOF(contamination=outliers_fraction),
    'OCSVM': OCSVM(contamination=outliers_fraction),
    'PCA': PCA(contamination=outliers_fraction, random_state=random_state),
}

# read the roc and original csv files
path = "D:\\BIT\\Course\\sjwj\\homework\\12\\abalone\\skin_roc.csv"
f = open(path, encoding='utf-8')
df = pd.read_csv(f)
dff_orignal = pd.read_csv(
    'D:\\BIT\\Course\\sjwj\\homework\\12\\skin_benchmarks\\skin\\meta_data\\skin.original.csv',
    encoding='utf-8')
x_orignal = dff_orignal.loc[:, ('R', 'G', 'B')]
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
# from pyod.models.cblof import CBLOF
from pyod.models.lof import LOF
from sklearn.utils import *

pd.set_option('display.max_column', 100)

n_clusters = 8
classifiers = {
    'abod': ABOD(n_neighbors=15),
    'knn': KNN(),
    # 'cblof': CBLOF(n_clusters=n_clusters),
    'fg': FeatureBagging(),
    'hbos': HBOS(),
    'if': IForest(),
    'lof': LOF()
}

dict = {
    'csvname': [],
    'roc_abod_train': [],
    'roc_abod_test': [],
    'prn_abod_train': [],
    'prn_abod_test': [],
    'roc_knn_train': [],
    'roc_knn_test': [],
    'prn_knn_train': [],
    'prn_knn_test': [],
    # 'roc_cblof_train': [],
    # 'roc_cblof_test': [],
def plot_out_liers(df, cur_var, target):
    plt.scatter(df[cur_var], df[target])
    plt.show(block=False)
    plt.pause(5)
    plt.close()

    scaler = MinMaxScaler(feature_range=(0, 1))
    df[[cur_var, target]] = scaler.fit_transform(df[[cur_var, target]])
    X1 = df[cur_var].values.reshape(-1, 1)
    X2 = df[target].values.reshape(-1, 1)
    X = np.concatenate((X1, X2), axis=1)
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05

    # Define seven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction)
    }

    xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly scores
        scores_pred = clf.decision_function(X) * -1
        # predict whether each data point is an outlier or an inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)
        plt.figure(figsize=(10, 10))

        # copy of dataframe
        dfx = df
        dfx['outlier'] = y_pred.tolist()
        # IX1 - inlier feature 1, IX2 - inlier feature 2
        IX1 = np.array(dfx[cur_var][dfx['outlier'] == 0]).reshape(-1, 1)
        IX2 = np.array(dfx[target][dfx['outlier'] == 0]).reshape(-1, 1)
        # OX1 - outlier feature 1, OX2 - outlier feature 2
        OX1 = dfx[cur_var][dfx['outlier'] == 1].values.reshape(-1, 1)
        OX2 = dfx[target][dfx['outlier'] == 1].values.reshape(-1, 1)
        print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers, clf_name)

        # threshold value to consider a data point an inlier or an outlier
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)
        # the decision function calculates the raw anomaly score for every point
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)
        # fill the blue colormap from the minimum anomaly score to the threshold value
        plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
        # draw a red contour line where the anomaly score equals the threshold
        a = plt.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
        # fill orange contour lines where the anomaly score ranges from the threshold to the maximum
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
        b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')
        c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')
        plt.axis('tight')
        # loc=2 is used for the top left corner
        plt.legend([a.collections[0], b, c],
                   ['learned decision function', 'inliers', 'outliers'],
                   prop=matplotlib.font_manager.FontProperties(size=20),
                   loc=2)
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.title(clf_name)
        plt.show(block=False)
        plt.pause(5)
        plt.close()
X2 = df['G'].values.reshape(-1, 1)
X = np.concatenate((X1, X2), axis=1)
random_state = np.random.RandomState(42)
outliers_fraction = 0.05

# Define seven outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction, check_estimator=False,
              random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction,
                       check_estimator=False, random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
    'Average KNN': KNN(method='mean', contamination=outliers_fraction)
}

xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
def outlier_detector(self, clustered_data, outliers_fraction=0.05,
                     method='Voting', cluster_number=3):
    random_state = np.random.RandomState(42)
    outliers_df = pd.DataFrame()
    classifiers = {
        # Cluster-based Local Outlier Factor
        'CBLOF': CBLOF(contamination=outliers_fraction, check_estimator=False,
                       random_state=random_state),
        # Feature Bagging
        'FB': FeatureBagging(LOF(n_neighbors=35),
                             contamination=outliers_fraction,
                             check_estimator=False,
                             random_state=random_state),
        # Histogram-base Outlier Detection
        'HBOS': HBOS(contamination=outliers_fraction),
        # Isolation Forest
        'IF': IForest(contamination=outliers_fraction,
                      random_state=random_state),
        # K Nearest Neighbors
        'KNN': KNN(contamination=outliers_fraction)
    }
    detectors_list = []

    for k in range(cluster_number):
        curr_cluster = clustered_data[clustered_data['Cluster'] == k]
        X_train = curr_cluster.drop(['consumer_id', 'Cluster'], axis=1)
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            clf_pred = clf_name + '_Decision'
            clf.fit(X_train)
            if (method == 'Voting'):
                if (clf_name == 'KNN'):
                    # just save KNN for inference
                    detectors_list.append(clf)
            elif (method != 'Voting'):
                if (clf_name == method):
                    detectors_list.append(clf)
            # predict raw anomaly scores
            scores_pred = clf.decision_function(X_train)
            scores_pred_df = pd.DataFrame(list(scores_pred),
                                          columns=[clf_name],
                                          index=curr_cluster.index.copy())
            curr_cluster = pd.concat([curr_cluster, scores_pred_df], axis=1)
            outliers_pred = clf.predict(X_train)
            outliers_pred_df = pd.DataFrame(list(outliers_pred),
                                            columns=[clf_pred],
                                            index=curr_cluster.index.copy())
            curr_cluster = pd.concat([curr_cluster, outliers_pred_df], axis=1)
        outliers_df = outliers_df.append(curr_cluster)

    if (method == 'Voting'):
        outliers_df['Voting'] = outliers_df.filter(regex='Decision').sum(axis=1)
        outliers_df['bad_customer'] = 0
        outliers_df.loc[(outliers_df.Voting > len(classifiers) / 2),
                        'bad_customer'] = 1
    else:
        decision = method + '_Decision'
        outliers_df['bad_customer'] = outliers_df[decision]
    return outliers_df, detectors_list
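# A minimal usage sketch for outlier_detector, assuming `segmenter` is an
# instance of the enclosing class and `clustered_df` is a hypothetical DataFrame
# with 'consumer_id', 'Cluster' and numeric feature columns.
outliers_df, detectors = segmenter.outlier_detector(clustered_df,
                                                    outliers_fraction=0.05,
                                                    method='Voting',
                                                    cluster_number=3)
# 'bad_customer' is 1 when a majority of the detectors flagged the row
print(outliers_df['bad_customer'].value_counts())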