def fast_abod_pyod_auc(X_nor, X_test, y_test, n_neighbors, contamination=0.05):
    """Fit a fast ABOD detector on normal data and return ROC-AUC on the test set.

    Parameters
    ----------
    X_nor : DataFrame
        Inlier-only training data; converted to a float ndarray before fitting.
    X_test : array-like
        Test samples to score.
    y_test : array-like
        Ground-truth labels for ``X_test`` (1 = outlier, 0 = inlier).
    n_neighbors : int
        Number of neighbors used by the fast (kNN-approximate) ABOD variant.
    contamination : float, default=0.05
        Expected proportion of outliers; fixes the internal decision threshold.

    Returns
    -------
    float
        ROC-AUC of the raw decision scores against ``y_test``.
    """
    fastABOD = ABOD(n_neighbors=n_neighbors, contamination=contamination,
                    method='fast')
    X_train = X_nor.astype(float).values.copy()
    fastABOD.fit(X_train)
    scoreTable = fastABOD.decision_function(X_test)
    # ABOD can yield NaN scores for degenerate neighborhoods; roc_auc_score
    # requires finite values, so sanitize before scoring.
    scoreTable = np.nan_to_num(scoreTable, copy=True)
    return roc_auc_score(y_test, scoreTable)
def getOutliers(df):
    """Run ABOD outlier detection on ``df``.

    Parameters
    ----------
    df : DataFrame or ndarray
        Data to fit and label.

    Returns
    -------
    list of (int, list)
        One tuple per classifier: (number of detected outliers, full binary
        label list where 1 marks an outlier).
    """
    from pyod.models.abod import ABOD

    outlier_fraction = 0.1
    outlierlist = []
    # NOTE: the original built a dict containing both ABOD and KNN and then
    # immediately overwrote it with an ABOD-only dict; the dead first
    # assignment (and the then-unused KNN import) has been removed.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outlier_fraction),
    }
    print("outlier detection")
    for clf_name, clf in classifiers.items():
        print(clf_name)
        clf.fit(df)
        y_pred = clf.predict(df)  # pyod convention: 1 = outlier, 0 = inlier
        n_outliers = np.count_nonzero(y_pred == 1)
        print(clf_name, n_outliers)
        outlierlist.append((n_outliers, y_pred.tolist()))
    return outlierlist
def __init__(self, inliers, outliers):
    """Set up the stream with its inlier/outlier partitions and an ABOD model."""
    # Keep both partitions and their concatenation for later access.
    self.inliers = inliers
    self.outliers = outliers
    self.data_total = np.concatenate((inliers, outliers), axis=0)
    # Initialize the streaming base class with the same partitions.
    OutlierStream.__init__(self, inliers, outliers)
    # ABOD detector; contamination fixes the labelling threshold.
    self.model = ABOD(n_neighbors=20, contamination=0.2)
def setUp(self):
    """Generate a small synthetic dataset and fit a fast ABOD detector."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)
    self.clf = ABOD(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate a small synthetic dataset and fit an exact ('default') ABOD."""
    self.n_train = 50
    self.n_test = 50
    self.contamination = 0.2
    self.roc_floor = 0.8
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)
    self.clf = ABOD(contamination=self.contamination, method='default')
    self.clf.fit(self.X_train)
def getOutlierABOD(dataset):
    '''
    @brief Run the ABOD algorithm on the dataset and obtain per-instance
    labels marking each point as inlier (0) or outlier (1)
    @param dataset Dataset on which to run the algorithm
    @return List of labels: 0 means inlier, 1 means outlier
    '''
    detector = ABOD()
    detector.fit(dataset)
    return detector.labels_
def define_classifiers(random_state, outliers_fraction):
    """Build the dictionary of outlier detectors to compare.

    Each entry maps a human-readable name to a configured PyOD detector;
    detectors with stochastic behaviour receive the shared random_state.
    """
    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=outliers_fraction)
    classifiers['Cluster-based Local Outlier Factor'] = CBLOF(
        contamination=outliers_fraction, check_estimator=False,
        random_state=random_state)
    classifiers['Feature Bagging'] = FeatureBagging(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=outliers_fraction)
    classifiers['Isolation Forest'] = IForest(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['K Nearest Neighbors (KNN)'] = KNN(
        contamination=outliers_fraction)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        contamination=outliers_fraction)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(
        contamination=outliers_fraction)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=outliers_fraction, random_state=random_state)
    return classifiers
def fit(self, X, y=None):
    """
    Fit the ABOD outlier detector on the given data, then transform it.

    Parameters
    ----------
    X (DataFrame) : training data
    y (DataFrame, default=None) : target values (if needed)

    Returns
    -------
    (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the
        first being the X data and the second being the y data
    """
    # Forward any stored keyword arguments to the underlying detector.
    detector = ABOD(**self.kwargs)
    self.fitted = detector.fit(X)
    return self.transform(X, y=y)
def load_classifiers(outliers_fraction):
    """Return the benchmark dictionary of configured PyOD detectors.

    The contamination rate is capped at 0.5 (PyOD's upper bound) and a fixed
    RandomState keeps stochastic detectors reproducible.
    """
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)

    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state,
                    behaviour="new"),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
    }
    return classifiers
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Candidate detectors considered during experimentation; only the
    # Isolation Forest instantiated below is actually used.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised '
        'Representation Learning':
            XGBOD(contamination=outliers_fraction),
    }

    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    # pyod convention: 1 means outlier, 0 means inlier
    # (sklearn uses -1 for outliers and 1 for inliers)
    y_pred = clf.predict(x_raw)

    # BUG FIX: the index range was hard-coded to 1212 (the size of one
    # particular dataset); derive it from the prediction length instead so the
    # function works for any input size.
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)

    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def __init__(self, nth_layer=18, nn_name='vgg', detector_name='svm',
             pool=None, pca_n_components=None):
    """
    Extract features with a pretrained neural network, train a novelty
    detector on normal samples, then predict on new data.

    nn_name: 'Xception', 'ResNet'(Default), 'InceptionV3',
        'InceptionResNetV2', 'MobileNet', 'MobileNetV2', 'DenseNet', 'NASNet'
    detector_name: 'RobustCovariance', 'IsolationForest',
        'LocalOutlierFactor', 'ABOD', 'kNN'(Default), 'svm'
    """
    self.nth_layer = nth_layer
    self.nn_name = nn_name
    self.pool = pool
    self.pca_n_components = pca_n_components
    self.input_shape = None
    self.pretrained_nn = None
    self.extracting_model = None
    K.clear_session()

    detector_name_lower = detector_name.lower()
    if detector_name_lower == 'robustcovariance':
        self.detector_name = 'rc'
        from sklearn.covariance import EllipticEnvelope
        self.clf = EllipticEnvelope()
        print('Novelty Detector: Robust covariance')
    elif detector_name_lower in ['localoutlierfactor', 'lof']:
        self.detector_name = 'lof'
        from sklearn.neighbors import LocalOutlierFactor
        self.clf = LocalOutlierFactor(novelty=True)
        print('Novelty Detector: Local Outlier Factor')
    elif detector_name_lower in ['abod', 'fastabod',
                                 'anglebasedoutlierdetection']:
        self.detector_name = 'abod'
        from pyod.models.abod import ABOD
        self.clf = ABOD()
        print('Novelty Detector: Angle Based Outlier Detection')
    elif detector_name_lower in ['iforest', 'isolationforest']:
        self.detector_name = 'iforest'
        from sklearn.ensemble import IsolationForest
        self.clf = IsolationForest()
        print('Novelty Detector: Isolation Forest')
    elif detector_name_lower in ['knn', 'median_knn']:
        self.detector_name = 'median_kNN'
        from pyod.models.knn import KNN
        self.clf = KNN(method='median', contamination=0.1)
        print('Novelty Detector: Median K Nearest Neighbors')
    elif detector_name_lower == 'svm':
        # BUG FIX: this branch previously never set self.detector_name,
        # leaving the attribute undefined for the default detector.
        self.detector_name = 'svm'
        from sklearn.svm import OneClassSVM
        self.clf = OneClassSVM(gamma='scale')
        print('SVM')
    else:
        # BUG FIX: the original printed self.detector_name_lower, an
        # attribute that was never assigned, so an unknown detector raised
        # AttributeError instead of the intended ValueError.
        raise ValueError(
            'Unknown detector_name: {!r}'.format(detector_name))
def __init__(self,
             *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    """Construct the primitive and its underlying ABOD detector.

    BUG FIX: ``random_seed`` was commented out of the signature while still
    being forwarded to ``super().__init__`` below, which made every call
    fail with a NameError. It is restored as a keyword-only parameter with
    the conventional default of 0, so existing callers are unaffected.
    """
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)
    # Configure ABOD entirely from the supplied hyperparameters.
    self._clf = ABOD(
        contamination=hyperparams['contamination'],
        n_neighbors=hyperparams['n_neighbors'],
        method=hyperparams['method'],
    )
def test_abod(self):
    """Smoke-test ABOD: fit, score, predict, and check AUC beats chance."""
    clf = ABOD(contamination=0.05)
    clf.fit(self.X_train)
    # BUG FIX: the fitted attribute uses the sklearn-style trailing
    # underscore (decision_scores_), matching its use in the other test
    # classes in this file; 'decision_scores' raises AttributeError on
    # current PyOD versions.
    assert_equal(len(clf.decision_scores_), self.X_train.shape[0])
    # Invert the scores so larger values mean "more outlying" for the AUC.
    pred_scores = clf.decision_function(self.X_test) * -1
    assert_equal(pred_scores.shape[0], self.X_test.shape[0])
    assert_equal(clf.predict(self.X_test).shape[0], self.X_test.shape[0])
    assert_greater(roc_auc_score(self.y_test, pred_scores), 0.5)
def out_lier_score(df, target, num_var):
    """Scale the numeric columns of df, run seven PyOD detectors on them,
    and return a DataFrame of per-detector binary outlier labels
    (one column per detector, one row per sample)."""
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df.loc[:, num_var], df[target])

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05
    X = df
    df_out_score = []

    # Seven outlier detection tools to be compared.
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
    }

    for clf_name, clf in classifiers.items():
        clf.fit(X)
        # Raw anomaly scores (computed as in the original; not returned).
        scores_pred = clf.decision_function(X) * -1
        # Binary prediction: outlier (1) or inlier (0).
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
def create_tunable_ensemble(knn_neighbors, lof_neighbors, abod_neighbors):
    """Build ensemble model specs over the cross product of the supplied
    KNN/LOF/ABOD neighbor counts.

    Each spec aggregates KNN, LOF, ABOD (with the given n_neighbors) and a
    default OCSVM via score averaging.
    """
    models = []
    for k in knn_neighbors:
        for lo in lof_neighbors:
            for a in abod_neighbors:
                spec = {
                    "model": SimpleDetectorAggregator,
                    "supervised": False,
                    "parameters": {
                        "method": "average",
                        "base_estimators": [
                            KNN(n_neighbors=k),
                            LOF(n_neighbors=lo),
                            ABOD(n_neighbors=a),
                            OCSVM(),
                        ],
                    },
                }
                models.append(spec)
    return models
def choose_model(model, nnet):
    """Look up a PyOD detector by short name (among those implemented in PyOD).

    ``nnet`` supplies the layer sizes for the neural models (AE/VAE); all
    detectors are instantiated eagerly, exactly as in a literal lookup table.
    """
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD(),
    }
    return clfs[model]
def create_ensemble_models(knn_methods, ensemble_combinations, pca):
    """Build ensemble model specs over every combination of aggregation
    method, KNN/LOF neighbor counts (1..pca), and KNN scoring method.

    Each spec aggregates a KNN, a LOF, a default ABOD and a default OCSVM.
    """
    specs = []
    for combination in ensemble_combinations:
        for knn_k in range(1, pca + 1):
            for lof_k in range(1, pca + 1):
                for knn_method in knn_methods:
                    specs.append({
                        "model": SimpleDetectorAggregator,
                        "supervised": False,
                        "parameters": {
                            "method": combination,
                            "base_estimators": [
                                KNN(n_neighbors=knn_k, method=knn_method),
                                LOF(n_neighbors=lof_k),
                                ABOD(),
                                OCSVM(),
                            ],
                        },
                    })
    return specs
clf.fit(X_train, y_train) test_pred = clf.predict(X_test) test_pred[test_pred < 0] = 0 r2.append([r2_score(y_test, test_pred)]) mse.append([mean_squared_error(y_test, test_pred)]) pearson.append(pearsonr(y_test, test_pred)[0]) spearman.append(spearmanr(y_test, test_pred)[0]) print('Spearman Rank', np.mean(spearman)) clf.fit(X, y) if save_to_local: # save to the local dump(clf, os.path.join("saved_models", output_file)) if __name__ == "__main__": # this should be only executed if the pre-trained model is missing. build_cost_predictor(file_name=os.path.join('saved_models', 'summary_train.txt'), output_file="bps_train.joblib", save_to_local=False) build_cost_predictor(file_name=os.path.join('saved_models', 'summary_prediction.txt'), output_file="bps_prediction.joblib", save_to_local=False) clf = ABOD()
class TestFastABOD(unittest.TestCase):
    """Tests for ABOD in its default fast (kNN-approximate) mode."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = ABOD(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated after fit().
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        # The fast variant builds a neighbour-search tree during fit.
        assert_true(hasattr(self.clf, 'tree_') and
                    self.clf.tree_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # One score per test sample.
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # Performance must clear the configured floor.
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
if __name__ == "__main__":
    contamination = 0.1  # fraction of outliers in the synthetic data
    n_train = 200  # training points
    n_test = 100  # testing points

    # Build a 2-D synthetic dataset with ground-truth outlier labels.
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # Fit the ABOD detector on the training split.
    clf_name = 'ABOD'
    clf = ABOD()
    clf.fit(X_train)

    # Training results: binary labels (0: inlier, 1: outlier) and raw scores.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Test-set predictions: labels and outlier scores.
    y_test_pred = clf.predict(X_test)
    y_test_scores = clf.decision_function(X_test)

    # Evaluate and print the results on both splits.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
# df.plot.scatter('original.label', 'R') scaler = MinMaxScaler(feature_range=(0, 1)) df[['R', 'G']] = scaler.fit_transform(df[['R', 'G']]) df[['R', 'G']].head() X1 = df['R'].values.reshape(-1, 1) X2 = df['G'].values.reshape(-1, 1) X = np.concatenate((X1, X2), axis=1) random_state = np.random.RandomState(42) outliers_fraction = 0.05 # Define seven outlier detection tools to be compared classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
class TestABOD(unittest.TestCase):
    """Unit tests for the exact ('default') ABOD detector."""

    def setUp(self):
        self.n_train = 50
        self.n_test = 50
        self.contamination = 0.2
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = ABOD(contamination=self.contamination, method='default')
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # BUG FIX: the original called
        #     self.assertRaises(AttributeError, '<message>')
        # inside "attribute missing" branches; assertRaises treats its second
        # argument as a callable, so a missing attribute produced a TypeError
        # rather than a test failure. Assert the fitted attributes directly.
        self.assertTrue(hasattr(self.clf, 'decision_scores_'))
        self.assertIsNotNone(self.clf.decision_scores_)
        self.assertTrue(hasattr(self.clf, 'labels_'))
        self.assertIsNotNone(self.clf.labels_)
        self.assertTrue(hasattr(self.clf, 'threshold_'))
        self.assertIsNotNone(self.clf.threshold_)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # One score per test sample.
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # Performance must clear the configured floor.
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # The rank order must be preserved relative to the raw scores.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3.5)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # Normalized ranks stay within [0, 1] and preserve order.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3.5)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
# Load the cardio benchmark, standardize its features, and draw a random
# pool of base detectors for the ensemble experiment.
mat_file = 'cardio.mat'
mat_file_name = mat_file.replace('.mat', '')
print("\n... Processing", mat_file_name, '...')
mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

X = mat['X']
y = mat['y']
X = StandardScaler().fit_transform(X)

# Load the pre-trained model cost predictor.
clf = load('rf_predictor.joblib')

# Catalogue of base detectors, keyed by an integer id.
classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

# Sample detector ids (with replacement) for the ensemble.
clfs = np.random.choice(list(range(1, 12)), size=n_estimators_total)
clfs_real = []
ground_truth = np.zeros(n_samples, dtype=int) ground_truth[-n_outliers:] = 1 # Show the statics of the data print('Number of inliers: %i' % n_inliers) print('Number of outliers: %i' % n_outliers) print( 'Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format( shape=ground_truth.shape)) print(ground_truth) random_state = np.random.RandomState(42) # Define nine outlier detection tools to be compared classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(n_neighbors=10, contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
def fun(dir_path):
    """Run a suite of PyOD detectors over every benchmark CSV found under
    ``./<dir_path>/benchmarks`` and write per-file ROC / precision@n scores
    as JSON to ``<dir_path>_roc_list.txt`` / ``<dir_path>_prn_list.txt``.

    A failing detector contributes the sentinel -1 for both metrics.
    """
    file_list = []
    total_roc = []
    total_prn = []
    count = 0
    for home, dirs, files in os.walk("./" + dir_path + "/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            # BUG FIX: the original line ended with a stray 'cb' token
            # ("file_list.append(fullname)cb"), a syntax error.
            file_list.append(fullname)

    for file_csv in file_list:
        df = pd.read_csv(file_csv)
        # Drop identifier columns and map the label to binary.
        data = df.drop(columns=['point.id', 'motherset', 'origin'])
        class_mapping = {"anomaly": 1, "nominal": 0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)
        y = data['ground.truth']
        x = data.drop('ground.truth', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05
        # Seven outlier detection tools to be compared.
        classifiers = {
            'Angle-based Outlier Detector (ABOD)':
                ABOD(contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor (CBLOF)':
                CBLOF(contamination=outliers_fraction,
                      check_estimator=False,
                      random_state=random_state),
            'Feature Bagging':
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)':
                HBOS(contamination=outliers_fraction),
            'Isolation Forest':
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
            'K Nearest Neighbors (KNN)':
                KNN(contamination=outliers_fraction),
            'Average KNN':
                KNN(method='mean', contamination=outliers_fraction),
        }

        p_prn = []
        p_roc = []
        for clf_name, clf in classifiers.items():
            try:
                clf.fit(X_train)
                y_train_scores = clf.decision_scores_  # raw outlier scores
                y_test_scores = clf.decision_function(X_test)

                print(str(count) + "is analysing")
                print("\nOn Training Data:")
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)

                roc = np.round(roc_auc_score(y_train, y_train_scores),
                               decimals=4)
                prn = np.round(precision_n_scores(y_test, y_test_scores),
                               decimals=4)
                p_prn.append(prn)
                p_roc.append(roc)
            except Exception:
                # Best effort: record a sentinel and move on to the next
                # detector (narrowed from the original bare except).
                p_prn.append(-1)
                p_roc.append(-1)
        total_prn.append(p_prn)
        total_roc.append(p_roc)
        count += 1

    # BUG FIX: use context managers so the output files are always closed.
    with open(dir_path + "_prn_list.txt", "w", encoding='UTF-8') as out:
        out.write(json.dumps(total_prn))
    with open(dir_path + "_roc_list.txt", "w", encoding='UTF-8') as out:
        out.write(json.dumps(total_roc))
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.data import visualize

if __name__ == "__main__":
    contamination = 0.1  # fraction of outliers in the synthetic data
    n_train = 200  # training points
    n_test = 100  # testing points

    # Build a synthetic dataset with ground-truth outlier labels.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # Fit the ABOD detector on the training split.
    clf_name = 'ABOD'
    clf = ABOD()
    clf.fit(X_train)

    # Training results: binary labels (0: inlier, 1: outlier) and raw scores.
    y_train_pred = clf.labels_
    y_train_scores = clf.decision_scores_

    # Test-set predictions: labels and outlier scores.
    y_test_pred = clf.predict(X_test)
    y_test_scores = clf.decision_function(X_test)

    # Evaluate and print the results on both splits.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
roc_mat = np.zeros([n_ite, n_classifiers]) prn_mat = np.zeros([n_ite, n_classifiers]) time_mat = np.zeros([n_ite, n_classifiers]) # Apagar o número 0 sendo passado como parâmetro do RandomState random_state = np.random.RandomState(0) # 60% data for training and 40% for testing X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.4, random_state=random_state) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) if sys.argv[1] == 'abod': classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction)} classifiers_indices = { 'Angle-based Outlier Detector (ABOD)': 0} elif sys.argv[1] == 'cblof': classifiers = {'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state)} classifiers_indices = {'Cluster-based Local Outlier Factor': 0} elif sys.argv[1] == 'fb': classifiers = {'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state)} classifiers_indices = {'Feature Bagging': 0} elif sys.argv[1] == 'hbos': classifiers = {'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction)} classifiers_indices = {'Histogram-base Outlier Detection (HBOS)': 0} elif sys.argv[1] == 'iforest': classifiers = {'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state)} classifiers_indices = {'Isolation Forest': 0} elif sys.argv[1] == 'knn':
def get_estimators(contamination):
    """Internal method to create the fixed pool of 600 base outlier detectors.

    The pool is a fixed (not random) sequence of LOF, ABOD, HBOS, PCA, KNN,
    IForest, OCSVM and MCD instances, all sharing the same ``contamination``
    setting.  The composition and ordering of the original hand-written
    literal is preserved exactly; repeated configurations (e.g. three
    ``KNN(n_neighbors=85)`` entries per KNN run) are intentional and kept.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    Returns
    -------
    base_detectors : list
        A list of 600 initialized base outlier detectors.
    """
    # Hyper-parameter grids shared by the repeated segments below.
    # Tuples (immutable) so they are safe as default arguments.
    lof_grid = (5, 10, 15, 25, 35, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90,
                95, 100)
    abod_grid = (5, 10, 15, 20, 25, 30, 35, 40)
    knn_grid = (5, 15, 25, 35, 45, 50, 55, 65, 75, 85, 85, 85, 95, 100)
    iforest_grid = (50, 100, 150, 200, 50, 100, 150, 200)

    # Small factories: each call creates *fresh* detector instances, exactly
    # as the original literal did (no instance is shared between entries).
    def lofs(grid=lof_grid):
        return [LOF(n_neighbors=k, contamination=contamination) for k in grid]

    def abods(grid=abod_grid):
        return [ABOD(n_neighbors=k, contamination=contamination) for k in grid]

    def knns():
        return [KNN(n_neighbors=k, contamination=contamination)
                for k in knn_grid]

    def iforests(grid=iforest_grid):
        return [IForest(n_estimators=n, contamination=contamination)
                for n in grid]

    def hboses(count=20):
        return [HBOS(contamination=contamination) for _ in range(count)]

    def pcas(count=10):
        return [PCA(contamination=contamination) for _ in range(count)]

    base_detectors = []
    # Segment 1 (92 detectors) — note the first PCA run has 8 entries, not 10.
    base_detectors += (lofs() + abods() + lofs() + hboses() + pcas(8)
                       + knns() + iforests())
    # Segments 2 and 3 are identical (103 detectors each).
    for _ in range(2):
        base_detectors += (lofs() + lofs() + lofs() + hboses() + pcas()
                           + knns() + iforests())
    # Segment 4 (121 detectors) — includes an irregular 10-entry IForest run.
    base_detectors += (lofs() + lofs() + lofs() + hboses() + abods()
                       + iforests((50, 100, 150, 200, 50, 100, 150, 200,
                                   150, 200))
                       + pcas() + knns() + iforests())
    # Segment 5 (128 detectors).
    base_detectors += (lofs() + abods() + lofs() + lofs() + hboses()
                       + pcas() + knns() + iforests() + lofs())
    # Segment 6 (53 detectors) — the only OCSVM/MCD entries, a 9-entry ABOD
    # run (grid extended with 45), and a short LOF tail.
    base_detectors += (abods(abod_grid + (45,))
                       + [OCSVM(contamination=contamination)
                          for _ in range(10)]
                       + [MCD(contamination=contamination)
                          for _ in range(20)]
                       + lofs((75, 80, 85, 90, 95, 100))
                       + abods())
    # Totals: 92 + 103 + 103 + 121 + 128 + 53 == 600.
    return base_detectors
def pyod_anomaly_detection(type, contamination):
    """Train the detector named by *type*, then evaluate and visualize it.

    Parameters
    ----------
    type : str
        One of 'MAD', 'ABOD', 'AutoEncoder' — selects which PyOD model
        to train.  (NOTE(review): the parameter shadows the builtin
        ``type``; renaming would change the keyword interface, so it is
        left as-is.)
    contamination : float
        Proportion of outliers, forwarded to data generation and (for
        AutoEncoder) to the model itself.
    """
    # NOTE(review): data() is a helper defined elsewhere in this file —
    # presumably returns labelled train/test splits; verify against its
    # definition.
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
        # visualize the results
        # making dimensions = 2 for visualising purpose only. By repeating
        # same data each dimension.
        visualize(clf_name, np.hstack((X_train, X_train)), y_train,
                  np.hstack((X_test, X_test)), y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
        # visualize the results (data is already 2-D here, no stacking needed)
        visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)
        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        # NOTE(review): unlike the other branches, this one has no visualize
        # call — the snippet may be truncated here; confirm before relying
        # on this branch.
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30), LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45), LOF(n_neighbors=50) ] random_state = 42 # Define nine outlier detection tools to be compared classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(), 'Isolation Forest': IForest(random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(), 'Average KNN': KNN(method='mean'), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), 'Local Outlier Factor (LOF)':
from scipy import stats import matplotlib.pyplot as plt import matplotlib.font_manager from pyod.models.abod import ABOD from pyod.models.knn import KNN from pyod.models.feature_bagging import FeatureBagging from pyod.models.hbos import HBOS from pyod.models.iforest import IForest # from pyod.models.cblof import CBLOF from pyod.models.lof import LOF from sklearn.utils import * pd.set_option('display.max_column',100) n_clusters=8 classifiers={ 'abod':ABOD(n_neighbors=15), 'knn':KNN(), # 'cblof':CBLOF(n_clusters=n_clusters), 'fg':FeatureBagging(), 'hbos':HBOS(), 'if':IForest(), 'lof':LOF() } dict={'csvname':[], 'roc_abod_train':[], 'roc_abod_test':[], 'prn_abod_train':[], 'prn_abod_test':[], 'roc_knn_train':[], 'roc_knn_test':[], 'prn_knn_train':[],