def setUp(self):
    """Build a small synthetic data set and fit the SOS detector on it.

    Stores the generation settings, the four arrays returned by
    ``generate_data`` and the fitted detector (``self.clf``) on the
    test instance.
    """
    # Fixture configuration
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6

    # Fixed random_state keeps the generated data identical across runs
    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )
    self.X_train, self.y_train, self.X_test, self.y_test = generated

    # Detector under test, fitted on the training split only
    self.clf = SOS(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate 5-feature synthetic data and fit the SOS detector on it.

    Stores the generation settings, the four arrays returned by
    ``generate_data`` and the fitted detector (``self.clf``) on the
    test instance.
    """
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    # NOTE: no ``roc_floor`` is defined here; the assignment
    # ``self.roc_floor = 0.8`` is deliberately left disabled.

    # Fixed random_state keeps the generated data identical across runs
    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=5,
        contamination=self.contamination,
        random_state=42,
    )
    self.X_train, self.y_train, self.X_test, self.y_test = generated

    self.clf = SOS(contamination=self.contamination)
    self.clf.fit(self.X_train)
def getOulierSOS(dataset):
    '''
    @brief Runs the SOS algorithm on a dataset and reports, for every
    instance, whether it was labelled an inlier (0) or an outlier (1)
    @param dataset Dataset the detector is fitted on
    @return The ``labels_`` attribute of the fitted detector:
    0 means inlier, 1 means outlier
    '''
    # Detector with its default settings
    detector = SOS()
    # Fitting populates the per-instance labels
    detector.fit(dataset)
    outlier_flags = detector.labels_
    return outlier_flags
def fit_transform(self, df_train, df_corrupted):
    """Detect outliers in ``df_corrupted`` and blank the flagged cells.

    Numerical outliers are detected with an ``SOS`` model
    (``contamination=0.25``) through ``self.num_out_detect``; categorical
    ones through ``self.cat_out_detect``. The two results are inner-joined
    on their index. For every column ``c`` of ``df_corrupted``, each row
    whose ``c + "_outlier"`` flag equals 1 gets ``c`` set to NaN.

    Parameters
    ----------
    df_train : DataFrame passed through to both detection helpers.
    df_corrupted : DataFrame whose columns are inspected for outliers.

    Returns
    -------
    tuple
        ``(df_outliers, self.predictors)`` where ``df_outliers`` is the
        joined frame with the flagged cells replaced by NaN.
    """
    pyod_model = SOS(contamination=0.25)

    df_outliers_num = self.num_out_detect(df_train, df_corrupted, pyod_model)
    df_outliers_cat = self.cat_out_detect(df_train, df_corrupted)
    df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

    for col in df_corrupted.columns:
        # One vectorised mask per column instead of a scalar .loc read and
        # write per cell; this also stays well-defined when index labels
        # repeat.
        flagged = df_outliers[col + "_outlier"] == 1
        df_outliers.loc[flagged, col] = np.nan

    return df_outliers, self.predictors
def choose_model(model, nnet):
    """Return a new PyOD detector selected by its short name.

    Only the requested detector is instantiated. Building every model up
    front would construct two neural networks on each call and would let
    an unsuitable ``nnet`` break requests for unrelated detectors.

    Parameters
    ----------
    model : str
        One of ``'AE'``, ``'VAE'``, ``'ABOD'``, ``'FeatureBagging'``,
        ``'HBOS'``, ``'IForest'``, ``'KNN'``, ``'LOF'``, ``'OCSVM'``,
        ``'PCA'``, ``'SOS'``, ``'COF'``, ``'CBLOF'``, ``'SOD'``,
        ``'LOCI'``, ``'MCD'``.
    nnet : sequence
        Layer sizes for the neural detectors. ``'AE'`` uses the whole
        sequence; ``'VAE'`` uses ``nnet[:5]`` for the encoder and
        ``nnet[4:]`` for the decoder (index 4 is shared by both slices).

    Returns
    -------
    The freshly constructed (unfitted) detector.

    Raises
    ------
    KeyError
        If ``model`` is not one of the names listed above.
    """
    # Every entry is wrapped in a lambda so nothing is built until selected.
    factories = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5],
                           decoder_neurons=nnet[4:],
                           contamination=0.1, epochs=13),
        'ABOD': lambda: ABOD(),
        'FeatureBagging': lambda: FeatureBagging(),
        'HBOS': lambda: HBOS(),
        'IForest': lambda: IForest(),
        'KNN': lambda: KNN(),
        'LOF': lambda: LOF(),
        'OCSVM': lambda: OCSVM(),
        'PCA': lambda: PCA(),
        'SOS': lambda: SOS(),
        'COF': lambda: COF(),
        'CBLOF': lambda: CBLOF(),
        'SOD': lambda: SOD(),
        'LOCI': lambda: LOCI(),
        'MCD': lambda: MCD(),
    }
    return factories[model]()
if __name__ == "__main__":
    # Example: fit an SOS detector on generated data and print its
    # evaluation on the training and the test split.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train SOS detector
    # The label must name the detector that is actually fitted below; it
    # is what evaluate_print receives as the model name.
    clf_name = 'SOS'
    clf = SOS()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
class TestSOS(unittest.TestCase):
    """Exercise the detector API of ``SOS`` on generated data."""

    def setUp(self):
        """Generate reproducible data and fit a fresh detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        # NOTE: ``self.roc_floor = 0.8`` is intentionally left disabled,
        # matching the disabled ROC check in test_prediction_scores.
        generated = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=5,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = SOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    @staticmethod
    def _check_unit_interval(values):
        """Assert that the extremes of *values* lie within [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def _check_rank_order(self, ranks):
        """Assert that *ranks* order the test points like the raw scores."""
        scores = self.clf.decision_function(self.X_test)
        # the ordering must be preserved (up to a small tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)

    def test_parameters(self):
        """Fitting must populate the scores, labels and threshold."""
        for fitted_attr in ('decision_scores_', 'labels_', 'threshold_'):
            assert (hasattr(self.clf, fitted_attr)
                    and getattr(self.clf, fitted_attr) is not None)

    def test_train_scores(self):
        """One training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """One decision score per test sample."""
        test_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # NOTE: the performance check against ``self.roc_floor``
        # (roc_auc_score on the test scores) is disabled.

    def test_prediction_labels(self):
        """Predicted labels have the shape of the true test labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities stay within [0, 1]."""
        self._check_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        """'linear' probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        self._check_unit_interval(proba)

    def test_prediction_proba_unify(self):
        """'unify' probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        self._check_unit_interval(proba)

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        """predict() can also return a per-sample confidence in [0, 1]."""
        labels, confidence = self.clf.predict(self.X_test,
                                              return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        self._check_unit_interval(confidence)

    def test_prediction_proba_linear_confidence(self):
        """predict_proba() can also return a confidence in [0, 1]."""
        proba, confidence = self.clf.predict_proba(self.X_test,
                                                   method='linear',
                                                   return_confidence=True)
        self._check_unit_interval(proba)

        assert_equal(confidence.shape, self.y_test.shape)
        self._check_unit_interval(confidence)

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted, unknown ones are rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score order and stay in (-0.1, n_train + 1)."""
        ranks = self.clf._predict_rank(self.X_test)

        self._check_rank_order(ranks)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order, in (-0.1, 1.01)."""
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        self._check_rank_order(ranks)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    # TODO: fix clone issue
    def test_model_clone(self):
        """Placeholder: cloning the detector is not exercised yet."""
        pass

    def tearDown(self):
        pass
def initialise_pyod_classifiers(self, outlier_fraction):
    """Create the PyOD detectors, grouped by detector family.

    Every family listed below receives ``self.k_way * self.k_way`` fresh
    detectors (one per pass of the two nested ``k_way`` loops), all built
    with ``contamination=outlier_fraction``. Neighbour counts are derived
    from ``self.n_shot``. The number of families is stored in
    ``self.num_different_models``.

    Returns a dict mapping the family's display name to its list of
    detectors.
    """
    # Testing every query to every class and then predicting only if it
    # belongs to the same class

    def neighbours(extra):
        # Neighbourhood size derived from the number of shots
        return int(self.n_shot / 3) + extra

    # (display name, factory) pairs in registration order. CBLOF and HBOS
    # are left out, and so is MCD (too slow).
    families = [
        # Proximity based
        ('K Nearest Neighbors (KNN)',
         lambda: KNN(method='largest', n_neighbors=neighbours(1),
                     contamination=outlier_fraction)),
        ('Average K Nearest Neighbors (AvgKNN)',
         lambda: KNN(method='mean', n_neighbors=neighbours(1),
                     contamination=outlier_fraction)),
        ('Median K Nearest Neighbors (MedKNN)',
         lambda: KNN(method='median', n_neighbors=neighbours(1),
                     contamination=outlier_fraction)),
        ('Local Outlier Factor (LOF)',
         lambda: LOF(n_neighbors=neighbours(1),
                     contamination=outlier_fraction)),
        ('Connectivity-Based Outlier Factor (COF)',
         lambda: COF(n_neighbors=neighbours(1),
                     contamination=outlier_fraction)),
        ('LOCI',
         lambda: LOCI(contamination=outlier_fraction)),
        ('Subspace Outlier Detection (SOD)',
         lambda: SOD(n_neighbors=neighbours(2),
                     contamination=outlier_fraction,
                     ref_set=max(2, int(neighbours(2) / 3)))),
        # Linear models
        ('Principal Component Analysis (PCA)',
         lambda: PCA(contamination=outlier_fraction)),
        ('One-Class Support Vector Machines (OCSVM)',
         lambda: OCSVM(contamination=outlier_fraction)),
        ('Deviation-based Outlier Detection (LMDD)',
         lambda: LMDD(contamination=outlier_fraction)),
        # Probabilistic
        ('Angle-Based Outlier Detection (ABOD)',
         lambda: ABOD(contamination=outlier_fraction)),
        ('Stochastic Outlier Selection (SOS)',
         lambda: SOS(contamination=outlier_fraction)),
        # Outlier ensembles
        ('Isolation Forest (IForest)',
         lambda: IForest(contamination=outlier_fraction)),
        ('Feature Bagging',
         lambda: FeatureBagging(contamination=outlier_fraction)),
        ('Lightweight On-line Detector of Anomalies (LODA)',
         lambda: LODA(contamination=outlier_fraction)),
    ]

    classifiers = {family: [] for family, _ in families}

    # NOTE(review): the two nested k_way loops are kept as they were; they
    # yield k_way * k_way detectors per family. Confirm this is intended.
    for _outer in range(self.k_way):
        for _inner in range(self.k_way):
            for family, build in families:
                classifiers[family].append(build())

    self.num_different_models = len(classifiers)
    return classifiers
class TestSOS(unittest.TestCase):
    """Exercise the detector API of ``SOS`` on generated data."""

    def setUp(self):
        """Generate reproducible data and fit a fresh detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        # NOTE: ``self.roc_floor = 0.8`` is intentionally left disabled,
        # matching the disabled ROC check in test_prediction_scores.
        generated = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=5,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = SOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_proba_bounds(self, **proba_kwargs):
        """Assert predict_proba on the test set stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, **proba_kwargs)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_sklearn_estimator(self):
        # TODO: sklearn check does not support Numba optimization,
        # so check_estimator(self.clf) is not run here.
        pass

    def test_parameters(self):
        """Fitting must populate the scores, labels and threshold."""
        for fitted_attr in ('decision_scores_', 'labels_', 'threshold_'):
            assert_true(getattr(self.clf, fitted_attr, None) is not None)

    def test_train_scores(self):
        """One training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """One decision score per test sample."""
        test_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # NOTE: the performance check against ``self.roc_floor``
        # (roc_auc_score on the test scores) is disabled.

    def test_prediction_labels(self):
        """Predicted labels have the shape of the true test labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities stay within [0, 1]."""
        self._check_proba_bounds()

    def test_prediction_proba_linear(self):
        """'linear' probabilities stay within [0, 1]."""
        self._check_proba_bounds(method='linear')

    def test_prediction_proba_unify(self):
        """'unify' probabilities stay within [0, 1]."""
        self._check_proba_bounds(method='unify')

    def test_prediction_proba_parameter(self):
        """An unknown probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scoring names are accepted, unknown ones are rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score order and stay in (-0.1, n_train + 1)."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering must be preserved (up to a small tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        upper_bound = self.X_train.shape[0] + 1
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order, in (-0.1, 1.01)."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering must be preserved (up to a small tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
f.write("Model: " + modelname + "\n") f.write("Dataset " + str(datasetnumber) + ": " + datasetname + "\n") f.write("Time taken: " + str(time) + " seg.\n") f.write("Accuracy: " + str(accuracy) + "\n") if accuracy!=None: f.write("@scores\n") for score in model.decision_scores_: f.write(str(score) + "\n") f.close() # This is based on executing the script from the folder experiments ROUTE = "../datasets/outlier_ground_truth/" # List of datasets datasets = ["annthyroid.mat", "arrhythmia.mat", "breastw.mat", "cardio.mat", "glass.mat", "ionosphere.mat", "letter.mat", "lympho.mat", "mammography.mat", "mnist.mat", "musk.mat", "optdigits.mat", "pendigits.mat", "pima.mat", "satellite.mat", "satimage-2.mat", "speech.mat", "thyroid.mat", "vertebral.mat", "vowels.mat", "wbc.mat", "wine.mat"] # List of models and names models = [ABOD(), COF(), HBOS(), KNN(), LOF(), MCD(), OCSVM(), PCA(), SOD(), SOS()] names = ["ABOD", "COF", "HBOS", "KNN", "LOF", "MCD", "OCSVM", "PCA", "SOD", "SOS"] accuracies = [] for name, model in zip(names, models): print("\n\n#################################################################") print("MODEL " + name + " " + str(names.index(name)+1) + "/" + str(len(names))) print("#################################################################") acc = [] for dat in datasets: if name=="ABOD" and dat in ["breastw.mat", "letter.mat", "satellite.mat"]: result = None else: print("Computing dataset " + dat + " " + str(datasets.index(dat)+1) + "/" + str(len(datasets))) # Read dataset dataset, labels = readDataset(ROUTE + dat)
if __name__ == "__main__":
    # Example run: fit SOS on generated data and report its evaluation on
    # both splits.

    # Data-generation settings
    contamination = 0.1  # fraction of outliers
    n_train = 200  # size of the training split
    n_test = 100  # size of the test split

    # Reproducible two-feature sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # Fit the SOS detector
    clf_name = 'SOS'
    clf = SOS()
    clf.fit(X_train)

    # Results on the data the detector was fitted on
    y_train_pred = clf.labels_  # 0: inlier, 1: outlier
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Results on the held-out data
    y_test_pred = clf.predict(X_test)  # 0: inlier, 1: outlier
    y_test_scores = clf.decision_function(X_test)  # raw outlier scores

    # Report both splits, training first
    for heading, truth, scores in (
            ("\nOn Training Data:", y_train, y_train_scores),
            ("\nOn Test Data:", y_test, y_test_scores)):
        print(heading)
        evaluate_print(clf_name, truth, scores)
import pandas as pd
from pyod.models.sos import SOS

# Score every row of the iris data with SOS and show the ten rows with the
# highest outlier score.
iris = pd.read_csv("http://bit.ly/iris-csv")

# Everything except the "Name" column is used as a feature
features = iris.drop(columns="Name")
X = features.values

detector = SOS()
detector.fit(X)

# Attach the training scores to the rows they belong to
iris["score"] = detector.decision_scores_

ranked = iris.sort_values("score", ascending=False)
print(ranked.head(10))
def run_all_models(all_array, labels, pca, data_set_name):
    """Fit a fixed set of outlier detectors and record their scores.

    The ``"# img"`` column is split off as the picture identifier, the
    remaining data is passed through ``standardizer`` and, when ``pca`` is
    truthy, through ``IncrementalPCA``. The data is then split with
    ``train_test_split(test_size=0.4)``.

    Every detector is fitted on the training split and scored on both
    splits via ``print_score``. One tuple per detector is appended to the
    ``output_table`` defined outside this function:
    ``(name, all_array.shape, test_result, data_set_name, elapsed_seconds,
    train_result)``.

    Parameters
    ----------
    all_array : DataFrame holding the features plus a ``"# img"`` column.
    labels : object whose ``.get("in")`` yields the ground-truth labels.
    pca : bool
        Whether to transform the features with ``IncrementalPCA``.
    data_set_name : value recorded verbatim in every result tuple.

    Returns
    -------
    None
    """
    picture_name = all_array.get("# img", 1)
    # ``axis`` must be passed by keyword: pandas 2.x rejects it positionally.
    all_array = all_array.drop("# img", axis=1)

    # standardizing data for processing
    all_array = standardizer(all_array)
    y = labels.get("in").to_numpy()

    # Transform BEFORE splitting so the detectors actually see the PCA
    # output. Transforming afterwards only changed the recorded shape and
    # left the train/test splits untouched.
    if pca:
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    (x_train, x_test,
     y_train, y_test,
     picture_train, picture_test) = train_test_split(
        all_array, y, picture_name, test_size=0.4)

    # (reported name, factory) pairs, run in this order. Factories keep
    # each detector from being built before its own timing window starts.
    detectors = (
        ("OCSVM", OCSVM),
        ("Auto-encoder", lambda: AutoEncoder(epochs=30)),
        ("HBOS", HBOS),
        ("SO_GAAL", SO_GAAL),
        ("MO_GAAL", MO_GAAL),
        ("MCD", MCD),
        ("SOS", SOS),
        ("IForest", IForest),
        ("KNN", KNN),
        ("PCA", PCA),
    )

    for model_name, build in detectors:
        print(model_name)
        now = time()

        clf = build()
        clf.fit(x_train)

        test_scores = clf.decision_function(x_test)
        temp = print_score(picture_test, test_scores, y_test)

        train_scores = clf.decision_function(x_train)
        scores_train = print_score(picture_train, train_scores, y_train)

        # Elapsed time covers construction, fitting and both scoring passes
        output_table.append((model_name, all_array.shape, temp,
                             data_set_name, time() - now, scores_train))