def setUp(self):
    """Build a small synthetic dataset and a fitted LOF detector for the tests."""
    # sizes of the synthetic train / test splits
    self.n_train = 100
    self.n_test = 50
    # fraction of outliers injected by generate_data
    self.contamination = 0.1
    # minimum ROC-AUC the detector is expected to reach in later assertions
    self.roc_floor = 0.6
    # random_state pins the data so every test run sees the same samples
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = LOF(contamination=self.contamination)
    self.clf.fit(self.X_train)
# FIX: LOF is used below but was never imported in this snippet.
from pyod.models.lof import LOF
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.data import visualize

if __name__ == "__main__":
    # Synthetic-benchmark settings.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate a labeled synthetic dataset.
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # train LOF detector
    clf_name = 'LOF'
    clf = LOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
for doc in csvlist: print(doc) df = pd.read_csv(doc,encoding='utf-8') #x =df.loc[:,('V1','V2','V3','V4','V5','V6','V7')] x = df.loc[:, ('R', 'G', 'B')] #x=df.iloc[:,6:57] y=df.loc[:,'original.label'] roc_list=[count,doc] count=count+1 roc_mat = np.zeros(6) # 设置 5%的离群点数据 random_state = np.random.RandomState(42) outliers_fraction = 0.02 # 定义6个后续会使用的离群点检测模型 classifiers = { "Feature Bagging": FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False, random_state=random_state), "Isolation Forest": IForest(contamination=outliers_fraction, random_state=random_state), "Average KNN": KNN(contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF( contamination=outliers_fraction), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA( contamination=outliers_fraction, random_state=random_state), } classifiers_indices = { 'Feature Bagging': 0, 'Isolation Forest': 1, "Average KNN":2, 'Local Outlier Factor (LOF)':3, 'One-class SVM (OCSVM)':4,
clusters_separation = [0] # Compare given classifiers under given settings xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100)) n_inliers = int((1. - outliers_fraction) * n_samples) n_outliers = int(outliers_fraction * n_samples) ground_truth = np.zeros(n_samples, dtype=int) ground_truth[-n_outliers:] = 1 random_state = np.random.RandomState(42) # Define nine outlier detection tools to be compared classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(n_neighbors=10, contamination=outliers_fraction), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN( contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD( contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA(
X1 = df['SF Ratio'].values.reshape(-1,1) X2 = df['PD Ratio'].values.reshape(-1,1) X3 = df['AE Ratio'].values.reshape(-1,1) X4 = df['SAE Ratio'].values.reshape(-1,1) X5 = df['discontinued patients Ratio'].values.reshape(-1,1) X = np.concatenate((X1,X2,X3,X4,X5),axis=1) random_state = np.random.RandomState(42) outliers_fraction = 0.05 # Define seven outlier detection tools to be compared classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state), 'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean',contamination=outliers_fraction) } xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200)) total=[] for i, (clf_name, clf) in enumerate(classifiers.items()): clf.fit(X) # predict raw anomaly score scores_pred = clf.decision_function(X) * -1
toeplitz_roc = [] toeplitz_prn = [] toeplitz_time = [] pca_roc = [] pca_prn = [] pca_time = [] rp_roc = [] rp_prn = [] rp_time = [] for j in range(n_iter): start = time.time() clf = LOF() # change this to other detection algorithms clf.fit(X) y_train_scores = clf.decision_scores_ original_time.append(time.time() - start) original_roc.append(roc_auc_score(y, y_train_scores)) original_prn.append(precision_n_scores(y, y_train_scores)) X_transformed, _ = jl_fit_transform(X, dim_new, "basic") start = time.time() clf.fit(X_transformed) y_train_scores = clf.decision_scores_ basic_time.append(time.time() - start) basic_roc.append(roc_auc_score(y, y_train_scores)) basic_prn.append(precision_n_scores(y, y_train_scores)) X_transformed, _ = jl_fit_transform(X, dim_new, "discrete")
y_train = np.append(np.zeros(100 + number_outliers_1), np.ones(100 + number_outliers_2)).astype(int) class_1_test = np.random.normal(mean_class_1, var_class_1, (samples_class_1, 2)) class_2_test = np.random.normal(mean_class_2, var_class_2, (samples_class_2, 2)) x_test = np.concatenate((class_1_test, class_2_test)) y_test = np.append(np.zeros(samples_class_1), np.ones(samples_class_2)).astype(int) acc_normal = accuracy_test( x_train, y_train, x_test, y_test, outlierScoreAlgorithm=LOF(contamination=0.23)) results_normal.append(acc_normal) normal_class_1 = np.random.normal(mean_class_1[1], var_class_1[1], samples_class_1_outliers) skew_class_1 = skewnorm.rvs(alpha_1, size=samples_class_1_outliers, scale=omega_1, loc=mean_class_1[0]) class_1_outliers_test = np.array([ np.array([x, y]) for x, y in zip(skew_class_1, normal_class_1) ]) normal_class_2 = np.random.normal(mean_class_2[1], var_class_2[1], samples_class_2_outliers) skew_class_2 = skewnorm.rvs(alpha_2, size=samples_class_2_outliers,
'OCSVM', 'LOF', 'CBLOF', 'HBOS', 'KNN', 'ABOD', 'COPOD'), index=0) # train the COPOD detector if model == 'IForest': clf = IForest() elif model == 'FeatureBagging': clf = FeatureBagging() elif model == 'PCA': clf = PCA() elif model == 'MCD': clf = MCD() elif model == 'OCSVM': clf = OCSVM() elif model == 'LOF': clf = LOF() elif model == 'CBLOF': clf = CBLOF() elif model == 'HBOS': clf = HBOS() elif model == 'KNN': clf = KNN() elif model == 'ABOD': clf = ABOD() else: clf = COPOD() # fit the model clf.fit(X) # get outlier scores scores = clf.decision_scores_ # raw outlier scores
def pyod_init(model, n_features=None):
    """Instantiate the PyOD detector selected by *model*.

    Each backend is imported lazily inside its branch so only the chosen
    model's dependencies are loaded.  Unrecognized names — and
    'auto_encoder' when ``n_features`` is not supplied — fall back to
    ``PyODDefaultModel``.
    """
    if model == 'abod':
        from pyod.models.abod import ABOD
        return ABOD()
    if model == 'auto_encoder' and n_features:
        from pyod.models.auto_encoder import AutoEncoder
        # Symmetric architecture: wide (5x) hidden layers around the
        # input dimensionality.
        return AutoEncoder(
            hidden_neurons=[n_features, n_features * 5,
                            n_features * 5, n_features],
            epochs=5, batch_size=64, preprocessing=False)
    if model == 'cblof':
        from pyod.models.cblof import CBLOF
        return CBLOF(n_clusters=4)
    if model == 'hbos':
        from pyod.models.hbos import HBOS
        return HBOS()
    if model == 'iforest':
        from pyod.models.iforest import IForest
        return IForest()
    if model == 'knn':
        from pyod.models.knn import KNN
        return KNN()
    if model == 'lmdd':
        from pyod.models.lmdd import LMDD
        return LMDD()
    if model == 'loci':
        from pyod.models.loci import LOCI
        return LOCI()
    if model == 'loda':
        from pyod.models.loda import LODA
        return LODA()
    if model == 'lof':
        from pyod.models.lof import LOF
        return LOF()
    if model == 'mcd':
        from pyod.models.mcd import MCD
        return MCD()
    if model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        return OCSVM()
    if model == 'pca':
        from pyod.models.pca import PCA
        return PCA()
    if model == 'sod':
        from pyod.models.sod import SOD
        return SOD()
    if model == 'vae':
        from pyod.models.vae import VAE
        return VAE()
    if model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        return XGBOD()
    # Deliberate best-effort fallback instead of raising on unknown names.
    return PyODDefaultModel()
'AutoEncoder' ] models = { 'BRM': BRM(), 'GM': GaussianMixture(), 'IF': IsolationForest(), 'OCSVM': OneClassSVM(), 'EE': EllipticEnvelope(), 'AvgKNN': KNN(method='mean'), 'LargestKNN': KNN(method='largest'), 'MedKNN': KNN(method='median'), 'PCA': PCA(), 'COF': COF(), 'LODA': LODA(), 'LOF': LOF(), 'HBOS': HBOS(), 'MCD': MCD(), 'AvgBagging': FeatureBagging(combination='average'), 'MaxBagging': FeatureBagging(combination='max'), 'IForest': IForest(), 'CBLOF': CBLOF(n_clusters=10, n_jobs=4), 'FactorAnalysis': FactorAnalysis(), 'KernelDensity': KernelDensity(), 'COPOD': COPOD(), 'SOD': SOD(), 'LSCPwithLODA': LSCP([LODA(), LODA()]), 'AveLMDD': LMDD(dis_measure='aad'), 'VarLMDD': LMDD(dis_measure='var'), 'IqrLMDD': LMDD(dis_measure='iqr'), 'SoGaal': SO_GAAL(),
np.load(os.path.join("multimodality", "baseline_wen_embeding.npy")), np.load(os.path.join("multimodality", "vae_joint_representation.npy")), np.concatenate([ np.load( os.path.join("unimodality", "image", "train_image_embedding.npy")), np.load(os.path.join("unimodality", "language", "word2vec.npy")) ], axis=1) ] unimodality = [ "image", "word2vec", "bert", "concat_joint", "vae_joint", "simple_concat" ] clfs = [ IForest(random_state=42), LOF(), OCSVM(), PCA(), KNN(), HBOS(), COPOD(), AutoEncoder(verbose=0), VAE(latent_dim=32, verbosity=0) ] for embedding, modality in zip(unimodal_embeddings, unimodality): print() print(modality) print() embedding_scaled = standardizer(embedding)
#FeatureSelector().select_features(write=True) #a = pd.Series( DataProcessor().get_all_commands_series()) #print a #commands = pd.Series(DataProcessor().get_all_commands_series()) #print commands.keys() sample_df = pd.read_csv(sample_submission_file) result_df = pd.read_csv('outputs/FeatureSelector/all_500_500.csv') cols = select_k_best(result_df, 200) result_df = result_df[cols] result_df.loc[:, 'Label'] = FeatureSelector().get_labels_array_all() result_df.to_csv('outputs/FeatureSelector/selected_all.csv') v = pd.read_csv(validation_file) validation_set = v['Label'] classification_res = [] clf = LOF(n_neighbors=20, contamination=0.1) #for num in range(0, 40): # print "******* User {} ********".format(num) # ClassificationModel(user_num=num, df=result_df).optimize_parameters() for num in range(0, 10): print "******* User {} ********".format(num) classification_res.extend( ClassificationModel(user_num=num, df=result_df, model=clf).predictLabels()) validation(classification_res, validation_set) for num in range(10, 40): print "******* User {} ********".format(num) classification_res = ClassificationModel(user_num=num,
def make_mlo(hub, data, train):
    """Create the Machine Learning Object used for this sequence."""
    # Fixed expected outlier fraction for the LOF detector.
    expected_outlier_fraction = 0.01
    return LOF(contamination=expected_outlier_fraction)
X = np.concatenate((X1, X2), axis=1) random_state = np.random.RandomState(42) outliers_fraction = 0.05 # Running PyOD Algorithms to detect outliers classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False, random_state=random_state), #'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction) } xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200)) for i, (clf_name, clf) in enumerate(classifiers.items()):
class TestLOF(unittest.TestCase):
    # Unit tests for the LOF detector: fitted attributes plus the full
    # predict / decision_function / predict_proba / rank API surface on a
    # small synthetic dataset.

    def setUp(self):
        # Synthetic benchmark; random_state pins it for reproducibility.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        # minimum acceptable ROC-AUC on the test split
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # The detector should satisfy the sklearn estimator contract.
        check_estimator(self.clf)

    def test_parameters(self):
        # Every attribute produced by fit() must exist and be populated.
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'n_neighbors_') and
                    self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        # One training score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must lie in [0, 1].
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unsupported conversion method must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Both built-in scorers work; anything else raises.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)
        print(pred_ranks)

        # ranks must preserve the ordering of the raw scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres),
                        atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # normalized ranks keep the ordering and stay within [0, 1]
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres),
                        atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
def get_estimators(contamination): """Internal method to create a list of 600 random base outlier detectors. Parameters ---------- contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. Returns ------- base_detectors : list A list of initialized random base outlier detectors. """ BASE_ESTIMATORS = [ LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, 
contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, 
contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, 
contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), 
PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), 
LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), 
LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, 
contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, 
contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), IForest(n_estimators=50, contamination=contamination), 
IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), 
LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, 
contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), 
PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, 
contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), ABOD(n_neighbors=45, contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), 
ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), ] return BASE_ESTIMATORS
CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction, random_state=random_state), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state), } classifiers_indices = { 'Angle-based Outlier Detector (ABOD)': 0, 'Cluster-based Local Outlier Factor': 1, 'Feature Bagging': 2, 'Histogram-base Outlier Detection (HBOS)': 3, 'Isolation Forest': 4, 'K Nearest Neighbors (KNN)': 5, 'Local Outlier Factor (LOF)': 6,
print("\n... Processing", mat_file_name, '...') mat = sp.io.loadmat(os.path.join('../datasets', mat_file)) X = mat['X'] y = mat['y'].ravel() outliers_fraction = np.sum(y) / len(y) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(n_neighbors=10, contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False), 'Feature Bagging': FeatureBagging(LOF(), contamination=outliers_fraction), 'Histogram-base Outlier Detection (HBOS)': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD( contamination=outliers_fraction), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction) } stat_mat_all = np.zeros([len(classifiers), 10]) report_list = ['train_roc_orig', 'train_p@n_orig', 'train_roc_psa', 'train_p@n_psa',
# Define the number of inliers and outliers n_samples = 200 outliers_fraction = 0.25 clusters_separation = [0] # Compare given detectors under given settings # Initialize the data xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100)) n_inliers = int((1. - outliers_fraction) * n_samples) n_outliers = int(outliers_fraction * n_samples) ground_truth = np.zeros(n_samples, dtype=int) ground_truth[-n_outliers:] = 1 # initialize a set of detectors for LSCP detector_list = [ LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30), LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45), LOF(n_neighbors=50) ] # Show the statics of the data print('Number of inliers: %i' % n_inliers) print('Number of outliers: %i' % n_outliers) print(
def getDetectors(self, parameterdict):
    """Build, fit and score the detectors requested in *parameterdict*.

    The dict maps suffixed keys (e.g. ``windowsize_1``) to hyper-parameter
    values; ``type_*`` holds a whitespace-separated list of detector names.

    Returns:
        (detectors, detectorlist, y, X): the fitted detectors, the parsed
        detector-name list, and the labels/features used to fit them.

    Raises:
        ValueError: if an unknown detector name is requested.
    """

    def _param(prefix):
        # First parameter whose key contains *prefix* (case-insensitive).
        # Mirrors the original lookup: raises IndexError when absent.
        return [value for key, value in parameterdict.items()
                if prefix in key.lower()][0]

    detectors = []
    windowSize = _param("windowsize_")
    featureset = _param("featureset_")
    detectortype = _param("type_")
    detectorlist = detectortype.split()

    features = self.featureGenerator.getSlidingWindowFeaturesEvents(
        int(windowSize), int(featureset))
    X = features[0]
    y = features[1]

    for detector in detectorlist:
        name = detector.rstrip()
        if name == "OCSVM":
            kernel = _param("kernel_")
            nu = _param("nu_")
            # nu must be numeric: cast it like the other hyper-parameters
            # (the original passed the raw dict value through).
            clf = OCSVM(kernel=kernel, nu=float(nu), max_iter=100)
        elif name == "IForest":
            num_estimators = _param("num_estimators_")
            max_samples = _param("max_samples_")
            clf = IForest(n_estimators=int(num_estimators),
                          max_samples=int(max_samples))
        elif name == "PCA":
            clf = PCA()
        elif name == "LOF":
            n_neighbors = _param("lof_n_neighbors_")
            clf = LOF(n_neighbors=int(n_neighbors))
        elif name == "KNN":
            n_neighbors = _param("knn_n_neighbors_")
            clf = KNN(n_neighbors=int(n_neighbors))
        else:
            # The original chain of independent ifs silently re-fitted and
            # re-appended the previous detector (or raised NameError) here.
            raise ValueError("unknown detector type: %r" % name)

        with HiddenPrints():
            clf.fit_predict_score(X, y, scoring='roc_auc_score')
        detectors.append(clf)

    return detectors, detectorlist, y, X
from combo.models.detector_lscp import LSCP

# Benchmark settings for the synthetic dataset.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# Heterogeneous base detectors combined by LSCP.
detectors = [KNN(), LOF(), OCSVM()]

clf = LSCP(base_estimators=detectors)
clf_name = 'LSCP'
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
print("\n... Processing", mat_file_name, '...') mat = sp.io.loadmat(os.path.join('', 'datasets', mat_file)) X = mat['X'] y = mat['y'] # standardize data to be digestible for most algorithms X = StandardScaler().fit_transform(X) X, X, y_train, y_test = \ train_test_split(X, y, test_size=0.4, random_state=42) contamination = y.sum() / len(y) base_estimators = [ LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=5, contamination=contamination),
def compare(inputdata, labels, n_clusters, dset_name):
    """ Run the conventional unsupervised outlier detectors on one dataset.

    Fits each detector, repairs non-finite scores, saves the raw scores to
    ``{dset_name}/{clf_name}_raw.npy``, prints the ROC AUC and runtime, and
    forwards normalized scores to ``fetch``.

    Args:
        inputdata: input data (2-D array, samples x features)
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset name, used for output paths

    Returns:
        None -- results are printed and written to disk.
    """
    print(
        "Competing with conventional unsupervised outlier detection algorithms..."
    )
    random_state = np.random.RandomState(1)

    # Pick autoencoder architectures based on input dimensionality.
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    # BUG FIX: the dict previously listed 'AutoEncoder' and 'VAE' twice; the
    # later default-architecture entries silently overwrote the configured
    # ones, so AEneurons/VAEneurons were never used. Keep the configured
    # entries only.
    classifiers = {
        'PCA': PCA(random_state=random_state),
        'AutoEncoder': AutoEncoder(batch_size=100,
                                   hidden_neurons=AEneurons,
                                   random_state=random_state),
        'VAE': VAE(batch_size=100,
                   encoder_neurons=VAEneurons[0],
                   decoder_neurons=VAEneurons[1],
                   random_state=random_state),
        'COPOD': COPOD(),
        'Iforest': IForest(random_state=random_state),
        'LODA': LODA(),
        'OCSVM': OCSVM(),
        'ABOD': ABOD(n_neighbors=20),
        'Fb': FeatureBagging(random_state=random_state),
        'CBLOF': CBLOF(n_clusters=n_clusters,
                       check_estimator=False,
                       random_state=random_state),
        'LOF': LOF(),
        'COF': COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # Repair broken scores: zero out NaN/inf in place so downstream
        # metrics don't fail (np.isfinite already rejects NaN).
        test_scores[~np.isfinite(test_scores)] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores),
              f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
n_test = 100 # number of testing points # Generate sample data X_train, X_test, y_train, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train SUOD clf_name = 'SUOD' # initialized a group of outlier detectors for acceleration detector_list = [ LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=35), COPOD(), IForest(n_estimators=100), IForest(n_estimators=200) ] # decide the number of parallel process, and the combination method clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average', verbose=False) # or to use the default detectors
mat = sp.io.loadmat(os.path.join('../datasets', mat_file)) X = mat['X'] y = mat['y'].ravel() outliers_fraction = np.sum(y) / len(y) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) classifiers = { 'Angle-based Outlier Detector (ABOD)': ABOD(n_neighbors=10, contamination=outliers_fraction), 'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(contamination=outliers_fraction, check_estimator=False), 'Feature Bagging': FeatureBagging(LOF(), contamination=outliers_fraction), 'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)':
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,random_state=random_state) x_train_norm, x_test_norm = standardizer(x_train, x_test) classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD( contamination=outliers_fraction), 'Cluster-based Local Outlier Factor': CBLOF( contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF( contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD( contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA( contamination=outliers_fraction, random_state=random_state), } for clf_name, clf in classifiers.items(): try: t0 = time() clf.fit(x_train_norm) test_scores = clf.decision_function(x_test_norm) t1 = time() duration = round(t1 - t0, ndigits=4) roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
# detector_list, contamination=outliers_fraction, # random_state=random_state) #} #%% file_no, plotp = 3, 0 anomalies = anomalies_3 data = preprocess(data_3, plotp) start_date = data.head(1).index.date[0] end_date = data.tail(1).index.date[0] middle_date = start_date + (end_date - start_date) / 2 datatotrain, datatotest, datatotrain_normalized, datatotest_normalized, dataanomaly_normalized, train_data, test_data, mix_data = createtraintest( data, anomalies, file_no) X_train, X_test, X_outliers = datatotrain_normalized, datatotest_normalized, dataanomaly_normalized #%% clf = LOF(n_neighbors=10, contamination=0.1) clf.fit(X_train) #%% #y_pred = clf.fit_predict(X_test) i = 0 for dt in rrule(DAILY, dtstart=start_date, until=end_date): try: if (data.loc[dt.strftime("%Y-%m-%d")]['value']).values.size != 0: data.loc[dt.strftime("%Y-%m-%d"), 'ocsvm_score'] = clf.predict( preprocessing.normalize( data.loc[dt.strftime("%Y-%m-%d")].value.values.reshape( 1, -1))) i = i + 1 except: print(dt, i) continue
# Stack the per-batch encodings into one (n_samples, n_features) matrix.
encodings_train = np.concatenate(encodings_train, 0)
print(encodings_train.shape)

# train the KNN detector
from pyod.models.knn import KNN
from pyod.models.pca import PCA
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.mcd import MCD
from pyod.models.lscp import LSCP
# from pyod.models.auto_encoder import AutoEncoder

# One detector of each family, default hyper-parameters.
clf_knn = KNN()
clf_pca = PCA()
clf_mcd = MCD()
clf_lof = LOF()
clf_cblof = CBLOF()
# clf_lscp = LSCP([clf_knn, clf_pca, clf_mcd ])
# clf_ae = AutoEncoder(epochs=50)

# Fit every detector on the training encodings.
clf_mcd.fit(encodings_train)
clf_pca.fit(encodings_train)
clf_knn.fit(encodings_train)
clf_lof.fit(encodings_train)
clf_cblof.fit(encodings_train)
# clf_lscp.fit(encodings_train)
# clf_ae.fit(encodings_train)

# Raw outlier scores of the training data under each detector.
anomaly_scores_mcd = clf_mcd.decision_function(encodings_train)
anomaly_scores_pca = clf_pca.decision_function(encodings_train)
anomaly_scores_knn = clf_knn.decision_function(encodings_train)
class TestLOF(unittest.TestCase):
    """Unit tests for the pyod LOF detector wrapper."""

    def setUp(self):
        # Small synthetic dataset; roc_floor is the minimum acceptable AUC.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # The fitted detector must satisfy sklearn's estimator contract.
        check_estimator(self.clf)

    def test_parameters(self):
        # Attributes that fit() must populate on the detector.
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'n_neighbors_') and
                    self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        # One decision score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        # Probabilities must lie in [0, 1] for the default method.
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown normalization method must raise ValueError.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Supported scorings succeed; an unknown one raises.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)
        print(pred_ranks)

        # assert the rank order is preserved relative to the raw scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the rank order is preserved; normalized ranks lie in [0, 1]
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
if __name__ == "__main__":
    # Settings for the synthetic benchmark.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, n_features=2,
        contamination=contamination, random_state=42)

    # train LOF detector
    clf_name = 'LOF'
    detector = LOF()
    detector.fit(X_train)

    # training-set results populated by fit()
    y_train_pred = detector.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = detector.decision_scores_  # raw outlier scores

    # test-set results
    y_test_pred = detector.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = detector.decision_function(X_test)  # outlier scores

    # evaluate and print the results for both splits
    for banner, truth, scores in (
            ("\nOn Training Data:", y_train, y_train_scores),
            ("\nOn Test Data:", y_test, y_test_scores)):
        print(banner)
        evaluate_print(clf_name, truth, scores)
if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 200 # number of training points n_test = 100 # number of testing points # Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, contamination=contamination, random_state=42) X_train, X_test = standardizer(X_train, X_test) # train lscp clf_name = 'LSCP' detector_list = [LOF(), LOF()] clf = LSCP(detector_list, random_state=42) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:")
# -*- coding: utf-8 -*- """ Created on Tue Dec 24 15:51:40 2019 @author: zixing.mei """ from pyod.models.lof import LOF #训练异常检测模型,然后输出训练集样本的异常分 clf = LOF(n_neighbors=20, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination=0.1, n_jobs=1) clf.fit(x) #异常分 out_pred = clf.predict_proba(x, method='linear')[:, 1] train['out_pred'] = out_pred #异常分在0.9百分位以下的样本删掉 key = train['out_pred'].quantile(0.9) x = train[train.out_pred < key][feature_lst] y = train[train.out_pred < key]['bad_ind'] val_x = val[feature_lst]
#df = pd.DataFrame(datax) # ##scatter_matrix(df, alpha=0.2, diagonal='kde') #from pandas.tools.plotting import lag_plot #lag_plot(data.value) #%% #clf = LOF(n_neighbors=10, contamination=0.1) #clf.fit(data_n) #clf1 = LOF(n_neighbors=10, contamination=0.1) #clf1.fit(data_n[:,0:48]) #clf2 = LOF(n_neighbors=10, contamination=0.1) #clf2.fit(data_n[:,48:96]) #clf3 = LOF(n_neighbors=10, contamination=0.1) #clf3.fit(data_n[:,96:144]) #%% clf = LOF(n_neighbors=10, contamination=0.1) clf.fit(X_train) clf1 = LOF(n_neighbors=10, contamination=0.1) clf1.fit(X_train[:, 0:48]) clf2 = LOF(n_neighbors=10, contamination=0.1) clf2.fit(X_train[:, 48:96]) clf3 = LOF(n_neighbors=10, contamination=0.1) clf3.fit(X_train[:, 96:144]) #%% clf = HBOS(contamination=outliers_fraction) clf.fit(X_train) clf1 = HBOS(contamination=outliers_fraction) clf1.fit(X_train[:, 0:48]) clf2 = HBOS(contamination=outliers_fraction) clf2.fit(X_train[:, 48:96])