def cof(X):
    contamination_factor = 0.1
    k = 20
    clf = COF(contamination=contamination_factor, n_neighbors=k)
    clf.fit(X)
    label = clf.labels_            # binary labels (0: inlier, 1: outlier)
    score = clf.decision_scores_   # raw outlier scores on the training data
    threshold = clf.threshold_     # score threshold that separates inliers from outliers
    writeLabel(label)
    return
def getOutlierCOF(dataset):
    '''
    @brief Function that executes the COF algorithm on the dataset and obtains
    the labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels, 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    cof = COF()
    # Fits the data and obtains labels
    cof.fit(dataset)
    # Return labels
    return cof.labels_
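# A minimal usage sketch for the helper above (illustrative only): X is a
# synthetic array standing in for a real dataset; any 2-D NumPy array or
# DataFrame accepted by PyOD would work the same way.
import numpy as np

X = np.vstack([np.random.randn(95, 2), np.random.randn(5, 2) * 5 + 10])
labels = getOutlierCOF(X)
print("outliers found:", int(labels.sum()))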
def aCOF(dataset, contamination, n_neighbors, name):
    # Fit COF and predict binary outlier labels on the same data
    algo = COF(contamination=contamination, n_neighbors=n_neighbors).fit(dataset)
    outlier_labels = algo.predict(dataset)
    outlier_index = where(outlier_labels == 1)
    outlier_values = dataset.iloc[outlier_index]
    number_of_outlier = len(outlier_values)
    # Plot all points in blue and the detected outliers in red
    plt.title(name, loc='center', fontsize=20)
    plt.scatter(dataset["P1"], dataset["P2"], color="b", s=65)
    plt.scatter(outlier_values["P1"], outlier_values["P2"], color="r")
    plt.figtext(
        0.7, 0.91,
        'contamination = {}\nn_neighbors = {} \nnumber of outlier = {}'.format(
            contamination, n_neighbors, number_of_outlier),
        fontsize=9)
    plt.show()
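# Illustrative call to aCOF with a small DataFrame carrying the two columns the
# function plots ("P1", "P2"). The column names and parameter values are
# assumptions for demonstration; numpy's where and matplotlib's plt are assumed
# imported by the surrounding module, as the function body requires.
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.random.randn(200, 2), columns=["P1", "P2"])
toy.iloc[:5] += 8  # shift a few points away from the bulk so they look anomalous
aCOF(toy, contamination=0.05, n_neighbors=20, name="COF on toy data")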
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"

    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def __init__(self, *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)
    self._clf = COF(contamination=hyperparams['contamination'],
                    n_neighbors=hyperparams['n_neighbors'],
                    )
    return
def choose_model(model, nnet):
    """Pick one of the detectors implemented in PyOD."""
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
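# Example of picking a detector from the dictionary above (values are
# illustrative, not from the original script). Note that the dict literal
# constructs every entry, so nnet must be a valid layer list even when a
# non-neural detector is requested; nnet[:5] / nnet[4:] suggest a symmetric
# list of at least five sizes.
import numpy as np

X_train = np.random.randn(200, 16)      # placeholder feature matrix
clf = choose_model('COF', nnet=[128, 64, 32, 16, 8, 16, 32, 64, 128])
clf.fit(X_train)
outlier_scores = clf.decision_scores_   # higher score = more anomalous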
def runMethod(self):
    '''
    @brief This function is the actual implementation of HICS
    '''
    if self.verbose:
        print("Calculating the subspaces\n")
    # First we obtain the high contrast subspaces
    subspaces = self.hicsFramework()

    if self.verbose:
        print("Now calculating the scoring\n")
    # We initialize the scores for each instance as 0
    scores = np.zeros(len(self.dataset))
    # For each subspace
    for sub in subspaces:
        # We place the corresponding scorer according to parameter
        scorer = None
        if self.outlier_rank == "lof":
            scorer = LOF()
        elif self.outlier_rank == "cof":
            scorer = COF()
        elif self.outlier_rank == "cblof":
            scorer = CBLOF()
        elif self.outlier_rank == "loci":
            scorer = LOCI()
        elif self.outlier_rank == "hbos":
            scorer = HBOS()
        elif self.outlier_rank == "sod":
            scorer = SOD()
        # Fits the scorer with the dataset projection
        scorer.fit(self.dataset[:, sub])
        # Adds the scores obtained to the global ones
        scores = scores + scorer.decision_scores_
    # Compute the average
    self.outlier_score = scores / len(subspaces)
    # Marks the calculations as done
    self.calculations_done = True
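# Hypothetical follow-up (not part of the original class): once runMethod()
# has filled self.outlier_score, binary labels can be derived by thresholding
# at the (1 - contamination) quantile, mirroring what PyOD detectors do
# internally with threshold_.
import numpy as np

def scores_to_labels(outlier_score, contamination=0.1):
    # Flag the top `contamination` fraction of averaged subspace scores as outliers
    threshold = np.quantile(outlier_score, 1.0 - contamination)
    return (outlier_score >= threshold).astype(int)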
    'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
    'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
    'VAE', 'AutoEncoder'
]
models = {
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'AvgKNN': KNN(method='mean'),
    'LargestKNN': KNN(method='largest'),
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
def get_detectors():
    # randomness_flags = []
    # Pool of base detectors, built as hyperparameter grids per detector family
    BASE_ESTIMATORS = []
    # LODA: n_bins x n_random_cuts grid (54 detectors)
    BASE_ESTIMATORS += [LODA(n_bins=b, n_random_cuts=c)
                        for b in [5, 10, 15, 20, 25, 30]
                        for c in [10, 20, 30, 40, 50, 75, 100, 150, 200]]
    # ABOD: varying neighborhood sizes (12 detectors)
    BASE_ESTIMATORS += [ABOD(n_neighbors=n)
                        for n in [3, 5, 10, 15, 20, 25, 50, 60, 75, 80, 90, 100]]
    # IForest: n_estimators x max_features grid (81 detectors)
    BASE_ESTIMATORS += [IForest(n_estimators=n, max_features=f)
                        for n in [10, 20, 30, 40, 50, 75, 100, 150, 200]
                        for f in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]]
    # KNN: aggregation method x n_neighbors grid (36 detectors)
    BASE_ESTIMATORS += [KNN(n_neighbors=n, method=m)
                        for m in ['largest', 'mean', 'median']
                        for n in [1, 5, 10, 15, 20, 25, 50, 60, 70, 80, 90, 100]]
    # LOF: distance metric x n_neighbors grid (36 detectors)
    BASE_ESTIMATORS += [LOF(n_neighbors=n, metric=m)
                        for m in ['manhattan', 'euclidean', 'minkowski']
                        for n in [1, 5, 10, 15, 20, 25, 50, 60, 70, 80, 90, 100]]
    # HBOS: n_bins x alpha grid (40 detectors)
    BASE_ESTIMATORS += [HBOS(n_bins=b, alpha=a)
                        for b in [5, 10, 20, 30, 40, 50, 75, 100]
                        for a in [0.1, 0.2, 0.3, 0.4, 0.5]]
    # OCSVM: kernel x nu grid (36 detectors)
    BASE_ESTIMATORS += [OCSVM(nu=nu, kernel=k)
                        for k in ["linear", "poly", "rbf", "sigmoid"]
                        for nu in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]]
    # COF: varying neighborhood sizes (7 detectors)
    BASE_ESTIMATORS += [COF(n_neighbors=n) for n in [3, 5, 10, 15, 20, 25, 50]]

    # randomness_flags.extend([True] * 54)   # LODA
    # randomness_flags.extend([False] * 7)   # ABOD
    # randomness_flags.extend([True] * 81)   # IForest
    # randomness_flags.extend([False] * 36)  # KNN
    # randomness_flags.extend([False] * 36)  # LOF
    # randomness_flags.extend([False] * 40)  # HBOS
    # randomness_flags.extend([False] * 36)  # OCSVM
    # randomness_flags.extend([False] * 7)   # COF
    # return BASE_ESTIMATORS, randomness_flags
    return BASE_ESTIMATORS
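# A minimal sketch of how the detector pool above might be consumed: fit every
# base estimator on a feature matrix X and average their standardized scores.
# `ensemble_scores` and `X` are illustrative names; `standardizer` and
# `average` are PyOD utilities (pyod.utils.utility / pyod.models.combination)
# assumed to be available.
import numpy as np
from pyod.utils.utility import standardizer
from pyod.models.combination import average

def ensemble_scores(X):
    detectors = get_detectors()
    all_scores = np.zeros([X.shape[0], len(detectors)])
    for i, det in enumerate(detectors):
        det.fit(X)
        all_scores[:, i] = det.decision_scores_
    # z-score normalize per detector, then average across the pool
    return average(standardizer(all_scores))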
class TestCOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = COF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, 'n_neighbors_') and
                self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)
        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_check_parameters(self):
        with assert_raises(ValueError):
            COF(contamination=0.1, n_neighbors=-1)
        with assert_raises(ValueError):
            COF(contamination=10., n_neighbors=5)
        with assert_raises(TypeError):
            COF(contamination=0.1, n_neighbors='not int')
        with assert_raises(TypeError):
            COF(contamination='not float', n_neighbors=5)
        cof_ = COF(contamination=0.1, n_neighbors=10000)
        cof_.fit(self.X_train)
        assert self.X_train.shape[0] > cof_.n_neighbors_

    def tearDown(self):
        pass
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add a new metric manually by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(
        paths=[
            'results/scores/auc/no/results.csv',
            'results/scores/auc/minmax/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        scalers=[
            'Without scaler', 'Min max scaler', 'Standard scaler',
            'Without scaler', 'Min max scaler', 'Standard scaler'
        ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train COF detector
    clf_name = 'COF'
    clf = COF(n_neighbors=30)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
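    # Optional follow-up, a minimal sketch: PyOD's example helper can plot the
    # train/test points with their predicted labels. This assumes
    # pyod.utils.example.visualize is importable, as in PyOD's bundled examples.
    from pyod.utils.example import visualize

    visualize(clf_name, X_train, y_train, X_test, y_test,
              y_train_pred, y_test_pred, show_figure=True, save_figure=False)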
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors
    for the given dataset
    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset
    Returns:
        AUC, Fgap, Frank
    """
    print("Competing with conventional unsupervised outlier detection algorithms...")
    random_state = np.random.RandomState(1)
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    classifiers = {
        'PCA': PCA(random_state=random_state),
        'AutoEncoder': AutoEncoder(batch_size=100, hidden_neurons=AEneurons,
                                   random_state=random_state),
        'VAE': VAE(batch_size=100, encoder_neurons=VAEneurons[0],
                   decoder_neurons=VAEneurons[1], random_state=random_state),
        'COPOD': COPOD(),
        'Iforest': IForest(random_state=random_state),
        'AutoEncoder': AutoEncoder(batch_size=100, random_state=random_state),
        'VAE': VAE(batch_size=100, random_state=random_state),
        'LODA': LODA(),
        'OCSVM': OCSVM(),
        'ABOD': ABOD(n_neighbors=20),
        'Fb': FeatureBagging(random_state=random_state),
        'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False,
                       random_state=random_state),
        'LOF': LOF(),
        'COF': COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix some broken scores----- #
        for i in range(len(test_scores)):
            cur = test_scores[i]
            if np.isnan(cur) or not np.isfinite(cur):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
# Specify the root directory
datasets_path = "Anomaly_Datasets_csv"
rootDir = os.path.abspath(datasets_path)
# specify the random state
rs = 10
# Save how to run the models
detector_list = [LOF(), LOF()]
models = [
    # BRM github
    (brminer.BRM(), 'BRM'),
    # ocSVM sklearn
    (OneClassSVM(gamma='auto'), 'ocSVM'),
    # COF pyod
    (COF(contamination=0.1, n_neighbors=20), 'COF'),
    # ABOD pyod
    (ABOD(contamination=0.1, n_neighbors=5, method='fast'), 'ABOD'),
    # MO_GAAL pyod
    (MO_GAAL(k=10, stop_epochs=20, lr_d=0.01, lr_g=0.0001, decay=1e-06,
             momentum=0.9, contamination=0.1), 'MO_GAAL'),
    # SO_GAAL pyod
    (SO_GAAL(stop_epochs=20, lr_d=0.01, lr_g=0.0001, decay=1e-06,
#equipment_hist = sorted_dataset[['Equipment Name','date']].groupby('Equipment Name').count().plot.barh()
#plt.plot(data_hist['Inspection Date'],data_hist['date'])
#plt.show()
#print(sorted_dataset.to_string())
sliced_data = sorted_dataset[[
    'PD Average', 'PD Count', 'Temperature', 'Humidity', 'Loading'
]]
print(sorted_dataset.loc[sorted_dataset['Confirm action'] == '2'].to_string())

clfs = [
    ABOD(contamination=.01),
    COF(contamination=.01),
    CBLOF(contamination=.01),
    IForest(contamination=.01)
]
anomalies = []
for clf in clfs:
    clf.fit(sliced_data)
    y_train_pred = clf.labels_
    sorted_dataset['Anomaly_status'] = y_train_pred
    anomalies.extend(sorted_dataset.loc[sorted_dataset['Anomaly_status'] == 1]
                     .index.values.tolist())
    print("Completed:" + clf.__class__.__name__)

anomaly_counter = Counter(anomalies)
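# A possible follow-up (illustrative, not in the original script): keep only
# the rows flagged by at least two of the four detectors above, using the
# per-index vote counts accumulated in anomaly_counter; sorted_dataset is the
# DataFrame built in the snippet above.
consensus_idx = [idx for idx, votes in anomaly_counter.items() if votes >= 2]
consensus_rows = sorted_dataset.loc[consensus_idx]
print("rows flagged by >= 2 detectors:", len(consensus_idx))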
def initialise_pyod_classifiers(self, outlier_fraction):
    # Testing every query against every class and then predicting only if it belongs to the same class
    classifiers = {}
    # Proximity based
    classifiers['K Nearest Neighbors (KNN)'] = []
    classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
    classifiers['Median K Nearest Neighbors (MedKNN)'] = []
    classifiers['Local Outlier Factor (LOF)'] = []
    classifiers['Connectivity-Based Outlier Factor (COF)'] = []
    #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
    classifiers['LOCI'] = []
    #classifiers['Histogram-based Outlier Score (HBOS)'] = []
    classifiers['Subspace Outlier Detection (SOD)'] = []
    # Linear models
    classifiers['Principal Component Analysis (PCA)'] = []
    #classifiers['Minimum Covariance Determinant (MCD)'] = []  # Too slow
    classifiers['One-Class Support Vector Machines (OCSVM)'] = []
    classifiers['Deviation-based Outlier Detection (LMDD)'] = []
    # Probabilistic
    classifiers['Angle-Based Outlier Detection (ABOD)'] = []
    classifiers['Stochastic Outlier Selection (SOS)'] = []
    # Outlier Ensembles
    classifiers['Isolation Forest (IForest)'] = []
    classifiers['Feature Bagging'] = []
    classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

    # One detector instance per class
    for i in range(self.k_way):
        classifiers['K Nearest Neighbors (KNN)'].append(
            KNN(method='largest', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
            KNN(method='mean', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Median K Nearest Neighbors (MedKNN)'].append(
            KNN(method='median', n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Local Outlier Factor (LOF)'].append(
            LOF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Connectivity-Based Outlier Factor (COF)'].append(
            COF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['LOCI'].append(
            LOCI(contamination=outlier_fraction))
        classifiers['Subspace Outlier Detection (SOD)'].append(
            SOD(n_neighbors=int(self.n_shot / 3) + 2,
                contamination=outlier_fraction,
                ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
        classifiers['Principal Component Analysis (PCA)'].append(
            PCA(contamination=outlier_fraction))
        classifiers['One-Class Support Vector Machines (OCSVM)'].append(
            OCSVM(contamination=outlier_fraction))
        classifiers['Deviation-based Outlier Detection (LMDD)'].append(
            LMDD(contamination=outlier_fraction))
        classifiers['Angle-Based Outlier Detection (ABOD)'].append(
            ABOD(contamination=outlier_fraction))
        classifiers['Stochastic Outlier Selection (SOS)'].append(
            SOS(contamination=outlier_fraction))
        classifiers['Isolation Forest (IForest)'].append(
            IForest(contamination=outlier_fraction))
        classifiers['Feature Bagging'].append(
            FeatureBagging(contamination=outlier_fraction))
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'].append(
            LODA(contamination=outlier_fraction))
    self.num_different_models = len(classifiers)
    return classifiers
f.write("Model: " + modelname + "\n") f.write("Dataset " + str(datasetnumber) + ": " + datasetname + "\n") f.write("Time taken: " + str(time) + " seg.\n") f.write("Accuracy: " + str(accuracy) + "\n") if accuracy!=None: f.write("@scores\n") for score in model.decision_scores_: f.write(str(score) + "\n") f.close() # This is based on executing the script from the folder experiments ROUTE = "../datasets/outlier_ground_truth/" # List of datasets datasets = ["annthyroid.mat", "arrhythmia.mat", "breastw.mat", "cardio.mat", "glass.mat", "ionosphere.mat", "letter.mat", "lympho.mat", "mammography.mat", "mnist.mat", "musk.mat", "optdigits.mat", "pendigits.mat", "pima.mat", "satellite.mat", "satimage-2.mat", "speech.mat", "thyroid.mat", "vertebral.mat", "vowels.mat", "wbc.mat", "wine.mat"] # List of models and names models = [ABOD(), COF(), HBOS(), KNN(), LOF(), MCD(), OCSVM(), PCA(), SOD(), SOS()] names = ["ABOD", "COF", "HBOS", "KNN", "LOF", "MCD", "OCSVM", "PCA", "SOD", "SOS"] accuracies = [] for name, model in zip(names, models): print("\n\n#################################################################") print("MODEL " + name + " " + str(names.index(name)+1) + "/" + str(len(names))) print("#################################################################") acc = [] for dat in datasets: if name=="ABOD" and dat in ["breastw.mat", "letter.mat", "satellite.mat"]: result = None else: print("Computing dataset " + dat + " " + str(datasets.index(dat)+1) + "/" + str(len(datasets))) # Read dataset dataset, labels = readDataset(ROUTE + dat)
# 60% data for training and 40% for testing
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
# standardizing data for processing
#X_train_norm, X_test_norm = standardizer(X_train, X_test)
X_norm = normalizeData(X)
print(np.shape(X))
print(np.shape(X_norm))

classifiers = {
    'Local Outlier Factor (LOF)': LOF(
        contamination=outliers_fraction),
    'Connectivity-Based Outlier Factor (COF)': COF(
        contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),
    'Average K Nearest Neighbors (AvgKNN)': KNN(
        method='mean', contamination=outliers_fraction),
    'Median K Nearest Neighbors (MedKNN)': KNN(
        method='median', contamination=outliers_fraction),
    'Subspace Outlier Detection (SOD)': SOD(
        contamination=outliers_fraction)
def cof(n_neighbors, contamination, name):
    dataset = prepare_data(df_names[0])
    # fit_predict returns the binary outlier labels (0: inlier, 1: outlier)
    clf = COF(n_neighbors=n_neighbors,
              contamination=contamination).fit_predict(dataset)
    outlier_index = np.where(clf == 1)
    outlier_plot(dataset, outlier_index, contamination, n_neighbors, name)
    outlier_remove(outlier_index, 'df_without_outliers_cof.csv')
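# A minimal usage sketch for the helper above. The argument values are
# illustrative; prepare_data, df_names, outlier_plot and outlier_remove are
# assumed to be defined elsewhere in the surrounding script.
cof(n_neighbors=20, contamination=0.1, name='COF, k=20')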
# Get the outlier labels and outlier scores on the training data
y_train_pred = clf.labels_  # classification labels on the training data (0: inlier, 1: outlier)
y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher means more anomalous)

# Use the trained clf to predict outliers in the unseen data
y_test_pred = clf.predict(new_origin_all[pos:])  # classification labels on the unseen data (0: inlier, 1: outlier)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores on the unseen data

show_scatter(clf_name, df, y_train_pred, pos)

# In[170]:

clf_name = 'COF'
clf = COF(n_neighbors=30)
clf.fit(new_origin_all[:pos])

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(new_origin_all[pos:])  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores

show_scatter(clf_name, df, y_train_pred, pos)

# In[171]: