def detect_outliers(lst):
    """Split a collection of samples into inliers and outliers with OCSVM.

    One ``OCSVM`` detector is fitted on the whole collection; afterwards each
    sample is reshaped to a single row and classified on its own.

    Parameters
    ----------
    lst : List
        Samples to analyse, given as a list of numpy arrays.

    Returns
    -------
    inlier_idx : List
        Positions in ``lst`` of the samples predicted as inliers.
    outlier_idx : List
        Positions in ``lst`` of the samples predicted as outliers.
    """
    detector = OCSVM(verbose=0)
    detector.fit(lst)

    inliers, outliers = [], []
    for position, sample in enumerate(lst):
        prediction = detector.predict(sample.reshape(1, -1))
        # A truthy prediction (label 1) marks the sample as an outlier.
        if not prediction:
            inliers.append(position)
            continue
        logger.debug('Found outlier: {0}'.format(position))
        outliers.append(position)

    outlier_share = len(outliers) / len(lst)
    logger.info('{:.0%} are outliers'.format(outlier_share))
    return inliers, outliers
def setUp(self):
    """Create a synthetic train/test split and fit a default OCSVM on it."""
    # Fixture configuration.
    self.n_train, self.n_test = 100, 50
    self.contamination = 0.1
    self.roc_floor = 0.6

    # Seeded so the generated data is the same on every run.
    splits = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )
    self.X_train, self.y_train, self.X_test, self.y_test = splits

    # Detector under test, fitted on the training features only.
    self.clf = OCSVM()
    self.clf.fit(self.X_train)
def getOutlierOCSVM(dataset):
    '''
    @brief Fit a default OCSVM detector on the dataset and return the labels
    it assigned to every instance: inlier (0) or outlier (1)
    @param dataset Dataset the detector is fitted on
    @return The labels of the fitted detector; 0 means inlier, 1 means outlier
    '''
    # Default-configured detector
    detector = OCSVM()

    # Fitting populates the per-instance labels
    detector.fit(dataset)

    return detector.labels_
def construct_raw_base_estimators():
    """Build the pool of un-fitted base detectors.

    Every detector is created with ``contamination=0.05``.

    Returns
    -------
    estimator_list : list
        Detectors in this order: for each neighbourhood size a KNN with
        ``method="largest"``, a KNN with ``method="mean"`` and a LOF; then
        one OCSVM per ``nu`` value; then one IForest per forest size
        (55 detectors in total).
    """
    # Only the detector classes that are actually instantiated are imported.
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.iforest import IForest
    from pyod.models.ocsvm import OCSVM

    estimator_list = []

    # predefined range of n_neighbors for KNN (largest / mean) and LOF
    k_range = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for k in k_range:
        estimator_list.append(
            KNN(n_neighbors=k, method="largest", contamination=0.05))
        estimator_list.append(
            KNN(n_neighbors=k, method="mean", contamination=0.05))
        estimator_list.append(LOF(n_neighbors=k, contamination=0.05))

    # predefined range of nu for one-class svm
    nu_range = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for nu in nu_range:
        estimator_list.append(OCSVM(nu=nu, contamination=0.05))

    # predefined range for number of estimators in isolation forests
    n_range = [10, 20, 50, 70, 100, 150, 200, 250]
    for n in n_range:
        estimator_list.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))

    return estimator_list
def load_classifiers(outliers_fraction):
    """Create the outlier detectors to be compared, keyed by display name.

    Parameters
    ----------
    outliers_fraction : float
        Expected proportion of outliers; values above 0.5 are capped at 0.5
        before being passed to the detectors as ``contamination``.

    Returns
    -------
    classifiers : dict
        Maps a human-readable detector name to an un-fitted detector.
    """
    contamination = min(0.5, outliers_fraction)
    # One RandomState instance is shared by every seeded detector.
    rng = np.random.RandomState(42)

    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=contamination)
    classifiers['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        contamination=contamination, check_estimator=False, random_state=rng)
    classifiers['Feature Bagging'] = FeatureBagging(
        LOF(n_neighbors=35), contamination=contamination, random_state=rng)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=contamination)
    classifiers['Isolation Forest'] = IForest(
        contamination=contamination, random_state=rng, behaviour="new")
    classifiers['K Nearest Neighbors (KNN)'] = KNN(
        contamination=contamination)
    classifiers['Average KNN'] = KNN(
        method='mean', contamination=contamination)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        n_neighbors=35, contamination=contamination)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=contamination, random_state=rng)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(
        contamination=contamination)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=contamination, random_state=rng)
    return classifiers
def __load_classifiers(self):
    """Return the un-fitted detectors used by this object, keyed by name.

    All detectors use a contamination of 0.05; the seeded ones share a single
    ``RandomState(0)`` instance.
    """
    contamination = 0.05
    rng = np.random.RandomState(0)

    # (display name, detector) pairs; the order defines the dict order.
    named_detectors = [
        ('Cluster-based Local Outlier Factor (CBLOF)',
         CBLOF(contamination=contamination, check_estimator=False,
               random_state=rng)),
        ('Feature Bagging',
         FeatureBagging(LOF(n_neighbors=35), contamination=contamination,
                        random_state=rng)),
        ('Histogram-base Outlier Detection (HBOS)',
         HBOS(contamination=contamination)),
        ('Isolation Forest',
         IForest(contamination=contamination, random_state=rng)),
        ('K Nearest Neighbors (KNN)',
         KNN(contamination=contamination)),
        ('Average KNN',
         KNN(method='mean', contamination=contamination)),
        ('Local Outlier Factor (LOF)',
         LOF(n_neighbors=35, contamination=contamination)),
        ('Minimum Covariance Determinant (MCD)',
         MCD(contamination=contamination, random_state=rng)),
        ('One-class SVM (OCSVM)',
         OCSVM(contamination=contamination)),
    ]
    return dict(named_detectors)
def __init__(self, *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    """Initialise the base class and build the wrapped OCSVM detector.

    Parameters
    ----------
    hyperparams : Hyperparams
        Supplies every OCSVM constructor argument, looked up by name.
    random_seed : int
        Forwarded unchanged to the base-class initialiser.
    docker_containers : Dict[str, DockerContainer]
        Forwarded unchanged to the base-class initialiser.
    """
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)

    # Each hyperparameter is passed to OCSVM under the same keyword name.
    ocsvm_options = (
        'contamination', 'kernel', 'nu', 'degree', 'gamma', 'coef0',
        'tol', 'shrinking', 'cache_size', 'verbose', 'max_iter',
    )
    self._clf = OCSVM(
        **{option: hyperparams[option] for option in ocsvm_options})
def train_model(X_train, contamination, model):
    """Train the model based on the user's choice (KNN, IForest, OCSVM).

    Parameters
    ----------
    X_train : list of shape (n_train, n_features) containing the training
        set with only features.
    contamination : float representing the expected proportion of anomalies
        in the training set.
    model : string, claiming the model to use for evaluating the confidence.
        It can be one of: KNN, IForest, OCSVM.

    Returns
    -------
    clf : obj with the model trained on the training set.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Seed NumPy's global generator for reproducibility.
    np.random.seed(331)
    n = np.shape(X_train)[0]

    if model == 'KNN':
        # Number of neighbours scales with the expected number of anomalies,
        # but never drops below one. The builtin ``int`` replaces ``np.int``,
        # which was removed from NumPy.
        n_neighbors = max(int(n * contamination), 1)
        clf = KNN(n_neighbors=n_neighbors,
                  contamination=contamination).fit(X_train)
    elif model == 'IForest':
        clf = IForest(contamination=contamination,
                      random_state=331).fit(X_train)
    elif model == 'OCSVM':
        clf = OCSVM(contamination=contamination).fit(X_train)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `clf`.
        raise ValueError(
            "Unknown model {!r}; expected one of 'KNN', 'IForest', "
            "'OCSVM'.".format(model))
    return clf
def define_classifiers(random_state, outliers_fraction):
    """Create the un-fitted outlier detectors, keyed by display name.

    Parameters
    ----------
    random_state : passed as ``random_state`` to every detector that is
        seeded here (CBLOF, FeatureBagging, IForest, MCD, PCA).
    outliers_fraction : passed as ``contamination`` to every detector.

    Returns
    -------
    dict mapping a human-readable detector name to the detector instance.
    """
    # Keyword sets shared by the detectors below.
    plain = {'contamination': outliers_fraction}
    seeded = {'contamination': outliers_fraction,
              'random_state': random_state}

    return {
        'Angle-based Outlier Detector (ABOD)': ABOD(**plain),
        'Cluster-based Local Outlier Factor':
            CBLOF(check_estimator=False, **seeded),
        'Feature Bagging': FeatureBagging(**seeded),
        'Histogram-base Outlier Detection (HBOS)': HBOS(**plain),
        'Isolation Forest': IForest(**seeded),
        'K Nearest Neighbors (KNN)': KNN(**plain),
        'Local Outlier Factor (LOF)': LOF(**plain),
        'Minimum Covariance Determinant (MCD)': MCD(**seeded),
        'One-class SVM (OCSVM)': OCSVM(**plain),
        'Principal Component Analysis (PCA)': PCA(**seeded),
    }
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points using an Isolation Forest

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Only the Isolation Forest is used, so no other detector is constructed.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    # Scan every prediction rather than a fixed number of rows, so the
    # function works for any input size.
    idx_y_pred = [i for i, label in enumerate(y_pred) if label == 1]

    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))

    # Features and labels must stay aligned after the rows are dropped.
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def setUp(self):
    """Create a synthetic split and fit a KNN/LOF/OCSVM aggregator on it."""
    # Fixture configuration.
    self.n_train, self.n_test = 200, 100
    self.contamination = 0.1
    self.roc_floor = 0.8

    # Seeded so the generated data is the same on every run.
    splits = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )
    self.X_train, self.y_train, self.X_test, self.y_test = splits

    # Aggregator under test: default base detectors combined by maximization.
    base_detectors = [KNN(), LOF(), OCSVM()]
    self.clf = SimpleDetectorAggregator(
        base_estimators=base_detectors,
        method='maximization',
        contamination=self.contamination,
    )
    self.clf.fit(self.X_train)
def create_tunable_ensemble(knn_neighbors, lof_neighbors, abod_neighbors):
    """Describe one averaging ensemble per combination of neighbour counts.

    Parameters
    ----------
    knn_neighbors, lof_neighbors, abod_neighbors : iterables
        Candidate ``n_neighbors`` values for KNN, LOF and ABOD. Every
        combination of the three (KNN outermost, ABOD innermost) yields one
        entry.

    Returns
    -------
    list of dict
        Each entry names ``SimpleDetectorAggregator`` as the model, is marked
        unsupervised, and carries the "average" method plus the base
        estimators (KNN, LOF, ABOD and a default OCSVM) as parameters.
    """
    def build_entry(n_knn, n_lof, n_abod):
        # Configuration for a single (KNN, LOF, ABOD) neighbour combination.
        return {
            "model": SimpleDetectorAggregator,
            "supervised": False,
            "parameters": {
                "method": "average",
                "base_estimators": [
                    KNN(n_neighbors=n_knn),
                    LOF(n_neighbors=n_lof),
                    ABOD(n_neighbors=n_abod),
                    OCSVM(),
                ],
            },
        }

    candidates = []
    for n_knn in knn_neighbors:
        for n_lof in lof_neighbors:
            for n_abod in abod_neighbors:
                candidates.append(build_entry(n_knn, n_lof, n_abod))
    return candidates
def choose_model(model, nnet):
    """Return a newly constructed detector, chosen by name among those
    implemented in PyOD.

    Only the requested detector is constructed, so ``nnet`` is touched
    exclusively by the models that need it ('AE' and 'VAE').

    Parameters
    ----------
    model : str
        One of 'AE', 'VAE', 'ABOD', 'FeatureBagging', 'HBOS', 'IForest',
        'KNN', 'LOF', 'OCSVM', 'PCA', 'SOS', 'COF', 'CBLOF', 'SOD', 'LOCI',
        'MCD'.
    nnet : sequence
        Layer sizes. 'AE' uses it whole as ``hidden_neurons``; 'VAE' uses
        ``nnet[:5]`` for the encoder and ``nnet[4:]`` for the decoder.

    Returns
    -------
    The un-fitted detector instance registered under ``model``.

    Raises
    ------
    KeyError
        If ``model`` is not one of the supported names.
    """
    # Zero-argument factories: nothing is built until the lookup succeeds.
    factories = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5],
                           decoder_neurons=nnet[4:],
                           contamination=0.1, epochs=13),
        'ABOD': lambda: ABOD(),
        'FeatureBagging': lambda: FeatureBagging(),
        'HBOS': lambda: HBOS(),
        'IForest': lambda: IForest(),
        'KNN': lambda: KNN(),
        'LOF': lambda: LOF(),
        'OCSVM': lambda: OCSVM(),
        'PCA': lambda: PCA(),
        'SOS': lambda: SOS(),
        'COF': lambda: COF(),
        'CBLOF': lambda: CBLOF(),
        'SOD': lambda: SOD(),
        'LOCI': lambda: LOCI(),
        'MCD': lambda: MCD(),
    }
    return factories[model]()
def print_accuracy(train_arr, test_arr, trader_id):
    """Fit one OCSVM per train/test pair and print two percentages for it.

    For every index of ``train_arr`` a default OCSVM is fitted on the training
    series and used to predict both the training and the matching test series.
    Pairs in which either series is empty are skipped; nothing happens at all
    when either outer collection is empty.

    Parameters
    ----------
    train_arr : sequence of training series, one per pair.
    test_arr : sequence of test series, indexed like ``train_arr``.
    trader_id : identifier included in the training-accuracy line.
    """
    if not len(train_arr) or not len(test_arr):
        return

    for position in range(len(train_arr)):
        train_series = train_arr[position]
        test_series = test_arr[position]
        n_train = len(train_series)
        n_test = len(test_series)
        if not (n_train and n_test):
            continue

        # Each series becomes a single-feature column of samples.
        train_column = np.array([train_series]).T
        test_column = np.array([test_series]).T

        detector = OCSVM()
        detector.fit(train_column)

        # Printed value: 100 minus the percentage of predictions equal to 1.
        train_flags = detector.predict(train_column)
        print("TRAINING ACCURACY for TRADER",trader_id,":",100 - (sum(train_flags)*100/n_train))

        # Printed value: the percentage of predictions equal to 1.
        test_flags = detector.predict(test_column)
        print("TESTING ACCURACY: ",sum(test_flags)*100/n_test)
def create_ensemble_models(knn_methods, ensemble_combinations, pca):
    """Describe one ensemble per (combination, KNN k, LOF k, KNN method).

    Parameters
    ----------
    knn_methods : iterable
        Values passed as ``method`` to KNN.
    ensemble_combinations : iterable
        Values passed as ``method`` to the aggregator.
    pca : int
        Upper bound (inclusive) of the ``n_neighbors`` values tried for both
        KNN and LOF; the values range over 1..pca.

    Returns
    -------
    list of dict
        Each entry names ``SimpleDetectorAggregator`` as the model, is marked
        unsupervised, and carries the aggregation method plus the base
        estimators (KNN, LOF, default ABOD, default OCSVM) as parameters.
    """
    configurations = []
    for combination in ensemble_combinations:
        neighbor_counts = range(1, pca + 1)
        for knn_k in neighbor_counts:
            for lof_k in neighbor_counts:
                for knn_method in knn_methods:
                    base_estimators = [
                        KNN(n_neighbors=knn_k, method=knn_method),
                        LOF(n_neighbors=lof_k),
                        ABOD(),
                        OCSVM(),
                    ]
                    configurations.append({
                        "model": SimpleDetectorAggregator,
                        "supervised": False,
                        "parameters": {
                            "method": combination,
                            "base_estimators": base_estimators,
                        },
                    })
    return configurations
def get_estimators(contamination): """Internal method to create a list of 600 random base outlier detectors. Parameters ---------- contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. Returns ------- base_detectors : list A list of initialized random base outlier detectors. """ BASE_ESTIMATORS = [ LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, 
contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, 
contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, 
contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), 
PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), 
LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), 
LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, 
contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, 
contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), IForest(n_estimators=50, contamination=contamination), 
IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), 
LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, 
contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), PCA(contamination=contamination), 
PCA(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), KNN(n_neighbors=50, contamination=contamination), KNN(n_neighbors=55, contamination=contamination), KNN(n_neighbors=65, contamination=contamination), KNN(n_neighbors=75, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=85, contamination=contamination), KNN(n_neighbors=95, contamination=contamination), KNN(n_neighbors=100, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), IForest(n_estimators=150, contamination=contamination), IForest(n_estimators=200, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=10, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), LOF(n_neighbors=50, contamination=contamination), LOF(n_neighbors=55, contamination=contamination), LOF(n_neighbors=60, contamination=contamination), LOF(n_neighbors=65, contamination=contamination), LOF(n_neighbors=70, contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, 
contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), ABOD(n_neighbors=45, contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), OCSVM(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), MCD(contamination=contamination), LOF(n_neighbors=75, contamination=contamination), LOF(n_neighbors=80, contamination=contamination), LOF(n_neighbors=85, contamination=contamination), LOF(n_neighbors=90, contamination=contamination), LOF(n_neighbors=95, contamination=contamination), LOF(n_neighbors=100, contamination=contamination), ABOD(n_neighbors=5, contamination=contamination), ABOD(n_neighbors=10, contamination=contamination), ABOD(n_neighbors=15, contamination=contamination), 
ABOD(n_neighbors=20, contamination=contamination), ABOD(n_neighbors=25, contamination=contamination), ABOD(n_neighbors=30, contamination=contamination), ABOD(n_neighbors=35, contamination=contamination), ABOD(n_neighbors=40, contamination=contamination), ] return BASE_ESTIMATORS
'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state), # 'Stochastic Outlier Selection (SOS)': SOS( # contamination=outliers_fraction), 'Locally Selective Combination (LSCP)': LSCP(detector_list, contamination=outliers_fraction, random_state=random_state), # 'Connectivity-Based Outlier Factor (COF)': # COF(n_neighbors=35, contamination=outliers_fraction), # 'Subspace Outlier Detection (SOD)': # SOD(contamination=outliers_fraction), } # Show all detectors
def initialise_pyod_classifiers(self, outlier_fraction):
    """Create one PyOD detector per class for every detector family.

    Every query is tested against the detector of every class, so each
    family needs exactly ``self.k_way`` independent detector instances.

    Parameters
    ----------
    outlier_fraction
        Forwarded unchanged as ``contamination`` to every detector.

    Returns
    -------
    classifiers : dict
        Maps the display name of each detector family to a list holding
        ``self.k_way`` newly constructed detectors.

    Notes
    -----
    Reads ``self.k_way`` and ``self.n_shot`` and stores the number of
    detector families in ``self.num_different_models``.
    """
    # Neighbourhood sizes are derived from the number of shots per class.
    n_neighbors = int(self.n_shot / 3) + 1
    sod_neighbors = int(self.n_shot / 3) + 2
    sod_ref_set = max(2, int(sod_neighbors / 3))

    # One zero-argument factory per family, so that every class gets its own
    # detector object instead of sharing a single instance.
    # Disabled families: CBLOF, HBOS and MCD (MCD was noted as too slow).
    factories = {
        # Proximity based
        'K Nearest Neighbors (KNN)': lambda: KNN(
            method='largest', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Average K Nearest Neighbors (AvgKNN)': lambda: KNN(
            method='mean', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Median K Nearest Neighbors (MedKNN)': lambda: KNN(
            method='median', n_neighbors=n_neighbors,
            contamination=outlier_fraction),
        'Local Outlier Factor (LOF)': lambda: LOF(
            n_neighbors=n_neighbors, contamination=outlier_fraction),
        'Connectivity-Based Outlier Factor (COF)': lambda: COF(
            n_neighbors=n_neighbors, contamination=outlier_fraction),
        'LOCI': lambda: LOCI(contamination=outlier_fraction),
        'Subspace Outlier Detection (SOD)': lambda: SOD(
            n_neighbors=sod_neighbors, contamination=outlier_fraction,
            ref_set=sod_ref_set),
        # Linear models
        'Principal Component Analysis (PCA)': lambda: PCA(
            contamination=outlier_fraction),
        'One-Class Support Vector Machines (OCSVM)': lambda: OCSVM(
            contamination=outlier_fraction),
        'Deviation-based Outlier Detection (LMDD)': lambda: LMDD(
            contamination=outlier_fraction),
        # Probabilistic
        'Angle-Based Outlier Detection (ABOD)': lambda: ABOD(
            contamination=outlier_fraction),
        'Stochastic Outlier Selection (SOS)': lambda: SOS(
            contamination=outlier_fraction),
        # Outlier ensembles
        'Isolation Forest (IForest)': lambda: IForest(
            contamination=outlier_fraction),
        'Feature Bagging': lambda: FeatureBagging(
            contamination=outlier_fraction),
        'Lightweight On-line Detector of Anomalies (LODA)': lambda: LODA(
            contamination=outlier_fraction),
    }

    # A single pass over the classes: exactly k_way detectors per family
    # (a nested pass over the same range would create k_way ** 2 of them).
    classifiers = {
        name: [make_detector() for _ in range(self.k_way)]
        for name, make_detector in factories.items()
    }

    self.num_different_models = len(classifiers)
    return classifiers
'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), 'Median KNN': KNN(method='median', contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35, contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction, random_state=random_state), 'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state), } # Show all detectors for i, clf in enumerate(classifiers.keys()): print('Model', i + 1, clf) # Fit the models with the generated data and # compare model performances for i, offset in enumerate(clusters_separation): np.random.seed(42) # Data generation X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), '(KNN) K Nearest Neighbors ': KNN( contamination=outliers_fraction), 'Average KNN': KNN(method='mean', contamination=outliers_fraction), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), '(LOF) Local Outlier Factor ': LOF(n_neighbors=35, contamination=outliers_fraction), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), '(MCD) Minimum Covariance Determinant ': MCD( contamination=outliers_fraction, random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction), '(PCA) Principal Component Analysis ': PCA( contamination=outliers_fraction, random_state=random_state), # 'Stochastic Outlier Selection (SOS)': SOS( # contamination=outliers_fraction), '(LSCP) Locally Selective Combination ': LSCP( detector_list, contamination=outliers_fraction, random_state=random_state), # 'Connectivity-Based Outlier Factor (COF)': # COF(n_neighbors=35, contamination=outliers_fraction), # 'Subspace Outlier Detection (SOD)': # SOD(contamination=outliers_fraction), } st.subheader('SELECT AN ALGORITHM:') classifier_name = st.selectbox('THE ALGORITHM',[*classifiers])
from combo.models.detector_lscp import LSCP

# Experiment configuration.
contamination = 0.1  # proportion of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate the synthetic two-feature sample data with a fixed seed.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Combine three default-configured base detectors with LSCP.
clf_name = 'LSCP'
detectors = [KNN(), LOF(), OCSVM()]
clf = LSCP(base_estimators=detectors)
clf.fit(X_train)

# Results on the training data, exposed as attributes after fitting.
y_train_scores = clf.decision_scores_  # raw outlier scores
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

# Results on the held-out test data.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
class TestOCSVM(unittest.TestCase):
    """Checks for an ``OCSVM`` detector fitted on generated sample data."""

    def setUp(self):
        # Data set sizes, outlier ratio and the minimum acceptable ROC-AUC.
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        # Every test starts from a detector fitted on the training split.
        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # Each of these attributes must exist and be non-None after fit().
        fitted_attributes = (
            'decision_scores_', 'labels_', 'threshold_', '_mu', '_sigma',
            'support_', 'support_vectors_', 'dual_coef_', 'intercept_',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

        # 'coef_' is deliberately not checked: it is only available for the
        # linear kernel.

    def test_train_scores(self):
        # One training score per training sample.
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # Ranking quality must exceed the configured floor.
        auc = roc_auc_score(self.y_test, test_scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        # Probabilities must stay within [0, 1].
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        # Probabilities must stay within [0, 1].
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method is rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then the two named metrics.
        for scoring_kwargs in ({},
                               {'scoring': 'roc_auc_score'},
                               {'scoring': 'prc_n_score'}):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       **scoring_kwargs)

        # Any other metric name is not implemented.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # The ordering of the scores must be preserved by the ranks.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores),
                        atol=3.5)
        # Ranks are bounded by the training-set size and are non-negative.
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # The ordering of the scores must be preserved by the ranks.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores),
                        atol=3.5)
        # Normalized ranks stay within [0, 1] (with a small tolerance).
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        # Nothing to clean up.
        pass
if __name__ == "__main__":
    # Experiment configuration.
    contamination = 0.1  # proportion of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate the synthetic two-feature sample data with a fixed seed.
    X_train, X_test, y_train, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # Train the one-class SVM detector with its default settings.
    clf_name = 'OneClassSVM'
    clf = OCSVM()
    clf.fit(X_train)

    # Results on the training data, exposed as attributes after fitting.
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

    # Results on the held-out test data.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Report the evaluation for both splits.
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
OCSVM """ num_params = num_search sigmas = np.logspace(-2, 2, num_params) nus = np.linspace(0.01, 0.99, num_params) gridsearch = np.zeros((num_params * num_params, 3)) #sigma rows, q, cols mesh = np.zeros((num_params, num_params)) #perform gridsearch run = 0 i = 0 for sigma in sigmas: j = 0 for nu in nus: gamma = 0.5 / (sigma * sigma) model = OCSVM(nu=nu, gamma=gamma) model.fit(x_train) val_scores = model.decision_function(x_val) auc = metrics.roc_auc_score(y_val, val_scores) gridsearch[run, :] = np.asarray([sigma, nu, auc]) mesh[j, i] = auc run += 1 j += 1 i += 1 best_idx = np.argmax(gridsearch[:, 2]) val_auc = np.max(gridsearch[:, 2]) best_sigma = gridsearch[best_idx, 0] best_nu = gridsearch[best_idx, 1] param_heatmap(methods[3],
class TestOCSVM(unittest.TestCase):
    """Checks for an ``OCSVM`` detector fitted on generated sample data."""

    def setUp(self):
        # Data set sizes, outlier ratio and the minimum acceptable ROC-AUC.
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        generated = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        # Every test starts from a detector fitted on the training split.
        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # Run the estimator-compliance checks on the fitted detector.
        check_estimator(self.clf)

    def test_parameters(self):
        # Each of these attributes must exist and be non-None after fit().
        fitted_attributes = (
            'decision_scores_', 'labels_', 'threshold_', '_mu', '_sigma',
            'support_', 'support_vectors_', 'dual_coef_', 'intercept_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

        # 'coef_' is deliberately not checked: it is only available for the
        # linear kernel.

    def test_train_scores(self):
        # One training score per training sample.
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # One score per test sample.
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # Ranking quality must exceed the configured floor.
        auc = roc_auc_score(self.y_test, test_scores)
        assert_greater(auc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        # Probabilities must stay within [0, 1].
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        # Probabilities must stay within [0, 1].
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # An unknown conversion method is rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then the two named metrics.
        for scoring_kwargs in ({},
                               {'scoring': 'roc_auc_score'},
                               {'scoring': 'prc_n_score'}):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       **scoring_kwargs)

        # Any other metric name is not implemented.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # The ordering of the scores must be preserved by the ranks.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        # Ranks are bounded by the training-set size and are non-negative.
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # The ordering of the scores must be preserved by the ranks.
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        # Normalized ranks stay within [0, 1] (with a small tolerance).
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        # Nothing to clean up.
        pass
# Result -> Performance
# One-class SVM and the other reference anomaly detectors.
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope()),
    ("One-Class SVM", svm.OneClassSVM(kernel="rbf", gamma=0.001)),
    ("Isolation Forest", IsolationForest(random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor()),
]

# Append the PCA features as additional rows of X.
X = np.append(arr=X, values=features_pca, axis=0)
X_num = X.shape[0]

# Heterogeneous base detectors combined by SUOD.
base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)]
model = SUOD(
    base_estimators=base_estimators,
    n_jobs=2,  # number of workers (-1 uses all cores)
    rp_flag_global=True,  # global flag for random projection
    bps_flag=True,  # global flag for balanced parallel scheduling
    approx_flag_global=False,  # global flag for model approximation
    contamination=0.2,
)

# The ensemble is fitted and evaluated on the same, complete data set.
model.fit(X)
model.approximate(X)
predicted_labels = model.predict(X)

# Average the labels along axis 1 (presumably one column per base
# estimator -- confirm against the SUOD version in use). Dividing by the
# actual number of estimators keeps the value correct if the list changes.
sum_labels = np.sum(predicted_labels, axis=1) / len(base_estimators)
'Isolation Forest': IForest(random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(), 'Average KNN': KNN(method='mean'), # 'Median KNN': KNN(method='median', # contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(n_neighbors=35), # 'Local Correlation Integral (LOCI)': # LOCI(contamination=outliers_fraction), 'Minimum Covariance Determinant (MCD)': MCD(random_state=random_state), 'One-class SVM (OCSVM)': OCSVM(), 'Principal Component Analysis (PCA)': PCA(random_state=random_state), # 'Stochastic Outlier Selection (SOS)': SOS( # contamination=outliers_fraction), 'Locally Selective Combination (LSCP)': LSCP(detector_list, random_state=random_state), # 'Connectivity-Based Outlier Factor (COF)': # COF(n_neighbors=35, contamination=outliers_fraction), # 'Subspace Outlier Detection (SOD)': # SOD(contamination=outliers_fraction), } # Show all detectors for i, clf in enumerate(classifiers.keys()): print('Model', i + 1, clf)
def detect_anomaly(df_floats: pd.DataFrame,
                   train_size: float,
                   outliers_rate: float,
                   classifier: str,
                   plot: bool = False):
    """Return binary outlier labels and raw outlier scores.

    Trains an anomaly detection model on a subset of the (min-max scaled)
    dataset and returns the binary label and the decision score of every
    observation in the whole dataset.

    Parameters
    ----------
    df_floats:
        pd.DataFrame with elements as floats; needs at least 8 rows.
    train_size:
        proportion of dataset to be used for training the anomaly detection
        model. Values greater than 1 are treated as percentages and divided
        by 100.
    outliers_rate:
        proportion of training set to be considered outlier. Values greater
        than or equal to 1 are treated as percentages and divided by 100.
    classifier:
        name (or part of the name) of the anomaly detection algorithm; the
        first registered detector whose display name contains this string
        is used.
    plot:
        plots 2d contourf of anomaly detection scores.

    Returns
    -------
    y_labels:
        numpy array of the same length as df_floats that assigns 0/1
        (inlier/outlier) to each observation according to fitted model.
    y_scores:
        numpy array of the same length as df_floats that assigns outlier
        scores to each observation according to fitted model.

    Raises
    ------
    Warning
        If df_floats has fewer than 8 rows, or if ``classifier == 'all'``
        (currently unsupported).
    NameError
        If no registered detector name contains ``classifier``.
    """
    if df_floats.shape[0] < 8:
        # The message states the limit that is actually enforced above.
        raise Warning(
            'Not enough measurements. Please use DataFrame with at least 8 measurements.'
        )

    # Inputs given as percentages are converted to proportions.
    if train_size > 1:
        train_size = train_size / 100
    # TODO: Find out empirical way to set contamination level - Tukey's method
    if outliers_rate >= 1:
        outliers_rate = outliers_rate / 100

    random_state = np.random.RandomState(42)

    # TODO: Perform scaling of data ONLY for AKNN, CBLOF, HBOS, KNN, OCSVM.
    # Other classifiers are not influenced.
    classifiers = {
        'Average KNN (AKNN)':
            KNN(method='mean', contamination=outliers_rate),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_rate,
                  check_estimator=False,
                  random_state=random_state),
        'Copula based Outlier Detection (COPOD)':
            COPOD(contamination=outliers_rate),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_rate),
        'Isolation Forest (IForest)':
            IForest(contamination=outliers_rate, random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_rate),
        'One-Class SVM (OCSVM)':
            OCSVM(contamination=outliers_rate),
        'Principal component analysis (PCA)':
            PCA(contamination=outliers_rate),
    }

    # Scale every column to [0, 1]; the scaler is fitted on the full dataset.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df_floats)
    df_scaled = pd.DataFrame(scaled,
                             index=df_floats.index,
                             columns=df_floats.columns)

    # NOTE: no random_state is passed here, so the split is not seeded by
    # this function.
    x_train, x_test = train_test_split(df_scaled, train_size=train_size)

    if classifier == 'all':
        raise Warning('This option is currently unsupported.'
                      '\nPlease use one of those classifiers:'
                      '\n{}.'.format(list(classifiers.keys())))

    # First registered detector whose display name contains the request.
    clf_name = next((name for name in classifiers if classifier in name), '')
    if not clf_name:
        raise NameError('Unknown classifier. '
                        'Please use one of those: {}.'.format(
                            list(classifiers.keys())))

    clf = classifiers[clf_name]
    clf.fit(x_train)
    y_labels = clf.predict(df_scaled)  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_function(df_scaled)  # raw outlier scores

    if plot:
        plot_outlier_detection(df_scaled, y_labels, clf, clf_name, scaler)

    return y_labels, y_scores
# standardize data to be digestible for most algorithms X = StandardScaler().fit_transform(X) contamination = y.sum() / len(y) ############################################################################## # initialize a set of anomaly detectors base_estimators = [ LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), OCSVM(contamination=contamination), KNN(n_neighbors=5, contamination=contamination), KNN(n_neighbors=15, contamination=contamination), KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), LOF(n_neighbors=5, contamination=contamination), LOF(n_neighbors=15, contamination=contamination), LOF(n_neighbors=25, contamination=contamination), LOF(n_neighbors=35, contamination=contamination), LOF(n_neighbors=45, contamination=contamination), HBOS(contamination=contamination), PCA(contamination=contamination), OCSVM(contamination=contamination),
pred_train_set.append(1) for i in range(train_len_negative,len(negatives)): test_set.append(negatives2[indices_negatives[i]]) pred_test_set.append(1) # print(len(train_set),len(test_set)) import numpy as np train_set=np.array(train_set) test_set=np.array(test_set) from pyod.models.ocsvm import OCSVM from pyod.models.pca import PCA # from pyod.models.mcd import MCD clf1=PCA(standardization = True,contamination=0.2) # clf1 = MCD(assume_centered = True) clf2=OCSVM(kernel = 'poly',nu = 0.25,degree =2,contamination =0.2) # clf2 = OCSVM(kernel = 'linear',nu =0.02) clf1.fit(train_set) clf2.fit(train_set) y_pred_train_pca=clf1.predict(train_set) y_pred_test_pca=clf1.predict(test_set) y_pred_train_ocsvm=clf2.predict(train_set) y_pred_test_ocsvm=clf2.predict(test_set) print(clf1.explained_variance_) # print(y_pred_test_pca,y_pred_test_ocsvm) train_pca_correct=0 train_ocsvm_correct=0 print("TRAIN SET") for i in range(len(pred_train_set)):
# Standardize the features before building the cost-model inputs.
X = StandardScaler().fit_transform(X)

# load the pre-trained model cost predictor
clf = load('rf_predictor.joblib')

# Candidate detectors, keyed by their 1-based identifier.
classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

# Draw the detector identifiers at random (with replacement) and look up
# the matching detector object for every draw.
clfs = np.random.choice(list(range(1, 12)), size=n_estimators_total)
clfs_real = [classifiers[identifier] for identifier in clfs]

# Predictor input: one row per drawn detector, made of the data set's
# (n_rows, n_columns) followed by the one-hot encoded detector identifier.
X_w = indices_to_one_hot(clfs - 1, 11)
X_d1 = np.array([[X.shape[0], X.shape[1]]])
X_d = np.repeat(X_d1, len(clfs), axis=0)
X_c = np.concatenate((X_d, X_w), axis=1)

predict_time = clf.predict(X_c)