def __load_classifiers(self):
    """Build the dictionary of nine PyOD outlier detectors to compare.

    Returns:
        dict: display name -> unfitted PyOD detector instance, all sharing
        the same contamination level.
    """
    fraction = 0.05  # assumed share of outliers in the data
    rng = np.random.RandomState(0)  # fixed seed for reproducibility
    return {
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=fraction, check_estimator=False,
                  random_state=rng),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35), contamination=fraction,
                           random_state=rng),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=fraction),
        'Isolation Forest':
            IForest(contamination=fraction, random_state=rng),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=fraction),
        'Average KNN':
            KNN(method='mean', contamination=fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=fraction, random_state=rng),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=fraction),
    }
def train():
    """Train a SUOD anomaly-detection ensemble and log it to MLflow.

    Builds seven LOF/PCA/KNN base detectors, fits a SUOD meta-model on the
    generated dataset, votes over the per-detector predictions, and logs
    the ROC-AUC plus the model artifact to the active MLflow run.
    """
    dataset = get_data(1000, 10, 100)
    contamination = 0.01
    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)
        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        # ground truth: first 1000 rows inliers, last 10 rows outliers
        true_labels = [0] * 1000 + [1] * 10
        # BUG FIX: roc_auc_score expects (y_true, y_score); the original
        # passed the predictions first, silently computing the wrong AUC.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
def setUp(self):
    """Load the cardio.mat benchmark (or synthetic fallback data), split
    it 60/40, and fit a default SUOD ensemble for the tests.

    Sets: X_train/X_test/y_train/y_test, base_estimators, clf, roc_floor.
    """
    # Define data file and read X and y
    # Generate some data if the source data is missing
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'cardio.mat'
    try:
        mat = loadmat(path.join(this_directory, 'data', mat_file))
    except (TypeError, IOError):
        # CLEANUP: the original had two byte-identical except blocks for
        # TypeError and IOError; they are merged into one handler.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()
        X, y = check_X_y(X, y)
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)
    self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
    self.clf = SUOD(base_estimators=self.base_estimators)
    self.clf.fit(self.X_train)
    self.roc_floor = 0.7
def load_classifiers(outliers_fraction):
    """Return ten named, unfitted PyOD detectors sharing one contamination.

    :param outliers_fraction: expected outlier share; capped at 0.5
    :return: dict mapping display name -> detector instance
    """
    fraction = min(0.5, outliers_fraction)  # PyOD caps contamination at 0.5
    rng = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=fraction)
    classifiers['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        contamination=fraction, check_estimator=False, random_state=rng)
    classifiers['Feature Bagging'] = FeatureBagging(
        LOF(n_neighbors=35), contamination=fraction, random_state=rng)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=fraction)
    classifiers['Isolation Forest'] = IForest(
        contamination=fraction, random_state=rng, behaviour="new")
    classifiers['K Nearest Neighbors (KNN)'] = KNN(contamination=fraction)
    classifiers['Average KNN'] = KNN(method='mean', contamination=fraction)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        n_neighbors=35, contamination=fraction)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=fraction, random_state=rng)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(contamination=fraction)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=fraction, random_state=rng)
    return classifiers
def setUp(self):
    """Create a standardized train/test split and fit an LSCP ensemble
    of two LOF detectors for the tests."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    data = generate_data(n_train=self.n_train, n_test=self.n_test,
                         contamination=self.contamination, random_state=42)
    self.X_train, self.y_train, self.X_test, self.y_test = data
    # zero-mean / unit-variance scaling fitted on the training split
    self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
    self.detector_list = [LOF(), LOF()]
    self.clf = LSCP(self.detector_list, contamination=self.contamination)
    self.clf.fit(self.X_train)
def lof_pyod_once(X_nor, X_test, y_test, n_neighbors, contamination=0.05):
    """Fit PyOD's LOF on normal-only data and evaluate on a labelled test set.

    :param X_nor: DataFrame of normal (inlier) training samples
    :param X_test: test samples
    :param y_test: ground-truth test labels (0 inlier / 1 outlier)
    :param n_neighbors: LOF neighbourhood size
    :param contamination: assumed outlier fraction (sets the threshold)
    :return: (tpr, fpr, auc, score_table)
    """
    detector = LOF(n_neighbors=n_neighbors, contamination=contamination)
    train_matrix = X_nor.astype(float).values.copy()
    detector.fit(train_matrix)  # threshold is fixed once fitting completes
    y_pred = detector.predict(X_test)
    # raw anomaly scores; NaNs replaced so roc_auc_score cannot choke
    score_table = np.nan_to_num(detector.decision_function(X_test), copy=True)
    # confusion-matrix derived rates
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    tprW = tp / (tp + fn)
    fprW = fp / (tn + fp)
    # AUC from the continuous scores, not the thresholded labels
    auc = roc_auc_score(y_test, score_table)
    return tprW, fprW, auc, score_table
def __call__(self):
    """Fit an LOF model on buggy-run CSV data extended with all original-run
    data found under ``self.data_orig_dir``, then delegate to ``self.predict``.

    Returns whatever ``self.predict(clf)`` yields for the fitted model.
    Raises:
        ModelError: when the fitted model labels no sample as an inlier.
    """
    # contamination=0.1: expect roughly 10% of training rows to be outliers
    clf = LOF(contamination=0.1)
    buggy_enter_csv = self.get_file(
        join(self.data_buggy_dir, '*_ENTER.csv'))
    buggy_exit_csv = self.get_file(join(self.data_buggy_dir, '*_EXIT.csv'))
    data = self.get_data(buggy_enter_csv, buggy_exit_csv)
    # extend data with self.data_orig_dir
    for cur_dir, dirs, files in os.walk(self.data_orig_dir):
        for f_dir in dirs:
            enter_csv = self.get_file(join(cur_dir, f_dir, '*_ENTER.csv'))
            exit_csv = self.get_file(join(cur_dir, f_dir, '*_EXIT.csv'))
            ext_data = self.get_data(enter_csv, exit_csv)
            logger.debug('shape of data: {}'.format(data.shape))
            logger.debug('shape of ext_data: {}'.format(ext_data.shape))
            # stack the per-directory rows beneath the buggy-run rows
            data = np.concatenate((data, ext_data), axis=0)
    logger.debug('shape of data: {}'.format(data.shape))
    clf.fit(data)
    train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    unique, counts = np.unique(train_pred, return_counts=True)
    logger.debug('unique (train): {}'.format(unique))
    logger.debug('counts (train): {}'.format(counts))
    if 0 not in unique:
        raise ModelError('Model contains no inlier')
    # np.unique returns sorted values, so once 0 is present counts[0]
    # is the inlier count and counts[1] (if any) the outlier count
    inliers_size = counts[0]
    outliers_size = counts[1] if len(counts) > 1 else 0
    logger.debug('num of inliers: {}'.format(inliers_size))
    logger.debug('num of outliers: {}'.format(outliers_size))
    return self.predict(clf)
def run_LOF_base_detector(data, k, metric='euclidean', p=2):
    """
    Fit the LOF base detector on `data` and return the fitted classifier.

    Input:
        - data: pd.DataFrame, to run LOF on
        - k: integer, parameter to indicate the amount of neighbours to
          include in relative density determination
        - metric: string, distance metric to use, default `euclidean`
        - p: int, Minkowski power parameter; only meaningful when
          metric='minkowski' (default 2 matches the euclidean case)

    Output:
        - clf of class pyod.models.lof.LOF with all its properties, with an
          extra `true_labels_` attribute holding the `outlier` ground truth
    """
    # Split data in values and targets: some datasets have an ID column,
    # others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)
    # BUG FIX: the `metric` parameter was silently ignored -- LOF was
    # constructed with the hard-coded string 'euclidean' regardless of
    # what the caller passed.
    clf = LOF(n_neighbors=k, metric=metric, p=p)
    clf.fit(X)  # Fit only on features
    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']
    # Return the classifier for further processing
    return clf
def define_classifiers(random_state, outliers_fraction):
    """Return ten named, unfitted PyOD detectors configured with a common
    contamination level and, where supported, the given random state.

    :param random_state: seed/state forwarded to stochastic detectors
    :param outliers_fraction: expected outlier share for every detector
    :return: dict mapping display name -> detector instance
    """
    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(
        contamination=outliers_fraction)
    classifiers['Cluster-based Local Outlier Factor'] = CBLOF(
        contamination=outliers_fraction, check_estimator=False,
        random_state=random_state)
    classifiers['Feature Bagging'] = FeatureBagging(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(
        contamination=outliers_fraction)
    classifiers['Isolation Forest'] = IForest(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['K Nearest Neighbors (KNN)'] = KNN(
        contamination=outliers_fraction)
    classifiers['Local Outlier Factor (LOF)'] = LOF(
        contamination=outliers_fraction)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(
        contamination=outliers_fraction, random_state=random_state)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(
        contamination=outliers_fraction)
    classifiers['Principal Component Analysis (PCA)'] = PCA(
        contamination=outliers_fraction, random_state=random_state)
    return classifiers
def train_model(station: Station) -> LSCP:
    """Fit an LSCP ensemble on a station's training observations.

    :param station: station whose observations flagged `is_training` are used
    :return: the fitted LSCP model
    """
    t1 = time.time()
    log.info(f'Training model for {station}...')
    log.info('Loading training observations')
    observations_select = Observation.select(
        Observation.time,
        Observation.sample_frequency,
        Observation.sample_count,
        Observation.rms,
        Observation.crest,
        Observation.peak_to_peak,
        Observation.kurtosis,
    ).where(Observation.station == station, Observation.is_training)
    # feature vector per observation: [rms, peak_to_peak, kurtosis, crest]
    obs_data = []
    for observation in observations_select:
        obs_data.append([
            observation.rms,
            observation.peak_to_peak,
            observation.kurtosis,
            observation.crest
        ])
    log.info('Fitting LSCP model')
    # BUG FIX: `[KNN()] * 5` repeats the SAME detector object five times,
    # so LSCP's "ensemble" held only three distinct detectors; build
    # independent instances instead.
    detectors = ([KNN() for _ in range(5)]
                 + [LOF() for _ in range(5)]
                 + [PCA() for _ in range(5)])
    lscp = LSCP(detectors, contamination=0.03)
    lscp.fit(X=obs_data)
    log.info(f'Trained model in {time.time() - t1}')
    return lscp
def obj_func_LOF(params):
    """Objective for Bayesian optimization of LOF hyperparameters.

    Loads the cached training arrays from disk, fits LOF with the supplied
    hyperparameters, and scores the fit via the project's R-precision and
    objective helpers.

    :param params: sequence [outlier_fraction, n_neighbors, algorithm,
        leaf_size]
    :return: scalar objective value from objVal_f
    """
    outlier_fraction, n_neighbors = params[0], params[1]
    algorithm, leaf_size = params[2], params[3]
    # load data set to function work space
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')
    # create model with the candidate hyperparameters
    detector = LOF(n_neighbors=n_neighbors,
                   algorithm=algorithm,
                   leaf_size=leaf_size,
                   contamination=outlier_fraction)
    detector.fit(X_train)
    # predict raw anomaly scores (negated)
    scores_pred = detector.decision_function(X_train) * -1
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)
    # outlier / inlier category per training point
    y_pred = detector.predict(X_train)
    return objVal_f(Rprecision, y_pred, Y_train)
def calculate_LOF(given_DT, given_neighbors):
    """Fit LOF on the given data and return its training outlier scores.

    :param given_DT: tabular data convertible to a DataFrame
    :param given_neighbors: LOF neighbourhood size
    :return: array of decision scores from the fitted detector
    """
    feature_matrix = pd.DataFrame(given_DT).values
    detector = LOF(n_neighbors=given_neighbors)
    detector.fit(feature_matrix)
    # training-set scores computed during fit (higher = more anomalous)
    return detector.decision_scores_
def anomaly_detection(data, label):
    """Fit three unsupervised detectors (Isolation Forest, AutoEncoder, LOF)
    on the standardized numeric columns of `data`, print each model's
    ROC-AUC against the `label` column, plot score distributions, and
    return the truth plus the three prediction vectors.

    :param data: pandas DataFrame containing features and the target column
    :param label: name of the binary target column (1 = anomaly)
    :return: (y, ifo_pred, ae_pred, lof_pred)
    """
    # keep numeric columns only; the target is removed from the features
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)
    # standardize features before fitting the detectors
    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)
    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_  # binary labels assigned during fit
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y, ifo.decision_scores_, bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))
    # symmetric bottleneck autoencoder; reconstruction error is the score
    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y, ae.decision_scores_, bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))
    # Too long to train, under-sample needed
    # n_neighbors scaled to ~1.3x the positive count in the labels
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y, lof.decision_scores_, bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))
    return y, ifo_pred, ae_pred, lof_pred
def construct_raw_base_estimators():
    """Build the heterogeneous pool of unfitted PyOD base detectors.

    The pool covers KNN (largest & mean) and LOF over a neighbour grid,
    one-class SVMs over a nu grid, and isolation forests of varying size,
    all with contamination fixed at 0.05.
    """
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.cblof import CBLOF
    from pyod.models.hbos import HBOS
    from pyod.models.iforest import IForest
    from pyod.models.abod import ABOD
    from pyod.models.ocsvm import OCSVM

    estimators = []
    # predefined range of n_neighbors for KNN, AvgKNN, and LOF
    for k in (3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100):
        estimators += [
            KNN(n_neighbors=k, method="largest", contamination=0.05),
            KNN(n_neighbors=k, method="mean", contamination=0.05),
            LOF(n_neighbors=k, contamination=0.05),
        ]
    # predefined range of nu for one-class svm
    for nu in (0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99):
        estimators.append(OCSVM(nu=nu, contamination=0.05))
    # predefined range for number of estimators in isolation forests
    for n in (10, 20, 50, 70, 100, 150, 200, 250):
        estimators.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))
    return estimators
def train_model(X, Y, contamination, name, from_scratch=True):
    """Train (or load) a named PyOD model and persist it under ./model.

    :param X: training features
    :param Y: training labels (used only by the supervised 'xgbod' model)
    :param contamination: expected outlier fraction for the detector
    :param name: one of 'ocsvm', 'iforest', 'lof', 'knn', 'xgbod'
    :param from_scratch: fit and save a new model when True, else load
    :return: the fitted (or loaded) model
    :raises ValueError: if `name` is not a recognised model key
    """
    model_dir = './model'
    # exist_ok avoids a race between the existence check and mkdir
    os.makedirs(model_dir, exist_ok=True)
    file_name = name + '.pkl'
    if from_scratch:
        # map model keys to constructors; all unsupervised except XGBOD
        constructors = {
            'ocsvm': OCSVM,
            'iforest': IForest,
            'lof': LOF,
            'knn': KNN,
            'xgbod': XGBOD,
        }
        if name not in constructors:
            # BUG FIX: an unknown name previously fell through with `model`
            # unbound and crashed with NameError at save(); fail loudly.
            raise ValueError('unknown model name: {}'.format(name))
        model = constructors[name](contamination=contamination)
        if name == 'xgbod':
            model.fit(X, Y)  # XGBOD is semi-supervised and needs labels
        else:
            model.fit(X)
        save(model, model_dir, file_name)
    else:
        model = load(model_dir, file_name)
    return model
def main():
    """Run the anomaly-detection benchmark over every scaler setting,
    parallelising the per-scaler runs with joblib."""
    # scaler settings passed through to runByScaler
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'  # dataset root directory (sic)
    start = 0
    counts = 90
    CPUS = 3          # parallel jobs across scalers
    CPUS_Models = 4   # parallel jobs inside runByScaler
    # names of models handled via the "other_models" path in runByScaler
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    # display name -> unfitted model instance (mixture of sklearn, PyOD
    # and project-local detectors)
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"  # tag used when saving results
    # one joblib worker per scaler; each worker fans out over models
    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points using an Isolation Forest.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # Other PyOD candidates evaluated for this task: ABOD, CBLOF,
    # FeatureBagging, HBOS, KNN, LOF, MCD, OCSVM, PCA, XGBOD.
    # (The original built all of them in an unused dict; removed as dead
    # code -- Isolation Forest is the detector actually used.)
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    y_pred = clf.predict(x_raw)
    # BUG FIX: iterate over the actual number of samples instead of the
    # hard-coded range(0, 1212), so any dataset size works.
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def setUp(self):
    """Generate a labelled train/test split and fit a plain LOF detector."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    # NOTE: unpack order (X_train, X_test, y_train, y_test) matches the
    # original call site and is preserved as-is.
    (self.X_train, self.X_test,
     self.y_train, self.y_test) = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)
    self.clf = LOF(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    """Generate a small labelled dataset (no fixed seed) and fit LOF."""
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination)
    self.clf = LOF(contamination=self.contamination)
    self.clf.fit(self.X_train)
def create_ensemble_LOF(ensemble_combinations, pca):
    """Enumerate LOF-triple aggregator specs for every combination method.

    For each aggregation method, every strictly increasing triple of
    neighbour counts (i < j < k) drawn from [3, pca] yields one model spec.

    :param ensemble_combinations: aggregation method names to sweep
    :param pca: inclusive upper bound of the neighbour grid
    :return: list of model-spec dicts
    """
    model_list = []
    for method in ensemble_combinations:
        for i in range(3, pca + 1):
            for j in range(i + 1, pca + 1):
                for k in range(j + 1, pca + 1):
                    model_list.append({
                        "model": SimpleDetectorAggregator,
                        "supervised": False,
                        "parameters": {
                            "method": method,
                            "base_estimators": [
                                LOF(n_neighbors=i),
                                LOF(n_neighbors=j),
                                LOF(n_neighbors=k),
                            ],
                        },
                    })
    print(len(model_list))
    return model_list
def ranger(parameter, classifier):
    """Instantiate the requested PyOD detector with `parameter` as its
    classifier-specific tuning knob.

    Relies on module-level `outliers_fraction` and `random_state`.

    :param parameter: value for the detector's tunable argument
        (n_clusters / n_bins / n_neighbors, depending on `classifier`)
    :param classifier: one of 'CBLOF', 'HBOS', 'KNN', 'LOF'
    :return: an unfitted detector instance
    :raises KeyError: if `classifier` is not a recognised key
    """
    # FIX: the original eagerly constructed all four detectors with the
    # same parameter value, even though each knob is only valid for one
    # detector (e.g. an HBOS bin count fed to CBLOF as n_clusters).
    # Lazy factories build only the detector actually requested.
    factories = {
        'CBLOF': lambda p: CBLOF(contamination=outliers_fraction,
                                 check_estimator=False,
                                 random_state=random_state,
                                 n_clusters=p),
        'HBOS': lambda p: HBOS(contamination=outliers_fraction, n_bins=p),
        'KNN': lambda p: KNN(contamination=outliers_fraction, n_neighbors=p),
        'LOF': lambda p: LOF(n_neighbors=p, contamination=outliers_fraction),
    }
    return factories[classifier](parameter)
def getOutlierLOF(dataset):
    '''
    @brief Runs the LOF algorithm on the dataset and returns a label per
    instance indicating inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return List of labels: 0 means inlier, 1 means outlier
    '''
    detector = LOF()
    detector.fit(dataset)
    # labels_ is assigned during fit: 0 = inlier, 1 = outlier
    return detector.labels_
def __init__(self, *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    """Initialise the primitive and build the wrapped PyOD LOF detector
    from the supplied hyperparameters.

    :param hyperparams: primitive hyperparameters (contamination,
        n_neighbors, algorithm, leaf_size, metric, p, metric_params)
    :param random_seed: seed forwarded to the base primitive
    :param docker_containers: optional docker container mapping
    """
    # BUG FIX: `random_seed` appeared commented out of the signature but
    # is forwarded to super().__init__ below, which would raise a
    # NameError; it is restored as a real keyword-only parameter.
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)
    self._clf = LOF(contamination=hyperparams['contamination'],
                    n_neighbors=hyperparams['n_neighbors'],
                    algorithm=hyperparams['algorithm'],
                    leaf_size=hyperparams['leaf_size'],
                    metric=hyperparams['metric'],
                    p=hyperparams['p'],
                    metric_params=hyperparams['metric_params'],
                    )
    return
def setUp(self):
    """Generate labelled data and fit a maximization-combining aggregator
    over KNN, LOF and OCSVM base detectors."""
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)
    base = [KNN(), LOF(), OCSVM()]
    self.clf = SimpleDetectorAggregator(base_estimators=base,
                                        method='maximization',
                                        contamination=self.contamination)
    self.clf.fit(self.X_train)
def get_model_lof(percentage_of_outliers=0.002, num_neighbors=2):
    """Create a LOF model.

    Args:
        percentage_of_outliers: percentage of fraud on data
        num_neighbors: number of neighbors for kneighbors queries

    Returns:
        model: unfitted LOF model
    """
    # record this call in the project log (module :: function)
    utils.save_log('{0} :: {1}'.format(get_model_lof.__module__,
                                       get_model_lof.__name__))
    return LOF(contamination=percentage_of_outliers,
               n_neighbors=num_neighbors,
               n_jobs=config.num_jobs)
def out_lier_score(df, target, num_var):
    """Run seven PyOD detectors on the min-max scaled numeric columns and
    collect their binary outlier predictions, one column per detector.

    :param df: input DataFrame
    :param target: name of the target column (passed to fit_transform)
    :param num_var: list of numeric feature columns to scale and score
    :return: DataFrame of 0/1 predictions, one column per detector name
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(df.loc[:, num_var], df[target])
    rng = np.random.RandomState(42)
    fraction = 0.05
    # Define seven outlier detection tools to be compared
    detectors = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=fraction, check_estimator=False,
                  random_state=rng),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35), contamination=fraction,
                           check_estimator=False, random_state=rng),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=fraction),
        'Isolation Forest':
            IForest(contamination=fraction, random_state=rng),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=fraction),
        'Average KNN':
            KNN(method='mean', contamination=fraction),
    }
    predictions = []
    for clf in detectors.values():
        clf.fit(X)
        # raw anomaly score (negated); computed for parity with the
        # original implementation even though only predictions are kept
        _scores = clf.decision_function(X) * -1
        # 0/1 prediction of a datapoint category: inlier or outlier
        predictions.append(clf.predict(X).tolist())
    df_out_score = pd.DataFrame(predictions).T
    df_out_score.columns = list(detectors.keys())
    return df_out_score
def setUp(self):
    """Generate labelled data and configure a SUOD ensemble that uses
    pre-trained cost-forecast models for balanced parallel scheduling."""
    self.n_train = 1000
    self.n_test = 500
    self.contamination = 0.1
    self.roc_floor = 0.6  # minimum ROC-AUC the tests will accept
    self.random_state = 42
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=self.random_state)
    # heterogeneous pool: LOF at several neighbourhood sizes, HBOS, PCA,
    # and a nested LSCP combiner of two LOFs
    self.base_estimators = [
        LOF(n_neighbors=5, contamination=self.contamination),
        LOF(n_neighbors=15, contamination=self.contamination),
        LOF(n_neighbors=25, contamination=self.contamination),
        LOF(n_neighbors=35, contamination=self.contamination),
        LOF(n_neighbors=45, contamination=self.contamination),
        HBOS(contamination=self.contamination),
        PCA(contamination=self.contamination),
        LSCP(detector_list=[
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination)
        ], random_state=self.random_state)
    ]
    # joblib cost-forecast files shipped alongside this test module
    this_directory = os.path.abspath(os.path.dirname(__file__))
    self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                               'bps_train.joblib')
    self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                'bps_prediction.joblib')
    self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                      rp_flag_global=True, bps_flag=True,
                      contamination=self.contamination,
                      approx_flag_global=True,
                      cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                      cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                      verbose=True)
def outlier_score(x_train, y_train, algorithm=None, method="unify"):
    """Compute per-instance outlier probabilities and labels, class by class.

    The detector is re-fitted on the members of each class in turn, so each
    instance is judged only against its own class.

    :param x_train: feature rows (indexable, convertible to np.array)
    :param y_train: integer class labels 0 .. n_classes-1
    :param algorithm: PyOD detector to use; a fresh LOF() when None
    :param method: probability unification method for predict_proba
    :return: (outlier_values, outlier_labels) arrays aligned with x_train
    """
    # BUG FIX: the original declared `algorithm=LOF()` as the default.
    # Default arguments are evaluated once at definition time, so every
    # call shared -- and re-fitted -- the same detector instance.
    if algorithm is None:
        algorithm = LOF()
    number_of_instances = len(x_train)
    outlier_values = np.zeros(number_of_instances)
    outlier_labels = np.zeros(number_of_instances)
    number_of_classes = len(set(y_train))
    for i in range(number_of_classes):
        # indices of the instances belonging to class i
        members = [j for j in range(number_of_instances) if y_train[j] == i]
        class_x = np.array([x_train[j] for j in members])
        algorithm.fit(X=class_x)
        # column 1 of predict_proba is the outlier probability
        partial_values = algorithm.predict_proba(X=class_x,
                                                 method=method)[:, 1]
        partial_labels = algorithm.labels_
        # scatter the per-class results back to global positions
        for t, index in enumerate(members):
            outlier_values[index] = partial_values[t]
            outlier_labels[index] = partial_labels[t]
    return outlier_values, outlier_labels.astype('int')
def create_tunable_ensemble(knn_neighbors, lof_neighbors, abod_neighbors):
    """Build averaging-aggregator specs over the KNN/LOF/ABOD grid.

    One spec is produced per (knn, lof, abod) neighbour combination; every
    ensemble also carries a default OCSVM as a fourth base detector.

    :return: list of model-spec dicts
    """
    return [
        {
            "model": SimpleDetectorAggregator,
            "supervised": False,
            "parameters": {
                "method": "average",
                "base_estimators": [
                    KNN(n_neighbors=kn),
                    LOF(n_neighbors=ln),
                    ABOD(n_neighbors=an),
                    OCSVM(),
                ],
            },
        }
        for kn in knn_neighbors
        for ln in lof_neighbors
        for an in abod_neighbors
    ]
def choose_model(model, nnet):
    """Pick one PyOD detector by name, among those implemented in PyOD.

    :param model: key naming the detector ('AE', 'VAE', 'ABOD', ...)
    :param nnet: hidden-layer sizes; used by 'AE' directly and split into
        encoder (nnet[:5]) / decoder (nnet[4:]) halves for 'VAE'
    :return: a freshly constructed, unfitted detector
    :raises KeyError: if `model` is not a recognised key (same as original)
    """
    # FIX: the original eagerly instantiated all 17 detectors (including
    # the Keras-backed AutoEncoder and VAE) just to return one of them.
    # Lazy factories construct only the requested model.
    factories = {
        'AE': lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1,
                                  epochs=15),
        'VAE': lambda: VAE(encoder_neurons=nnet[:5],
                           decoder_neurons=nnet[4:],
                           contamination=0.1, epochs=13),
        'ABOD': ABOD,
        'FeatureBagging': FeatureBagging,
        'HBOS': HBOS,
        'IForest': IForest,
        'KNN': KNN,
        'LOF': LOF,
        'OCSVM': OCSVM,
        'PCA': PCA,
        'SOS': SOS,
        'COF': COF,
        'CBLOF': CBLOF,
        'SOD': SOD,
        'LOCI': LOCI,
        'MCD': MCD
    }
    return factories[model]()