def train():
    """Fit a SUOD ensemble on synthetic data and log results to MLflow.

    Builds a heterogeneous pool of LOF/PCA/KNN base detectors, trains the
    SUOD accelerated ensemble, combines per-detector binary predictions by
    majority vote, scores the vote with ROC-AUC against the known labels,
    and logs the metric plus the fitted model to the active MLflow run.
    """
    # get_data(1000, 10, 100): presumably 1000 inliers, 10 outliers,
    # 100 features -- TODO confirm against get_data's definition; the
    # hard-coded true_labels below relies on that layout.
    dataset = get_data(1000, 10, 100)
    contamination = 0.01

    with mlflow.start_run():
        # Heterogeneous pool of unsupervised detectors with a spread of
        # neighborhood sizes.
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]

        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True,        # enable random projection
                     bps_flag=True,              # balanced parallel scheduling
                     approx_flag_global=False,   # model approximation disabled
                     contamination=contamination)

        model.fit(dataset)
        # approximate() is a no-op here since approx_flag_global is False,
        # but keeping the call mirrors the standard SUOD workflow.
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)

        # Ground truth mirrors get_data(1000, 10, ...): inliers (0) first,
        # then the 10 outliers (1).
        true_labels = [0] * 1000 + [1] * 10

        # BUG FIX: sklearn's roc_auc_score signature is (y_true, y_score);
        # the original call passed the arguments in reversed order, which
        # silently computes the wrong metric.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(
            auc_score))

        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model",
                                 conda_env="conda.yaml")
bps_flag=True, contamination=contamination, approx_flag_global=True) start = time.time() model.fit(X_train) # fit all models with X print('Fit time:', time.time() - start) print() start = time.time() model.approximate(X_train) # conduct model approximation if it is enabled print('Approximation time:', time.time() - start) print() start = time.time() predicted_labels = model.predict(X_test) # predict labels print('Predict time:', time.time() - start) print() start = time.time() predicted_scores = model.decision_function(X_test) # predict scores print('Decision Function time:', time.time() - start) print() ########################################################################## # compare with no projection, no bps, and no approximation print("******************************************************************") start = time.time() n_estimators = len(base_estimators) n_estimators_list, starts, n_jobs = _partition_estimators( n_estimators, n_jobs)
# Append the PCA-derived feature rows to X, then score every sample with a
# small SUOD ensemble and visualize the verdict in 3-D.
X = np.append(arr=X, values=features_pca, axis=0)
X_num = X.shape[0]  # total number of samples after the append

# Three heterogeneous unsupervised detectors.
base_estimators = [LOF(), IForest(), OCSVM(kernel="rbf", gamma=0.001)]

model = SUOD(
    base_estimators=base_estimators,
    n_jobs=2,                  # number of workers (-1 would use all cores)
    rp_flag_global=True,       # global flag for random projection
    bps_flag=True,             # global flag for balanced parallel scheduling
    approx_flag_global=False,  # global flag for model approximation
    contamination=0.2)

model.fit(X)
# No-op here since approx_flag_global is False; kept to mirror the
# standard SUOD workflow.
model.approximate(X)
predicted_labels = model.predict(X)  # per-detector 0/1 labels, one column each

# Majority vote across the 3 detectors: fraction of detectors flagging each
# sample, thresholded at 0.5 into -1 (abnormal) / 1 (normal).
sum_labels = np.sum(predicted_labels, axis=1) / 3
sum_labels = np.where(sum_labels >= 0.5, -1, 1)  # -1 abnormal, 1 normal
result_label = np.average(sum_labels)
result_label = result_label.tolist()  # plain Python float of the mean vote

# 3-D scatter of the first three feature columns: (label + 1) // 2 maps
# -1 -> colors[0] = 'r' (abnormal) and 1 -> colors[1] = 'b' (normal).
fig = plt.figure()
colors = np.array(['r', 'b'])
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], color=colors[(sum_labels + 1) // 2])
class TestBASE(unittest.TestCase):
    """End-to-end smoke tests for the SUOD model on synthetic data.

    Each test exercises one stage of the standard SUOD workflow
    (fit -> approximate -> predict / decision_function) on data produced
    by ``generate_data``.
    """

    def setUp(self):
        # Synthetic data configuration.
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)

        # Heterogeneous detector pool, including a nested LSCP ensemble.
        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)],
                random_state=self.random_state)
        ]

        # Pre-trained cost-forecast regressors used by balanced parallel
        # scheduling (bps) for the fit and prediction phases; shipped next
        # to this test file.
        this_directory = os.path.abspath(os.path.dirname(__file__))
        self.cost_forecast_loc_fit_ = os.path.join(
            this_directory, 'bps_train.joblib')
        self.cost_forecast_loc_pred_ = os.path.join(
            this_directory, 'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators, n_jobs=2,
                          rp_flag_global=True, bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_)

    def test_initialization(self):
        """get_params/set_params round-trip works on the model."""
        self.model.get_params()
        self.model.set_params(**{'n_jobs': 4})

    def test_fit(self):
        """Fitting on the training split completes without error."""
        # NOTE(review): the original docstring here said "Test base class
        # initialization", which described test_initialization instead.
        self.model.fit(self.X_train)

    def test_approximate(self):
        """Model approximation after fit completes without error."""
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)

    def test_predict(self):
        """predict() runs on the test split after fit + approximate."""
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.predict(self.X_test)

    def test_decision_function(self):
        """decision_function() runs on the test split after fit + approximate."""
        self.model.fit(self.X_train)
        self.model.approximate(self.X_train)
        self.model.decision_function(self.X_test)
KNN(n_neighbors=25, contamination=contamination), KNN(n_neighbors=35, contamination=contamination), KNN(n_neighbors=45, contamination=contamination), IForest(n_estimators=50, contamination=contamination), IForest(n_estimators=100, contamination=contamination), LSCP(detector_list=[LOF(contamination=contamination), LOF(contamination=contamination)]) ] model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True, contamination=contamination, approx_flag_global=False) model.fit(X) # fit all models with X model.approximate(X) # conduct model approximation if it is enabled predicted_labels = model.predict(X) # predict labels on X; for demo purpose only predicted_scores = model.decision_function(X) # predict scores on X; for demo purpose only # %% evaluate_print('majority vote', y, majority_vote(predicted_labels)) evaluate_print('average', y, average(predicted_scores)) evaluate_print('maximization', y, maximization(predicted_scores)) clf = LOF() clf.fit(X) evaluate_print('LOF', y, clf.decision_scores_) clf = IForest() clf.fit(X) evaluate_print('IForest', y, clf.decision_scores_)