def main():
    """Demo: train a PyOD AutoEncoder on synthetic data and inspect test-set anomaly scores."""
    plt.close('all')
    matplotlib.use('Qt5Agg')  # override PyCharm pro's scientific view
    create_links()
    warnings.showwarning = silence_warnings

    contamination = 0.1  # percentage of outliers
    n_train = 500        # number of training points
    n_test = 500         # number of testing points
    n_features = 25      # number of features

    X_test, y_test, X_train, y_train = _generate_random_data(
        contamination, n_features, n_test, n_train)
    # X_test, y_test, X_train, y_train = ?
    _plot_using_pca(X_train, y_train)

    # Bottleneck architecture: 25 -> 2 -> 2 -> 25.
    clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
    clf1.fit(X_train)
    y_train_scores = clf1.decision_scores_

    # Predict the anomaly scores; higher score == more anomalous.
    y_test_scores = pd.Series(clf1.decision_function(X_test))

    # Plot anomaly scores.
    plt.hist(y_test_scores, bins='auto')
    plt.title("Histogram for Model Clf1 Anomaly Scores")
    plt.show()

    # Manually chosen cut point separating inliers (0) from outliers (1).
    manual_score_thres = 4
    df_test = X_test.copy()
    df_test['score'] = y_test_scores
    # assign cluster=0 to samples with low anomaly score, cluster=1 to high.
    df_test['cluster'] = np.where(df_test['score'] < manual_score_thres, 0, 1)
    df_test['cluster'].value_counts()
    df_test.groupby('cluster').mean()
    print(df_test)
def ele_outliers(num):
    """Run 10-fold CV: fit an AutoEncoder on mice flows only, flag elephants as outliers.

    Parameters
    ----------
    num : int
        Dataset index forwarded to ``load_data``.

    NOTE(review): relies on module-level globals ``ALL_DATA_TYPE``,
    ``ALL_TRAIN_TYPE``, ``epochs`` and ``conta`` — confirm they are defined.
    """
    dataSetType = ALL_DATA_TYPE[0]
    trainType = ALL_TRAIN_TYPE[1]
    X, yc = load_data(dataSetType, trainType, num)

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = yc[train_index], yc[test_index]

        # Train only on mice (label 0); elephants (label 1) should surface
        # as outliers at predict time. (Removed the unused X_train_ele split.)
        X_train_mice = X_train[y_train == 0]

        clf_name = 'AutoEncoder'
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs,
                          contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)

        y_pred_test = clf.predict(X_test)              # 0 = inlier, 1 = outlier
        y_pred_scores = clf.decision_function(X_test)  # raw outlier scores

        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)

    final_report = get_avg_report(report_list)
    print("final report", final_report)
class AutoEncoderODD(abstract_occ_model):
    """PyOD AutoEncoder wrapped behind the one-class-classifier interface.

    Prediction labels are translated from PyOD's convention
    (0 = inlier, 1 = outlier) to the OCC convention (1 = inlier, -1 = outlier).
    """

    def __init__(self, hidden_neurons, nu, epochs, batch_size=32,
                 output_activation='sigmoid'):
        self.model = AutoEncoder(hidden_neurons=hidden_neurons,
                                 contamination=nu,
                                 epochs=epochs,
                                 batch_size=batch_size,
                                 validation_size=0,
                                 output_activation=output_activation)

    def fit(self, X):
        """Fit the underlying autoencoder on X."""
        self.model.fit(X)

    def predict(self, X):
        """Predict labels for X, remapped to 1 (inlier) / -1 (outlier)."""
        raw = self.model.predict(X)
        remapped = np.where(raw == 1.0, -1, raw)
        return np.where(raw == 0.0, 1, remapped)

    def score_samples(self, X):
        """Return normality scores: higher means more normal (negated outlier score)."""
        return -self.model.decision_function(X)
# ---------------------------------------------------------------------------
# Example: train PyOD's AutoEncoder on synthetic data and evaluate it.
# NOTE(review): `contamination` must already be defined above this chunk.
# ---------------------------------------------------------------------------
n_train = 20000   # number of training points
n_test = 2000     # number of testing points
n_features = 300  # number of features

# Generate sample data
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train, n_test=n_test, n_features=n_features,
    contamination=contamination, random_state=42)

# train AutoEncoder detector
clf_name = 'AutoEncoder'
clf = AutoEncoder(epochs=30, contamination=contamination)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_             # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# Drop identifier columns so only numeric features are fed to the model.
test_x = test.drop(columns=['user_id', 'index'])

# Sanity checks for NaNs / infs in the training features.
np.any(np.isnan(train_x))
np.all(np.isfinite(train_x))

# BUG FIX: the scaler must be fitted on the training data only and then
# applied to the test data; fitting a second StandardScaler on the test set
# gives the two sets inconsistent scales (train/test leakage).
scaler = StandardScaler().fit(train_x.dropna())
train_norm = scaler.transform(train_x.dropna())
test_norm = scaler.transform(test_x.dropna())

clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(train_norm)
y_train_scores = clf1.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf1.predict(test_norm)              # outlier labels (0 or 1)
y_test_scores = clf1.decision_function(test_norm)  # outlier scores
y_test_pred = pd.Series(y_test_pred)
y_test_scores = pd.Series(y_test_scores)
y_test_pred.value_counts()
y_test_scores.describe()

plt.hist(y_test_scores, bins='auto')
plt.title("Histogram for Model Clf1 Anomaly Scores")
plt.xlim(-1, 2)
plt.show()

df_test = test_x.copy()
# NOTE(review): assumes test.dropna() and test_x.dropna() drop the same rows —
# verify, otherwise the user_id alignment is off.
df_test.insert(loc=0, column="user_id", value=test.dropna()['user_id'])
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for pyod's AutoEncoder detector."""

    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)
        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_proba(self, proba):
        # Probabilities must lie in [0, 1].
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        for attr in ('decision_scores_', 'labels_', 'threshold_',
                     '_mu', '_sigma', 'model_'):
            assert hasattr(self.clf, attr) and getattr(self.clf, attr) is not None

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])
        # check performance
        assert roc_auc_score(self.y_test, scores) >= self.roc_floor

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._check_proba(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._check_proba(self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._check_proba(self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        labels, confidence = self.clf.predict(self.X_test,
                                              return_confidence=True)
        assert_equal(labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert confidence.min() >= 0
        assert confidence.max() <= 1

    def test_prediction_proba_linear_confidence(self):
        proba, confidence = self.clf.predict_proba(self.X_test,
                                                   method='linear',
                                                   return_confidence=True)
        self._check_proba(proba)
        assert_equal(confidence.shape, self.y_test.shape)
        assert confidence.min() >= 0
        assert confidence.max() <= 1

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        # for deep models this may not apply; just check cloning runs.
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
plt.show()

# Step 1: Build three autoencoders with increasingly deep bottlenecks.
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)

clf2 = AutoEncoder(hidden_neurons=[25, 10, 2, 10, 25])
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)

# Predict the anomaly scores.
# NOTE(review): scores come from clf1 although the histogram title says
# "Clf3" — confirm which model is actually intended.
y_test_scores = pd.Series(clf1.decision_function(X_test))

# Step 2: Determine the cut point from the score histogram.
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram with Model Clf3 Anomaly Scores")
plt.show()

df_test = X_test.copy()
df_test['score'] = y_test_scores
# Scores below 4 are treated as inliers (cluster 0), the rest as outliers.
df_test['cluster'] = np.where(df_test['score'] < 4, 0, 1)
df_test['cluster'].value_counts()

# Step 3: Get the summary statistics by cluster
df_test.groupby('cluster').mean()
def ele_outliers(num):
    """10-fold CV elephant-flow detection: fit an AutoEncoder on mice flows
    and report how well elephants are flagged as outliers.

    Parameters
    ----------
    num : int
        Dataset index (only used by the commented-out file-name templates).

    NOTE(review): relies on module-level globals ``thres``, ``epochs`` and
    ``conta`` — the ``sys.argv`` read for ``thres`` is commented out, so it
    must be defined elsewhere.
    """
    # num = 10
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/caida-A-50W-5-{}.csv".format(num)
    # fileName1 = "/data/sym/one-class-svm/data/mean_of_five/dec-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    # fileName2 = "/data/sym/one-class-svm/data/mean_of_five/bin-feature/univ1-50W-{0}-{1}.csv".format(5, num)
    fileName1 = "data/dec-test.csv"   # decimal features + flow sizes
    fileName2 = "data/bin-test.csv"   # binary features
    df = pd.read_csv(fileName1)
    dfb = pd.read_csv(fileName2)

    # Convert to a matrix; binary features stored as the strings '0'/'1'
    # are recoded to -1/+1.
    X = dfb.values
    X[X == '0'] = -1
    X[X == '1'] = 1

    # Binarize the size target: flows above `thres` are elephants (1).
    yr = df['flowSize']
    # thres = int(sys.argv[1])
    yc = yr.copy(deep=True)
    yc[yr <= thres] = 0
    yc[yr > thres] = 1
    print("original mice count: ", sum(yc == 0))
    print("original elephant count: ", sum(yc == 1))

    # 10 fold validation
    KF = KFold(n_splits=10, shuffle=True, random_state=10)
    report_list = []
    for train_index, test_index in KF.split(X):
        X_train, X_test = X[train_index], X[test_index]
        # NOTE(review): positional indexing of the Series assumes yc keeps the
        # default RangeIndex — true for a fresh read_csv.
        y_train, y_test = yc[train_index], yc[test_index]

        # Fit only on mice (label 0); elephants should surface as outliers.
        # (Removed the unused X_train_ele split.)
        X_train_mice = X_train[y_train == 0]

        clf_name = 'AutoEncoder'
        clf = AutoEncoder(hidden_neurons=[256, 64, 20, 64, 256], epochs=epochs,
                          contamination=conta, random_state=10, verbose=0)
        clf.fit(X_train_mice)

        y_pred_test = clf.predict(X_test)              # 0 = mouse, 1 = elephant
        y_pred_scores = clf.decision_function(X_test)  # raw outlier scores

        c_matrix = confusion_matrix(y_test, y_pred_test)
        print(c_matrix)
        temp_report = classification_report(y_test, y_pred_test, output_dict=True)
        report_list.append(temp_report)
        print(classification_report(y_test, y_pred_test, output_dict=False))
        # evaluate_print(clf_name, y_pred_test, y_pred_scores)

    final_report = get_avg_report(report_list)
    print("final report", final_report)
# Scale the training view (nnData is assumed to be defined above this chunk).
dropCleanScale = StandardScaler().fit_transform(nnData)
dropCleanScale = pd.DataFrame(dropCleanScale)

# Load and scale the test view.
nnDataTest = pd.read_excel("nnViewDataTest.xlsx")
nnDataTest = nnDataTest.drop(['date_time'], axis=1)
dropCleanScaleTest = StandardScaler().fit_transform(nnDataTest)
dropCleanScaleTest = pd.DataFrame(dropCleanScaleTest)

# Two autoencoders with different bottleneck depths.
clf1 = AutoEncoder(hidden_neurons=[14, 2, 2, 14])
clf1.fit(dropCleanScale)
y_train_scores1 = clf1.decision_scores_

clf2 = AutoEncoder(hidden_neurons=[14, 10, 2, 10, 14])
clf2.fit(dropCleanScale)
y_train_scores2 = clf2.decision_scores_

y_test1 = clf1.decision_function(dropCleanScaleTest)
y_test2 = clf2.decision_function(dropCleanScaleTest)

## plotting the Remaining lifetime score
plt.hist(y_test1, bins='auto', color='green')
plt.hist(y_test2, bins='auto', color='blue')
plt.title("Histogram for Model Clf1 Anomaly Scores")
plt.show()

# BUG FIX: df_test was previously a copy of the 1-D training-score array
# (y_train_scores2); assigning df_test['score'] to a numpy array raises.
# Build it from the scored test-feature DataFrame instead, so the scores
# align row-for-row with the data they were computed on.
df_test = dropCleanScaleTest.copy()
df_test['score'] = y_test2
df_test['cluster'] = np.where(df_test['score'] < 4, 0, 1)
df_test['cluster'].value_counts()
df_test.groupby('cluster').mean()
x_train = data_dict["train"]
x_test = data_dict["test"]
x_test_labels = data_dict["test_labels"]

start = time.time()

# data preprocessing for MSCRED
od = AutoEncoder(
    hidden_neurons=hidden_neurons,
    batch_size=batch_size,
    epochs=epochs,
    l2_regularizer=l2_regularizer,
    verbose=1,
)
od.fit(x_train)

# get outlier scores
anomaly_score = od.decision_function(x_test)
anomaly_label = x_test_labels
end = time.time()
# BUG FIX: the elapsed time used to be stored in a variable named `time`,
# shadowing the `time` module and breaking any later time.time() call.
elapsed = end - start

evaluate_all(anomaly_score, anomaly_label)
salience = compute_salience(anomaly_score, anomaly_label)
print('time')
print(' ', elapsed)
print('salience')
print(' ', salience)
class TestAutoEncoder(unittest.TestCase):
    """Unit tests for pyod's AutoEncoder detector (legacy assert_* helpers)."""

    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)
        self.clf = AutoEncoder(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # TODO: fix estimator check for AutoEncoder
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        for attr in ('decision_scores_', 'labels_', 'threshold_',
                     '_mu', '_sigma', 'model_'):
            assert_true(hasattr(self.clf, attr) and
                        getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, scores), self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_linear(self):
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
import matplotlib.pyplot as plt

# Optional scatter plot of the training data:
# plt.scatter(X_train[0], X_train[1], c=y_train, alpha=0.8)
# plt.title('Scatter plot')
# plt.xlabel('x')
# plt.ylabel('y')
# plt.show()

# Autoencoder detector (bottleneck 25 -> 2 -> 2 -> 25).
clf1 = AutoEncoder(hidden_neurons=[25, 2, 2, 25])
clf1.fit(X_train)

# Get the outlier scores for the train data.
y_train_scores = clf1.decision_scores_

# Predict the anomaly scores on the test data.
y_test_scores = pd.Series(clf1.decision_function(X_test))  # outlier scores

# Plot it!
import matplotlib.pyplot as plt
# plt.hist(y_test_scores, bins='auto')
# plt.title("Histogram for Model Clf1 Anomaly Scores")
# plt.show()

#
df_test = X_test.copy()
df_test['score'] = y_test_scores
# Scores below 4 form cluster 0 (inliers); the rest cluster 1 (outliers).
df_test['cluster'] = np.where(df_test['score'] < 4, 0, 1)
df_test['cluster'].value_counts()
df_test.groupby('cluster').mean()