import numpy as np
import theano
from sklearn import mixture, svm
from sklearn.ensemble import IsolationForest
from sklearn.metrics import auc, roc_curve
from sklearn.neighbors import LocalOutlierFactor

# Project-local helpers (process_Data, normalize_data, load_data,
# hyper_parameters, stopping_para_shrink, train_SdAE, Compute_AUC_RE,
# Plotting_AUC_HZ, Plotting_hidden_data, CentroidBasedOneClassClassifier,
# DensityBasedOneClassClassifier) are assumed to be importable from the
# surrounding package.


def Investigate_hidden_size(data):
    """Sweep the hidden-layer size and record AUC/RE for each setting."""
    # 4 datasets in UCI, and NSL-KDD
    train_X, test_X, test_y, actual = process_Data(data)
    train_X, test_X = normalize_data(train_X, test_X)
    epoch = 5
    h_size = [2, 3, 4, 5, 6, 7, 8]
    k = 1.0
    lr = 0.01
    AUC_RE = np.empty([0, 6])
    print("\nDataset: " + data)
    for i in range(0, len(h_size)):
        bw = (h_size[i] / 2.0) ** 0.5
        print("Hidden_size: ", h_size[i], " bw: ", bw)
        ae, cen, kde, re = Compute_AUC_RE(data, train_X, test_X, actual,
                                          h_size[i], epoch, k, lr, bw, "HZ")
        temp = np.column_stack([h_size[i], epoch, ae, cen, kde, re])
        AUC_RE = np.append(AUC_RE, temp)
    AUC_RE = np.reshape(AUC_RE, (len(h_size), 6))
    print(AUC_RE)
    np.savetxt("Results/Hidden_size/" + data + "_hidden_size.csv",
               AUC_RE, delimiter=",", fmt='%f')
    Plotting_AUC_HZ(AUC_RE, data)
def investigate_svm(train_set, test_set, actual, scale, gamma, nu):
    """AUC of a One-class SVM with an RBF kernel."""
    train_set, test_set = normalize_data(train_set, test_set, scale)
    clf_svm = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf_svm.fit(train_set)
    predictions_svm = clf_svm.decision_function(test_set)
    FPR_svm, TPR_svm, thresholds_svm = roc_curve(actual, predictions_svm)
    auc_svm = auc(FPR_svm, TPR_svm)
    return auc_svm
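
# Illustrative usage (a sketch, not part of the original experiments): a
# minimal sweep over nu and gamma with investigate_svm. It assumes load_data
# from this repo returns (train_set, test_set, actual) for a dataset name,
# as it does in Main_Test below; the dataset name and grid values are
# arbitrary examples.
def sweep_svm_example(data="CTU13_10"):
    train_set, test_set, actual = load_data(data)
    for nu in [0.1, 0.5]:
        for gamma in [0.01, 0.1, 1.0]:
            score = investigate_svm(train_set, test_set, actual,
                                    "maxabs", gamma, nu)
            print("nu=%.1f gamma=%.2f AUC=%.4f" % (nu, gamma, score))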
def Investigate_lof(train_set, test_set, actual, scale, k):
    """AUC of Local Outlier Factor; k is the fraction of the training set
    used as n_neighbors."""
    train_set, test_set = normalize_data(train_set, test_set, scale)
    neighbors = int(len(train_set) * k)
    # novelty=True enables scoring unseen data via the public
    # decision_function (the original called the private _decision_function,
    # which has been removed from scikit-learn).
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors, novelty=True)
    clf_lof.fit(train_set)
    predict = clf_lof.decision_function(test_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)
    return lof
def auc_MX(training_set, testing_set, actual):
    """AUC of a single-component Gaussian Mixture Model baseline."""
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")
    clf_gmm = mixture.GaussianMixture(n_components=1, covariance_type='full')
    clf_gmm.fit(training_set)
    # Higher log-likelihood means more normal, matching actual == 1.
    predict = clf_gmm.score_samples(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    gmm = auc(FPR, TPR)
    return gmm
def auc_LOF(training_set, testing_set, actual):
    """AUC of Local Outlier Factor with n_neighbors set to 10% of the
    training set."""
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")
    neighbors = int(len(training_set) * 0.1)
    # novelty=True is required to score unseen data with decision_function.
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors, novelty=True)
    clf_lof.fit(training_set)
    predict = clf_lof.decision_function(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)
    return lof
def Visualize_hidden_data(data):
    """Plot 2-D hidden representations of the test data for several values
    of k."""
    # 4 datasets in UCI, and NSL-KDD
    train_X, test_X, test_y, actual = process_Data(data)
    train_X, test_X = normalize_data(train_X, test_X)
    epoch = 5
    h_size = 2
    k = [0.1, 0.5, 1.0]
    lr = 0.01
    bw = (h_size / 2.0) ** 0.5  # default in One-class SVM
    print("Hidden_size: ", h_size, " bw: ", bw)
    for i in range(0, len(k)):
        # Mode "HD" returns the hidden representations rather than AUCs.
        train_hidden, test_hidden = Compute_AUC_RE(data, train_X, test_X,
                                                   actual, h_size, epoch,
                                                   k[i], lr, bw, "HD")
        _, test_h = normalize_data(train_hidden, test_hidden)
        test_h_X0 = test_h[actual == 1]   # normal test points
        test_h_X1 = test_h[actual == 0]   # anomalous test points
        Plotting_hidden_data(test_h_X0, test_h_X1, data, k[i])
def auc_IOF(training_set, testing_set, actual):
    """AUC of an Isolation Forest baseline."""
    # fit the model
    rng = np.random.RandomState(42)
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")
    clf_iof = IsolationForest(random_state=rng)
    clf_iof.fit(training_set)
    # decision_function yields continuous anomaly scores (higher = more
    # normal); the original used predict, whose hard -1/+1 labels make the
    # ROC curve degenerate.
    score_iof = clf_iof.decision_function(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, score_iof)
    iof = auc(FPR, TPR)
    return iof
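
# Quick smoke test (illustrative only, not from the original file): the three
# baselines above share the same fit-on-normal / score-test-set pattern. This
# sketch exercises them on synthetic Gaussian data, using the repo's label
# convention (actual == 1 for normal, 0 for anomaly); all three should report
# an AUC close to 1 on this easy separation.
def _baseline_smoke_test():
    rng = np.random.RandomState(0)
    train = rng.normal(0.0, 1.0, size=(200, 5))      # normal-only training set
    normal = rng.normal(0.0, 1.0, size=(50, 5))      # normal test points
    anomaly = rng.normal(4.0, 1.0, size=(50, 5))     # shifted anomalies
    test = np.vstack([normal, anomaly])
    actual = np.concatenate([np.ones(50), np.zeros(50)])
    print("GMM AUC:    ", auc_MX(train, test, actual))
    print("LOF AUC:    ", auc_LOF(train, test, actual))
    print("IForest AUC:", auc_IOF(train, test, actual))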
def Main_Experiment(data):
    """Run the main experiment with fixed hyper-parameters and print a
    summary."""
    # 4 datasets in UCI, and NSL-KDD
    train_X, test_X, test_y, actual = process_Data(data)
    train_X, test_X = normalize_data(train_X, test_X)
    epoch = 5
    h_size = 7
    k = 1.0
    lr = 0.01
    bw = (h_size / 2.0) ** 0.5  # default in One-class SVM
    ae, cen, kde, re = Compute_AUC_RE(data, train_X, test_X, actual,
                                      h_size, epoch, k, lr, bw, "ME")
    print("********************************************************")
    print("Data: %s \nNormal train: %d \nNormal test: %d \nAnomaly test: %d"
          % (data, len(train_X), len(test_X[actual == 1]),
             len(test_X[actual == 0])))
    print("Hidden_size: %d \nBandwidth: %f \nLearning rate: %0.3f \nEpochs: %d"
          % (h_size, bw, lr, epoch))
    print("Training error: %0.4f" % re)
def Main_Test():
    """Train the denoising autoencoder on each dataset and evaluate the
    one-class classifiers on its hidden representation."""
    list_data = ["PageBlocks", "WPBC", "PenDigits", "GLASS", "Shuttle",
                 "Arrhythmia", "CTU13_10", "CTU13_08", "CTU13_09", "CTU13_13",
                 "Spambase", "UNSW", "NSLKDD", "InternetAds"]
    # Debugging override: restrict the run to a single dataset.
    list_data = ["CTU13_10"]
    norm = "maxabs"
    corruptions = [0.1, 0.1, 0.1]
    print("DAE")
    print("+ Data: ", list_data)
    print("+ Scaler: ", norm)
    print("+ Corruptions: ", corruptions)
    AUC_Hidden = np.empty([0, 10])
    num = 0
    for data in list_data:
        num = num + 1
        h_sizes = hyper_parameters(data)
        train_set, test_set, actual = load_data(data)
        train_X, test_X = normalize_data(train_set, test_set, norm)
        train_X = theano.shared(np.asarray(train_X,
                                           dtype=theano.config.floatX),
                                borrow=True)
        test_X = theano.shared(np.asarray(test_X,
                                          dtype=theano.config.floatX),
                               borrow=True)
        datasets = [train_X, test_X, actual]
        in_dim = train_set.shape[1]
        n_vali = int(train_set.shape[0] / 5)
        n_train = len(train_set) - n_vali
        #batch = int(n_train/20)
        pat, val, batch, n_batch = stopping_para_shrink(n_train)
        print("\n" + str(num) + ".", data, "...")
        print(" + Hidden sizes: ", in_dim, h_sizes, "- Batch sizes:", batch)
        print(" + Data: %d (%d train, %d vali) - %d normal, %d anomaly"
              % (len(train_set), n_train, n_vali,
                 len(test_set[(actual == 1)]), len(test_set[(actual == 0)])))
        print(" + Patience: %5.0d, Validate: %5.0d, \n"
              " + Batch size: %5.0d, n batch: %5.0d"
              % (pat, val, batch, n_batch))
        sda, re = train_SdAE(pre_lr=1e-2, end2end_lr=1e-4, algo='adadelta',
                             dataset=datasets, data_name=data,
                             n_validate=n_vali, norm=norm, batch_size=batch,
                             hidden_sizes=h_sizes, corruptions=corruptions,
                             patience=pat, validation=val)
        # ******* Compute AUC on hidden data *******
        lof, cen, dis, kde, svm05, svm01, ae = sda.Compute_AUC_Hidden(
            train_X, test_X, actual, norm, data)
        auc_hidden = np.column_stack(
            [batch, re[0], lof, cen, dis, kde, svm05, svm01, ae, 100 * re[2]])
        AUC_Hidden = np.append(AUC_Hidden, auc_hidden)
        # Save hidden data to files:
        # sda.Save_Hidden_Data(train_X, test_X, data, path)
    AUC_Hidden = np.reshape(AUC_Hidden, (-1, 10))
    np.set_printoptions(precision=3, suppress=True)
    column_list = [2, 3, 4, 5, 6, 7, 8, 9]
    print("    LOF    CEN   MDIS    KDE   SVM5   SVM1     AE  RE*100")
    print(AUC_Hidden[:, column_list])
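
# Minimal entry point (a sketch, not part of the original file): run the DAE
# experiment configured in Main_Test. The guard is safe here because
# Main_Test only references names defined above this point, assuming the
# project-local helpers it relies on are importable.
if __name__ == "__main__":
    Main_Test()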
def auc_density(training_set, testing_set, actual, scale):
    """Compute AUC for density-based methods: Centroid, Negative Mean
    Distances, Kernel Density Estimation, One-class Support Vector Machine,
    and LOF.
    """
    # gamma = 1/(2*bw^2) = 1/n_features  ->  bw = (n_features/2)^0.5
    bw = (training_set.shape[1] / 2.0) ** 0.5  # default value in One-class SVM
    gamma = 1 / (2 * bw * bw)

    # *************** Centroid - Hidden layer ***************
    CEN = CentroidBasedOneClassClassifier()
    CEN.fit(training_set)
    predictions_cen = -CEN.get_density(testing_set)
    FPR_cen, TPR_cen, thresholds_cen = roc_curve(actual, predictions_cen)
    cen = auc(FPR_cen, TPR_cen)

    # *************** Negative Mean Distance - Hidden layer ***************
    clf_dis = DensityBasedOneClassClassifier(bandwidth=bw,
                                             kernel="really_linear",
                                             metric="euclidean", scale=scale)
    clf_dis.fit(training_set)
    predictions_dis = clf_dis.get_density(testing_set)
    FPR_dis, TPR_dis, thresholds_dis = roc_curve(actual, predictions_dis)
    dis = auc(FPR_dis, TPR_dis)

    # *************** KDE - Hidden layer ***************
    # kernels: 'gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine'
    KDE = DensityBasedOneClassClassifier(bandwidth=bw, kernel="gaussian",
                                         metric="euclidean", scale=scale)
    KDE.fit(training_set)
    predictions_kde = KDE.get_density(testing_set)
    FPR_kde, TPR_kde, thresholds_kde = roc_curve(actual, predictions_kde)
    kde = auc(FPR_kde, TPR_kde)

    # *************** One-class SVM - Hidden layer ***************
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               scale)
    clf_05 = svm.OneClassSVM(nu=0.5, kernel="rbf", gamma=gamma)
    clf_05.fit(training_set)
    #n_support_vectors = len(clf_05.support_vectors_)
    predictions_svm = clf_05.decision_function(testing_set)
    FPR_svm, TPR_svm, thresholds_svm = roc_curve(actual, predictions_svm)
    svm_05 = auc(FPR_svm, TPR_svm)

    # nu = 0.1
    clf_01 = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=gamma)
    clf_01.fit(training_set)
    #num_01 = len(clf_01.support_vectors_)
    predictions_svm_01 = clf_01.decision_function(testing_set)
    FPR_svm_01, TPR_svm_01, thresholds_svm_01 = roc_curve(
        actual, predictions_svm_01)
    svm_01 = auc(FPR_svm_01, TPR_svm_01)

    # *************** LOF ***************
    neighbors = int(len(training_set) * 0.1)
    # novelty=True is required to score unseen data with decision_function.
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors, novelty=True)
    clf_lof.fit(training_set)
    predict = clf_lof.decision_function(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)

    return lof, cen, dis, kde, svm_05, svm_01
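
# Sanity check (illustrative, not part of the original experiments): with the
# default bandwidth bw = sqrt(d/2) used in auc_density above, the RBF width
# gamma = 1/(2*bw^2) reduces to 1/d, i.e. the classic One-class SVM default
# of 1/n_features. The dimensionality d below is an arbitrary example value.
def _check_default_gamma(d=7):
    bw = (d / 2.0) ** 0.5
    gamma = 1.0 / (2.0 * bw * bw)
    assert abs(gamma - 1.0 / d) < 1e-12
    return gamma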