def Investigate_hidden_size(data):
    train_X, test_X, test_y, actual = process_Data(data)  # 4 datasets in UCI, and NSL-KDD
    train_X, test_X = normalize_data(train_X, test_X)

    epoch  = 5
    h_size = [2, 3, 4, 5, 6, 7, 8]
    k      = 1.0
    lr     = 0.01

    AUC_RE = np.empty([0, 6])
    print("\nDataset: " + data)
    for i in range(len(h_size)):
        bw = (h_size[i] / 2.0)**0.5  # default bandwidth in One-class SVM: (d/2)^0.5
        print("Hidden_size: ", h_size[i], " bw: ", bw)

        ae, cen, kde, re = Compute_AUC_RE(data, train_X, test_X, actual,
                                          h_size[i], epoch, k, lr, bw, "HZ")
        # One row per hidden size: [h_size, epoch, AUC_AE, AUC_CEN, AUC_KDE, RE]
        temp = np.column_stack([h_size[i], epoch, ae, cen, kde, re])
        AUC_RE = np.append(AUC_RE, temp)

    AUC_RE = np.reshape(AUC_RE, (len(h_size), 6))
    print(AUC_RE)
    np.savetxt("Results/Hidden_size/" + data + "_hidden_size.csv",
               AUC_RE, delimiter=",", fmt='%f')
    Plotting_AUC_HZ(AUC_RE, data)
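A typical call, assuming process_Data accepts the dataset names used in Main_Test below (e.g. "NSLKDD"):

Investigate_hidden_size("NSLKDD")  # sweeps h_size = 2..8, writes Results/Hidden_size/NSLKDD_hidden_size.csv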
Example #3

def investigate_svm(train_set, test_set, actual, scale, gamma, nu):
    train_set, test_set = normalize_data(train_set, test_set, scale)
    clf_svm = svm.OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    clf_svm.fit(train_set)

    predictions_svm = clf_svm.decision_function(test_set)  # higher = more normal
    FPR_svm, TPR_svm, thresholds_svm = roc_curve(actual, predictions_svm)
    auc_svm = auc(FPR_svm, TPR_svm)
    return auc_svm
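For reference, a self-contained sketch of the same evaluate-by-AUC pattern on synthetic data; the Gaussian "normal" cloud, the uniform anomalies, and all constants below are illustrative assumptions, not part of the repo:

import numpy as np
from sklearn import svm
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(0)
d = 5
train = rng.normal(0, 1, size=(200, d))               # normal-only training data
test = np.vstack([rng.normal(0, 1, size=(50, d)),     # normal test points
                  rng.uniform(-6, 6, size=(50, d))])  # anomalies
actual = np.r_[np.ones(50), np.zeros(50)]             # 1 = normal, 0 = anomaly

clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=1.0 / d)  # gamma = 1/(2*bw^2) with bw = (d/2)^0.5
clf.fit(train)
scores = clf.decision_function(test)                  # higher = more normal
FPR, TPR, _ = roc_curve(actual, scores)
print("AUC:", auc(FPR, TPR))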
Example #4

def Investigate_lof(train_set, test_set, actual, scale, k):

    train_set, test_set = normalize_data(train_set, test_set, scale)
    neighbors = int(len(train_set) * k)  # n_neighbors as a fraction k of the training set
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors)
    clf_lof.fit(train_set)
    # _decision_function is a private API of older scikit-learn releases
    predict = clf_lof._decision_function(test_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)
    return lof
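_decision_function was a private helper in older scikit-learn; current releases expose out-of-sample LOF scoring through novelty=True and the public decision_function. A minimal equivalent sketch, assuming scikit-learn >= 0.20 and reusing the names from Investigate_lof above:

from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

clf_lof = LocalOutlierFactor(n_neighbors=neighbors, novelty=True)
clf_lof.fit(train_set)                         # fit on normal training data only
predict = clf_lof.decision_function(test_set)  # public API; higher = more normal
FPR, TPR, _ = roc_curve(actual, predict)
lof = auc(FPR, TPR)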
Example #5

def Visualize_hidden_data(data):
    train_X, test_X, test_y, actual = process_Data(data)  # 4 datasets in UCI, and NSL-KDD
    train_X, test_X = normalize_data(train_X, test_X)

    epoch  = 5
    h_size = 2
    k      = [0.1, 0.5, 1.0]
    lr     = 0.01

    bw = (h_size / 2.0)**0.5  # default bandwidth in One-class SVM
    print("Hidden_size: ", h_size, " bw: ", bw)

    for i in range(len(k)):
        train_hidden, test_hidden = Compute_AUC_RE(data, train_X, test_X, actual,
                                                   h_size, epoch, k[i], lr, bw, "HD")
        _, test_h = normalize_data(train_hidden, test_hidden)
        test_h_X0 = test_h[actual == 1]  # normal test points in the hidden space
        test_h_X1 = test_h[actual == 0]  # anomalous test points in the hidden space
        Plotting_hidden_data(test_h_X0, test_h_X1, data, k[i])
Example #6

def auc_MX(training_set, testing_set, actual):

    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")

    # Single full-covariance Gaussian fitted to the normal training data
    clf_gmm = mixture.GaussianMixture(n_components=1, covariance_type='full')
    clf_gmm.fit(training_set)
    predict = clf_gmm.score_samples(testing_set)  # log-likelihood; higher = more normal
    FPR, TPR, thresholds = roc_curve(actual, predict)
    auc_mx = auc(FPR, TPR)

    return auc_mx
Example #7

def auc_LOF(training_set, testing_set, actual):

    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")
    neighbors = int(len(training_set) * 0.1)  # n_neighbors = 10% of the training set
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors)
    clf_lof.fit(training_set)
    # _decision_function is a private API of older scikit-learn releases
    predict = clf_lof._decision_function(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)

    return lof
Example #9

def auc_IOF(training_set, testing_set, actual):
    # fit the model
    rng = np.random.RandomState(42)
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               "maxabs")

    clf_iof = IsolationForest(random_state=rng)
    clf_iof.fit(training_set)
    score_iof = clf_iof.predict(testing_set)  # hard labels: +1 normal, -1 anomaly

    FPR, TPR, thresholds = roc_curve(actual, score_iof)
    iof = auc(FPR, TPR)
    return iof
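Note that predict() yields hard ±1 labels, so the ROC above collapses to a single operating point. A variant using the forest's continuous scores (a suggested alternative, not the snippet's original behaviour):

clf_iof = IsolationForest(random_state=np.random.RandomState(42))
clf_iof.fit(training_set)
score_iof = clf_iof.decision_function(testing_set)  # continuous score; higher = more normal
FPR, TPR, _ = roc_curve(actual, score_iof)
iof = auc(FPR, TPR)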
Example #10

def Main_Experiment(data):
    train_X, test_X, test_y, actual = process_Data(data)  # 4 datasets in UCI, and NSL-KDD
    train_X, test_X = normalize_data(train_X, test_X)

    epoch  = 5
    h_size = 7
    k      = 1.0
    lr     = 0.01
    bw = (h_size / 2.0)**0.5  # default bandwidth in One-class SVM
    ae, cen, kde, re = Compute_AUC_RE(data, train_X, test_X, actual,
                                      h_size, epoch, k, lr, bw, "ME")

    print("********************************************************")
    print("Data: %s \nNormal train: %d \nNormal test: %d \nAnomaly test: %d"
          % (data, len(train_X), len(test_X[actual == 1]), len(test_X[actual == 0])))
    print("Hidden_size: %d \nBandwidth: %f \nLearning rate: %0.3f \nEpochs: %d"
          % (h_size, bw, lr, epoch))
    print("Training error: %0.4f" % re)
Example #12

def Main_Test():

    list_data = ["PageBlocks", "WPBC", "PenDigits", "GLASS", "Shuttle", "Arrhythmia",
                 "CTU13_10", "CTU13_08", "CTU13_09", "CTU13_13",
                 "Spambase", "UNSW", "NSLKDD", "InternetAds"]

    # Restrict to a single dataset for a quick run; remove this line to test all
    list_data = ["CTU13_10"]

    norm = "maxabs"
    corruptions = [0.1, 0.1, 0.1]

    print("DAE")
    print("+ Data: ", list_data)
    print("+ Scaler: ", norm)
    print("+ Corruptions: ", corruptions)

    AUC_Hidden = np.empty([0, 10])

    num = 0
    for data in list_data:
        num = num + 1
        h_sizes = hyper_parameters(data)

        train_set, test_set, actual = load_data(data)
        train_X, test_X = normalize_data(train_set, test_set, norm)

        train_X = theano.shared(numpy.asarray(train_X,
                                              dtype=theano.config.floatX),
                                borrow=True)
        test_X = theano.shared(numpy.asarray(test_X,
                                             dtype=theano.config.floatX),
                               borrow=True)

        datasets = [train_X, test_X, actual]

        in_dim = train_set.shape[1]
        n_vali = int(train_set.shape[0] / 5)  # hold out 20% of training data for validation
        n_train = len(train_set) - n_vali
        #batch     = int(n_train/20)

        pat, val, batch, n_batch = stopping_para_shrink(n_train)

        print("\n" + str(num) + ".", data, "...")
        print(" + Hidden Sizes: ", in_dim, h_sizes, "- Batch_sizes:", batch)
        print (" + Data: %d (%d train, %d vali) - %d normal, %d anomaly"\
            %(len(train_set), n_train, n_vali, \
            len(test_set[(actual == 1)]), len(test_set[(actual == 0)])))
        print(" + Patience: %5.0d, Validate: %5.0d,  \n + Batch size: %5.0d, n batch:%5.0d"\
             %(pat, val, batch, n_batch))

        sda, re = train_SdAE(pre_lr=1e-2,
                             end2end_lr=1e-4,
                             algo='adadelta',
                             dataset=datasets,
                             data_name=data,
                             n_validate=n_vali,
                             norm=norm,
                             batch_size=batch,
                             hidden_sizes=h_sizes,
                             corruptions=corruptions,
                             patience=pat,
                             validation=val)

        #*******Compute AUC on hidden data*************
        lof, cen, dis, kde, svm05, svm01, ae = sda.Compute_AUC_Hidden(
            train_X, test_X, actual, norm, data)
        auc_hidden = np.column_stack(
            [batch, re[0], lof, cen, dis, kde, svm05, svm01, ae, 100 * re[2]])
        AUC_Hidden = np.append(AUC_Hidden, auc_hidden)

        #save hidden data to files
#        sda.Save_Hidden_Data(train_X, test_X, data, path)

    AUC_Hidden = np.reshape(AUC_Hidden, (-1, 10))
    np.set_printoptions(precision=3, suppress=True)
    column_list = [2, 3, 4, 5, 6, 7, 8, 9]
    print("    LOF    CEN    MDIS   KDE   SVM5    SVM1    AE    RE*100")
    print(AUC_Hidden[:, column_list])
Example #13

def auc_density(training_set, testing_set, actual, scale):
    """Compute AUC for the density-based methods: Centroid, Negative Mean
    Distance, Kernel Density Estimation, One-class SVM, and LOF.
    """
    # gamma = 1/(2*bw^2) = 1/n_features -> bw = (n_features/2)^0.5
    bw = (training_set.shape[1] / 2.0)**0.5  # default value in One-class SVM
    gamma = 1 / (2 * bw * bw)

    "*************** Centroid AE - Hidden layer **************"
    CEN = CentroidBasedOneClassClassifier()
    CEN.fit(training_set)
    predictions_cen = -CEN.get_density(testing_set)
    FPR_cen, TPR_cen, thresholds_cen = roc_curve(actual, predictions_cen)
    cen = auc(FPR_cen, TPR_cen)

    "****************** Negative Distance - Hidden layer **********************"
    clf_dis = DensityBasedOneClassClassifier(bandwidth=bw,
                                             kernel="really_linear",
                                             metric="euclidean",
                                             scale=scale)
    clf_dis.fit(training_set)
    predictions_dis = clf_dis.get_density(testing_set)
    FPR_dis, TPR_dis, thresholds_dis = roc_curve(actual, predictions_dis)
    dis = auc(FPR_dis, TPR_dis)

    "****************** KDE AE - Hidden layer*****************"
    #  ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']
    KDE = DensityBasedOneClassClassifier(bandwidth=bw,
                                         kernel="gaussian",
                                         metric="euclidean",
                                         scale=scale)
    KDE.fit(training_set)
    predictions_kde = KDE.get_density(testing_set)
    FPR_kde, TPR_kde, thresholds_kde = roc_curve(actual, predictions_kde)
    kde = auc(FPR_kde, TPR_kde)

    "********************* 1-SVM Hidden layer ***************************"
    training_set, testing_set = normalize_data(training_set, testing_set,
                                               scale)

    clf_05 = svm.OneClassSVM(nu=0.5, kernel="rbf", gamma=gamma)
    clf_05.fit(training_set)
    #n_support_vectors =  len(clf.support_vectors_)
    predictions_svm = clf_05.decision_function(testing_set)
    FPR_svm, TPR_svm, thresholds_svm = roc_curve(actual, predictions_svm)
    svm_05 = auc(FPR_svm, TPR_svm)

    "nu = 0.1"
    clf_01 = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=gamma)
    clf_01.fit(training_set)
    #num_01 =  len(clf_01.support_vectors_)
    predictions_svm_01 = clf_01.decision_function(testing_set)
    FPR_svm_01, TPR_svm_01, thresholds_svm_01 = roc_curve(
        actual, predictions_svm_01)
    svm_01 = auc(FPR_svm_01, TPR_svm_01)

    "******************************* LOF **********************************"
    neighbors = (int)(len(training_set) * 0.1)
    clf_lof = LocalOutlierFactor(n_neighbors=neighbors)
    clf_lof.fit(training_set)
    predict = clf_lof._decision_function(testing_set)
    FPR, TPR, thresholds = roc_curve(actual, predict)
    lof = auc(FPR, TPR)

    return lof, cen, dis, kde, svm_05, svm_01
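A quick numeric check of the bandwidth/gamma relation used in auc_density (d = 8 is an arbitrary illustrative choice):

d = 8
bw = (d / 2.0)**0.5        # bandwidth from the (d/2)^0.5 heuristic
gamma = 1 / (2 * bw * bw)  # RBF gamma implied by that bandwidth
assert abs(gamma - 1.0 / d) < 1e-12  # i.e. gamma = 1/n_features
print(bw, gamma)           # 2.0 0.125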