def getPCA(train_data, test_data):
    """Project the train and test sets onto principal components and pickle both.

    The scaler and the PCA model are fitted on the training observations only
    and then re-used, unchanged, to transform the test observations.

    Args:
        train_data: data frame holding the training observations plus a
            ' Label' column.
        test_data: data frame with the same columns as ``train_data``.

    Returns:
        None. The two resulting frames are written through ``util.pklSaver``
        as 'PCA_data' and 'PCA_testData'.
    """
    labelCol = ' Label'

    # Standardise the training observations, then fit PCA on the result
    scaler = StandardScaler()
    trainStd = scaler.fit_transform(train_data.drop(labelCol, axis=1))
    pcaModel = PCA().fit(trainStd)
    trainComponents = pcaModel.transform(trainStd)

    # Column names: Component1, Component2, ...
    componentNames = [
        'Component' + str(k) for k in range(1, trainComponents.shape[1] + 1)
    ]

    # Wrap the training components in a data frame and put the label back
    trainFrame = pd.DataFrame(data=trainComponents, columns=componentNames)
    trainFrame[labelCol] = train_data[labelCol].values
    util.pklSaver(
        trainFrame,
        'PCA_data',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/First layer/')

    # The test set goes through the scaler and PCA fitted on the train set
    testStd = scaler.transform(test_data.drop(labelCol, axis=1))
    testFrame = pd.DataFrame(data=pcaModel.transform(testStd),
                             columns=componentNames)
    testFrame[labelCol] = test_data[labelCol].values
    util.pklSaver(
        testFrame,
        'PCA_testData',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/First layer/')
def getRanking(train_data, filename):
    """Rank features by combining three importance measures and pickle the table.

    The ANOVA F-value, the Extra Trees importance and the gradient boosting
    importance of every feature are each rescaled to [0, 1], summed into a
    'Total' column and rounded to three decimals.

    Args:
        train_data: data frame with the observations and a ' Label' column.
        filename: name under which the (unsorted) table is saved through
            ``util.pklSaver``.

    Returns:
        None. The table is saved and printed sorted by 'Total', descending.
    """
    # The three common header fields are left out of the importance study
    reduced = train_data.drop(
        [' Source Port', ' Destination Port', ' Protocol'], axis=1)

    # Split observations from the label
    features = reduced.drop([' Label'], axis=1)
    labels = reduced[' Label'].values

    # ANOVA F-value is computed on standardised observations
    standardised = StandardScaler().fit_transform(features)
    anova = SelectKBest(score_func=f_classif).fit(standardised, labels)

    # The tree based models are built by the project's classifier helpers
    etModel = classifier.extraTrees(reduced)
    gbModel = classifier.lightGBM_model(reduced, customEval=True)

    # One column per algorithm, one row per feature; missing scores become 0
    scores = pd.DataFrame(
        {
            'ANOVA F-value': anova.scores_,
            'Extra Tree': etModel.feature_importances_,
            'Gradient Boosting': gbModel.feature_importance()
        },
        index=features.columns).fillna(0)

    # Rescale every algorithm's column to the range [0, 1]
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(scores),
                          index=scores.index,
                          columns=scores.columns)

    # Sum of the three rescaled values
    scaled['Total'] = scaled.sum(axis=1)
    scaled = scaled.round(3)

    # Save the table, then display it from the highest total to the lowest
    util.pklSaver(scaled,
                  filename,
                  path=util.getResourcePath() +
                  '/Pickle Files/Feature Importance/')
    print(scaled.sort_values('Total', ascending=False))
def getAllData(folderPath=util.getResourcePath() +
               '/Pickle Files/Original Data/Benigns/'):
    """Load every pickle found under ``folderPath`` and stack them into one frame.

    Args:
        folderPath: folder whose pickle files are read with ``util.pklReader``.

    Returns:
        The concatenation of all loaded objects, in the order the files were
        listed. When only one file exists its content is returned as loaded.

    Raises:
        IndexError: if no file is found under ``folderPath`` (same exception
            type the previous implementation raised, now with a message).
    """
    # Local import so the block does not rely on a module-level alias
    import pandas as pd

    # Get the list of files in the folder
    # NOTE(review): walk() also descends into sub-folders, but the names are
    # later read relative to folderPath only - confirm the folder is flat.
    fileNames = []
    for _dirPath, _dirNames, names in walk(folderPath):
        fileNames.extend(names)

    if not fileNames:
        raise IndexError('No pickle files found in ' + str(folderPath))

    frames = [util.pklReader(name, path=folderPath) for name in fileNames]

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows for every file.
    allData = frames[0] if len(frames) == 1 else pd.concat(frames)

    print('Total length of allData: ', len(allData))
    return allData
def secondLayerWeights():
    """Random-search the lightGBM weight that minimises misclassifications.

    Fifty weights are drawn uniformly from [0.7, 1). For each one,
    ``getPrediction`` is called on every attack set of 'AllAttack20' and the
    number of predictions that differ from the numeric label is accumulated.

    Returns:
        The sampled weight with the smallest total number of errors.
    """
    # Loading the test set
    allAttack20 = util.pklReader(
        'AllAttack20',
        path=util.getResourcePath() +
        '/Pickle Files/Data for model construction/Second layer/')

    # Convert the string label to a numeric id: 1, 2, ... in iteration order
    for classId, att in enumerate(allAttack20, start=1):
        allAttack20[att][' Label'] = classId

    start = time.time()

    # Observations and labels do not depend on the sampled weight, so they
    # are prepared once instead of once per trial.
    droppedCols = [' Source Port', ' Destination Port', ' Protocol', ' Label']
    testSets = [(allAttack20[attack].drop(droppedCols, axis=1),
                 allAttack20[attack][' Label'].values)
                for attack in allAttack20]

    # Number of misclassifications for each sampled lightGBM weight
    allErr = {}
    for _ in range(50):
        # Randomly select a value for the lightGBM weight
        a = np.random.uniform(low=0.7, high=1)
        totalErr = 0
        for X_test, y_test in testSets:
            y_predict = getPrediction(X_test, a)
            totalErr = totalErr + (y_predict != y_test).sum()
        allErr[a] = totalErr
        print('The weight of lightGBM: ', a, ', Number of errors: ', totalErr)

    # Running time in minutes
    runningTime = (time.time() - start) / 60

    # The chosen weight is the one with the fewest misclassifications
    lightGBM_weight = min(allErr, key=allErr.get)
    print('Running time: ', runningTime)
    # The value reported here is an error count, not an AUC
    print('Best lightGBM weight: ', lightGBM_weight, ', Number of errors: ',
          allErr[lightGBM_weight])
    return lightGBM_weight
def dataProcessor(day, inputData):
    """Clean one day of flow records and pickle the rows of every label.

    Steps: drop the identifier columns, convert the two rate columns to
    float, turn +inf into NaN, drop rows containing NaN, remove duplicate
    rows, then save the rows of each label through ``util.pklSaver``
    ('BENIGN' rows under ``day + '-Benign'``, other labels under their own
    name).

    Args:
        day: prefix used for the name of the benign pickle file.
        inputData: data frame with the raw flow records, including the
            ' Label' column and the columns removed/converted below.

    Returns:
        dict with 'label' (set of distinct labels seen before cleaning) and
        'data' (the cleaned data frame).
    """
    # Perform some analysis
    print('Number of column: ', len(inputData.columns))

    # Display distinct values in the Label column
    labelSet = set(inputData[' Label'])
    print('The distinct value in Label column: ', labelSet)

    # Original shape
    print('Shape of original data: ', inputData.shape)

    # Remove unnecessary columns
    rmvCol = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp']
    inputData = inputData.drop(rmvCol, axis=1)

    # Convert the two rate columns to float ('NaN' and 'Infinity' become nan
    # and inf respectively), then replace inf by nan
    objCol = ['Flow Bytes/s', ' Flow Packets/s']
    inputData[objCol] = inputData[objCol].astype(float)
    inputData[objCol] = inputData[objCol].replace(np.inf, np.nan)
    print('Total number of nan: ', inputData[objCol].isna().sum())
    print('before: ', inputData.shape)

    # Drop nan rows
    inputData = inputData.dropna()
    print('The final shape of the data: ', inputData.shape)

    # Remove duplicates. drop_duplicates() returns a new frame, so the result
    # has to be assigned; previously it was discarded and nothing was removed.
    rowsBefore = inputData.shape[0]
    inputData = inputData.drop_duplicates()
    rmvShape = inputData.shape
    print('Shape of the dataset after removing duplicates: ', rmvShape)
    print('Number of duplicate rows: ', rowsBefore - rmvShape[0])

    # Store the data of each label in its own pkl file
    count = 0
    for label in labelSet:
        dataLabel = inputData[inputData[' Label'] == label]
        print('Number of ' + label + ' :', dataLabel.shape)
        count += dataLabel.shape[0]
        if label == 'BENIGN':
            util.pklSaver(dataLabel,
                          day + '-Benign',
                          path=util.getResourcePath() +
                          '/Pickle Files/Original Data/Benigns/')
        else:
            util.pklSaver(dataLabel,
                          label,
                          path=util.getResourcePath() +
                          '/Pickle Files/Original Data/Attacks/')
    print('Total: ', count)
    return {'label': labelSet, 'data': inputData}
for i in range(1, len(files)): pklData = util.pklReader(files[i], path=folderPath) allData = allData.append(pklData) print('Total length of allData: ', len(allData)) return allData #Perform dataPreprocessing dataProcessor('Tues', tues) #Get all Benign allBenign = getAllData() util.pklSaver(allBenign, 'All Benign', path=util.getResourcePath() + '/Pickle Files/Original Data/') #Get all attack allAttacks = getAllData(folderPath=util.getResourcePath() + '/Pickle Files/Original Data/Attacks/') util.pklSaver(allAttacks, 'All Attacks', path=util.getResourcePath() + '/Pickle Files/Original Data/') # Convert the Label of allAttack to 1 and allBenign to 0 allAttacks[' Label'] = 1 allBenign[' Label'] = 0 allData = allAttacks.append(allBenign) #Display the length of all data and save it to 'All Data' pkl file print('Length of all data: ', len(allData))
totalErr=0 totalLen=0 #Evaluate the accuracy of each attack for attack in testSet: #Separate obsevation and label X_test=testSet[attack].drop([' Source Port', ' Destination Port', ' Protocol',' Label'],axis=1) y_test=testSet[attack][' Label'].values #Get the prediction for the observation y_predict=secondLayer.getClassification(X_test) #Calculate the number of misclassification error=(y_predict!=y_test).sum() totalErr+=error totalLen+=len(testSet[attack]) #Display attack name, total observation, number of misclassification and the accuracy print(attack,', misclassification rate: ',error,'/',len(testSet[attack]),', Accuracy: ',100-error*100/len(y_predict)) print('Total number of misclassification: ',totalErr) print('Overall accuracy: ',1-totalErr/totalLen) #Display the evaluation result #Loading the test set for the first layer test_set=util.pklReader('Testset',path=util.getResourcePath() +'/Pickle Files/Data for model construction/First Layer/') print('Evaluating the performance of the first layer ...') firstLayerEval(test_set) #Loading the test set for the second layer allAttack20=util.pklReader('AllAttack20',path=util.getResourcePath() +'/Pickle Files/Data for model construction/Second layer/') print('\nEvaluating the performance of the second layer ...') secondLayerEval(allAttack20)
''' import util import firstLayer import matplotlib.pyplot as plt from sklearn import metrics import numpy as np #Loading the real traffic real_traffic=util.dataConvRealTraff('HulkWithTime.pcap_ISCX') #Label the DoS as the attack target port 8080 real_traffic.ix[real_traffic[' Destination Port']==8080, ' Label'] = 1 #Load the test set of the evaluation dataset testDataSet=util.pklReader('Testset',path=util.getResourcePath()+'/Pickle Files/Data for model construction/First Layer/') def getRocInfo(data): #Exclude the label from observations x_test=data.drop(' Label',axis=1) y_test=data[' Label'].values #Get the prediction for the data pred=firstLayer.getPred(x_test) #Compute fpr, tpr fpr, tpr, threshold = metrics.roc_curve(y_test, pred) roc_auc=metrics.auc(fpr,tpr) return {'pred': pred, 'fpr': fpr, 'tpr': tpr, 'auc':roc_auc}
#If the dataset is created from PCA else: col = list(train_data.columns) featureList = col[:n] + [' Label'] data = train_data[featureList] testSet = test_data[featureList] return {'train': data, 'test': testSet} ## Perform hyperparameters for the first layer # Loading train and test set originalData = util.pklReader( 'Original', path=util.getResourcePath() + '/Pickle Files/Data for model construction/First layer/') testDataSet = util.pklReader( 'Testset', path=util.getResourcePath() + '/Pickle Files/Data for model construction/First layer/') impList = util.pklReader( 'ImportanceList', path=util.getResourcePath() + '/Pickle Files/Data for model construction/First layer/') pca_trainData = util.pklReader( 'PCA_data', path=util.getResourcePath() + '/Pickle Files/Data for model construction/First layer/') pca_testData = util.pklReader( 'PCA_testData',
def hyperparaTuning(data,
                    testSet,
                    expName,
                    mode=2,
                    storedPath=util.getResourcePath() +
                    '/Pickle Files/Models/First Layer/'):
    """Tune one algorithm with a randomized search and pickle the fitted search.

    Args:
        data: training data frame including the ' Label' column.
        testSet: test data frame including the ' Label' column.
        expName: experiment name; printed and used as the pickle name.
        mode: 1 selects Extra Trees, 2 selects LightGBM, any other value
            selects KNN (whose inputs are standardised first).
        storedPath: base folder; the model name is appended as a sub-folder.

    Returns:
        None. Prints the best parameters, the test AUC and the running time
        in minutes, then saves the search object through ``util.pklSaver``.
    """
    # Candidate hyperparameters for each algorithm
    searchSpaces = {
        'ExtraTrees': {
            "n_estimators": [150, 250, 350],
            "max_features": [None, 'sqrt', 'log2'],
            "min_samples_leaf": [64, 128, 256]
        },
        'LightGBM': {
            "learning_rate": [0.06, 0.08, 0.1],
            "num_leaves": [15, 31, 63],
            "max_bin": [63, 127, 255],
            "feature_fraction": [0.6, 0.8, 0.9]
        },
        'KNN': {
            "n_neighbors": np.arange(5, 47, 2),
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan", "chebyshev"]
        }
    }

    # One untrained estimator per algorithm
    estimators = {
        'ExtraTrees': ExtraTreesClassifier(),
        'LightGBM': lgbm.LGBMClassifier(objective='binary'),
        'KNN': KNeighborsClassifier()
    }

    # Split observations and labels for both sets
    trainData = data.drop(' Label', axis=1)
    y_train = data[' Label'].values
    testData = testSet.drop(' Label', axis=1)
    y_test = testSet[' Label'].values

    # Map the mode to the algorithm name
    if mode == 1:
        modelName = 'ExtraTrees'
    elif mode == 2:
        modelName = 'LightGBM'
    else:
        modelName = 'KNN'

    # KNN is the only algorithm trained on standardised data
    if modelName == 'KNN':
        scaler = StandardScaler()
        trainData = scaler.fit_transform(trainData)
        testData = scaler.transform(testData)

    # Cross-validated randomized search over the selected space
    grid = RandomizedSearchCV(estimators[modelName],
                              searchSpaces[modelName],
                              verbose=1,
                              cv=5,
                              n_jobs=1)
    startedAt = time.time()
    grid.fit(trainData, y_train)
    runningTime = (time.time() - startedAt) / 60

    # Evaluate the best model found on the test data
    probabilities = grid.predict_proba(testData)
    auc = roc_auc_score(y_test, probabilities[:, 1])
    print("Experiment: ", expName)
    print("Randomized search best parameters: {}".format(grid.best_params_))
    print("AUC of the best model: ", auc)
    print("Running time: ", runningTime)

    # Save the model
    util.pklSaver(grid, expName, path=storedPath + modelName + '/')
def firstLayerWeights():
    """Random-search the lightGBM weight of the LightGBM + Extra Trees blend.

    One hundred weights ``a`` are drawn uniformly from [0.5, 1). For each,
    the blended probability ``a * lightGBM + (1 - a) * ExtraTrees`` is scored
    by AUC on subset1. The best weight is then evaluated on subset2 and the
    AUC of each individual model is printed for comparison.

    Returns:
        The sampled lightGBM weight with the highest AUC on subset1.
    """
    resourcePath = util.getResourcePath()

    # Loading the two subsets of the test data
    subsetDir = resourcePath + '/Pickle Files/Data for model construction/Subset/'
    subset1 = util.pklReader('subset1', path=subsetDir)
    subset2 = util.pklReader('subset2', path=subsetDir)

    # Loading the best model of each algorithm
    et_model = util.pklReader(
        'Exp1', path=resourcePath + '/Pickle Files/Models/First Layer/ExtraTrees/')
    lgbm_model = util.pklReader(
        'Exp2', path=resourcePath + '/Pickle Files/Models/First Layer/LightGBM/')

    impList = util.pklReader(
        'ImportanceList',
        path=resourcePath + '/Pickle Files/Data for model construction/')

    # Extra Trees uses every column except the three header fields and the
    # label; LightGBM uses the first 35 entries of the importance list.
    colET = list(
        subset1.drop(
            [' Source Port', ' Destination Port', ' Protocol', ' Label'],
            axis=1).columns)
    colLGBM = impList[:35]

    # subset1 is used to search for the weight, subset2 to check it
    y_subset1 = subset1[' Label'].values
    y_subset2 = subset2[' Label'].values

    start = time.time()

    # The model outputs do not depend on the sampled weight, so they are
    # computed once rather than on each of the 100 trials.
    lgbmProba1 = lgbm_model.predict_proba(subset1[colLGBM])
    etProba1 = et_model.predict_proba(subset1[colET])

    # AUC score for each sampled lightGBM weight
    auc_score = {}
    for _ in range(100):
        # Randomly select a value for the lightGBM weight
        a = np.random.uniform(low=0.5, high=1)
        b = 1 - a
        predict = a * lgbmProba1 + b * etProba1
        auc = roc_auc_score(y_subset1, predict[:, 1])
        auc_score[a] = auc
        print('The weight of lightGBM: ', a, ', AUC score: ', auc)

    # Running time in seconds
    runningTime = time.time() - start

    # Get the weight of lightGBM which maximises the AUC score
    lightGBM_weight = max(auc_score, key=auc_score.get)

    # Display results
    print('Running time: ', runningTime)
    print('Max AUC: ', lightGBM_weight, auc_score[lightGBM_weight])

    # Compare the result with the individual algorithms
    print('LightGBM', roc_auc_score(y_subset1, lgbmProba1[:, 1]))
    print('ExtraTrees', roc_auc_score(y_subset1, etProba1[:, 1]))

    # For the second subset
    lgbmProba2 = lgbm_model.predict_proba(subset2[colLGBM])
    etProba2 = et_model.predict_proba(subset2[colET])
    predict = lightGBM_weight * lgbmProba2 + (1 - lightGBM_weight) * etProba2
    auc = roc_auc_score(y_subset2, predict[:, 1])
    print('\n The performance on the second subset')
    print('The ensemble model: ', auc)
    print('LightGBM', roc_auc_score(y_subset2, lgbmProba2[:, 1]))
    print('ExtraTrees', roc_auc_score(y_subset2, etProba2[:, 1]))
    return lightGBM_weight
attNames=['BruteForce','DoS','Web','Bot','PortScan','DDoS'] #The number of attack n_attack=len(attNames) #clfKey is a list represent 15 pairs of 6 attack types clfKeys=[] for i in range(n_attack-1): for j in range (i+1,n_attack): clfKeys+=[attNames[i]+'-'+attNames[j]] #Construct the second layer of the IDS system using one vs one approach #Using ExtraTrees classifier to construct 45 models et_clfModels={} lgbm_clfModels={} for clfkey in clfKeys: lgbm_clfModels[clfkey]=util.pklReader(clfkey,path=util.getResourcePath()+'/Pickle Files/Models/Second Layer/LightGBM/') #Using constructed models to predict the data def getClassification(X_test): initialData=np.zeros((len(X_test),n_attack)) predictResult=pd.DataFrame(initialData,columns=attNames) #Sum the probability of each classifier for clfkey in clfKeys: key=clfkey.split('-') lgbm_clfModel=lgbm_clfModels[clfkey] y_predict=lgbm_clfModel.predict_proba(X_test) predictResult[key[0]]+=y_predict[:,0] predictResult[key[1]]+=y_predict[:,1]