class PhotonOneClassSVM(BaseEstimator, ClassifierMixin):
    """Wrapper around ``OneClassSVM`` that trains only on samples labelled 1.

    The wrapped estimator is kept in ``my_svm``; ``kernel`` and ``nu`` are
    mirrored on the wrapper, every other hyperparameter lives on ``my_svm``
    only.

    NOTE(review): ``get_params`` returns the wrapped estimator's full
    parameter dict, which contains keys ``__init__`` does not accept, so
    utilities that rebuild the estimator from ``get_params`` may not work --
    confirm against how callers use it.
    """

    def __init__(self, kernel='rbf', nu=0.5):
        self.kernel = kernel
        self.nu = nu
        self.my_svm = None

    def fit(self, X, y=None):
        """Fit the wrapped SVM on the rows of ``X`` whose label equals 1.

        Args:
            X: training samples; must support boolean-mask row indexing
                when ``y`` is given.
            y: optional labels. When omitted, every row of ``X`` is used
                (previously ``X[None == 1]`` selected nothing).

        Returns:
            self
        """
        import numpy as np  # local import: keeps the block self-contained

        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        else:
            # Reuse the existing estimator so hyperparameters that were set
            # through set_params (e.g. gamma) are not silently discarded.
            self.my_svm.set_params(kernel=self.kernel, nu=self.nu)

        if y is None:
            inliers = X
        else:
            # np.asarray makes the comparison element-wise for plain lists.
            inliers = X[np.asarray(y) == 1]
        self.my_svm.fit(inliers)
        return self

    def predict(self, X):
        """Return the wrapped SVM's predictions for ``X``."""
        return self.my_svm.predict(X)

    def score(self, X, y):
        """Return ``accuracy_score`` of the predictions for ``X`` against ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def set_params(self, **params):
        """Set hyperparameters on the wrapper and on the wrapped SVM.

        ``kernel`` and ``nu`` are mirrored on the wrapper; all given
        parameters are forwarded to the wrapped estimator.

        Returns:
            self, so calls can be chained.
        """
        if 'kernel' in params:
            self.kernel = params['kernel']
        if 'nu' in params:
            self.nu = params['nu']
        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        self.my_svm.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the wrapped SVM's parameter dict, creating the SVM if needed."""
        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        return self.my_svm.get_params(deep)
def update_event(self, input_called=-1):
    """Fit a fresh ``OneClassSVM`` when input 0 is triggered and emit it.

    Args:
        input_called: index of the input that triggered the update; anything
            other than 0 is ignored.

    Input 1 optionally supplies a mapping of estimator parameters, input 2
    supplies the training data. The fitted estimator is written to output 1
    and output 0 is then executed.
    """
    if input_called != 0:
        return

    clf = OneClassSVM()
    # Read the parameter input once and compare by identity: ``!= None``
    # delegates to the value's own __ne__, which need not return a bool.
    params = self.input(1)
    if params is not None:
        clf.set_params(**params)

    X = self.input(2)
    clf.fit(X)
    self.set_output_val(1, clf)
    self.exec_output(0)
def get_model(self, queue=None):
    """Grid-search a one-class SVM over nu/gamma and pickle the best-F1 model.

    Args:
        queue: optional object with ``put_nowait``; receives the progress
            ratio as a ``"{:0.4f}"`` string per grid point and ``"1"`` when
            the search is finished. May be ``None`` (no progress reporting).

    Side effects:
        * ``print`` output produced during training is written to
          ``analog/log/train_log.txt`` under ``self.root_path``; stdout is
          restored and the log file closed when training ends or fails.
        * The model refitted with the best-F1 parameters is pickled to
          ``analog/cache/model.pkl`` under ``self.root_path``.
        * ``self.complete`` is set to ``True`` on success.

    NOTE(review): if no grid point reaches a non-zero F1, ``nu`` is still 0
    for the final fit; scikit-learn is expected to reject that value --
    confirm. The final fit now runs before the model file is opened, so a
    failing fit no longer leaves a truncated ``model.pkl`` behind.
    """
    import contextlib  # local import: only needed for the stdout redirection

    def _report_progress(message):
        # ``queue`` defaults to None, so only publish when one was supplied.
        if queue is not None:
            queue.put_nowait(message)

    train_log_file = os.path.join(self.root_path, "analog/log/train_log.txt")
    # Redirect stdout only for the duration of the training run.
    with open(train_log_file, 'w+') as log_file, \
            contextlib.redirect_stdout(log_file):
        start = datetime.now()
        print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

        train_example = []
        white_example = []
        black_example = []
        # Load the training set.
        self.read_txt(self.train_log_path, train_example)
        # Load the black (abnormal) sample set.
        self.read_txt(self.test_black_path, black_example)
        # Load the white (normal) sample set.
        self.read_txt(self.test_white_path, white_example)

        tf_idf_vector = TfidfVector()
        # Feature vectors of the training samples.
        # NOTE(review): train_example is only used for its length below;
        # presumably TfidfVector loads the training data itself -- confirm.
        train_vector = tf_idf_vector.fit_vector
        # Feature vectors of the white/black evaluation samples.
        test_normal_vector = tf_idf_vector.transform(white_example)
        test_abnormal_vector = tf_idf_vector.transform(black_example)
        y = [1] * len(train_example)

        # Sweep the tuning parameters nu and gamma.
        grid = {
            'gamma': np.logspace(-8, 1, 10),
            'nu': np.linspace(0.01, 0.20, 20),
        }
        # Kernel function (rbf, linear, poly).
        kernel = 'rbf'
        # Best precision, recall and F1 seen so far.
        max_F1 = 0
        max_Re = 0
        max_Pr = 0
        # gamma at the best precision, recall and F1.
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01
        # nu at the best precision, recall and F1.
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0

        svdd = OneClassSVM(kernel=kernel)
        zero_count = 0  # zero-score results seen for the gamma in re_gamma
        re_gamma = 0    # gamma of the most recent zero-score result
        candidates = ParameterGrid(grid)
        total_loop = len(candidates)
        process_count = 0
        for z in candidates:
            process_count += 1
            _report_progress("{:0.4f}".format(process_count / total_loop))
            # Skip the remaining nu values of a gamma that already produced
            # four zero-score results; this relies on the grid yielding all
            # nu values of one gamma consecutively.
            if re_gamma == z.get('gamma'):
                if zero_count >= 4:
                    continue
            else:
                zero_count = 0

            svdd.set_params(**z)
            svdd.fit(train_vector, y)
            k = svdd.get_params()

            # Evaluate on the normal samples.
            normal_pred = svdd.predict(test_normal_vector).tolist()
            TP = normal_pred.count(1)   # true positives
            FN = normal_pred.count(-1)  # false negatives
            # Evaluate on the abnormal samples.
            abnormal_pred = svdd.predict(test_abnormal_vector).tolist()
            FP = abnormal_pred.count(1)  # false positives

            Precision = 0 if TP == 0 else (TP / (TP + FP))
            Recall = 0 if TP == 0 else (TP / (TP + FN))
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')
            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')
            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            print(
                "========================== [{}] ===========================".
                format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            print(
                "nu: ",
                k.get('nu'),
                'gamma',
                k.get('gamma'),
            )
            print("Precision: {}%".format(Precision * 100))
            print("Recall: {}%".format(Recall * 100))
            print("F1 score: {}".format(F1_score))

        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "MAX Precision: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print(
            "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Re, nu_r_Re, gamma_r_Re))
        print(
            "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        print("Cost {}s.".format(total_second.total_seconds()))
        _report_progress("1")

        # Refit with the best-F1 parameters before touching the model file.
        svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
        svdd.fit(train_vector, y)
        with open(os.path.join(self.root_path, "analog/cache/model.pkl"),
                  'wb') as file:
            pickle.dump(svdd, file)
    self.complete = True
# Rank the features with RFE around clsfRFE, fitting against an all-ones
# label vector, and write the indices whose ranking_ equals 1 to
# "<sTargetPhone>_<sTrial>.rank".
selector = RFE(clsfRFE)
selector.fit(arTraining_std,np.ones(arTraining.shape[0]))
with open(sTargetPhone+'_'+sTrial+'.rank','w') as fRank:
    Indx = np.where(selector.ranking_==1)[0]
    print(Indx,file=fRank)
# Trackers for the best configuration; none of them is updated within the
# visible part of the loop.
fMaxAcc = -100.0
sMAXparam=''
clfMax = ''
idxmax = 0
# The outer loop runs exactly once, with indx being the range over all
# feature columns (the RFE-selected Indx alternative is disabled).
for indx in (range(arTraining_std.shape[1]),):#Indx):
    for kernel in ['rbf']:#,'sigmoid']:#['poly', 'rbf', 'sigmoid']:
        for nu in [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.15,0.2,0.25,0.3,0.35,0.4]:
            for gam in [0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.3,0.5]:#,1,10,100]:
                #print ('nu=',nu,'gamma=',gam,'kernel=',kernel)
                # Text label describing this parameter combination.
                param = str(nu)+' '+str(gam) + ' '+ kernel
                clsf.set_params(**{'nu':nu,'gamma':gam,'kernel':kernel})
                #gsClsf = GridSearchCV(clsf,dParams,cv=tCVIndxs,scoring='scorer')
                #print('Training The Model')
                # Train on the selected feature columns only.
                arTr = arTraining_std[:,indx]
                clsf.fit(arTr)
                #clsf.fit(arCVData,arCVLab)
                #print('Prediction')
                arV = arValidation_std[:,indx]
                y_pre = clsf.predict(arV)
                param = param + ' ' + str(indx).replace('\n','')
                #print (accuracy_score(y_valid_ref,y_pre))
                fCurAcc = f1_score(y_valid_ref,y_pre)#accuracy_score(y_valid_ref,y_pre)
                # Accuracy over the first iNumInClass validation entries
                # (presumably the in-class samples -- confirm); exp(-100) is
                # added so the log below is defined when the accuracy is 0.
                xIn = accuracy_score(y_valid_ref[:iNumInClass],y_pre[:iNumInClass]) + math.exp(-100)
                #print('x=',x)
                fscoreIn = math.log(xIn)
                # NOTE(review): the loop body appears to continue beyond
                # this excerpt.
                fscoreOut = 0.0
def get_model(self):
    """Grid-search a one-class SVM over nu/gamma and pickle the best-F1 model.

    The model is trained on the XSS payload training set, so the in-class
    (+1) samples are the attack requests and the out-of-class samples are
    the normal requests.

    Side effects:
        * Progress and results are printed to stdout.
        * The model refitted with the best-F1 parameters is pickled to
          ``ModuleTrain/cache/model.pkl`` under ``self.root_path``.
        * ``self.complete`` is set to ``True`` on success.

    NOTE(review): if no grid point reaches a non-zero F1, ``nu`` is still 0
    for the final fit; scikit-learn is expected to reject that value --
    confirm. The final fit now runs before the model file is opened, so a
    failing fit no longer leaves a truncated ``model.pkl`` behind.
    """
    start = datetime.now()
    print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

    train_example = []
    xss_example = []
    non_xss_example = []
    # Load the training set (curated XSS payloads).
    self.read_txt(self.train_path, train_example)
    # Load the normal request samples.
    self.read_txt(self.test_none_xss_path, non_xss_example)
    # Load the attack request samples.
    self.read_txt(self.test_xss_path, xss_example)

    # Feature vectors of the training samples.
    # NOTE(review): train_example is only used for its length below;
    # presumably TfIdfVector loads the training data itself -- confirm.
    tf_idf_vector = TfIdfVector()
    train_vector = tf_idf_vector.fit_vector
    # Feature vectors of the evaluation samples: attacks are in-class,
    # normal requests are out-of-class.
    in_class_vector = tf_idf_vector.transform(xss_example)
    out_class_vector = tf_idf_vector.transform(non_xss_example)
    y = [1] * len(train_example)

    # Sweep the tuning parameters nu and gamma.
    grid = {
        'gamma': np.logspace(-8, 1, 10),
        'nu': np.linspace(0.01, 0.20, 20),
    }
    # Kernel function (rbf, linear, poly).
    kernel = 'rbf'
    # Best precision, recall and F1 seen so far.
    max_F1 = 0
    max_Re = 0
    max_Pr = 0
    # gamma at the best precision, recall and F1.
    gamma_r_F1 = 0.01
    gamma_r_Re = 0.01
    gamma_r_Pr = 0.01
    # nu at the best precision, recall and F1.
    nu_r_F1 = 0
    nu_r_Re = 0
    nu_r_Pr = 0

    svdd = OneClassSVM(kernel=kernel)
    zero_count = 0  # zero-score results seen for the gamma in re_gamma
    re_gamma = 0    # gamma of the most recent zero-score result
    for z in ParameterGrid(grid):
        # Skip the remaining nu values of a gamma that already produced
        # four zero-score results; this relies on the grid yielding all nu
        # values of one gamma consecutively.
        if re_gamma == z.get('gamma'):
            if zero_count >= 4:
                continue
        else:
            zero_count = 0

        svdd.set_params(**z)
        svdd.fit(train_vector, y)
        k = svdd.get_params()

        # Evaluate on the attack request samples.
        in_class_pred = svdd.predict(in_class_vector).tolist()
        TP = in_class_pred.count(1)   # true positives
        FN = in_class_pred.count(-1)  # false negatives
        # Evaluate on the non-attack samples.
        out_class_pred = svdd.predict(out_class_vector).tolist()
        FP = out_class_pred.count(1)  # false positives

        Precision = 0 if TP == 0 else (TP / (TP + FP))
        Recall = 0 if TP == 0 else (TP / (TP + FN))
        if Recall == 0 or Precision == 0:
            F1_score = 0
            zero_count += 1
            re_gamma = k.get('gamma')
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall)

        if F1_score > max_F1:
            max_F1 = F1_score
            nu_r_F1 = k.get('nu')
            gamma_r_F1 = k.get('gamma')
        if Recall > max_Re:
            max_Re = Recall
            nu_r_Re = k.get('nu')
            gamma_r_Re = k.get('gamma')
        if Precision > max_Pr:
            max_Pr = Precision
            nu_r_Pr = k.get('nu')
            gamma_r_Pr = k.get('gamma')

        print(
            "========================== [{}] ===========================".
            format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "nu: ",
            k.get('nu'),
            'gamma',
            k.get('gamma'),
        )
        print("Precision: {}%".format(Precision * 100))
        print("Recall: {}%".format(Recall * 100))
        print("F1 score: {}".format(F1_score))

    print("========================== [{}] ===========================".
          format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
    print(
        "MAX Precision: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_Pr, nu_r_Pr, gamma_r_Pr))
    print(
        "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_Re, nu_r_Re, gamma_r_Re))
    print(
        "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_F1, nu_r_F1, gamma_r_F1))
    total_second = datetime.now() - start
    print("Cost {}s.".format(total_second.total_seconds()))

    # Refit with the best-F1 parameters before touching the model file.
    svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
    svdd.fit(train_vector, y)
    with open(os.path.join(self.root_path, "ModuleTrain/cache/model.pkl"),
              'wb') as file:
        pickle.dump(svdd, file)
    self.complete = True
def get_model(self, queue=None):
    """Grid-search a one-class SVM over nu/gamma and pickle the best-F1 model.

    Args:
        queue: optional object with ``put_nowait``; receives the progress
            ratio as a ``"{:0.4f}"`` string per grid point and ``"1"`` when
            the search is finished. May be ``None`` (no progress reporting).

    Side effects:
        * Progress and results are written through a ``Logger`` created
          with ``self.log_path``.
        * The model refitted with the best-F1 parameters is pickled to
          ``analog/cache/model.pkl`` under ``self.root_path``.
        * ``self.complete`` is set to ``True`` on success.

    NOTE(review): if no grid point reaches a non-zero F1, ``nu`` is still 0
    for the final fit; scikit-learn is expected to reject that value --
    confirm. The final fit now runs before the model file is opened, so a
    failing fit no longer leaves a truncated ``model.pkl`` behind.
    """
    def _report_progress(message):
        # ``queue`` defaults to None, so only publish when one was supplied.
        if queue is not None:
            queue.put_nowait(message)

    start = datetime.now()
    # Since logger is not pickleable until python 3.7,
    # we can init logger within this function.
    train_logger = Logger(logger_name="train_logger", log_path=self.log_path)
    train_logger.register_log_function("calc", "CALCU")
    train_logger.register_log_function("split", "SPLIT")
    train_logger.register_log_function("start", "START")
    train_logger.register_log_function("end", "-END-")
    train_logger.register_log_function("result", "RESULT")
    train_logger.start("Start Training.")

    train_example = []
    white_example = []
    black_example = []
    pattern = self.config.get(self.section_name_log, 'log_content_pattern')
    # Load the training set.
    read_by_group(self.train_log_path, train_example, pattern=pattern)
    # Load the black (abnormal) sample set.
    read_by_group(self.test_black_path, black_example, pattern=pattern)
    # Load the white (normal) sample set.
    read_by_group(self.test_white_path, white_example, pattern=pattern)

    # Feature vectors of the training samples.
    # NOTE(review): train_example is not used after loading; presumably
    # TfidfVector loads the training data itself -- confirm.
    tf_idf_vector = TfidfVector(self.root_path, self.config)
    train_vector = tf_idf_vector.fit_vector
    # Feature vectors of the white/black evaluation samples.
    test_normal_vector = tf_idf_vector.transform(white_example)
    test_abnormal_vector = tf_idf_vector.transform(black_example)

    # Sweep the tuning parameters nu and gamma.
    grid = {
        'gamma': np.logspace(-9, 1, 10),
        'nu': np.linspace(0.00001, 0.2, 100),
    }
    # Kernel function (rbf, linear, poly).
    kernel = 'rbf'
    # Best precision, recall and F1 seen so far.
    max_F1 = 0
    max_Re = 0
    max_Pr = 0
    # gamma at the best precision, recall and F1.
    gamma_r_F1 = 0.01
    gamma_r_Re = 0.01
    gamma_r_Pr = 0.01
    # nu at the best precision, recall and F1.
    nu_r_F1 = 0
    nu_r_Re = 0
    nu_r_Pr = 0

    svdd = OneClassSVM(kernel=kernel)
    zero_count = 0  # zero-score results seen for the gamma in re_gamma
    re_gamma = 0    # gamma of the most recent zero-score result
    candidates = ParameterGrid(grid)
    total_loop = len(candidates)
    process_count = 0
    for z in candidates:
        process_count += 1
        _report_progress("{:0.4f}".format(process_count / total_loop))
        # Skip the remaining nu values of a gamma that already produced six
        # zero-score results; this relies on the grid yielding all nu
        # values of one gamma consecutively.
        if re_gamma == z.get('gamma'):
            if zero_count >= 6:
                continue
        else:
            zero_count = 0

        # Pass the kernel explicitly so the candidates are evaluated with
        # the same kernel the final model is fitted with.
        svdd = OneClassSVM(kernel=kernel, **z)
        svdd.fit(train_vector)
        k = svdd.get_params()

        # Evaluate on the normal samples.
        normal_pred = svdd.predict(test_normal_vector).tolist()
        TP = normal_pred.count(1)   # true positives
        FN = normal_pred.count(-1)  # false negatives
        # Evaluate on the abnormal samples.
        abnormal_pred = svdd.predict(test_abnormal_vector).tolist()
        FP = abnormal_pred.count(1)  # false positives

        Precision = 0 if TP == 0 else (TP / (TP + FP))
        Recall = 0 if TP == 0 else (TP / (TP + FN))
        if Recall == 0 or Precision == 0:
            F1_score = 0
            zero_count += 1
            re_gamma = k.get('gamma')
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall)

        if F1_score > max_F1:
            max_F1 = F1_score
            nu_r_F1 = k.get('nu')
            gamma_r_F1 = k.get('gamma')
        if Recall > max_Re:
            max_Re = Recall
            nu_r_Re = k.get('nu')
            gamma_r_Re = k.get('gamma')
        if Precision > max_Pr:
            max_Pr = Precision
            nu_r_Pr = k.get('nu')
            gamma_r_Pr = k.get('gamma')

        train_logger.split("=" * 60)
        train_logger.calc("nu: %.08f , gamma: %.04f" % (k.get('nu'), k.get('gamma')))
        train_logger.calc("precision: {}%".format(Precision * 100))
        train_logger.calc("recall: {}%".format(Recall * 100))
        train_logger.calc("f1 score: {}".format(F1_score))

    train_logger.split("=" * 60)
    train_logger.result(
        "MAX Precision:{:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Pr, nu_r_Pr, gamma_r_Pr))
    train_logger.result(
        "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Re, nu_r_Re, gamma_r_Re))
    train_logger.result(
        "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_F1, nu_r_F1, gamma_r_F1))
    total_second = datetime.now() - start
    train_logger.end("Cost {}s.".format(total_second.total_seconds()))
    _report_progress("1")

    # Refit with the best-F1 parameters before touching the model file.
    svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
    svdd.fit(train_vector)
    with open(os.path.join(self.root_path, "analog/cache/model.pkl"),
              'wb') as file:
        pickle.dump(svdd, file)
    self.complete = True