class PhotonOneClassSVM(BaseEstimator, ClassifierMixin):
    """Wrapper around ``OneClassSVM`` that is trained only on samples labelled ``1``.

    ``kernel`` and ``nu`` are mirrored as attributes of the wrapper; every
    other parameter lives on the wrapped estimator (``my_svm``) and is
    reachable through ``set_params`` / ``get_params``.
    """

    def __init__(self, kernel='rbf', nu=0.5):
        self.kernel = kernel
        self.nu = nu
        self.my_svm = None

    def _ensure_svm(self):
        """Return the wrapped estimator, creating it on first use."""
        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        return self.my_svm

    def fit(self, X, y=None):
        """Fit the wrapped SVM on the rows of ``X`` whose label equals ``1``.

        Args:
            X: Training samples; must support boolean-mask indexing when
                ``y`` is given.
            y: Optional labels. When ``None`` all rows of ``X`` are used
                (previously this indexed ``X[False]`` and trained on an
                empty selection).

        Returns:
            ``self``.
        """
        svm = self._ensure_svm()
        # Reuse the existing estimator instead of rebuilding it so parameters
        # forwarded through set_params (e.g. gamma) are not silently dropped.
        svm.set_params(kernel=self.kernel, nu=self.nu)
        samples = X if y is None else X[y == 1]
        svm.fit(samples)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.my_svm.predict(X)

    def score(self, X, y):
        """Return ``accuracy_score`` of the predictions for ``X`` against ``y``."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def set_params(self, **params):
        """Apply ``params`` to the wrapper and to the wrapped estimator.

        Returns:
            ``self``, as the scikit-learn estimator API expects, so that calls
            such as ``est = est.set_params(...)`` keep working.
        """
        if 'kernel' in params:
            self.kernel = params['kernel']
        if 'nu' in params:
            self.nu = params['nu']
        self._ensure_svm().set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the parameter dict of the wrapped estimator."""
        return self._ensure_svm().get_params(deep)
def __init__(self, params): super(%CLASS%, self).__init__(params) tmp = OneClassSVM() params = tmp.get_params() for key in params: self.create_new_input(type_="data", label=key, widget_name="std line edit m", widget_pos="besides", pos=-1) del tmp
def __init__(self, params): super(%CLASS%, self).__init__(params) tmp = OneClassSVM() params = tmp.get_params() for key in params: self.create_new_output(type_="data", label=key, pos=-1) del tmp self.create_new_output(type_="data", label="param dict", pos=-1)
def one_class_svm(training_vectors, test_vectors_clean, test_vectors_anomalous):
    """Fit a one-class SVM on the training vectors and classify both test sets.

    Progress messages are printed to stdout.

    Returns:
        A tuple ``(result_clean, result_anomalous, result_training)`` holding
        the predictions for the clean test set, the anomalous test set and
        the training set (the latter obtained from ``fit_predict``).
    """
    print("Starting One Class SVM...")

    # Model used for novelty prediction.
    model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)
    print("Fitting with Parameters: ", model.get_params())
    training_labels = model.fit_predict(training_vectors)
    print("Fitting successful!")

    print("Starting Prediction...")
    # predict() yields 1 for an inlier and -1 for an outlier.
    clean_labels, anomalous_labels = [
        model.predict(vectors)
        for vectors in (test_vectors_clean, test_vectors_anomalous)
    ]
    print("Predicting successful!")
    print("**************************")

    return clean_labels, anomalous_labels, training_labels
def get_model(self, queue=None):
    """Grid-search ``nu`` and ``gamma`` for a one-class SVM and pickle the best model.

    The estimator is fitted on the training vectors and scored against the
    white (expected inlier) and black (expected outlier) sample sets.
    Everything printed during the search goes to
    ``analog/log/train_log.txt``; the estimator refitted with the best-F1
    parameters is pickled to ``analog/cache/model.pkl`` (both below
    ``self.root_path``). ``self.complete`` is set to ``True`` at the end.

    Args:
        queue: Optional object with a ``put_nowait`` method. It receives the
            progress as a four-decimal fraction string per grid point and
            ``"1"`` once the search is over. ``None`` disables reporting.
    """
    from contextlib import redirect_stdout

    def report_progress(message):
        # ``queue`` is optional; calling put_nowait on None used to crash.
        if queue is not None:
            queue.put_nowait(message)

    log_path = os.path.join(self.root_path, "analog/log/train_log.txt")
    # Redirect stdout only for the duration of the search and close the log
    # file afterwards; previously stdout stayed hijacked and the file leaked.
    with open(log_path, 'w+') as log_file, redirect_stdout(log_file):
        start = datetime.now()
        print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

        train_example = []
        white_example = []
        black_example = []

        # read_txt is expected to fill the passed list in place
        # (helper not visible here -- confirm).
        self.read_txt(self.train_log_path, train_example)   # training set
        self.read_txt(self.test_black_path, black_example)  # black samples
        self.read_txt(self.test_white_path, white_example)  # white samples

        tf_idf_vector = TfidfVector()
        # Vectorised training samples.
        train_vector = tf_idf_vector.fit_vector
        # Vectorised white / black evaluation samples.
        test_normal_vector = tf_idf_vector.transform(white_example)
        test_abnormal_vector = tf_idf_vector.transform(black_example)
        y = [1] * len(train_example)

        # ================= parameter grid for nu and gamma =================
        grid = {
            'gamma': np.logspace(-8, 1, 10),
            'nu': np.linspace(0.01, 0.20, 20),
        }

        # Kernel function (rbf, linear, poly).
        kernel = 'rbf'

        # Best precision / recall / F1 seen so far and the parameters that
        # produced them.
        max_F1 = 0
        max_Re = 0
        max_Pr = 0
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0

        svdd = OneClassSVM(kernel=kernel)

        # Early-skip bookkeeping: once a gamma has produced four all-zero
        # results, its remaining grid points are skipped to save time.
        zero_count = 0
        re_gamma = 0

        candidates = ParameterGrid(grid)
        total_loop = len(candidates)
        for process_count, z in enumerate(candidates, start=1):
            report_progress("{:0.4f}".format(process_count / total_loop))

            if re_gamma == z.get('gamma'):
                if zero_count >= 4:
                    continue
            else:
                zero_count = 0

            svdd.set_params(**z)
            svdd.fit(train_vector, y)
            k = svdd.get_params()

            # White samples: predicted 1 -> true positive, -1 -> false negative.
            normal_pred = svdd.predict(test_normal_vector).tolist()
            true_pos = normal_pred.count(1)
            false_neg = normal_pred.count(-1)

            # Black samples: predicted 1 -> false positive.
            false_pos = svdd.predict(test_abnormal_vector).tolist().count(1)

            Precision = 0 if true_pos == 0 else true_pos / (true_pos + false_pos)
            Recall = 0 if true_pos == 0 else true_pos / (true_pos + false_neg)

            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')
            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')
            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            print(
                "========================== [{}] ===========================".
                format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            print(
                "nu: ",
                k.get('nu'),
                'gamma',
                k.get('gamma'),
            )
            print("Precision: {}%".format(Precision * 100))
            print("Recall: {}%".format(Recall * 100))
            print("F1 score: {}".format(F1_score))

        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "MAX Precision: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print(
            "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Re, nu_r_Re, gamma_r_Re))
        print(
            "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        print("Cost {}s.".format(total_second.total_seconds()))
        report_progress("1")

    # Refit with the best-F1 parameters *before* opening the model file so a
    # failing fit cannot leave a truncated model.pkl behind.
    # NOTE: if no grid point reached a non-zero F1, nu_r_F1 is still 0, which
    # lies outside the (0, 1] range documented for OneClassSVM's nu.
    svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
    svdd.fit(train_vector, y)
    with open(os.path.join(self.root_path, "analog/cache/model.pkl"),
              'wb') as file:
        pickle.dump(svdd, file)
    self.complete = True
def get_model(self):
    """Grid-search ``nu`` and ``gamma`` for a one-class SVM and pickle the best model.

    The estimator is fitted on the training vectors (the training file is
    described as curated XSS payloads) and scored against an XSS sample set
    (expected inliers) and a non-XSS sample set (expected outliers).
    Progress is printed to stdout. The estimator refitted with the best-F1
    parameters is pickled to ``ModuleTrain/cache/model.pkl`` below
    ``self.root_path`` and ``self.complete`` is set to ``True``.
    """
    start = datetime.now()
    print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

    train_example = []
    xss_example = []
    non_xss_example = []

    # read_txt is expected to fill the passed list in place
    # (helper not visible here -- confirm).
    self.read_txt(self.train_path, train_example)              # training set
    self.read_txt(self.test_none_xss_path, non_xss_example)    # normal requests
    self.read_txt(self.test_xss_path, xss_example)             # attack requests

    # Vectorised training samples.
    tf_idf_vector = TfIdfVector()
    train_vector = tf_idf_vector.fit_vector

    # Vectorised evaluation samples. XSS requests are the positive
    # (inlier) class for this model, non-XSS requests the negative one.
    xss_vector = tf_idf_vector.transform(xss_example)
    non_xss_vector = tf_idf_vector.transform(non_xss_example)
    y = [1] * len(train_example)

    # Parameter grid for nu and gamma.
    grid = {
        'gamma': np.logspace(-8, 1, 10),
        'nu': np.linspace(0.01, 0.20, 20),
    }

    # Kernel function (rbf, linear, poly).
    kernel = 'rbf'

    # Best precision / recall / F1 seen so far and the parameters that
    # produced them.
    max_F1 = 0
    max_Re = 0
    max_Pr = 0
    gamma_r_F1 = 0.01
    gamma_r_Re = 0.01
    gamma_r_Pr = 0.01
    nu_r_F1 = 0
    nu_r_Re = 0
    nu_r_Pr = 0

    svdd = OneClassSVM(kernel=kernel)

    # Early-skip bookkeeping: once a gamma has produced four all-zero
    # results, its remaining grid points are skipped to save time.
    zero_count = 0
    re_gamma = 0

    for z in ParameterGrid(grid):
        if re_gamma == z.get('gamma'):
            if zero_count >= 4:
                continue
        else:
            zero_count = 0

        svdd.set_params(**z)
        svdd.fit(train_vector, y)
        k = svdd.get_params()

        # Attack samples: predicted 1 -> true positive, -1 -> false negative.
        xss_pred = svdd.predict(xss_vector).tolist()
        true_pos = xss_pred.count(1)
        false_neg = xss_pred.count(-1)

        # Non-attack samples: predicted 1 -> false positive.
        false_pos = svdd.predict(non_xss_vector).tolist().count(1)

        Precision = 0 if true_pos == 0 else true_pos / (true_pos + false_pos)
        Recall = 0 if true_pos == 0 else true_pos / (true_pos + false_neg)

        if Recall == 0 or Precision == 0:
            F1_score = 0
            zero_count += 1
            re_gamma = k.get('gamma')
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall)

        if F1_score > max_F1:
            max_F1 = F1_score
            nu_r_F1 = k.get('nu')
            gamma_r_F1 = k.get('gamma')
        if Recall > max_Re:
            max_Re = Recall
            nu_r_Re = k.get('nu')
            gamma_r_Re = k.get('gamma')
        if Precision > max_Pr:
            max_Pr = Precision
            nu_r_Pr = k.get('nu')
            gamma_r_Pr = k.get('gamma')

        print(
            "========================== [{}] ===========================".
            format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
        print(
            "nu: ",
            k.get('nu'),
            'gamma',
            k.get('gamma'),
        )
        print("Precision: {}%".format(Precision * 100))
        print("Recall: {}%".format(Recall * 100))
        print("F1 score: {}".format(F1_score))

    print("========================== [{}] ===========================".
          format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
    print(
        "MAX Precision: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_Pr, nu_r_Pr, gamma_r_Pr))
    print(
        "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_Re, nu_r_Re, gamma_r_Re))
    print(
        "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
        .format(max_F1, nu_r_F1, gamma_r_F1))
    total_second = datetime.now() - start
    print("Cost {}s.".format(total_second.total_seconds()))

    # Refit with the best-F1 parameters *before* opening the model file so a
    # failing fit cannot leave a truncated model.pkl behind.
    # NOTE: if no grid point reached a non-zero F1, nu_r_F1 is still 0, which
    # lies outside the (0, 1] range documented for OneClassSVM's nu.
    svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
    svdd.fit(train_vector, y)
    with open(os.path.join(self.root_path, "ModuleTrain/cache/model.pkl"),
              'wb') as file:
        pickle.dump(svdd, file)
    self.complete = True
def get_model(self, queue=None):
    """Grid-search ``nu`` and ``gamma`` for a one-class SVM and pickle the best model.

    The estimator is fitted on the training vectors and scored against the
    white (expected inlier) and black (expected outlier) sample sets.
    Progress is written through a ``Logger`` created inside this method. The
    estimator refitted with the best-F1 parameters is pickled to
    ``analog/cache/model.pkl`` below ``self.root_path`` and
    ``self.complete`` is set to ``True``.

    Args:
        queue: Optional object with a ``put_nowait`` method. It receives the
            progress as a four-decimal fraction string per grid point and
            ``"1"`` once the search is over. ``None`` disables reporting.
    """
    def report_progress(message):
        # ``queue`` is optional; calling put_nowait on None used to crash.
        if queue is not None:
            queue.put_nowait(message)

    start = datetime.now()

    # Since logger is not pickleable until python 3.7,
    # we can init logger within this function.
    train_logger = Logger(logger_name="train_logger", log_path=self.log_path)
    train_logger.register_log_function("calc", "CALCU")
    train_logger.register_log_function("split", "SPLIT")
    train_logger.register_log_function("start", "START")
    train_logger.register_log_function("end", "-END-")
    train_logger.register_log_function("result", "RESULT")

    train_logger.start("Start Training.")

    train_example = []
    white_example = []
    black_example = []

    pattern = self.config.get(self.section_name_log, 'log_content_pattern')

    # read_by_group is expected to fill the passed list in place
    # (helper not visible here -- confirm).
    # NOTE(review): train_example is not used afterwards in this method; the
    # call is kept in case the helper has other effects.
    read_by_group(self.train_log_path, train_example, pattern=pattern)   # training set
    read_by_group(self.test_black_path, black_example, pattern=pattern)  # black samples
    read_by_group(self.test_white_path, white_example, pattern=pattern)  # white samples

    # Vectorised training samples.
    tf_idf_vector = TfidfVector(self.root_path, self.config)
    train_vector = tf_idf_vector.fit_vector

    # Vectorised white / black evaluation samples.
    test_normal_vector = tf_idf_vector.transform(white_example)
    test_abnormal_vector = tf_idf_vector.transform(black_example)

    # ================= parameter grid for nu and gamma =================
    grid = {'gamma': np.logspace(-9, 1, 10),
            'nu': np.linspace(0.00001, 0.2, 100)}

    # Kernel function (rbf, linear, poly).
    kernel = 'rbf'

    # Best precision / recall / F1 seen so far and the parameters that
    # produced them.
    max_F1 = 0
    max_Re = 0
    max_Pr = 0
    gamma_r_F1 = 0.01
    gamma_r_Re = 0.01
    gamma_r_Pr = 0.01
    nu_r_F1 = 0
    nu_r_Re = 0
    nu_r_Pr = 0

    # Early-skip bookkeeping: once a gamma has produced six all-zero
    # results, its remaining grid points are skipped to save time.
    zero_count = 0
    re_gamma = 0

    candidates = ParameterGrid(grid)
    total_loop = len(candidates)
    for process_count, z in enumerate(candidates, start=1):
        report_progress("{:0.4f}".format(process_count / total_loop))

        if re_gamma == z.get('gamma'):
            if zero_count >= 6:
                continue
        else:
            zero_count = 0

        # Pass the kernel explicitly so the ``kernel`` setting above is
        # honoured by the search as well as by the final model.
        svdd = OneClassSVM(kernel=kernel, **z)
        svdd.fit(train_vector)
        k = svdd.get_params()

        # White samples: predicted 1 -> true positive, -1 -> false negative.
        normal_pred = svdd.predict(test_normal_vector).tolist()
        true_pos = normal_pred.count(1)
        false_neg = normal_pred.count(-1)

        # Black samples: predicted 1 -> false positive.
        false_pos = svdd.predict(test_abnormal_vector).tolist().count(1)

        Precision = 0 if true_pos == 0 else true_pos / (true_pos + false_pos)
        Recall = 0 if true_pos == 0 else true_pos / (true_pos + false_neg)

        if Recall == 0 or Precision == 0:
            F1_score = 0
            zero_count += 1
            re_gamma = k.get('gamma')
        else:
            F1_score = 2 * Precision * Recall / (Precision + Recall)

        if F1_score > max_F1:
            max_F1 = F1_score
            nu_r_F1 = k.get('nu')
            gamma_r_F1 = k.get('gamma')
        if Recall > max_Re:
            max_Re = Recall
            nu_r_Re = k.get('nu')
            gamma_r_Re = k.get('gamma')
        if Precision > max_Pr:
            max_Pr = Precision
            nu_r_Pr = k.get('nu')
            gamma_r_Pr = k.get('gamma')

        train_logger.split("=" * 60)
        train_logger.calc("nu: %.08f , gamma: %.04f" % (k.get('nu'), k.get('gamma')))
        train_logger.calc("precision: {}%".format(Precision * 100))
        train_logger.calc("recall: {}%".format(Recall * 100))
        train_logger.calc("f1 score: {}".format(F1_score))

    train_logger.split("=" * 60)
    train_logger.result(
        "MAX Precision:{:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Pr, nu_r_Pr, gamma_r_Pr))
    train_logger.result(
        "MAX Recall: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_Re, nu_r_Re, gamma_r_Re))
    train_logger.result(
        "MAX F1: {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}".format(max_F1, nu_r_F1, gamma_r_F1))

    total_second = datetime.now() - start
    train_logger.end("Cost {}s.".format(total_second.total_seconds()))
    report_progress("1")

    # Fit the final model with the best-F1 parameters *before* opening the
    # model file so a failing fit cannot leave a truncated model.pkl behind.
    # NOTE: if no grid point reached a non-zero F1, nu_r_F1 is still 0, which
    # lies outside the (0, 1] range documented for OneClassSVM's nu.
    svdd = OneClassSVM(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
    svdd.fit(train_vector)
    with open(os.path.join(self.root_path, "analog/cache/model.pkl"), 'wb') as file:
        pickle.dump(svdd, file)
    self.complete = True