Example #1
# A scikit-learn compatible wrapper around OneClassSVM that trains on the
# positive class only and reports plain accuracy as its score.
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.svm import OneClassSVM


class PhotonOneClassSVM(BaseEstimator, ClassifierMixin):
    def __init__(self, kernel='rbf', nu=0.5):
        self.kernel = kernel
        self.nu = nu
        self.my_svm = None

    def fit(self, X, y=None):
        self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        # Train the one-class model on the positive samples only.
        self.my_svm.fit(X[y == 1])
        return self

    def predict(self, X):
        return self.my_svm.predict(X)

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def set_params(self, **params):
        if 'kernel' in params:
            self.kernel = params['kernel']
        if 'nu' in params:
            self.nu = params['nu']
        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        self.my_svm.set_params(**params)
        return self  # scikit-learn expects set_params to return the estimator

    def get_params(self, deep=True):
        if self.my_svm is None:
            self.my_svm = OneClassSVM(kernel=self.kernel, nu=self.nu)
        return self.my_svm.get_params(deep=deep)
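
A minimal usage sketch for this wrapper; the data, labels, and parameter values below are illustrative assumptions, not part of the original project:

# Hypothetical usage: fit on the positive class only, then flag outliers.
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.normal(size=(200, 4))
y_train = np.ones(200, dtype=int)                      # fit() only uses rows where y == 1
X_test = np.vstack([rng.normal(size=(20, 4)),          # in-distribution samples
                    rng.uniform(-8, 8, size=(5, 4))])  # crude outliers

model = PhotonOneClassSVM(kernel='rbf', nu=0.1)
model.fit(X_train, y_train)
print(model.predict(X_test))    # OneClassSVM outputs +1 (inlier) / -1 (outlier)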
Example #2
    def update_event(self, input_called=-1):
        # Node-style wrapper: build a OneClassSVM, apply optional parameters from
        # input 1, fit it on the data from input 2, and expose the fitted model.
        if input_called == 0:
            clf = OneClassSVM()
            if self.input(1) is not None:
                clf.set_params(**self.input(1))
            X = self.input(2)

            clf.fit(X)
            self.set_output_val(1, clf)

            self.exec_output(0)
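
Outside the node framework, the same pattern reduces to plain scikit-learn calls. This is a minimal sketch in which the parameter dict and data stand in for the node inputs (both are assumptions made for illustration):

# Hypothetical standalone equivalent: optional parameter dict, then fit on the data.
import numpy as np
from sklearn.svm import OneClassSVM

params = {'kernel': 'rbf', 'nu': 0.1, 'gamma': 'scale'}   # stands in for self.input(1)
X = np.random.RandomState(0).normal(size=(100, 3))        # stands in for self.input(2)

clf = OneClassSVM()
if params is not None:
    clf.set_params(**params)
clf.fit(X)
print(clf.predict(X[:5]))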
Example #3
File: train.py  Project: zqd1996/analog
    def get_model(self, queue=None):
        log_Pattern = r'^(?P<remote_addr>.*?) - (?P<remote_user>.*) \[(?P<time_local>.*?) \+[0-9]+?\] "(?P<request>.*?)" ' \
                      '(?P<status>.*?) (?P<body_bytes_sent>.*?) "(?P<http_referer>.*?)" "(?P<http_user_agent>.*?)"$'
        log_regx = re.compile(log_Pattern)
        # Redirect stdout to the training log file (original stdout kept in __console__)
        __console__ = sys.stdout
        sys.stdout = open(
            os.path.join(self.root_path, "analog/log/train_log.txt"), 'w+')
        start = datetime.now()
        print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))

        train_example = []
        white_example = []
        black_example = []

        # Read the training set
        self.read_txt(self.train_log_path, train_example)
        # with open(self.train_log_path, "r") as file:
        #     line = file.readline().strip("\r\n")
        #     while line:
        #         log_tuple = log_regx.search(line)
        #         line = file.readline().strip("\r\n")
        #         if log_tuple is not None:
        #             train_example.append(TfidfVector.get_url(log_tuple.group('request')))

        # Read the black (malicious) sample set
        self.read_txt(self.test_black_path, black_example)
        # with open(self.test_black_path, "r") as file:
        #     line = file.readline().strip("\r\n")
        #     while line:
        #         log_tuple = log_regx.search(line)
        #         line = file.readline().strip("\r\n")
        #         if log_tuple is not None:
        #             black_example.append(TfidfVector.get_url(log_tuple.group('request')))

        # Read the white (benign) sample set (log format)
        self.read_txt(self.test_white_path, white_example)
        # with open(test_white_path, "r") as file:
        #     line = file.readline().strip("\r\n")
        #     while line:
        #         log_tuple = log_regx.search(line)
        #         line = file.readline().strip("\r\n")
        #         if log_tuple is not None:
        #             white_example.append(TfidfVector.get_url(log_tuple.group('request')))

        # Read the white sample set (plain path format)
        # with open(self.test_white_path, "r") as file:
        #     line = file.readline().strip("\r\n")
        #     while line:
        #         white_example.append(line)
        #         line = file.readline().strip("\r\n")

        tf_idf_vector = TfidfVector()
        # TF-IDF feature vectorization of the training samples
        train_vector = tf_idf_vector.fit_vector

        # TF-IDF feature vectorization of the black and white test samples
        test_normal_vector = tf_idf_vector.transform(white_example)
        test_abnormal_vector = tf_idf_vector.transform(black_example)

        y = [1] * (len(train_example))

        # ===================================== Grid search over the nu and gamma parameters =====================================
        grid = {
            'gamma': np.logspace(-8, 1, 10),
            'nu': np.linspace(0.01, 0.20, 20)
        }

        # Kernel function (rbf, linear, poly)
        kernel = 'rbf'

        # Best precision, recall, and F1 seen so far
        max_F1 = 0
        max_Re = 0
        max_Pr = 0

        # gamma values at the best precision, recall, and F1
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01

        # nu values at the best precision, recall, and F1
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0

        svdd = OneClassSVM(kernel=kernel)
        zero_count = 0
        re_gamma = 0

        total_loop = len(ParameterGrid(grid))
        process_count = 0
        for z in ParameterGrid(grid):
            process_count += 1

            queue.put_nowait("{:0.4f}".format(process_count / total_loop))
            # Skip a gamma value that has already produced several zero-F1 results,
            # since such a gamma is likely too small to be worth further trials.
            if re_gamma == z.get('gamma'):
                if zero_count >= 4:
                    continue
            else:
                zero_count = 0
            svdd.set_params(**z)
            svdd.fit(train_vector, y)  # OneClassSVM ignores y; only train_vector is used
            k = svdd.get_params()
            # Test on normal (benign) samples
            f = svdd.predict(test_normal_vector)

            TP = f.tolist().count(1)  # True positive
            FN = f.tolist().count(-1)  # False Negative

            # Test on abnormal (malicious) samples
            f = svdd.predict(test_abnormal_vector)

            FP = f.tolist().count(1)  # False positive
            Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
            Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)  # F1 value

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')

            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')

            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            print("========================== [{}] ==========================="
                  .format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            print("nu: ", k.get('nu'), 'gamma:', k.get('gamma'))
            print("Precision: {}%".format(Precision * 100))
            print("Recall: {}%".format(Recall * 100))
            print("F1 score: {}".format(F1_score))
        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        print("MAX Precision:  {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print("MAX Recall:     {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_Re, nu_r_Re, gamma_r_Re))
        print("MAX F1:         {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        print("Cost {}s.".format(total_second.total_seconds()))
        queue.put_nowait("1")
        # Retrain with the best-F1 parameters and persist the model.
        with open(os.path.join(self.root_path, "analog/cache/model.pkl"),
                  'wb') as file:
            svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
            svdd.fit(train_vector, y)
            pickle.dump(svdd, file)
        self.complete = True
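
The nu/gamma sweep above, which also appears in near-identical form in Examples #5 and #6, boils down to the pattern in the sketch below. The random vectors simply stand in for the project's TF-IDF matrices, so treat this as an illustration of the technique rather than the original training code:

# Compact sketch of the tuning loop: fit on "normal" data only, then measure
# precision / recall / F1 against held-out normal and abnormal samples.
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
train_vec = rng.normal(size=(300, 10))              # stands in for the TF-IDF training matrix
normal_vec = rng.normal(size=(50, 10))              # held-out benign samples
abnormal_vec = rng.normal(loc=4.0, size=(50, 10))   # held-out malicious samples

grid = {'gamma': np.logspace(-8, 1, 10), 'nu': np.linspace(0.01, 0.20, 20)}
best = {'f1': 0.0, 'params': None}

for params in ParameterGrid(grid):
    svdd = OneClassSVM(kernel='rbf', **params).fit(train_vec)
    tp = int(np.sum(svdd.predict(normal_vec) == 1))    # true positives
    fn = int(np.sum(svdd.predict(normal_vec) == -1))   # false negatives
    fp = int(np.sum(svdd.predict(abnormal_vec) == 1))  # false positives
    precision = tp / (tp + fp) if tp else 0.0
    recall = tp / (tp + fn) if tp else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision and recall else 0.0
    if f1 > best['f1']:
        best = {'f1': f1, 'params': params}

print(best)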
Example #4
selector = RFE(clsfRFE)
selector.fit(arTraining_std,np.ones(arTraining.shape[0]))
with open(sTargetPhone+'_'+sTrial+'.rank','w') as fRank:
    Indx = np.where(selector.ranking_==1)[0]
    print(Indx,file=fRank)
fMaxAcc = -100.0
sMAXparam=''
clfMax = ''
idxmax = 0
for indx in (range(arTraining_std.shape[1]),):  # single pass over all features; the RFE-selected Indx is currently disabled
    for kernel in ['rbf']:#,'sigmoid']:#['poly', 'rbf', 'sigmoid']:
        for nu in [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.15,0.2,0.25,0.3,0.35,0.4]:
            for gam in [0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.3,0.5]:#,1,10,100]:
                #print ('nu=',nu,'gamma=',gam,'kernel=',kernel)
                param = str(nu)+' '+str(gam) + ' '+ kernel
                clsf.set_params(**{'nu':nu,'gamma':gam,'kernel':kernel})
        # gsClsf = GridSearchCV(clsf, dParams, cv=tCVIndxs, scoring='scorer')
                #print('Training The Model')
                arTr = arTraining_std[:,indx]
                clsf.fit(arTr)
        # clsf.fit(arCVData, arCVLab)
                #print('Prediction')
                arV = arValidation_std[:,indx]
                y_pre = clsf.predict(arV)
                param = param + ' ' + str(indx).replace('\n','')
                #print (accuracy_score(y_valid_ref,y_pre))
                fCurAcc = f1_score(y_valid_ref,y_pre)#accuracy_score(y_valid_ref,y_pre)
                xIn = accuracy_score(y_valid_ref[:iNumInClass],y_pre[:iNumInClass]) + math.exp(-100)
                #print('x=',x)
                fscoreIn = math.log(xIn)
                fscoreOut = 0.0
Example #5
File: Training.py  Project: su18/Marcus
    def get_model(self):
        start = datetime.now()
        print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))
        train_example = []
        xss_example = []
        non_xss_example = []

        # Read the training set (curated XSS payloads)
        self.read_txt(self.train_path, train_example)
        # Read the normal (non-XSS) request sample set
        self.read_txt(self.test_none_xss_path, non_xss_example)
        # Read the attack (XSS) request sample set
        self.read_txt(self.test_xss_path, xss_example)
        # TF-IDF feature vectorization of the training samples
        tf_idf_vector = TfIdfVector()
        train_vector = tf_idf_vector.fit_vector
        # TF-IDF feature vectorization of the test samples
        # (note: "normal" below means XSS, since the model is trained on XSS payloads)
        test_normal_vector = tf_idf_vector.transform(xss_example)
        test_abnormal_vector = tf_idf_vector.transform(non_xss_example)
        y = [1] * (len(train_example))
        #  Grid search over the nu and gamma parameters
        grid = {
            'gamma': np.logspace(-8, 1, 10),
            'nu': np.linspace(0.01, 0.20, 20)
        }
        # Kernel function (rbf, linear, poly)
        kernel = 'rbf'
        # Best precision, recall, and F1 seen so far
        max_F1 = 0
        max_Re = 0
        max_Pr = 0
        # gamma values at the best precision, recall, and F1
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01
        # nu values at the best precision, recall, and F1
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0
        svdd = OneClassSVM(kernel=kernel)
        zero_count = 0
        re_gamma = 0
        total_loop = len(ParameterGrid(grid))
        process_count = 0
        for z in ParameterGrid(grid):
            process_count += 1
            if re_gamma == z.get('gamma'):
                if zero_count >= 4:
                    continue
            else:
                zero_count = 0
            svdd.set_params(**z)
            svdd.fit(train_vector, y)
            k = svdd.get_params()

            # Test on attack (XSS) request samples
            f = svdd.predict(test_normal_vector)
            TP = f.tolist().count(1)  # True positive
            FN = f.tolist().count(-1)  # False Negative

            # Test on non-attack samples
            f = svdd.predict(test_abnormal_vector)
            FP = f.tolist().count(1)  # False positive
            Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
            Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)  # F1 value

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')

            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')

            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            print("========================== [{}] ==========================="
                  .format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            print("nu: ", k.get('nu'), 'gamma:', k.get('gamma'))
            print("Precision: {}%".format(Precision * 100))
            print("Recall: {}%".format(Recall * 100))
            print("F1 score: {}".format(F1_score))
        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        print("MAX Precision:  {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print("MAX Recall:     {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_Re, nu_r_Re, gamma_r_Re))
        print("MAX F1:         {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}"
              .format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        print("Cost {}s.".format(total_second.total_seconds()))
        with open(os.path.join(self.root_path, "ModuleTrain/cache/model.pkl"),
                  'wb') as file:
            svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
            svdd.fit(train_vector, y)
            pickle.dump(svdd, file)
        self.complete = True
Example #6
    def get_model(self, queue=None):

        start = datetime.now()
        # Since loggers are not picklable until Python 3.7,
        # the logger is initialized inside this function.
        train_logger = Logger(logger_name="train_logger",
                              log_path=self.log_path)

        train_logger.register_log_function("calc", "CALCU")
        train_logger.register_log_function("split", "SPLIT")
        train_logger.register_log_function("start", "START")
        train_logger.register_log_function("end", "-END-")
        train_logger.register_log_function("result", "RESULT")
        train_logger.start("Start Training.")
        train_example = []
        white_example = []
        black_example = []

        pattern = self.config.get(self.section_name_log, 'log_content_pattern')
        # Read the training set
        read_by_group(self.train_log_path, train_example, pattern=pattern)

        # Read the black (malicious) sample set
        read_by_group(self.test_black_path, black_example, pattern=pattern)

        # Read the white (benign) sample set
        read_by_group(self.test_white_path, white_example, pattern=pattern)

        # TF-IDF feature vectorization of the training samples
        tf_idf_vector = TfidfVector(self.root_path, self.config)
        train_vector = tf_idf_vector.fit_vector

        # TF-IDF feature vectorization of the black/white test samples
        test_normal_vector = tf_idf_vector.transform(white_example)
        test_abnormal_vector = tf_idf_vector.transform(black_example)
        # test_param_x = tf_idf_vector.transform(white_example + black_example)
        # test_param_y = [1] * len(white_example) + [-1] * len(black_example)
        # ===================================== Grid search over the nu and gamma parameters =====================================
        grid = {'gamma': np.logspace(-9, 1, 10),
                'nu': np.linspace(0.00001, 0.2, 100)}
        # =================================== Alternative: GridSearchCV over nu and gamma ===================================
        # scores = "f1"
        # clf = GridSearchCV(OneClassSVM(), grid, scoring=scores)
        # clf.fit(test_param_x, test_param_y)
        # ==============================================================================================================

        # Kernel function (rbf, linear, poly)
        kernel = 'rbf'

        # Best precision, recall, and F1 seen so far
        max_F1 = 0
        max_Re = 0
        max_Pr = 0

        # gamma values at the best precision, recall, and F1
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01

        # nu values at the best precision, recall, and F1
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0

        svdd = OneClassSVM(kernel=kernel)
        zero_count = 0
        re_gamma = 0

        total_loop = len(ParameterGrid(grid))
        process_count = 0
        for z in ParameterGrid(grid):
            process_count += 1
            queue.put_nowait("{:0.4f}".format(process_count / total_loop))
            if re_gamma == z.get('gamma'):
                if zero_count >= 6:
                    continue
            else:
                zero_count = 0
            svdd = OneClassSVM(**z)  # rebuilt each iteration; kernel defaults to 'rbf', matching `kernel` above
            svdd.fit(train_vector)
            k = svdd.get_params()
            # Test on normal (benign) samples
            f = svdd.predict(test_normal_vector)

            TP = f.tolist().count(1)  # True positive
            FN = f.tolist().count(-1)  # False Negative

            # Test on abnormal (malicious) samples
            f = svdd.predict(test_abnormal_vector)

            FP = f.tolist().count(1)  # False positive
            Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
            Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall)  # F1 value

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')

            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')

            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            train_logger.split("=" * 60)
            train_logger.calc("nu: %.08f , gamma: %.04f" % (k.get('nu'), k.get('gamma')))
            train_logger.calc("precision: {}%".format(Precision * 100))
            train_logger.calc("recall: {}%".format(Recall * 100))
            train_logger.calc("f1 score: {}".format(F1_score))

        train_logger.split("=" * 60)
        train_logger.result(
            "MAX Precision:{:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}".format(max_Pr, nu_r_Pr, gamma_r_Pr))
        train_logger.result(
            "MAX Recall:   {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}".format(max_Re, nu_r_Re, gamma_r_Re))
        train_logger.result(
            "MAX F1:       {:^20.6f} when nu: {:^20.6f} and gamma: {:0.8f}".format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        train_logger.end("Cost {}s.".format(total_second.total_seconds()))
        queue.put_nowait("1")
        with open(os.path.join(self.root_path, "analog/cache/model.pkl"), 'wb') as file:
            # svdd = OneClassSVM(**clf.best_params_)
            svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
            svdd.fit(train_vector)
            pickle.dump(svdd, file)
        self.complete = True
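
Downstream of Examples #3, #5, and #6, the pickled model would typically be reloaded and applied to freshly vectorized requests. The path below and the transform call in the comments mirror the training code above but are assumptions about how the project is used, not verified parts of it:

# Hypothetical consumer of the persisted model.
import pickle

with open("analog/cache/model.pkl", "rb") as file:
    svdd = pickle.load(file)

# new_vector would come from the same TF-IDF vectorizer used at training time,
# e.g. tf_idf_vector.transform(new_requests); +1 = normal, -1 = anomalous.
# predictions = svdd.predict(new_vector)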