Example #1
    def run(self, train_per=0.8, reset_dataset=False):
        """
        从数据库中读取特征数据,并使用adaboost分类
        :return:
        """
        # 首先划分训练集微博和测试集微博
        swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM wblog_choose')

        final_wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        # Merge confirmed spam weibos from final_wblog into swblog
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)

        # The two ID lists can overlap; as in the other examples, treat the
        # overlapping weibos as spam by removing them from wblog
        for uid in swblog:
            if uid in wblog:
                wblog.remove(uid)

        train_wblog_set, test_wblog_set = Alkit.read_dataset(
            '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
            '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # Log some statistics about the training and test sets
        logging.info('Training set size: %s' % len(train_wblog_set))
        logging.info('Positive examples (swblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(swblog)))))
        logging.info('Negative examples (wblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(wblog)))))
        logging.info('Test set size: %s' % len(test_wblog_set))
        logging.info('Positive examples (swblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(swblog)))))
        logging.info('Negative examples (wblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(wblog)))))

        # Read the training and test sets from the database into ordered
        # dicts (values() returns the items in insertion order)
        feature_dict_data, result_dict_data = self.load_data(
            train_wblog_set, swblog, wblog)
        # feature_dict_data: (wblogId, feature vector) pairs whose features
        # may be NaN, e.g. ('4033482998743585', [nan, nan, nan, nan, nan]);
        # result_dict_data: (wblogId, label) pairs, e.g. ('4033482998743585', 1)

        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('Training data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('Test data processed')

        # Train with AdaBoost and log the results
        logging.info('AdaBoost training started')
        model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                          min_samples_split=20,
                                                          min_samples_leaf=5),
                                   algorithm="SAMME",
                                   n_estimators=100,
                                   learning_rate=0.5)
        model.fit(train_feature, train_result)
        logging.info('Training finished')
        predict_result = model.predict(test_feature)
        logging.info('Precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('Recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[1]) - float(prob[0]))
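        # prp is a signed confidence in [-1, 1]: P(class 1) - P(class 0),
        # assuming class 1 is the spam label as the logging above suggests.
        # E.g. a predict_proba row of [0.2, 0.8] gives 0.6 (leans spam),
        # while [0.9, 0.1] gives -0.8 (leans normal).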
        Alkit.write_prior(
            '../main/crowd_target/wblog_train' + self.file_name_appendix +
            '.txt', '../main/crowd_target/wblog_prior' +
            self.file_name_appendix + '.txt', train_wblog_set, train_result,
            test_wblog_set, test_result, predict_result, prp)
Example #2
    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类
        :return:
        """
        if not self.add_unknown_into_model:
            swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
            wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')

            final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
            for wblogId in final_wblog:
                if wblogId not in swblog:
                    swblog.append(wblogId)

            # For reasons unknown, the swblog and wblog sets overlap;
            # simply treat the overlapping weibos as spam
            for uid in swblog:
                if uid in wblog:
                    wblog.remove(uid)

            """
            到目前为止,我们得到了下面几个有用的东西
            swblog: 水军  
            wblog: 正常用户
            unkonwn:还没来得及标注的未知类型微博
            """

            logging.info('Spam ratio in the raw data (max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog))))
            if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)):
                logging.info("we don't have that many spam weibos in the dataset; keeping the original percentage")
            else:
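                # Downsample spam to hit the target ratio. Derivation: keeping
                # all len(wblog) negatives and targeting spam fraction
                # p = self.spam_per, the spam count s must satisfy
                # s / (s + len(wblog)) = p, i.e. s = p * len(wblog) / (1 - p).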
                expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per))
                swblog = random.sample(swblog, expected_spam_number)

            if self.reset_dataset:
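                # Stratified split: sample train_per of each class for
                # training; the remaining IDs (set difference) form the test set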
                train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + random.sample(wblog, int(
                    len(wblog) * self.train_per))
                test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set))
            else:
                train_wblog_set, test_wblog_set = Alkit.read_dataset(
                    '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                    '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

            # Log some statistics about the training and test sets
            logging.info('Total dataset size: %s' % (len(train_wblog_set) + len(test_wblog_set)))
            logging.info('Training set size: %s' % len(train_wblog_set))
            logging.info('Positive examples (swblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
            logging.info('Negative examples (wblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
            logging.info('Test set size: %s' % len(test_wblog_set))
            logging.info('Positive examples (swblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
            logging.info('Negative examples (wblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
        else:
            # raising a plain string is a TypeError in Python 3
            raise NotImplementedError('we will implement this later.')

        # Read the training and test sets from the database into ordered
        # dicts (values() returns the items in insertion order)
        feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
        train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('Training data processed')
        feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('Test data processed')

        # Train with SVM and log the results (disabled)
        # logging.info('SVM training started')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('Training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Train with LR and log the results
        logging.info('LR training started')
        # class_weight='balanced' reweights classes inversely to their
        # frequency, which matters given the spam/normal imbalance
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('Training finished')
        predict_result = model.predict(test_feature)
        logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Output the LR result as a probability-based score
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            # signed prior in [-1, 1]: P(class 1) - P(class 0)
            prp.append(float(prob[1]) - float(prob[0]))

        # Save both LR outputs for the next stage
        if self.dump:
            logging.info('Saving results to ' + '../main/prior/wblog_train' + self.file_name_appendix + '.txt'
                         + ' and ' + '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
            Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                              '../main/prior/wblog_prior' + self.file_name_appendix + '.txt',
                              train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)
Example #3
    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类

        水军占比例(max): 0.2325521503991759
        spammer_per <= 0.2325521503991759



        :return:
        """

        if not self.add_unknown_into_model:
            # First split the users into training and test sets
            spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
            normal = self.sqlhelper.select_sql_one(
                'SELECT uid FROM normal WHERE choose="yes"')
            # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"')

            final_user = self.sqlhelper.select_sql_one(
                'SELECT uid FROM final_user WHERE spammer="yes"')
            """
            final_user: 3843个用户, 水军903, 非水军2940
            normal: 13906个用户, 水军和非水军未知,为此我们通过人工的方法从从这些用户中挑选了一些正常的用户,标记为choose='yes'
            spammer: 892个水军用户
    
            """
            for uid in final_user:
                if uid not in spammer:
                    spammer.append(uid)
            """
            到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
            """

            # For reasons unknown, the spammer and normal sets overlap;
            # simply treat the overlapping users as spammers
            for uid in spammer:
                if uid in normal:
                    normal.remove(uid)
                # if uid in unknown:
                #     unknown.remove(uid)
            """
            到目前为止,我们得到了下面几个有用的东西
            spammer: 水军  
            normal: 正常用户
            unkonwn:还没来得及标注的未知类型用户
            """
            logging.info('Spammer ratio in the raw data (max): %s' %
                         (len(spammer) * 1.0 / (len(normal) + len(spammer))))
            if self.spammer_per > len(spammer) * 1.0 / (len(normal) +
                                                        len(spammer)):
                logging.info(
                    "we don't have that many spammers in the dataset; keeping the original percentage"
                )
            else:
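                # Downsample spammers to hit the target ratio; same derivation
                # as in the wblog pipeline: with all len(normal) negatives and
                # target fraction p = self.spammer_per, the spammer count is
                # s = p * len(normal) / (1 - p).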
                expected_spammer_number = int(self.spammer_per * len(normal) *
                                              1.0 / (1 - self.spammer_per))
                spammer = random.sample(spammer, expected_spammer_number)

            if self.reset_dataset:
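                # Stratified split, as in the wblog pipeline: train_per of
                # each class goes to training, the set difference is the test set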
                train_user_set = random.sample(
                    spammer, int(
                        len(spammer) * self.train_per)) + random.sample(
                            normal, int(len(normal) * self.train_per))
                test_user_set = list(
                    set(spammer + normal).difference(train_user_set))
                # # Phase-2 improvement code (disabled): also feed unknown users into training
                # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
                # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
                #     len(normal) * train_per))+random.sample(unknown, len(unknown))
                # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
                # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown
            else:
                train_user_set, test_user_set = Alkit.read_dataset(
                    '../main/prior/user_train' + self.file_name_appendix +
                    '.txt', '../main/prior/user_prior' +
                    self.file_name_appendix + '.txt')

            # Log some statistics about the training and test sets
            logging.info('Total dataset size: %s' %
                         (len(train_user_set) + len(test_user_set)))
            logging.info('Training set size: %s' % len(train_user_set))
            logging.info(
                'Positive examples (spammer) in training set: %s' %
                len(list(set(train_user_set).intersection(set(spammer)))))
            logging.info(
                'Negative examples (normal) in training set: %s' %
                len(list(set(train_user_set).intersection(set(normal)))))
            # logging.info('Unknown-label (unknown) examples in training set: %s' % len(list(set(unknown))))
            logging.info('Test set size: %s' % len(test_user_set))
            logging.info(
                'Positive examples (spammer) in test set: %s' %
                len(list(set(test_user_set).intersection(set(spammer)))))
            logging.info(
                'Negative examples (normal) in test set: %s' %
                len(list(set(test_user_set).intersection(set(normal)))))
            logging.info('Spammer ratio: %s' % (len(spammer) * 1.0 /
                                                (len(normal) + len(spammer))))
            """
            测试集参与训练,但是测试集在模型训练期间标签将按照unknown处理
            """
        else:
            # raising a plain string is a TypeError in Python 3
            raise NotImplementedError('we will implement this later.')

        # Read the training and test sets from the database into ordered
        # dicts (values() returns the items in insertion order)
        feature_dict_data, result_dict_data = self.load_data(
            train_user_set, spammer, normal)
        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('Training data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_user_set, spammer, normal)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('Test data processed')
        # Train with SVM and log the results (disabled)
        # logging.info('SVM training started')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('Training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Optional diagnostic: per-feature MIC scores via minepy (disabled)
        # import minepy
        # m = minepy.MINE()
        # for i in range(7):
        #     m.compute_score(train_feature[:, i], train_result)
        #     print(m.mic())

        # Train with LR and log the results
        logging.info('LR training started')
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('Training finished')
        predict_result = model.predict(test_feature)
        logging.info('Precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('Recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Output the LR result as a probability-based score
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            # signed prior in [-1, 1]: P(class 1) - P(class 0)
            prp.append(float(prob[1]) - float(prob[0]))

        # Save both LR outputs for the next stage
        if self.dump:
            logging.info('Saving results to ' + '../main/prior/user_train' +
                         self.file_name_appendix + '.txt and ' +
                         '../main/prior/user_prior' + self.file_name_appendix +
                         '.txt')
            Alkit.write_prior(
                '../main/prior/user_train' + self.file_name_appendix + '.txt',
                '../main/prior/user_prior' + self.file_name_appendix + '.txt',
                train_user_set, train_result, test_user_set, test_result,
                predict_result, prp)
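
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original code): a minimal, self-contained
# version of the LR-prior pattern the three examples above share. Synthetic
# data stands in for the MySQL features loaded via sqlhelper/Alkit, and the
# 1 (spam) / -1 (normal) labels are an assumption, not a confirmed detail of
# the original pipeline.
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression


def run_sketch(train_per=0.8, seed=0):
    rng = np.random.default_rng(seed)
    # Imbalanced toy data: 200 spam rows centered at +1, 800 normal rows at -1
    feature = np.vstack([rng.normal(1.0, 1.0, (200, 5)),
                         rng.normal(-1.0, 1.0, (800, 5))])
    result = np.array([1] * 200 + [-1] * 800)

    # Random split: shuffle the indices, train on the first train_per slice
    idx = rng.permutation(len(result))
    cut = int(len(result) * train_per)
    train, test = idx[:cut], idx[cut:]

    # class_weight='balanced' reweights classes inversely to their frequency,
    # mirroring the imbalance handling in the examples above
    model = LogisticRegression(class_weight='balanced')
    model.fit(feature[train], result[train])
    predict_result = model.predict(feature[test])
    print('Precision: %s' % metrics.precision_score(result[test], predict_result))
    print('Recall: %s' % metrics.recall_score(result[test], predict_result))
    print('F1: %s' % metrics.f1_score(result[test], predict_result))

    # Signed prior in [-1, 1], built the same way as prp above:
    # P(classes_[1]) - P(classes_[0])
    proba = model.predict_proba(feature[test])
    return (proba[:, 1] - proba[:, 0]).tolist()


if __name__ == '__main__':
    run_sketch()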