Example #1
    def load_data(self, total_set, swblog, wblog, unknown=None):
        """
        从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法
        :return: 特征字典数据,类别字典数据
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for wblogId in total_set:
            feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
                                          Alkit.load_data_help_w(self.sentimentSimilarity, wblogId,
                                                                 'sentiment_similarity'),
                                          Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
                                          Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]

            # feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
            #                               Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
            #                               Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]

            if wblogId in swblog:
                result_dict_data[wblogId] = 1
            elif wblogId in wblog:
                result_dict_data[wblogId] = -1
            elif unknown is not None and wblogId in unknown:  # guard: unknown defaults to None
                result_dict_data[wblogId] = 0

        return feature_dict_data, result_dict_data
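Alkit.load_data_help_w is not shown in these examples. A minimal sketch of what it plausibly does, assuming each MongoDB collection holds one document per wblogId with the named numeric field (the NaN fallback is an assumption, motivated by the nan feature vectors printed in Example #10):

from math import nan

def load_data_help_w(collection, wblogId, field):
    # Hypothetical sketch of Alkit.load_data_help_w -- an assumption, not the
    # project's actual code. `collection` is a pymongo collection assumed to
    # store documents like {'wblogId': '...', 'comment_similarity': 0.7}.
    doc = collection.find_one({'wblogId': wblogId})
    if doc is None or field not in doc:
        return nan  # missing feature surfaces as NaN
    return float(doc[field])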
Example #2
    def load_data(self, total_set, swblog, wblog):
        """
        从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法
        :return: 特征字典数据,类别字典数据
        total_set=train_wblog_set, ['4033482998743585', '3914608449995325',
        swblog=swblog, ['4045047554826553', '4039829169862097',
        wblog=wblog, ['4032096583879003', '4054839190956692',
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for wblogId in total_set:
            feature_dict_data[wblogId] = [
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'mean'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'std'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'skewness'),
                Alkit.load_data_help_w(self.mdb.time, wblogId, 'kurtosis'),
                Alkit.load_data_help_w(self.mdb.third, wblogId, 'third')
            ]

            if wblogId in swblog:
                result_dict_data[wblogId] = 1
            else:
                result_dict_data[wblogId] = -1

        # print("388 feature_dict_data\n", feature_dict_data)

        return feature_dict_data, result_dict_data
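The mean/std/skewness/kurtosis fields suggest mdb.time stores the first four moments of each wblog's retweet-time distribution. A hedged sketch of how such a document could be computed (the timestamp source and collection layout are assumptions):

import numpy as np
from scipy import stats

def time_moments(timestamps):
    # Compute the four moments that load_data reads back from mdb.time.
    # `timestamps` is assumed to hold one wblog's retweet times in seconds.
    x = np.asarray(timestamps, dtype=float)
    return {'mean': float(np.mean(x)),
            'std': float(np.std(x)),
            'skewness': float(stats.skew(x)),
            'kurtosis': float(stats.kurtosis(x))}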
Example #3
    def evalutaion(self):
        """
        评价一下
        :return:
        """
        wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
        swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN(wblog_train_dict, wblog_prior_dict)
        scores = []
        test_result = []
        predict_result = []
        for uid in wblog_prior_list:
            test_result.append(float(wblog_prior_dict[uid]['label']))
            predict_result.append(float(wblog_prior_dict[uid]['prior_label']))
            scores.append(float(wblog_prior_dict[uid]['prior']))
        # print(float(metrics.f1_score(test_result, predict_result)))
        Evaluation.evaluation_self(scores, test_result)

        # ap
        p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
        ap = metrics.average_precision_score(test_result, scores)
        logging.info('wblog AP:%s' % str(ap))
        with open('../main/lr/wblog_ap'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('p r\n')
            for i in range(len(p)):
                my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

        # roc
        fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
        logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr)))
        with open('../main/lr/wblog_roc'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('fpr tpr\n')
            for i in range(len(fpr)):
                my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

        # top k precision
        wblog_score = {}
        for i in range(len(scores)):
            wblog_score[wblog_prior_list[i]] = scores[i]
        wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True)
        with open('../main/lr/res_wblog_top'+self.file_name_appendix+'.txt', 'w') as my_file:
            my_file.write('type wblogId score precision top_k\n')
            wblog_count_now = 0
            top_k = 0
            for itm in wblog_score:
                uid = itm[0]
                score = itm[1]
                if uid in swblog:
                    u_type = 's'
                    wblog_count_now += 1
                else:
                    u_type = 'n'
                top_k += 1
                precision = str(float(wblog_count_now) / top_k)
                my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n')
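For reference, the same sklearn metric calls on made-up toy data (labels and scores are invented, purely to show the call shapes):

from sklearn import metrics

y_true = [1, 1, -1, 1, -1, -1]            # ground truth, as in test_result
scores = [0.9, 0.8, 0.7, 0.4, 0.3, 0.1]   # priors, higher = more likely spam

p, r, thresholds = metrics.precision_recall_curve(y_true, scores)
print('AP :', metrics.average_precision_score(y_true, scores))
fpr, tpr, thresholds = metrics.roc_curve(y_true, scores)
print('AUC:', metrics.auc(fpr, tpr))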
Example #4
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取msca必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix

        # Load the training set, plus the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels and the prior labels
        # user_prior_list: test set, user ids only
        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        #     Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt')
        # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        #     Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt')

        # spammer: true spammer users
        # spammer_prior: users the prior classification judged to be spammers
        # normal: true normal users
        # normal_prior: users the prior classification judged to be normal
        # swblog/swblog_prior and nwblog/nwblog_prior are the analogous wblog sets
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_train_list + self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        self.follow_edge = {}  # {'uid': ['followeeUid']}
        self.follow_cnt = {}  # {'uid': follow count}
        self.retweet_edge = {}  # {'uid': ['wblogId']}
        self.wblog_retweet_cnt = {}  # {wblogId: retweet count}
        self.user_retweet_cnt = {}  # {uid: retweet count}
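Alkit.read_prior is not shown either. A hypothetical sketch inferred from how its outputs are consumed (the 'label', 'prior_label' and 'prior' keys match Example #3; the exact file layout is an assumption):

def read_prior(train_path, prior_path):
    # Hypothetical sketch of Alkit.read_prior -- an assumption, not the
    # project's code. Values stay strings; callers cast them with float().
    train_dict, train_list = {}, []
    with open(train_path) as f:
        for line in f:
            uid, label = line.split()
            train_dict[uid] = {'label': label}
            train_list.append(uid)
    prior_dict, prior_list = {}, []
    with open(prior_path) as f:
        for line in f:
            uid, label, prior_label, prior = line.split()
            prior_dict[uid] = {'label': label, 'prior_label': prior_label, 'prior': prior}
            prior_list.append(uid)
    return train_dict, train_list, prior_dict, prior_list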
Example #5
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取S3MCD必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix
        # Load the training set, plus the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels and the prior labels
        # user_prior_list: test set, user ids only
        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # spammer: true spammer users
        # spammer_prior: users the prior classification judged to be spammers
        # normal: true normal users
        # normal_prior: users the prior classification judged to be normal
        # swblog/swblog_prior and nwblog/nwblog_prior are the analogous wblog sets
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_prior_list
        self.all_wblog = self.wblog_prior_list

        self.follow_edge = {}  # {'uid': ['followeeUid']}
        self.tweet_edge = {}  # {'uid': ['wblogId']}
        self.wblog_content = {}  # {'wblogId': [content]}

        self.pattern_html = re.compile(r'<[^>]+>', re.S)
        self.pattern_tag = re.compile(r'#.+#', re.S)
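A quick demonstration of the two cleanup patterns (the sample text is invented). Note that #.+# is greedy: if a line contains several #...# topics, everything between the first and the last '#' is removed.

import re

pattern_html = re.compile(r'<[^>]+>', re.S)
pattern_tag = re.compile(r'#.+#', re.S)

raw = '<a href="x">link</a> #some topic# plain text'
text = pattern_html.sub('', raw)   # strip HTML tags
text = pattern_tag.sub('', text)   # strip #...# topic markers
print(text)                        # -> 'link  plain text'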
Example #6
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        """
        在init中将读取CrowdTarget必要的数据
        """
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host,
                                   db=self.db,
                                   user=self.user,
                                   passwd=self.passwd,
                                   charset=self.charset)
        self.file_name_appendix = file_name_appendix
        # Load the training set, plus the prior labels obtained on the test set
        # user_train_dict: training set, with labels
        # user_train_list: training set, user ids only
        # user_prior_dict: test set, with ground-truth labels and the prior labels
        # user_prior_list: test set, user ids only

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
            Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                             '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # spammer: true spammer users
        # spammer_prior: users the prior classification judged to be spammers
        # normal: true normal users
        # normal_prior: users the prior classification judged to be normal
        # swblog/swblog_prior and nwblog/nwblog_prior are the analogous wblog sets
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
            self.user_train_dict, self.user_prior_dict)
        self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
            self.wblog_train_dict, self.wblog_prior_dict)
        self.all_user = self.user_prior_list
        self.all_wblog = self.wblog_train_list + self.wblog_prior_list

        # was crowd_target originally; at one point this was changed to
        # crow_target because the database name itself was misspelled
        self.mdb = MongoClient().crowd_target
        self.sqlhelper = SqlHelper()  # note: overrides the SqlHelper configured above with default connection settings
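For orientation, the access pattern this mdb handle is used with elsewhere (e.g. mdb.time in Example #2). The document layout is an assumption; the sample id is taken from Example #2's docstring:

from pymongo import MongoClient

mdb = MongoClient().crowd_target
doc = mdb.time.find_one({'wblogId': '4033482998743585'})
if doc is not None:
    print(doc.get('mean'), doc.get('std'), doc.get('skewness'), doc.get('kurtosis'))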
Example #7
    def __init__(self, h, d, u, p, c, file_name_appendix=''):
        self.host = h
        self.db = d
        self.user = u
        self.passwd = p
        self.charset = c
        self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        self.file_name_appendix = file_name_appendix

        self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
            Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                             '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(self.user_train_dict,
                                                                                       self.user_prior_dict)
        self.seed_worker = []
        for uid in self.user_train_dict.keys():
            if self.user_train_dict[uid]['label'] == '1':
                self.seed_worker.append(uid)
        self.other_worker = []
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '1':
                self.other_worker.append(uid)
        self.normal = []  # note: overwrites the self.normal set by Alkit.setSN above
        for uid in self.user_prior_dict.keys():
            if self.user_prior_dict[uid]['label'] == '-1':
                self.normal.append(uid)

        self.all_user = self.seed_worker + self.other_worker + self.normal

        self.follow_edge = []

        for uid in self.all_user:
            for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
                uid = str(result[0])
                followeeUid = str(result[1])
                if followeeUid not in self.all_user:
                    continue
                self.follow_edge.append((uid, followeeUid))
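The loop above issues one SELECT per user. A hedged drop-in alternative that reads the edge table once and filters in Python, assuming SqlHelper.select_sql accepts any SELECT and yields rows exactly as in the loop above:

        # Sketch of a single-query variant of the loop above (same assumed
        # SqlHelper interface). A set makes the membership tests O(1).
        all_user_set = set(self.all_user)
        self.follow_edge = []
        for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge'):
            uid, followeeUid = str(result[0]), str(result[1])
            if uid in all_user_set and followeeUid in all_user_set:
                self.follow_edge.append((uid, followeeUid))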
Example #8
    def load_data(self, total_set, spammer, normal, unknown=None):
        """
        从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法
        :return: 特征字典数据,类别字典数据
        """
        feature_dict_data = OrderedDict()
        result_dict_data = OrderedDict()

        for uid in total_set:
            feature_dict_data[uid] = [
                Alkit.load_data_help(self.registerDay, uid, 'log_time'),
                Alkit.load_data_help(self.followCnt, uid, 'log_follower'),
                Alkit.load_data_help(self.followCnt, uid, 'log_followee'),
                Alkit.load_data_help(self.oriThirdFre, uid, 'fre'),
                Alkit.load_data_help(self.retweetFre, uid, 'follow_fre'),
                Alkit.load_data_help(self.retweetFre, uid, 'onehop_fre'),
                Alkit.load_data_help(self.rvp, uid, 'rvp_ratio')
            ]
            """
            现在我需要检查一下, 看看mongodb里这些json数据表是不是仅仅包含了normal和spammer而没有把unknown放进来?
            
             self.registerDay = MongoClient().userFeature.registerDay
                self.followCnt = MongoClient().userFeature.followCnt
                self.oriThirdFre = MongoClient().userFeature.oriThirdFre
                self.retweetFre = MongoClient().userFeature.retweetFre
                self.rvp = MongoClient().userFeature.rvp
        
            """

            # feature_dict_data[uid] = [Alkit.load_data_help(self.followCnt, uid, 'follower_cnt'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'followee_cnt'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'ff'),
            #                           Alkit.load_data_help(self.followCnt, uid, 'profile'),
            #                           Alkit.load_data_help(self.rvp, uid, 'discription')]

            # if uid in spammer:
            #     result_dict_data[uid] = 1
            # else:
            #     result_dict_data[uid] = -1

            # Phase-2 improvement
            if uid in spammer:
                result_dict_data[uid] = 1
            elif uid in normal:
                result_dict_data[uid] = -1
            elif unknown is not None and uid in unknown:  # guard: unknown defaults to None
                result_dict_data[uid] = 0  # added by me: users with unknown labels get label 0

        return feature_dict_data, result_dict_data
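Alkit.process_data is not shown; a hypothetical sketch consistent with how it is called (this is also why both dicts are OrderedDicts: values() must come back in matching insertion order):

import numpy as np

def process_data(feature_dict_data, result_dict_data):
    # Hypothetical sketch of Alkit.process_data -- an assumption, not the
    # project's code: align features and labels via shared insertion order.
    features = np.array(list(feature_dict_data.values()), dtype=float)
    labels = np.array(list(result_dict_data.values()))
    return features, labels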
Example #9
    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类
        :return:
        """
        if not self.add_unknown_into_model:
            swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
            wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')

            final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
            for wblogId in final_wblog:
                if wblogId not in swblog:
                    swblog.append(wblogId)

            # For some reason the swblog and wblog sets overlap,
            # so overlapping wblogs are simply treated as spam (kept in swblog)
            for uid in swblog:
                if uid in wblog:
                    wblog.remove(uid)

            """
            到目前为止,我们得到了下面几个有用的东西
            swblog: 水军  
            wblog: 正常用户
            unkonwn:还没来得及标注的未知类型微博
            """

            logging.info('spam share in the raw data (max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog))))
            if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)):
                logging.info('we don\'t have that many spam wblogs in our dataset, so we keep the original percentage')
            else:
                expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per))
                swblog = random.sample(swblog, expected_spam_number)

            if self.reset_dataset:
                train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + random.sample(wblog, int(
                    len(wblog) * self.train_per))
                test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set))
                # # Phase-2 improvement
                # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
                # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
                #     len(normal) * train_per))+random.sample(unknown, len(unknown))
                # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
                # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown
            else:
                train_wblog_set, test_wblog_set = Alkit.read_dataset(
                    '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                    '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

            # Log some information about the training and test sets
            logging.info('total dataset size: %s' % (len(train_wblog_set)+len(test_wblog_set)))
            logging.info('training set size: %s' % len(train_wblog_set))
            logging.info('positives (swblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
            logging.info('negatives (wblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
            logging.info('test set size: %s' % len(test_wblog_set))
            logging.info('positives (swblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
            logging.info('negatives (wblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
        else:
            raise NotImplementedError('we will implement this later.')

        # Load the training and test sets from the database into ordered dicts
        # (values() returns items in insertion order, keeping features and labels aligned)
        feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
        train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('training set data processed')
        feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
        logging.info('test set data processed')

        # Train with SVM and report results
        # logging.info('\nSVM training starts')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Train with LR and report results
        logging.info('LR training starts')
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('training finished')
        predict_result = model.predict(test_feature)
        logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
        logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Also get LR's probability outputs
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
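        # Note: predict_proba columns follow model.classes_, which for the
        # labels {-1, 1} is [-1, 1]; so prob[0] * -1 + prob[1] * 1 equals
        # P(y=+1) - P(y=-1), a signed confidence score in [-1, 1].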

        # Save both LR outputs for the next stage
        if self.dump:
            logging.info("保存结果输出到 " + '../main/prior/wblog_train' + self.file_name_appendix + '.txt'
                         + "和" + '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
            Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                              '../main/prior/wblog_prior' + self.file_name_appendix + '.txt',
                              train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)
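Alkit.write_prior pairs with the read_prior sketch after Example #4. A hypothetical sketch under the same assumed file layout (one `id label` line per training item, one `id label prior_label prior` line per test item):

def write_prior(train_path, prior_path, train_set, train_result,
                test_set, test_result, predict_result, prp):
    # Hypothetical sketch of Alkit.write_prior -- an assumption mirroring the
    # read_prior sketch, not the project's code.
    with open(train_path, 'w') as f:
        for wblogId, label in zip(train_set, train_result):
            f.write('%s %s\n' % (wblogId, label))
    with open(prior_path, 'w') as f:
        for wblogId, label, p_label, p in zip(test_set, test_result, predict_result, prp):
            f.write('%s %s %s %s\n' % (wblogId, label, p_label, p))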
Example #10
    def run(self, train_per=0.8, reset_dataset=False):
        """
        从数据库中读取特征数据,并使用adaboost分类
        :return:
        """
        # First split the wblogs into a training set and a test set
        swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM wblog_choose')

        final_wblog = self.sqlhelper.select_sql_one(
            'SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)

        # swblog and wblog overlap for unknown reasons; overlapping ids are
        # treated as spam, as in Example #9
        for uid in swblog:
            if uid in wblog:
                wblog.remove(uid)

        train_wblog_set, test_wblog_set = Alkit.read_dataset(
            '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
            '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # Log some information about the training and test sets
        logging.info('training set size: %s' % len(train_wblog_set))
        logging.info('positives (swblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in training set: %s' %
                     len(list(set(train_wblog_set).intersection(set(wblog)))))
        logging.info('test set size: %s' % len(test_wblog_set))
        logging.info('positives (swblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in test set: %s' %
                     len(list(set(test_wblog_set).intersection(set(wblog)))))

        # print('279 train_wblog_set \n', train_wblog_set)
        # print('279 swblog \n', swblog)
        # print('279 wblog \n', wblog)

        # Load the training and test sets from the database into ordered dicts
        # (values() returns items in insertion order, keeping features and labels aligned)
        feature_dict_data, result_dict_data = self.load_data(
            train_wblog_set, swblog, wblog)
        # print('281 feature_dict_data ', feature_dict_data)  # [('4033482998743585', [nan, nan, nan, nan, nan]),
        # print('282 result_dict_data', result_dict_data)  # [('4033482998743585', 1), ('3914608449995325', 1),

        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('training set data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_wblog_set, swblog, wblog)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('test set data processed')

        # Train with AdaBoost and report results
        logging.info('\nAdaBoost training starts')
        model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                          min_samples_split=20,
                                                          min_samples_leaf=5),
                                   algorithm="SAMME",
                                   n_estimators=100,
                                   learning_rate=0.5)
        model.fit(train_feature, train_result)
        logging.info('training finished')
        predict_result = model.predict(test_feature)
        logging.info('precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
        Alkit.write_prior(
            '../main/crowd_target/wblog_train' + self.file_name_appendix +
            '.txt', '../main/crowd_target/wblog_prior' +
            self.file_name_appendix + '.txt', train_wblog_set, train_result,
            test_wblog_set, test_result, predict_result, prp)
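For reference, the same AdaBoost configuration run end-to-end on synthetic data (sklearn only; the dataset is made up):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                  min_samples_split=20,
                                                  min_samples_leaf=5),
                           algorithm="SAMME",
                           n_estimators=100,
                           learning_rate=0.5)
model.fit(X, y)
print('train accuracy:', model.score(X, y))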
Example #11
    def run(self):
        """
        从数据库中读取特征数据,并使用svm和lr分类

        水军占比例(max): 0.2325521503991759
        spammer_per <= 0.2325521503991759



        :return:
        """

        if not self.add_unknown_into_model:
            # First split the users into a training set and a test set
            spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
            normal = self.sqlhelper.select_sql_one(
                'SELECT uid FROM normal WHERE choose="yes"')
            # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"')

            final_user = self.sqlhelper.select_sql_one(
                'SELECT uid FROM final_user WHERE spammer="yes"')
            """
            final_user: 3843个用户, 水军903, 非水军2940
            normal: 13906个用户, 水军和非水军未知,为此我们通过人工的方法从从这些用户中挑选了一些正常的用户,标记为choose='yes'
            spammer: 892个水军用户
    
            """
            for uid in final_user:
                if uid not in spammer:
                    spammer.append(uid)
            """
            到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903
            """

            # For some reason the spammer and normal sets overlap,
            # so overlapping users are simply treated as spammers
            for uid in spammer:
                if uid in normal:
                    normal.remove(uid)
                # if uid in unknown:
                #     unknown.remove(uid)
            """
            到目前为止,我们得到了下面几个有用的东西
            spammer: 水军  
            normal: 正常用户
            unkonwn:还没来得及标注的未知类型用户
            """
            logging.info('spammer share in the raw data (max): %s' %
                         (len(spammer) * 1.0 / (len(normal) + len(spammer))))
            if self.spammer_per > len(spammer) * 1.0 / (len(normal) +
                                                        len(spammer)):
                logging.info(
                    'we don\'t have that many spammers in our dataset, so we keep the original percentage'
                )
            else:
                expected_spammer_number = int(self.spammer_per * len(normal) *
                                              1.0 / (1 - self.spammer_per))
                spammer = random.sample(spammer, expected_spammer_number)

            # print(len(spammer))
            if self.reset_dataset:
                train_user_set = random.sample(
                    spammer, int(
                        len(spammer) * self.train_per)) + random.sample(
                            normal, int(len(normal) * self.train_per))
                test_user_set = list(
                    set(spammer + normal).difference(train_user_set))
                # # Phase-2 improvement
                # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
                # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
                #     len(normal) * train_per))+random.sample(unknown, len(unknown))
                # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
                # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown
            else:
                train_user_set, test_user_set = Alkit.read_dataset(
                    '../main/prior/user_train' + self.file_name_appendix +
                    '.txt', '../main/prior/user_prior' +
                    self.file_name_appendix + '.txt')

            # Log some information about the training and test sets
            logging.info('total dataset size: %s' %
                         (len(train_user_set) + len(test_user_set)))
            logging.info('training set size: %s' % len(train_user_set))
            logging.info(
                'positives (spammer) in training set: %s' %
                len(list(set(train_user_set).intersection(set(spammer)))))
            logging.info(
                'negatives (normal) in training set: %s' %
                len(list(set(train_user_set).intersection(set(normal)))))
            # logging.info('unknown-label (unknown) set size: %s' % len(list(set(unknown))))
            logging.info('test set size: %s' % len(test_user_set))
            logging.info(
                'positives (spammer) in test set: %s' %
                len(list(set(test_user_set).intersection(set(spammer)))))
            logging.info(
                'negatives (normal) in test set: %s' %
                len(list(set(test_user_set).intersection(set(normal)))))
            logging.info('spammer share: %s' % (len(spammer) * 1.0 /
                                                (len(normal) + len(spammer))))
            """
            测试集参与训练,但是测试集在模型训练期间标签将按照unknown处理
            """
        else:
            raise NotImplementedError('we will implement this later.')

        # Load the training and test sets from the database into ordered dicts
        # (values() returns items in insertion order, keeping features and labels aligned)
        feature_dict_data, result_dict_data = self.load_data(
            train_user_set, spammer, normal)
        train_feature, train_result = Alkit.process_data(
            feature_dict_data, result_dict_data)
        logging.info('training set data processed')
        feature_dict_data, result_dict_data = self.load_data(
            test_user_set, spammer, normal)
        test_feature, test_result = Alkit.process_data(feature_dict_data,
                                                       result_dict_data)
        logging.info('test set data processed')
        # print(metrics.mutual_info_score(train_result, train_feature))
        # Train with SVM and report results
        # logging.info('\nSVM training starts')
        # model = SVC(class_weight='balanced')
        # model.fit(train_feature, train_result)
        # logging.info('training finished')
        # predict_result = model.predict(test_feature)
        # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
        # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
        # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # import minepy
        # m = minepy.MINE()
        # for i in range(7):
        #     m.compute_score(train_feature[:,i], train_result)
        #     print(m.mic())

        # Train with LR and report results
        logging.info('LR training starts')
        model = LogisticRegression(class_weight='balanced')
        model.fit(train_feature, train_result)
        logging.info('training finished')
        predict_result = model.predict(test_feature)
        logging.info('precision: %s' %
                     metrics.precision_score(test_result, predict_result))
        logging.info('recall: %s' %
                     metrics.recall_score(test_result, predict_result))
        logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

        # Also get LR's probability outputs
        predict_result_proba = model.predict_proba(test_feature)
        prp = []
        for prob in predict_result_proba:
            prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)

        # Save both LR outputs for the next stage
        if self.dump:
            logging.info('saving results to ' + '../main/prior/user_train' +
                         self.file_name_appendix + '.txt and ' +
                         '../main/prior/user_prior' + self.file_name_appendix +
                         '.txt')
            Alkit.write_prior(
                '../main/prior/user_train' + self.file_name_appendix + '.txt',
                '../main/prior/user_prior' + self.file_name_appendix + '.txt',
                train_user_set, train_result, test_user_set, test_result,
                predict_result, prp)
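The downsampling arithmetic above in isolation: to reach a target spammer share spammer_per, the code keeps spammer_per * len(normal) / (1 - spammer_per) spammers. A worked toy run (the 903/2940 counts come from the docstring note above; spammer_per = 0.2 is invented):

import random

spammer = ['s%d' % i for i in range(903)]   # counts from the docstring note
normal = ['n%d' % i for i in range(2940)]
spammer_per = 0.2                           # invented target share

expected = int(spammer_per * len(normal) / (1 - spammer_per))  # 0.2 * 2940 / 0.8 = 735
spammer = random.sample(spammer, expected)
print(len(spammer) / (len(spammer) + len(normal)))             # ~0.2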