def load_data(self, total_set, swblog, wblog, unknown=None):
    """
    Read data from the database. The training set and the test set are read
    the same way, so the logic lives in this single method.
    :return: feature dict, label dict
    """
    feature_dict_data = OrderedDict()
    result_dict_data = OrderedDict()
    for wblogId in total_set:
        feature_dict_data[wblogId] = [
            Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
            Alkit.load_data_help_w(self.sentimentSimilarity, wblogId, 'sentiment_similarity'),
            Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
            Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]
        # feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'),
        #                               Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'),
        #                               Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')]
        if wblogId in swblog:
            result_dict_data[wblogId] = 1
        elif wblogId in wblog:
            result_dict_data[wblogId] = -1
        elif unknown is not None and wblogId in unknown:
            # guard: `unknown` defaults to None, so test it before the membership lookup
            result_dict_data[wblogId] = 0
    return feature_dict_data, result_dict_data
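# `Alkit.load_data_help_w` is defined elsewhere. Judging from the call sites
# (a MongoDB collection, a wblogId, a field name) and from the `nan` values
# visible in the debug prints further down, a minimal sketch might look like
# the following; the 'wblogId' key name is an assumption.
def load_data_help_w_sketch(collection, wblogId, field):
    """Hypothetical sketch: fetch one feature field for a wblog from a
    MongoDB collection, falling back to NaN when the document or field
    is missing (matching the nan values seen in the debug output)."""
    doc = collection.find_one({'wblogId': wblogId})  # key name assumed
    if doc is None or field not in doc:
        return float('nan')
    return float(doc[field])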
def load_data(self, total_set, swblog, wblog):
    """
    Read data from the database. The training set and the test set are read
    the same way, so the logic lives in this single method.
    :return: feature dict, label dict
    total_set=train_wblog_set, ['4033482998743585', '3914608449995325',
    swblog=swblog,             ['4045047554826553', '4039829169862097',
    wblog=wblog,               ['4032096583879003', '4054839190956692',
    """
    feature_dict_data = OrderedDict()
    result_dict_data = OrderedDict()
    for wblogId in total_set:
        feature_dict_data[wblogId] = [
            Alkit.load_data_help_w(self.mdb.time, wblogId, 'mean'),
            Alkit.load_data_help_w(self.mdb.time, wblogId, 'std'),
            Alkit.load_data_help_w(self.mdb.time, wblogId, 'skewness'),
            Alkit.load_data_help_w(self.mdb.time, wblogId, 'kurtosis'),
            Alkit.load_data_help_w(self.mdb.third, wblogId, 'third')
        ]
        if wblogId in swblog:
            result_dict_data[wblogId] = 1
        else:
            result_dict_data[wblogId] = -1
    # print("388 feature_dict_data\n", feature_dict_data)
    return feature_dict_data, result_dict_data
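# The `time` collection evidently stores the first four moments of each
# wblog's retweet-time distribution. For reference, a sketch of how such
# features are typically precomputed; `retweet_times` (offsets of each
# retweet from the original post) is a hypothetical input, and whether the
# stored kurtosis is Fisher or Pearson is an assumption.
import numpy as np
from scipy import stats

def time_distribution_features(retweet_times):
    """Sketch: the four moments stored in the `time` collection."""
    x = np.asarray(retweet_times, dtype=float)
    return {
        'mean': float(np.mean(x)),
        'std': float(np.std(x)),
        'skewness': float(stats.skew(x)),
        'kurtosis': float(stats.kurtosis(x)),  # Fisher definition by default
    }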
def evalutaion(self):
    """
    Evaluate the prior classification results.
    :return:
    """
    wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
    swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN(wblog_train_dict, wblog_prior_dict)

    scores = []
    test_result = []
    predict_result = []
    for uid in wblog_prior_list:
        test_result.append(float(wblog_prior_dict[uid]['label']))
        predict_result.append(float(wblog_prior_dict[uid]['prior_label']))
        scores.append(float(wblog_prior_dict[uid]['prior']))
    # print(float(metrics.f1_score(test_result, predict_result)))
    Evaluation.evaluation_self(scores, test_result)

    # average precision (AP)
    p, r, thresholds = metrics.precision_recall_curve(test_result, scores)
    ap = metrics.average_precision_score(test_result, scores)
    logging.info('wblog AP:%s' % str(ap))
    with open('../main/lr/wblog_ap' + self.file_name_appendix + '.txt', 'w') as my_file:
        my_file.write('p r\n')
        for i in range(len(p)):
            my_file.write('%s %s\n' % (str(p[i]), str(r[i])))

    # ROC / AUC
    fpr, tpr, thresholds = metrics.roc_curve(test_result, scores)
    logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr)))
    with open('../main/lr/wblog_roc' + self.file_name_appendix + '.txt', 'w') as my_file:
        my_file.write('fpr tpr\n')
        for i in range(len(fpr)):
            my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i])))

    # top-k precision: rank wblogs by score and report precision at each depth k
    wblog_score = {}
    for i in range(len(scores)):
        wblog_score[wblog_prior_list[i]] = scores[i]
    wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True)
    with open('../main/lr/res_wblog_top' + self.file_name_appendix + '.txt', 'w') as my_file:
        my_file.write('type wblogId score precision top_k\n')
        wblog_count_now = 0
        top_k = 0
        for itm in wblog_score:
            uid = itm[0]
            score = itm[1]
            if uid in swblog:
                u_type = 's'
                wblog_count_now += 1
            else:
                u_type = 'n'
            top_k += 1
            precision = str(float(wblog_count_now) / top_k)
            my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' +
                          precision + ' ' + str(top_k) + '\n')
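# The structure returned by `Alkit.read_prior` can be inferred from how it is
# consumed above: the prior dict maps each id to a record with 'label'
# (ground truth), 'prior_label' (predicted label) and 'prior' (real-valued
# score). A hedged sketch, assuming whitespace-separated columns; only the
# three dict keys are confirmed by the call sites, the column layout is not.
from collections import OrderedDict

def read_prior_sketch(train_file, prior_file):
    """Hypothetical sketch of Alkit.read_prior (file layout assumed)."""
    train_dict, train_list = OrderedDict(), []
    with open(train_file) as f:
        for line in f:
            wid, label = line.split()[:2]
            train_dict[wid] = {'label': label}
            train_list.append(wid)
    prior_dict, prior_list = OrderedDict(), []
    with open(prior_file) as f:
        for line in f:
            wid, label, prior_label, prior = line.split()[:4]
            prior_dict[wid] = {'label': label, 'prior_label': prior_label, 'prior': prior}
            prior_list.append(wid)
    return train_dict, train_list, prior_dict, prior_list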
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    """
    Load the data MSCA needs in __init__.
    """
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    # Read the training set and the prior labels produced on the test set.
    # user_train_dict: training set, with labels
    # user_train_list: training set, user ids only
    # user_prior_dict: test set, with ground-truth labels and prior labels
    # user_prior_list: test set, user ids only
    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
    # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
    #     Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt')
    # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
    #     Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt')

    # spammer: ground-truth spammer users
    # spammer_prior: users judged spammers by the prior classifier
    # normal: ground-truth normal users
    # normal_prior: users judged normal by the prior classifier
    # swblog / swblog_prior / nwblog / nwblog_prior are analogous for wblogs
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)
    self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
        self.wblog_train_dict, self.wblog_prior_dict)
    self.all_user = self.user_train_list + self.user_prior_list
    self.all_wblog = self.wblog_train_list + self.wblog_prior_list

    self.follow_edge = {}        # {'uid': ['followeeUid']}
    self.follow_cnt = {}         # {'uid': follow count}
    self.retweet_edge = {}       # {'uid': ['wblogId']}
    self.wblog_retweet_cnt = {}  # {wblogId: retweet count}
    self.user_retweet_cnt = {}   # {uid: retweet count}
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    """
    Load the data S3MCD needs in __init__.
    """
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    # Read the training set and the prior labels produced on the test set.
    # user_train_dict: training set, with labels
    # user_train_list: training set, user ids only
    # user_prior_dict: test set, with ground-truth labels and prior labels
    # user_prior_list: test set, user ids only
    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

    # spammer: ground-truth spammer users
    # spammer_prior: users judged spammers by the prior classifier
    # normal: ground-truth normal users
    # normal_prior: users judged normal by the prior classifier
    # swblog / swblog_prior / nwblog / nwblog_prior are analogous for wblogs
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)
    self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
        self.wblog_train_dict, self.wblog_prior_dict)
    self.all_user = self.user_prior_list
    self.all_wblog = self.wblog_prior_list

    self.follow_edge = {}    # {'uid': ['followeeUid']}
    self.tweet_edge = {}     # {'uid': ['wblogId']}
    self.wblog_content = {}  # {'wblogId': [content]}
    self.pattern_html = re.compile(r'<[^>]+>', re.S)  # strips HTML tags
    self.pattern_tag = re.compile(r'#.+#', re.S)      # matches #hashtag# topics
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    """
    Load the data CrowdTarget needs in __init__.
    """
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    # Read the training set and the prior labels produced on the test set.
    # user_train_dict: training set, with labels
    # user_train_list: training set, user ids only
    # user_prior_dict: test set, with ground-truth labels and prior labels
    # user_prior_list: test set, user ids only
    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

    # spammer: ground-truth spammer users
    # spammer_prior: users judged spammers by the prior classifier
    # normal: ground-truth normal users
    # normal_prior: users judged normal by the prior classifier
    # swblog / swblog_prior / nwblog / nwblog_prior are analogous for wblogs
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)
    self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
        self.wblog_train_dict, self.wblog_prior_dict)
    self.all_user = self.user_prior_list
    self.all_wblog = self.wblog_train_list + self.wblog_prior_list

    # author's note: this was originally crowd_target; the database name was
    # misspelled at one point, so it was changed to crow_target
    self.mdb = MongoClient().crowd_target
    self.sqlhelper = SqlHelper()  # note: overwrites the parameterised SqlHelper created above with a default-configured one
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)

    # seed workers: labelled spammers from the training set
    self.seed_worker = []
    for uid in self.user_train_dict.keys():
        if self.user_train_dict[uid]['label'] == '1':
            self.seed_worker.append(uid)
    # other workers: ground-truth spammers from the test set
    self.other_worker = []
    for uid in self.user_prior_dict.keys():
        if self.user_prior_dict[uid]['label'] == '1':
            self.other_worker.append(uid)
    # ground-truth normal users from the test set
    # (note: this overwrites self.normal as set by Alkit.setSN above)
    self.normal = []
    for uid in self.user_prior_dict.keys():
        if self.user_prior_dict[uid]['label'] == '-1':
            self.normal.append(uid)
    self.all_user = self.seed_worker + self.other_worker + self.normal

    # follow edges, restricted to users inside the dataset
    self.follow_edge = []
    for uid in self.all_user:
        for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
            uid = str(result[0])
            followeeUid = str(result[1])
            if followeeUid not in self.all_user:
                continue
            self.follow_edge.append((uid, followeeUid))
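# `Alkit.setSN` is not shown, but its behaviour can be inferred from the
# manual partitioning above and from how its outputs are used elsewhere
# (e.g. the top-k ranking in evalutaion() checks test-set ids against the
# first return value, so ground-truth positives from both dicts must be
# included). A hedged sketch; the exact semantics are inferred, not confirmed.
def setSN_sketch(train_dict, prior_dict):
    """Hypothetical sketch of Alkit.setSN: ground-truth positive/negative
    sets from both dicts, plus prior-predicted sets from the test dict."""
    positive = [k for k, v in train_dict.items() if v['label'] == '1'] + \
               [k for k, v in prior_dict.items() if v['label'] == '1']
    negative = [k for k, v in train_dict.items() if v['label'] == '-1'] + \
               [k for k, v in prior_dict.items() if v['label'] == '-1']
    positive_prior = [k for k, v in prior_dict.items() if v['prior_label'] == '1']
    negative_prior = [k for k, v in prior_dict.items() if v['prior_label'] == '-1']
    return positive, positive_prior, negative, negative_prior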
def load_data(self, total_set, spammer, normal, unknown=None):
    """
    Read data from the database. The training set and the test set are read
    the same way, so the logic lives in this single method.
    :return: feature dict, label dict
    """
    feature_dict_data = OrderedDict()
    result_dict_data = OrderedDict()
    for uid in total_set:
        feature_dict_data[uid] = [
            Alkit.load_data_help(self.registerDay, uid, 'log_time'),
            Alkit.load_data_help(self.followCnt, uid, 'log_follower'),
            Alkit.load_data_help(self.followCnt, uid, 'log_followee'),
            Alkit.load_data_help(self.oriThirdFre, uid, 'fre'),
            Alkit.load_data_help(self.retweetFre, uid, 'follow_fre'),
            Alkit.load_data_help(self.retweetFre, uid, 'onehop_fre'),
            Alkit.load_data_help(self.rvp, uid, 'rvp_ratio')
        ]
        """
        TODO: check whether these MongoDB collections contain only normal and
        spammer users, or whether the unknown users are included as well:
            self.registerDay = MongoClient().userFeature.registerDay
            self.followCnt = MongoClient().userFeature.followCnt
            self.oriThirdFre = MongoClient().userFeature.oriThirdFre
            self.retweetFre = MongoClient().userFeature.retweetFre
            self.rvp = MongoClient().userFeature.rvp
        """
        # feature_dict_data[uid] = [Alkit.load_data_help(self.followCnt, uid, 'follower_cnt'),
        #                           Alkit.load_data_help(self.followCnt, uid, 'followee_cnt'),
        #                           Alkit.load_data_help(self.followCnt, uid, 'ff'),
        #                           Alkit.load_data_help(self.followCnt, uid, 'profile'),
        #                           Alkit.load_data_help(self.rvp, uid, 'discription')]
        # if uid in spammer:
        #     result_dict_data[uid] = 1
        # else:
        #     result_dict_data[uid] = -1
        # phase-2 revision
        if uid in spammer:
            result_dict_data[uid] = 1
        elif uid in normal:
            result_dict_data[uid] = -1
        elif unknown is not None and uid in unknown:
            # added in phase 2: users whose label is unknown get label 0
            result_dict_data[uid] = 0
    return feature_dict_data, result_dict_data
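# The TODO in the docstring above asks whether the feature collections cover
# the unknown users. A quick hedged check that could be run once per
# collection; the 'uid' key name is an assumption.
def coverage(collection, uids):
    """Fraction of the given uids that have a document in `collection`
    (hypothetical helper; assumes documents are keyed by a 'uid' field)."""
    found = sum(1 for uid in uids if collection.find_one({'uid': uid}) is not None)
    return found / max(len(uids), 1)

# e.g. coverage(MongoClient().userFeature.registerDay, unknown_uids)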
def run(self):
    """
    Read feature data from the database and classify with SVM / LR.
    :return:
    """
    if not self.add_unknown_into_model:
        swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
        wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')
        final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
        for wblogId in final_wblog:
            if wblogId not in swblog:
                swblog.append(wblogId)
        # For unknown reasons the spammer and normal sets overlap;
        # overlapping entries are simply treated as spam here.
        for uid in swblog:
            if uid in wblog:
                wblog.remove(uid)
        """
        At this point we have:
        swblog: spam wblogs
        wblog: normal wblogs
        unknown: wblogs that have not been labelled yet
        """
        logging.info('spam share in raw data (max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog))))
        if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)):
            logging.info('we don\'t have so much spams in our datasets, we will keep original percentage')
        else:
            expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per))
            swblog = random.sample(swblog, expected_spam_number)

        if self.reset_dataset:
            train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + \
                              random.sample(wblog, int(len(wblog) * self.train_per))
            test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set))
            # # phase-2 revision
            # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
            # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
            #     len(normal) * train_per)) + random.sample(unknown, len(unknown))
            # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
            # train_user_set = train_user_set_with_unknown + train_user_set_with_unknown
        else:
            train_wblog_set, test_wblog_set = Alkit.read_dataset(
                '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

        # log some information about the training and test sets
        logging.info('total dataset size: %s' % (len(train_wblog_set) + len(test_wblog_set)))
        logging.info('training set size: %s' % len(train_wblog_set))
        logging.info('positives (swblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
        logging.info('test set size: %s' % len(test_wblog_set))
        logging.info('positives (swblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
        logging.info('negatives (wblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
    else:
        raise NotImplementedError('we will implement this later.')

    # Read the training and test sets from the database into ordered dicts
    # (values() preserves insertion order, keeping features and labels aligned)
    feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
    train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('training data ready')
    feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
    test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('test data ready')

    # train an SVM and report results
    # logging.info('\nSVM training started')
    # model = SVC(class_weight='balanced')
    # model.fit(train_feature, train_result)
    # logging.info('training finished')
    # predict_result = model.predict(test_feature)
    # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
    # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
    # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

    # train LR and report results
    logging.info('LR training started')
    model = LogisticRegression(class_weight='balanced')
    model.fit(train_feature, train_result)
    logging.info('training finished')
    predict_result = model.predict(test_feature)
    logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
    logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
    logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

    # also keep LR's probabilistic output
    predict_result_proba = model.predict_proba(test_feature)
    prp = []
    for prob in predict_result_proba:
        prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)

    # persist both LR outputs for the next stage
    if self.dump:
        logging.info('writing results to ' +
                     '../main/prior/wblog_train' + self.file_name_appendix + '.txt' + ' and ' +
                     '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
        Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                          '../main/prior/wblog_prior' + self.file_name_appendix + '.txt',
                          train_wblog_set, train_result, test_wblog_set, test_result,
                          predict_result, prp)
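# The `prp` score collapses the two class probabilities into one signed
# score. scikit-learn orders predict_proba columns by model.classes_, which
# is sorted, so with labels {-1, 1} prob[0] is P(y=-1) and prob[1] is
# P(y=+1). The score therefore reduces to:
#
#     prp = (-1) * P(y = -1) + (+1) * P(y = +1)
#         = 2 * P(y = +1) - 1        (the probabilities sum to 1)
#
# so prp lies in [-1, 1]; its sign matches the predicted label and its
# magnitude reflects the model's confidence.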
def run(self, train_per=0.8, reset_dataset=False):
    """
    Read feature data from the database and classify with AdaBoost.
    :return:
    """
    # first split wblogs into training and test sets
    swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
    wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')
    final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
    for wblogId in final_wblog:
        if wblogId not in swblog:
            swblog.append(wblogId)
    for uid in swblog:
        if uid in wblog:
            wblog.remove(uid)
    train_wblog_set, test_wblog_set = Alkit.read_dataset(
        '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
        '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

    # log some information about the training and test sets
    logging.info('training set size: %s' % len(train_wblog_set))
    logging.info('positives (swblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
    logging.info('negatives (wblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
    logging.info('test set size: %s' % len(test_wblog_set))
    logging.info('positives (swblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
    logging.info('negatives (wblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
    # print('279 train_wblog_set \n', train_wblog_set)
    # print('279 swblog \n', swblog)
    # print('279 wblog \n', wblog)

    # Read the training and test sets from the database into ordered dicts
    # (values() preserves insertion order, keeping features and labels aligned)
    feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
    # print('281 feature_dict_data ', feature_dict_data)  # [('4033482998743585', [nan, nan, nan, nan, nan]),
    # print('282 result_dict_data', result_dict_data)  # [('4033482998743585', 1), ('3914608449995325', 1),
    train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('training data ready')
    feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
    test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('test data ready')

    # train AdaBoost and report results
    logging.info('\nAdaBoost training started')
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
                               algorithm="SAMME", n_estimators=100, learning_rate=0.5)
    model.fit(train_feature, train_result)
    logging.info('training finished')
    predict_result = model.predict(test_feature)
    logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
    logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
    logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
    predict_result_proba = model.predict_proba(test_feature)
    prp = []
    for prob in predict_result_proba:
        prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
    Alkit.write_prior('../main/crowd_target/wblog_train' + self.file_name_appendix + '.txt',
                      '../main/crowd_target/wblog_prior' + self.file_name_appendix + '.txt',
                      train_wblog_set, train_result, test_wblog_set, test_result,
                      predict_result, prp)
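# `Alkit.process_data` turns the two ordered dicts into parallel arrays for
# scikit-learn. The debug output above shows NaN features reaching this
# point, and the estimators used here cannot fit on NaN, so the helper
# presumably imputes them; a hedged sketch (mean imputation is an assumption).
import numpy as np

def process_data_sketch(feature_dict_data, result_dict_data):
    """Hypothetical sketch of Alkit.process_data: align features and labels
    by insertion order and impute NaNs (imputation strategy assumed)."""
    X = np.array(list(feature_dict_data.values()), dtype=float)
    y = np.array(list(result_dict_data.values()), dtype=int)
    col_mean = np.nanmean(X, axis=0)            # per-feature mean, ignoring NaN
    nan_rows, nan_cols = np.where(np.isnan(X))
    X[nan_rows, nan_cols] = col_mean[nan_cols]  # replace NaN with column mean
    return X, y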
def run(self):
    """
    Read feature data from the database and classify with SVM / LR.
    spammer share (max): 0.2325521503991759
    spammer_per <= 0.2325521503991759
    :return:
    """
    if not self.add_unknown_into_model:
        # first split users into training and test sets
        spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer')
        normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
        # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"')
        final_user = self.sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
        """
        final_user: 3843 users, 903 spammers, 2940 non-spammers
        normal: 13906 users whose spammer/non-spammer status is unknown; some
                were manually verified as normal users and marked choose='yes'
        spammer: 892 spammer users
        """
        for uid in final_user:
            if uid not in spammer:
                spammer.append(uid)
        """
        At this point `spammer` equals spammer UNION final_user.spammer,
        903 users in total.
        """
        # For unknown reasons the spammer and normal sets overlap;
        # overlapping entries are simply treated as spammers here.
        for uid in spammer:
            if uid in normal:
                normal.remove(uid)
                # if uid in unknown:
                #     unknown.remove(uid)
        """
        At this point we have:
        spammer: spammer users
        normal: normal users
        unknown: users that have not been labelled yet
        """
        logging.info('spammer share in raw data (max): %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer))))
        if self.spammer_per > len(spammer) * 1.0 / (len(normal) + len(spammer)):
            logging.info('we don\'t have so much spammers in our datasets, we will keep original percentage')
        else:
            expected_spammer_number = int(self.spammer_per * len(normal) * 1.0 / (1 - self.spammer_per))
            spammer = random.sample(spammer, expected_spammer_number)
        # print(len(spammer))

        if self.reset_dataset:
            train_user_set = random.sample(spammer, int(len(spammer) * self.train_per)) + \
                             random.sample(normal, int(len(normal) * self.train_per))
            test_user_set = list(set(spammer + normal).difference(train_user_set))
            # # phase-2 revision
            # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per))
            # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(
            #     len(normal) * train_per)) + random.sample(unknown, len(unknown))
            # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown))
            # train_user_set = train_user_set_with_unknown + train_user_set_with_unknown
        else:
            train_user_set, test_user_set = Alkit.read_dataset(
                '../main/prior/user_train' + self.file_name_appendix + '.txt',
                '../main/prior/user_prior' + self.file_name_appendix + '.txt')

        # log some information about the training and test sets
        logging.info('total dataset size: %s' % (len(train_user_set) + len(test_user_set)))
        logging.info('training set size: %s' % len(train_user_set))
        logging.info('positives (spammer) in training set: %s' % len(list(set(train_user_set).intersection(set(spammer)))))
        logging.info('negatives (normal) in training set: %s' % len(list(set(train_user_set).intersection(set(normal)))))
        # logging.info('unlabelled (unknown) in training set: %s' % len(list(set(unknown))))
        logging.info('test set size: %s' % len(test_user_set))
        logging.info('positives (spammer) in test set: %s' % len(list(set(test_user_set).intersection(set(spammer)))))
        logging.info('negatives (normal) in test set: %s' % len(list(set(test_user_set).intersection(set(normal)))))
        logging.info('spammer share: %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer))))
        """
        The test set takes part in training, but during model training its
        labels are treated as unknown.
        """
    else:
        raise NotImplementedError('we will implement this later.')

    # Read the training and test sets from the database into ordered dicts
    # (values() preserves insertion order, keeping features and labels aligned)
    feature_dict_data, result_dict_data = self.load_data(train_user_set, spammer, normal)
    train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('training data ready')
    feature_dict_data, result_dict_data = self.load_data(test_user_set, spammer, normal)
    test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('test data ready')
    # print(metrics.mutual_info_score(train_result, train_feature))

    # train an SVM and report results
    # logging.info('\nSVM training started')
    # model = SVC(class_weight='balanced')
    # model.fit(train_feature, train_result)
    # logging.info('training finished')
    # predict_result = model.predict(test_feature)
    # logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
    # logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
    # logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

    # import minepy
    # m = minepy.MINE()
    # for i in range(7):
    #     m.compute_score(train_feature[:, i], train_result)
    #     print(m.mic())

    # train LR and report results
    logging.info('LR training started')
    model = LogisticRegression(class_weight='balanced')
    model.fit(train_feature, train_result)
    logging.info('training finished')
    predict_result = model.predict(test_feature)
    logging.info('precision: %s' % metrics.precision_score(test_result, predict_result))
    logging.info('recall: %s' % metrics.recall_score(test_result, predict_result))
    logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))

    # also keep LR's probabilistic output
    predict_result_proba = model.predict_proba(test_feature)
    prp = []
    for prob in predict_result_proba:
        prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)

    # persist both LR outputs for the next stage
    if self.dump:
        logging.info('writing results to ' +
                     '../main/prior/user_train' + self.file_name_appendix + '.txt and ' +
                     '../main/prior/user_prior' + self.file_name_appendix + '.txt')
        Alkit.write_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                          '../main/prior/user_prior' + self.file_name_appendix + '.txt',
                          train_user_set, train_result, test_user_set, test_result,
                          predict_result, prp)
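# A hedged usage sketch of how these prior classifiers appear to be driven,
# given __init__(h, d, u, p, c, file_name_appendix) and the run/evalutaion
# pair above. The class name `PriorLR` and all credential values are
# placeholders, not names from this repository.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    clf = PriorLR(h='127.0.0.1', d='spammer_db', u='root', p='passwd',
                  c='utf8mb4', file_name_appendix='_v1')  # hypothetical values
    clf.run()          # train LR, write ../main/prior/*_v1.txt
    clf.evalutaion()   # report AP / AUC / top-k precision from those files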