def time_distribution():
    """
    Count how wblogs are distributed over the hours of the week.
    :return:
    """
    sqlhelper = SqlHelper()
    res = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}}
    for key in res.keys():
        for i in range(24):
            res[key][i] = 0
    for t in sqlhelper.select_sql('SELECT created_at FROM wblog'):
        timestamp = t[0]
        res[timestamp.weekday()][timestamp.hour] += 1
    with open('data/timestamp.txt', 'w') as my_file:
        for key in res.keys():
            for k in res[key].keys():
                my_file.write(str(key * 24 + k) + ' ' + str(res[key][k]) + '\n')
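# Illustrative sketch (not part of the original code): time_distribution() writes one
# line per hour-of-week, where the index is weekday * 24 + hour (0 = Monday 00:00).
# read_time_distribution is a hypothetical reader that recovers (weekday, hour, count)
# pairs from data/timestamp.txt, assuming the file layout produced above.
def read_time_distribution(path='data/timestamp.txt'):
    distribution = {}
    with open(path) as my_file:
        for line in my_file:
            if not line.strip():
                continue
            index, count = line.split()
            index, count = int(index), int(count)
            distribution[(index // 24, index % 24)] = count  # (weekday, hour) -> count
    return distribution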
def __enter__(self):
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.mdb = MongoClient().userFeature
    return self
def profile_complete():
    """
    Measure how complete each user's profile page is.
    :return:
    """
    sqlhelper = SqlHelper()
    spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
    normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"')
    cnt_dict = {}
    profile = MongoClient().profile.json_text
    for json_text in profile.find():
        uid = json_text['uid']
        if uid not in spammer and uid not in normal:
            continue
        cnt = 0
        try:
            for card in json_text['json_text']['cards']:
                try:
                    cnt += len(card['card_group'])
                except Exception as e:
                    pass
        except Exception as e:
            print('no cards %s' % uid)
        cnt_dict[uid] = cnt

    spammer_dict = {}
    spammer_cnt = 0
    normal_dict = {}
    normal_cnt = 0
    for key in cnt_dict.keys():
        if key in spammer:
            if cnt_dict[key] not in spammer_dict.keys():
                spammer_dict[cnt_dict[key]] = 0
            spammer_dict[cnt_dict[key]] += 1
            spammer_cnt += 1
        else:
            if cnt_dict[key] not in normal_dict.keys():
                normal_dict[cnt_dict[key]] = 0
            normal_dict[cnt_dict[key]] += 1
            normal_cnt += 1

    spammer_dict = sorted(spammer_dict.items(), key=lambda x: x[0])
    with open('data/profile_complete_spammer.txt', 'w') as my_file:
        cnt = 0
        for itm in spammer_dict:
            cnt += itm[1]
            my_file.write('%s %s\n' % (str(float(itm[0])), str(float(cnt) / spammer_cnt)))
    normal_dict = sorted(normal_dict.items(), key=lambda x: x[0])
    with open('data/profile_complete_normal.txt', 'w') as my_file:
        cnt = 0
        for itm in normal_dict:
            cnt += itm[1]
            my_file.write('%s %s\n' % (str(float(itm[0])), str(float(cnt) / normal_cnt)))
def interact():
    """
    Count comment interaction for wblogs.
    :return:
    """
    sqlhelper = SqlHelper()
    swblog = sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
    wblog = sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')
    final_wblog = sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
    for wblogId in final_wblog:
        if wblogId not in swblog:
            swblog.append(wblogId)
    for wblogId in swblog:
        if wblogId in wblog:
            wblog.remove(wblogId)
    print(len(swblog) + len(wblog))

    hot = 0
    interact = 0
    hotCommentRatio = MongoClient().wblogFeature.hotCommentRatio
    commentInteractRatio = MongoClient().wblogFeature.commentInteractRatio
    for wblogId in wblog:
        try:
            a = hotCommentRatio.find_one({'wblogId': str(wblogId)})['hot_ratio']
            b = commentInteractRatio.find_one({'wblogId': str(wblogId)})['interact_ratio']
            # if float(a) != 0:
            #     hot += 1
            # if float(b) != 0:
            #     interact += 1
            if float(a) != 0 or float(b) != 0:
                hot += 1
        except Exception as e:
            print('%s---- %s' % (str(e), str(wblogId)))
    print()
    print(hot)
    print(len(wblog))
    print(float(hot) / len(wblog))
    print()

    hot = 0
    for wblogId in swblog:
        try:
            a = hotCommentRatio.find_one({'wblogId': str(wblogId)})['hot_ratio']
            b = commentInteractRatio.find_one({'wblogId': str(wblogId)})['interact_ratio']
            if float(a) != 0 or float(b) != 0:
                hot += 1
        except Exception as e:
            print('%s---- %s' % (str(e), str(wblogId)))
    print(hot)
    print(len(swblog))
    print(float(hot) / len(swblog))
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    """
    __init__ loads the data MSCA needs.
    """
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    # Load the training set and the prior labels obtained on the test set.
    # user_train_dict: training set, with labels
    # user_train_list: training set, user ids only
    # user_prior_dict: test set, with ground-truth labels plus the prior label
    # user_prior_list: test set, user ids only
    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
    # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
    #     Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt')
    # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
    #     Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt')

    # spammer: ground-truth spammers
    # spammer_prior: users judged to be spammers by the prior classifier
    # normal: ground-truth normal users
    # normal_prior: users judged to be normal by the prior classifier
    # swblog, swblog_prior, wblog, wblog_prior follow the same convention
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)
    self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
        self.wblog_train_dict, self.wblog_prior_dict)
    self.all_user = self.user_train_list + self.user_prior_list
    self.all_wblog = self.wblog_train_list + self.wblog_prior_list

    self.follow_edge = {}        # {'uid': ['followeeUid']}
    self.follow_cnt = {}         # {'uid': follow count}
    self.retweet_edge = {}       # {'uid': ['wblogId']}
    self.wblog_retweet_cnt = {}  # {wblogId: retweet count}
    self.user_retweet_cnt = {}   # {uid: retweet count}
def _set_follow_edge(user_list, all_user):
    follow_edge = {}
    sqlhelper = SqlHelper(host='localhost', db='sdh', user='******', passwd='root', charset='utf8')
    for uid in user_list:
        follow_edge[uid] = []
        for result in sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
            uid = str(result[0])
            followeeUid = str(result[1])
            if followeeUid not in all_user:
                continue
            follow_edge[uid].append(followeeUid)
    return follow_edge
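# Usage sketch (illustrative only; it assumes a reachable MySQL instance with the
# same `edge` table): _set_follow_edge returns {'uid': ['followeeUid', ...]} restricted
# to uids in all_user, so flattening the dict yields the directed follow edges inside
# the labeled user set. follow_edges_as_pairs is a hypothetical helper, not part of
# the original code.
def follow_edges_as_pairs(user_list, all_user):
    follow_edge = _set_follow_edge(user_list, all_user)
    return [(uid, followee)
            for uid, followees in follow_edge.items()
            for followee in followees]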
def count_edge():
    sqlhelper = SqlHelper()
    cnt = 0
    spammer = sqlhelper.select_sql_one('SELECT uid FROM spammer')
    normal = sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
    for uid in spammer:
        if uid in normal:
            normal.remove(uid)
    all_user = spammer + normal
    print(len(all_user))
    for uid in all_user:
        for u in sqlhelper.select_sql('SELECT followeeUid FROM edge WHERE uid=%s' % str(uid)):
            if str(u[0]) in all_user:
                cnt += 1
    print(cnt)
def _set_retweet_edge(user_list, all_wblog):
    retweet_edge = {}
    sqlhelper = SqlHelper(host='localhost', db='sdh', user='******', passwd='root', charset='utf8')
    for uid in user_list:
        retweet_edge[uid] = []
        for res in sqlhelper.select_sql('SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid):
            paMid = str(res[0])
            orMid = str(res[1])
            if paMid in all_wblog:
                retweet_edge[uid].append(paMid)
            if orMid in all_wblog:
                retweet_edge[uid].append(orMid)
    return retweet_edge
def tongi():
    """
    Assorted statistics.
    :return:
    """
    sqlhelper = SqlHelper()
    spammer = sqlhelper.select_sql_one('SELECT uid FROM spammer')
    normal = sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"')
    final_user = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"')
    for uid in final_user:
        if uid not in spammer:
            spammer.append(uid)
    for uid in spammer:
        if uid in normal:
            normal.remove(uid)
    print(len(spammer))
    print(len(normal))
def count_comment():
    sqlhelper = SqlHelper()
    comment = {}
    col = MongoClient().wblog.wblog
    # i = 0
    for wblogId in sqlhelper.select_sql('SELECT wblogId FROM wblog'):
        wblogId = wblogId[0]
        cnt = 0
        try:
            wblog = col.find_one({'wblogId': str(wblogId)})['json_text']
            cnt = int(wblog['comments_count'])
            # print(cnt)
        except Exception as e:
            print(e)
        if cnt not in comment.keys():
            comment[cnt] = 1
        else:
            comment[cnt] += 1
        # i += 1
        # if i == 100:
        #     break

    # cnt = []
    # for i in range(10000):
    #     cnt.append(i)
    # comment_cnt = init_dict(cnt, 0)
    # calculate_cnt(comment_cnt, comment)
    write_dict_cnt_to_txt(comment, 'data\\comment_cnt.txt')
    """
    Sample output (comment count, number of wblogs):
    0 615501
    1 120480
    2 74059
    3 47064
    4 37356
    5 29747
    6 25166
    """
    sqlhelper.close()
def _set_tweet_edge(user_list_split, all_wblog):
    tweet_edge = {}
    sqlhelper = SqlHelper(host='localhost', db='sdh', user='******', passwd='root', charset='utf8')
    for uid in user_list_split:
        tweet_edge[uid] = []
        for res in sqlhelper.select_sql('SELECT wblogId FROM wblog WHERE uid=%s' % uid):
            wblogId = str(res[0])
            if wblogId in all_wblog:
                tweet_edge[uid].append(wblogId)
        for res in sqlhelper.select_sql('SELECT wblogId FROM swblog WHERE uid=%s' % uid):
            wblogId = str(res[0])
            if wblogId in all_wblog:
                tweet_edge[uid].append(wblogId)
    return tweet_edge
def setFF(self): """ :return: none """ col = self.mdb.followCnt sqlhelper = SqlHelper() # spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"') # normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"') # cnt_dict = {} # profile = MongoClient().profile.json_text # for json_text in profile.find(): # uid = json_text['uid'] # if uid not in spammer and uid not in normal: # continue # cnt = 0 # try: # for card in json_text['json_text']['cards']: # try: # cnt += len(card['card_group']) # except Exception as e: # pass # except Exception as e: # print('no cards %s' % uid) # cnt_dict[uid] = cnt # for key in cnt_dict.keys(): # col.update({'uid': str(key)}, {'$set': {'profile': cnt_dict[key]}}) # # followCnt = MongoClient().userFeature.followCnt # for user in followCnt.find(): # uid = user['uid'] # try: # followee_cnt = followCnt.find_one({'uid': str(uid)})['followee_cnt'] # follower_cnt = followCnt.find_one({'uid': str(uid)})['follower_cnt'] # res = float(followee_cnt) / follower_cnt # col.update({'uid': str(uid)}, {'$set': {'ff': res}}) # except Exception as e: # print('no cards %s' % uid) uu = MongoClient().profile.user for user in uu.find(): uid = user['uid'] # if uid in spammer try: if uu.find_one({'uid': str(uid) })['json_text']['description'] != '': col.update({'uid': str(uid)}, {'$set': {'description': 1}}) else: col.update({'uid': str(uid)}, {'$set': {'description': 0}}) except Exception as e: print('no cards %s' % uid)
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(self.user_train_dict,
                                                                                   self.user_prior_dict)

    self.seed_worker = []
    for uid in self.user_train_dict.keys():
        if self.user_train_dict[uid]['label'] == '1':
            self.seed_worker.append(uid)
    self.other_worker = []
    for uid in self.user_prior_dict.keys():
        if self.user_prior_dict[uid]['label'] == '1':
            self.other_worker.append(uid)
    self.normal = []
    for uid in self.user_prior_dict.keys():
        if self.user_prior_dict[uid]['label'] == '-1':
            self.normal.append(uid)
    self.all_user = self.seed_worker + self.other_worker + self.normal

    self.follow_edge = []
    for uid in self.all_user:
        for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid):
            uid = str(result[0])
            followeeUid = str(result[1])
            if followeeUid not in self.all_user:
                continue
            self.follow_edge.append((uid, followeeUid))
def works():
    """
    Count how many tasks each crowdturfing worker took part in.
    :return:
    """
    sqlhelper = SqlHelper()
    w = {}
    for res in sqlhelper.select_sql('SELECT woUid FROM works1516'):
        woUid = res[0]
        if woUid not in w:
            w[woUid] = 0
        w[woUid] += 1

    # w_cnt[c] = number of workers that took part in exactly c tasks
    w_cnt = {}
    for woUid in w.keys():
        cnt = w[woUid]
        if cnt not in w_cnt:
            w_cnt[cnt] = 0
        w_cnt[cnt] += 1
    w_cnt = sorted(w_cnt.items(), key=lambda x: x[0])
    with open('data/works.txt', 'w') as my_file:
        my_file.write('works_cnt worker_cnt\n')
        for itm in w_cnt:
            my_file.write('%s %s\n' % (str(itm[0]), str(itm[1])))
def __init__(self, h, d, u, p, c, file_name_appendix=''):
    """
    __init__ loads the data S3MCD needs.
    """
    self.host = h
    self.db = d
    self.user = u
    self.passwd = p
    self.charset = c
    self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user,
                               passwd=self.passwd, charset=self.charset)
    self.file_name_appendix = file_name_appendix

    # Load the training set and the prior labels obtained on the test set.
    # user_train_dict: training set, with labels
    # user_train_list: training set, user ids only
    # user_prior_dict: test set, with ground-truth labels plus the prior label
    # user_prior_list: test set, user ids only
    self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \
        Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt',
                         '../main/prior/user_prior' + self.file_name_appendix + '.txt')
    self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \
        Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt',
                         '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')

    # spammer: ground-truth spammers
    # spammer_prior: users judged to be spammers by the prior classifier
    # normal: ground-truth normal users
    # normal_prior: users judged to be normal by the prior classifier
    # swblog, swblog_prior, wblog, wblog_prior follow the same convention
    self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(
        self.user_train_dict, self.user_prior_dict)
    self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN(
        self.wblog_train_dict, self.wblog_prior_dict)
    self.all_user = self.user_prior_list
    self.all_wblog = self.wblog_prior_list

    self.follow_edge = {}    # {'uid': ['followeeUid']}
    self.tweet_edge = {}     # {'uid': ['wblogId']}
    self.wblog_content = {}  # {'wblogId': [content]}

    self.pattern_html = re.compile(r'<[^>]+>', re.S)
    self.pattern_tag = re.compile(r'#.+#', re.S)
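# Minimal sketch of what the two regex patterns above strip from a comment text
# (the sample string and helper name are made up for illustration):
import re

_pattern_html = re.compile(r'<[^>]+>', re.S)
_pattern_tag = re.compile(r'#.+#', re.S)

def clean_comment_text(text):
    text = _pattern_html.sub('', text)   # drop inline HTML such as emoticon <img> tags
    return _pattern_tag.sub('', text)    # drop #topic# tags

# clean_comment_text('#ad# follow me<img src="x.png"/>') -> ' follow me'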
def count_wblog():
    sqlhelper = SqlHelper()
    wblog = {}
    for user in sqlhelper.select_sql_one('SELECT uid FROM user'):
        wblog[str(user)] = 0
        tmp = sqlhelper.select_cnt('SELECT count(*) FROM swblog WHERE uid=%s' % (str(user)))
        # print(tmp)
        if tmp:
            wblog[str(user)] += int(tmp)
        tmp = sqlhelper.select_cnt('SELECT count(*) FROM wblog WHERE uid=%s' % (str(user)))
        # print(tmp)
        if tmp:
            wblog[str(user)] += int(tmp)
    write_dict_to_txt(wblog, 'data\\wblog.txt')
    """
    Sample output (uid, wblog count):
    1751565235 42
    5136420870 0
    3106192681 24
    3203825104 0
    2126474562 8
    2324752481 57
    """
    cnt = []
    for i in range(10000):
        cnt.append(i)
    wblog_cnt = init_dict(cnt, 0)
    calculate_cnt(wblog_cnt, wblog)
    write_dict_cnt_to_txt(wblog_cnt, 'data\\wblog_cnt.txt')
    """
    Sample output (wblog count, number of users):
    0 7938
    1 532
    2 336
    3 249
    4 189
    5 169
    6 151
    """
    sqlhelper.close()
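# The four helpers used in count_comment() and count_wblog() above (init_dict,
# calculate_cnt, write_dict_to_txt, write_dict_cnt_to_txt) are not defined in this
# section. The sketches below are hypothetical reconstructions inferred from how they
# are called and from the sample output in the docstrings; the real implementations
# may differ.
def init_dict(keys, value):
    # {key: value} for every key, e.g. {0: 0, 1: 0, ...}
    return {key: value for key in keys}

def calculate_cnt(cnt_dict, value_dict):
    # cnt_dict[v] counts how many entries of value_dict have value v
    for value in value_dict.values():
        if value in cnt_dict:
            cnt_dict[value] += 1

def write_dict_to_txt(d, path):
    # one "key value" pair per line, e.g. "1751565235 42"
    with open(path, 'w') as my_file:
        for key, value in d.items():
            my_file.write('%s %s\n' % (str(key), str(value)))

def write_dict_cnt_to_txt(cnt_dict, path):
    # one "value frequency" pair per line, sorted by value, e.g. "0 7938"
    with open(path, 'w') as my_file:
        for key in sorted(cnt_dict.keys()):
            my_file.write('%s %s\n' % (str(key), str(cnt_dict[key])))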
class UserClassify(object): def __init__(self, h, d, u, p, c, train_per=0.8, spammer_per=0.1, reset_dataset=False, dump=True, add_unknown_into_model=False, file_name_appendix=''): self.host = h self.db = d self.user = u self.passwd = p self.charset = c self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.registerDay = MongoClient().userFeature.registerDay self.followCnt = MongoClient().userFeature.followCnt self.oriThirdFre = MongoClient().userFeature.oriThirdFre self.retweetFre = MongoClient().userFeature.retweetFre self.rvp = MongoClient().userFeature.rvp self.train_per = train_per self.spammer_per = spammer_per self.reset_dataset = reset_dataset self.dump = dump self.add_unknown_into_model = add_unknown_into_model self.file_name_appendix = file_name_appendix def run(self): """ 从数据库中读取特征数据,并使用svm和lr分类 水军占比例(max): 0.2325521503991759 spammer_per <= 0.2325521503991759 :return: """ if not self.add_unknown_into_model: # 首先划分训练集用户和测试集用户 spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') """ final_user: 3843个用户, 水军903, 非水军2940 normal: 13906个用户, 水军和非水军未知,为此我们通过人工的方法从从这些用户中挑选了一些正常的用户,标记为choose='yes' spammer: 892个水军用户 """ for uid in final_user: if uid not in spammer: spammer.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammer: if uid in normal: normal.remove(uid) # if uid in unknown: # unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ logging.info('原始数据水军占比例(max): %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer)))) if self.spammer_per > len(spammer) * 1.0 / (len(normal) + len(spammer)): logging.info( 'we don\'t have so much spammers in our datasets, we will keep original percentage' ) else: expected_spammer_number = int(self.spammer_per * len(normal) * 1.0 / (1 - self.spammer_per)) spammer = random.sample(spammer, expected_spammer_number) # print(len(spammer)) if self.reset_dataset: train_user_set = random.sample( spammer, int( len(spammer) * self.train_per)) + random.sample( normal, int(len(normal) * self.train_per)) test_user_set = list( set(spammer + normal).difference(train_user_set)) # # 第二期改进代码 # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per)) # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int( # len(normal) * train_per))+random.sample(unknown, len(unknown)) # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown)) # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown else: train_user_set, test_user_set = Alkit.read_dataset( '../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') # 输出训练集和测试集的一些信息 logging.info('数据集总大小:%s' % (len(train_user_set) + len(test_user_set))) logging.info('训练集大小:%s' % len(train_user_set)) logging.info( '训练集中正例(spammer)大小:%s' % len(list(set(train_user_set).intersection(set(spammer))))) logging.info( '训练集中负例(normal)大小:%s' % len(list(set(train_user_set).intersection(set(normal))))) # 
logging.info('训练集中未知标签(unknown)大小:%s' % len(list(set(unknown)))) logging.info('测试集大小:%s' % len(test_user_set)) logging.info( '测试集中正例(spammer)大小:%s' % len(list(set(test_user_set).intersection(set(spammer))))) logging.info( '测试集中负例(normal)大小:%s' % len(list(set(test_user_set).intersection(set(normal))))) logging.info('水军占比例: %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer)))) """ 测试集参与训练,但是测试集在模型训练期间标签将按照unknown处理 """ else: raise ('we will implement this later.') # 将训练集和测试集从数据库中读出来,以顺序字典存储(调用vlues()输出的list顺序和插入顺序一致) feature_dict_data, result_dict_data = self.load_data( train_user_set, spammer, normal) train_feature, train_result = Alkit.process_data( feature_dict_data, result_dict_data) logging.info('训练集数据处理完毕') feature_dict_data, result_dict_data = self.load_data( test_user_set, spammer, normal) test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) logging.info('测试集数据处理完毕') # print(metrics.mutual_info_score(train_result, train_feature)) # 使用svm训练并输出结果 # logging.info('\nSVM开始训练') # model = SVC(class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # import minepy # m = minepy.MINE() # for i in range(7): # m.compute_score(train_feature[:,i], train_result) # print(m.mic()) # 使用LR训练并输出结果 logging.info('LR开始训练') model = LogisticRegression(class_weight='balanced') model.fit(train_feature, train_result) logging.info('训练结束') predict_result = model.predict(test_feature) logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR输出概率形式的结果 predict_result_proba = model.predict_proba(test_feature) prp = [] for prob in predict_result_proba: prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # 将LR跑出来的两种结果保存下来,供下一步使用 if self.dump: logging.info("保存结果输出到 " + '../main/prior/user_train' + self.file_name_appendix + '.txt 和' + '../main/prior/user_prior' + self.file_name_appendix + '.txt') Alkit.write_prior( '../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt', train_user_set, train_result, test_user_set, test_result, predict_result, prp) # 使用Random Forest训练并输出结果 # logging.info('\nRandom Forest开始训练') # model = RandomForestClassifier(n_estimators=100, class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # # importances = model.feature_importances_ # print(importances) # # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用RF输出概率形式的结果 # predict_result_proba = model.predict_proba(test_feature) # prp = [] # for prob in predict_result_proba: # prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # # 将RF跑出来的两种结果保存下来,供下一步使用 # Alkit.write_prior('prior/user_train.txt', 'prior/user_prior.txt', # train_user_set, train_result, test_user_set, test_result, predict_result, prp) # return float(metrics.f1_score(test_result, predict_result)) # feature_name = ['log_time', 'log_follower', 
'log_followee', 'fre-re', 'fre', 'follow_fre', 'onehop_fre', 'rvp_ratio'] # df = DataFrame(numpy.hstack((test_feature, test_result[:, None])), # columns=feature_name + ["class"]) # _ = seaborn.pairplot(df, vars=feature_name, hue="class", size=1.5) # plt.show() # feature_dict_data, result_dict_data = self.load_data(train_user_set + test_user_set, spammer, normal) # test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) # logging.info('数据处理完毕') # # logging.info('\nSVM开始训练-交叉验证') # model = SVC(class_weight='balanced') # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1') # logging.info('训练结束') # logging.info(res) # # logging.info('\nLR开始训练-交叉验证') # model = LogisticRegression(class_weight='balanced') # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1') # logging.info('训练结束') # logging.info(res) def evalutaion(self): """ 评价一下 :return: """ user_train_dict, user_train_list, user_prior_dict, user_prior_list = \ Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') spammer, spammer_prior, normal, normal_prior = Alkit.setSN( user_train_dict, user_prior_dict) scores = [] test_result = [] predict_result = [] for uid in user_prior_list: test_result.append(float(user_prior_dict[uid]['label'])) predict_result.append(float(user_prior_dict[uid]['prior_label'])) scores.append(float(user_prior_dict[uid]['prior'])) # print(float(metrics.f1_score(test_result, predict_result))) Evaluation.evaluation_self(scores, test_result) # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = metrics.average_precision_score(test_result, scores) logging.info('user AP:%s' % str(ap)) with open('../main/lr/user_ap' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/lr/user_roc' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) # top k precision worker_score = {} for i in range(len(scores)): worker_score[user_prior_list[i]] = scores[i] worker_score = sorted(worker_score.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/lr/res_user_top' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('type uid score precision top_k\n') worker_count_now = 0 top_k = 0 for itm in worker_score: uid = itm[0] score = itm[1] if uid in spammer: u_type = 'w' worker_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(worker_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') def load_data(self, total_set, spammer, normal, unknown=None): """ 从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法 :return: 特征字典数据,类别字典数据 """ feature_dict_data = OrderedDict() result_dict_data = OrderedDict() for uid in total_set: feature_dict_data[uid] = [ Alkit.load_data_help(self.registerDay, uid, 'log_time'), Alkit.load_data_help(self.followCnt, uid, 'log_follower'), Alkit.load_data_help(self.followCnt, uid, 'log_followee'), Alkit.load_data_help(self.oriThirdFre, uid, 'fre'), Alkit.load_data_help(self.retweetFre, uid, 'follow_fre'), Alkit.load_data_help(self.retweetFre, uid, 'onehop_fre'), Alkit.load_data_help(self.rvp, uid, 
'rvp_ratio') ] """ 现在我需要检查一下, 看看mongodb里这些json数据表是不是仅仅包含了normal和spammer而没有把unknown放进来? self.registerDay = MongoClient().userFeature.registerDay self.followCnt = MongoClient().userFeature.followCnt self.oriThirdFre = MongoClient().userFeature.oriThirdFre self.retweetFre = MongoClient().userFeature.retweetFre self.rvp = MongoClient().userFeature.rvp """ # feature_dict_data[uid] = [Alkit.load_data_help(self.followCnt, uid, 'follower_cnt'), # Alkit.load_data_help(self.followCnt, uid, 'followee_cnt'), # Alkit.load_data_help(self.followCnt, uid, 'ff'), # Alkit.load_data_help(self.followCnt, uid, 'profile'), # Alkit.load_data_help(self.rvp, uid, 'discription')] # if uid in spammer: # result_dict_data[uid] = 1 # else: # result_dict_data[uid] = -1 # 第二期改进代码 if uid in spammer: result_dict_data[uid] = 1 elif uid in normal: result_dict_data[uid] = -1 elif uid in unknown: result_dict_data[uid] = 0 # 这个地方是我自己添加的,对于标签未知的用户,设定其标签为0 return feature_dict_data, result_dict_data
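# Illustrative sketch of how the prior score saved by UserClassify.run() is formed,
# with random data standing in for the real user features (prior_scores and the
# synthetic example are not part of the original code). With labels in {-1, 1},
# predict_proba() columns follow model.classes_ ([-1, 1]), so the score
# P(spammer) - P(normal) lies in [-1, 1] and keeps the sign convention of the labels.
import numpy
from sklearn.linear_model import LogisticRegression

def prior_scores(train_feature, train_result, test_feature):
    model = LogisticRegression(class_weight='balanced')
    model.fit(train_feature, train_result)
    proba = model.predict_proba(test_feature)   # columns ordered as model.classes_
    return proba[:, 1] - proba[:, 0]            # P(label=1) - P(label=-1)

# Example with synthetic features:
# rng = numpy.random.RandomState(0)
# x = rng.rand(100, 7)
# y = numpy.where(x[:, 0] > 0.5, 1, -1)
# print(prior_scores(x[:80], y[:80], x[80:]))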
class CrowdTarget(): def __init__(self, h, d, u, p, c, file_name_appendix=''): """ 在init中将读取CrowdTarget必要的数据 """ self.host = h self.db = d self.user = u self.passwd = p self.charset = c self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.file_name_appendix = file_name_appendix # 读取训练集,以及测试集上得到的先验类别 # user_train_dict,训练集,带标签 # user_train_list,训练集,只有用户id # user_prior_dict,测试集,带ground truth标签,以及先验类别的prior标签 # user_prior_list, 测试集,只有用户id self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \ Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \ Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') # spammer,真实的spammer用户 # spammer_prior,先验类别判定后的spammer用户 # normal,真实的normal用户 # normal_prior,先验类别判定后的normal用户 # swblog,swblog_prior,wblog,wblog_prior同理 self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN( self.user_train_dict, self.user_prior_dict) self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN( self.wblog_train_dict, self.wblog_prior_dict) self.all_user = self.user_prior_list self.all_wblog = self.wblog_train_list + self.wblog_prior_list self.mdb = MongoClient( ).crowd_target # 代码原来是crowd_target,因为我数据库的名字写错了所以改成crow_target self.sqlhelper = SqlHelper() def feature_retweet_time(self): col = self.mdb.time if not col.find_one(): logging.info('retweet_time为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) cc = MongoClient().comment.comment for wblogId in self.all_wblog: if wblogId in self.swblog: col.insert_one({'wblogId': wblogId, 'spammer': 'true'}) else: col.insert_one({'wblogId': wblogId, 'spammer': 'false'}) t = self.sqlhelper.select_sql_one( 'SELECT created_at FROM wblog WHERE wblogId=%s' % str(wblogId)) if not t: t = self.sqlhelper.select_sql_one( 'SELECT created_at FROM swblog WHERE wblogId=%s' % str(wblogId)) a = time.mktime(time.strptime(t[0], '%Y/%m/%d %H:%M:%S')) res = 0.0 cnt = 0 time_list = [] try: for comment in cc.find({'wblogId': str(wblogId)}): created_at = comment['json_text']['created_at'] + ':00' if len(created_at.split('-')[0]) != 4: created_at = '2017-' + created_at b = time.mktime( time.strptime(created_at, '%Y-%m-%d %H:%M:%S')) res += b - a cnt += 1 time_list.append(res) except Exception as e: logging.error('%s. 
The wblogId is %s' % (e, str(wblogId))) if cnt != 0: col.update({'wblogId': wblogId}, {'$set': { 'mean': str(res / cnt) }}) if cnt > 3: col.update({'wblogId': wblogId}, { '$set': { 'std': str(numpy.std(numpy.array(time_list), ddof=1)) } }) col.update({'wblogId': wblogId}, { '$set': { 'skewness': str(stats.skew(numpy.array(time_list))) } }) col.update({'wblogId': wblogId}, { '$set': { 'kurtosis': str(stats.kurtosis(numpy.array(time_list))) } }) logging.info('feature_time finished') def feature_third(self): col = self.mdb.third if not col.find_one(): logging.info('retweet_third为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) third_party = ('推兔', '好保姆', '互粉派对 ', '优推推互粉', '未通过审核应用', '互粉加加', '互粉小助手', '孔明社交管理', '互粉赏金榜', '推米互粉', '多推', '互粉一族', '推兔手机版', '推啊') cc = MongoClient().comment.comment for wblogId in self.all_wblog: cnt = 0 third_cnt = 0 if wblogId in self.swblog: col.insert_one({'wblogId': wblogId, 'spammer': 'true'}) else: col.insert_one({'wblogId': wblogId, 'spammer': 'false'}) try: for comment in cc.find({'wblogId': str(wblogId)}): source = comment['json_text']['source'] if source in third_party: third_cnt += 1 cnt += 1 except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) if cnt > 1: if cnt != 0: # if third_cnt != 0: # print(wblogId) # print(float(third_cnt) / cnt) col.update( {'wblogId': wblogId}, {'$set': { 'third': str(float(third_cnt) / cnt) }}) # for wblogId in self.all_wblog: # retweet_list = [] # cnt = 0 # try: # for wid in self.sqlhelper.select_sql('SELECT wblogId FROM wblog WHERE paMid=%s' % str(wblogId)): # retweet_list.append(wid[0]) # for wid in self.sqlhelper.select_sql('SELECT wblogId FROM wblog WHERE orMid=%s' % str(wblogId)): # if wid[0] not in retweet_list: # retweet_list.append(wid[0]) # # print(retweet_list) # # print(len(retweet_list)) # for wid in retweet_list: # res = self.sqlhelper.select_sql_one('SELECT source FROM wblog WHERE wblogId=%s' % str(wid)) # if len(res) == 0: # continue # source = res[0] # if source in third_party: # cnt += 1 # except Exception as e: # logging.error('%s. The wblogId is %s' % (e, str(wblogId))) # # if len(retweet_list) > 1: # if cnt != 0: # print(wblogId) # print(float(cnt) / len(retweet_list)) def feature_ur(self): col = self.mdb.ur if not col.find_one(): logging.info('retweet_ur为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) total_user = [] for uid in self.sqlhelper.select_sql('SELECT uid FROM spammer'): total_user.append(str(uid[0])) for uid in self.sqlhelper.select_sql('SELECT uid FROM normal'): if str(uid[0]) not in total_user: total_user.append(str(uid[0])) cc = MongoClient().comment.comment process_cnt = 0.0 for wblogId in self.all_wblog: cnt = 0 follow_cnt = 0 if wblogId in self.swblog: col.insert_one({'wblogId': wblogId, 'spammer': 'true'}) else: col.insert_one({'wblogId': wblogId, 'spammer': 'false'}) poster_uid = self.sqlhelper.select_sql_first( 'SELECT uid FROM swblog WHERE wblogId=%s' % str(wblogId)) if poster_uid == -1: poster_uid = self.sqlhelper.select_sql_first( 'SELECT uid FROM wblog WHERE wblogId=%s' % str(wblogId)) try: for comment in cc.find({'wblogId': str(wblogId)}): uid = comment['json_text']['user']['id'] if str(uid) in total_user: cnt += 1 for followeeUid in self.sqlhelper.select_sql( 'SELECT followeeUid FROM edge1516 WHERE uid=%s' % str(uid)): if str(followeeUid[0]) == str(poster_uid): follow_cnt += 1 break except Exception as e: logging.error('%s. 
The wblogId is %s' % (e, str(wblogId))) process_cnt += 1.0 print('processing:%s' % str(process_cnt / len(self.all_wblog))) if cnt > 1: if cnt != 0: # if follow_cnt != 0: # print(wblogId) # print(float(follow_cnt) / cnt) col.update({'wblogId': wblogId}, {'$set': { 'ur': str(float(follow_cnt) / cnt) }}) def feature_click(self): col = self.mdb.click if not col.find_one(): logging.info('click为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) ws = MongoClient().wblog.swblog ww = MongoClient().wblog.wblog for wblogId in self.all_wblog: if wblogId in self.swblog: pass else: wblog = ww.find_one({'wblogId': str(wblogId)}) content = wblog['json_text']['text'] if 'ttarticle' in content: print('https:' + content.split('ttarticle')[0].split(':')[-1] + 'ttarticle' + content.split('ttarticle')[1].split('&')[0]) for wblog in ws.find(): content = wblog['json_text']['text'] if 'ttarticle' in content: print('https:' + content.split('ttarticle')[0].split(':')[-1] + 'ttarticle' + content.split('ttarticle')[1].split('&')[0]) def run(self, train_per=0.8, reset_dataset=False): """ 从数据库中读取特征数据,并使用adaboost分类 :return: """ # 首先划分训练集微博和测试集微博 swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog') wblog = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM wblog_choose') final_wblog = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM final_wblog WHERE spammer="yes"') for wblogId in final_wblog: if wblogId not in swblog: swblog.append(wblogId) for uid in swblog: if uid in wblog: wblog.remove(uid) train_wblog_set, test_wblog_set = Alkit.read_dataset( '../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') # 输出训练集和测试集的一些信息 logging.info('训练集大小:%s' % len(train_wblog_set)) logging.info('训练集中正例(swblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(swblog))))) logging.info('训练集中负例(wblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(wblog))))) logging.info('测试集大小:%s' % len(test_wblog_set)) logging.info('测试集中正例(swblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(swblog))))) logging.info('测试集中负例(wblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(wblog))))) # print('279 train_wblog_set \n', train_wblog_set) # print('279 swblog \n', swblog) # print('279 wblog \n', wblog) # 将训练集和测试集从数据库中读出来,以顺序字典存储(调用vlues()输出的list顺序和插入顺序一致) feature_dict_data, result_dict_data = self.load_data( train_wblog_set, swblog, wblog) # print('281 feature_dict_data ', feature_dict_data) # [('4033482998743585', [nan, nan, nan, nan, nan]), # print('282 result_dict_data', result_dict_data) # [('4033482998743585', 1), ('3914608449995325', 1), train_feature, train_result = Alkit.process_data( feature_dict_data, result_dict_data) logging.info('训练集数据处理完毕') feature_dict_data, result_dict_data = self.load_data( test_wblog_set, swblog, wblog) test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) logging.info('测试集数据处理完毕') # 使用ad-boost训练并输出结果 logging.info('\nAdaBoost开始训练') model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5), algorithm="SAMME", n_estimators=100, learning_rate=0.5) model.fit(train_feature, train_result) logging.info('训练结束') predict_result = model.predict(test_feature) logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) predict_result_proba = 
model.predict_proba(test_feature) prp = [] for prob in predict_result_proba: prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) Alkit.write_prior( '../main/crowd_target/wblog_train' + self.file_name_appendix + '.txt', '../main/crowd_target/wblog_prior' + self.file_name_appendix + '.txt', train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp) def evalutaion(self): """ 评价一下 :return: """ wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \ Alkit.read_prior('../main/crowd_target/wblog_train' + self.file_name_appendix + '.txt', '../main/crowd_target/wblog_prior' + self.file_name_appendix + '.txt') swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN( wblog_train_dict, wblog_prior_dict) scores = [] test_result = [] predict_result = [] for uid in wblog_prior_list: test_result.append(float(wblog_prior_dict[uid]['label'])) predict_result.append(float(wblog_prior_dict[uid]['prior_label'])) scores.append(float(wblog_prior_dict[uid]['prior'])) Evaluation.evaluation_self(scores, test_result) # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = metrics.average_precision_score(test_result, scores) logging.info('wblog AP:%s' % str(ap)) with open( '../main/crowd_target/wblog_ap' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr))) with open( '../main/crowd_target/wblog_roc' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) # top k precision wblog_score = {} for i in range(len(scores)): wblog_score[wblog_prior_list[i]] = scores[i] wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True) with open( '../main/crowd_target/res_wblog_top' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('type wblogId score precision top_k\n') wblog_count_now = 0 top_k = 0 for itm in wblog_score: uid = itm[0] score = itm[1] if uid in swblog: u_type = 's' wblog_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(wblog_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') def load_data(self, total_set, swblog, wblog): """ 从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法 :return: 特征字典数据,类别字典数据 total_set=train_wblog_set, ['4033482998743585', '3914608449995325', swblog=swblog, ['4045047554826553', '4039829169862097', wblog=wblog, ['4032096583879003', '4054839190956692', """ feature_dict_data = OrderedDict() result_dict_data = OrderedDict() for wblogId in total_set: feature_dict_data[wblogId] = [ Alkit.load_data_help_w(self.mdb.time, wblogId, 'mean'), Alkit.load_data_help_w(self.mdb.time, wblogId, 'std'), Alkit.load_data_help_w(self.mdb.time, wblogId, 'skewness'), Alkit.load_data_help_w(self.mdb.time, wblogId, 'kurtosis'), Alkit.load_data_help_w(self.mdb.third, wblogId, 'third') ] if wblogId in swblog: result_dict_data[wblogId] = 1 else: result_dict_data[wblogId] = -1 # print("388 feature_dict_data\n", feature_dict_data) return feature_dict_data, result_dict_data
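# Standalone sketch of the CrowdTarget-style classifier used in run() above, mirroring
# its AdaBoost parameters but trained on synthetic data only (the real pipeline feeds
# the five per-wblog retweet features: mean, std, skewness, kurtosis, third-party ratio).
# adaboost_demo and the synthetic labels are illustrative, not part of the original code.
import numpy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

def adaboost_demo():
    rng = numpy.random.RandomState(0)
    feature = rng.rand(400, 5)                                      # stand-in for the 5 features
    label = numpy.where(feature[:, 0] + feature[:, 4] > 1.0, 1, -1)
    model = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
        algorithm='SAMME', n_estimators=100, learning_rate=0.5)
    model.fit(feature[:300], label[:300])
    predict = model.predict(feature[300:])
    print('F1:', metrics.f1_score(label[300:], predict))

# adaboost_demo()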
def sentiment():
    """
    Collect data for plotting sentiment polarity.
    :return:
    """
    sqlhelper = SqlHelper()
    swblog = sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
    wblog = sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')
    final_wblog = sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
    for wblogId in final_wblog:
        if wblogId not in swblog:
            swblog.append(wblogId)
    for wblogId in swblog:
        if wblogId in wblog:
            wblog.remove(wblogId)
    all_wblog = swblog + wblog

    swblog_sentiment_dict = {}
    swblog_comment_cnt = 0
    wblog_sentiment_dict = {}
    wblog_comment_cnt = 0

    # Some comments are very short or contain no words at all.
    # Such comments are excluded from the sentiment computation.
    # The filter: tokenize the text and drop it if nothing is left after removing stop words.
    stop_words = WblogFeature.get_stop_words('stop_words.txt')
    cc = MongoClient().comment.comment
    for wblogId in all_wblog:
        corpus = []
        try:
            for comment in cc.find({'wblogId': str(wblogId)}):
                text = WblogFeature.remove_html(comment['json_text']['text'])
                text = WblogFeature.remove_tag(text)
                fenci = list(jieba.cut_for_search(text))
                if len(fenci) == 0:
                    continue
                # jieba does not expose a stop-word interface, so stop words are removed by hand
                stop_cnt = 0
                for word in fenci:
                    if word in stop_words:
                        stop_cnt += 1
                if stop_cnt == len(fenci):
                    continue
                corpus.append(text)
        except Exception as e:
            logging.error('%s. The wblogId is %s' % (e, str(wblogId)))

        if wblogId in swblog:
            swblog_comment_cnt += len(corpus)
            for text in corpus:
                sen = round(float(snownlp.SnowNLP(text).sentiments), 1)
                if sen not in swblog_sentiment_dict.keys():
                    swblog_sentiment_dict[sen] = 0
                swblog_sentiment_dict[sen] += 1
        else:
            wblog_comment_cnt += len(corpus)
            for text in corpus:
                sen = round(float(snownlp.SnowNLP(text).sentiments), 1)
                if sen not in wblog_sentiment_dict.keys():
                    wblog_sentiment_dict[sen] = 0
                wblog_sentiment_dict[sen] += 1

    with open('swblog_sentiment.txt', 'w') as my_file:
        for key in swblog_sentiment_dict.keys():
            my_file.write(str(key) + ' ' + str(float(swblog_sentiment_dict[key]) / swblog_comment_cnt) + '\n')
    with open('wblog_sentiment.txt', 'w') as my_file:
        for key in wblog_sentiment_dict.keys():
            my_file.write(str(key) + ' ' + str(float(wblog_sentiment_dict[key]) / wblog_comment_cnt) + '\n')
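# Minimal sketch of the per-comment sentiment binning used in sentiment() above
# (assumes the third-party snownlp package; sentiment_histogram and the example
# strings are illustrative, not part of the original code):
import snownlp

def sentiment_histogram(comments):
    histogram = {}
    for text in comments:
        sen = round(float(snownlp.SnowNLP(text).sentiments), 1)  # 0.0 (negative) .. 1.0 (positive)
        histogram[sen] = histogram.get(sen, 0) + 1
    total = float(sum(histogram.values())) or 1.0
    return {sen: cnt / total for sen, cnt in histogram.items()}

# sentiment_histogram(['这个产品真不错', '太差了,不推荐'])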
class WblogFeature: pattern_html = re.compile(r'<[^>]+>', re.S) pattern_tag = re.compile(r'#.+#', re.S) def __init__(self, h, d, u, p, c): self.host = h self.db = d self.user = u self.passwd = p self.charset = c def __enter__(self): self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.mdb = MongoClient().wblogFeature self.swblog = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM swblog') self.wblog = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM final_wblog WHERE spammer="no"') self.unknown = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM wblog') final_wblog = self.sqlhelper.select_sql_one( 'SELECT wblogId FROM final_wblog WHERE spammer="yes"') for wblogId in final_wblog: if wblogId not in self.swblog: self.swblog.append(wblogId) # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in self.swblog: if uid in self.wblog: self.wblog.remove(uid) # print(len(swblog)) for uid in self.swblog: if uid in self.unknown: self.unknown.remove(uid) for uid in self.wblog: if uid in self.unknown: self.unknown.remove(uid) return self def __exit__(self, exc_type, exc_val, exc_tb): self.sqlhelper.close() def setCommentSimilarity(self): """ 计算评论的文本相似度 将某一条微博下面的所有评论作为语料空间,然后计算基于tf-idf的文本余弦相似度 :return: none """ col = self.mdb.commentSimilarity if not col.find_one(): logging.info('commentSimilarity为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"') # all_wblog = swblog + wblog swblog = self.swblog wblog = self.wblog unknown = self.unknown all_wblog = swblog + wblog + unknown # 将“转发微博”这四个字加入了停用词表 stop_words = WblogFeature.get_stop_words( os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt') vectorizer = CountVectorizer( stop_words=stop_words ) # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频 cc = MongoClient().comment.comment for wblogId in all_wblog: corpus = [] try: for comment in cc.find({'wblogId': str(wblogId)}): text = self.remove_html(comment['json_text']['text']) # 太短的文本很有可能去停用词后没有 有意义的内容,所以直接不计入计算 if len(text) <= 4: continue if wblogId in wblog: text = self.remove_tag(text) corpus.append(' '.join(jieba.cut_for_search(text))) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) cos_sum = 0.0 cos_cnt = 0 try: # 第一个fit_transform是计算tf-idf,第二个fit_transform是将文本转为词频矩阵 tfidf = TfidfTransformer().fit_transform( vectorizer.fit_transform(corpus)) weight = tfidf.toarray( ) # 将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重 # 计算每两条评论间的余弦相似度 for i in range(len(weight)): for j in range(len(weight)): if i == j: continue cos_sum += WblogFeature.cos(weight[i], weight[j]) cos_cnt += 1 cos_avg = cos_sum / float(cos_cnt) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) cos_avg = 0.0 try: if wblogId in swblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'true', 'comment_similarity': cos_avg }) elif wblogId in wblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'false', 'comment_similarity': cos_avg }) elif wblogId in unknown: col.insert_one({ 'wblogId': wblogId, 'swblog': 'unknown', 'comment_similarity': cos_avg }) except Exception as e: logging.error('%s. 
The wblogId is %s' % (e, str(wblogId))) logging.info('setCommentSimilarity finished') def setSentimentSimilarity(self): """ 计算评论文本的情感相似度 使用snownlp(背后是朴素贝叶斯方法)来判断评论的情感,从0(消极)~1(积极)分布,然后计算其标准差 有待改进:分类精度问题,即目前的情感分类的工具的都很笨,对于复杂一点的句式就不行了,也许用自己以前的可能更好 :return: none """ col = self.mdb.sentimentSimilarity if not col.find_one(): logging.info('sentimentSimilarity为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"') # all_wblog = swblog + wblog swblog = self.swblog wblog = self.wblog unknown = self.unknown all_wblog = swblog + wblog + unknown # 有一些评论很短或者没有字之类的 # 对于这些微博,不参与计算情感极性 # 过滤的方法是分词后判断去除一个词都不剩下的文本 stop_words = WblogFeature.get_stop_words( os.path.dirname(os.getcwd()) + '/microblog/stop_words.txt') cc = MongoClient().comment.comment for wblogId in all_wblog: corpus = [] try: for comment in cc.find({'wblogId': str(wblogId)}): text = self.remove_html(comment['json_text']['text']) text = self.remove_tag(text) fenci = list(jieba.cut_for_search(text)) if len(fenci) == 0: continue # 由于jieba分词没有提供去停用词的接口,所以手动去停用词 stop_cnt = 0 for word in fenci: if word in stop_words: stop_cnt += 1 if stop_cnt == len(fenci): continue corpus.append(text) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) std = 0.0 if len(corpus) > 3: sentiment_list = [] for text in corpus: sentiment_list.append(snownlp.SnowNLP(text).sentiments) std = numpy.std(numpy.array(sentiment_list), ddof=1) try: if wblogId in swblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'true', 'sentiment_similarity': std }) elif wblogId in wblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'false', 'sentiment_similarity': std }) elif wblogId in unknown: col.insert_one({ 'wblogId': wblogId, 'swblog': 'unknown', 'sentiment_similarity': std }) except Exception as e: logging.error('%s. 
The wblogId is %s' % (e, str(wblogId))) logging.info('setSentimentSimilarity finished') def setSpamWords(self): """ 从众包营销微博下面的评论中抽取关键词,即tf-idf排名前十的词 这样对于每一条微博,都能生成十维特征,每一维特征的计算方式为 :return: """ col = self.mdb.spamWords if not col.find_one(): logging.info('spamWords为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"') # all_wblog = swblog + wblog swblog = self.swblog wblog = self.wblog unknown = self.unknown all_wblog = swblog + wblog + unknown # 有一些评论很短或者没有字之类的 # 对于这些微博,不参与计算情感极性 # 过滤的方法是分词后判断去除一个词都不剩下的文本 stop_words = WblogFeature.get_stop_words( os.path.dirname(os.getcwd()) + '\\microblog\\stop_words.txt') cc = MongoClient().comment.comment pass def setCommentInteractRatio(self): """ 计算给定微博下面的评论之间的互动频率 = 与其他人互动的评论的条数 / 总评论条数 如何确定是一条互动评论:就简单地看有没有reply_id这个字段,还有@ :return: none """ col = self.mdb.commentInteractRatio if not col.find_one(): logging.info('commentInteractRatio为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"') # all_wblog = swblog + wblog swblog = self.swblog wblog = self.wblog unknown = self.unknown all_wblog = swblog + wblog + unknown cc = MongoClient().comment.comment for wblogId in all_wblog: comment_cnt = 0 interact_cnt = 0 try: for comment in cc.find({'wblogId': str(wblogId)}): if 'reply_id' in comment['json_text'].keys(): interact_cnt += 1 continue # text = comment['json_text']['text'] # if '>@' in text: # interact_cnt += 1 comment_cnt += 1 except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) if comment_cnt == 0: interact_ratio = 0.0 else: interact_ratio = float(interact_cnt) / float(comment_cnt) try: if wblogId in swblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'true', 'interact_ratio': interact_ratio }) elif wblogId in wblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'false', 'interact_ratio': interact_ratio }) elif wblogId in unknown: col.insert_one({ 'wblogId': wblogId, 'swblog': 'unknown', 'interact_ratio': interact_ratio }) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) def setHotCommentRatio(self): """ 计算给定微博的评论中的点赞数与评论数的比例 :return: none """ col = self.mdb.hotCommentRatio if not col.find_one(): logging.info('hotCommentRatio为空,设置主键为wblogId') col.create_index([('wblogId', pymongo.DESCENDING)], unique=True) # swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') # wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="no"') # all_wblog = swblog + wblog swblog = self.swblog wblog = self.wblog unknown = self.unknown all_wblog = swblog + wblog + unknown cc = MongoClient().comment.comment for wblogId in all_wblog: comment_cnt = 0 hot_cnt = 0 try: for comment in cc.find({'wblogId': str(wblogId)}): if comment['json_text']['like_counts'] == '': comment_cnt += 1 else: hot_cnt += int(comment['json_text']['like_counts']) comment_cnt += 1 except Exception as e: logging.error('%s. 
The wblogId is %s' % (e, str(wblogId))) if comment_cnt == 0: hot_ratio = 0.0 else: hot_ratio = float(hot_cnt) / float(comment_cnt) try: if wblogId in swblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'true', 'hot_ratio': hot_ratio }) elif wblogId in wblog: col.insert_one({ 'wblogId': wblogId, 'swblog': 'false', 'hot_ratio': hot_ratio }) elif wblogId in unknown: col.insert_one({ 'wblogId': wblogId, 'swblog': 'unknown', 'hot_ratio': hot_ratio }) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) @staticmethod def remove_html(text): """ 去除文本中的html :return: 去除html后的文本 """ return WblogFeature.pattern_html.sub('', text) @staticmethod def remove_tag(text): """ 去除文本中的标签文本 :return: 去除标签文本后的文本 """ return WblogFeature.pattern_tag.sub('', text) @staticmethod def remove_html_complete(text): """ 去除文本中的html,并提取其中的表情符号 :return: list[去除html后的文本,表情1,表情2...] """ pass @staticmethod def get_stop_words(file_path): """ 读取停用词文件 :return: 停用词list """ stop_words = [] with open(file_path, 'r', encoding='utf-8') as my_file: for line in my_file: stop_words.append(line.split('\n')[0]) return stop_words @staticmethod def cos(vector1, vector2): """ 计算余弦相似度 :param vector1: :param vector2: :return: 余弦相似度 """ dot_product = 0.0 norm_a = 0.0 norm_b = 0.0 for a, b in zip(vector1, vector2): dot_product += a * b norm_a += a**2 norm_b += b**2 if norm_a == 0.0 or norm_b == 0.0: return 0.0 else: return dot_product / ((norm_a * norm_b)**0.5)
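# Standalone sketch of the average pairwise TF-IDF cosine similarity computed in
# WblogFeature.setCommentSimilarity above (assumes jieba and scikit-learn; the helper
# name and the example comments are made up, and no stop-word list is applied here):
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def average_comment_similarity(comments):
    corpus = [' '.join(jieba.cut_for_search(text)) for text in comments]
    tfidf = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(corpus))
    weight = tfidf.toarray()
    cos_sum, cos_cnt = 0.0, 0
    for i in range(len(weight)):
        for j in range(len(weight)):
            if i == j:
                continue
            cos_sum += WblogFeature.cos(weight[i], weight[j])
            cos_cnt += 1
    return cos_sum / cos_cnt if cos_cnt else 0.0

# average_comment_similarity(['转发微博', '转发微博', '已关注,求互粉'])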
class WblogClassify(object): def __init__(self, h, d, u, p, c, train_per=0.8, spam_per=0.1, reset_dataset=False, dump=True, add_unknown_into_model=False,file_name_appendix=''): self.host = h self.db = d self.user = u self.passwd = p self.charset = c self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.commentSimilarity = MongoClient().wblogFeature.commentSimilarity self.sentimentSimilarity = MongoClient().wblogFeature.sentimentSimilarity self.commentInteractRatio = MongoClient().wblogFeature.commentInteractRatio self.hotCommentRatio = MongoClient().wblogFeature.hotCommentRatio self.train_per = train_per self.spam_per = spam_per self.reset_dataset = reset_dataset self.dump = dump self.add_unknown_into_model = add_unknown_into_model self.file_name_appendix = file_name_appendix def run(self): """ 从数据库中读取特征数据,并使用svm和lr分类 :return: """ if not self.add_unknown_into_model: swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog') wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose') final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') for wblogId in final_wblog: if wblogId not in swblog: swblog.append(wblogId) # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in swblog: if uid in wblog: wblog.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 swblog: 水军 wblog: 正常用户 unkonwn:还没来得及标注的未知类型微博 """ logging.info('原始数据spam占比例(max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog)))) if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)): logging.info('we don\'t have so much spams in our datasets, we will keep original percentage') else: expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per)) swblog = random.sample(swblog, expected_spam_number) if self.reset_dataset: train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + random.sample(wblog, int( len(wblog) * self.train_per)) test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set)) # # 第二期改进代码 # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per)) # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int( # len(normal) * train_per))+random.sample(unknown, len(unknown)) # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown)) # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown else: train_wblog_set, test_wblog_set = Alkit.read_dataset( '../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') # 输出训练集和测试集的一些信息 logging.info('总数据集大小:%s' % (len(train_wblog_set)+len(test_wblog_set))) logging.info('训练集大小:%s' % len(train_wblog_set)) logging.info('训练集中正例(swblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(swblog))))) logging.info('训练集中负例(wblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(wblog))))) logging.info('测试集大小:%s' % len(test_wblog_set)) logging.info('测试集中正例(swblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(swblog))))) logging.info('测试集中负例(wblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(wblog))))) else: raise ('we will implement this later.') # 将训练集和测试集从数据库中读出来,以顺序字典存储(调用vlues()输出的list顺序和插入顺序一致) feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog) train_feature, train_result = 
Alkit.process_data(feature_dict_data, result_dict_data) logging.info('训练集数据处理完毕') feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog) test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) logging.info('测试集数据处理完毕') # 使用svm训练并输出结果 # logging.info('\nSVM开始训练') # model = SVC(class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR训练并输出结果 logging.info('LR开始训练') model = LogisticRegression(class_weight='balanced') model.fit(train_feature, train_result) logging.info('训练结束') predict_result = model.predict(test_feature) logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR输出概率形式的结果 predict_result_proba = model.predict_proba(test_feature) prp = [] for prob in predict_result_proba: prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # 将LR跑出来的两种结果保存下来,供下一步使用 if self.dump: logging.info("保存结果输出到 " + '../main/prior/wblog_train' + self.file_name_appendix + '.txt' + "和" + '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt', train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp) # 使用Random Forest训练并输出结果 # logging.info('\nRandom Forest开始训练') # model = RandomForestClassifier(n_estimators=100, class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # # importances = model.feature_importances_ # print(importances) # # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用RF输出概率形式的结果 # predict_result_proba = model.predict_proba(test_feature) # prp = [] # for prob in predict_result_proba: # prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # # 将RF跑出来的两种结果保存下来,供下一步使用 # Alkit.write_prior('prior/wblog_train.txt', 'prior/wblog_prior.txt', # train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp) # return float(metrics.f1_score(test_result, predict_result)) # feature_name = ['log_time', 'log_follower', 'log_followee', 'fre-re', 'fre', 'follow_fre', 'onehop_fre', 'rvp_ratio'] # df = DataFrame(numpy.hstack((test_feature, test_result[:, None])), # columns=feature_name + ["class"]) # _ = seaborn.pairplot(df, vars=feature_name, hue="class", size=1.5) # plt.show() # feature_dict_data, result_dict_data = self.load_data(train_wblog_set + test_wblog_set, swblog, wblog) # test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) # logging.info('数据处理完毕') # # logging.info('\nSVM开始训练-交叉验证') # model = SVC(class_weight='balanced') # res = cross_val_score(model, test_feature, test_result, cv=5, scoring='f1') # logging.info('训练结束') # logging.info(res) # # logging.info('\nLR开始训练-交叉验证') # model = LogisticRegression(class_weight='balanced') # res = cross_val_score(model, test_feature, test_result, cv=5, 
scoring='f1') # logging.info('训练结束') # logging.info(res) def evalutaion(self): """ 评价一下 :return: """ wblog_train_dict, wblog_train_list, wblog_prior_dict, wblog_prior_list = \ Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') swblog, swblog_prior, nwblog, nwblog_prior = Alkit.setSN(wblog_train_dict, wblog_prior_dict) scores = [] test_result = [] predict_result = [] for uid in wblog_prior_list: test_result.append(float(wblog_prior_dict[uid]['label'])) predict_result.append(float(wblog_prior_dict[uid]['prior_label'])) scores.append(float(wblog_prior_dict[uid]['prior'])) # print(float(metrics.f1_score(test_result, predict_result))) Evaluation.evaluation_self(scores, test_result) # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = metrics.average_precision_score(test_result, scores) logging.info('wblog AP:%s' % str(ap)) with open('../main/lr/wblog_ap'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/lr/wblog_roc'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) # top k precision wblog_score = {} for i in range(len(scores)): wblog_score[wblog_prior_list[i]] = scores[i] wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/lr/res_wblog_top'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('type wblogId score precision top_k\n') wblog_count_now = 0 top_k = 0 for itm in wblog_score: uid = itm[0] score = itm[1] if uid in swblog: u_type = 's' wblog_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(wblog_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') def load_data(self, total_set, swblog, wblog, unknown=None): """ 从数据库读取数据,因为训练集和测试集读取的操作一样,所以单独写一个方法 :return: 特征字典数据,类别字典数据 """ feature_dict_data = OrderedDict() result_dict_data = OrderedDict() for wblogId in total_set: feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'), Alkit.load_data_help_w(self.sentimentSimilarity, wblogId, 'sentiment_similarity'), Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'), Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')] # feature_dict_data[wblogId] = [Alkit.load_data_help_w(self.commentSimilarity, wblogId, 'comment_similarity'), # Alkit.load_data_help_w(self.commentInteractRatio, wblogId, 'interact_ratio'), # Alkit.load_data_help_w(self.hotCommentRatio, wblogId, 'hot_ratio')] if wblogId in swblog: result_dict_data[wblogId] = 1 elif wblogId in wblog: result_dict_data[wblogId] = -1 elif wblogId in unknown: result_dict_data[wblogId] = 0 return feature_dict_data, result_dict_data
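# --- Illustrative sketch (assumption, not part of the original module) ----
# WblogClassify.run above maps predict_proba onto a signed score in [-1, 1]
# before writing the prior files: with labels {-1, +1}, sklearn orders the
# predict_proba columns by model.classes_, so the prp score amounts to
# P(y = +1) - P(y = -1). `_prior_scores` is a hypothetical helper that
# isolates just that step.
from sklearn.linear_model import LogisticRegression

def _prior_scores(train_feature, train_result, test_feature):
    """Return signed prior scores in [-1, 1] for the test samples (sketch)."""
    model = LogisticRegression(class_weight='balanced')
    model.fit(train_feature, train_result)
    proba = model.predict_proba(test_feature)   # columns follow model.classes_
    neg = list(model.classes_).index(-1)
    pos = list(model.classes_).index(1)
    return proba[:, pos] - proba[:, neg]

# Scores near +1 mean "likely spam wblog", near -1 "likely normal wblog";
# these are the prior values that MSCA later reads back from wblog_prior*.txt.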
class MSCA(object): def __init__(self, h, d, u, p, c, file_name_appendix=''): """ 在init中将读取msca必要的数据 """ self.host = h self.db = d self.user = u self.passwd = p self.charset = c self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.file_name_appendix = file_name_appendix # 读取训练集,以及测试集上得到的先验类别 # user_train_dict,训练集,带标签 # user_train_list,训练集,只有用户id # user_prior_dict,测试集,带ground truth标签,以及先验类别的prior标签 # user_prior_list, 测试集,只有用户id self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \ Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \ Alkit.read_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') # self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \ # Alkit.read_prior('prior_bak/user_train.txt', 'prior_bak/user_prior.txt') # self.wblog_train_dict, self.wblog_train_list, self.wblog_prior_dict, self.wblog_prior_list = \ # Alkit.read_prior('prior_bak/wblog_train.txt', 'prior_bak/wblog_prior.txt') # spammer,真实的spammer用户 # spammer_prior,先验类别判定后的spammer用户 # normal,真实的normal用户 # normal_prior,先验类别判定后的normal用户 # swblog,swblog_prior,wblog,wblog_prior同理 self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN( self.user_train_dict, self.user_prior_dict) self.swblog, self.swblog_prior, self.nwblog, self.nwblog_prior = Alkit.setSN( self.wblog_train_dict, self.wblog_prior_dict) self.all_user = self.user_train_list + self.user_prior_list self.all_wblog = self.wblog_train_list + self.wblog_prior_list self.follow_edge = {} # {'uid': ['followeeUid']} self.follow_cnt = {} # {'uid': follow count} self.retweet_edge = {} # {'uid': ['wblogId']} self.wblog_retweet_cnt = {} # {wblogId: retweet count} self.user_retweet_cnt = {} # {uid: retweet count} def loadFollowRelationship(self, workers=8): """ 读取用户间的关注关系 :return: none """ # 读取用户间关注关系 # 注意spammer关注normal的边需要去除 # 去除包括user_train里面的这种边 以及 user_prior里面的这种边(user_prior里面根据prior_label来确定) logging.info('多进程读取关注关系') self.follow_edge = set_follow_edge(self.all_user, self.all_user, self.spammer_prior, self.normal_prior, workers=workers) print("注意啦!!!!!") len(list(self.follow_edge.keys())) len(self.all_user) import operator print(operator.eq(list(self.follow_edge.keys()), self.all_user)) """ 下面一段的注视是原来的代码,因为速度太慢, 我将其改造成了上面的多进程形式 """ # logging.info('loading FollowRelationship') # for uid in self.all_user[0:8]: # self.follow_edge[uid] = [] # for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid): # uid = str(result[0]) # followeeUid = str(result[1]) # if followeeUid not in self.all_user: # continue # if uid in self.spammer_prior and followeeUid in self.normal_prior: # continue # self.follow_edge[uid].append(followeeUid) # print('180 ', self.follow_edge) # # import operator # print(operator.eq(follow_edge,self.follow_edge)) # print(follow_edge) # print(len(follow_edge)) # print(self.follow_edge) # print(len(self.follow_edge)) # 统计每个用户的关注数,方便后面的计算 # 这里就统计这三千多个用户中的,就不统计总的粉丝数了 for uid in self.all_user: self.follow_cnt[uid] = 0 for uid in self.follow_edge.keys(): self.follow_cnt[uid] += len(self.follow_edge[uid]) logging.info('多进程读取关注关系处理结束!') def loadRetweetRelationship(self, workers=8): """ 读取用户与微博间的转发关系 以及 微博的转发数 和 
用户的转发数 :return: none """ # 读取转发关系 # 注意除了wblog表中三个月的微博数据外,还需要考虑spammer对于swblog的转发 # 本来想根据提交的众包任务来确定spammer与swblog的转发关系的,但是刚发现不行,不行的原因有两点: # 1.mission表中没有wblogId,只有微博短id,无法匹配,好像我之前确定swblog的wblogId的时候是一条条人工记录的 # 2.有一些水军提交任务的时候是浑水摸鱼的,可能啥都没干,也可能贴的错误的回复 # 所以换一种方法 # 之前爬评论的时候专门爬取了swblog的评论,就将评论了swblog的用户全部当做转发了 logging.info('多进程读取转发关系') # 3884个用户全部处理完大概需要30min self.retweet_edge = set_retweet_edge(self.all_user, self.all_wblog, workers=workers) """ 下面一段的注视是原来的代码,因为速度太慢, 我将其改造成了上面的多进程形式 """ # logging.info('non-process!') # uid_count = 0 # for uid in self.all_user[0:80]: # # for uid in all_user_sample: # uid_count = uid_count + 1 # if uid_count % 500 == 0: # logging.info("outerloop: {}/{}={}%".format(str(uid_count), str(len(self.all_user)), # str(100.0 * uid_count / len(self.all_user)))) # self.retweet_edge[uid] = [] # for res in self.sqlhelper.select_sql('SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid): # paMid = str(res[0]) # orMid = str(res[1]) # if paMid in self.all_wblog: # self.retweet_edge[uid].append(paMid) # if orMid in self.all_wblog: # self.retweet_edge[uid].append(orMid) # import operator # # print(operator.eq(retweet_edge, self.retweet_edge)) logging.info("retweet_edge...") mdb = MongoClient().comment.comment for wblogId in self.swblog: # for wblogId in sw_sample: for res in mdb.find({'wblogId': wblogId}): try: uid = res['json_text']['user']['id'] if uid in self.retweet_edge.keys(): if wblogId not in self.retweet_edge[uid]: self.retweet_edge[uid].append(wblogId) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId))) logging.info('读取微博的转发数') # 读取每条微博的转发数,方便后面计算用户间的联系强度 # print(len(self.all_wblog)) for wblogId in self.all_wblog: self.wblog_retweet_cnt[wblogId] = 0 for uid in self.retweet_edge.keys(): for wblogId in self.retweet_edge[uid]: self.wblog_retweet_cnt[wblogId] += 1 # # 下面是统计一条微博总的转发数,也即转发数会很大 # # mdb1 = MongoClient().wblog.wblog # mdb2 = MongoClient().wblog.swblog # suc=0 # fail=0 # logging.info('测试点!') # for wblogId in self.all_wblog: # try: # wblog = mdb1.find_one({'wblogId': wblogId}) # self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count']) # wblog = mdb2.find_one({'wblogId': wblogId}) # self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count']) # suc = suc + 1 # print("LINE:172 | suc: ", suc, "fail: ", fail) # except Exception as e: # fail=fail+1 # # logging.error('%s. The wblogId is %s' % (e, str(wblogId))) # logging.error('success %s, fail %s' %(str(suc),str(fail))) # mdb = MongoClient().wblog.wblog # # suc = 0 # fail = 0 # for wblogId in self.nwblog: # try: # wblog = mdb.find_one({'wblogId': wblogId}) # self.wblog_retweet_cnt[wblogId] = int(wblog['json_text']['reposts_count']) # suc = suc + 1 # except Exception as e: # fail = fail + 1 # # print("LINE:187 | suc: ", suc, "fail: ", fail) # logging.error('%s. The wblogId is %s' % (e, str(wblogId))) # logging.error('for wblogId in self.nwblog... success %s, fail %s' % (str(suc), str(fail))) # mdb = MongoClient().wblog.swblog # # suc = 0 # fail = 0 # for wblog in mdb.find(): # try: # self.wblog_retweet_cnt[wblog['json_text']['id']] = wblog['json_text']['reposts_count'] # suc = suc + 1 # except Exception as e: # fail = fail + 1 # # print("LINE:199 | suc: ", suc, "fail: ", fail) # # logging.error('%s.' % e) # logging.error('or wblog in mdb.find():... 
success %s, fail %s' % (str(suc), str(fail))) logging.info('读取用户的转发数') # 同样的,读取每个用户的转发数,方便后面计算微博间的联系强度 # 由于用户的转发数和微博的转发数的获取难度不同,后者在json里就有,前者没有,所以我就只统计这三个月的了 for uid in self.all_user: self.user_retweet_cnt[uid] = len(self.retweet_edge[uid]) logging.info('loadRetweetRelationship finished') def setRelationIntensity_new(self, type, target, workers=4): """ 计算用户间的联系强度 以及 微博间的联系强度,然后将其记录下来 值得注意的是,不知道只统计了三个月的转发够不够,但再统计两年的转发就有点费时间了 :return: """ if type == 'user': # 首先生成以wblogId为key的转发边字典,{wblogId: [uid]},方便后面计算 retweet_edge = {} for uid in self.retweet_edge.keys(): for wblogId in self.retweet_edge[uid]: if wblogId not in retweet_edge: retweet_edge[wblogId] = [] if uid not in retweet_edge[wblogId]: retweet_edge[wblogId].append(uid) retweet_cnt = self.wblog_retweet_cnt # 微博的retweet_cnt retweet_cnt2 = self.user_retweet_cnt # 用户自己的retweet_cnt else: # 如果是计算微博的联系强度的话,就直接用原本的转发关系边就行 retweet_edge = self.retweet_edge retweet_cnt = self.user_retweet_cnt retweet_cnt2 = self.wblog_retweet_cnt # 然后计算用户两两间的联系强度,并生成一个用户数*用户数的方阵S # 因为我后面会将这个方阵写入文件中,所以就不真正生成S了 # 对于微博同理 compute_relation_intensity(type, target, retweet_cnt, retweet_cnt2, retweet_edge, self.file_name_appendix, workers=workers) # nc = 0 # 记录方阵S中不为0的元素数 # # with open('../main/relation_intensity/%s' % type + self.file_name_appendix + '.txt', 'w') as my_file: # for i in range(len(target)): # """ # 这个地方太浪费时间了,想一想如何通过多进程实现 # """ # id1 = target[i] # for j in range(i + 1, len(target)): # id2 = target[j] # # 计算id1和id2之间的联系强度 # s = 0.0 # if retweet_cnt2[id1] == 0 or retweet_cnt2[id2] == 0: # s = 0.0 # else: # for key in retweet_edge.keys(): # if len(retweet_edge[key]) == 0: # continue # if id1 in retweet_edge[key] and id2 in retweet_edge[key]: # s += 1.0 / float(retweet_cnt[key]) # if s != 0.0: # nc += 1 # my_file.write('%s %s %s\n' % (id1, id2, str(s))) # logging.info('%s, nc=%s' % (type, str(nc))) def setRelationIntensity_old(self, type, target): """ 读取记录下来的用户间的联系强度 以及 微博间的联系强度 首先保存为稀疏矩阵A,然后计算A^T*A后保存为正常矩阵,再记录到文件中 :return: """ with open( '../main/relation_intensity/%s' % type + self.file_name_appendix + '.txt', 'r') as my_file: row_and_column = len(my_file.readlines()) A = sparse.lil_matrix((row_and_column, len(target))) with open( '../main/relation_intensity/%s' % type + self.file_name_appendix + '.txt', 'r') as my_file: cnt = 0 if type == 'user': retweet_cnt = self.user_retweet_cnt else: retweet_cnt = self.wblog_retweet_cnt for line in my_file: line = line.split('\n')[0] id1 = line.split(' ')[0] id2 = line.split(' ')[1] index1 = target.index(id1) index2 = target.index(id2) ri = line.split(' ')[2] A[cnt, index1] = pow(float(ri) / float(retweet_cnt[id1]), 0.5) A[cnt, index2] = 0.0 - pow( float(ri) / float(retweet_cnt[id2]), 0.5) cnt += 1 logging.info('setRelationIntensity_old read file finished') if type == 'user': sparse.save_npz( '../main/relation_intensity/A' + self.file_name_appendix, A.tocoo()) logging.info('save A finished') else: sparse.save_npz( '../main/relation_intensity/B' + self.file_name_appendix, A.tocoo()) logging.info('save B finished') ATA = A.T.dot(A).tocoo() logging.info('setRelationIntensity_old ATA finished') if type == 'user': sparse.save_npz( '../main/relation_intensity/ATA' + self.file_name_appendix, ATA) logging.info('save ATA finished') else: sparse.save_npz( '../main/relation_intensity/BTB' + self.file_name_appendix, ATA) logging.info('save BTB finished') def setRelationIntensity(self, reset_dataset=False, workers=4): """ reset_dataset为True的时候 调用setRelationIntensity_new(user)和setRelationIntensity_new(wblog) reset_dataset为False的时候 
调用setRelationIntensity_old(user)和setRelationIntensity_old(wblog) :return: A B ATA BTB '../main/relation_intensity/user.txt' '../main/relation_intensity/wblog.txt' """ # self.loadRetweetRelationship() """ 上面这个被我单独调用了, 见主程序 """ if reset_dataset: logging.info('setRelationIntensity_new------user') self.setRelationIntensity_new('user', self.all_user, workers=workers) logging.info('setRelationIntensity_new------wblog') self.setRelationIntensity_new('wblog', self.all_wblog, workers=workers) logging.info('setRelationIntensity_new------finished') logging.info('setRelationIntensity_old------user') self.setRelationIntensity_old('user', self.all_user) logging.info('setRelationIntensity_old------wblog') self.setRelationIntensity_old('wblog', self.all_wblog) logging.info('setRelationIntensity_old------finished') def setLaplacian(self): """ 计算拉普拉斯矩阵L,并保存进文件中 :return: none """ # 首先要计算用户的pagerank值 # self.loadFollowRelationship() """ 上面这个被我单独调用了, 见主程序 """ logging.info('计算pagerank值') print("572注意啦啦啦啦!!!!!") import operator print('572', list(self.follow_edge.keys())) print('572', self.all_user) print('572', operator.eq(list(self.follow_edge.keys()), self.all_user)) page_ranks = PRMapReduce(nodes=self.all_user, edge=self.follow_edge).page_rank() # 生成对角矩阵PI PI = sparse.lil_matrix((len(self.all_user), len(self.all_user))) for i in range(len(self.all_user)): PI[i, i] = float(page_ranks[self.all_user[i]][0]) # 生成跳转概率矩阵P P = sparse.lil_matrix((len(self.all_user), len(self.all_user))) for uid in self.follow_edge.keys(): for followeeUid in self.follow_edge[uid]: P[self.all_user.index(uid), self.all_user.index(followeeUid)] = 1.0 / float( self.follow_cnt[uid]) * 0.85 for i in range(len(self.all_user)): for j in range(len(self.all_user)): P[i, j] += 0.15 * 1.0 / len(self.all_user) # 计算拉普拉斯矩阵L I = sparse.identity(len(self.all_user)) # L = I - (PI.power(0.5) * P * PI.power(-0.5) + PI.power(-0.5) * P.T * PI.power(0.5)).dot(0.5) L = PI - (PI.dot(P) + P.T.dot(PI)).dot(0.5) L = L.tocoo() # 写入文件中 sparse.save_npz( '../main/relation_intensity/L' + self.file_name_appendix, L) logging.info('setLaplacian finished') def setReteetMatrix(self): """ 设置转发矩阵R,并保存进文件中 :return: none """ # self.loadRetweetRelationship() """ 上面这句话被我单独调用了,见主程序 """ # 生成转发矩阵R R = sparse.lil_matrix((len(self.all_user), len(self.all_wblog))) for uid in self.retweet_edge.keys(): for wblogId in self.retweet_edge[uid]: R[self.all_user.index(uid), self.all_wblog.index(wblogId)] = 1.0 R = R.tocoo() # 写入文件中 sparse.save_npz( '../main/relation_intensity/R' + self.file_name_appendix, R) logging.info('setReteetMatrix finished') def run(self, lenda1, lenda2, alpha, beta, gamma, theta, iteration_limit, change_limit): """ 跑MSCA算法 :return: """ # 首先确定x和y向量 li = [] for uid in self.user_train_list: li.append(float(self.user_train_dict[uid]['label'])) for uid in self.user_prior_list: li.append(float(self.user_prior_dict[uid]['prior'])) # li.append(-1) self.x_p = numpy.array(li) logging.info('user num: %s' % str(len(li))) li = [] for wblogId in self.wblog_train_list: li.append(float(self.wblog_train_dict[wblogId]['label'])) for wblogId in self.wblog_prior_list: li.append(float(self.wblog_prior_dict[wblogId]['prior'])) # li.append(-1) self.y_p = numpy.array(li) logging.info('wblog num: %s' % str(len(li))) # 载入转发矩阵 self.R = sparse.load_npz('../main/relation_intensity/R' + self.file_name_appendix + '.npz') # 然后需要分别计算x和y迭代时的逆矩阵 logging.info('计算迭代x时的逆矩阵') self.I1 = sparse.identity(len(self.all_user)) self.ATA = sparse.load_npz('../main/relation_intensity/ATA' + self.file_name_appendix 
+ '.npz') self.L = sparse.load_npz('../main/relation_intensity/L' + self.file_name_appendix + '.npz') logging.info('计算迭代y时的逆矩阵') self.I2 = sparse.identity(len(self.all_wblog)) self.BTB = sparse.load_npz('../main/relation_intensity/BTB' + self.file_name_appendix + '.npz') self.A = sparse.load_npz('../main/relation_intensity/A' + self.file_name_appendix + '.npz') self.B = sparse.load_npz('../main/relation_intensity/B' + self.file_name_appendix + '.npz') # # 首先确定x和y向量 # li = [] # for uid in self.user_train_list: # li.append(float(self.user_train_dict[uid]['label'])) # for uid in self.user_prior_list: # li.append(float(self.user_prior_dict[uid]['prior_label'])) # x_p = numpy.array(li) # logging.info('user num: %s' % str(len(li))) # li = [] # for wblogId in self.wblog_train_list: # li.append(float(self.wblog_train_dict[wblogId]['label'])) # for wblogId in self.wblog_prior_list: # li.append(float(self.wblog_prior_dict[wblogId]['prior_label'])) # y_p = numpy.array(li) # logging.info('wblog num: %s' % str(len(li))) # # # 载入转发矩阵 # R = sparse.load_npz('relation_intensity\\R.npz') # # # 然后需要分别计算x和y迭代时的逆矩阵 # logging.info('计算迭代x时的逆矩阵') # I1 = sparse.identity(len(self.all_user)) # ATA = sparse.load_npz('relation_intensity\\ATA.npz') # L = sparse.load_npz('relation_intensity\\L.npz') # xm = I1.dot(2.0 * lenda1) + ATA.dot(2.0 * alpha) + L.dot(2.0 * theta) # xm = linalg.inv(xm.toarray()) # logging.info('计算迭代y时的逆矩阵') # I2 = sparse.identity(len(self.all_wblog)) # BTB = sparse.load_npz('relation_intensity\\BTB.npz') # ym = I2.dot(2.0 * lenda2) + BTB.dot(2.0 * beta) # ym = linalg.inv(ym.toarray()) # # A = sparse.load_npz('relation_intensity\\A.npz') # B = sparse.load_npz('relation_intensity\\B.npz') li = [] for uid in self.all_user: li.append(0.0) w_o = numpy.array(li) C = sparse.lil_matrix((len(self.all_user), len(self.all_user))) for i in range(len(self.user_train_list)): C[i, i] = float(1.0) li = [] for uid in self.user_train_list: li.append(float(self.user_train_dict[uid]['label'])) for uid in self.user_prior_list: li.append(0.0) u = numpy.array(li) luo_x = 20.05 xm = self.I1.dot(2.0 * lenda1) + self.ATA.dot( 2.0 * alpha) + self.L.dot(2.0 * theta) + C.T.dot(C).dot(luo_x) # xm = self.I1.dot(2.0 * lenda1) + self.ATA.dot(2.0 * alpha) + self.L.dot(2.0 * theta) xm = linalg.inv(xm.toarray()) li = [] for wblogId in self.all_wblog: li.append(0.0) m_o = numpy.array(li) D = sparse.lil_matrix((len(self.all_wblog), len(self.all_wblog))) for i in range(len(self.wblog_train_list)): D[i, i] = float(1.0) li = [] for wblogId in self.wblog_train_list: li.append(float(self.wblog_train_dict[wblogId]['label'])) for wblogId in self.wblog_prior_list: li.append(0.0) m = numpy.array(li) luo_y = 5.05749 ym = self.I2.dot(2.0 * lenda2) + self.BTB.dot( 2.0 * beta) + D.T.dot(D).dot(luo_y) # ym = self.I2.dot(2.0 * lenda2) + self.BTB.dot(2.0 * beta) ym = linalg.inv(ym.toarray()) # 开始迭代 logging.info('开始迭代') iteration = 0 x = self.x_p y = self.y_p cnt1 = 0 cnt2 = 0 while True: iteration += 1 logging.info('iteration: %s' % str(iteration)) if iteration > iteration_limit: break self.getFun(lenda1, lenda2, alpha, beta, gamma, theta, x, self.x_p, y, self.y_p, self.A, self.B, self.R, self.L) iteration_x = 0 w = w_o tmp = x while True: iteration_x += 1 if iteration_x > 1000: break x_next = xm.dot( self.x_p.dot(2 * lenda1) + self.R.dot(gamma).dot(y) + C.T.dot(u).dot(luo_x) - C.T.dot(w)) w_next = w + C.dot(x_next) - u change = self.getChange(tmp, x_next, w, w_next) tmp = x_next w = w_next # print(change) if change <= change_limit: break cnt1 += 1 # 
x_next = xm.dot(self.x_p.dot(2 * lenda1) + self.R.dot(gamma).dot(y)) iteration_y = 0 w = m_o tmp = y while True: iteration_y += 1 if iteration_y > 100: break y_next = ym.dot( self.y_p.dot(2 * lenda2) + self.R.T.dot(gamma).dot(x_next) + D.T.dot(m).dot(luo_y) - D.T.dot(w)) w_next = w + D.dot(y_next) - m change = self.getChange(tmp, y_next, w, w_next) tmp = y_next w = w_next if change <= change_limit: break cnt2 += 1 # y_next = ym.dot(self.y_p.dot(2 * lenda2) + self.R.T.dot(gamma).dot(x_next)) change = self.getChange(x, x_next, y, y_next) logging.info('change: %s' % str(change)) if change <= change_limit: break x = x_next y = y_next # for i in range(len(self.user_train_list)): # x[i] = float(self.user_train_dict[self.user_train_list[i]]['label']) # for i in range(len(self.wblog_train_list)): # y[i] = float(self.wblog_train_dict[self.wblog_train_list[i]]['label']) logging.info('迭代结束') print(cnt1) print(cnt2) # 将结果写入文件 numpy.savetxt('res_user' + self.file_name_appendix + '.txt', x) numpy.savetxt('res_wblog' + self.file_name_appendix + '.txt', y) def getChange(self, x, x_next, y, y_next): """ 计算每次迭代时的change :param x: :param x_next: :param y: :param y_next: :return: change """ return linalg.norm(x - x_next, 1) + linalg.norm(y - y_next, 1) def getFun(self, lenda1, lenda2, alpha, beta, gamma, theta, x, x_p, y, y_p, A, B, R, L): """ 计算损失函数的值 :return: """ # print(pow(lenda1 * linalg.norm(x - x_p, 2), 2)) # print(pow(lenda2 * linalg.norm(y - y_p, 2), 2)) # print(pow(alpha * linalg.norm(A.dot(x), 2), 2)) # print(pow(beta * linalg.norm(B.dot(y), 2), 2)) # print(0.0 - gamma * R.T.dot(x).dot(y)) # print(theta * L.T.dot(x).dot(x)) res = pow(lenda1 * linalg.norm(x - x_p, 2), 2) res += pow(lenda2 * linalg.norm(y - y_p, 2), 2) res += pow(alpha * linalg.norm(A.dot(x), 2), 2) res += pow(beta * linalg.norm(B.dot(y), 2), 2) res -= gamma * R.T.dot(x).dot(y) res += theta * L.T.dot(x).dot(x) logging.info('Function loss: %s' % str(res)) def evaluation_bak(self): """ 评价MSCA算法的结果 :return: """ logging.info('用户结果') scores = [] cnt = 0 with open('../main/res_user' + self.file_name_appendix + '.txt', 'r') as my_file: for line in my_file: score = float(line.split('\n')[0]) if self.all_user[cnt] in self.user_prior_list: scores.append(score) cnt += 1 logging.info('min_score: %s, max_score: %s, len(user):%s' % (str(min(scores)), str(max(scores)), str(len(scores)))) test_result = [] for uid in self.user_prior_list: test_result.append(int(self.user_prior_dict[uid]['label'])) user_res = Evaluation.evaluation_self(scores, test_result) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/user_roc' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) def evaluation(self): """ 评价MSCA算法的结果 :return: """ logging.info('用户结果') scores = [] cnt = 0 with open('../main/res_user' + self.file_name_appendix + '.txt', 'r') as my_file: for line in my_file: score = float(line.split('\n')[0]) if self.all_user[cnt] in self.user_prior_list: scores.append(score) cnt += 1 logging.info('min_score: %s, max_score: %s, len(user):%s' % (str(min(scores)), str(max(scores)), str(len(scores)))) test_result = [] for uid in self.user_prior_list: test_result.append(int(self.user_prior_dict[uid]['label'])) user_res = Evaluation.evaluation_self(scores, test_result) # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = 
metrics.average_precision_score(test_result, scores) logging.info('user AP:%s' % str(ap)) with open('../main/user_ap' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/user_roc' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) # top k precision worker_score = {} for i in range(len(scores)): worker_score[self.user_prior_list[i]] = scores[i] worker_score = sorted(worker_score.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/res_user_top' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('type uid score precision top_k\n') worker_count_now = 0 top_k = 0 for itm in worker_score: uid = itm[0] score = itm[1] if uid in self.spammer: u_type = 'w' worker_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(worker_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') logging.info('微博结果') scores = [] cnt = 0 with open('../main/res_wblog' + self.file_name_appendix + '.txt', 'r') as my_file: for line in my_file: score = float(line.split('\n')[0]) if self.all_wblog[cnt] in self.wblog_prior_list: scores.append(score) cnt += 1 logging.info('min_score: %s, max_score: %s, len(wblog):%s' % (str(min(scores)), str(max(scores)), str(len(scores)))) test_result = [] for wblogId in self.wblog_prior_list: test_result.append(int(self.wblog_prior_dict[wblogId]['label'])) wblog_res = Evaluation.evaluation_self(scores, test_result) # top k precision wblog_score = {} for i in range(len(scores)): wblog_score[self.wblog_prior_list[i]] = scores[i] wblog_score = sorted(wblog_score.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/res_wblog_top' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('type wblogId score precision top_k\n') wblog_count_now = 0 top_k = 0 for itm in wblog_score: uid = itm[0] score = itm[1] if uid in self.swblog: u_type = 's' wblog_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(wblog_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = metrics.average_precision_score(test_result, scores) logging.info('wblog AP:%s' % str(ap)) with open('../main/wblog_ap' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('wblog AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/wblog_roc' + self.file_name_appendix + '.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) return user_res, wblog_res def show(self): """ 为了界面展示 :return: """ self.all_user = random.sample(self.all_user, 500) self.all_wblog = random.sample(self.all_wblog, 500) for uid in self.all_user: self.retweet_edge[uid] = [] for res in self.sqlhelper.select_sql( 'SELECT paMid, orMid FROM wblog WHERE uid=%s' % uid): paMid = str(res[0]) orMid = str(res[1]) if paMid in self.all_wblog: self.retweet_edge[uid].append(paMid) if orMid 
in self.all_wblog: self.retweet_edge[uid].append(orMid) mdb = MongoClient().comment.comment for wblogId in self.swblog: for res in mdb.find({'wblogId': wblogId}): try: uid = res['json_text']['user']['id'] if uid in self.retweet_edge.keys(): if wblogId not in self.retweet_edge[uid]: self.retweet_edge[uid].append(wblogId) except Exception as e: logging.error('%s. The wblogId is %s' % (e, str(wblogId)))
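# --- Reading note (inferred from MSCA.getFun / MSCA.run above; a sketch, ---
# not an authoritative statement of the model). Writing lenda1/lenda2 as
# lambda1/lambda2, the alternating updates in MSCA.run minimise
#
#   f(x, y) = lambda1^2 * ||x - x_p||^2 + lambda2^2 * ||y - y_p||^2   (stay near the priors)
#           + alpha^2 * ||A x||^2 + beta^2 * ||B y||^2                (retweet-intensity smoothness)
#           - gamma * x^T R y                                         (user-wblog coupling via retweets)
#           + theta * x^T L x                                         (follow-graph Laplacian term)
#
# over the user scores x and the wblog scores y. x is updated with y held
# fixed and vice versa; the inner while-loops appear to be an ADMM-style
# treatment of the constraint that training labels stay fixed (selector
# matrices C/D, multipliers w, penalties luo_x/luo_y).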
class UserFeature: def __init__(self, h, d, u, p, c): # 'localhost', 'sdh', 'root', 'root', 'utf8' self.host = h self.db = d self.user = u self.passwd = p self.charset = c def __enter__(self): self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.mdb = MongoClient().userFeature return self def __exit__(self, exc_type, exc_val, exc_tb): self.sqlhelper.close() def arrangeFeatures(self): """ 将多张特征表整合为一个表,方便后面使用pandas操作 :return: """ col = self.mdb.features if not col.find_one(): logging.info('features为空') col.create_index([('uid', pymongo.DESCENDING)], unique=True) def setRegisterDay(self): """ 设置用户的注册天数 和 log后的结果 :return: none """ col = self.mdb.registerDay if not col.find_one(): logging.info('registerDay为空,设置主键为uid') col.create_index([('uid', pymongo.DESCENDING)], unique=True) # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"') """我的修改: 事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。 """ spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') unknown = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') for uid in final_user: if uid not in spammers: spammers.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammers: if uid in normal: normal.remove(uid) if uid in unknown: unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ all_user = spammers + normal + unknown for uid in all_user: try: for card in MongoClient().profile.json_text.find_one( {'uid': str(uid)})['json_text']['cards']: if 'card_group' not in card: continue for elem in card['card_group']: if 'item_name' in elem and elem['item_name'] == u'注册时间': t = float( (datetime.datetime(2017, 11, 25) - datetime.datetime.strptime( elem['item_content'], '%Y-%m-%d')).days) if uid in spammers: col.insert_one({ 'uid': uid, 'spammer': 'true', 'register_day': t, 'log_time': math.log10(t) }) elif uid in normal: col.insert_one({ 'uid': uid, 'spammer': 'false', 'register_day': t, 'log_time': math.log10(t) }) elif uid in unknown: col.insert_one({ 'uid': uid, 'spammer': 'unknown', 'register_day': t, 'log_time': math.log10(t) }) break except Exception as e: logging.error('%s. 
The user is %s' % (e, str(uid))) logging.info('setRegisterDay finished') def setFollowCnt(self): """ 设置用户的关注数,粉丝数和 log 后的结果 :return: none """ col = self.mdb.followCnt if not col.find_one(): logging.info('followCnt为空,设置主键为uid') col.create_index([('uid', pymongo.DESCENDING)], unique=True) # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"') spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') unknown = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') for uid in final_user: if uid not in spammers: spammers.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammers: if uid in normal: normal.remove(uid) if uid in unknown: unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ for user in MongoClient().profile.user.find(): uid = user['uid'] try: if uid in spammers: col.insert_one({ 'uid': uid, 'spammer': 'true', 'followee_cnt': user['json_text']['follow_count'], 'log_followee': math.log10(int(user['json_text']['follow_count'] + 1.0)), 'follower_cnt': user['json_text']['followers_count'], 'log_follower': math.log10( int(user['json_text']['followers_count'] + 1.0)) }) elif uid in normal: col.insert_one({ 'uid': uid, 'spammer': 'false', 'followee_cnt': user['json_text']['follow_count'], 'log_followee': math.log10(int(user['json_text']['follow_count'] + 1.0)), 'follower_cnt': user['json_text']['followers_count'], 'log_follower': math.log10( int(user['json_text']['followers_count'] + 1.0)) }) elif uid in unknown: col.insert_one({ 'uid': uid, 'spammer': 'unknown', 'followee_cnt': user['json_text']['follow_count'], 'log_followee': math.log10(int(user['json_text']['follow_count'] + 1.0)), 'follower_cnt': user['json_text']['followers_count'], 'log_follower': math.log10( int(user['json_text']['followers_count'] + 1.0)) }) except Exception as e: logging.error('%s. 
The user is %s' % (e, str(uid))) logging.info('setFollowCnt finished') def setRVP(self): """ 设置用户的双向关注率 :return: none """ col = self.mdb.rvp if not col.find_one(): logging.info('rvp为空,设置主键为uid') col.create_index([('uid', pymongo.DESCENDING)], unique=True) # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"') # all_user = spammers + normal """我的修改: 事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。 """ spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') unknown = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') for uid in final_user: if uid not in spammers: spammers.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammers: if uid in normal: normal.remove(uid) if uid in unknown: unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ all_user = spammers + normal + unknown edge = {} for uid in all_user: for result in self.sqlhelper.select_sql( 'SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid): if result[0] in edge.keys(): edge[result[0]].append(result[1]) else: edge[result[0]] = [result[1]] edge_reverse = {} for uid in all_user: for result in self.sqlhelper.select_sql( 'SELECT uid, followeeUid FROM edge WHERE followeeUid=%s' % uid): if result[1] in edge_reverse.keys(): edge_reverse[result[1]].append(result[0]) else: edge_reverse[result[1]] = [result[0]] for uid in all_user: res = UserFeature.caculate_rvp_ratio(int(uid), edge, edge_reverse) try: if uid in spammers: col.insert_one({ 'uid': uid, 'spammer': 'true', 'rvp_ratio': str(res) }) elif uid in normal: col.insert_one({ 'uid': uid, 'spammer': 'false', 'rvp_ratio': str(res) }) elif uid in unknown: col.insert_one({ 'uid': uid, 'spammer': 'unknown', 'rvp_ratio': str(res) }) except Exception as e: logging.error('%s. 
The user is %s' % (e, str(uid))) logging.info('setRVP finished') def setOriThirdFre(self): """ 设置用户发布微博时使用第三方软件的频率 :return: none """ third_party = ('推兔', '好保姆', '互粉派对 ', '优推推互粉', '未通过审核应用', '互粉加加', '互粉小助手', '孔明社交管理', '互粉赏金榜', '推米互粉', '多推', '互粉一族', '推兔手机版', '推啊') col = self.mdb.oriThirdFre if not col.find_one(): logging.info('oriThirdFre为空,设置主键为uid') col.create_index([('uid', pymongo.DESCENDING)], unique=True) ori_cnt = 0 thi_cnt = 0 ori_cnt_re = 0 thi_cnt_re = 0 # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"') spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') unknown = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') for uid in final_user: if uid not in spammers: spammers.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammers: if uid in normal: normal.remove(uid) if uid in unknown: unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ for user in MongoClient().profile.user.find(): uid = user['uid'] tmp_ori_cnt = 0 # 微博数量 tmp_thi_cnt = 0 # 第三方微博数量 tmp_ori_cnt_re = 0 # 微博数量(去除转发微博) tmp_thi_cnt_re = 0 # 第三方微博数量(去除转发微博) for res in self.sqlhelper.select_sql( 'SELECT source, retweet_flag FROM wblog WHERE uid=%s' % uid): source = res[0] retweet_flag = res[1] # 下面这个判断是为了筛选出原创微博 if str(retweet_flag) == '0': tmp_ori_cnt_re += 1 ori_cnt_re += 1 if source in third_party: tmp_thi_cnt_re += 1 thi_cnt_re += 1 tmp_ori_cnt += 1 ori_cnt += 1 if source in third_party: tmp_thi_cnt += 1 thi_cnt += 1 try: if uid in spammers: col.insert_one({ 'uid': uid, 'spammer': 'true', 'ori_cnt-re': tmp_ori_cnt_re, 'thi_cnt-re': tmp_thi_cnt_re, 'ori_cnt': tmp_ori_cnt, 'thi_cnt': tmp_thi_cnt }) elif uid in normal: col.insert_one({ 'uid': uid, 'spammer': 'false', 'ori_cnt-re': tmp_ori_cnt_re, 'thi_cnt-re': tmp_thi_cnt_re, 'ori_cnt': tmp_ori_cnt, 'thi_cnt': tmp_thi_cnt }) elif uid in unknown: col.insert_one({ 'uid': uid, 'spammer': 'unknown', 'ori_cnt-re': tmp_ori_cnt_re, 'thi_cnt-re': tmp_thi_cnt_re, 'ori_cnt': tmp_ori_cnt, 'thi_cnt': tmp_thi_cnt }) except Exception as e: print('%s. 
The user is %s' % (e, str(uid))) self.updateOriThirdFre(ori_cnt, thi_cnt, ori_cnt_re, thi_cnt_re) def updateOriThirdFre(self, ori_cnt, thi_cnt, ori_cnt_re, thi_cnt_re): """ 在setOriThirdFre中只是做了初步的统计 所以这里需要计算出特征具体的值,并更新到mongodb中 :return: none """ col = self.mdb.oriThirdFre # ori_cnt = 1525387 # thi_cnt = 47284 # ori_cnt_re = 971792 # thi_cnt_re = 10407 max_ori = 0 max_ori_re = 0 for user in col.find(): if user['ori_cnt'] > max_ori: max_ori = user['ori_cnt'] if int(user['ori_cnt-re']) > max_ori_re: max_ori_re = user['ori_cnt-re'] for user in col.find(): if user['ori_cnt'] == 0: fre = float(thi_cnt) / ori_cnt else: coefficient = math.log10(user['ori_cnt'] + 1.0) / math.log10(max_ori) fre = coefficient * (float(user['thi_cnt']) / user['ori_cnt']) + (1 - coefficient) * ( float(thi_cnt) / ori_cnt) col.update({'uid': user['uid']}, {'$set': {'fre': fre}}) if user['ori_cnt'] == 0: fre = 0 else: fre = float(user['thi_cnt']) / user['ori_cnt'] col.update({'uid': user['uid']}, {'$set': {'fre_new': fre}}) for user in col.find(): if user['ori_cnt-re'] == 0: fre_re = float(thi_cnt_re) / ori_cnt_re else: coefficient = math.log10(user['ori_cnt-re'] + 1.0) / math.log10(max_ori_re) fre_re = coefficient * ( float(user['thi_cnt-re']) / user['ori_cnt-re']) + ( 1 - coefficient) * (float(thi_cnt_re) / ori_cnt_re) col.update({'uid': user['uid']}, {'$set': {'fre-re': fre_re}}) def setRetweetFre(self): """ 设置用户转发微博的关注比例 :return: none """ col = self.mdb.retweetFre if not col.find_one(): logging.info('retweetFre为空,设置主键为uid') col.create_index([('uid', pymongo.DESCENDING)], unique=True) # spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') # normal = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="yes"') """我的修改: 事实上,如果把choose='yes'去掉, 那么mongodb里存储的就是所有的14774个账号的了。 """ spammers = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') unknown = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') for uid in final_user: if uid not in spammers: spammers.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammers: if uid in normal: normal.remove(uid) if uid in unknown: unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ retweet_cnt = 0 follow_cnt = 0 onehop_cnt = 0 for user in MongoClient().profile.user.find(): uid = user['uid'] tmp_retweet_cnt = 0 tmp_follow_cnt = 0 tmp_onehop_cnt = 0 for res in self.sqlhelper.select_sql( 'SELECT retweet_flag, follow_flag, paMid FROM wblog WHERE uid=%s' % uid): retweet_flag = res[0] follow_flag = res[1] paMid = res[2] # 下面这个判断是为了筛选出转发微博 if str(retweet_flag) == '0': continue tmp_retweet_cnt += 1 retweet_cnt += 1 if str(follow_flag) == '1': tmp_follow_cnt += 1 follow_cnt += 1 if str(paMid) == '0': tmp_onehop_cnt += 1 onehop_cnt += 1 try: if uid in spammers: col.insert_one({ 'uid': uid, 'spammer': 'true', 'retweet_cnt': tmp_retweet_cnt, 'follow_cnt': tmp_follow_cnt, 'onehop_cnt': tmp_onehop_cnt }) elif uid in normal: col.insert_one({ 'uid': uid, 'spammer': 'false', 'retweet_cnt': tmp_retweet_cnt, 'follow_cnt': tmp_follow_cnt, 'onehop_cnt': tmp_onehop_cnt }) elif uid in unknown: col.insert_one({ 'uid': uid, 'spammer': 'unknown', 'retweet_cnt': tmp_retweet_cnt, 'follow_cnt': tmp_follow_cnt, 'onehop_cnt': 
tmp_onehop_cnt }) except Exception as e: print('%s. The user is %s' % (e, str(uid))) self.updateRetweetFre(retweet_cnt, follow_cnt, onehop_cnt) def updateRetweetFre(self, retweet_cnt, follow_cnt, onehop_cnt): """ 在setRetweetFre中只是做了初步的统计 所以这里需要计算出特征具体的值,并更新到mongodb中 :return: none """ col = self.mdb.retweetFre # max_retweet_cnt = 0 # max_follow_cnt = 0 # max_onehop_cnt = 0 # for user in col.find(): # if int(user['retweet_cnt']) > max_retweet_cnt: # max_retweet_cnt = user['retweet_cnt'] # if int(user['follow_cnt']) > max_follow_cnt: # max_follow_cnt = user['follow_cnt'] # if int(user['onehop_cnt']) > max_onehop_cnt: # max_onehop_cnt = user['onehop_cnt'] # spammer = self.sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"') # 先计算转发微博的关注比例 for user in col.find(): fre = 0 # if user['retweet_cnt'] == 0: # fre = float(follow_cnt) / retweet_cnt # else: # coefficient = math.log10(user['retweet_cnt'] + 1.0) / math.log10(max_retweet_cnt) # fre = coefficient * (float(user['follow_cnt']) / user['retweet_cnt']) + (1 - coefficient) * ( # float(follow_cnt) / retweet_cnt) if float(user['retweet_cnt']) != 0: fre = str( float(user['follow_cnt']) / float(user['retweet_cnt'])) if int(fre) == 0: pass col.update({'uid': user['uid']}, {'$set': {'follow_fre': fre}}) # 再计算转发微博中一跳转发的比例 for user in col.find(): fre = 0 # if user['retweet_cnt'] == 0: # fre = float(onehop_cnt) / retweet_cnt # else: # coefficient = math.log10(user['retweet_cnt'] + 1.0) / math.log10(max_retweet_cnt) # fre = coefficient * (float(user['onehop_cnt']) / user['retweet_cnt']) + (1 - coefficient) * ( # float(onehop_cnt) / retweet_cnt) if float(user['retweet_cnt']) != 0: fre = str( float(user['onehop_cnt']) / float(user['retweet_cnt'])) col.update({'uid': user['uid']}, {'$set': {'onehop_fre': fre}}) @staticmethod def caculate_rvp_ratio(user, edge, edge_reverse): reciprocated_edge = 0 edge_total_count = 0 if user in edge.keys(): edge_total_count += len(edge[user]) for followee in edge[user]: if followee in edge_reverse.keys(): if user in edge_reverse[followee]: reciprocated_edge += 1 if user in edge_reverse.keys(): edge_total_count += len(edge_reverse[user]) if edge_total_count == 0: return 0.0 return float(reciprocated_edge) / float(edge_total_count) def setFF(self): """ :return: none """ col = self.mdb.followCnt sqlhelper = SqlHelper() # spammer = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="yes"') # normal = sqlhelper.select_sql_one('SELECT uid FROM final_user WHERE spammer="no"') # cnt_dict = {} # profile = MongoClient().profile.json_text # for json_text in profile.find(): # uid = json_text['uid'] # if uid not in spammer and uid not in normal: # continue # cnt = 0 # try: # for card in json_text['json_text']['cards']: # try: # cnt += len(card['card_group']) # except Exception as e: # pass # except Exception as e: # print('no cards %s' % uid) # cnt_dict[uid] = cnt # for key in cnt_dict.keys(): # col.update({'uid': str(key)}, {'$set': {'profile': cnt_dict[key]}}) # # followCnt = MongoClient().userFeature.followCnt # for user in followCnt.find(): # uid = user['uid'] # try: # followee_cnt = followCnt.find_one({'uid': str(uid)})['followee_cnt'] # follower_cnt = followCnt.find_one({'uid': str(uid)})['follower_cnt'] # res = float(followee_cnt) / follower_cnt # col.update({'uid': str(uid)}, {'$set': {'ff': res}}) # except Exception as e: # print('no cards %s' % uid) uu = MongoClient().profile.user for user in uu.find(): uid = user['uid'] # if uid in spammer try: if uu.find_one({'uid': str(uid) 
})['json_text']['description'] != '': col.update({'uid': str(uid)}, {'$set': {'description': 1}}) else: col.update({'uid': str(uid)}, {'$set': {'description': 0}}) except Exception as e: print('no cards %s' % uid)
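# --- Illustrative sketch (not part of the original module) ----------------
# UserFeature.updateOriThirdFre above shrinks a user's third-party-client
# posting ratio towards the dataset-wide ratio when the user has few
# original wblogs. The same log-weighted smoothing, isolated into a
# hypothetical standalone helper for clarity:
import math

def _smoothed_third_party_fre(thi_cnt, ori_cnt, global_thi, global_ori, max_ori):
    """Sketch of the smoothing used for the `fre` feature (assumes max_ori > 1)."""
    global_ratio = float(global_thi) / global_ori
    if ori_cnt == 0:
        return global_ratio                    # no originals: fall back to the global ratio
    coefficient = math.log10(ori_cnt + 1.0) / math.log10(max_ori)
    return coefficient * (float(thi_cnt) / ori_cnt) + (1 - coefficient) * global_ratio

# Users with many original wblogs keep (mostly) their own ratio, while users
# with only a handful are pulled towards the global third-party ratio.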
class DetectVC(object): def __init__(self, h, d, u, p, c, file_name_appendix=''): self.host = h self.db = d self.user = u self.passwd = p self.charset = c self.sqlhelper = SqlHelper(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) self.file_name_appendix = file_name_appendix self.user_train_dict, self.user_train_list, self.user_prior_dict, self.user_prior_list = \ Alkit.read_prior('../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') self.spammer, self.spammer_prior, self.normal, self.normal_prior = Alkit.setSN(self.user_train_dict, self.user_prior_dict) self.seed_worker = [] for uid in self.user_train_dict.keys(): if self.user_train_dict[uid]['label'] == '1': self.seed_worker.append(uid) self.other_worker = [] for uid in self.user_prior_dict.keys(): if self.user_prior_dict[uid]['label'] == '1': self.other_worker.append(uid) self.normal = [] for uid in self.user_prior_dict.keys(): if self.user_prior_dict[uid]['label'] == '-1': self.normal.append(uid) self.all_user = self.seed_worker + self.other_worker + self.normal self.follow_edge = [] for uid in self.all_user: for result in self.sqlhelper.select_sql('SELECT uid, followeeUid FROM edge WHERE uid=%s' % uid): uid = str(result[0]) followeeUid = str(result[1]) if followeeUid not in self.all_user: continue self.follow_edge.append((uid, followeeUid)) def run(self): """ 主要调用HITS算法,稍作修改就行 :return: hub, auth """ logging.info('compute hits') hub = {} auth = {} graph = HITSMapReduce(self.all_user, self.follow_edge, self.seed_worker).hits() for user in self.all_user: hub[user] = graph[user]['hub'][0] auth[user] = graph[user]['authority'][0] logging.info('用户结果') scores = [] test_result = [] for uid in self.user_prior_list: test_result.append(int(self.user_prior_dict[uid]['label'])) scores.append(float(hub[uid])) user_res = Evaluation.evaluation_self(scores, test_result) # ap p, r, thresholds = metrics.precision_recall_curve(test_result, scores) ap = metrics.average_precision_score(test_result, scores) logging.info('user AP:%s' % str(ap)) with open('../main/detect_vc/user_ap'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('p r\n') for i in range(len(p)): my_file.write('%s %s\n' % (str(p[i]), str(r[i]))) # roc fpr, tpr, thresholds = metrics.roc_curve(test_result, scores) logging.info('user AUC:%s' % str(metrics.auc(fpr, tpr))) with open('../main/detect_vc/user_roc'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('fpr tpr\n') for i in range(len(fpr)): my_file.write('%s %s\n' % (str(fpr[i]), str(tpr[i]))) # top k precision worker_score = {} for i in range(len(scores)): worker_score[self.user_prior_list[i]] = scores[i] worker_score = sorted(worker_score.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/detect_vc/res_user_top'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('type uid score precision top_k\n') worker_count_now = 0 top_k = 0 for itm in worker_score: uid = itm[0] score = itm[1] if uid in self.spammer: u_type = 'w' worker_count_now += 1 else: u_type = 'n' top_k += 1 precision = str(float(worker_count_now) / top_k) my_file.write(u_type + ' ' + str(uid) + ' ' + str(score) + ' ' + precision + ' ' + str(top_k) + '\n') hub = sorted(hub.items(), key=lambda im: float(im[1]), reverse=True) with open('../main/detect_vc/hub'+self.file_name_appendix+'.txt', 'w') as my_file: my_file.write('type uid hub worker_per total_per\n') worker_count_now = 0 worker_count_all = 
len(self.other_worker) all_count_now = 0 all_count_all = len(self.all_user) - len(self.seed_worker) for itm in hub: uid = str(itm[0]) u_type = '-' if uid in self.seed_worker: continue if uid in self.other_worker: u_type = 'o' worker_count_now += 1 if uid in self.normal: u_type = 'n' all_count_now += 1 hub_score = str(itm[1]) worker_per = str(float(worker_count_now) / worker_count_all) total_per = str(float(all_count_now) / all_count_all) my_file.write(u_type + ' ' + uid + ' ' + hub_score + ' ' + worker_per + ' ' + total_per + '\n') return hub, auth
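# --- Illustrative sketch (plain HITS; an assumption, since the project's ---
# HITSMapReduce class is not shown here and may differ, e.g. in how the
# seed spammers are injected). DetectVC.run above ranks candidate users by
# their hub score on the follow graph; the classical update it builds on
# alternates authority and hub scores like this:
def _plain_hits(nodes, edges, iterations=20):
    """Minimal HITS power iteration over directed (follower, followee) edges."""
    hub = {n: 1.0 for n in nodes}
    auth = {n: 1.0 for n in nodes}
    for _ in range(iterations):
        # authority: accumulate the hub scores of the followers pointing at a node
        auth = {n: 0.0 for n in nodes}
        for follower, followee in edges:
            auth[followee] += hub[follower]
        norm = sum(auth.values()) or 1.0
        auth = {n: s / norm for n, s in auth.items()}
        # hub: accumulate the authority scores of the followees a node points at
        hub = {n: 0.0 for n in nodes}
        for follower, followee in edges:
            hub[follower] += auth[followee]
        norm = sum(hub.values()) or 1.0
        hub = {n: s / norm for n, s in hub.items()}
    return hub, auth

# DetectVC sorts users by hub[uid] when writing res_user_top*.txt, so a user
# following many high-authority (spammer-heavy) accounts floats to the top.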