def run(self, train_per=0.8, reset_dataset=False):
    """
    Read feature data from the database and classify it with AdaBoost.
    :return:
    """
    # First split the wblogs into a training set and a test set.
    # Note: train_per and reset_dataset are currently unused here; the split
    # is read back from the prior files written in an earlier step.
    swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog')
    wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose')
    final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"')
    for wblogId in final_wblog:
        if wblogId not in swblog:
            swblog.append(wblogId)
    # The two sets overlap; overlapping wblogs are treated as spam
    for wblogId in swblog:
        if wblogId in wblog:
            wblog.remove(wblogId)
    train_wblog_set, test_wblog_set = Alkit.read_dataset(
        '../main/prior/wblog_train' + self.file_name_appendix + '.txt',
        '../main/prior/wblog_prior' + self.file_name_appendix + '.txt')
    # Log some information about the training and test sets
    logging.info('Training set size: %s' % len(train_wblog_set))
    logging.info('Positives (swblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(swblog)))))
    logging.info('Negatives (wblog) in training set: %s' % len(list(set(train_wblog_set).intersection(set(wblog)))))
    logging.info('Test set size: %s' % len(test_wblog_set))
    logging.info('Positives (swblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(swblog)))))
    logging.info('Negatives (wblog) in test set: %s' % len(list(set(test_wblog_set).intersection(set(wblog)))))
    # Read the training and test sets from the database, stored as ordered dicts
    # (the list returned by values() follows insertion order).
    # feature_dict_data holds (wblogId, feature_vector) pairs,
    # result_dict_data holds (wblogId, label) pairs.
    feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog)
    train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('Training data processed')
    feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog)
    test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data)
    logging.info('Test data processed')
    # Train with AdaBoost and report results
    logging.info('\nAdaBoost training started')
    model = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=2, min_samples_split=20, min_samples_leaf=5),
        algorithm="SAMME", n_estimators=100, learning_rate=0.5)
    model.fit(train_feature, train_result)
    logging.info('Training finished')
    predict_result = model.predict(test_feature)
    logging.info('Precision: %s' % metrics.precision_score(test_result, predict_result))
    logging.info('Recall: %s' % metrics.recall_score(test_result, predict_result))
    logging.info('F1: %s' % metrics.f1_score(test_result, predict_result))
    # Collapse the class probabilities into a signed prior score in [-1, 1]
    predict_result_proba = model.predict_proba(test_feature)
    prp = []
    for prob in predict_result_proba:
        prp.append(float(prob[0]) * -1 + float(prob[1]) * 1)
    Alkit.write_prior(
        '../main/crowd_target/wblog_train' + self.file_name_appendix + '.txt',
        '../main/crowd_target/wblog_prior' + self.file_name_appendix + '.txt',
        train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)
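
# A minimal, self-contained sketch (toy data, hypothetical; not part of the pipeline)
# of how the predict_proba output above is collapsed into a signed prior score:
# float(prob[0]) * -1 + float(prob[1]) * 1 == P(spam) - P(normal), a value in
# [-1, 1] where -1 means certainly normal and +1 means certainly spam.
def _demo_signed_prior():
    import numpy as np
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier

    X = np.array([[0.0], [0.1], [0.9], [1.0]])   # toy 1-D features
    y = np.array([0, 0, 1, 1])                   # 0 = normal, 1 = spam
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=10)
    model.fit(X, y)
    proba = model.predict_proba(X)               # columns follow model.classes_, i.e. [0, 1]
    prior = proba[:, 1] - proba[:, 0]            # same value as prob[0] * -1 + prob[1] * 1
    assert np.all((prior >= -1.0) & (prior <= 1.0))
    return prior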
def run(self): """ 从数据库中读取特征数据,并使用svm和lr分类 :return: """ if not self.add_unknown_into_model: swblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM swblog') wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM wblog_choose') final_wblog = self.sqlhelper.select_sql_one('SELECT wblogId FROM final_wblog WHERE spammer="yes"') for wblogId in final_wblog: if wblogId not in swblog: swblog.append(wblogId) # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in swblog: if uid in wblog: wblog.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 swblog: 水军 wblog: 正常用户 unkonwn:还没来得及标注的未知类型微博 """ logging.info('原始数据spam占比例(max): %s' % (len(swblog) * 1.0 / (len(wblog) + len(swblog)))) if self.spam_per > len(swblog) * 1.0 / (len(wblog) + len(swblog)): logging.info('we don\'t have so much spams in our datasets, we will keep original percentage') else: expected_spam_number = int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per)) swblog = random.sample(swblog, expected_spam_number) if self.reset_dataset: train_wblog_set = random.sample(swblog, int(len(swblog) * self.train_per)) + random.sample(wblog, int( len(wblog) * self.train_per)) test_wblog_set = list(set(swblog + wblog).difference(train_wblog_set)) # # 第二期改进代码 # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per)) # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int( # len(normal) * train_per))+random.sample(unknown, len(unknown)) # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown)) # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown else: train_wblog_set, test_wblog_set = Alkit.read_dataset( '../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') # 输出训练集和测试集的一些信息 logging.info('总数据集大小:%s' % (len(train_wblog_set)+len(test_wblog_set))) logging.info('训练集大小:%s' % len(train_wblog_set)) logging.info('训练集中正例(swblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(swblog))))) logging.info('训练集中负例(wblog)大小:%s' % len(list(set(train_wblog_set).intersection(set(wblog))))) logging.info('测试集大小:%s' % len(test_wblog_set)) logging.info('测试集中正例(swblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(swblog))))) logging.info('测试集中负例(wblog)大小:%s' % len(list(set(test_wblog_set).intersection(set(wblog))))) else: raise ('we will implement this later.') # 将训练集和测试集从数据库中读出来,以顺序字典存储(调用vlues()输出的list顺序和插入顺序一致) feature_dict_data, result_dict_data = self.load_data(train_wblog_set, swblog, wblog) train_feature, train_result = Alkit.process_data(feature_dict_data, result_dict_data) logging.info('训练集数据处理完毕') feature_dict_data, result_dict_data = self.load_data(test_wblog_set, swblog, wblog) test_feature, test_result = Alkit.process_data(feature_dict_data, result_dict_data) logging.info('测试集数据处理完毕') # 使用svm训练并输出结果 # logging.info('\nSVM开始训练') # model = SVC(class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR训练并输出结果 logging.info('LR开始训练') model = LogisticRegression(class_weight='balanced') model.fit(train_feature, train_result) logging.info('训练结束') predict_result = 
model.predict(test_feature) logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR输出概率形式的结果 predict_result_proba = model.predict_proba(test_feature) prp = [] for prob in predict_result_proba: prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # 将LR跑出来的两种结果保存下来,供下一步使用 if self.dump: logging.info("保存结果输出到 " + '../main/prior/wblog_train' + self.file_name_appendix + '.txt' + "和" + '../main/prior/wblog_prior' + self.file_name_appendix + '.txt') Alkit.write_prior('../main/prior/wblog_train' + self.file_name_appendix + '.txt', '../main/prior/wblog_prior' + self.file_name_appendix + '.txt', train_wblog_set, train_result, test_wblog_set, test_result, predict_result, prp)
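
# Sanity-check sketch for the downsampling step above (a hypothetical helper, not
# used by the class): with the number of negatives n = len(wblog) fixed and a target
# spam ratio p = spam_per, solving s / (s + n) = p for the number of positives s
# gives s = p * n / (1 - p), which is exactly
# int(self.spam_per * len(wblog) * 1.0 / (1 - self.spam_per)).
def _expected_spam_number(n_normal, spam_per):
    assert 0 <= spam_per < 1
    return int(spam_per * n_normal / (1 - spam_per))

# Worked example: n_normal = 800, spam_per = 0.2 -> 200 positives,
# and indeed 200 / (200 + 800) = 0.2.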
def run(self): """ 从数据库中读取特征数据,并使用svm和lr分类 水军占比例(max): 0.2325521503991759 spammer_per <= 0.2325521503991759 :return: """ if not self.add_unknown_into_model: # 首先划分训练集用户和测试集用户 spammer = self.sqlhelper.select_sql_one('SELECT uid FROM spammer') normal = self.sqlhelper.select_sql_one( 'SELECT uid FROM normal WHERE choose="yes"') # unknown = self.sqlhelper.select_sql_one('SELECT uid FROM normal WHERE choose="not"') final_user = self.sqlhelper.select_sql_one( 'SELECT uid FROM final_user WHERE spammer="yes"') """ final_user: 3843个用户, 水军903, 非水军2940 normal: 13906个用户, 水军和非水军未知,为此我们通过人工的方法从从这些用户中挑选了一些正常的用户,标记为choose='yes' spammer: 892个水军用户 """ for uid in final_user: if uid not in spammer: spammer.append(uid) """ 到这为止, 代码中spammer相当于数据表里spammer U final_user.spammer一共有903 """ # 不知道为什么spammer和normal两个集合有重合的用户 # 所以这里简单地将这些重合的用户都认为是spammer for uid in spammer: if uid in normal: normal.remove(uid) # if uid in unknown: # unknown.remove(uid) """ 到目前为止,我们得到了下面几个有用的东西 spammer: 水军 normal: 正常用户 unkonwn:还没来得及标注的未知类型用户 """ logging.info('原始数据水军占比例(max): %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer)))) if self.spammer_per > len(spammer) * 1.0 / (len(normal) + len(spammer)): logging.info( 'we don\'t have so much spammers in our datasets, we will keep original percentage' ) else: expected_spammer_number = int(self.spammer_per * len(normal) * 1.0 / (1 - self.spammer_per)) spammer = random.sample(spammer, expected_spammer_number) # print(len(spammer)) if self.reset_dataset: train_user_set = random.sample( spammer, int( len(spammer) * self.train_per)) + random.sample( normal, int(len(normal) * self.train_per)) test_user_set = list( set(spammer + normal).difference(train_user_set)) # # 第二期改进代码 # train_user_set_without_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int(len(normal) * train_per)) # train_user_set_with_unknown = random.sample(spammer, int(len(spammer) * train_per)) + random.sample(normal, int( # len(normal) * train_per))+random.sample(unknown, len(unknown)) # test_user_set = list(set(spammer + normal).difference(train_user_set_without_unknown)) # train_user_set=train_user_set_with_unknown+train_user_set_with_unknown else: train_user_set, test_user_set = Alkit.read_dataset( '../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt') # 输出训练集和测试集的一些信息 logging.info('数据集总大小:%s' % (len(train_user_set) + len(test_user_set))) logging.info('训练集大小:%s' % len(train_user_set)) logging.info( '训练集中正例(spammer)大小:%s' % len(list(set(train_user_set).intersection(set(spammer))))) logging.info( '训练集中负例(normal)大小:%s' % len(list(set(train_user_set).intersection(set(normal))))) # logging.info('训练集中未知标签(unknown)大小:%s' % len(list(set(unknown)))) logging.info('测试集大小:%s' % len(test_user_set)) logging.info( '测试集中正例(spammer)大小:%s' % len(list(set(test_user_set).intersection(set(spammer))))) logging.info( '测试集中负例(normal)大小:%s' % len(list(set(test_user_set).intersection(set(normal))))) logging.info('水军占比例: %s' % (len(spammer) * 1.0 / (len(normal) + len(spammer)))) """ 测试集参与训练,但是测试集在模型训练期间标签将按照unknown处理 """ else: raise ('we will implement this later.') # 将训练集和测试集从数据库中读出来,以顺序字典存储(调用vlues()输出的list顺序和插入顺序一致) feature_dict_data, result_dict_data = self.load_data( train_user_set, spammer, normal) train_feature, train_result = Alkit.process_data( feature_dict_data, result_dict_data) logging.info('训练集数据处理完毕') feature_dict_data, result_dict_data = self.load_data( test_user_set, spammer, normal) test_feature, test_result = 
Alkit.process_data(feature_dict_data, result_dict_data) logging.info('测试集数据处理完毕') # print(metrics.mutual_info_score(train_result, train_feature)) # 使用svm训练并输出结果 # logging.info('\nSVM开始训练') # model = SVC(class_weight='balanced') # model.fit(train_feature, train_result) # logging.info('训练结束') # predict_result = model.predict(test_feature) # logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) # logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) # logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # import minepy # m = minepy.MINE() # for i in range(7): # m.compute_score(train_feature[:,i], train_result) # print(m.mic()) # 使用LR训练并输出结果 logging.info('LR开始训练') model = LogisticRegression(class_weight='balanced') model.fit(train_feature, train_result) logging.info('训练结束') predict_result = model.predict(test_feature) logging.info('准确率:%s' % metrics.precision_score(test_result, predict_result)) logging.info('召回率:%s' % metrics.recall_score(test_result, predict_result)) logging.info('F1:%s' % metrics.f1_score(test_result, predict_result)) # 使用LR输出概率形式的结果 predict_result_proba = model.predict_proba(test_feature) prp = [] for prob in predict_result_proba: prp.append(float(prob[0]) * -1 + float(prob[1]) * 1) # 将LR跑出来的两种结果保存下来,供下一步使用 if self.dump: logging.info("保存结果输出到 " + '../main/prior/user_train' + self.file_name_appendix + '.txt 和' + '../main/prior/user_prior' + self.file_name_appendix + '.txt') Alkit.write_prior( '../main/prior/user_train' + self.file_name_appendix + '.txt', '../main/prior/user_prior' + self.file_name_appendix + '.txt', train_user_set, train_result, test_user_set, test_result, predict_result, prp)
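
# The manual split used above (random.sample per class, then a set difference for
# the test side) is in effect a stratified train/test split. A minimal equivalent
# sketch with scikit-learn, assuming `spammer` and `normal` are lists of uids as in
# run(); shown for clarity only, not used by the pipeline:
def _stratified_split_demo(spammer, normal, train_per=0.8):
    from sklearn.model_selection import train_test_split

    ids = list(spammer) + list(normal)
    labels = [1] * len(spammer) + [0] * len(normal)   # 1 = spammer, 0 = normal
    train_user_set, test_user_set = train_test_split(
        ids, train_size=train_per, stratify=labels, random_state=0)
    return train_user_set, test_user_set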