def __init__(self, use_redis=False, debug=True, file_name_head=''):
    """Initialise the base analysis object, the training-data paths and the label maps.

    :param use_redis: forwarded unchanged to ``QQZoneAnalysis.__init__``.
    :param debug: forwarded unchanged to ``QQZoneAnalysis.__init__``.
    :param file_name_head: passed to the base class as ``username`` and embedded
        in every file name built here.
    """
    QQZoneAnalysis.__init__(
        self,
        use_redis=use_redis,
        debug=debug,
        username=file_name_head,
        analysis_friend=False,
    )

    # Shared prefixes for the file names assembled below.
    train_prefix = BASE_DIR + 'data/train/' + file_name_head
    relative_train_dir = '../data/train/'
    relative_train3_dir = '../data/train3/'

    self.MOOD_DATA_SCORE_FILE_NAME = train_prefix + '_score_mood_data.csv'
    self.RE_DO_SENTIMENT_FILE_NAME = train_prefix + '_re_do_mood_data.csv'
    self.TEXT_LABEL_TRAIN_DATA = train_prefix + '_mood_text.csv'
    self.TRAIN_DATA_AFTER_CLASSIFIC = train_prefix + '_mood_classific.csv'
    self.TEXT_LABEL_RESULT_TRAIN_DATA = relative_train3_dir + 'text_' + file_name_head + '_label.csv'
    self.TEXT_CLASSIFICATION_DATA_SET = relative_train_dir
    self.FINAL_RESULT_TRAIN_DATA = relative_train_dir + file_name_head + '_final_train.csv'

    # MOOD_DATA_FILE_NAME is not set in this method; presumably the base
    # initialiser provides it -- TODO confirm.
    self.mood_data_df = pd.read_csv(self.MOOD_DATA_FILE_NAME)

    self.IMAGE_OBJECT_FILE_NAME = relative_train3_dir + file_name_head + '_image_object.csv'
    self.MOOD_DATA_AFTER_OBJECT = relative_train_dir + file_name_head + '_after_object.csv'

    self.sc = SentimentClassify()
    # Every row starts with the string '-1' in the score column.
    self.mood_data_df['score'] = '-1'

    # Label codes are the strings '1'..'7', in this order.
    label_names = [
        '旅游与运动',
        '爱情与家庭',
        '学习与工作',
        '广告',
        '生活日常',
        '其他',
        '人生感悟',
    ]
    self.label_dict = {str(code): label for code, label in enumerate(label_names, start=1)}
    self.label_dict_reverse = {label: code for code, label in self.label_dict.items()}
def test_generate_friend_info(self):
    """Fetch friend moods, extract the useful info, then draw the like and comment clouds."""
    analysis = QQZoneAnalysis(mood_num=200, use_redis=False, debug=True)
    # Friend QQ numbers are best configured in resource/config/friend_info.json,
    # which also allows passing them in batches. A QQ number passed here takes
    # priority over the configuration file.
    analysis.get_friend_mood(friend_qq='')
    analysis.get_useful_info_from_json()
    analysis.draw_like_cloud(analysis.mood_data_df)
    analysis.draw_cmt_cloud(analysis.mood_data_df)
def test_get_most_people(self):
    """Run ``get_most_people``, print the resulting user-info fields and save the user."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.get_most_people()
    for field in ("total_like_list", "like_friend_name", "cmt_friend_name"):
        print(getattr(analysis.user_info, field))
    analysis.user_info.save_user()
    print("只点赞的好友数量:", analysis.user_info.like_friend_num)
def generate_friend_info():
    """
    Fetch friends' QZone data and run the data analysis on it.
    :return:
    """
    options = dict(use_redis=False, debug=False, analysis_friend=False)
    analysis = QQZoneAnalysis(**options)
    # Friend QQ numbers are best configured in resource/config/friend_info.json,
    # which also allows passing them in batches. A QQ number passed here takes
    # priority over the configuration file.
    analysis.get_friend_mood(friend_qq='')
    do_analysis_for_all(analysis)
def test_draw_like_cloud(self):
    """Extract the useful info and draw the like cloud from the mood data frame."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.get_useful_info_from_json()
    analysis.draw_like_cloud(analysis.mood_data_df)
def test_get_content_top_words(self):
    """Extract the useful info and call ``get_top_words`` on the mood data frame."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.get_useful_info_from_json()
    analysis.get_top_words(analysis.mood_data_df)
def web_interface(username, nickname, stop_time, mood_num, cookie_text, no_delete, password, pool_flag):
    """Log in, crawl and analyse one web user's data, reporting progress via ``sp.re``.

    Progress and failure messages are pushed onto the list stored under
    ``WEB_SPIDER_INFO + username``. ``CLEAN_DATA_KEY + username`` is set to 1
    once the whole pipeline has finished.

    :param username: account identifier; also used as the suffix of the store keys.
    :param nickname: forwarded to ``QQZoneAnalysis``.
    :param stop_time: forwarded to ``QQZoneAnalysis``.
    :param mood_num: forwarded to ``QQZoneAnalysis``.
    :param cookie_text: forwarded to ``QQZoneAnalysis``.
    :param no_delete: forwarded to ``QQZoneAnalysis``.
    :param password: stored under ``USER_MAP_KEY`` after a successful login.
    :param pool_flag: forwarded to ``QQZoneAnalysis``.
    :raises SystemExit: with code 1 when starting or joining the crawler threads fails.
    """
    sp = QQZoneAnalysis(use_redis=True, debug=False, username=username, analysis_friend=True, from_web=True,
                        nickname=nickname, stop_time=stop_time, mood_num=mood_num, no_delete=no_delete,
                        cookie_text=cookie_text, pool_flag=pool_flag)
    info_key = WEB_SPIDER_INFO + username

    try:
        sp.login()
        sp.re.rpush(info_key, "用户" + str(sp.username) + "登陆成功")
        # Store the user's password.
        # NOTE(review): the password is written as received -- confirm this is acceptable.
        sp.re.hset(USER_MAP_KEY, username, password)
    except Exception:
        # A failed login is reported as a login failure. The original pushed
        # GET_MAIN_PAGE_FAILED here, swapped with the main-page handler below.
        sp.re.rpush(info_key, LOGIN_FAILED)

    try:
        sp.get_main_page_info()
        sp.re.rpush(info_key, "获取主页信息成功")
        sp.re.rpush(info_key, MOOD_NUM_PRE + ":" + str(sp.mood_num))
    except Exception:
        sp.re.rpush(info_key, GET_MAIN_PAGE_FAILED)

    try:
        # One thread fetches the mood list, the other the friend details.
        # ``daemon=False`` replaces the deprecated ``Thread.setDaemon(False)``.
        mood_thread = threading.Thread(target=sp.get_mood_list, daemon=False)
        friend_thread = threading.Thread(target=sp.get_friend_detail, daemon=False)
        mood_thread.start()
        friend_thread.start()
        # Wait for both threads to finish.
        mood_thread.join()
        friend_thread.join()
    except Exception:
        sp.re.rpush(info_key, GET_MOOD_FAILED)
        # ``exit()`` comes from the ``site`` module and is meant for the
        # interactive shell; raise SystemExit directly instead.
        raise SystemExit(1)

    # Clean the friend data.
    sp.clean_friend_data()
    # Fetch the first friend's information.
    sp.get_first_friend_info()
    # Clean the mood data and find who liked / commented the most.
    sp.get_most_people()
    # Save the mood data.
    sp.export_mood_df()
    sp.calculate_history_like_agree()
    sp.re.set(CLEAN_DATA_KEY + username, 1)
def web_interface(username, nickname, stop_time, mood_num, cookie_text, no_delete, password, pool_flag):
    """Log in by QR code, crawl and analyse one web user's data, reporting via ``sp.re``.

    Progress and failure messages are pushed onto the list stored under
    ``WEB_SPIDER_INFO + username``. ``USER_LOGIN_STATE + username`` is 0 until
    the login succeeds and 1 afterwards. When everything is done the finished
    user counter is incremented and the user is removed from ``WAITING_USER_LIST``.

    :param username: account identifier; also used as the suffix of the store keys.
    :param nickname: forwarded to ``QQZoneAnalysis``.
    :param stop_time: forwarded to ``QQZoneAnalysis``.
    :param mood_num: forwarded to ``QQZoneAnalysis``.
    :param cookie_text: forwarded to ``QQZoneAnalysis``.
    :param no_delete: forwarded to ``QQZoneAnalysis``.
    :param password: stored under ``USER_MAP_KEY`` before logging in.
    :param pool_flag: forwarded to ``QQZoneAnalysis``.
    :raises SystemExit: with code 1 when the login fails or the crawler threads
        cannot be started or joined.
    """
    sp = QQZoneAnalysis(use_redis=True, debug=False, username=username, analysis_friend=True, from_web=True,
                        nickname=nickname, stop_time=stop_time, mood_num=mood_num, no_delete=no_delete,
                        cookie_text=cookie_text, pool_flag=pool_flag)
    info_key = WEB_SPIDER_INFO + username
    login_state_key = USER_LOGIN_STATE + username

    # NOTE(review): the password is written as received -- confirm this is acceptable.
    sp.re.hset(USER_MAP_KEY, username, password)
    sp.re.set(login_state_key, 0)
    sp.logging_info(username + "init success")

    try:
        state = sp.login_with_qr_code()
        sp.remove_qr_code()
        if state:
            # Record the logged-in state.
            sp.logging_info(username + "logging success")
            sp.re.rpush(info_key, "用户" + str(sp.username) + "登陆成功")
            sp.re.set(login_state_key, 1)
    except Exception as e:
        sp.format_error(e, "logging failed")
        state = False

    if not state:
        # Stop this worker when the login failed. The exit is raised outside
        # the ``try`` so it is not caught by the handler above; the original
        # caught its own SystemExit and reported the failure twice.
        sp.logging_info(username + "logging failed")
        sp.re.rpush(info_key, LOGIN_FAILED)
        raise SystemExit(1)

    sp.get_main_page_info()
    sp.logging_info("get main page success")

    try:
        # One thread fetches the mood list, the other the friend details.
        # ``daemon=False`` replaces the deprecated ``Thread.setDaemon(False)``.
        mood_thread = threading.Thread(target=sp.get_mood_list, daemon=False)
        friend_thread = threading.Thread(target=sp.get_friend_detail, daemon=False)
        mood_thread.start()
        friend_thread.start()
        # Wait for both threads to finish.
        mood_thread.join()
        friend_thread.join()
    except Exception:
        sp.re.rpush(info_key, GET_MOOD_FAILED)
        raise SystemExit(1)

    sp.re.set(MOOD_FINISH_KEY + str(username), 1)
    sp.logging_info("finish to capture data")
    sp.logging_info("begin to analysis...")
    # Analyse all data once the crawl has completed.
    do_analysis_for_all(sp)
    sp.user_info.save_user()
    sp.logging_info("finish to analysis")
    sp.re.set(CLEAN_DATA_KEY + username, 1)

    # Increment the finished-user counter; a missing key counts as 0.
    now_user = sp.re.get(FINISH_USER_NUM_KEY)
    now_user = 0 if now_user is None else int(now_user)
    sp.re.set(FINISH_USER_NUM_KEY, now_user + 1)

    # Remove the current user from the waiting list. Note that the argument
    # order of this command differs between redis client versions.
    sp.re.lrem(WAITING_USER_LIST, 0, username)
    sp.logging_info("finish to delete user from waiting list")

    sp.logging_info("Success!")
def test_most_common_friend(self):
    """Call ``get_most_common_friend`` on an analysis object built with ``export_csv=True``."""
    options = dict(use_redis=True, export_csv=True)
    analysis = QQZoneAnalysis(**options)
    analysis.get_most_common_friend()
def test_load_data(self):
    """Call ``load_file_from_redis`` and print the length of ``content``."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.load_file_from_redis()
    content_length = len(analysis.content)
    print("data len:", content_length)
def test_get_most_people(self):
    """Run ``get_most_people`` and print the like / comment friend names."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.get_most_people()
    for field in ("like_friend_name", "cmt_friend_name"):
        print(getattr(analysis.user_info, field))
def setUp(self) -> None:
    """Create the analysis object under test and switch it to the fixture user."""
    analysis = QQZoneAnalysis(use_redis=True)
    self.qa = analysis
    analysis.change_username("458246290", "fuyuko")
class FriendAnalysisTest(unittest.TestCase):
    """Tests that drive ``QQZoneAnalysis`` after switching it to a fixed friend account."""

    def setUp(self) -> None:
        """Create the analysis object and switch it to the fixture user."""
        self.qa = QQZoneAnalysis(use_redis=True)
        self.qa.change_username("458246290", "fuyuko")

    def tearDown(self) -> None:
        """Nothing to clean up."""
        pass

    def test_init(self) -> None:
        """Construction itself is exercised by ``setUp``."""
        pass

    def test_get_friend_data(self):
        """Reset the username, then fetch 20 moods for the fixture friend."""
        self.qa.reset_username()
        self.qa.get_friend_mood("458246290", "fuyuko", mood_num=20)

    def test_clean_friend_data(self):
        """``get_useful_info_from_json`` must leave ``has_clean_data`` truthy."""
        self.qa.get_useful_info_from_json()
        # assertTrue rather than a bare ``assert ... == True``: bare asserts
        # are stripped under ``python -O``, which would make this test a no-op.
        self.assertTrue(self.qa.has_clean_data)

    def test_draw_friend_cmt_cloud(self):
        """Draw the comment cloud from the extracted mood data."""
        self.qa.get_useful_info_from_json()
        self.qa.draw_cmt_cloud(self.qa.mood_data_df)

    # Find the people who liked and commented the most.
    def test_get_friend_most_people(self):
        """Run ``get_most_people`` and print the like / comment friend names."""
        self.qa.get_most_people()
        print(self.qa.user_info.like_friend_name)
        print(self.qa.user_info.cmt_friend_name)

    # Draw the word cloud of mood keywords.
    def test_draw_content_cloud(self):
        """Draw the content cloud from the extracted mood data."""
        self.qa.get_useful_info_from_json()
        self.qa.draw_content_cloud(self.qa.mood_data_df)

    # Draw the word cloud of the people who liked.
    def test_draw_like_cloud(self):
        """Draw the like cloud from the extracted mood data."""
        self.qa.get_useful_info_from_json()
        self.qa.draw_like_cloud(self.qa.mood_data_df)

    def test_get_history(self):
        """Compute the like history and print the length of the stored value."""
        self.qa.calculate_history_like_agree()
        print(len(self.qa.re.get(self.qa.history_like_agree_file_name)))

    def test_export_data_df(self):
        """Call ``export_mood_df``."""
        self.qa.export_mood_df()
def web_interface(username, nickname, stop_time, mood_num, cookie_text, no_delete, password, pool_flag):
    """Log in by QR code, crawl and analyse one web user's data, reporting via ``sp.re``.

    Progress and failure messages are pushed onto the list stored under
    ``WEB_SPIDER_INFO + username``. ``USER_LOGIN_STATE + username`` is 0 until
    the login succeeds and 1 afterwards. After the analysis the finished user
    counter is incremented and the user is removed from ``WAITING_USER_LIST``.

    :param username: account identifier; also used as the suffix of the store keys.
    :param nickname: forwarded to ``QQZoneAnalysis``.
    :param stop_time: forwarded to ``QQZoneAnalysis``.
    :param mood_num: forwarded to ``QQZoneAnalysis``.
    :param cookie_text: forwarded to ``QQZoneAnalysis``.
    :param no_delete: forwarded to ``QQZoneAnalysis``.
    :param password: stored under ``USER_MAP_KEY`` before logging in.
    :param pool_flag: forwarded to ``QQZoneAnalysis``.
    :raises SystemExit: with code 1 when the login fails or the crawler threads
        cannot be started or joined.
    """
    sp = QQZoneAnalysis(use_redis=True, debug=False, username=username, analysis_friend=True, from_web=True,
                        nickname=nickname, stop_time=stop_time, mood_num=mood_num, no_delete=no_delete,
                        cookie_text=cookie_text, pool_flag=pool_flag)
    info_key = WEB_SPIDER_INFO + username
    login_state_key = USER_LOGIN_STATE + username

    # NOTE(review): the password is written as received -- confirm this is acceptable.
    sp.re.hset(USER_MAP_KEY, username, password)
    sp.re.set(login_state_key, 0)
    sp.logging_info(username + "init success")

    try:
        state = sp.login_with_qr_code()
        sp.remove_qr_code()
        if state:
            # Record the logged-in state.
            sp.logging_info(username + "logging success")
            sp.re.rpush(info_key, "用户" + str(sp.username) + "登陆成功")
            sp.re.set(login_state_key, 1)
    except Exception as e:
        sp.format_error(e, "logging failed")
        state = False

    if not state:
        # Stop this worker when the login failed. The exit is raised outside
        # the ``try`` so it is not caught by the handler above; the original
        # caught its own SystemExit and reported the failure twice.
        sp.logging_info(username + "logging failed")
        sp.re.rpush(info_key, LOGIN_FAILED)
        raise SystemExit(1)

    sp.get_main_page_info()
    sp.logging_info("get main page success")

    try:
        # One thread fetches the mood list, the other the friend details.
        # ``daemon=False`` replaces the deprecated ``Thread.setDaemon(False)``.
        mood_thread = threading.Thread(target=sp.get_mood_list, daemon=False)
        friend_thread = threading.Thread(target=sp.get_friend_detail, daemon=False)
        mood_thread.start()
        friend_thread.start()
        # Wait for both threads to finish.
        mood_thread.join()
        friend_thread.join()
    except Exception:
        sp.re.rpush(info_key, GET_MOOD_FAILED)
        raise SystemExit(1)

    sp.re.set(MOOD_FINISH_KEY + str(username), 1)

    # Clean the friend data; the friend statistics only run when that succeeded.
    friend_data_state = sp.clean_friend_data()
    if friend_data_state:
        # Fetch the first friend's information.
        sp.get_first_friend_info()
        # Find the friend with the most friends in common.
        sp.get_most_common_friend()
        # Find the groups in common.
        sp.get_most_group()

    sp.get_useful_info_from_json()
    if not sp.mood_data_df.empty:
        # Clean the mood data and find who liked / commented the most.
        sp.get_most_people()
        # Compute when the moods were posted.
        sp.calculate_send_time()
        sp.draw_cmt_cloud(sp.mood_data_df)
        sp.draw_like_cloud(sp.mood_data_df)
        # The keyword cloud of the mood contents is skipped because it is slow.
        # Save the mood data.
        sp.export_mood_df()
        sp.calculate_history_like_agree()

    sp.user_info.save_user()

    sp.re.set(CLEAN_DATA_KEY + username, 1)

    # Increment the finished-user counter; a missing key counts as 0.
    now_user = sp.re.get(FINISH_USER_NUM_KEY)
    now_user = 0 if now_user is None else int(now_user)
    sp.re.set(FINISH_USER_NUM_KEY, now_user + 1)

    # Remove the current user from the waiting list; the arguments of this
    # call are passed differently from redis-cli.
    # NOTE(review): the two-argument form matches redis-py 2.x ``lrem(name, value, num=0)``;
    # redis-py >= 3.0 expects ``lrem(name, count, value)`` -- confirm the installed client.
    sp.re.lrem(WAITING_USER_LIST, username)
def test_get_history(self):
    """Compute the like history and print the length of the value stored for it."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.calculate_history_like_agree()
    stored = analysis.re.get(analysis.history_like_agree_file_name)
    print(len(stored))
def test_init(self):
    """Constructing ``QQZoneAnalysis`` with its defaults must not raise."""
    QQZoneAnalysis()
def test_load_data_from_json(self):
    """Call ``load_all_data_from_json`` on an analysis object built without redis."""
    analysis = QQZoneAnalysis(use_redis=False)
    analysis.load_all_data_from_json()
def test_most_common_groups(self):
    """Call ``get_most_group`` on an analysis object built with ``export_csv=True``."""
    options = dict(use_redis=True, export_csv=True)
    analysis = QQZoneAnalysis(**options)
    analysis.get_most_group()
def test_clean_data(self):
    """``get_useful_info_from_json`` must leave ``has_clean_data`` truthy."""
    qa = QQZoneAnalysis(use_redis=False)
    qa.get_useful_info_from_json()
    # Test truthiness directly instead of comparing with ``== True``.
    assert qa.has_clean_data
def capture_main_data_and_analysis():
    """
    Start the crawler and analyse the captured data.
    :return:
    """
    options = dict(use_redis=False, debug=True, stop_time='2011-11-11', mood_num=20, analysis_friend=False)
    crawler = QQZoneAnalysis(**options)
    crawler.login_with_qr_code()
    crawler.get_main_page_info()
    crawler.get_mood_list()
    # Friend details are fetched only when friend analysis is enabled on the object.
    if crawler.analysis_friend:
        crawler.thread_num = 20
        crawler.get_friend_detail()
    do_analysis_for_all(crawler)
    crawler.user_info.save_user()
def test_calculate_send_time(self):
    """Call ``calculate_send_time`` and print ``user_info.is_night`` as a bool."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.calculate_send_time()
    is_night = bool(analysis.user_info.is_night)
    print("TEST IS NIGHT:", is_night)
def web_interface(username, nickname, stop_time, mood_num, cookie_text, no_delete, password, pool_flag):
    """Log in, crawl and analyse one web user's data, reporting progress via ``sp.re``.

    Progress and failure messages are pushed onto the list stored under
    ``WEB_SPIDER_INFO + username``. When the pipeline has finished,
    ``MOOD_FINISH_KEY`` and ``CLEAN_DATA_KEY`` are set for the user and the
    user is removed from ``WAITING_USER_LIST``.

    :param username: account identifier; also used as the suffix of the store keys.
    :param nickname: forwarded to ``QQZoneAnalysis``.
    :param stop_time: forwarded to ``QQZoneAnalysis``.
    :param mood_num: forwarded to ``QQZoneAnalysis``.
    :param cookie_text: forwarded to ``QQZoneAnalysis``.
    :param no_delete: forwarded to ``QQZoneAnalysis``.
    :param password: stored under ``USER_MAP_KEY``; removed again when the login fails.
    :param pool_flag: forwarded to ``QQZoneAnalysis``.
    :raises SystemExit: with code 1 when starting or joining the crawler threads fails.
    """
    sp = QQZoneAnalysis(use_redis=True, debug=False, username=username, analysis_friend=True, from_web=True,
                        nickname=nickname, stop_time=stop_time, mood_num=mood_num, no_delete=no_delete,
                        cookie_text=cookie_text, pool_flag=pool_flag)
    info_key = WEB_SPIDER_INFO + username

    try:
        # Store the user's password.
        # NOTE(review): the password is written as received -- confirm this is acceptable.
        sp.re.hset(USER_MAP_KEY, username, password)
        sp.logging_info(username + "init success")
        sp.login()
        sp.logging_info(username + "logging success")
        sp.re.rpush(info_key, "用户" + str(sp.username) + "登陆成功")
    except Exception as e:
        sp.format_error(e, "logging failed")
        sp.re.rpush(info_key, LOGIN_FAILED)
        # Delete the user's password again.
        sp.re.hdel(USER_MAP_KEY, username)
        # NOTE(review): execution continues after a failed login -- confirm this is intended.

    sp.get_main_page_info()
    sp.logging_info("get main page success")

    try:
        # One thread fetches the mood list, the other the friend details.
        # ``daemon=False`` replaces the deprecated ``Thread.setDaemon(False)``.
        mood_thread = threading.Thread(target=sp.get_mood_list, daemon=False)
        friend_thread = threading.Thread(target=sp.get_friend_detail, daemon=False)
        mood_thread.start()
        friend_thread.start()
        # Wait for both threads to finish.
        mood_thread.join()
        friend_thread.join()
    except Exception:
        sp.re.rpush(info_key, GET_MOOD_FAILED)
        # ``exit()`` comes from the ``site`` module and is meant for the
        # interactive shell; raise SystemExit directly instead.
        raise SystemExit(1)

    # Clean the friend data.
    sp.clean_friend_data()
    # Fetch the first friend's information.
    sp.get_first_friend_info()
    # Clean the mood data and find who liked / commented the most.
    sp.get_most_people()
    sp.calculate_send_time()
    sp.user_info.save_user()
    # Save the mood data.
    sp.export_mood_df()
    sp.re.set(MOOD_FINISH_KEY + str(username), 1)
    sp.calculate_history_like_agree()
    sp.re.set(CLEAN_DATA_KEY + username, 1)

    # Remove the current user from the waiting list; the arguments of this
    # call are passed differently from redis-cli.
    # NOTE(review): the two-argument form matches redis-py 2.x ``lrem(name, value, num=0)``;
    # redis-py >= 3.0 expects ``lrem(name, count, value)`` -- confirm the installed client.
    sp.re.lrem(WAITING_USER_LIST, username)
def test_calculate_early_send_time(self):
    """Call ``calculate_early_send_time`` and print ``user_info.early_mood_content``."""
    analysis = QQZoneAnalysis(use_redis=True)
    analysis.calculate_early_send_time()
    early_content = analysis.user_info.early_mood_content
    print(early_content)