def save_all_users_info_to_file_by_multi_threading(self, threads=10):
    '''
    Save the info of all users to disk, using several threads to speed it up.

    The urls returned by ``self.load_all_users_url()`` are cut into
    contiguous slices whose sizes differ by at most one; every non-empty
    slice is passed to ``self.save_users_info`` in its own thread.

    :param threads: maximum number of worker threads, must be >= 1
    :raises ValueError: if ``threads`` is smaller than 1
    '''
    # Reject bad values up front: 0 used to fail with ZeroDivisionError and
    # negative values silently did nothing.
    if threads < 1:
        raise ValueError(
            "threads must be >= 1, got {threads}".format(threads=threads))
    Helper.check_file_exits_and_overwrite(self.users_info_file_path)
    start_time = time.time()
    users_url = self.load_all_users_url()
    num = len(users_url)
    # The first `extra` workers take one more url than the others, so the
    # remainder is not dumped on the last worker.
    base, extra = divmod(num, threads)
    # initialize the counter
    self.no_counter_init()
    threads_list = []
    begin = 0
    for i in range(threads):
        end = begin + base + (1 if i < extra else 0)
        # With fewer urls than threads some slices are empty; no thread is
        # started for those.
        if end > begin:
            threads_list.append(
                Thread(target=self.save_users_info,
                       args=(users_url[begin:end], num)))
        begin = end
    for t in threads_list:
        t.start()
    for t in threads_list:
        t.join()
    end_time = time.time()
    self.logger.info(
        "Using {threads} threads to save users info done,costs {cost_time} seconds"
        .format(threads=threads, cost_time=(end_time - start_time)))
def draw_wordcloud(self, cutted_words_text, save_path,
                   background_path=None, font_path=None,
                   max_words=2000, max_font_size=40, background_color='white'):
    '''
    Draw a word cloud and save the image to disk.

    :param cutted_words_text: text that is already segmented, words separated by spaces
    :param save_path: path the image is written to
    :param background_path: path of the background (mask) image,
        ``Constants.DEFAULT_BACKGROUND_PATH`` when None
    :param font_path: path of the font file,
        ``Constants.DEFAULT_FONT_PATH`` when None
    :param max_words: maximum number of words
    :param max_font_size: maximum font size
    :param background_color: background color
    :return: None
    '''
    Helper.check_file_exits_and_overwrite(save_path)
    # Only fall back to the defaults when the caller passed nothing at all.
    mask_path = (Constants.DEFAULT_BACKGROUND_PATH
                 if background_path is None else background_path)
    font_file = Constants.DEFAULT_FONT_PATH if font_path is None else font_path
    cloud_options = dict(
        font_path=font_file,
        background_color=background_color,
        mask=imread(mask_path),
        max_words=max_words,
        max_font_size=max_font_size,
    )
    # build the cloud from the text and write it straight to disk
    WordCloud(**cloud_options).generate(cutted_words_text).to_file(save_path)
    self.logger.info("Successfully generate wordcloud img to {save_path}!".format(save_path=save_path))
def save_singer_all_hot_comments_to_file(self):
    '''
    Save all hot comments of the singer's hot songs to disk.

    Takes no arguments: the singer is read from ``self.singer_url`` and
    ``self.singer_name``, the target file from
    ``self.singer_all_hot_comments_file_path``. Logs an error and returns
    without writing comments when no song ids could be fetched.
    '''
    target_path = self.singer_all_hot_comments_file_path
    Helper.check_file_exits_and_overwrite(target_path)
    # list of ids of the singer's hot songs
    hot_song_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
    if len(hot_song_ids) == 0:
        self.logger.error(
            "crawl from %s to get %s all hot songs ids failed!" %
            (self.singer_url, self.singer_name))
        return
    url_template = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token="
    # First collect the hot comments of every song ...
    collected_comments = []
    for sid in hot_song_ids:
        collected_comments += self.get_hot_comments(
            url_template.format(song_id=sid))
    # ... then turn each comment into one json line.
    json_lines = list(
        map(self.extract_comment_info_as_json_str, collected_comments))
    Helper.save_lines_to_file(json_lines, target_path)
    self.logger.info(
        "Write {singer_name}'s {num} hot songs hot comments successfully!".
        format(singer_name=self.singer_name, num=len(hot_song_ids)))
def save_all_users_info_to_file(self):
    '''
    Save the info of all users of one song to disk.

    Writes the list returned by ``self.get_users_info_list()`` to
    ``self.users_info_file_path``.

    :return: None
    '''
    output_path = self.users_info_file_path
    Helper.check_file_exits_and_overwrite(output_path)
    Helper.save_lines_to_file(self.get_users_info_list(), output_path)
def save_sorted_bar_plot(self, datas, label, title, key_index,
                         save_path, reverse=False):
    '''
    Count the values in ``datas``, sort the (value, count) pairs and save
    them as a bar chart.

    :param datas: input data; handed to ``collections.Counter``, so either
        an iterable of hashable values or a mapping value -> count
    :param label: label of the data series
    :param title: chart title
    :param key_index: index of the sort key inside a (value, count) pair:
        0 sorts by value, 1 sorts by count
    :param save_path: path the chart is rendered to
    :param reverse: sort in descending order when True (ascending by default)
    :return: None
    '''
    Helper.check_file_exits_and_overwrite(save_path)
    pairs = sorted(Counter(datas).items(),
                   key=itemgetter(key_index), reverse=reverse)
    # zip(*[]) yields nothing, so unpacking into x, y used to raise
    # ValueError for empty input; fall back to empty axes instead.
    x, y = zip(*pairs) if pairs else ((), ())
    bar = Bar(title)
    bar.add(label, x, y)
    bar.render(save_path)
def save_all_comments_to_file(self):
    '''
    Fetch all comments sequentially and save them to disk.

    Each comment returned by ``self.get_all_comments()`` is converted to a
    json string and written to ``self.comments_file_path``; the elapsed
    time is printed afterwards.

    :return: None
    '''
    Helper.check_file_exits_and_overwrite(self.comments_file_path)
    started_at = time.time()
    # comment dict -> json str, one entry per comment
    json_lines = list(
        map(self.extract_comment_info_as_json_str, self.get_all_comments()))
    Helper.save_lines_to_file(json_lines, self.comments_file_path)
    elapsed = time.time() - started_at
    print("It costs %.2f seconds to crawler <%s>." % (elapsed, self.song_name))
def save_lyrics_to_file(self):
    '''
    Save the lyrics of the song to disk.

    The file ``<song_name>_lyrics.txt`` inside ``self.song_path`` gets the
    song name, the singer name and then one lyric line per row, with the
    leading ``[mm:ss.xx]`` time tag removed. When the response carries no
    lyric text, only the two header rows are written and a warning is
    logged.

    :return: None
    '''
    save_path = os.path.join(
        self.song_path,
        "{song_name}_lyrics.txt".format(song_name=self.song_name))
    Helper.check_file_exits_and_overwrite(save_path)
    lyrics_json = json.loads(self.get_lyrics_format_json())
    # The 'lrc' / 'lyric' entries may be missing or empty; treat that as
    # "no lyrics" instead of failing with KeyError / TypeError.
    lrc = lyrics_json.get('lrc') if isinstance(lyrics_json, dict) else None
    lyrics_str = lrc.get('lyric') if isinstance(lrc, dict) else None
    if not lyrics_str:
        self.logger.warning(
            "no lyric text found for <%s>, only the header is written" %
            self.song_name)
        lyrics_str = ''
    # Time tag followed by the text of the line. A line ends at '\n' or at
    # the end of the string, so a last line without trailing newline is
    # kept as well.
    pattern = r'\[\d+:\d+\.\d+\](.+?)(?:\n|$)'
    lyrics_list = re.findall(pattern, lyrics_str)
    with open(save_path, "w", encoding="utf-8") as f:
        f.write("{song_name}\n{singer_name}\n".format(
            song_name=self.song_name, singer_name=self.singer_name))
        f.writelines(line + "\n" for line in lyrics_list)
    self.logger.info(
        "save {save_path} successfully!".format(save_path=save_path))
def save_all_comments_to_file_by_multi_threading(self, threads=10):
    '''
    Save all comments to disk using several threads.

    The pages reported by ``self.get_song_total_comments_num_and_page_num()``
    are cut into consecutive ``(begin_page, end_page)`` ranges whose sizes
    differ by at most one; every non-empty range is passed to
    ``self.save_pages_comments`` in its own thread.

    :param threads: maximum number of worker threads, must be >= 1
    :raises ValueError: if ``threads`` is smaller than 1
    '''
    # Reject bad values up front: 0 used to fail with ZeroDivisionError and
    # negative values silently did nothing.
    if threads < 1:
        raise ValueError(
            "threads must be >= 1, got {threads}".format(threads=threads))
    # initialize the counter
    self.no_counter_init()
    # check whether the output file already exists
    Helper.check_file_exits_and_overwrite(self.comments_file_path)
    start_time = time.time()
    total_comments_num, page = self.get_song_total_comments_num_and_page_num()
    self.logger.info(
        "Song name:{song_name}".format(song_name=self.song_name))
    self.logger.info("There are %d pages of total %d comments!" %
                     (page, total_comments_num))
    # The first `extra` workers take one more page than the others, so the
    # remainder is not dumped on the last worker.
    base, extra = divmod(page, threads)
    threads_list = []
    begin_page = 0
    for i in range(threads):
        end_page = begin_page + base + (1 if i < extra else 0)
        # With fewer pages than threads some ranges are empty; no thread is
        # started for those.
        if end_page > begin_page:
            threads_list.append(
                Thread(target=self.save_pages_comments,
                       args=(begin_page, end_page, total_comments_num)))
        # Each range starts where the previous one ended, as before.
        begin_page = end_page
    for t in threads_list:
        t.start()
    for t in threads_list:
        t.join()
    end_time = time.time()
    self.logger.info(
        "Using {threads} threads,it costs {cost_time} seconds to crawl <{song_name}>'s all comments!"
        .format(threads=threads,
                cost_time=(end_time - start_time),
                song_name=self.song_name))
def core_visual_analyse(self):
    '''
    Core function: visualize the comments and the user info with pyecharts.

    Charts are written below ``self.song_path``:
    1. distribution of comment time, per month and per day, bar charts
    2. distribution of like counts, bar chart
    3. distribution of comment keywords after removing stopwords, bar chart
    4. distribution of user locations, shown on a map
    5. distribution of user locations, bar chart
    6. distribution of user event counts, bar chart
    7. distribution of the number of people users follow, bar chart
    8. distribution of user fan counts, bar chart
    9. distribution of keywords in user descriptions after removing stopwords, bar chart
    10. distribution of user age, bar chart
    11. distribution of the total number of songs users listened to, bar chart
    '''
    plot_save_path = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
    Helper.mkdir(plot_save_path)

    def bar(datas, label, title_template, key_index, file_name, reverse=False):
        # Fill the song name into the title and render into the plot folder.
        self.save_sorted_bar_plot(
            datas=datas,
            label=label,
            title=title_template.format(song_name=self.song_name),
            key_index=key_index,
            save_path=os.path.join(plot_save_path, file_name),
            reverse=reverse
        )

    def known_ints(records, key):
        # Integer values of `key`, skipping records marked as unknown.
        return [int(record[key]) for record in records
                if record[key] != Constants.UNKNOWN_TOKEN]

    def meaningful(words, ignored):
        # Drop stopwords and words shorter than two characters.
        return [word for word in words
                if word not in ignored and len(word) > 1]

    # load all comments
    comments_list = Helper.load_file_format_json(self.comments_file_path)
    # load all user info
    users_info_list = Helper.load_file_format_json(self.users_info_file_path)

    # 1. distribution of comment time, per month and per day
    timestamps_ms = [comment[Constants.CREATE_TIME_STAMP_KEY]
                     for comment in comments_list]
    year_month_dates = []      # dates formatted as year-month
    year_month_day_dates = []  # dates formatted as year-month-day
    for stamp_ms in timestamps_ms:
        # the stored value has to be divided by 1000 to get the real timestamp
        seconds = stamp_ms * 0.001
        year_month_dates.append(
            Helper.from_timestamp_to_date(seconds, format="%Y-%m"))
        year_month_day_dates.append(
            Helper.from_timestamp_to_date(seconds, format="%Y-%m-%d"))
    bar(year_month_dates, "年-月",
        "歌曲<{song_name}>评论时间(年-月)数量分布",
        0, Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
    bar(year_month_day_dates, "年-月-日",
        "歌曲<{song_name}>评论时间(年-月-日)数量分布",
        0, Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)

    # 2. distribution of like counts
    bar(known_ints(comments_list, Constants.LIKED_COUNT_KEY), "点赞数量",
        "歌曲<{song_name}>评论点赞数量分布",
        0, Constants.ECHARTS_LIKED_COUNT_BAR_HTML)

    # 3. distribution of comment keywords after removing stopwords
    all_comments_text = "".join(
        [comment[Constants.COMMENT_CONTENT_KEY] for comment in comments_list])
    raw_comment_words = Helper.cut_text(all_comments_text)
    stopwords = Helper.load_stopwords()
    bar(meaningful(raw_comment_words, stopwords), "关键词",
        "歌曲<{song_name}>评论关键词数量分布(已去除停用词)",
        1, Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML, reverse=True)

    # 4. distribution of user locations on a map
    users_location = [user_info[Constants.LOCATION_KEY]
                      for user_info in users_info_list]
    supported_cities = Helper.load_echarts_support_cities()
    users_city = []  # city of each user, when one could be matched
    for location in users_location:
        # the first supported city contained in the location string wins
        matched_city = next(
            (city for city in supported_cities if city in location), None)
        if matched_city is not None:
            users_city.append(matched_city)
    city_counts = list(Counter(users_city).items())
    city_geo = Geo(
        "歌曲<{song_name}>评论用户所在地区分布".format(song_name=self.song_name),
        title_color="#fff", title_pos="left",
        width=1200, height=600, background_color='#404a59')
    attr, value = city_geo.cast(city_counts)
    city_geo.add("", attr, value, visual_range=[0, 200],
                 visual_text_color="#fff", symbol_size=15, is_visualmap=True)
    city_geo_path = os.path.join(
        plot_save_path, Constants.ECHARTS_USERS_CITY_GEO_HTML)
    Helper.check_file_exits_and_overwrite(city_geo_path)
    city_geo.render(city_geo_path)

    # 5. distribution of user locations as a bar chart
    bar(users_location, "用户所在地区",
        "歌曲<{song_name}>评论用户所在地区分布",
        1, Constants.ECHARTS_USERS_LOCATION_BAR_HTML, reverse=True)

    # 6. distribution of user event counts
    bar(known_ints(users_info_list, Constants.EVENT_COUNT_KEY), "用户动态总数",
        "歌曲<{song_name}>评论用户动态总数分布",
        0, Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)

    # 7. distribution of the number of people users follow
    bar(known_ints(users_info_list, Constants.FOLLOW_COUNT_KEY), "用户关注人数",
        "歌曲<{song_name}>评论用户关注人数分布",
        0, Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)

    # 8. distribution of user fan counts
    bar(known_ints(users_info_list, Constants.FAN_COUNT_KEY), "用户粉丝人数",
        "歌曲<{song_name}>评论用户粉丝人数分布",
        0, Constants.ECHARTS_FAN_COUNT_BAR_HTML)

    # 9. distribution of keywords in user descriptions after removing stopwords
    all_descriptions_text = "".join(
        [user_info[Constants.USER_DESCRIPTION_KEY]
         for user_info in users_info_list])
    raw_description_words = Helper.cut_text(all_descriptions_text)
    bar(meaningful(raw_description_words, stopwords), "用户简介关键词",
        "歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)",
        1, Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML, reverse=True)

    # 10. distribution of user age; an age has to be >= 0
    valid_ages = [age
                  for age in known_ints(users_info_list, Constants.USER_AGE_KEY)
                  if age >= 0]
    bar(valid_ages, "年龄",
        "歌曲<{song_name}>评论用户年龄分布",
        0, Constants.ECHARTS_USER_AGE_BAR_HTML)

    # 11. distribution of the total number of songs listened to;
    # the counts are put into buckets because their range is very wide
    listening_songs_dict = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
    bucket_limits = (('0-100', 100), ('100-1000', 1000), ('1000-10000', 10000))
    for songs_num in known_ints(users_info_list,
                                Constants.LISTENING_SONGS_NUM_KEY):
        for bucket, upper_limit in bucket_limits:
            if songs_num < upper_limit:
                listening_songs_dict[bucket] += 1
                break
        else:
            # not below any limit
            listening_songs_dict['>10000'] += 1
    bar(listening_songs_dict, "听歌总数",
        "歌曲<{song_name}>评论用户听歌总数分布",
        1, Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML, reverse=True)