def draw_all_comments_wordcloud(self):
    '''
    Draw a word cloud image from all comments of the song, using the
    default drawing parameters throughout.

    The comments are read from ``self.comments_file_path``. If that file is
    not on disk yet, ``self.save_all_comments_to_file()`` is called first.
    The segmented text (stop words removed) is handed to
    ``self.draw_wordcloud`` together with the target path
    ``<self.song_path>/<self.song_name>_all_comments.png``.

    :return: None. When no comments could be loaded an error is logged and
        nothing is drawn.
    '''
    # If the comments file is not on disk yet, fetch the comments and save
    # them to disk first.
    if not os.path.exists(self.comments_file_path):
        self.save_all_comments_to_file()

    all_comments_list = Helper.load_file_format_json(self.comments_file_path)
    # Truthiness instead of ``len(...) == 0``: an empty result is still
    # reported as a failed load, and a loader that hands back None no longer
    # raises TypeError here.
    if not all_comments_list:
        self.logger.error("Load %s failed!" % self.comments_file_path)
        return

    all_comments_content = "".join(
        comment[Constants.COMMENT_CONTENT_KEY] for comment in all_comments_list
    )

    stopwords = Helper.load_stopwords()
    # Membership is tested once per token; hashing a list/tuple of stop
    # words makes each test O(1). Other container types are left untouched.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)

    wordcloud_text = " ".join(
        word
        for word in Helper.cut_text(all_comments_content)
        if word not in stopwords
    )
    save_path = os.path.join(
        self.song_path, "%s_all_comments.png" % self.song_name
    )
    self.draw_wordcloud(wordcloud_text, save_path)
def core_visual_analyse(self):
    '''
    Core visualisation routine for the comments and the commenting users,
    drawn with pyecharts.

    Comments are loaded from ``self.comments_file_path`` and user info from
    ``self.users_info_file_path``. Every chart is saved below
    ``<self.song_path>/<Constants.PLOTS_SAVE_NAME>`` under the file name
    given by the matching ``Constants.ECHARTS_*`` constant:

    1. Comment time distribution, by month and by day, bar charts
    2. Like-count distribution, bar chart
    3. Comment keyword distribution after stop-word removal, bar chart
    4. User location distribution, shown on a map
    5. User location distribution, bar chart
    6. User event-count distribution, bar chart
    7. User follow-count distribution, bar chart
    8. User fan-count distribution, bar chart
    9. User self-description keyword distribution after stop-word removal,
       bar chart
    10. User age distribution, bar chart
    11. User total listened-songs distribution, bar chart

    A data file that loads as empty (or None) is reported through
    ``self.logger.error``; the charts are still rendered from whatever data
    is available.

    :return: None
    '''
    plot_save_path = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
    Helper.mkdir(plot_save_path)

    def load_records(file_path):
        # Load one JSON data file; report an empty/None result and
        # normalise it to [] so the comprehensions below never iterate None.
        records = Helper.load_file_format_json(file_path)
        if not records:
            self.logger.error("Load %s failed!" % file_path)
            return []
        return records

    def known_ints(records, key):
        # Integer values stored under ``key``, skipping records whose value
        # is the UNKNOWN_TOKEN placeholder.
        return [
            int(record[key])
            for record in records
            if record[key] != Constants.UNKNOWN_TOKEN
        ]

    def keywords_of(records, key, stopwords):
        # Segment the concatenated text under ``key`` and drop stop words
        # and tokens shorter than two characters.
        text = "".join(record[key] for record in records)
        return [
            keyword
            for keyword in Helper.cut_text(text)
            if keyword not in stopwords and len(keyword) > 1
        ]

    def save_bar(datas, label, title_template, key_index, file_name,
                 **plot_kwargs):
        # Fill in the song name and the output path, then delegate to
        # save_sorted_bar_plot. ``reverse`` is forwarded only when the call
        # site passes it, so the callee's default applies otherwise.
        self.save_sorted_bar_plot(
            datas=datas,
            label=label,
            title=title_template.format(song_name=self.song_name),
            key_index=key_index,
            save_path=os.path.join(plot_save_path, file_name),
            **plot_kwargs
        )

    # Load all comments.
    comments_list = load_records(self.comments_file_path)
    # Load all user info.
    users_info_list = load_records(self.users_info_file_path)

    # 1. Comment time distribution, by month and by day, bar charts.
    comments_date_year_month = []      # "year-month" formatted dates
    comments_date_year_month_day = []  # "year-month-day" formatted dates
    for comment in comments_list:
        # The stored value has to be divided by 1000 to obtain the actual
        # timestamp (presumably it is in milliseconds).
        timestamp = comment[Constants.CREATE_TIME_STAMP_KEY] * 0.001
        comments_date_year_month.append(
            Helper.from_timestamp_to_date(timestamp, format="%Y-%m")
        )
        comments_date_year_month_day.append(
            Helper.from_timestamp_to_date(timestamp, format="%Y-%m-%d")
        )
    save_bar(
        comments_date_year_month,
        "年-月",
        "歌曲<{song_name}>评论时间(年-月)数量分布",
        0,
        Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML
    )
    save_bar(
        comments_date_year_month_day,
        "年-月-日",
        "歌曲<{song_name}>评论时间(年-月-日)数量分布",
        0,
        Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML
    )

    # 2. Like-count distribution, bar chart.
    save_bar(
        known_ints(comments_list, Constants.LIKED_COUNT_KEY),
        "点赞数量",
        "歌曲<{song_name}>评论点赞数量分布",
        0,
        Constants.ECHARTS_LIKED_COUNT_BAR_HTML
    )

    # 3. Comment keyword distribution after stop-word removal, bar chart.
    stopwords = Helper.load_stopwords()
    # Membership is tested once per token; hashing a list/tuple of stop
    # words makes each test O(1). Other container types are left untouched.
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    save_bar(
        keywords_of(comments_list, Constants.COMMENT_CONTENT_KEY, stopwords),
        "关键词",
        "歌曲<{song_name}>评论关键词数量分布(已去除停用词)",
        1,
        Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML,
        reverse=True
    )

    # 4. User location distribution, shown on a map.
    users_location = [
        user_info[Constants.LOCATION_KEY] for user_info in users_info_list
    ]
    all_support_cities = Helper.load_echarts_support_cities()
    users_city = []  # city each user is in
    for location in users_location:
        # Keep the first supported city whose name occurs in the location
        # string; locations matching no supported city are left off the map.
        for city in all_support_cities:
            if city in location:
                users_city.append(city)
                break
    users_city_data = list(Counter(users_city).items())
    users_city_geo = Geo(
        "歌曲<{song_name}>评论用户所在地区分布".format(song_name=self.song_name),
        title_color="#fff",
        title_pos="left",
        width=1200,
        height=600,
        background_color='#404a59'
    )
    attr, value = users_city_geo.cast(users_city_data)
    users_city_geo.add(
        "",
        attr,
        value,
        visual_range=[0, 200],
        visual_text_color="#fff",
        symbol_size=15,
        is_visualmap=True
    )
    users_city_save_path = os.path.join(
        plot_save_path, Constants.ECHARTS_USERS_CITY_GEO_HTML
    )
    # NOTE(review): presumably clears a previous render at this path so the
    # new one replaces it -- confirm against Helper.
    Helper.check_file_exits_and_overwrite(users_city_save_path)
    users_city_geo.render(users_city_save_path)

    # 5. User location distribution, bar chart.
    save_bar(
        users_location,
        "用户所在地区",
        "歌曲<{song_name}>评论用户所在地区分布",
        1,
        Constants.ECHARTS_USERS_LOCATION_BAR_HTML,
        reverse=True
    )

    # 6. User event-count distribution, bar chart.
    save_bar(
        known_ints(users_info_list, Constants.EVENT_COUNT_KEY),
        "用户动态总数",
        "歌曲<{song_name}>评论用户动态总数分布",
        0,
        Constants.ECHARTS_EVENTS_COUNT_BAR_HTML
    )

    # 7. User follow-count distribution, bar chart.
    save_bar(
        known_ints(users_info_list, Constants.FOLLOW_COUNT_KEY),
        "用户关注人数",
        "歌曲<{song_name}>评论用户关注人数分布",
        0,
        Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML
    )

    # 8. User fan-count distribution, bar chart.
    save_bar(
        known_ints(users_info_list, Constants.FAN_COUNT_KEY),
        "用户粉丝人数",
        "歌曲<{song_name}>评论用户粉丝人数分布",
        0,
        Constants.ECHARTS_FAN_COUNT_BAR_HTML
    )

    # 9. User self-description keyword distribution after stop-word
    #    removal, bar chart.
    save_bar(
        keywords_of(users_info_list, Constants.USER_DESCRIPTION_KEY, stopwords),
        "用户简介关键词",
        "歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)",
        1,
        Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML,
        reverse=True
    )

    # 10. User age distribution, bar chart.
    age_count_list = [
        age
        for age in known_ints(users_info_list, Constants.USER_AGE_KEY)
        if age >= 0  # an age must be greater than or equal to 0
    ]
    save_bar(
        age_count_list,
        "年龄",
        "歌曲<{song_name}>评论用户年龄分布",
        0,
        Constants.ECHARTS_USER_AGE_BAR_HTML
    )

    # 11. Total listened-songs distribution, bar chart.
    # The counts are bucketed because their range is too wide to plot raw.
    listening_songs_dict = {
        '0-100': 0,
        '100-1000': 0,
        '1000-10000': 0,
        '>10000': 0
    }
    for count in known_ints(users_info_list,
                            Constants.LISTENING_SONGS_NUM_KEY):
        if count < 100:
            listening_songs_dict['0-100'] += 1
        elif count < 1000:
            listening_songs_dict['100-1000'] += 1
        elif count < 10000:
            listening_songs_dict['1000-10000'] += 1
        else:
            listening_songs_dict['>10000'] += 1
    save_bar(
        listening_songs_dict,
        "听歌总数",
        "歌曲<{song_name}>评论用户听歌总数分布",
        1,
        Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML,
        reverse=True
    )