예제 #1
0
 def draw_all_comments_wordcloud(self):
     '''
     Draw a word cloud from all comments of the song, using default parameters only.

     Comments are loaded from ``self.comments_file_path``. When that file is
     missing, ``self.save_all_comments_to_file()`` is called first to create it.
     The comment texts are concatenated, segmented with ``Helper.cut_text``,
     filtered against ``Helper.load_stopwords()`` and handed to
     ``self.draw_wordcloud`` together with the target path
     ``<self.song_path>/<self.song_name>_all_comments.png``.

     :return: None. Logs an error and returns early when no comments were loaded.
     '''
     # No cached comments file on disk yet: fetch and persist the comments first.
     if not os.path.exists(self.comments_file_path):
         self.save_all_comments_to_file()
     all_comments_list = Helper.load_file_format_json(self.comments_file_path)
     # Truthiness covers an empty result as well as a None result, so a failed
     # load is reported here instead of raising TypeError from len(None).
     if not all_comments_list:
         self.logger.error("Load %s failed!" % self.comments_file_path)
         return
     all_comments_content = "".join(
         [comment[Constants.COMMENT_CONTENT_KEY] for comment in all_comments_list]
     )
     stopwords = Helper.load_stopwords()
     # The word cloud input is a single space-separated string of kept tokens.
     wordcloud_text = " ".join(
         [word for word in Helper.cut_text(all_comments_content) if word not in stopwords]
     )
     save_path = os.path.join(self.song_path, "%s_all_comments.png" % self.song_name)
     self.draw_wordcloud(wordcloud_text, save_path)
예제 #2
0
    def core_visual_analyse(self):
        '''
        Core visualisation routine for comments and commenter profiles (drawn with pyecharts).

        Loads the comments file and the user-info file, then writes the charts
        below into ``<self.song_path>/<Constants.PLOTS_SAVE_NAME>``:

        1. Comment time distribution, per year-month and per year-month-day (bar charts)
        2. Like-count distribution (bar chart)
        3. Comment keyword distribution after stop-word removal (bar chart)
        4. Commenter city distribution (Geo map)
        5. Commenter location distribution (bar chart)
        6. User event-count distribution (bar chart)
        7. User follow-count distribution (bar chart)
        8. User fan-count distribution (bar chart)
        9. User self-description keyword distribution after stop-word removal (bar chart)
        10. User age distribution (bar chart)
        11. Total-songs-listened distribution, bucketed (bar chart)
        '''
        plots_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
        Helper.mkdir(plots_dir)

        def chart_path(file_name):
            # Full path of one rendered chart inside the plots directory.
            return os.path.join(plots_dir, file_name)

        def titled(template):
            # Fill the song name into a chart-title template.
            return template.format(song_name=self.song_name)

        def known_ints(records, key):
            # Integer values stored under `key`, skipping records marked with the unknown token.
            return [int(record[key]) for record in records
                    if record[key] != Constants.UNKNOWN_TOKEN]

        # All comments, then all user profiles.
        comments = Helper.load_file_format_json(self.comments_file_path)
        users = Helper.load_file_format_json(self.users_info_file_path)

        # 1. Comment time distribution (year-month and year-month-day).
        timestamps = [comment[Constants.CREATE_TIME_STAMP_KEY] for comment in comments]
        dates_by_month = []
        dates_by_day = []
        for millis in timestamps:
            # The stored value is scaled by 0.001 before conversion (per the
            # original comment: divide by 1000 to get the real timestamp).
            seconds = millis * 0.001
            dates_by_month.append(Helper.from_timestamp_to_date(seconds, format="%Y-%m"))
            dates_by_day.append(Helper.from_timestamp_to_date(seconds, format="%Y-%m-%d"))

        self.save_sorted_bar_plot(
            datas=dates_by_month,
            label="年-月",
            title=titled("歌曲<{song_name}>评论时间(年-月)数量分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
        )
        self.save_sorted_bar_plot(
            datas=dates_by_day,
            label="年-月-日",
            title=titled("歌曲<{song_name}>评论时间(年-月-日)数量分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)
        )

        # 2. Like-count distribution.
        like_counts = known_ints(comments, Constants.LIKED_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=like_counts,
            label="点赞数量",
            title=titled("歌曲<{song_name}>评论点赞数量分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_LIKED_COUNT_BAR_HTML)
        )

        # 3. Comment keyword distribution after stop-word removal.
        all_comment_text = "".join([comment[Constants.COMMENT_CONTENT_KEY] for comment in comments])
        raw_comment_words = Helper.cut_text(all_comment_text)
        stopwords = Helper.load_stopwords()

        def is_keyword(word):
            # Keep words that are not stop words and are longer than one character.
            return word not in stopwords and len(word) > 1

        comment_keywords = [word for word in raw_comment_words if is_keyword(word)]
        self.save_sorted_bar_plot(
            datas=comment_keywords,
            label="关键词",
            title=titled("歌曲<{song_name}>评论关键词数量分布(已去除停用词)"),
            key_index=1,
            save_path=chart_path(Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML),
            reverse=True
        )

        # 4. Commenter city distribution on a Geo map.
        locations = [user[Constants.LOCATION_KEY] for user in users]
        matched_cities = []
        supported_cities = Helper.load_echarts_support_cities()
        for location in locations:
            # First supported city whose name occurs inside the location string, if any.
            hit = next((city for city in supported_cities if city in location), None)
            if hit is not None:
                matched_cities.append(hit)
        city_counts = list(Counter(matched_cities).items())
        geo_chart = Geo(
            titled("歌曲<{song_name}>评论用户所在地区分布"),
            title_color="#fff",
            title_pos="left",
            width=1200,
            height=600,
            background_color='#404a59'
        )
        city_names, city_values = geo_chart.cast(city_counts)
        geo_chart.add(
            "",
            city_names,
            city_values,
            visual_range=[0, 200],
            visual_text_color="#fff",
            symbol_size=15,
            is_visualmap=True
        )
        geo_html_path = chart_path(Constants.ECHARTS_USERS_CITY_GEO_HTML)
        Helper.check_file_exits_and_overwrite(geo_html_path)
        geo_chart.render(geo_html_path)

        # 5. Commenter location distribution as a bar chart (raw location strings).
        self.save_sorted_bar_plot(
            datas=locations,
            label="用户所在地区",
            title=titled("歌曲<{song_name}>评论用户所在地区分布"),
            key_index=1,
            save_path=chart_path(Constants.ECHARTS_USERS_LOCATION_BAR_HTML),
            reverse=True
        )

        # 6. User event-count distribution.
        event_counts = known_ints(users, Constants.EVENT_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=event_counts,
            label="用户动态总数",
            title=titled("歌曲<{song_name}>评论用户动态总数分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)
        )

        # 7. User follow-count distribution.
        follow_counts = known_ints(users, Constants.FOLLOW_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=follow_counts,
            label="用户关注人数",
            title=titled("歌曲<{song_name}>评论用户关注人数分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)
        )

        # 8. User fan-count distribution.
        fan_counts = known_ints(users, Constants.FAN_COUNT_KEY)
        self.save_sorted_bar_plot(
            datas=fan_counts,
            label="用户粉丝人数",
            title=titled("歌曲<{song_name}>评论用户粉丝人数分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_FAN_COUNT_BAR_HTML)
        )

        # 9. User self-description keyword distribution after stop-word removal.
        all_description_text = "".join([user[Constants.USER_DESCRIPTION_KEY] for user in users])
        raw_description_words = Helper.cut_text(all_description_text)
        description_keywords = [word for word in raw_description_words if is_keyword(word)]
        self.save_sorted_bar_plot(
            datas=description_keywords,
            label="用户简介关键词",
            title=titled("歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)"),
            key_index=1,
            save_path=chart_path(Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML),
            reverse=True
        )

        # 10. User age distribution; negative ages are discarded.
        ages = [age for age in known_ints(users, Constants.USER_AGE_KEY) if age >= 0]
        self.save_sorted_bar_plot(
            datas=ages,
            label="年龄",
            title=titled("歌曲<{song_name}>评论用户年龄分布"),
            key_index=0,
            save_path=chart_path(Constants.ECHARTS_USER_AGE_BAR_HTML)
        )

        # 11. Total-songs-listened distribution, bucketed because the raw range is very wide.
        listen_counts = known_ints(users, Constants.LISTENING_SONGS_NUM_KEY)
        listen_buckets = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
        # Exclusive upper bound of every bucket except the open-ended last one.
        bucket_bounds = ((100, '0-100'), (1000, '100-1000'), (10000, '1000-10000'))
        for count in listen_counts:
            bucket = next((name for bound, name in bucket_bounds if count < bound), '>10000')
            listen_buckets[bucket] += 1

        self.save_sorted_bar_plot(
            datas=listen_buckets,
            label="听歌总数",
            title=titled("歌曲<{song_name}>评论用户听歌总数分布"),
            key_index=1,
            save_path=chart_path(Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML),
            reverse=True
        )