示例#1
0
    def download_singer_hot_songs_by_name_with_multi_threading(
            self, singer_name, threads=20):
        '''
        Download all hot songs of a singer using several worker threads.

        The song list is cut into contiguous chunks whose sizes differ by at
        most one, and each chunk is handed to
        ``self._download_list_songs_to_file`` in its own thread. The number
        of workers actually started never exceeds the number of songs, so no
        thread is created just to receive an empty chunk (one worker is still
        started when there are no songs at all).

        :param singer_name: name of the singer
        :param threads: maximum number of worker threads; values below 1 are
            treated as 1
        :return: None
        '''
        start_time = time.time()
        # Directory the hot songs are saved into
        save_path = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                                 Constants.HOT_SONGS_SAVE_NAME)
        # Resolve the singer id from the name
        uid = self.get_singer_id_by_name(singer_name)
        # Singer home page
        singer_url = "http://music.163.com/artist?id=%d" % uid
        # Ids of all hot songs of the singer
        hot_songs_ids = Helper.get_singer_hot_songs_ids(singer_url)
        # Download urls for those ids
        urls_list = self.get_download_urls_by_ids(hot_songs_ids)
        # "song name + singer name" strings used as file names
        songs_name_and_singer_name_str_list = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            hot_songs_ids)
        # Total number of hot songs
        total = len(urls_list)
        Helper.mkdir(save_path)

        # Clamp the worker count to [1, total] so that a small song list (or
        # a non-positive ``threads`` argument) neither divides by zero nor
        # piles every song onto a single thread.
        worker_count = max(1, min(threads, total))

        self.logger.info("%s has total %d hot songs!" % (singer_name, total))
        self.logger.info(
            "(multi threads,thread_num = %d)Now start download hot musics of %s(save path is:%s):"
            % (worker_count, singer_name, save_path))
        # Reset the counter before the workers start
        # NOTE(review): presumably the shared progress counter used by the
        # worker method - confirm against _download_list_songs_to_file.
        self.no_counter = 0

        # The first ``extra`` workers take one additional song each, which
        # spreads the remainder instead of giving it all to the last thread.
        pack, extra = divmod(total, worker_count)
        threads_list = []
        begin_index = 0
        for i in range(worker_count):
            end_index = begin_index + pack + (1 if i < extra else 0)
            urls = urls_list[begin_index:end_index]
            save_list = [
                os.path.join(save_path, "%s.mp3" % name) for name in
                songs_name_and_singer_name_str_list[begin_index:end_index]
            ]
            threads_list.append(
                Thread(target=self._download_list_songs_to_file,
                       args=(urls, save_list, total)))
            begin_index = end_index

        for thread in threads_list:
            thread.start()
        for thread in threads_list:
            thread.join()
        end_time = time.time()
        self.logger.info("Download %s's %d hot songs to %s succeed!"
                         "Costs %.2f seconds!" %
                         (singer_name, total, save_path,
                          (end_time - start_time)))
示例#2
0
    def download_play_list_songs(self, play_list_id, limit=1000):
        '''
        Download every song of a play list, one after another (single thread).

        A failed download does not abort the run: the failure is logged
        together with the exception that caused it and the loop moves on to
        the next song.

        :param play_list_id: id of the play list
        :param limit: maximum number of songs, forwarded to
            ``self.get_play_list_detail``
        :return: None
        '''
        start_time = time.time()
        # Fetch the play list detail
        res = self.get_play_list_detail(play_list_id, limit).json()
        play_list = res['playlist']
        # Ids of the songs in the play list
        songs_id_list = [content['id'] for content in play_list["trackIds"]]
        # Name of the play list
        play_list_name = play_list['name']
        # Directory the songs are saved into
        save_path = os.path.join(Constants.PLAY_LIST_SAVE_DIR, play_list_name)
        Helper.mkdir(save_path)
        # "song name + singer name" strings used as file names
        songs_name_and_singer_name_str_list = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            songs_id_list)
        # Download urls of the songs
        urls_list = self.get_download_urls_by_ids(songs_id_list)
        # Total number of songs
        total = len(urls_list)
        self.logger.info("play list %s has total %d songs!" %
                         (play_list_name, total))
        self.logger.info(
            "(single thread)Now start download musics of %s(save path is:%s):"
            % (play_list_name, save_path))
        for index, url in enumerate(urls_list, 1):
            song_label = songs_name_and_singer_name_str_list[index - 1]
            try:
                Helper.download_network_resource(
                    url, os.path.join(save_path, "%s.mp3" % song_label))
            except Exception as e:
                # Best-effort download: keep going, but record why it failed
                # instead of silently dropping the exception.
                self.logger.info("Fail download %d/%d(%s)! Reason: %s" %
                                 (index, total, song_label, e))
            else:
                self.logger.info("Successfully download %d/%d(%s)!" %
                                 (index, total, song_label))
        end_time = time.time()
        self.logger.info(
            "It costs %.2f seconds to download play list %s(id=%s)'s %d songs to %s "
            "using single thread!" % ((end_time - start_time), play_list_name,
                                      play_list_id, total, save_path))
示例#3
0
    def download_singer_hot_songs_by_name(self, singer_name):
        '''
        Download all hot songs of a singer, one after another (single thread).

        A failed download does not abort the run: the failure is logged
        together with the exception that caused it and the loop moves on to
        the next song.

        :param singer_name: name of the singer
        :return: None
        '''
        start_time = time.time()
        # Directory the hot songs are saved into
        save_path = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                                 Constants.HOT_SONGS_SAVE_NAME)
        # Resolve the singer id from the name
        uid = self.get_singer_id_by_name(singer_name)
        # Singer home page
        singer_url = "http://music.163.com/artist?id=%d" % uid
        # Ids of all hot songs of the singer
        hot_songs_ids = Helper.get_singer_hot_songs_ids(singer_url)
        # Download urls for those ids
        urls_list = self.get_download_urls_by_ids(hot_songs_ids)
        # "song name + singer name" strings used as file names
        songs_name_and_singer_name_str_list = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            hot_songs_ids)
        # Total number of hot songs
        total = len(urls_list)
        Helper.mkdir(save_path)
        self.logger.info("%s has total %d hot songs!" % (singer_name, total))
        self.logger.info(
            "(single thread)Now start download hot musics of %s(save path is:%s):"
            % (singer_name, save_path))
        for index, url in enumerate(urls_list, 1):
            song_label = songs_name_and_singer_name_str_list[index - 1]
            try:
                Helper.download_network_resource(
                    url, os.path.join(save_path, "%s.mp3" % song_label))
            except Exception as e:
                # Best-effort download: keep going, but record why it failed
                # instead of silently dropping the exception.
                self.logger.info("Fail download %d/%d(%s)! Reason: %s" %
                                 (index, total, song_label, e))
            else:
                self.logger.info("Successfully download %d/%d(%s)!" %
                                 (index, total, song_label))
        end_time = time.time()
        self.logger.info(
            "It costs %.2f seconds to download singer %s's %d hot songs to %s "
            "using single thread!" %
            ((end_time - start_time), singer_name, total, save_path))
示例#4
0
    def core_visual_analyse(self):
        '''
        Core visualisation routine for comments and user info (pyecharts).

        Charts are produced in this order:
        1. comment time distribution, by month and by day (bar charts)
        2. like-count distribution (bar chart)
        3. comment keyword distribution after stopword removal (bar chart)
        4. user location distribution (geo map)
        5. user location distribution (bar chart)
        6. user event-count distribution (bar chart)
        7. user follow-count distribution (bar chart)
        8. user fan-count distribution (bar chart)
        9. user description keyword distribution after stopword removal
           (bar chart)
        10. user age distribution (bar chart)
        11. user total-listened-songs distribution (bar chart)
        '''
        out_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
        Helper.mkdir(out_dir)
        # Load every comment and every user record
        comments_list = Helper.load_file_format_json(self.comments_file_path)
        users_info_list = Helper.load_file_format_json(self.users_info_file_path)

        def bar(datas, label, title, key_index, html_name, **extra):
            # Forward to save_sorted_bar_plot; ``extra`` carries ``reverse``
            # only for the charts that set it explicitly.
            self.save_sorted_bar_plot(
                datas=datas,
                label=label,
                title=title.format(song_name=self.song_name),
                key_index=key_index,
                save_path=os.path.join(out_dir, html_name),
                **extra
            )

        def known_ints(records, key):
            # Integer values of ``key`` for records where it is not the
            # unknown token.
            return [int(record[key]) for record in records
                    if record[key] != Constants.UNKNOWN_TOKEN]

        def without_stopwords(words, banned):
            # Drop stopwords and words shorter than two characters.
            return [word for word in words
                    if word not in banned and len(word) > 1]

        # 1. Comment time distribution, by month and by day
        stamps = [comment[Constants.CREATE_TIME_STAMP_KEY]
                  for comment in comments_list]
        by_month = []  # "year-month" strings
        by_day = []    # "year-month-day" strings
        for stamp in stamps:
            # The stored stamp is scaled by 0.001 before conversion
            by_month.append(
                Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m"))
            by_day.append(
                Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m-%d"))

        bar(by_month, "年-月",
            "歌曲<{song_name}>评论时间(年-月)数量分布",
            0, Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
        bar(by_day, "年-月-日",
            "歌曲<{song_name}>评论时间(年-月-日)数量分布",
            0, Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)

        # 2. Like-count distribution
        bar(known_ints(comments_list, Constants.LIKED_COUNT_KEY),
            "点赞数量",
            "歌曲<{song_name}>评论点赞数量分布",
            0, Constants.ECHARTS_LIKED_COUNT_BAR_HTML)

        # 3. Comment keyword distribution after stopword removal
        all_comments = "".join(
            [comment[Constants.COMMENT_CONTENT_KEY] for comment in comments_list])
        comment_words = Helper.cut_text(all_comments)
        stopwords = Helper.load_stopwords()
        bar(without_stopwords(comment_words, stopwords),
            "关键词",
            "歌曲<{song_name}>评论关键词数量分布(已去除停用词)",
            1, Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML,
            reverse=True)

        # 4. User location distribution on a map
        users_location = [user[Constants.LOCATION_KEY] for user in users_info_list]
        supported_cities = Helper.load_echarts_support_cities()
        no_match = object()
        users_city = []
        for location in users_location:
            # Keep only the first supported city contained in the location
            city = next(
                (name for name in supported_cities if name in location),
                no_match)
            if city is not no_match:
                users_city.append(city)
        city_counts = list(Counter(users_city).items())
        geo_chart = Geo(
            "歌曲<{song_name}>评论用户所在地区分布".format(song_name=self.song_name),
            title_color="#fff", title_pos="left",
            width=1200, height=600, background_color='#404a59')
        attr, value = geo_chart.cast(city_counts)
        geo_chart.add("", attr, value, visual_range=[0, 200],
                      visual_text_color="#fff", symbol_size=15,
                      is_visualmap=True)
        geo_html = os.path.join(out_dir, Constants.ECHARTS_USERS_CITY_GEO_HTML)
        Helper.check_file_exits_and_overwrite(geo_html)
        geo_chart.render(geo_html)

        # 5. User location distribution as a bar chart
        bar(users_location, "用户所在地区",
            "歌曲<{song_name}>评论用户所在地区分布",
            1, Constants.ECHARTS_USERS_LOCATION_BAR_HTML,
            reverse=True)

        # 6. User event-count distribution
        bar(known_ints(users_info_list, Constants.EVENT_COUNT_KEY),
            "用户动态总数",
            "歌曲<{song_name}>评论用户动态总数分布",
            0, Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)

        # 7. User follow-count distribution
        bar(known_ints(users_info_list, Constants.FOLLOW_COUNT_KEY),
            "用户关注人数",
            "歌曲<{song_name}>评论用户关注人数分布",
            0, Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)

        # 8. User fan-count distribution
        bar(known_ints(users_info_list, Constants.FAN_COUNT_KEY),
            "用户粉丝人数",
            "歌曲<{song_name}>评论用户粉丝人数分布",
            0, Constants.ECHARTS_FAN_COUNT_BAR_HTML)

        # 9. User description keyword distribution after stopword removal
        all_descriptions = "".join(
            [user[Constants.USER_DESCRIPTION_KEY] for user in users_info_list])
        description_words = Helper.cut_text(all_descriptions)
        bar(without_stopwords(description_words, stopwords),
            "用户简介关键词",
            "歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)",
            1, Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML,
            reverse=True)

        # 10. User age distribution; negative ages are discarded
        ages = [age
                for age in known_ints(users_info_list, Constants.USER_AGE_KEY)
                if age >= 0]
        bar(ages, "年龄",
            "歌曲<{song_name}>评论用户年龄分布",
            0, Constants.ECHARTS_USER_AGE_BAR_HTML)

        # 11. Total-listened-songs distribution, bucketed because the raw
        # range is very wide
        listened = known_ints(users_info_list, Constants.LISTENING_SONGS_NUM_KEY)
        listening_songs_dict = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
        upper_bounds = ((100, '0-100'), (1000, '100-1000'), (10000, '1000-10000'))
        for count in listened:
            bucket = next(
                (name for limit, name in upper_bounds if count < limit),
                '>10000')
            listening_songs_dict[bucket] += 1

        bar(listening_songs_dict, "听歌总数",
            "歌曲<{song_name}>评论用户听歌总数分布",
            1, Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML,
            reverse=True)
示例#5
0
 def __init__(self, song_name, singer_name, song_id=None, singer_id=None):
     '''
     Record the song/singer identity and prepare the on-disk layout.

     When either id is missing, login details are read from the user
     config and a NetCloudLogin session is used to look the id up by name.

     NOTE: if that lookup raises, the error is logged and the constructor
     returns early, leaving only ``self.logger`` set on the instance.

     :param song_name: name of the song
     :param singer_name: name of the singer
     :param song_id: id of the song, looked up by name when None
     :param singer_id: id of the singer, looked up by name when None
     '''
     self.logger = Helper.get_logger()
     ids_complete = song_id is not None and singer_id is not None
     if not ids_complete:
         # Login details come from the config file on the user's machine
         settings = Helper._parse_config_xml()
         phone, password, email, remember_login = [
             settings[field]
             for field in ('phone', 'password', 'email', 'rememberLogin')
         ]
         try:
             session = NetCloudLogin(phone, password, email, remember_login)
             if song_id is None:
                 song_id = session.get_song_id_by_name(song_name)
                 self.logger.info(
                     "Login to get %s's song_id(=%s) succeed!" %
                     (song_name, song_id))
             if singer_id is None:
                 singer_id = session.get_singer_id_by_name(singer_name)
                 self.logger.info(
                     "Login to get %s's singer_id(=%s) succeed!" %
                     (singer_name, singer_id))
         except Exception as err:
             self.logger.error("NetCloud login failed:%s" % err)
             self.logger.error(
                 "Please fullfill singer_id and song_id parameter"
                 " or check your login info in %s!" %
                 Constants.USER_CONFIG_FILE_PATH)
             return

     self.song_name = song_name
     self.song_id = song_id
     self.singer_name = singer_name
     self.singer_id = singer_id
     self.comments_url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token=".format(
         song_id=song_id)
     self.singer_url = 'http://music.163.com/artist?id={singer_id}'.format(
         singer_id=singer_id)

     # Root directory for everything that gets downloaded (songs, comments, ...)
     self.singer_root_dir = Constants.SINGER_SAVE_DIR
     Helper.mkdir(self.singer_root_dir)
     # One sub-directory per singer
     self.singer_path = os.path.join(self.singer_root_dir, singer_name)
     Helper.mkdir(self.singer_path)
     # One sub-directory per song, below the singer directory
     self.song_path = os.path.join(self.singer_path, song_name)
     Helper.mkdir(self.song_path)

     # File holding all comments of the song
     self.comments_file_path = os.path.join(
         self.song_path, song_name + "_all_comments.json")
     # File holding the user info records
     self.users_info_file_path = os.path.join(
         self.song_path, Constants.USER_INFO_FILENAME)
     # File holding the hot comments of all of the singer's hot songs
     self.singer_all_hot_comments_file_path = os.path.join(
         self.singer_path, Constants.SINGER_ALL_HOT_COMMENTS_FILENAME)

     # Counter
     self.no_counter = 0
     # Lock guarding file writes and the counter across threads
     self.lock = Lock()
示例#6
0
    def download_play_list_songs_by_multi_threading(self,
                                                    play_list_id,
                                                    limit=1000,
                                                    threads=20):
        '''
        Download every song of a play list using several worker threads.

        The song list is cut into contiguous chunks whose sizes differ by at
        most one, and each chunk is handed to
        ``self._download_list_songs_to_file`` in its own thread. The number
        of workers actually started never exceeds the number of songs, so no
        thread is created just to receive an empty chunk (one worker is still
        started when there are no songs at all).

        :param play_list_id: id of the play list
        :param limit: maximum number of songs, forwarded to
            ``self.get_play_list_detail``
        :param threads: maximum number of worker threads; values below 1 are
            treated as 1
        :return: None
        '''
        start_time = time.time()
        # Fetch the play list detail
        res = self.get_play_list_detail(play_list_id, limit).json()
        play_list = res['playlist']
        # Ids of the songs in the play list
        songs_id_list = [content['id'] for content in play_list["trackIds"]]
        # Name of the play list
        play_list_name = play_list['name']
        # Directory the songs are saved into
        save_path = os.path.join(Constants.PLAY_LIST_SAVE_DIR, play_list_name)
        Helper.mkdir(save_path)
        # "song name + singer name" strings used as file names
        songs_name_and_singer_name_str_list = self.get_songs_name_and_singer_name_str_list_by_ids_list(
            songs_id_list)
        # Download urls of the songs
        urls_list = self.get_download_urls_by_ids(songs_id_list)
        # Total number of songs
        total = len(urls_list)

        # Clamp the worker count to [1, total] so that a small play list (or
        # a non-positive ``threads`` argument) neither divides by zero nor
        # piles every song onto a single thread.
        worker_count = max(1, min(threads, total))

        self.logger.info("play list %s has total %d songs!" %
                         (play_list_name, total))
        self.logger.info(
            "(multi threads,thread_num = %d)Now start download musics of %s(save path is:%s):"
            % (worker_count, play_list_name, save_path))

        # Reset the counter before the workers start
        # NOTE(review): presumably the shared progress counter used by the
        # worker method - confirm against _download_list_songs_to_file.
        self.no_counter = 0

        # The first ``extra`` workers take one additional song each, which
        # spreads the remainder instead of giving it all to the last thread.
        pack, extra = divmod(total, worker_count)
        threads_list = []
        begin_index = 0
        for i in range(worker_count):
            end_index = begin_index + pack + (1 if i < extra else 0)
            urls = urls_list[begin_index:end_index]
            save_list = [
                os.path.join(
                    save_path,
                    "%s.mp3" % songs_name_and_singer_name_str_list[index])
                for index in range(begin_index, end_index)
            ]
            threads_list.append(
                Thread(target=self._download_list_songs_to_file,
                       args=(urls, save_list, total)))
            begin_index = end_index

        for thread in threads_list:
            thread.start()
        for thread in threads_list:
            thread.join()
        end_time = time.time()
        self.logger.info(
            "Download play list %s(id=%s)'s all %d songs to %s succeed!"
            "Costs %.2f seconds!" %
            (play_list_name, play_list_id, total, save_path,
             (end_time - start_time)))
示例#7
0
import getpass
import os
import platform
import shutil

from netcloud.util import Helper

PROJECT_NAME = "NetCloud"
UNKNOWN_TOKEN = "unknown"  # Marker used for unknown values
MUSIC163_BASE_URL = "http://music.163.com"

# Default root directory for downloaded files
DEFAULT_SAVE_ROOT_DIR = Helper._get_default_save_root_dir()
# Location of the configuration files on the user's machine.
# Note: Helper.mkdir below is called when this module is imported.
USER_CONFIG_DIR = "%s/config" % DEFAULT_SAVE_ROOT_DIR
Helper.mkdir(USER_CONFIG_DIR)
USER_CONFIG_FILE_PATH = "%s/config.xml" % USER_CONFIG_DIR

# Project path: the value of Helper.get_current_file_abs_path() cut off right
# after the first occurrence of PROJECT_NAME.
# NOTE(review): str.find returns -1 when PROJECT_NAME is absent, which would
# silently yield a wrong prefix - confirm the path always contains it.
PROJECT_BASE_PATH = Helper.get_current_file_abs_path(
)[:Helper.get_current_file_abs_path().find(PROJECT_NAME) + len(PROJECT_NAME)]
SOURCE_PATH = Helper.get_current_file_abs_path() + "/source"  # Resource files
# Paths of the individual resource files
SOURCE_CONFIG_TEMPLATE_XML_PATH = "%s/config.template.xml" % SOURCE_PATH
DEFAULT_BACKGROUND_PATH = os.path.join(SOURCE_PATH, "JayChou.jpg")  # Default background image
DEFAULT_FONT_PATH = os.path.join(SOURCE_PATH, "simsun.ttc")  # Font file
STOPWORDS_PATH = os.path.join(SOURCE_PATH, "stopwords.txt")  # Stopwords file
PROVINCE_CITIES_JSON_PATH = os.path.join(SOURCE_PATH,
                                         "province_cities.json")  # Province/city file
# echarts目前支持的city json 文件
ECHARTS_SUPPORT_CITIES_JSON_PATH = os.path.join(SOURCE_PATH,