def download_singer_hot_songs_by_name_with_multi_threading(
        self, singer_name, threads=20):
    '''
    Download all hot songs of a singer (looked up by name) using worker threads.

    The songs are split into contiguous, near-equal chunks, one chunk per
    worker thread. The number of workers never exceeds the number of songs,
    so no thread is started without work, and a non-positive ``threads``
    value falls back to a single worker.

    :param singer_name: singer name, also used as a directory name under
        Constants.SINGER_SAVE_DIR
    :param threads: maximum number of worker threads
    :return: None
    '''
    start_time = time.time()
    # Directory the hot songs are saved to.
    save_path = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                             Constants.HOT_SONGS_SAVE_NAME)
    # Resolve the singer id from the name, then build the singer home page url.
    uid = self.get_singer_id_by_name(singer_name)
    singer_url = "http://music.163.com/artist?id=%d" % uid
    # Ids of all hot songs of the singer.
    hot_songs_ids = Helper.get_singer_hot_songs_ids(singer_url)
    # Download urls and "<song name + singer name>" strings for those ids.
    urls_list = self.get_download_urls_by_ids(hot_songs_ids)
    songs_name_and_singer_name_str_list = (
        self.get_songs_name_and_singer_name_str_list_by_ids_list(
            hot_songs_ids))
    # Total number of hot songs to download.
    total = len(urls_list)
    Helper.mkdir(save_path)

    # Clamp the worker count to [1, total]; it is 0 only when there is
    # nothing to download.
    worker_count = min(max(threads, 1), total)

    self.logger.info("%s has total %d hot songs!" % (singer_name, total))
    self.logger.info(
        "(multi threads,thread_num = %d)Now start download hot musics of %s(save path is:%s):"
        % (worker_count, singer_name, save_path))

    # Reset the shared counter before the workers start.
    # NOTE(review): presumably incremented by _download_list_songs_to_file
    # to report progress - confirm against that method.
    self.no_counter = 0

    threads_list = []
    if worker_count:
        # The first `remainder` workers take one extra song so that chunk
        # sizes differ by at most one.
        base_size, remainder = divmod(total, worker_count)
        begin_index = 0
        for i in range(worker_count):
            end_index = begin_index + base_size + (1 if i < remainder else 0)
            urls = urls_list[begin_index:end_index]
            save_list = [
                os.path.join(save_path, "%s.mp3" % name)
                for name in
                songs_name_and_singer_name_str_list[begin_index:end_index]
            ]
            threads_list.append(
                Thread(target=self._download_list_songs_to_file,
                       args=(urls, save_list, total)))
            begin_index = end_index

    for thread in threads_list:
        thread.start()
    # Wait for every worker before reporting the elapsed time.
    for thread in threads_list:
        thread.join()

    end_time = time.time()
    self.logger.info("Download %s's %d hot songs to %s succeed!"
                     "Costs %.2f seconds!" %
                     (singer_name, total, save_path,
                      (end_time - start_time)))
def download_play_list_songs(self, play_list_id, limit=1000):
    '''
    Download every song of a play list one after another (single thread).

    A failed download is logged and skipped; the remaining songs are still
    attempted.

    :param play_list_id: play list id
    :param limit: forwarded to get_play_list_detail (documented by the
        original author as the maximum number of songs to download)
    :return: None
    '''
    started_at = time.time()

    # Fetch the play list detail and pull out the song ids and the name.
    detail = self.get_play_list_detail(play_list_id, limit).json()
    songs_id_list = [track['id'] for track in detail['playlist']["trackIds"]]
    play_list_name = detail['playlist']['name']

    # Songs of one play list are stored in a directory named after it.
    save_path = os.path.join(Constants.PLAY_LIST_SAVE_DIR, play_list_name)
    Helper.mkdir(save_path)

    # "<song name + singer name>" strings, then the download urls.
    display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
        songs_id_list)
    urls_list = self.get_download_urls_by_ids(songs_id_list)
    total = len(urls_list)

    self.logger.info("play list %s has total %d songs!" %
                     (play_list_name, total))
    self.logger.info(
        "(single thread)Now start download musics of %s(save path is:%s):"
        % (play_list_name, save_path))

    for position, url in enumerate(urls_list):
        index = position + 1  # 1-based number used in the progress logs
        try:
            target_file = os.path.join(
                save_path, "%s.mp3" % display_names[position])
            Helper.download_network_resource(url, target_file)
            self.logger.info("Successfully download %d/%d(%s)!" %
                             (index, total, display_names[position]))
        except Exception:
            # Best effort: report the failure and move on to the next song.
            self.logger.info("Fail download %d/%d(%s)!" %
                             (index, total, display_names[position]))

    elapsed = time.time() - started_at
    self.logger.info(
        "It costs %.2f seconds to download play list %s(id=%s)'s %d songs to %s "
        "using single thread!" %
        (elapsed, play_list_name, play_list_id, total, save_path))
def download_singer_hot_songs_by_name(self, singer_name):
    '''
    Download all hot songs of a singer (looked up by name) in a single thread.

    A failed download is logged and skipped; the remaining songs are still
    attempted.

    :param singer_name: singer name, also used as a directory name under
        Constants.SINGER_SAVE_DIR
    :return: None
    '''
    started_at = time.time()

    # Directory the hot songs are saved to.
    save_path = os.path.join(Constants.SINGER_SAVE_DIR, singer_name,
                             Constants.HOT_SONGS_SAVE_NAME)

    # Name -> singer id -> singer home page -> ids of the hot songs.
    uid = self.get_singer_id_by_name(singer_name)
    singer_url = "http://music.163.com/artist?id=%d" % uid
    hot_songs_ids = Helper.get_singer_hot_songs_ids(singer_url)

    # Download urls first, then the "<song name + singer name>" strings.
    urls_list = self.get_download_urls_by_ids(hot_songs_ids)
    display_names = self.get_songs_name_and_singer_name_str_list_by_ids_list(
        hot_songs_ids)
    total = len(urls_list)

    Helper.mkdir(save_path)
    self.logger.info("%s has total %d hot songs!" % (singer_name, total))
    self.logger.info(
        "(single thread)Now start download hot musics of %s(save path is:%s):"
        % (singer_name, save_path))

    for position, url in enumerate(urls_list):
        index = position + 1  # 1-based number used in the progress logs
        try:
            target_file = os.path.join(
                save_path, "%s.mp3" % display_names[position])
            Helper.download_network_resource(url, target_file)
            self.logger.info("Successfully download %d/%d(%s)!" %
                             (index, total, display_names[position]))
        except Exception:
            # Best effort: report the failure and move on to the next song.
            self.logger.info("Fail download %d/%d(%s)!" %
                             (index, total, display_names[position]))

    elapsed = time.time() - started_at
    self.logger.info(
        "It costs %.2f seconds to download singer %s's %d hot songs to %s "
        "using single thread!" %
        (elapsed, singer_name, total, save_path))
def core_visual_analyse(self):
    '''
    Core visualisation routine for the comments and user info of this song
    (the original author notes the charts are drawn with pyecharts).

    Charts are written, in this order, as HTML files under the song's
    plots directory:
    1. comment time distribution, per month and per day (bar)
    2. liked-count distribution (bar)
    3. comment keyword distribution after stop-word removal (bar)
    4. user location distribution (map)
    5. user location distribution (bar)
    6. user event-count distribution (bar)
    7. user follow-count distribution (bar)
    8. user fan-count distribution (bar)
    9. user description keyword distribution after stop-word removal (bar)
    10. user age distribution (bar)
    11. total listened songs distribution, bucketed (bar)
    '''
    plots_dir = os.path.join(self.song_path, Constants.PLOTS_SAVE_NAME)
    Helper.mkdir(plots_dir)
    # Load all comments and all user-info records.
    comments = Helper.load_file_format_json(self.comments_file_path)
    users = Helper.load_file_format_json(self.users_info_file_path)

    def plot_file(file_name):
        # Full path of one chart file inside the plots directory.
        return os.path.join(plots_dir, file_name)

    def titled(template):
        # Fill the song name into a chart title template.
        return template.format(song_name=self.song_name)

    def known_ints(records, key):
        # Integer values of `key`, skipping records marked as unknown.
        return [int(record[key]) for record in records
                if record[key] != Constants.UNKNOWN_TOKEN]

    # 1. Comment time distribution, per month and per day.
    stamps = [comment[Constants.CREATE_TIME_STAMP_KEY] for comment in comments]
    month_labels = []  # "year-month" strings
    day_labels = []    # "year-month-day" strings
    for stamp in stamps:
        # Stored stamps are scaled by 0.001 before conversion, as in the
        # original code (its comment says: divide by 1000 to get the real
        # timestamp).
        month_text = Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m")
        day_text = Helper.from_timestamp_to_date(stamp * 0.001, format="%Y-%m-%d")
        month_labels.append(month_text)
        day_labels.append(day_text)
    self.save_sorted_bar_plot(
        datas=month_labels,
        label="年-月",
        title=titled("歌曲<{song_name}>评论时间(年-月)数量分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_COMMENTS_YEAR_MONTH_BAR_HTML)
    )
    self.save_sorted_bar_plot(
        datas=day_labels,
        label="年-月-日",
        title=titled("歌曲<{song_name}>评论时间(年-月-日)数量分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_COMMENTS_YEAR_MONTH_DAY_BAR_HTML)
    )

    # 2. Liked-count distribution.
    self.save_sorted_bar_plot(
        datas=known_ints(comments, Constants.LIKED_COUNT_KEY),
        label="点赞数量",
        title=titled("歌曲<{song_name}>评论点赞数量分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_LIKED_COUNT_BAR_HTML)
    )

    # 3. Comment keyword distribution after stop-word removal.
    comments_text = "".join(
        [comment[Constants.COMMENT_CONTENT_KEY] for comment in comments])
    comment_words = Helper.cut_text(comments_text)
    stopwords = Helper.load_stopwords()

    def meaningful(words):
        # Drop stop words and words shorter than two characters.
        return [word for word in words
                if word not in stopwords and len(word) > 1]

    self.save_sorted_bar_plot(
        datas=meaningful(comment_words),
        label="关键词",
        title=titled("歌曲<{song_name}>评论关键词数量分布(已去除停用词)"),
        key_index=1,
        save_path=plot_file(Constants.ECHARTS_COMMENTS_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 4. User location distribution on a map.
    users_location = [user[Constants.LOCATION_KEY] for user in users]
    users_city = []  # first supported city found in each location, if any
    all_support_cities = Helper.load_echarts_support_cities()
    no_match = object()
    for location in users_location:
        matched = next(
            (city for city in all_support_cities if city in location),
            no_match)
        if matched is not no_match:
            users_city.append(matched)
    users_city_data = list(Counter(users_city).items())
    users_city_geo = Geo(
        titled("歌曲<{song_name}>评论用户所在地区分布"),
        title_color="#fff",
        title_pos="left",
        width=1200,
        height=600,
        background_color='#404a59')
    attr, value = users_city_geo.cast(users_city_data)
    users_city_geo.add("", attr, value, visual_range=[0, 200],
                       visual_text_color="#fff", symbol_size=15,
                       is_visualmap=True)
    users_city_save_path = plot_file(Constants.ECHARTS_USERS_CITY_GEO_HTML)
    Helper.check_file_exits_and_overwrite(users_city_save_path)
    users_city_geo.render(users_city_save_path)

    # 5. User location distribution as a bar chart.
    self.save_sorted_bar_plot(
        datas=users_location,
        label="用户所在地区",
        title=titled("歌曲<{song_name}>评论用户所在地区分布"),
        key_index=1,
        save_path=plot_file(Constants.ECHARTS_USERS_LOCATION_BAR_HTML),
        reverse=True
    )

    # 6. User event-count distribution.
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.EVENT_COUNT_KEY),
        label="用户动态总数",
        title=titled("歌曲<{song_name}>评论用户动态总数分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_EVENTS_COUNT_BAR_HTML)
    )

    # 7. User follow-count distribution.
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.FOLLOW_COUNT_KEY),
        label="用户关注人数",
        title=titled("歌曲<{song_name}>评论用户关注人数分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_FOLLOW_COUNT_BAR_HTML)
    )

    # 8. User fan-count distribution.
    self.save_sorted_bar_plot(
        datas=known_ints(users, Constants.FAN_COUNT_KEY),
        label="用户粉丝人数",
        title=titled("歌曲<{song_name}>评论用户粉丝人数分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_FAN_COUNT_BAR_HTML)
    )

    # 9. User description keyword distribution after stop-word removal.
    description_text = "".join(
        [user[Constants.USER_DESCRIPTION_KEY] for user in users])
    description_words = Helper.cut_text(description_text)
    self.save_sorted_bar_plot(
        datas=meaningful(description_words),
        label="用户简介关键词",
        title=titled("歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)"),
        key_index=1,
        save_path=plot_file(Constants.ECHARTS_USER_DESCRIPTION_KEYWORDS_BAR_HTML),
        reverse=True
    )

    # 10. User age distribution; negative ages are discarded.
    ages = [age for age in known_ints(users, Constants.USER_AGE_KEY)
            if age >= 0]
    self.save_sorted_bar_plot(
        datas=ages,
        label="年龄",
        title=titled("歌曲<{song_name}>评论用户年龄分布"),
        key_index=0,
        save_path=plot_file(Constants.ECHARTS_USER_AGE_BAR_HTML)
    )

    # 11. Total listened songs, bucketed because the range is very wide.
    listening_counts = known_ints(users, Constants.LISTENING_SONGS_NUM_KEY)
    listening_songs_dict = {'0-100': 0, '100-1000': 0,
                            '1000-10000': 0, '>10000': 0}
    # (exclusive upper bound, bucket label), checked in ascending order.
    bounded_buckets = ((100, '0-100'), (1000, '100-1000'),
                       (10000, '1000-10000'))
    for count in listening_counts:
        for upper_bound, bucket in bounded_buckets:
            if count < upper_bound:
                listening_songs_dict[bucket] += 1
                break
        else:
            listening_songs_dict['>10000'] += 1
    self.save_sorted_bar_plot(
        datas=listening_songs_dict,
        label="听歌总数",
        title=titled("歌曲<{song_name}>评论用户听歌总数分布"),
        key_index=1,
        save_path=plot_file(Constants.ECHARTS_LISTENING_SONGS_NUM_BAR_HTML),
        reverse=True
    )
def __init__(self, song_name, singer_name, song_id=None, singer_id=None):
    '''
    Set up urls, save directories and shared state for one song of one singer.

    When either id is missing, login info is read from the user's config
    file and a NetCloudLogin session is used to look the missing id(s) up
    by name. If that lookup raises, the error is logged and the constructor
    returns early: only ``self.logger`` is set on the instance in that case.

    :param song_name: song name, also used as a directory name
    :param singer_name: singer name, also used as a directory name
    :param song_id: song id; looked up from song_name when None
    :param singer_id: singer id; looked up from singer_name when None
    '''
    self.logger = Helper.get_logger()

    ids_incomplete = song_id is None or singer_id is None
    if ids_incomplete:
        # Login info comes from the config file on the user's machine.
        config_dict = Helper._parse_config_xml()
        login_args = [
            config_dict[field]
            for field in ('phone', 'password', 'email', 'rememberLogin')
        ]
        try:
            netcloud_login = NetCloudLogin(*login_args)
            if song_id is None:
                song_id = netcloud_login.get_song_id_by_name(song_name)
                self.logger.info(
                    "Login to get %s's song_id(=%s) succeed!" %
                    (song_name, song_id))
            if singer_id is None:
                singer_id = netcloud_login.get_singer_id_by_name(
                    singer_name)
                self.logger.info(
                    "Login to get %s's singer_id(=%s) succeed!" %
                    (singer_name, singer_id))
        except Exception as e:
            self.logger.error("NetCloud login failed:%s" % e)
            self.logger.error(
                "Please fullfill singer_id and song_id parameter"
                " or check your login info in %s!" %
                Constants.USER_CONFIG_FILE_PATH)
            # Nothing below can be built without both ids.
            return

    self.song_name = song_name
    self.song_id = song_id
    self.singer_name = singer_name
    self.singer_id = singer_id

    self.comments_url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token=".format(
        song_id=song_id)
    self.singer_url = 'http://music.163.com/artist?id={singer_id}'.format(
        singer_id=singer_id)

    # Root directory for downloaded files (songs, comments, ...).
    self.singer_root_dir = Constants.SINGER_SAVE_DIR
    Helper.mkdir(self.singer_root_dir)
    # Everything about one singer lives in one directory ...
    self.singer_path = os.path.join(self.singer_root_dir, self.singer_name)
    Helper.mkdir(self.singer_path)
    # ... and everything about one song in a sub-directory of it.
    self.song_path = os.path.join(self.singer_path, self.song_name)
    Helper.mkdir(self.song_path)

    # File holding all comments of the song.
    self.comments_file_path = os.path.join(
        self.song_path, self.song_name + "_all_comments.json")
    # File holding the user info records.
    self.users_info_file_path = os.path.join(self.song_path,
                                             Constants.USER_INFO_FILENAME)
    # File holding the hot comments of all of the singer's songs.
    self.singer_all_hot_comments_file_path = os.path.join(
        self.singer_path, Constants.SINGER_ALL_HOT_COMMENTS_FILENAME)

    # Shared counter.
    self.no_counter = 0
    # Lock for multi-threaded use; per the original author it guards file
    # writes and counter updates.
    self.lock = Lock()
def download_play_list_songs_by_multi_threading(self,
                                                play_list_id,
                                                limit=1000,
                                                threads=20):
    '''
    Download every song of a play list using worker threads.

    The songs are split into contiguous, near-equal chunks, one chunk per
    worker thread. The number of workers never exceeds the number of songs,
    so no thread is started without work, and a non-positive ``threads``
    value falls back to a single worker.

    :param play_list_id: play list id
    :param limit: forwarded to get_play_list_detail (documented by the
        original author as the maximum number of songs to download)
    :param threads: maximum number of worker threads
    :return: None
    '''
    start_time = time.time()
    # Fetch the play list detail and collect the song ids.
    res = self.get_play_list_detail(play_list_id, limit).json()
    songs_id_list = [content['id'] for content in res['playlist']["trackIds"]]
    play_list_name = res['playlist']['name']
    # Songs of one play list are stored in a directory named after it.
    save_path = os.path.join(Constants.PLAY_LIST_SAVE_DIR, play_list_name)
    Helper.mkdir(save_path)
    # "<song name + singer name>" strings, then the download urls.
    songs_name_and_singer_name_str_list = (
        self.get_songs_name_and_singer_name_str_list_by_ids_list(
            songs_id_list))
    urls_list = self.get_download_urls_by_ids(songs_id_list)
    # Total number of songs to download.
    total = len(urls_list)

    # Clamp the worker count to [1, total]; it is 0 only when there is
    # nothing to download.
    worker_count = min(max(threads, 1), total)

    self.logger.info("play list %s has total %d songs!" %
                     (play_list_name, total))
    self.logger.info(
        "(multi threads,thread_num = %d)Now start download musics of %s(save path is:%s):"
        % (worker_count, play_list_name, save_path))

    # Reset the shared counter before the workers start.
    # NOTE(review): presumably incremented by _download_list_songs_to_file
    # to report progress - confirm against that method.
    self.no_counter = 0

    threads_list = []
    if worker_count:
        # The first `remainder` workers take one extra song so that chunk
        # sizes differ by at most one.
        base_size, remainder = divmod(total, worker_count)
        begin_index = 0
        for i in range(worker_count):
            end_index = begin_index + base_size + (1 if i < remainder else 0)
            urls = urls_list[begin_index:end_index]
            save_list = [
                os.path.join(
                    save_path,
                    "%s.mp3" % songs_name_and_singer_name_str_list[index])
                for index in range(begin_index, end_index)
            ]
            threads_list.append(
                Thread(target=self._download_list_songs_to_file,
                       args=(urls, save_list, total)))
            begin_index = end_index

    for thread in threads_list:
        thread.start()
    # Wait for every worker before reporting the elapsed time.
    for thread in threads_list:
        thread.join()

    end_time = time.time()
    self.logger.info(
        "Download play list %s(id=%s)'s all %d songs to %s succeed!"
        "Costs %.2f seconds!" %
        (play_list_name, play_list_id, total, save_path,
         (end_time - start_time)))
import getpass import os import platform import shutil from netcloud.util import Helper PROJECT_NAME = "NetCloud" UNKNOWN_TOKEN = "unknown" # 标记未知的标识符 MUSIC163_BASE_URL = "http://music.163.com" # 文件默认下载路径 DEFAULT_SAVE_ROOT_DIR = Helper._get_default_save_root_dir() # 在用户机器上配置文件路径 USER_CONFIG_DIR = "%s/config" % DEFAULT_SAVE_ROOT_DIR Helper.mkdir(USER_CONFIG_DIR) USER_CONFIG_FILE_PATH = "%s/config.xml" % USER_CONFIG_DIR # 项目路径 PROJECT_BASE_PATH = Helper.get_current_file_abs_path( )[:Helper.get_current_file_abs_path().find(PROJECT_NAME) + len(PROJECT_NAME)] SOURCE_PATH = Helper.get_current_file_abs_path() + "/source" # 资源文件 # 各种资源文件路径 SOURCE_CONFIG_TEMPLATE_XML_PATH = "%s/config.template.xml" % SOURCE_PATH DEFAULT_BACKGROUND_PATH = os.path.join(SOURCE_PATH, "JayChou.jpg") # 默认背景图 DEFAULT_FONT_PATH = os.path.join(SOURCE_PATH, "simsun.ttc") # 字体文件 STOPWORDS_PATH = os.path.join(SOURCE_PATH, "stopwords.txt") # 停用词文件 PROVINCE_CITIES_JSON_PATH = os.path.join(SOURCE_PATH, "province_cities.json") # 省市文件 # echarts目前支持的city json 文件 ECHARTS_SUPPORT_CITIES_JSON_PATH = os.path.join(SOURCE_PATH,