def save_singer_all_hot_comments_to_file(self):
    '''
    Save the hot comments of all of the singer's hot songs to disk.

    The hot-song ids are looked up from ``self.singer_url``. For every id the
    hot comments are fetched, each comment is converted to a JSON string, and
    the resulting lines are written to
    ``self.singer_all_hot_comments_file_path``. When no song ids are returned,
    an error is logged and the method returns without saving any comments.
    '''
    target_path = self.singer_all_hot_comments_file_path
    Helper.check_file_exits_and_overwrite(target_path)

    # ids of all of the singer's hot songs
    hot_song_ids = Helper.get_singer_hot_songs_ids(self.singer_url)
    if len(hot_song_ids) == 0:
        self.logger.error(
            "crawl from %s to get %s all hot songs ids failed!"
            % (self.singer_url, self.singer_name))
        return

    comments_url_template = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_{song_id}/?csrf_token="

    # Fetch every song's hot comments first; conversion happens afterwards.
    collected_comments = []
    for hot_song_id in hot_song_ids:
        comments_url = comments_url_template.format(song_id=hot_song_id)
        collected_comments += self.get_hot_comments(comments_url)

    # One JSON string per comment, in fetch order.
    json_lines = list(
        map(self.extract_comment_info_as_json_str, collected_comments))
    Helper.save_lines_to_file(json_lines, target_path)

    self.logger.info(
        "Write {singer_name}'s {num} hot songs hot comments successfully!".
        format(singer_name=self.singer_name, num=len(hot_song_ids)))
def save_all_users_info_to_file(self):
    '''
    Save the info of all users under one song to disk.

    The info lines come from ``self.get_users_info_list()`` and are written to
    ``self.users_info_file_path`` after the overwrite check on that path.

    :return: None
    '''
    target_path = self.users_info_file_path
    Helper.check_file_exits_and_overwrite(target_path)
    info_lines = self.get_users_info_list()
    Helper.save_lines_to_file(info_lines, target_path)
def save_users_info(self, users_url, total_urls_num):
    '''
    Append the user info for one batch of user urls to the users-info file.

    Per the original notes, this is the worker called by the multi-threaded
    ``save_users_info_to_file_by_multi_threading``.

    :param users_url: list of user urls to process
    :param total_urls_num: total number of urls; passed through unchanged to
        ``self.get_users_info_list``
    '''
    # Fetching is done outside the lock so only the file append is serialized.
    users_info_list = self.get_users_info_list(users_url, total_urls_num)

    # The append must hold the lock. Using ``with`` guarantees the lock is
    # released even when the write raises; the previous acquire()/release()
    # pair left it held forever in that case, blocking every other worker.
    # NOTE(review): assumes self.lock supports the context-manager protocol
    # (e.g. threading.Lock / RLock) -- confirm where it is created.
    with self.lock:
        Helper.save_lines_to_file(users_info_list,
                                  self.users_info_file_path, "a")
def save_all_comments_to_file(self):
    '''
    Sequentially save all comments to disk and report the elapsed time.

    All comments are fetched with ``self.get_all_comments()``, each one is
    converted to a JSON string, and the lines are written to
    ``self.comments_file_path``. The elapsed wall-clock time is printed
    together with ``self.song_name``.

    :return: None
    '''
    Helper.check_file_exits_and_overwrite(self.comments_file_path)

    started_at = time.time()
    fetched_comments = self.get_all_comments()
    # comment dict -> json str, one line per comment
    json_lines = list(
        map(self.extract_comment_info_as_json_str, fetched_comments))
    Helper.save_lines_to_file(json_lines, self.comments_file_path)
    finished_at = time.time()

    print("It costs %.2f seconds to crawler <%s>." %
          (finished_at - started_at, self.song_name))
def save_pages_comments(self, begin_page, end_page, total_comments_num):
    '''
    Fetch a range of comment pages and append their comments to the comments
    file (called by the multi-threading code path).

    For each ``i`` in ``range(begin_page, end_page)`` the page number
    ``i + 1`` is requested. A page that fails is logged and skipped; the
    comments gathered from the other pages are still written.

    :param begin_page: first index of the range (inclusive)
    :param end_page: last index of the range (exclusive)
    :param total_comments_num: total number of comments; only used in the
        progress log message
    '''
    # JSON strings of every comment collected by this call
    comments_info_list = []
    for i in range(begin_page, end_page):
        json_dict = self.get_page_comments_format_dict(
            self.comments_url, i + 1)
        try:
            for item in json_dict[Constants.COMMENTS_KEY]:
                json_str = self.extract_comment_info_as_json_str(item)
                # The shared counter is updated under the lock. ``with``
                # releases it even if the log formatting raises; before, such
                # an error was swallowed by ``except Exception`` below while
                # the lock stayed held, deadlocking the other threads.
                with self.lock:
                    self.no_counter += 1
                    self.logger.info("get %d/%d music comment succeed!" %
                                     (self.no_counter, total_comments_num))
                comments_info_list.append(json_str)
        except KeyError as key_error:
            # The response did not contain the expected key.
            self.logger.error("Fail to get page {page}.".format(page=i + 1))
            self.logger.error(
                "Server parse error:{error}".format(error=key_error))
        except Exception as e:
            # Best effort: log the failure and continue with the next page.
            self.logger.error("Fail to get page {page}.".format(page=i + 1))
            self.logger.error(e)
        else:
            self.logger.info(
                "Successfully to save page {page}.".format(page=i + 1))

    # Append under the lock; the lock is released even if the write fails.
    # NOTE(review): assumes self.lock supports the context-manager protocol
    # (e.g. threading.Lock / RLock) -- confirm where it is created.
    with self.lock:
        Helper.save_lines_to_file(comments_info_list,
                                  self.comments_file_path, "a")
    self.logger.info(
        "Write page {begin_page} to {end_page} successfully!".format(
            begin_page=begin_page, end_page=end_page))