def __download_pic(self, pic, path):
    """
    Download a single picture.

    :param pic: picture data
    :param path: destination directory
    :return: bool download result, str download path
    """
    path = os.path.join(path, self.__make_photo_name(pic))
    if not os.path.exists(path):
        url = WeiboApi.make_large_url(pic)
        response = WeiboApi.get(url, timeout=60)
        with open(path, 'wb') as fp:
            fp.write(response.content)
        return True, path
    return False, path
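# A minimal, self-contained sketch of the skip-if-exists download above, using
# requests directly instead of the WeiboApi.get wrapper. Streaming to disk in
# chunks is a small hedge against large files; the function name and chunk size
# are assumptions for illustration, not part of the original code.
import os
import requests


def download_if_missing(url, path, timeout=60):
    """Return (downloaded, path); skip the request when the file already exists."""
    if os.path.exists(path):
        return False, path
    response = requests.get(url, timeout=timeout, stream=True)
    response.raise_for_status()
    with open(path, 'wb') as fp:
        for chunk in response.iter_content(chunk_size=8192):
            fp.write(chunk)
    return True, path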
def __init__(self, target_url, MAX_PAGE):
    self.logger = logging.getLogger(__name__)
    self.MAX_PAGE = MAX_PAGE

    # Target data
    self.logger.info(Fore.GREEN + target_url)
    self.target = WeiboApi.fetch_user_info(target_url)
    self.page_id, self.title_value = self.target['page_id'], self.target['title_value']

    # Checkpoint data
    self.cpdir = './checkpoints/'
    if not os.path.isdir(self.cpdir):
        os.makedirs(self.cpdir)
    self.filename = self.page_id + '_resume.pkl.gz'
    self.sleeptime = settings.SLEEPTIME
    if os.path.isfile(os.path.join(self.cpdir, self.filename)):
        with gzip.open(os.path.join(self.cpdir, self.filename), 'rb') as f:
            self.since_id, self.page = pickle.load(f)
        self.logger.info(Fore.GREEN + 'Resuming from {0} with page={1}'.format(self.filename, self.page))
    else:
        self.since_id = ''
        self.page = 1  # Page numbering starts from 1
        self.logger.info(Fore.GREEN + 'Starting from the beginning with page={0}'.format(self.page))

    self.root = self.__init_folder()
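# A minimal sketch of the gzip + pickle checkpoint pattern used above, split
# into save/load halves so each can be tested in isolation. The (since_id, page)
# payload mirrors the constructor; the function names are hypothetical.
import gzip
import os
import pickle


def save_checkpoint(path, since_id, page):
    """Persist the crawl position so a later run can resume from it."""
    with gzip.open(path, 'wb') as f:
        pickle.dump([since_id, page], f)


def load_checkpoint(path, default=('', 1)):
    """Return (since_id, page), falling back to defaults when no checkpoint exists."""
    if os.path.isfile(path):
        with gzip.open(path, 'rb') as f:
            since_id, page = pickle.load(f)
        return since_id, page
    return default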
def __download_album(self, album):
    """
    Download a single album.

    :param album: album data
    :return: None
    """
    # IDs of every photo in the album
    all_photo_ids = WeiboApi.fetch_photo_ids(self.uid, album['album_id'], album['type'])
    self.logger.info(Fore.BLUE + 'Found %d pictures' % len(all_photo_ids))

    # Data for every large picture in the album
    all_large_pics = self.__fetch_large_pics(album, all_photo_ids)
    total = len(all_large_pics)

    # Download all large pictures concurrently
    with concurrent.futures.ThreadPoolExecutor() as executor:
        album_path = self.__make_album_path(album)
        future_to_large = {
            executor.submit(self.__download_pic, large, album_path): large
            for large in all_large_pics
        }
        for i, future in enumerate(concurrent.futures.as_completed(future_to_large)):
            large = future_to_large[future]
            count_msg = '%d/%d ' % (i + 1, total)
            try:
                result, path = future.result()
            except Exception as exc:
                err = '%s raised an exception: %s' % (WeiboApi.make_large_url(large), exc)
                self.logger.error(''.join([Fore.RED, count_msg, err]))
            else:
                # Dim the line when the file already existed and was skipped
                style = Style.NORMAL if result else Style.DIM
                self.logger.info(''.join([Fore.GREEN, style, count_msg, path]))
        else:
            self.logger.info(Fore.BLUE + 'Album "%s" finished' % album['caption'])
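# A minimal sketch of the submit/as_completed bookkeeping used in
# __download_album, with a dummy task standing in for __download_pic. The
# future-to-input mapping is what lets the error path name the item that
# failed; the items and the task body are made up for illustration.
import concurrent.futures


def download_all(items):
    def task(item):  # stand-in for __download_pic
        if item is None:
            raise ValueError('bad item')
        return True, str(item)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_item = {executor.submit(task, item): item for item in items}
        for i, future in enumerate(concurrent.futures.as_completed(future_to_item)):
            item = future_to_item[future]
            try:
                result, path = future.result()
            except Exception as exc:
                print('%d/%d %r raised: %s' % (i + 1, len(items), item, exc))
            else:
                print('%d/%d %s' % (i + 1, len(items), path))


download_all([1, 2, None, 4])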
def __init__(self, target_url):
    """
    Initialize.

    :param target_url: URL of the target Weibo profile page
    """
    self.logger = logging.getLogger(self.__class__.__name__)

    # Target data
    self.logger.info(Fore.BLUE + target_url)
    self.target = WeiboApi.fetch_user_info(target_url)
    self.uid, self.name = self.target['oid'], self.target['onick']

    # Local preprocessing
    self.root = self.__init_folder()
def start(self):
    """
    Download every album in turn.

    :return: None
    """
    self.logger.info(Fore.BLUE + Style.BRIGHT + 'Start downloading the Weibo albums of "%s"' % self.name)

    # Fetch the album list page by page
    page_size, album_count = 20, 0
    for page in itertools.count(1):
        total, album_list = WeiboApi.fetch_album_list(self.uid, page, page_size)
        if not album_list:
            break
        for album in album_list:
            album_count += 1
            msg = 'Start downloading Weibo album %d / %d: "%s"' % (album_count, total, album['caption'])
            self.logger.info(Fore.BLUE + msg)
            self.__download_album(album)
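# A minimal sketch of the open-ended pagination loop in start():
# itertools.count(1) replaces a manual counter, and the loop stops at the
# first empty page. The fake fetch_page below stands in for
# WeiboApi.fetch_album_list and is an assumption for illustration.
import itertools


def fetch_page(page, page_size=20):
    data = list(range(45))  # pretend the server holds 45 albums
    start = (page - 1) * page_size
    return data[start:start + page_size]


for page in itertools.count(1):
    batch = fetch_page(page)
    if not batch:
        break
    print('page %d: %d items' % (page, len(batch)))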
def __init__(self, target_url):
    """
    Initialize.

    :param target_url: URL of the target Weibo profile page
    """
    self.logger = logging.getLogger(self.__class__.__name__)

    # Target data
    self.logger.info(Fore.BLUE + target_url)
    while True:
        try:
            self.target = WeiboApi.fetch_user_info(target_url)
            self.uid, self.name = self.target['oid'], self.target['onick']
        except KeyError as err:
            wait = 60
            message = ('Failed to fetch %s, probably because the cookies have expired '
                       'or the API has changed; retrying in %ds.' % (err, wait))
            self.logger.error(Fore.RED + message)
            sleep(wait)
        else:
            break

    # Local preprocessing
    self.root = self.__init_folder()
def start(self):
    """Download picture thumbnails and large versions."""
    flag = True
    while flag:
        self.logger.info(Fore.YELLOW + 'since_id={0}, page={1}'.format(self.since_id, self.page))
        try:
            # Parsing fails occasionally
            r = WeiboApi.chaohua_img_info(self.since_id, self.page_id, self.page)
            b = BeautifulSoup(r.json()['data'], features='lxml')
        except Exception as e:
            self.logger.error(Fore.RED + 'since_id={0}, page={1}, err={2}'.format(self.since_id, self.page, e))
            with gzip.open(os.path.join(self.cpdir, self.filename), 'wb') as f:
                pickle.dump(obj=[self.since_id, self.page], file=f)  # Save the current position
            sys.exit('Bye! Use `python main.py` to continue downloading.')

        for i in b.findAll('img'):
            uri = i.attrs['src']
            if 'http' not in uri:
                uri = 'https:' + uri  # Some URIs are protocol-relative; prepend https
            try:
                uri_pair = {
                    'thumbnail': uri,
                    'large': uri.replace('thumb300', 'large')
                }
                for t in uri_pair:
                    time.sleep(self.sleeptime)
                    uri = uri_pair[t]
                    filename = os.path.join(self.root, t + '/' + uri[uri.rfind('/') + 1:])
                    if os.path.isfile(filename):
                        self.logger.info(Fore.BLUE + Style.DIM + filename)
                        continue
                    urllib.request.urlretrieve(uri, filename)
                    self.logger.info(Fore.BLUE + filename)
            except KeyboardInterrupt:
                self.logger.error(Fore.RED + 'since_id={0}, page={1}, err=KeyboardInterrupt'.format(self.since_id, self.page))
                with gzip.open(os.path.join(self.cpdir, self.filename), 'wb') as f:
                    pickle.dump(obj=[self.since_id, self.page], file=f)  # Save the current position
                sys.exit('Bye! Use `python main.py` to continue downloading.')
            except Exception as e:
                self.logger.error(Fore.RED + 'Img={0}, err={1}'.format(uri, e))

        # Simulate turning the page
        flag = False
        for div in reversed(b.findAll('div')):
            # The since_id inside action-data drives pagination
            if 'action-data' in div.attrs:
                tmp = div.attrs['action-data']
                # split('=', 1) keeps values that themselves contain '='
                dic = dict(j.split('=', 1) for j in tmp.split('&'))
                if 'since_id' in dic and self.page < self.MAX_PAGE:
                    flag = True
                    self.since_id = urllib.parse.unquote(dic['since_id'])
                    self.page += 1
                break

    with gzip.open(os.path.join(self.cpdir, self.filename), 'wb') as f:
        pickle.dump(obj=[self.since_id, self.page], file=f)  # Save the final position
    self.logger.info(Fore.BLUE + 'Crawler finished')
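# A minimal sketch of extracting since_id from an action-data attribute. The
# hand-rolled split('&') / split('=') in start() works, but
# urllib.parse.parse_qs handles percent-decoding and repeated keys in one
# step; the sample attribute value below is made up for illustration.
import urllib.parse

action_data = 'type=feed&since_id=4567%3D8910&page=3'
params = urllib.parse.parse_qs(action_data)
since_id = params['since_id'][0] if 'since_id' in params else ''
print(since_id)  # -> 4567=8910 (parse_qs already unquotes the value)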