def download_image(self, scrapy): """下载图片保存到本地。 :param scrapy: 记录任务信息的数据体 :return Scrapy: 记录任务信息的数据体 """ try: name = re.findall(r'(?<=/)\w*?\.(?:jpg|gif|png|bmp)', scrapy.url, re.IGNORECASE)[0] except IndexError: name = uuid4().hex + '.jpg' path = op.join(self.directory, convert_to_safe_filename(scrapy.title)) filename = op.join(path, f'[{scrapy.index + 1 or 0:02d}]{name}') if (not self.overwrite) and op.isfile(filename): return scrapy url = scrapy.url if self.thumbnail: if url.lower().endswith(('jpg', 'png', 'bmp')): url = f'{scrapy.url}@1280w_1l_2o_100sh.{url[-3:]}' resp = session_request(url) mkdirs_if_not_exist(path) with open(filename, 'wb') as f: for chunk in resp.iter_content(8192): f.write(chunk) return scrapy
def download_image(self, scrapy):
    """Download the image and save it to local disk."""
    try:
        name = re.findall(r'(?<=/)\w*?\.(?:jpg|png)',
                          scrapy.url, re.IGNORECASE)[0]
    except IndexError:
        name = uuid4().hex + '.jpg'
    path = os.path.join(self.directory, convert_to_safe_filename(scrapy.title))
    filename = os.path.join(path, name)
    if (not self.override) and os.path.isfile(filename):
        return scrapy
    resp = requests.get(scrapy.url, proxies=self.proxies, timeout=TIMEOUT)
    if resp.status_code != 200:
        raise Exception(f'Response status code: {resp.status_code}')
    mkdirs_if_not_exist(path)
    with open(filename, 'wb') as fi:
        fi.write(resp.content)
    return scrapy
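# Both download_image variants rely on small helpers defined elsewhere in the
# project. The bodies below are guesses based on the call sites, labeled as
# such; they are not the real implementations.
import os
import re


def mkdirs_if_not_exist(path):
    """Assumed behaviour: create `path` and any missing parent directories."""
    os.makedirs(path, exist_ok=True)


def convert_to_safe_filename(name):
    """Assumed behaviour: replace characters that are illegal in file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)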
def __init__(self, user_id=None, username=None, directory=None,
             max_pages=None, max_topics=None, max_workers=None,
             retries=None, redownload=None, override=None, proxies=None):
    self.start_time = datetime.now()
    print(f'\n *** {self.start_time.ctime()} ***\n')
    self.max_topics = max_topics or 'all'
    self.max_workers = max_workers or MAX_WORKERS
    self.pool = ThreadPoolExecutor(self.max_workers)
    self.override = override
    self.pages = Queue()
    self.topics = Queue()
    self.images = Queue()
    self.stat = {
        'npages': 0,
        'ntopics': 0,
        'nimages': 0,
        'pages_pass': set(),
        'pages_fail': set(),
        'topics_pass': set(),
        'topics_fail': set(),
        'images_pass': set(),
        'images_fail': set()
    }
    if retries:
        global RETRIES
        RETRIES = retries
    if isinstance(proxies, str):
        try:
            self.proxies = json.loads(proxies)
        except Exception:
            print(f'Invalid proxies: {proxies}')
            sys.exit(1)
    else:
        self.proxies = None
    if redownload:
        self.username = self._reload_records(redownload)
        self.user_id = self._search_id_by_username(self.username)
        self.max_pages = self.pages.qsize()
        self.max_topics = self.topics.qsize()
        self.directory = os.path.abspath(
            os.path.join(directory or '', urlparse(HOST_PAGE).netloc,
                         convert_to_safe_filename(self.username)))
        self.stat.update({
            'npages': self.max_pages,
            'ntopics': self.max_topics,
            'nimages': self.images.qsize(),
        })
        print(
            f'Username: {self.username}\n'
            f'ID: {self.user_id}\n'
            f'Pages to scrapy: {self.max_pages:2d}\n'
            f'Topics to scrapy: {self.max_topics:3d}\n'
            f'Images to scrapy: {self.images.qsize():4d}\n'
            f'Storage directory: {self.directory}',
            end='\n\n')
        return

    self.user_id = user_id or self._search_id_by_username(username)
    self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id))
    try:
        response = requests.get(self.base_url, proxies=self.proxies,
                                timeout=TIMEOUT)
    except Exception:
        print(f'Failed to connect to {self.base_url}')
        sys.exit(1)
    soup = BeautifulSoup(markup=response.text, features='html.parser')
    try:
        author = soup.find(name='div', id='body').get('data-name')
        if username and username != author:
            print('Wrong <user id> or <username>!')
            sys.exit(1)
        self.username = author
    except Exception:
        self.username = username or 'anonymous'
    self.directory = os.path.abspath(
        os.path.join(directory or '', urlparse(HOST_PAGE).netloc,
                     convert_to_safe_filename(self.username)))
    try:
        max_page = int(
            soup.find(id='laypage_0').find_all(name='a')[-2].text)
    except Exception:
        max_page = 1
    self.max_pages = min(max_pages or 9999, max_page)
    print(
        f'Username: {self.username}\n'
        f'ID: {self.user_id}\n'
        f'Maximum pages: {max_page}\n'
        f'Pages to scrapy: {self.max_pages}\n'
        f'Topics to scrapy: {"all" if self.max_topics == "all" else (self.max_pages * self.max_topics)}\n'
        f'Storage directory: {self.directory}',
        end='\n\n')
    self._fetch_all()
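# The `proxies` argument of the constructor above is a JSON string that
# json.loads turns into the dict requests expects. An illustrative value
# (the address is a placeholder, not taken from the project):
example_proxies = '{"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}'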
def __init__(self, user_id=None, username=None, destination=None,
             max_pages=None, spec_topics=None, max_topics=None,
             max_workers=None, retries=None, redownload=None,
             overwrite=False, thumbnail=False):
    """Initialize the download parameters.

    :param int user_id: user ID
    :param str username: username
    :param str destination: local path to save images, defaults to the current path
    :param int max_pages: maximum number of pages to scrape, defaults to all
    :param list spec_topics: specific topics to download
    :param int max_topics: maximum number of topics to download, defaults to all
    :param int max_workers: number of worker threads, defaults to 20
    :param int retries: number of retries when a request fails, defaults to 3
    :param str redownload: download record file; if given, download from its failed records
    :param bool overwrite: whether to overwrite existing files, defaults to False
    :param bool thumbnail: whether to download thumbnails, defaults to False
    """
    self.start_time = datetime.now()
    print(f' - - - - - -+-+ {self.start_time.ctime()} +-+- - - - - -\n')
    self.spec_topics = spec_topics
    self.max_topics = max_topics or 'all'
    self.max_workers = max_workers or MAX_WORKERS
    self.pool = ThreadPoolExecutor(self.max_workers)
    self.overwrite = overwrite
    self.thumbnail = thumbnail
    self.pages = Queue()
    self.topics = Queue()
    self.images = Queue()
    self.stat = {'npages': 0, 'ntopics': 0, 'nimages': 0,
                 'pages_pass': set(), 'pages_fail': set(),
                 'topics_pass': set(), 'topics_fail': set(),
                 'images_pass': set(), 'images_fail': set()}
    if retries:  # reset the global RETRIES
        global RETRIES
        RETRIES = retries
    if redownload:  # resume downloading from the failed items in the record file
        self.username = self.reload_records(redownload)
        self.user_id = self.search_id_by_username(self.username)
        self.max_pages = self.pages.qsize()
        self.max_topics = self.topics.qsize()
        self.directory = op.abspath(
            op.join(destination or '', urlparse(HOST_PAGE).netloc,
                    convert_to_safe_filename(self.username)))
        self.stat.update({'npages': self.max_pages,
                          'ntopics': self.max_topics,
                          'nimages': self.images.qsize()})
        print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n'
              f'{"User ID".rjust(17)}: {self.user_id}\n'
              f'{"Pages to scrapy".rjust(17)}: {self.max_pages:2d}\n'
              f'{"Topics to scrapy".rjust(17)}: {self.max_topics:3d}\n'
              f'{"Images to scrapy".rjust(17)}: {self.images.qsize():4d}\n'
              f'Storage directory: {colored(self.directory, attrs=["underline"])}',
              end='\n\n')
        self.fetch_all(redownload=True)
        return

    self.user_id = user_id or self.search_id_by_username(username)
    self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id))
    try:
        response = session_request(self.base_url)
    except requests.exceptions.ProxyError:
        cprint('Cannot connect to proxy.', 'red')
        sys.exit(1)
    except Exception as e:
        cprint(f'Failed to connect to {self.base_url}, {e}', 'red')
        sys.exit(1)
    soup = BeautifulSoup(markup=response.text, features='html.parser')
    try:
        author = soup.find(name='div', id='body').get('data-name')
        if username and username != author:
            cprint(f'Invalid user id:「{user_id}」or username:「{username}」!', 'red')
            sys.exit(1)
        self.username = author
    except Exception:
        self.username = username or 'anonymous'
    self.directory = op.abspath(
        op.join(destination or '', urlparse(HOST_PAGE).netloc,
                convert_to_safe_filename(self.username)))
    try:
        max_page = int(soup.find(id='laypage_0').find_all(name='a')[-2].text)
    except Exception:
        max_page = 1
    self.max_pages = min(max_pages or 9999, max_page)
    if self.spec_topics:
        topics = ', '.join(self.spec_topics)
    elif self.max_topics == 'all':
        topics = 'all'
    else:
        topics = self.max_pages * self.max_topics
    print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n'
          f'{"User ID".rjust(17)}: {self.user_id}\n'
          f'{"Maximum pages".rjust(17)}: {max_page}\n'
          f'{"Pages to scrapy".rjust(17)}: {self.max_pages}\n'
          f'{"Topics to scrapy".rjust(17)}: {topics}\n'
          f'Storage directory: {colored(self.directory, attrs=["underline"])}',
          end='\n\n')
    self.END_PARSING_TOPICS = False
    self.fetch_all()