Example #1
    def download_image(self, scrapy):
        """下载图片保存到本地。

         :param scrapy: 记录任务信息的数据体
         :return Scrapy: 记录任务信息的数据体
         """
        try:
            name = re.findall(r'(?<=/)\w*?\.(?:jpg|gif|png|bmp)', scrapy.url, re.IGNORECASE)[0]
        except IndexError:
            name = uuid4().hex + '.jpg'

        path = op.join(self.directory, convert_to_safe_filename(scrapy.title))
        filename = op.join(path, f'[{scrapy.index + 1 or 0:02d}]{name}')
        if (not self.overwrite) and op.isfile(filename):
            return scrapy

        url = scrapy.url
        if self.thumbnail:
            if url.lower().endswith(('jpg', 'png', 'bmp')):
                url = f'{scrapy.url}@1280w_1l_2o_100sh.{url[-3:]}'
        resp = session_request(url)

        mkdirs_if_not_exist(path)
        with open(filename, 'wb') as f:
            for chunk in resp.iter_content(8192):
                f.write(chunk)
        return scrapy
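
The helpers convert_to_safe_filename, mkdirs_if_not_exist, and session_request are defined elsewhere in the project and are not shown here. A minimal sketch of what the two filesystem helpers might look like, assuming convert_to_safe_filename only needs to strip characters that are illegal in file names:

import os
import re


def convert_to_safe_filename(name):
    # Replace characters that most filesystems reject in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or 'untitled'


def mkdirs_if_not_exist(path):
    # Create the directory tree only if it does not already exist.
    os.makedirs(path, exist_ok=True)
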
Example #2
    def download_image(self, scrapy):
        try:
            name = re.findall(r'(?<=/)\w*?\.(?:jpg|png)', scrapy.url, re.IGNORECASE)[0]
        except IndexError:
            name = uuid4().hex + '.jpg'

        path = os.path.join(self.directory, convert_to_safe_filename(scrapy.title))
        filename = os.path.join(path, name)
        if (not self.override) and os.path.isfile(filename):
            return scrapy

        resp = requests.get(scrapy.url, proxies=self.proxies, timeout=TIMEOUT)
        if resp.status_code != 200:
            raise Exception(f'Response status code: {resp.status_code}')

        mkdirs_if_not_exist(path)
        with open(filename, 'wb') as fi:
            fi.write(resp.content)
        return scrapy
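
In both versions the scrapy argument is a small data object that carries at least url, title, and (in Example #1) index attributes. A minimal sketch using a namedtuple, with field names inferred from the attribute accesses above; the real project may attach more task metadata:

from collections import namedtuple

# Hypothetical container for a single download task.
Scrapy = namedtuple('Scrapy', ['index', 'title', 'url'])

task = Scrapy(index=0, title='sample topic',
              url='https://example.com/images/0001.jpg')
# downloader.download_image(task)  # hypothetical call site
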
Example #3
    def __init__(self,
                 user_id=None,
                 username=None,
                 directory=None,
                 max_pages=None,
                 max_topics=None,
                 max_workers=None,
                 retries=None,
                 redownload=None,
                 override=None,
                 proxies=None):
        self.start_time = datetime.now()
        print(f'\n *** {self.start_time.ctime()} ***\n')

        self.max_topics = max_topics or 'all'
        self.max_workers = max_workers or MAX_WORKERS
        self.pool = ThreadPoolExecutor(self.max_workers)
        self.override = override
        self.pages = Queue()
        self.topics = Queue()
        self.images = Queue()
        self.stat = {
            'npages': 0,
            'ntopics': 0,
            'nimages': 0,
            'pages_pass': set(),
            'pages_fail': set(),
            'topics_pass': set(),
            'topics_fail': set(),
            'images_pass': set(),
            'images_fail': set()
        }

        if retries:
            global RETRIES
            RETRIES = retries

        if isinstance(proxies, str):
            try:
                self.proxies = json.loads(proxies)
            except Exception:
                print(f'Invalid proxies: {proxies}')
                sys.exit(1)
        else:
            self.proxies = None

        if redownload:
            self.username = self._reload_records(redownload)
            self.user_id = self._search_id_by_username(self.username)
            self.max_pages = self.pages.qsize()
            self.max_topics = self.topics.qsize()
            self.directory = os.path.abspath(
                os.path.join(directory or '',
                             urlparse(HOST_PAGE).netloc,
                             convert_to_safe_filename(self.username)))
            self.stat.update({
                'npages': self.max_pages,
                'ntopics': self.max_topics,
                'nimages': self.images.qsize(),
            })
            print(
                f'Username: {self.username}\n'
                f'ID: {self.user_id}\n'
                f'Pages to scrapy: {self.max_pages:2d}\n'
                f'Topics to scrapy: {self.max_topics:3d}\n'
                f'Images to scrapy: {self.images.qsize():4d}\n'
                f'Storage directory: {self.directory}',
                end='\n\n')
            return

        self.user_id = user_id or self._search_id_by_username(username)
        self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id))

        try:
            response = requests.get(self.base_url,
                                    proxies=self.proxies,
                                    timeout=TIMEOUT)
        except Exception:
            print(f'Failed to connect to {self.base_url}')
            sys.exit(1)
        soup = BeautifulSoup(markup=response.text, features='html.parser')

        try:
            author = soup.find(name='div', id='body').get('data-name')
            if username and username != author:
                print('Wrong <user id> or <username>!')
                sys.exit(1)
            self.username = author
        except Exception:
            self.username = username or 'anonymous'
        self.directory = os.path.abspath(
            os.path.join(directory or '',
                         urlparse(HOST_PAGE).netloc,
                         convert_to_safe_filename(self.username)))

        try:
            max_page = int(
                soup.find(id='laypage_0').find_all(name='a')[-2].text)
        except Exception:
            max_page = 1
        self.max_pages = min(max_pages or 9999, max_page)

        print(
            f'Username: {self.username}\n'
            f'ID: {self.user_id}\n'
            f'Maximum pages: {max_page}\n'
            f'Pages to scrapy: {self.max_pages}\n'
            f'Topics to scrapy: {"all" if self.max_topics == "all" else (self.max_pages * self.max_topics)}\n'
            f'Storage directory: {self.directory}',
            end='\n\n')
        self._fetch_all()
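
The proxies argument is accepted as a JSON string and parsed with json.loads(), so it has to decode into the scheme-to-URL mapping that requests expects. A usage sketch with assumed proxy addresses and a hypothetical class name:

proxies = '{"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}'
# downloader = ImageDownloader(username='someone', proxies=proxies)  # hypothetical class name
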
Example #4
    def __init__(self, user_id=None, username=None, destination=None, max_pages=None,
                 spec_topics=None, max_topics=None, max_workers=None, retries=None,
                 redownload=None, overwrite=False, thumbnail=False):
        """初始化下载参数。

        :param int user_id: 用户 ID
        :param str username: 用户名
        :param str destination: 图片保存到本地的路径,默认当前路径
        :param int max_pages: 最大爬取页数,默认所有
        :param list spec_topics: 需要下载的特定主题
        :param int max_topics: 最大下载主题数量,默认所有
        :param int max_workers: 线程开启个数,默认 20
        :param int retries: 请求异常时的重试次数,默认 3
        :param str redownload: 下载记录文件,给定此文件则从失败记录进行下载
        :param bool overwrite: 是否覆盖已存在的文件,默认 False
        :param bool thumbnail: 是否下载缩略图,默认 False
        """
        self.start_time = datetime.now()
        print(f' - - - - - -+-+ {self.start_time.ctime()} +-+- - - - - -\n')

        self.spec_topics = spec_topics
        self.max_topics = max_topics or 'all'
        self.max_workers = max_workers or MAX_WORKERS
        self.pool = ThreadPoolExecutor(self.max_workers)
        self.overwrite = overwrite
        self.thumbnail = thumbnail
        self.pages = Queue()
        self.topics = Queue()
        self.images = Queue()
        self.stat = {'npages': 0,
                     'ntopics': 0,
                     'nimages': 0,
                     'pages_pass': set(),
                     'pages_fail': set(),
                     'topics_pass': set(),
                     'topics_fail': set(),
                     'images_pass': set(),
                     'images_fail': set()}

        if retries:
            # Reset the global RETRIES value
            global RETRIES
            RETRIES = retries

        if redownload:
            # Resume downloading from the failed items in the record file
            self.username = self.reload_records(redownload)
            self.user_id = self.search_id_by_username(self.username)
            self.max_pages = self.pages.qsize()
            self.max_topics = self.topics.qsize()
            self.directory = op.abspath(op.join(destination or '',
                                                urlparse(HOST_PAGE).netloc,
                                                convert_to_safe_filename(self.username)))
            self.stat.update({'npages': self.max_pages,
                              'ntopics': self.max_topics,
                              'nimages': self.images.qsize()})
            print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n'
                  f'{"User ID".rjust(17)}: {self.user_id}\n'
                  f'{"Pages to scrapy".rjust(17)}: {self.max_pages:2d}\n'
                  f'{"Topics to scrapy".rjust(17)}: {self.max_topics:3d}\n'
                  f'{"Images to scrapy".rjust(17)}: {self.images.qsize():4d}\n'
                  f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n')
            self.fetch_all(redownload=True)
            return

        self.user_id = user_id or self.search_id_by_username(username)
        self.base_url = urljoin(HOST_PAGE, USER_SUFFIX.format(id=self.user_id))

        try:
            response = session_request(self.base_url)
        except requests.exceptions.ProxyError:
            cprint('Cannot connect to proxy.', 'red')
            sys.exit(1)
        except Exception as e:
            cprint(f'Failed to connect to {self.base_url}, {e}', 'red')
            sys.exit(1)

        soup = BeautifulSoup(markup=response.text, features='html.parser')
        try:
            author = soup.find(name='div', id='body').get('data-name')
            if username and username != author:
                cprint(f'Invalid user id:「{user_id}」or username:「{username}」!', 'red')
                sys.exit(1)
            self.username = author
        except Exception:
            self.username = username or 'anonymous'
        self.directory = op.abspath(op.join(destination or '',
                                            urlparse(HOST_PAGE).netloc,
                                            convert_to_safe_filename(self.username)))
        try:
            max_page = int(soup.find(id='laypage_0').find_all(name='a')[-2].text)
        except Exception:
            max_page = 1
        self.max_pages = min(max_pages or 9999, max_page)

        if self.spec_topics:
            topics = ', '.join(self.spec_topics)
        elif self.max_topics == 'all':
            topics = 'all'
        else:
            topics = self.max_pages * self.max_topics
        print(f'{"Username".rjust(17)}: {colored(self.username, "cyan")}\n'
              f'{"User ID".rjust(17)}: {self.user_id}\n'
              f'{"Maximum pages".rjust(17)}: {max_page}\n'
              f'{"Pages to scrapy".rjust(17)}: {self.max_pages}\n'
              f'{"Topics to scrapy".rjust(17)}: {topics}\n'
              f'Storage directory: {colored(self.directory, attrs=["underline"])}', end='\n\n')

        self.END_PARSING_TOPICS = False
        self.fetch_all()
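
session_request, RETRIES, and TIMEOUT come from elsewhere in the project. A minimal sketch of what a retrying session_request could look like, assuming it is a thin wrapper around a shared requests.Session; the retry and timeout values below are placeholders:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

RETRIES = 3   # reset via the retries argument in __init__ above
TIMEOUT = 30  # assumed per-request timeout in seconds

_session = requests.Session()
_adapter = HTTPAdapter(max_retries=Retry(total=RETRIES, backoff_factor=0.5))
_session.mount('http://', _adapter)
_session.mount('https://', _adapter)


def session_request(url, **kwargs):
    # Issue a GET through the shared session and fail fast on HTTP errors.
    resp = _session.get(url, timeout=TIMEOUT, **kwargs)
    resp.raise_for_status()
    return resp
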