def scrape(self, request, response):
    '''Return the links scraped from the document as a dict.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)
    links = set()

    try:
        with wpull.util.reset_file_offset(response.body.content_file):
            link_iter = self.read_links(
                response.body.content_file, encoding=encoding
            )

            for link in link_iter:
                # Clean the scraped link text and resolve it against
                # the document URL; urljoin_safe returns a falsy value
                # when the join cannot be made.
                link = urljoin_safe(
                    base_url,
                    clean_link_soup(link)
                )

                if link:
                    links.add(link)
    except (UnicodeError, lxml.etree.LxmlError) as error:
        _logger.warning(
            _('Failed to read document at ‘{url}’: {error}')
            .format(url=request.url_info.url, error=error)
        )

    return {
        'inline_urls': (),
        'linked_urls': links,
        'encoding': encoding
    }
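
# Hedged usage sketch (the `scraper`, `request`, and `response` names
# below are assumptions, not part of this module). scrape() returns
# None for unsupported documents; otherwise a dict whose 'linked_urls'
# set holds the cleaned, joined links. Note that this variant never
# reports page requisites: 'inline_urls' is always an empty tuple.
#
#     scrape_info = scraper.scrape(request, response)
#
#     if scrape_info:
#         for url in scrape_info['linked_urls']:
#             queue_for_download(url)  # hypothetical helper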

def scrape(self, request, response):
    '''Return the inline and linked URLs scraped from the document.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    content_file = response.body.content_file
    encoding = self._encoding_override \
        or detect_response_encoding(response, is_html=True)
    linked_urls = set()
    inline_urls = set()

    try:
        with wpull.util.reset_file_offset(content_file):
            elements = self.read_links(content_file, encoding=encoding)

            result_meta_info = self._process_elements(
                elements, response, base_url, linked_urls, inline_urls
            )
    except (UnicodeError, lxml.etree.LxmlError) as error:
        _logger.warning(
            _('Failed to read document at ‘{url}’: {error}')
            .format(url=request.url_info.url, error=error)
        )
        result_meta_info = {}

    if result_meta_info.get('robots_no_follow'):
        # The document asked robots not to follow its links, so
        # report only the inline (page requisite) URLs.
        linked_urls.clear()

    return {
        'inline_urls': inline_urls,
        'linked_urls': linked_urls,
        'base_url': base_url,
        'encoding': encoding,
    }
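
# Hedged usage sketch (names other than scrape() are assumptions):
# this variant separates page requisites from followable links, and
# the 'robots_no_follow' flag returned by _process_elements
# (presumably set from a robots nofollow directive in the document)
# clears 'linked_urls' while leaving 'inline_urls' intact, so
# requisites such as stylesheets and images can still be fetched.
#
#     scrape_info = scraper.scrape(request, response)
#
#     if scrape_info:
#         fetch_requisites(scrape_info['inline_urls'])  # hypothetical
#         follow_links(scrape_info['linked_urls'])      # hypothetical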

def iter_scrape(self, request, response):
    '''Iteratively scrape the document, yielding ScrapedLinkResult.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)

    with wpull.util.reset_file_offset(response.body.content_file):
        for link in self.read_links(response.body.content_file, encoding):
            link = urljoin_safe(base_url, link, allow_fragments=False)

            if link:
                yield ScrapedLinkResult(link, True, encoding)
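
# Hedged usage sketch: unlike scrape(), iter_scrape() yields results
# lazily, so a very large document can be processed without first
# materializing the full link set in memory. Assuming ScrapedLinkResult
# exposes link/inline/encoding fields (it is constructed positionally
# above):
#
#     for result in scraper.iter_scrape(request, response):
#         print(result.link, result.inline, result.encoding)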