Example #1
    def scrape(self, request, response):
        # Bail out if this scraper cannot handle the document type.
        if not self.is_supported(request=request, response=response):
            return

        base_url = request.url_info.url
        # Prefer an explicit encoding override; otherwise detect one
        # from the response.
        encoding = self._encoding_override \
            or detect_response_encoding(response)
        links = set()

        try:
            # Restore the body file's offset once the links have been read.
            with wpull.util.reset_file_offset(response.body.content_file):
                link_iter = self.read_links(
                    response.body.content_file, encoding=encoding
                )

                for link in link_iter:
                    # Resolve each link against the document URL;
                    # urljoin_safe returns None when the join is unsafe.
                    link = urljoin_safe(
                        base_url,
                        clean_link_soup(link)
                    )

                    if link:
                        links.add(link)

        except (UnicodeError, lxml.etree.LxmlError) as error:
            # A wrong encoding guess or malformed markup should not abort
            # the crawl: log it and return whatever was collected so far.
            _logger.warning(
                _('Failed to read document at ‘{url}’: {error}')
                .format(url=request.url_info.url, error=error)
            )

        return {
            'inline_urls': (),
            'linked_urls': links,
            'encoding': encoding
        }
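A minimal usage sketch for this variant. The scraper, request, and response objects are assumed to come from the surrounding wpull fetch cycle; they are not part of the example above.

    # Hypothetical usage: `scraper` is assumed to be an instance of the
    # class the scrape() method above belongs to; `request` and
    # `response` are assumed to be wpull request/response objects.
    scrape_info = scraper.scrape(request, response)

    if scrape_info:  # scrape() returns None for unsupported documents
        for url in sorted(scrape_info['linked_urls']):
            print('found link:', url)
        print('document encoding:', scrape_info['encoding'])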
Example #2
    def scrape(self, request, response):
        if not self.is_supported(request=request, response=response):
            return

        base_url = request.url_info.url
        content_file = response.body.content_file
        encoding = self._encoding_override \
            or detect_response_encoding(response, is_html=True)
        linked_urls = set()
        inline_urls = set()

        try:
            with wpull.util.reset_file_offset(content_file):
                elements = self.read_links(content_file, encoding=encoding)

                # Split the parsed elements into navigation links and
                # inline resources, collecting document-level metadata
                # (such as robots directives) along the way.
                result_meta_info = self._process_elements(
                    elements, response, base_url, linked_urls, inline_urls
                )

        except (UnicodeError, lxml.etree.LxmlError) as error:
            _logger.warning(
                _('Failed to read document at ‘{url}’: {error}')
                .format(url=request.url_info.url, error=error)
            )
            result_meta_info = {}

        # Honour a robots "nofollow" directive: drop navigation links but
        # keep the inline resources needed to render the page.
        if result_meta_info.get('robots_no_follow'):
            linked_urls.clear()

        return {
            'inline_urls': inline_urls,
            'linked_urls': linked_urls,
            'base_url': base_url,
            'encoding': encoding,
        }
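The `_process_elements` helper is referenced above but not shown. Below is a rough, hypothetical sketch of the contract it appears to satisfy: classify each element's URLs as inline resources or navigation links, and report a robots "nofollow" directive in the returned metadata. The element attributes used here (`tag`, `attrib`) are assumptions modelled on lxml elements, not confirmed against wpull's actual parser API.

    # Hypothetical sketch of the helper used above, not wpull's actual
    # implementation. Assumes lxml-style elements with `tag` and `attrib`.
    def _process_elements(self, elements, response, base_url,
                          linked_urls, inline_urls):
        # `response` is unused in this simplified sketch.
        robots_no_follow = False

        for element in elements:
            if element.tag == 'meta' \
                    and element.attrib.get('name', '').lower() == 'robots':
                # <meta name="robots" content="nofollow"> forbids
                # following the page's navigation links.
                if 'nofollow' in element.attrib.get('content', '').lower():
                    robots_no_follow = True
            elif element.tag in ('img', 'script', 'embed') \
                    and element.attrib.get('src'):
                # Page requisites: fetched to render the page, but not
                # crawled further.
                url = urljoin_safe(base_url, element.attrib['src'])
                if url:
                    inline_urls.add(url)
            elif element.tag == 'a' and element.attrib.get('href'):
                url = urljoin_safe(base_url, element.attrib['href'])
                if url:
                    linked_urls.add(url)

        # The caller above only inspects 'robots_no_follow' on this dict.
        return {'robots_no_follow': robots_no_follow}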
Example #3
    def iter_scrape(self, request, response):
        if not self.is_supported(request=request, response=response):
            return

        base_url = request.url_info.url
        encoding = self._encoding_override \
            or detect_response_encoding(response)

        # Generator variant: yield each link as soon as it is parsed
        # instead of materialising the whole link set in memory.
        with wpull.util.reset_file_offset(response.body.content_file):
            for link in self.read_links(response.body.content_file, encoding):
                link = urljoin_safe(base_url, link, allow_fragments=False)

                if link:
                    yield ScrapedLinkResult(link, True, encoding)
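Because `iter_scrape` is a generator, links can be consumed lazily as a large document is parsed. A minimal sketch; the field names on `ScrapedLinkResult` are inferred from the positional call above, not confirmed from wpull's source.

    # Hypothetical usage: stream links out of a document without
    # building a set first. ScrapedLinkResult's field names (link,
    # inline, encoding) are assumed from the positional call above.
    for result in scraper.iter_scrape(request, response):
        print(result.link, 'inline' if result.inline else 'linked')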