Пример #1
0
    def scrape(self, request, response, link_type=None):
        """Scrape a CSS document for links.

        Returns a ``ScrapeResult`` with the inline link contexts and the
        detected encoding, or ``None`` when the document is unsupported
        or the requested ``link_type`` is not CSS.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.css:
            return

        document_encoding = self._encoding_override or \
            detect_response_encoding(response)
        contexts = set()

        try:
            with wpull.util.reset_file_offset(response.body):
                iterator = self.iter_processed_links(
                    response.body, document_encoding,
                    request.url_info.url, context=True)

                for url, context in iterator:
                    # '@import' targets are themselves stylesheets;
                    # everything else (url() references) counts as media.
                    kind = LinkType.css if context == 'import' \
                        else LinkType.media
                    contexts.add(
                        LinkContext(url, inline=True, link_type=kind))

        except UnicodeError as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(contexts, document_encoding)
Пример #2
0
    def scrape(self, request, response, link_type=None):
        """Scrape a Sitemap document for links.

        Args:
            request: the originating request; its URL is used as the
                base URL for resolving links.
            response: the response whose body is scanned.
            link_type: optional filter; anything other than
                ``LinkType.sitemap`` (or ``None``) is skipped.

        Returns:
            ScrapeResult: the discovered link contexts and the detected
            encoding, or ``None`` when the document is unsupported or
            the link type does not match.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.sitemap:
            return

        base_url = request.url_info.url
        encoding = self._encoding_override \
            or detect_response_encoding(response)
        link_contexts = set()

        try:
            with wpull.util.reset_file_offset(response.body):
                link_iter = self.iter_processed_links(response.body, encoding,
                                                      base_url)
                for link in link_iter:
                    link_contexts.add(LinkContext(link, linked=True))

        except (UnicodeError, self._html_parser.parser_error) as error:
            # BUG FIX: the message must be wrapped in the lazy-format
            # helper ``__``; passing ``url``/``error`` as keyword
            # arguments directly to ``Logger.warning`` raises TypeError
            # (it accepts no such kwargs). Matches the other scrapers.
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(link_contexts, encoding)
Пример #3
0
    def scrape(self, request, response, link_type=None):
        """Scrape a JavaScript document for links.

        Returns a ``ScrapeResult`` with the discovered link contexts and
        the detected encoding, or ``None`` when the document is
        unsupported or the requested ``link_type`` is not JavaScript.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.javascript:
            return

        encoding = self._encoding_override or \
            detect_response_encoding(response)
        base_url = request.url_info.url
        contexts = set()

        try:
            with wpull.util.reset_file_offset(response.body):
                for url, context in self.iter_processed_links(
                        response.body, encoding, base_url, context=True):
                    embedded = is_likely_inline(url)
                    # ``context`` is True for a bare string match;
                    # otherwise it carries a specific link-type hint.
                    kind = None if context is True else context

                    contexts.add(LinkContext(
                        url, inline=embedded, linked=not embedded,
                        link_type=kind))

        except UnicodeError as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))

        return ScrapeResult(contexts, encoding)
Пример #4
0
    def scrape(self, request, response, link_type=None):
        """Scrape an HTML document for links.

        Args:
            request: the originating request; its URL is the base URL.
            response: the response whose body is parsed.
            link_type: optional filter; anything other than
                ``LinkType.html`` (or ``None``) is skipped.

        Returns:
            ScrapeResult: the discovered link contexts, the detected
            encoding, and a ``'base_url'`` entry; ``None`` when the
            document is unsupported or the link type does not match.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.html:
            return

        base_url = request.url_info.url
        content_file = response.body
        encoding = self._encoding_override \
            or detect_response_encoding(response, is_html=True)
        link_contexts = set()

        try:
            with wpull.util.reset_file_offset(content_file):
                elements = self.iter_elements(content_file, encoding=encoding)

                result_meta_info = self._process_elements(
                    elements, response, base_url, link_contexts
                )

        except (UnicodeError, self._html_parser.parser_error) as error:
            _logger.warning(__(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            ))
            result_meta_info = {}

        if result_meta_info.get('robots_no_follow'):
            # BUG FIX: ``set.discard`` removes a single element equal to
            # its argument; the original passed a frozenset of contexts,
            # which is never itself a member, so nothing was removed.
            # Remove each linked context individually instead.
            link_contexts.difference_update([
                context for context in link_contexts if context.linked
            ])

        scrape_result = ScrapeResult(link_contexts, encoding)
        scrape_result['base_url'] = base_url
        return scrape_result
Пример #5
0
    def scrape(self, request, response, link_type=None):
        """Scrape a Sitemap document for links.

        Returns a ``ScrapeResult`` with the discovered link contexts and
        the detected encoding, or ``None`` when the document is
        unsupported or the requested ``link_type`` is not sitemap.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.sitemap:
            return

        encoding = self._encoding_override or \
            detect_response_encoding(response)
        found = set()

        try:
            with wpull.util.reset_file_offset(response.body):
                for url in self.iter_processed_links(
                        response.body, encoding, request.url_info.url):
                    found.add(LinkContext(url, linked=True))

        except (UnicodeError, self._html_parser.parser_error) as error:
            _logger.warning(
                __(_('Failed to read document at ‘{url}’: {error}'),
                   url=request.url_info.url,
                   error=error))

        return ScrapeResult(found, encoding)