def scrape(self, request, response, link_type=None):
    '''Scrape a CSS document for ``url()`` and ``@import`` references.'''
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.css:
        return

    link_contexts = set()
    base_url = request.url_info.url
    encoding = self._encoding_override or \
        detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            for link, context in self.iter_processed_links(
                    response.body, encoding, base_url, context=True):
                if context == 'import':
                    link_type = LinkType.css
                else:
                    link_type = LinkType.media

                link_contexts.add(
                    LinkContext(link, inline=True, link_type=link_type)
                )

    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
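
# Hypothetical usage sketch (not part of this module): how the result of
# the CSS scraper above is typically consumed. Only ``scrape()`` and the
# ``link_contexts`` accessor come from the code above; the helper name and
# the argument objects are assumptions for illustration.

def print_css_links(scraper, request, response):
    result = scraper.scrape(request, response)

    if not result:  # None means unsupported document or link_type mismatch
        return

    for context in result.link_contexts:
        # '@import' targets carry LinkType.css; url() references such as
        # images and fonts carry LinkType.media. CSS can only embed
        # resources, so every context is inline=True.
        print(context.link, context.link_type)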
def scrape(self, request, response, link_type=None):
    '''Scrape a sitemap document for the URLs it lists.'''
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.sitemap:
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)
    link_contexts = set()

    try:
        with wpull.util.reset_file_offset(response.body):
            link_iter = self.iter_processed_links(
                response.body, encoding, base_url)

            for link in link_iter:
                link_contexts.add(LinkContext(link, linked=True))

    except (UnicodeError, self._html_parser.parser_error) as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
def scrape(self, request, response, link_type=None):
    '''Scrape a JavaScript document for URL-like string literals.'''
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.javascript:
        return

    link_contexts = set()
    base_url = request.url_info.url
    encoding = self._encoding_override or \
        detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            for link, context in self.iter_processed_links(
                    response.body, encoding, base_url, context=True):
                inline = is_likely_inline(link)

                if context is True:
                    link_type = None
                else:
                    link_type = context

                link_contexts.add(
                    LinkContext(link, inline=inline, linked=not inline,
                                link_type=link_type)
                )

    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(link_contexts, encoding)
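
# JavaScript gives no syntactic clue whether a string literal names a page
# or an embedded resource, so the scraper above defers to the
# ``is_likely_inline()`` filename heuristic. The sketch below shows the
# idea under the assumption that the heuristic is extension-based; the
# real function lives elsewhere in wpull and its exact extension list may
# differ.

def _is_likely_inline_sketch(link):
    '''Guess whether a URL names an embedded resource by its extension.'''
    resource_suffixes = (
        '.css', '.js', '.png', '.gif', '.jpg', '.jpeg', '.svg', '.ico',
        '.woff',
    )
    return link.lower().endswith(resource_suffixes)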
def scrape(self, request, response, link_type=None):
    '''Scrape an HTML document for inline and linked URLs.'''
    if not self.is_supported(request=request, response=response):
        return
    if link_type and link_type != LinkType.html:
        return

    base_url = request.url_info.url
    content_file = response.body
    encoding = self._encoding_override \
        or detect_response_encoding(response, is_html=True)
    link_contexts = set()

    try:
        with wpull.util.reset_file_offset(content_file):
            elements = self.iter_elements(content_file, encoding=encoding)

            result_meta_info = self._process_elements(
                elements, response, base_url, link_contexts
            )

    except (UnicodeError, self._html_parser.parser_error) as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))
        result_meta_info = {}

    if result_meta_info.get('robots_no_follow'):
        # Honor a robots "nofollow" meta tag by dropping the linked
        # (navigational) URL contexts while keeping inline page
        # requisites.
        link_contexts.difference_update(frozenset(
            context for context in link_contexts if context.linked
        ))

    scrape_result = ScrapeResult(link_contexts, encoding)
    scrape_result['base_url'] = base_url

    return scrape_result
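
# Sketch of how a caller might fan one response out across the scrapers
# above. wpull ships a similar demultiplexer; the class name, constructor,
# and first-match policy here are assumptions for illustration, not the
# real API.

class DemuxScraperSketch:
    '''Try each scraper in turn and return the first usable result.'''

    def __init__(self, scrapers):
        self._scrapers = scrapers

    def scrape(self, request, response, link_type=None):
        for scraper in self._scrapers:
            # Each scraper returns None when the document type or the
            # requested link_type does not match, so unsupported
            # documents fall through to the next scraper.
            result = scraper.scrape(request, response, link_type)

            if result is not None:
                return result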