def scrape(self, request, response):
    '''Return the links scraped from the document as a dict.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)
    links = set()

    try:
        with wpull.util.reset_file_offset(response.body.content_file):
            link_iter = self.read_links(
                response.body.content_file, encoding=encoding
            )

            for link in link_iter:
                # Clean the scraped link text and resolve it against
                # the document URL; urljoin_safe returns a falsy value
                # when the join cannot be made.
                link = urljoin_safe(
                    base_url,
                    clean_link_soup(link)
                )

                if link:
                    links.add(link)
    except (UnicodeError, lxml.etree.LxmlError) as error:
        _logger.warning(
            _('Failed to read document at ‘{url}’: {error}')
            .format(url=request.url_info.url, error=error)
        )

    return {
        'inline_urls': (),
        'linked_urls': links,
        'encoding': encoding
    }
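
# Hedged usage sketch (the `scraper`, `request`, and `response` names
# below are assumptions, not part of this module). scrape() returns
# None for unsupported documents; otherwise a dict whose 'linked_urls'
# set holds the cleaned, joined links. Note that this variant never
# reports page requisites: 'inline_urls' is always an empty tuple.
#
#     scrape_info = scraper.scrape(request, response)
#
#     if scrape_info:
#         for url in scrape_info['linked_urls']:
#             queue_for_download(url)  # hypothetical helper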

def scrape(self, request, response):
    '''Return the inline and linked URLs scraped from the document.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    content_file = response.body.content_file
    encoding = self._encoding_override \
        or detect_response_encoding(response, is_html=True)
    linked_urls = set()
    inline_urls = set()

    try:
        with wpull.util.reset_file_offset(content_file):
            elements = self.read_links(content_file, encoding=encoding)

            result_meta_info = self._process_elements(
                elements, response, base_url, linked_urls, inline_urls
            )
    except (UnicodeError, lxml.etree.LxmlError) as error:
        _logger.warning(
            _('Failed to read document at ‘{url}’: {error}')
            .format(url=request.url_info.url, error=error)
        )
        result_meta_info = {}

    if result_meta_info.get('robots_no_follow'):
        # The document asked robots not to follow its links, so
        # report only the inline (page requisite) URLs.
        linked_urls.clear()

    return {
        'inline_urls': inline_urls,
        'linked_urls': linked_urls,
        'base_url': base_url,
        'encoding': encoding,
    }
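
# Hedged usage sketch (names other than scrape() are assumptions):
# this variant separates page requisites from followable links, and
# the 'robots_no_follow' flag returned by _process_elements
# (presumably set from a robots nofollow directive in the document)
# clears 'linked_urls' while leaving 'inline_urls' intact, so
# requisites such as stylesheets and images can still be fetched.
#
#     scrape_info = scraper.scrape(request, response)
#
#     if scrape_info:
#         fetch_requisites(scrape_info['inline_urls'])  # hypothetical
#         follow_links(scrape_info['linked_urls'])      # hypothetical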

def iter_scrape(self, request, response):
    '''Iteratively scrape the document, yielding ScrapedLinkResult.'''
    if not self.is_supported(request=request, response=response):
        return

    base_url = request.url_info.url
    encoding = self._encoding_override \
        or detect_response_encoding(response)

    with wpull.util.reset_file_offset(response.body.content_file):
        for link in self.read_links(response.body.content_file, encoding):
            link = urljoin_safe(base_url, link, allow_fragments=False)

            if link:
                yield ScrapedLinkResult(link, True, encoding)
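
# Hedged usage sketch: unlike scrape(), iter_scrape() yields results
# lazily, so a very large document can be processed without first
# materializing the full link set in memory. Assuming ScrapedLinkResult
# exposes link/inline/encoding fields (it is constructed positionally
# above):
#
#     for result in scraper.iter_scrape(request, response):
#         print(result.link, result.inline, result.encoding)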