def process_splash_response(self, url, splash_response):
    """Extract the screenshot and rendered HTML from a Splash response.

    Args:
        url: URL of the rendered page; its domain (via ``get_domain``) is
            handed to ``self.save_screenshot``.
        splash_response: response object whose ``.text`` is a JSON document
            containing at least an ``"html"`` key.

    Returns:
        A ``(screenshot_path, html_rendered)`` tuple, where
        ``screenshot_path`` is whatever ``self.save_screenshot`` returns and
        ``html_rendered`` is the ``"html"`` entry of the decoded payload.

    Raises:
        ValueError: if the response text is not valid JSON.
        KeyError: if the decoded payload has no ``"html"`` entry.
    """
    # NOTE: the ``encoding`` keyword of json.loads was removed in
    # Python 3.9 (passing it raises TypeError). ``.text`` is already a
    # decoded string, so the argument was never needed.
    data = json.loads(splash_response.text)
    screenshot_path = self.save_screenshot(get_domain(url), data)
    html_rendered = data["html"]
    return screenshot_path, html_rendered
def _load_webpage_item(self, response, is_seed):
    """Create a ``WebpageItemLoader`` filled in from ``response``.

    Always recorded: ``url``, ``host``, ``title`` (from the ``<title>``
    text), ``depth``, ``total_depth``, ``crawled_at``, ``is_seed`` and
    ``crawler_score``. ``html`` is added only when ``self.save_html`` is
    truthy. The ``link_*`` and ``referrer_*`` fields are added only when
    ``response.meta`` carries a ``'link'`` entry.

    Args:
        response: the crawled response; ``response.meta`` must contain
            ``'score'``.
        is_seed: stored as-is under the ``is_seed`` field.

    Returns:
        The populated loader (not the loaded item).

    Raises:
        KeyError: if ``'score'`` is missing from ``response.meta``, or if a
            ``'link'`` entry is present without the accompanying
            ``'referrer_url'`` / ``'referrer_depth'`` entries.
    """
    meta = response.meta
    page_url = response.url
    link_depth = meta.get('link_depth', 0)

    loader = WebpageItemLoader(response=response)

    # Page identity.
    loader.add_value('url', page_url)
    loader.add_value('host', get_domain(page_url))
    loader.add_xpath('title', '//title/text()')

    # Crawl bookkeeping.
    loader.add_value('depth', link_depth)
    loader.add_value('total_depth', meta.get('depth'))
    loader.add_value('crawled_at', datetime.utcnow())
    loader.add_value('is_seed', is_seed)
    loader.add_value('crawler_score', meta['score'])

    if self.save_html:
        loader.add_value('html', response.body_as_unicode())

    # Provenance is only available for pages reached by following a link.
    if 'link' in meta:
        followed = meta['link']
        loader.add_value('link_text', followed['text'])
        loader.add_value('link_url', followed['url'])
        loader.add_value('referrer_url', meta['referrer_url'])
        loader.add_value('referrer_depth', meta['referrer_depth'])

    return loader