def open_spider(self, spider):
    self.enabled = self.is_enabled(spider)
    self.spider = spider

    for k, type_ in self._settings:
        setattr(self, k, self._get_setting_value(spider, k, type_))

    self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
    self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

    if not self.enabled and not self.force_enable_on_http_codes:
        return

    if not self.apikey:
        logging.warning(
            "Crawlera can't be used without an APIKEY",
            extra={'spider': spider},
        )
        return

    self._proxyauth = self.get_proxyauth(spider)

    logging.info(
        "Using crawlera at %s (apikey: %s)" % (self.url, self.apikey[:7]),
        extra={'spider': spider},
    )

    if not self.preserve_delay:
        # Setting spider download delay to 0 to get maximum crawl rate
        spider.download_delay = 0
        logging.info(
            "CrawleraMiddleware: disabling download delays on the Scrapy side to "
            "optimize the delays introduced by Crawlera. To avoid this behaviour "
            "you can use the CRAWLERA_PRESERVE_DELAY setting, but keep in mind "
            "that this may slow down the crawl significantly",
            extra={'spider': spider},
        )
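# A hypothetical shape for the `self._settings` iterated above: (attribute, type)
# pairs, each resolved by `_get_setting_value` from a spider attribute or a
# CRAWLERA_<NAME> project setting. The exact names and types here are assumptions
# for illustration, not the library's definitive list:
_settings = [
    ('apikey', str),
    ('url', str),
    ('maxbans', int),
    ('download_timeout', int),
    ('preserve_delay', bool),
    ('backoff_step', int),
    ('backoff_max', int),
    ('force_enable_on_http_codes', list),
]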
def process_response(self, request, response, spider):
    if not self._is_enabled_for_request(request):
        return response

    key = self._get_slot_key(request)
    self._restore_original_delay(request)

    if self._is_no_available_proxies(response):
        self._set_custom_delay(request, next(self.exp_backoff))
    else:
        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

    if self._is_banned(response):
        self._bans[key] += 1
        if self._bans[key] > self.maxbans:
            self.crawler.engine.close_spider(spider, 'banned')
        else:
            after = response.headers.get('retry-after')
            if after:
                self._set_custom_delay(request, float(after))
        self.crawler.stats.inc_value('crawlera/response/banned')
    else:
        self._bans[key] = 0

    # If placed behind `RedirectMiddleware`, it would not count 3xx responses
    self.crawler.stats.inc_value('crawlera/response')
    self.crawler.stats.inc_value('crawlera/response/status/%s' % response.status)

    crawlera_error = response.headers.get('X-Crawlera-Error')
    if crawlera_error:
        self.crawler.stats.inc_value('crawlera/response/error')
        self.crawler.stats.inc_value(
            'crawlera/response/error/%s' % crawlera_error.decode('utf8'))
    return response
def process_response(self, request, response, spider):
    if not self._is_enabled_for_request(request):
        return self._handle_not_enabled_response(request, response)

    if not self._is_crawlera_response(response):
        return response

    key = self._get_slot_key(request)
    self._restore_original_delay(request)

    if self._is_no_available_proxies(response) or self._is_auth_error(response):
        if self._is_no_available_proxies(response):
            reason = 'noslaves'
        else:
            reason = 'autherror'
        self._set_custom_delay(request, next(self.exp_backoff), reason=reason)
    else:
        self.crawler.stats.inc_value('crawlera/delay/reset_backoff')
        self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

    if self._is_auth_error(response):
        # When Crawlera has issues it might not be able to authenticate users,
        # so we must retry
        retries = request.meta.get('crawlera_auth_retry_times', 0)
        if retries < self.max_auth_retry_times:
            return self._retry_auth(response, request, spider)
        else:
            self.crawler.stats.inc_value('crawlera/retries/auth/max_reached')
            logging.warning(
                "Max retries for authentication issues reached, please check auth"
                " information settings",
                extra={'spider': self.spider},
            )

    if self._is_banned(response):
        self._bans[key] += 1
        if self._bans[key] > self.maxbans:
            self.crawler.engine.close_spider(spider, 'banned')
        else:
            after = response.headers.get('retry-after')
            if after:
                self._set_custom_delay(request, float(after), reason='banned')
        self.crawler.stats.inc_value('crawlera/response/banned')
    else:
        self._bans[key] = 0

    # If placed behind `RedirectMiddleware`, it would not count 3xx responses
    self.crawler.stats.inc_value('crawlera/response')
    self.crawler.stats.inc_value('crawlera/response/status/%s' % response.status)

    crawlera_error = response.headers.get('X-Crawlera-Error')
    if crawlera_error:
        self.crawler.stats.inc_value('crawlera/response/error')
        self.crawler.stats.inc_value(
            'crawlera/response/error/%s' % crawlera_error.decode('utf8'))
    return response
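# Both `process_response` variants above pull delays from an
# `exp_backoff(step, max)` generator when Crawlera reports no available proxies
# (or, in the newer variant, an auth error), and recreate it to reset the
# backoff. A minimal sketch of such a generator, assuming full-jitter
# exponential backoff capped at `max_` seconds; the library's actual helper may
# differ:
import random
from itertools import count

def exp_backoff(step, max_):
    """Yield exponentially growing random delays, capped at max_ seconds."""
    for attempt in count():
        yield random.uniform(0, min(max_, step * 2 ** attempt))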
def open_spider(self, spider):
    self.enabled = self.is_enabled(spider)
    if not self.enabled:
        return

    for k, type_ in self._settings:
        setattr(self, k, self._get_setting_value(spider, k, type_))

    self._proxyauth = self.get_proxyauth(spider)

    logging.info("Using crawlera at %s (apikey: %s)" % (self.url, self.apikey[:7]))

    if not self.preserve_delay:
        # Setting spider download delay to 0 to get maximum crawl rate
        spider.download_delay = 0
        logging.info(
            "CrawleraMiddleware: disabling download delays on the Scrapy side to "
            "optimize the delays introduced by Crawlera. To avoid this behaviour "
            "you can use the CRAWLERA_PRESERVE_DELAY setting, but keep in mind "
            "that this may slow down the crawl significantly")

    self._headers = self.crawler.settings.get('CRAWLERA_DEFAULT_HEADERS', {}).items()
    self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
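# For context, a sketch of the Scrapy settings that drive this middleware.
# CRAWLERA_DEFAULT_HEADERS and CRAWLERA_PRESERVE_DELAY appear in the code above;
# the middleware path, priority, and the remaining setting names are assumptions
# that may differ between library versions:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your apikey>'
CRAWLERA_DEFAULT_HEADERS = {'X-Crawlera-Profile': 'desktop'}
CRAWLERA_PRESERVE_DELAY = True  # keep the spider's own download_delay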