def process_request(self, request, spider):
    useragent = self._useragent
    rp = self.robot_parser(request, spider)
    if rp and not rp.can_fetch(useragent, request.url):
        log.msg(format="Forbidden by robots.txt: %(request)s",
                level=log.DEBUG, request=request)
        raise IgnoreRequest
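
# A minimal, self-contained sketch (illustrative, not part of the original
# source) of the check performed above, using the stdlib robots.txt parser
# that the middleware's robot_parser objects are assumed to wrap.
try:
    from urllib.robotparser import RobotFileParser  # Python 3
except ImportError:
    from robotparser import RobotFileParser  # Python 2

_rp = RobotFileParser()
_rp.parse([
    "User-agent: *",
    "Disallow: /private/",
])
# can_fetch(useragent, url) is the same predicate process_request evaluates.
assert _rp.can_fetch("Scrapy", "http://example.com/index.html")
assert not _rp.can_fetch("Scrapy", "http://example.com/private/page")
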
def _debug_set_cookie(self, response, spider):
    if self.debug:
        cl = response.headers.getlist('Set-Cookie')
        if cl:
            msg = "Received cookies from: %s" % response + os.linesep
            msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
            log.msg(msg, spider=spider, level=log.DEBUG)

def _debug_cookie(self, request, spider):
    if self.debug:
        cl = request.headers.getlist('Cookie')
        if cl:
            msg = "Sending cookies to: %s" % request + os.linesep
            msg += os.linesep.join("Cookie: %s" % c for c in cl)
            log.msg(msg, spider=spider, level=log.DEBUG)
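
# A quick illustration (sample values assumed, not from the original source)
# of the message these two helpers build: one header line per cookie value,
# joined with os.linesep after the request/response banner.
import os

_cl = ['a=1', 'b=2']
_msg = "Sending cookies to: %s" % "<GET http://example.com>" + os.linesep
_msg += os.linesep.join("Cookie: %s" % c for c in _cl)
print(_msg)
# Sending cookies to: <GET http://example.com>
# Cookie: a=1
# Cookie: b=2
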
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    if retries <= self.max_retry_times:
        log.msg(format="Retrying %(request)s (failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=retries, reason=reason)
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        return retryreq
    else:
        log.msg(format="Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=retries, reason=reason)
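
# A self-contained sketch (names are illustrative, not from the original
# source) of the bookkeeping above: each copy carries 'retry_times' in its
# meta dict, dont_filter bypasses the dupe filter, and the chain stops once
# max_retry_times is exceeded.
class _FakeRequest(object):
    def __init__(self, url, meta=None, priority=0):
        self.url = url
        self.meta = dict(meta or {})
        self.priority = priority
        self.dont_filter = False

    def copy(self):
        return _FakeRequest(self.url, self.meta, self.priority)

_max_retry_times = 2
_req = _FakeRequest("http://example.com")
while True:
    _retries = _req.meta.get('retry_times', 0) + 1
    if _retries > _max_retry_times:
        break  # this is where _retry logs "Gave up retrying"
    _retry_req = _req.copy()
    _retry_req.meta['retry_times'] = _retries
    _retry_req.dont_filter = True
    _req = _retry_req

print(_req.meta['retry_times'])  # 2
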
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                level=log.DEBUG, spider=spider, request=request,
                redirected=redirected, reason=reason)
        return redirected
    else:
        log.msg(format="Discarding %(request)s: max redirections reached",
                level=log.DEBUG, spider=spider, request=request)
        raise IgnoreRequest("max redirections reached")
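
# Illustrative sketch (not original source; a single meta dict stands in for
# the request/redirected pair) of the bookkeeping above: 'redirect_urls'
# accumulates the chain while 'redirect_ttl' counts down from the maximum.
_meta = {}
_max_redirect_times = 3
for _url in ["http://a/", "http://b/", "http://c/", "http://d/"]:
    _ttl = _meta.setdefault('redirect_ttl', _max_redirect_times)
    _redirects = _meta.get('redirect_times', 0) + 1
    if not (_ttl and _redirects <= _max_redirect_times):
        print("Discarding: max redirections reached")
        break
    _meta['redirect_times'] = _redirects
    _meta['redirect_ttl'] = _ttl - 1
    _meta['redirect_urls'] = _meta.get('redirect_urls', []) + [_url]

print(_meta['redirect_urls'])  # ['http://a/', 'http://b/', 'http://c/']
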
@classmethod
def from_settings(cls, settings, signals=None, stats=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            if hasattr(mwcls, 'from_settings'):
                mw = mwcls.from_settings(settings, signals, stats)
            else:
                mw = mwcls()
            middlewares.append(mw)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                log.msg(format="Disabled %(clsname)s: %(eargs)s",
                        level=log.WARNING, clsname=clsname, eargs=e.args[0])
    enabled = [x.__class__.__name__ for x in middlewares]
    log.msg(format="Enabled %(componentname)ss: %(enabledlist)s",
            level=log.INFO, componentname=cls.component_name,
            enabledlist=', '.join(enabled))
    return cls(*middlewares)
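
# A minimal sketch (illustrative stand-ins, not the original MiddlewareManager;
# the real NotConfigured lives in scrapy.exceptions) of the convention relied
# on above: a component may raise NotConfigured from its constructor or
# from_settings to opt out, and only survivors end up enabled.
class NotConfigured(Exception):
    pass

class _EnabledMW(object):
    pass

class _DisabledMW(object):
    def __init__(self):
        raise NotConfigured("MYEXT_ENABLED is off")

_middlewares = []
for _mwcls in [_EnabledMW, _DisabledMW]:
    try:
        _middlewares.append(_mwcls())
    except NotConfigured as e:
        print("Disabled %s: %s" % (_mwcls.__name__, e.args[0]))

print([m.__class__.__name__ for m in _middlewares])  # ['_EnabledMW']
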
def process_response(self, request, response, spider):
    if not isinstance(response, HtmlResponse) or response.status != 200:
        return response

    if request.method != 'GET':
        # other HTTP methods are either not safe or don't have a body
        return response

    if 'ajax_crawlable' in request.meta:  # prevent loops
        return response

    if not self._has_ajax_crawlable_variant(response):
        return response

    # scrapy already handles #! links properly
    ajax_crawl_request = request.replace(url=request.url + '#!')
    log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
            level=log.DEBUG, spider=spider,
            ajax_crawl_request=ajax_crawl_request, request=request)
    ajax_crawl_request.meta['ajax_crawlable'] = True
    return ajax_crawl_request
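
# Illustrative sketch (not original source) of the loop guard above: the
# rewritten request is flagged in meta, so when its response comes back this
# middleware returns it untouched instead of appending '#!' a second time.
def _rewrite_once(url, meta):
    if 'ajax_crawlable' in meta:
        return url, meta
    meta = dict(meta, ajax_crawlable=True)
    return url + '#!', meta

_url, _meta = _rewrite_once("http://example.com/page", {})
print(_url)  # http://example.com/page#!
_url, _meta = _rewrite_once(_url, _meta)
print(_url)  # unchanged: http://example.com/page#!
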
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', None)
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver, signal=signal, sender=sender,
                                   *arguments, **named)
            if isinstance(response, Deferred):
                log.msg(format="Cannot return deferreds from signal handler: %(receiver)s",
                        level=log.ERROR, spider=spider, receiver=receiver)
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, "Error caught on signal handler: %s" % receiver,
                    spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses
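
# A small sketch (illustrative; a sentinel class replaces the None default so
# it runs on modern Python) of the 'dont_log' trick above: `except` accepts
# any expression evaluating to an exception class, so callers can name one
# exception type to swallow silently while everything else is also logged.
class _NothingIgnored(Exception):
    pass

def _call_catch(receiver, dont_log=_NothingIgnored):
    try:
        return ('ok', receiver())
    except dont_log:
        return ('failure', None)  # swallowed without a log line
    except Exception as e:
        print("Error caught on signal handler: %r" % e)
        return ('failure', None)

class _CloseSpider(Exception):
    pass

def _noisy():
    raise ValueError("boom")

def _quiet():
    raise _CloseSpider()

print(_call_catch(_noisy))                         # logged, then ('failure', None)
print(_call_catch(_quiet, dont_log=_CloseSpider))  # silent ('failure', None)
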
def close_spider(self, spider, reason):
    if self._dump:
        log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
                spider=spider)
    self._persist_stats(self._stats, spider)
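
# Quick illustration (stat names assumed for the example) of the dump above:
# the stats mapping is pretty-printed into one multi-line log message.
import pprint

_stats = {
    'downloader/request_count': 18,
    'downloader/response_count': 18,
    'item_scraped_count': 15,
}
print("Dumping Scrapy stats:\n" + pprint.pformat(_stats))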