def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider
    """
    # TODO: keep closing state internally instead of checking engine
    if spider in self.engine.closing:
        return
    elif isinstance(output, Request):
        send_catch_log(signal=signals.request_received, request=output,
                       spider=spider)
        self.engine.crawl(request=output, spider=spider)
    elif isinstance(output, BaseItem):
        log.msg("Scraped %s in <%s>" % (output, request.url), level=log.DEBUG,
                spider=spider)
        send_catch_log(signal=signals.item_scraped, sender=self.__class__,
                       item=output, spider=spider, response=response)
        self.sites[spider].itemproc_size += 1
        # FIXME: this can't be called here because the stats spider may be
        # already closed
        #stats.max_value('scraper/max_itemproc_size',
        #    self.sites[spider].itemproc_size, spider=spider)
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, spider)
        return dfd
    elif output is None:
        pass
    else:
        log.msg("Spider must return Request, BaseItem or None, got %r in %s" %
                (type(output).__name__, request), log.ERROR, spider=spider)
def _on_success(response):
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        log.msg(log.formatter.crawled(request, response, spider),
                level=log.DEBUG, spider=spider)
        send_catch_log(signal=signals.response_received,
                       response=response, request=request, spider=spider)
    return response
def close_spider(self, spider, reason):
    send_catch_log(stats_spider_closing, spider=spider, reason=reason)
    stats = self._stats.pop(spider)
    send_catch_log(stats_spider_closed, spider=spider, reason=reason,
                   spider_stats=stats)
    if self._dump:
        log.msg("Dumping spider stats:\n" + pprint.pformat(stats),
                spider=spider)
    self._persist_stats(stats, spider)
def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
        return
    log.err(_failure, "Spider error processing %s" % request, spider=spider)
    send_catch_log(signal=signals.spider_error, failure=_failure,
                   response=response, spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)
def handle_spider_error(self, _failure, request, response, spider,
                        propagated_failure=None):
    referer = request.headers.get('Referer', None)
    msg = "Spider error processing <%s> (referer: <%s>)" % \
        (request.url, referer)
    log.err(_failure, msg, spider=spider)
    send_catch_log(signal=signals.spider_error, failure=_failure,
                   response=response, spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)
def open_spider(self, spider):
    log.msg("Spider opened", spider=spider)
    self.next_request(spider)
    self.downloader.open_spider(spider)
    self.scraper.open_spider(spider)
    stats.open_spider(spider)
    send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider)
def start(self):
    """Start the execution engine"""
    if self.running:
        return
    self.start_time = time()
    send_catch_log(signal=signals.engine_started, sender=self.__class__)
    self._mainloop_task.start(5.0, now=True)
    reactor.callWhenRunning(self._mainloop)
    self.running = True
def test_error_logged_if_deferred_not_supported(self):
    test_signal = object()
    test_handler = lambda: defer.Deferred()
    dispatcher.connect(test_handler, test_signal)
    with LogCapture() as l:
        send_catch_log(test_signal)
    self.assertEqual(len(l.records), 1)
    self.assertIn("Cannot return deferreds from signal handler", str(l))
    dispatcher.disconnect(test_handler, test_signal)
def open_spider(self, spider):
    assert self.has_capacity(), "No free spider slots when opening %r" % \
        spider.name
    log.msg("Spider opened", spider=spider)
    self.scheduler.open_spider(spider)
    self.downloader.open_spider(spider)
    self.scraper.open_spider(spider)
    stats.open_spider(spider)
    send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider)
    self.next_request(spider)
def _on_success(response):
    """handle the result of a page download"""
    assert isinstance(response, (Response, Request))
    if isinstance(response, Response):
        response.request = request  # tie request to response received
        log.msg(log.formatter.crawled(request, response, spider),
                level=log.DEBUG, spider=spider)
        send_catch_log(signal=signals.response_received,
                       response=response, request=request, spider=spider)
    return response
def parse(self, response, basic_link_info, spider): if not self.conditon_permit(response, basic_link_info, spider): return ReturnStatus.move_on self.log("use parser: %s" % type(self)) self.init_context(response, basic_link_info, spider) item_num = self.process() send_catch_log(signal=signals.item_extracted, url=self.response.url, item_num=item_num) return ReturnStatus.stop_it
def parse(self, response, basic_link_info, spider):
    if not self.conditon_permit(response, basic_link_info, spider):
        return ReturnStatus.move_on
    self.log("use parser: %s" % type(self))
    self.init_context(response, basic_link_info, spider)
    link_num = self.process()
    send_catch_log(signal=signals.link_extracted, url=self.response.url,
                   link_num=link_num)
    return ReturnStatus.stop_it
def test_error_logged_if_deferred_not_supported(self):
    test_signal = object()
    test_handler = lambda: defer.Deferred()
    log_events = []
    txlog.addObserver(log_events.append)
    dispatcher.connect(test_handler, test_signal)
    send_catch_log(test_signal)
    self.assertTrue(log_events)
    self.assertIn("Cannot return deferreds from signal handler",
                  str(log_events))
    txlog.removeObserver(log_events.append)
    self.flushLoggedErrors()
    dispatcher.disconnect(test_handler, test_signal)
def test_error_logged_if_deferred_not_supported(self):
    test_signal = object()
    test_handler = lambda: defer.Deferred()
    log_events = []
    txlog.addObserver(log_events.append)
    dispatcher.connect(test_handler, test_signal)
    send_catch_log(test_signal)
    self.failUnless(log_events)
    self.failUnless(
        "Cannot return deferreds from signal handler" in str(log_events))
    txlog.removeObserver(log_events.append)
    self.flushLoggedErrors()
    dispatcher.disconnect(test_handler, test_signal)
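# The two tests above pin down that plain send_catch_log rejects handlers that
# return Deferreds. A hedged sketch (not taken from the snippets above) of the
# deferred-aware counterpart used in the item_scraped path further down,
# assuming send_catch_log_deferred is importable from scrapy.utils.signal:
from pydispatch import dispatcher
from twisted.internet import defer
from scrapy.utils.signal import send_catch_log_deferred

test_signal = object()

def deferred_handler(**kwargs):
    # returning a Deferred is allowed here, unlike with send_catch_log
    return defer.succeed("done")

dispatcher.connect(deferred_handler, signal=test_signal)
dfd = send_catch_log_deferred(test_signal)
# the resulting DeferredList fires with a list of (receiver, result) pairs
dfd.addCallback(lambda results: results)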
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    cmds = _get_commands_dict()
    cmdname = _get_command_name(argv)
    _update_default_settings("scrapy.conf.commands", cmdname)
    _update_default_settings(settings["COMMANDS_SETTINGS_MODULE"], cmdname)
    parser = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(),
        conflict_handler="resolve",
        add_help_option=False)
    if cmdname in cmds:
        cmd = cmds[cmdname]
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv[1:])
        cmd.process_options(args, opts)
        parser.usage = "%%prog %s %s" % (cmdname, cmd.syntax())
        parser.description = cmd.long_desc()
        if cmd.requires_project and not settings.settings_module:
            print "Error running: scrapy-ctl.py %s\n" % cmdname
            print "Cannot find project settings module in python path: %s" % \
                settings.settings_module_path
            sys.exit(1)
        if opts.help:
            parser.print_help()
            sys.exit()
    elif not cmdname:
        cmd = ScrapyCommand()
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv)
        cmd.process_options(args, opts)
        _print_usage(settings.settings_module)
        sys.exit(2)
    else:
        print "Unknown command: %s\n" % cmdname
        print 'Use "scrapy-ctl.py -h" for help'
        sys.exit(2)
    del args[0]  # remove command name from args
    send_catch_log(signal=command_executed, cmdname=cmdname, cmdobj=cmd,
                   args=args, opts=opts)
    from scrapy.core.manager import scrapymanager
    scrapymanager.configure(control_reactor=True)
    ret = _run_command(cmd, args, opts)
    if ret is False:
        parser.print_help()
def send(self, to, subject, body, cc=None, attachs=()):
    if attachs:
        msg = MIMEMultipart()
    else:
        msg = MIMENonMultipart('text', 'plain')
    msg['From'] = self.mailfrom
    msg['To'] = COMMASPACE.join(to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject
    rcpts = to[:]
    if cc:
        rcpts.extend(cc)
        msg['Cc'] = COMMASPACE.join(cc)
    if attachs:
        msg.attach(MIMEText(body))
        for attach_name, mimetype, f in attachs:
            part = MIMEBase(*mimetype.split('/'))
            part.set_payload(f.read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                            'attachment; filename="%s"' % attach_name)
            msg.attach(part)
    else:
        msg.set_payload(body)
    send_catch_log(signal=mail_sent, to=to, subject=subject, body=body,
                   cc=cc, attach=attachs, msg=msg)
    if self.debug:
        log.msg('Debug mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' %
                (to, cc, subject, len(attachs)), level=log.DEBUG)
        return
    dfd = self._sendmail(rcpts, msg.as_string())
    dfd.addCallbacks(self._sent_ok, self._sent_failed,
                     callbackArgs=[to, cc, subject, len(attachs)],
                     errbackArgs=[to, cc, subject, len(attachs)])
    reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
    return dfd
def _get_telnet_vars(self):
    # Note: if you add entries here also update topics/telnetconsole.rst
    telnet_vars = {
        'engine': scrapymanager.engine,
        'manager': scrapymanager,
        'extensions': extensions,
        'stats': stats,
        'spiders': spiders,
        'settings': settings,
        'est': print_engine_status,
        'p': pprint.pprint,
        'prefs': print_live_refs,
        'hpy': hpy,
    }
    send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
    return telnet_vars
def test_send_catch_log(self):
    handlers_called = set()

    def test_handler_error(arg):
        handlers_called.add(test_handler_error)
        a = 1/0  # deliberately raise ZeroDivisionError

    def test_handler_check(arg):
        handlers_called.add(test_handler_check)
        assert arg == 'test'
        return "OK"

    def log_received(message, level):
        handlers_called.add(log_received)
        assert "test_handler_error" in message
        assert level == log.ERROR

    dispatcher.connect(log_received, signal=log.logmessage_received)
    dispatcher.connect(test_handler_error, signal=test_signal)
    dispatcher.connect(test_handler_check, signal=test_signal)
    result = send_catch_log(test_signal, arg='test')
    assert test_handler_error in handlers_called
    assert test_handler_check in handlers_called
    assert log_received in handlers_called
    self.assertEqual(result[0][0], test_handler_error)
    self.assert_(isinstance(result[0][1], Exception))
    self.assertEqual(result[1], (test_handler_check, "OK"))
    dispatcher.disconnect(log_received, signal=log.logmessage_received)
    dispatcher.disconnect(test_handler_error, signal=test_signal)
    dispatcher.disconnect(test_handler_check, signal=test_signal)
def _finish_closing_spider(self, spider):
    """This function is called after the spider has been closed"""
    reason = self.closing.pop(spider, 'finished')
    send_catch_log(signal=signals.spider_closed, sender=self.__class__,
                   spider=spider, reason=reason)
    call = self._next_request_calls.pop(spider, None)
    if call and call.active():
        call.cancel()
    dfd = defer.maybeDeferred(stats.close_spider, spider, reason=reason)
    dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
                   spider=spider)
    dfd.addBoth(lambda _: spiders.close_spider(spider))
    dfd.addErrback(log.err, "Unhandled error in spiders.close_spider()",
                   spider=spider)
    dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason,
                                  spider=spider))
    dfd.addBoth(lambda _: self._spider_closed_callback(spider))
    return dfd
def process_response(response):
    assert response is not None, 'Received None in process_response'
    if isinstance(response, Request):
        return response
    for method in self.response_middleware:
        response = method(request=request, response=response, spider=spider)
        assert isinstance(response, (Response, Request)), \
            'Middleware %s.process_response must return Response or Request, got %s' % \
            (method.im_self.__class__.__name__, type(response))
        if isinstance(response, Request):
            send_catch_log(signal=signals.response_received,
                           sender=self.__class__, response=response,
                           spider=spider)
            return response
    send_catch_log(signal=signals.response_received, sender=self.__class__,
                   response=response, spider=spider)
    return response
def _itemproc_finished(self, output, item, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.sites[spider].itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            log.msg("Dropped %s - %s" % (item, str(ex)), level=log.WARNING,
                    spider=spider)
            send_catch_log(signal=signals.item_dropped, sender=self.__class__,
                           item=item, spider=spider, exception=output.value)
        else:
            log.msg('Error processing %s - %s' % (item, output),
                    log.ERROR, spider=spider)
    else:
        log.msg("Passed %s" % item, log.INFO, spider=spider)
        send_catch_log(signal=signals.item_passed, sender=self.__class__,
                       item=item, spider=spider, output=output)
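# Hedged sketch (not part of the original snippets) of the pipeline side that
# feeds the Failure branch in _itemproc_finished above: raising DropItem in
# process_item leads to the item_dropped signal, a normal return to
# item_passed. The 'price' field and pipeline name are invented, and the
# modern scrapy.exceptions import path for DropItem is assumed.
from scrapy.exceptions import DropItem

class PricePipeline(object):

    def process_item(self, item, spider):
        if not item.get('price'):
            raise DropItem("missing price in %s" % item)  # -> item_dropped
        return item  # -> item_passed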
def _process_spidermw_output(self, output, request, response, spider): """Process each Request/Item (given in the output parameter) returned from the given spider """ if isinstance(output, Request): send_catch_log(signal=signals.request_received, request=output, \ spider=spider) self.crawler.engine.crawl(request=output, spider=spider) elif isinstance(output, BaseItem): self.slots[spider].itemproc_size += 1 dfd = self.itemproc.process_item(output, spider) dfd.addBoth(self._itemproc_finished, output, response, spider) return dfd elif output is None: pass else: log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \ (type(output).__name__, request), log.ERROR, spider=spider)
def send_catch_log(self, signal, **kwargs):
    """
    Send a signal, catch exceptions and log them.

    The keyword arguments are passed to the signal handlers (connected
    through the :meth:`connect` method).
    """
    kwargs.setdefault('sender', self.sender)
    return _signal.send_catch_log(signal, **kwargs)
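# A minimal usage sketch for the SignalManager wrapper above, not taken from
# the original source: it assumes scrapy.signalmanager.SignalManager and shows
# that handler exceptions are caught and logged rather than propagated, with
# send_catch_log returning a list of (receiver, result) pairs.
from scrapy import signals
from scrapy.signalmanager import SignalManager

smanager = SignalManager(sender=object())

def on_spider_opened(spider, **kwargs):
    print("spider opened: %r" % spider)

smanager.connect(on_spider_opened, signal=signals.spider_opened)
results = smanager.send_catch_log(signals.spider_opened, spider="dummy")
for receiver, result in results:
    pass  # result is a Failure if the handler raised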
def _get_telnet_vars(self):
    # Note: if you add entries here also update topics/telnetconsole.rst
    telnet_vars = {
        'engine': crawler.engine,
        'manager': crawler,
        'extensions': crawler.extensions,
        'stats': stats,
        'spiders': crawler.spiders,
        'settings': settings,
        'est': print_engine_status,
        'p': pprint.pprint,
        'prefs': print_live_refs,
        'hpy': hpy,
        'help': "This is Scrapy telnet console. For more info see: "
                "http://doc.scrapy.org/topics/telnetconsole.html",  # see #284
    }
    send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
    return telnet_vars
def send(self, to, subject, body, cc=None, attachs=()):
    if attachs:
        msg = MIMEMultipart()
    else:
        msg = MIMENonMultipart('text', 'plain')
    msg['From'] = self.mailfrom
    msg['To'] = COMMASPACE.join(to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject
    rcpts = to[:]
    if cc:
        rcpts.extend(cc)
        msg['Cc'] = COMMASPACE.join(cc)
    if attachs:
        msg.attach(MIMEText(body))
        for attach_name, mimetype, f in attachs:
            part = MIMEBase(*mimetype.split('/'))
            part.set_payload(f.read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                            'attachment; filename="%s"' % attach_name)
            msg.attach(part)
    else:
        msg.set_payload(body)
    send_catch_log(signal=mail_sent, to=to, subject=subject, body=body,
                   cc=cc, attach=attachs, msg=msg)
    if settings.getbool('MAIL_DEBUG'):
        log.msg('Debug mail sent OK: To=%s Cc=%s Subject="%s" Attachs=%d' %
                (to, cc, subject, len(attachs)), level=log.DEBUG)
        return
    dfd = self._sendmail(rcpts, msg.as_string())
    dfd.addCallbacks(self._sent_ok, self._sent_failed,
                     callbackArgs=[to, cc, subject, len(attachs)],
                     errbackArgs=[to, cc, subject, len(attachs)])
    reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
    return dfd
def put(self, url, name, cat, price, collection_name=None):
    uid = get_uid(url)
    domain = get_domain(url)
    crawl_time = int(time.time())
    item = self.get(url, collection_name)
    if item:
        if item.add_price(price, crawl_time):
            self.dbclient.update_field(uid, collection_name, data=item.data,
                                       bottom_price=item.bottom_price)
        else:
            log.msg('duplicate price')
            return
    else:
        item = {
            "url": url,
            "uid": uid,
            "name": name,
            "cat": cat,
            "data": [(price, crawl_time)],
            "bottom_price": (price, crawl_time),
            "domain": domain,
        }
        self.dbclient.insert(item, uid, collection_name)
    send_catch_log(signal=signals.item_saved, item=item)
def _process_spidermw_output(self, output, request, response, spider): """Process each Request/Item (given in the output parameter) returned from the given spider """ if isinstance(output, Request): send_catch_log(signal=signals.request_received, request=output, \ spider=spider) self.engine.crawl(request=output, spider=spider) elif isinstance(output, BaseItem): log.msg(log.formatter.scraped(output, request, response, spider), \ level=log.DEBUG, spider=spider) self.sites[spider].itemproc_size += 1 dfd = send_catch_log_deferred(signal=signals.item_scraped, \ item=output, spider=spider, response=response) dfd.addBoth(lambda _: self.itemproc.process_item(output, spider)) dfd.addBoth(self._itemproc_finished, output, spider) return dfd elif output is None: pass else: log.msg("Spider must return Request, BaseItem or None, got %r in %s" % \ (type(output).__name__, request), log.ERROR, spider=spider)
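# Hedged illustration (not from the original source) of what feeds the
# _process_spidermw_output variants above: a spider callback may yield
# Requests, which go back to the engine via engine.crawl, and BaseItems,
# which enter the item processor. ExampleItem, the spider and the URLs are
# invented for this sketch; BaseSpider matches the old-style API these
# snippets use.
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider

class ExampleItem(Item):
    title = Field()

class ExampleSpider(BaseSpider):
    name = "example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # a Request triggers the request_received branch
        yield Request("http://example.com/next", callback=self.parse)
        # a BaseItem subclass instance triggers the item processing branch
        yield ExampleItem(title="t")
        # any other non-None value would hit the error log branch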
def _get_telnet_vars(self):
    # Note: if you add entries here also update topics/telnetconsole.rst
    slots = self.crawler.engine.slots
    if len(slots) == 1:
        spider, slot = slots.items()[0]
    # note: 'spider' and 'slot' are only bound when exactly one slot is active
    telnet_vars = {
        'engine': self.crawler.engine,
        'spider': spider,
        'slot': slot,
        'manager': self.crawler,
        'extensions': self.crawler.extensions,
        'stats': stats,
        'spiders': self.crawler.spiders,
        'settings': self.crawler.settings,
        'est': lambda: print_engine_status(self.crawler.engine),
        'p': pprint.pprint,
        'prefs': print_live_refs,
        'hpy': hpy,
        'help': "This is Scrapy telnet console. For more info see: "
                "http://doc.scrapy.org/en/latest/topics/telnetconsole.html",
    }
    send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
    return telnet_vars
def _spider_idle(self, spider): """Called when a spider gets idle. This function is called when there are no remaining pages to download or schedule. It can be called multiple times. If some extension raises a DontCloseSpider exception (in the spider_idle signal handler) the spider is not closed until the next loop and this function is guaranteed to be called (at least) once again for this spider. """ res = send_catch_log(signal=signals.spider_idle, spider=spider, dont_log=DontCloseSpider) if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res): reactor.callLater(5, self.next_request, spider) return if self.spider_is_idle(spider): self.close_spider(spider, reason="finished")
def _spider_idle(self, spider):
    """Called when a spider gets idle. This function is called when there
    are no remaining pages to download or schedule. It can be called
    multiple times. If some extension raises a DontCloseSpider exception
    (in the spider_idle signal handler) the spider is not closed until the
    next loop and this function is guaranteed to be called (at least) once
    again for this spider.
    """
    res = send_catch_log(signal=signals.spider_idle, spider=spider,
                         dont_log=DontCloseSpider)
    if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
           for _, x in res):
        self.slots[spider].nextcall.schedule(5)
        return
    if self.spider_is_idle(spider):
        self.close_spider(spider, reason='finished')
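# Hedged sketch (not from the original source) of the extension side of the
# contract described in the _spider_idle docstrings above: a spider_idle
# handler raises DontCloseSpider to veto closing. The pending-work queue and
# extension name are invented; the scrapy.exceptions import path for
# DontCloseSpider is assumed.
from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class KeepAliveExtension(object):

    def __init__(self, crawler):
        self.pending = []  # hypothetical external work queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_idle(self, spider):
        if self.pending:
            # caught by send_catch_log(..., dont_log=DontCloseSpider) above,
            # so the engine reschedules instead of closing the spider
            raise DontCloseSpider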
def _downloaded(response):
    send_catch_log(signal=signals.response_downloaded,
                   response=response, request=request, spider=spider)
    return response
def _deactivate(response):
    send_catch_log(signal=signals.response_received,
                   response=response, request=request, spider=spider)
    site.active.remove(request)
    self._close_if_idle(spider)
    return response
def _get_result(self, signal, *a, **kw):
    return send_catch_log(signal, *a, **kw)
def send_catch_log(self, *a, **kw):
    kw.setdefault('sender', self.sender)
    return signal.send_catch_log(*a, **kw)
def open_spider(self, spider):
    self._stats[spider] = {}
    send_catch_log(stats_spider_opened, spider=spider)