def execute(self, args, opts):
    task = None
    if opts.task_id:
        task = Task().load(id=opts.task_id)
    if opts.task_name:
        task = Task().next(name=opts.task_name)
    if task or len(args):
        if task:
            domain = task.domain
        else:
            domain = args[0]
        spider = spiders.fromdomain(domain)
        scrapymanager.configure()
        if opts.child:
            # monkeypatch the stop command to prevent stopping prematurely in child mode
            def _stop():
                pass
            scrapymanager.stop = _stop
        if task and not task.locked:
            task.lock()
        self.crawl(spider, task)
        scrapyengine.start()
    else:
        log.msg('You must specify at least one domain', level=log.ERROR)
def run(self):
    self.port = start_test_site()
    self.portno = self.port.getHost().port
    self.spider = TestSpider()
    if self.spider:
        self.spider.start_urls = [
            self.geturl("/"),
            self.geturl("/redirect"),
        ]
        dispatcher.connect(self.record_signal, signals.engine_started)
        dispatcher.connect(self.record_signal, signals.engine_stopped)
        dispatcher.connect(self.record_signal, signals.spider_opened)
        dispatcher.connect(self.record_signal, signals.spider_idle)
        dispatcher.connect(self.record_signal, signals.spider_closed)
        dispatcher.connect(self.item_scraped, signals.item_scraped)
        dispatcher.connect(self.request_received, signals.request_received)
        dispatcher.connect(self.response_downloaded, signals.response_downloaded)
        scrapymanager.configure()
        scrapymanager.queue.append_spider(self.spider)
        scrapymanager.start()
    self.port.stopListening()
    self.wasrun = True
def run(self):
    if not self.portno:
        self.port = start_test_site()
        self.portno = self.port.getHost().port
    else:
        self.port = start_test_site(self.portno)
    self.spider.start_urls = [self.geturl("/")]
    scrapymanager.configure()
    scrapymanager.runonce(self.spider)
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        return False

    responses = []  # to collect downloaded responses
    request = Request(args[0], callback=responses.append)

    if opts.spider:
        try:
            spider = spiders.create(opts.spider)
        except KeyError:
            log.msg("Unable to find spider: %s" % opts.spider, log.ERROR)
            return
    else:
        spider = spiders.create_for_request(request)

    scrapymanager.configure()
    scrapymanager.queue.append_request(request, spider)
    scrapymanager.start()

    if not responses:
        log.msg("No response returned", log.ERROR, spider=spider)
        return

    # Now process the response:
    # - if callbacks are defined, call each one and print the results
    # - if the --rules option is given, search for a matching spider rule
    # - otherwise, print the results using the spider's default 'parse' callback
    response = responses[0]
    if self.callbacks:
        # apply each callback
        for callback in self.callbacks:
            items, links = self.run_callback(spider, response, callback, args, opts)
            self.print_results(items, links, callback, opts)
    elif opts.rules:
        # search for a matching spider rule
        if hasattr(spider, "rules") and spider.rules:
            items, links = [], []
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url) and rule.callback:
                    items, links = self.run_callback(spider, response, rule.callback, args, opts)
                    self.print_results(items, links, rule.callback, opts)
                    # only the first matching rule is applied
                    break
        else:
            log.msg('No rules found for spider "%s", '
                    'please specify a callback for parsing' % spider.name, log.ERROR)
    else:
        # default callback: 'parse'
        items, links = self.run_callback(spider, response, "parse", args, opts)
        self.print_results(items, links, "parse", opts)
def __init__(self, enable_log=False, stop_on_error=False, silence_errors=False,
             settings=None):
    self.stop_on_error = stop_on_error
    self.silence_errors = silence_errors
    # apply any user-supplied settings overrides
    if settings is not None:
        scrapy_settings.overrides.update(settings)
    # disable the offsite middleware (by default) because it prevents free crawling
    scrapy_settings.overrides['SPIDER_MIDDLEWARES'] = {
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': None}
    scrapy_settings.overrides['LOG_ENABLED'] = enable_log
    scrapymanager.configure()
    dispatcher.connect(self._logmessage_received, signal=log.logmessage_received)
def execute(argv=None):
    if argv is None:
        argv = sys.argv

    cmds = _get_commands_dict()
    cmdname = _get_command_name(argv)
    _update_default_settings("scrapy.conf.commands", cmdname)
    _update_default_settings(settings["COMMANDS_SETTINGS_MODULE"], cmdname)

    parser = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(),
        conflict_handler="resolve",
        add_help_option=False,
    )

    if cmdname in cmds:
        cmd = cmds[cmdname]
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv[1:])
        cmd.process_options(args, opts)
        parser.usage = "%%prog %s %s" % (cmdname, cmd.syntax())
        parser.description = cmd.long_desc()

        if cmd.requires_project and not settings.settings_module:
            print "Error running: scrapy-ctl.py %s\n" % cmdname
            print "Cannot find project settings module in python path: %s" % \
                settings.settings_module_path
            sys.exit(1)

        if opts.help:
            parser.print_help()
            sys.exit()
    elif not cmdname:
        cmd = ScrapyCommand()
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv)
        cmd.process_options(args, opts)
        _print_usage(settings.settings_module)
        sys.exit(2)
    else:
        print "Unknown command: %s\n" % cmdname
        print 'Use "scrapy-ctl.py -h" for help'
        sys.exit(2)

    del args[0]  # remove command name from args
    send_catch_log(signal=command_executed, cmdname=cmdname, cmdobj=cmd,
                   args=args, opts=opts)

    from scrapy.core.manager import scrapymanager
    scrapymanager.configure(control_reactor=True)
    ret = _run_command(cmd, args, opts)
    if ret is False:
        parser.print_help()
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        return False

    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)

    spider = None
    if opts.spider:
        try:
            spider = spiders.create(opts.spider)
        except KeyError:
            log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

    scrapymanager.configure()
    scrapymanager.queue.append_request(request, spider,
                                       default_spider=BaseSpider('default'))
    scrapymanager.start()
def main():
    """Install the item_passed signal handler and run Scrapy"""

    @connect(signals.item_passed)
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # disable logging
    settings.overrides['LOG_ENABLED'] = False

    scrapymanager.configure()

    spider = MySpider()
    scrapymanager.queue.append_spider(spider)

    print "STARTING ENGINE"
    scrapymanager.start()
    print "ENGINE STOPPED"
def run(self):
    self.running = True
    scrapymanager.configure(control_reactor=False)
    scrapymanager.start()
    reactor.run(installSignalHandlers=False)