def run(self):
    self.port = start_test_site()
    self.portno = self.port.getHost().port

    self.spider = TestSpider()
    self.spider.start_urls = [
        self.geturl("/"),
        self.geturl("/redirect"),
    ]

    # record engine/spider lifecycle signals and collect scraped data
    dispatcher.connect(self.record_signal, signals.engine_started)
    dispatcher.connect(self.record_signal, signals.engine_stopped)
    dispatcher.connect(self.record_signal, signals.spider_opened)
    dispatcher.connect(self.record_signal, signals.spider_idle)
    dispatcher.connect(self.record_signal, signals.spider_closed)
    dispatcher.connect(self.item_scraped, signals.item_scraped)
    dispatcher.connect(self.request_received, signals.request_received)
    dispatcher.connect(self.response_downloaded, signals.response_downloaded)

    scrapymanager.configure()
    scrapymanager.queue.append_spider(self.spider)
    scrapymanager.start()

    self.port.stopListening()
    self.wasrun = True

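# start_test_site() above is assumed to spin up a local twisted web server on
# an ephemeral port and return the listening port object (hence the
# self.port.getHost().port call). A minimal sketch under that assumption:
def start_test_site_sketch():
    from twisted.internet import reactor
    from twisted.web import server, static
    root = static.Data("<html><body>test</body></html>", "text/html")
    # port 0 lets the OS pick a free ephemeral port
    return reactor.listenTCP(0, server.Site(root), interface="127.0.0.1")
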
def start(self, url):
    # ignore Ctrl-C so an accidental keypress doesn't shut down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    reactor.callInThread(self._console_thread, url)
    scrapymanager.queue = KeepAliveExecutionQueue()
    scrapymanager.start()

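# KeepAliveExecutionQueue is assumed to keep the engine alive after the queue
# drains, so the console thread started above stays usable. A minimal sketch,
# assuming it subclasses the stock ExecutionQueue (illustrative body only,
# not the actual implementation):
class KeepAliveExecutionQueueSketch(ExecutionQueue):
    def is_finished(self):
        # never report the queue as finished, so the engine keeps running
        return False
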
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        return False

    responses = []  # to collect downloaded responses
    request = Request(args[0], callback=responses.append)

    if opts.spider:
        try:
            spider = spiders.create(opts.spider)
        except KeyError:
            log.msg("Unable to find spider: %s" % opts.spider, log.ERROR)
            return
    else:
        spider = spiders.create_for_request(request)

    scrapymanager.configure()
    scrapymanager.queue.append_request(request, spider)
    scrapymanager.start()

    if not responses:
        log.msg("No response returned", log.ERROR, spider=spider)
        return

    # now process the response:
    # - if callbacks are defined, call each one and print its results
    # - if the --rules option is given, search for a matching spider rule
    # - otherwise, fall back to the spider's default 'parse' callback
    response = responses[0]
    if self.callbacks:
        # apply each callback
        for callback in self.callbacks:
            items, links = self.run_callback(spider, response, callback, args, opts)
            self.print_results(items, links, callback, opts)
    elif opts.rules:
        # search for a matching spider rule
        if hasattr(spider, "rules") and spider.rules:
            items, links = [], []
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url) and rule.callback:
                    items, links = self.run_callback(spider, response, rule.callback, args, opts)
                    self.print_results(items, links, rule.callback, opts)
                    # the first matching rule wins
                    break
        else:
            log.msg('No rules found for spider "%s", '
                    'please specify a callback for parsing' % spider.name,
                    log.ERROR)
    else:
        # default callback: 'parse'
        items, links = self.run_callback(spider, response, "parse", args, opts)
        self.print_results(items, links, "parse", opts)

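# run_callback() used above is assumed to invoke the named spider callback on
# the response and split its output into scraped items and extracted requests.
# A minimal sketch under that assumption (hypothetical helper, not the actual
# source):
def run_callback_sketch(spider, response, callback_name):
    callback = getattr(spider, callback_name)
    items, links = [], []
    for result in callback(response) or ():
        if isinstance(result, Request):
            links.append(result)
        else:
            items.append(result)
    return items, links
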
def run(self, args, opts):
    if len(args) != 1:
        return False

    if opts.output:
        outfile = open(opts.output, 'w+b')
        exporter = XmlItemExporter(outfile)
        dispatcher.connect(exporter.export_item, signal=signals.item_passed)
        exporter.start_exporting()

    module = _import_file(args[0])

    # schedule the spider and start the engine
    scrapymanager.queue.append_spider(module.SPIDER)
    scrapymanager.start()

    if opts.output:
        exporter.finish_exporting()

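# _import_file() is assumed to load a Python source file by path and return
# it as a module object, so that module.SPIDER can be read. A minimal sketch
# under that assumption, using the Python 2 imp module:
def _import_file_sketch(filepath):
    import imp, os
    abspath = os.path.abspath(filepath)
    modname = os.path.splitext(os.path.basename(abspath))[0]
    return imp.load_source(modname, abspath)
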
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        return False

    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)

    spider = None
    if opts.spider:
        try:
            spider = spiders.create(opts.spider)
        except KeyError:
            log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

    scrapymanager.configure()
    scrapymanager.queue.append_request(request, spider,
                                       default_spider=BaseSpider('default'))
    scrapymanager.start()

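# _print_response() is assumed to dump the fetched response according to the
# command options; the option names below are hypothetical, for illustration:
def _print_response_sketch(response, opts):
    if opts.headers:
        print response.headers
    else:
        print response.body
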
def main():
    """Install the item_passed signal handler and run scrapy"""

    @connect(signals.item_passed)
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # silence the log
    settings.overrides['LOG_ENABLED'] = False

    scrapymanager.configure()
    spider = MySpider()
    scrapymanager.queue.append_spider(spider)

    print "STARTING ENGINE"
    scrapymanager.start()
    print "ENGINE STOPPED"

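# The @connect decorator used in main() is not the stock dispatcher API; it is
# assumed to be a thin wrapper around dispatcher.connect that registers the
# decorated function as a handler for the given signal. A minimal sketch:
def connect_sketch(signal):
    def decorator(func):
        dispatcher.connect(func, signal=signal)
        return func
    return decorator
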
def run(self, args, opts):
    q = ExecutionQueue()
    urls, names = self._split_urls_and_names(args)
    for name in names:
        q.append_spider_name(name)

    if opts.spider:
        try:
            spider = spiders.create(opts.spider)
            for url in urls:
                q.append_url(url, spider)
        except KeyError:
            log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
    else:
        for name, spider_urls in self._group_urls_by_spider(urls):
            spider = spiders.create(name)
            for url in spider_urls:
                q.append_url(url, spider)

    scrapymanager.queue = q
    scrapymanager.start()

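# _group_urls_by_spider() is assumed to map each URL to the spider able to
# handle it and yield (spider_name, urls) pairs. A minimal sketch under that
# assumption (and assuming the spider manager exposes find_by_request):
def _group_urls_by_spider_sketch(urls):
    from collections import defaultdict
    groups = defaultdict(list)
    for url in urls:
        names = spiders.find_by_request(Request(url))
        if names:
            # take the first matching spider for this URL
            groups[names[0]].append(url)
    return groups.items()
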
def run(self, args, opts):
    scrapymanager.start(*args)

def run(self, args, opts):
    queue_class = load_object(settings['SERVICE_QUEUE'])
    scrapymanager.queue = queue_class()
    scrapymanager.start()

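# SERVICE_QUEUE is assumed to hold the dotted path of the queue class to
# instantiate, set in the project settings; the path below is illustrative:
# SERVICE_QUEUE = 'scrapy.queue.ExecutionQueue'
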
def run(self):
    self.running = True
    # configure scrapy without letting it take over the reactor, then run the
    # reactor here; signal handlers are skipped, typically because this runs
    # outside the main thread
    scrapymanager.configure(control_reactor=False)
    scrapymanager.start()
    reactor.run(installSignalHandlers=False)