Example 1
    def run(self):
        self.port = start_test_site()
        self.portno = self.port.getHost().port

        self.spider = TestSpider()
        self.spider.start_urls = [
            self.geturl("/"),
            self.geturl("/redirect"),
        ]

        # record each engine/spider lifecycle signal so the test can
        # later assert on the order in which they fired
        dispatcher.connect(self.record_signal, signals.engine_started)
        dispatcher.connect(self.record_signal, signals.engine_stopped)
        dispatcher.connect(self.record_signal, signals.spider_opened)
        dispatcher.connect(self.record_signal, signals.spider_idle)
        dispatcher.connect(self.record_signal, signals.spider_closed)
        dispatcher.connect(self.item_scraped, signals.item_scraped)
        dispatcher.connect(self.request_received, signals.request_received)
        dispatcher.connect(self.response_downloaded, signals.response_downloaded)

        scrapymanager.configure()
        scrapymanager.queue.append_spider(self.spider)
        scrapymanager.start()
        self.port.stopListening()
        self.wasrun = True
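
The test relies on several helpers that the excerpt does not include. A minimal sketch of what they might look like, with every name and body assumed rather than taken from the source:

    def geturl(self, path):
        # assumed helper: build an absolute URL against the local test site
        return "http://localhost:%d%s" % (self.portno, path)

    def record_signal(self, *args, **kwargs):
        # assumed helper: remember which signal fired, so the test can
        # check the engine/spider lifecycle afterwards
        self.signals_caught = getattr(self, "signals_caught", [])
        self.signals_caught.append(kwargs.get("signal"))

    def item_scraped(self, item, spider):
        # assumed helper: collect every item the spider scrapes
        self.items = getattr(self, "items", [])
        self.items.append(item)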
Example 2
    def start(self, url):
        # disable accidental Ctrl-C key press from shutting down the engine
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        reactor.callInThread(self._console_thread, url)
        scrapymanager.queue = KeepAliveExecutionQueue()
        scrapymanager.start()
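
The _console_thread helper is not shown. A plausible sketch, assuming it opens an interactive console and asks the reactor to shut down when the user exits (none of this is confirmed by the source):

    def _console_thread(self, url):
        # assumed behavior: drop the user into an interactive console,
        # then stop the reactor from the main thread once it exits
        import code
        code.interact(banner="Scrapy console (url=%s)" % url, local={"url": url})
        reactor.callFromThread(reactor.stop)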
Example 3
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            return False

        responses = []  # to collect downloaded responses
        request = Request(args[0], callback=responses.append)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg("Unable to find spider: %s" % opts.spider, log.ERROR)
                return
        else:
            spider = spiders.create_for_request(request)

        scrapymanager.configure()
        scrapymanager.queue.append_request(request, spider)
        scrapymanager.start()

        if not responses:
            log.msg("No response returned", log.ERROR, spider=spider)
            return

        # Process the response:
        #   - if callbacks were given, call each one and print its results
        #   - if the --rules option was given, look for a matching spider rule
        #   - otherwise fall back to the spider's default 'parse' callback
        response = responses[0]

        if self.callbacks:
            # apply each callback
            for callback in self.callbacks:
                items, links = self.run_callback(spider, response, callback, args, opts)
                self.print_results(items, links, callback, opts)
        elif opts.rules:
            # search for matching spider's rule
            if hasattr(spider, "rules") and spider.rules:
                items, links = [], []
                for rule in spider.rules:
                    if rule.link_extractor.matches(response.url) and rule.callback:
                        items, links = self.run_callback(spider, response, rule.callback, args, opts)
                        self.print_results(items, links, rule.callback, opts)
                        # only the first matching rule is applied
                        break
            else:
                log.msg('No rules found for spider "%s", please specify a '
                        'callback for parsing' % spider.name, log.ERROR)
        else:
            # default callback 'parse'
            items, links = self.run_callback(spider, response, "parse", args, opts)
            self.print_results(items, links, "parse", opts)
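
run_callback and print_results belong to this command but are not part of the excerpt. A rough sketch of what run_callback might do, with the body assumed: resolve the callback (CrawlSpider rules may hold a method name or a callable), run it over the response, and split the output into items and follow-up requests:

    def run_callback(self, spider, response, callback, args, opts):
        # assumed sketch: accept either a callable or a method name
        if not callable(callback):
            callback = getattr(spider, callback)
        items, links = [], []
        for result in callback(response) or ():
            if isinstance(result, Request):
                links.append(result)   # follow-up requests count as links
            else:
                items.append(result)   # everything else is a scraped item
        return items, links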
Example 4
    def run(self, args, opts):
        if len(args) != 1:
            return False
        if opts.output:
            # avoid shadowing the 'file' builtin; export each passed item as XML
            output_file = open(opts.output, 'w+b')
            exporter = XmlItemExporter(output_file)
            dispatcher.connect(exporter.export_item, signal=signals.item_passed)
            exporter.start_exporting()
        module = _import_file(args[0])

        # schedule spider and start engine
        scrapymanager.queue.append_spider(module.SPIDER)
        scrapymanager.start()

        if opts.output:
            exporter.finish_exporting()
            output_file.close()
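
_import_file is not shown; under Python 2 a minimal equivalent could load the script with imp.load_source (the helper's actual implementation is assumed):

import imp

def _import_file(filepath):
    # assumed sketch: load the given .py file as a module so that its
    # module-level SPIDER object is available to the command
    return imp.load_source("spider_module", filepath)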
Example 5
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            return False
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
            except KeyError:
                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

        scrapymanager.configure()
        scrapymanager.queue.append_request(request, spider,
            default_spider=BaseSpider('default'))
        scrapymanager.start()
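
_print_response is not part of the excerpt. A hypothetical sketch, with the option names (headers, nobody) invented for illustration:

    def _print_response(self, response, opts):
        # hypothetical sketch: print headers and/or body of the downloaded
        # response depending on command-line flags (flag names are assumptions)
        if getattr(opts, "headers", False):
            print response.headers
        if not getattr(opts, "nobody", False):
            print response.body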
Example 6
def main():
    """Install item signal and run scrapy"""
    @connect(signals.item_passed)
    def catch_item(sender, item, **kwargs):
        print "Got:", item

    # shut off log
    settings.overrides['LOG_ENABLED'] = False

    scrapymanager.configure()

    spider = MySpider()
    scrapymanager.queue.append_spider(spider)

    print "STARTING ENGINE"
    scrapymanager.start()
    print "ENGINE STOPPED"
Example 7
    def run(self, args, opts):
        q = ExecutionQueue()
        urls, names = self._split_urls_and_names(args)
        for name in names:
            q.append_spider_name(name)

        if opts.spider:
            try:
                spider = spiders.create(opts.spider)
                for url in urls:
                    q.append_url(url, spider)
            except KeyError:
                log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
        else:
            for name, spider_urls in self._group_urls_by_spider(urls):
                spider = spiders.create(name)
                for url in spider_urls:
                    q.append_url(url, spider)

        scrapymanager.queue = q
        scrapymanager.start()
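
_split_urls_and_names is straightforward to reconstruct: partition the positional arguments into URLs and spider names with the same is_url() check the other commands use (the exact body is an assumption):

    def _split_urls_and_names(self, args):
        # assumed sketch: anything that looks like a URL is a URL,
        # everything else is treated as a spider name
        urls, names = [], []
        for arg in args:
            if is_url(arg):
                urls.append(arg)
            else:
                names.append(arg)
        return urls, names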
Example 8
    def run(self, args, opts):
        scrapymanager.start(*args)
Example 9
    def run(self, args, opts):
        queue_class = load_object(settings['SERVICE_QUEUE'])
        scrapymanager.queue = queue_class()
        scrapymanager.start()
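
load_object resolves a dotted path such as 'myproject.queues.MyQueue' (example path invented) to the object it names. A minimal pure-Python equivalent, as a sketch rather than Scrapy's exact implementation:

def load_object(path):
    # split the dotted path into module path and attribute name,
    # import the module, and return the named attribute
    module_path, name = path.rsplit(".", 1)
    module = __import__(module_path, {}, {}, [name])
    return getattr(module, name)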
Example 10
    def run(self):
        self.running = True
        scrapymanager.configure(control_reactor=False)
        scrapymanager.start()
        reactor.run(installSignalHandlers=False)
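
Because configure(control_reactor=False) keeps Scrapy from managing the reactor itself and installSignalHandlers=False lets the reactor run outside the main thread, this run() method can be launched from a background thread. A hypothetical usage sketch (the CrawlerWorker name is invented):

import threading

worker = CrawlerWorker()                         # hypothetical class owning run() above
crawler_thread = threading.Thread(target=worker.run)
crawler_thread.start()                           # crawl proceeds off the main thread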