Example #1
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way because the Shell manually handles the
        # crawling engine, so the setup done in the crawl() method won't work.
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)
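For reference, a minimal sketch of what guess_scheme does with the first
argument (the sample inputs are illustrative; the comments follow its
documented behavior of adding a missing scheme):

from scrapy.utils.url import guess_scheme

guess_scheme('http://example.com/')  # scheme already present: returned as-is
guess_scheme('./index.html')         # path-like input: gets a file:// scheme
guess_scheme('example.com')          # anything else: gets an http:// scheme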
Example #2
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)

    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    # Dummy code string; presumably set so the shell never drops into an
    # interactive console if start() were ever called on it.
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        # Serialize access to the shared shell across calling threads.
        with lock:
            shell.fetch(url_or_request)
            return shell.vars.get('response')

    return fetch
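Hypothetical usage of the helper above: build the fetch closure once, then
call it synchronously from ordinary, non-reactor code (the URL is
illustrative):

fetch = get_fetch(log=True)
response = fetch('http://example.com/')
print(response.status, len(response.body))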
Example #3
 def run(self, args, opts):
     url = args[0] if args else None
     shell = Shell(self.crawler,
                   update_vars=self.update_vars,
                   code=opts.code)
     self._start_crawler_thread()
     shell.start(url=url)
Example #4
    def run(self, args, opts):
        url = args[0] if args else None
        spiders = self.crawler_process.spiders

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spiders,
                                              Request(url),
                                              spidercls,
                                              log_multiple=True)

        # The crawler is created this way because the Shell manually handles the
        # crawling engine, so the setup done in the crawl() method won't work.
        crawler = self.crawler_process._create_logged_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self.crawler_process.start(start_reactor=False)
        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url)
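For reference, a standalone sketch of the spider-selection call the run()
variants above rely on; spider_loader stands in for whatever loader the
running process exposes, and the URL is illustrative:

from scrapy import Request
from scrapy.utils.spider import DefaultSpider, spidercls_for_request

# Pick the spider class whose domains match the request URL, falling back to
# DefaultSpider; log_multiple=True logs a warning when several spiders match.
spidercls = spidercls_for_request(spider_loader, Request('http://example.com/'),
                                  DefaultSpider, log_multiple=True)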
Example #5
 def run(self, args, opts):
     url = args[0] if args else None
     spider = None
     if opts.spider:
         spider = self.crawler.spiders.create(opts.spider)
     shell = Shell(self.crawler, update_vars=self.update_vars, code=opts.code)
     self._start_crawler_thread()
     shell.start(url=url, spider=spider)
Example #6
    def test_inspect_response_text(self):
        response = TextResponse(url='http://example.com/', body=b'''
            {"hello": "world"}
        ''')
        shell = Shell(self.crawler, code='None')
        shell.start(response=response, spider=self.spider)

        self.assertNotIn('sel', shell.vars)
Example #7
    def test_inspect_response_xml(self):
        response = XmlResponse(url='http://example.com/', body=b'''
            <?xml version="1.0" encoding="UTF-8"?>
            <foo>Testing</foo>
        ''')
        shell = Shell(self.crawler, code='None')
        shell.start(response=response, spider=self.spider)

        self.assertIn('sel', shell.vars)
Example #8
    def test_inspect_response_binary(self):
        response = Response(url='http://example.com/', body=b'''
            '{\xcc\xe8\x92\xe6\xb8\xa21\xb2\xe5O6\xc9\x84\xba8
            \xa3\x877\xa8v\xee9p.UJ\xa1m\x8a"H\xb3\xcc\x08\xff
            \x87d\x00i\xce\xb7a\xff\x8c\xd8NX\xae\xc2'
        ''')
        shell = Shell(self.crawler, code='None')
        shell.start(response=response, spider=self.spider)

        self.assertNotIn('sel', shell.vars)
Example #9
    def test_inspect_response_html(self):
        response = HtmlResponse(url='http://example.com/', body=b'''
            <!doctype html>
            <html>
                <p>Testing</p>
            </html>
        ''')
        shell = Shell(self.crawler, code='None')
        shell.start(response=response, spider=self.spider)

        self.assertIn('sel', shell.vars)
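Taken together, these four tests pin down when the shell exposes a 'sel'
variable: only responses Scrapy treats as markup get a selector. A standalone
sketch of the same distinction (bodies and URL are illustrative):

from scrapy.http import HtmlResponse, Response
from scrapy.selector import Selector

html = HtmlResponse(url='http://example.com/', body=b'<p>Testing</p>')
print(Selector(response=html).css('p::text').get())  # 'Testing'

binary = Response(url='http://example.com/', body=b'\x00\x01\x02')
# A plain (non-text) Response cannot back a Selector, so no 'sel' is created.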
Example #10
 def run(self, args, opts):
     url = args[0] if args else None
     shell = Shell(self.crawler, update_vars=self.update_vars, inthread=True,
                   code=opts.code)
     def err(f):
         log.err(f, "Shell error")
         self.exitcode = 1
     d = shell.start(url=url)
     d.addErrback(err)
     d.addBoth(lambda _: self.crawler.stop())
     self.crawler.start()
Example #11
    def run(self, args, opts):
        crawler = self.crawler_process.create_crawler()

        url = args[0] if args else None
        spider = crawler.spiders.create(opts.spider) if opts.spider else None

        self.crawler_process.start_crawling()
        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, spider=spider)
Example #12
def shell(argv):
    """ Open a url in the scrapy shell """
    parser = argparse.ArgumentParser('ozzy shell',
                                     description=shell.__doc__)
    parser.add_argument('url', help="URL to open in a shell")
    args = parser.parse_args(argv)

    crawler_process = CrawlerProcess(load_settings())
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()
    thread = Thread(target=crawler_process.start_reactor)
    thread.daemon = True
    thread.start()
    sh = Shell(crawler)
    sh.start(url=args.url)
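A hypothetical way to exercise the entry point above directly; in practice the
argv would come from whatever follows "ozzy shell" on the command line:

shell(['http://example.com/'])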
Example #13
 def run(self, args, opts):
     url = args[0] if args else None
     # Note: an older Shell signature that takes the update_vars callback
     # directly rather than a crawler.
     shell = Shell(self.update_vars)
     shell.start(url)