Example #1
File: shell.py  Project: elacuesta/scrapy
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the setup in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)
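The shell command above runs guess_scheme on its first positional argument so that `scrapy shell` accepts a local file path as well as a URL. A minimal sketch of the assumed behaviour of scrapy.utils.url.guess_scheme (the inputs are illustrative, not taken from the examples on this page):

from scrapy.utils.url import guess_scheme

print(guess_scheme("http://example.com/"))  # scheme already present -> returned unchanged
print(guess_scheme("example.com"))          # bare host -> "http://example.com"
print(guess_scheme("./index.html"))         # path-like argument -> absolute file:// URI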
Example #2
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader,
                                              Request(url),
                                              spidercls,
                                              log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the setup in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)
Example #3
    def do_expected(self):
        url = guess_scheme(args[0])
        assert url.startswith(args[1]), "Wrong scheme guessed: for `%s` got `%s`, expected `%s...`" % (
            args[0],
            url,
            args[1],
        )
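Note that `args` is not defined inside do_expected; these helpers are presumably generated from a table of (input, expected scheme prefix) cases in the originating test suite. A hypothetical, self-contained harness that would make the snippet runnable (the case data below is made up for illustration):

from scrapy.utils.url import guess_scheme

cases = [
    ("example.com", "http://"),
    ("http://www.example.com", "http://"),
    ("/index.html", "file://"),
]

for args in cases:
    url = guess_scheme(args[0])
    assert url.startswith(args[1]), "Wrong scheme guessed: for `%s` got `%s`, expected `%s...`" % (
        args[0], url, args[1])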
Example #4
    def start_requests(self):
        if self.seeds_url is None:
            raise ValueError("Please pass seeds_url to the spider. It should "
                             "be a text file with urls, one per line.")
        seeds_url = guess_scheme(self.seeds_url)

        # don't log DepthMiddleware messages
        # see https://github.com/scrapy/scrapy/issues/1308
        logging.getLogger("scrapy.spidermiddlewares.depth").setLevel(
            logging.INFO)

        # increase response count on filtered out requests
        self.crawler.signals.connect(self.on_offdomain_request_dropped,
                                     offdomain_request_dropped)

        yield scrapy.Request(seeds_url,
                             self._parse_seeds,
                             dont_filter=True,
                             meta={'dont_obey_robotstxt': True})
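The request above delegates to self._parse_seeds, which is not shown on this page. A hypothetical callback consistent with the error message ("a text file with urls, one per line") might look like the following sketch; the body is an assumption, not the project's actual code:

    def _parse_seeds(self, response):
        # treat each non-empty line of the seeds file as a start URL
        for line in response.text.splitlines():
            url = line.strip()
            if url:
                yield scrapy.Request(url)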
Example #5
    def do_expected(self):
        # SkipTest is raised unconditionally, so the check below never runs
        raise unittest.SkipTest(args[2])
        url = guess_scheme(args[0])
        assert url.startswith(args[1])
Example #6
    def do_expected(self):
        url = guess_scheme(args[0])
        assert url.startswith(args[1]), \
            'Wrong scheme guessed: for `%s` got `%s`, expected `%s...`' % (
                args[0], url, args[1])
Example #7
    def do_expected(self):
        url = guess_scheme(args[0])
        assert url.startswith(args[1]), \
            f'Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`'
Example #8
    def do_expected(self):
        url = guess_scheme(args[0])
        assert url.startswith(args[1]), \
            'Wrong scheme guessed: for `{0!s}` got `{1!s}`, expected `{2!s}...`'.format(
                args[0], url, args[1])