def run(self, args, opts):
    """Start a ``Shell`` session, optionally pointed at the first argument.

    ``args[0]``, when present and non-empty, is passed through
    ``guess_scheme`` (it may be a local file) and handed to the shell as
    its start URL; otherwise the shell is started with whatever falsy value
    was found (``None`` when ``args`` is empty).

    The spider class is chosen in this order:

    1. the class loaded for ``opts.spider`` when that option is set;
    2. the result of ``spidercls_for_request`` for the target URL, with
       ``DefaultSpider`` passed as its third argument (presumably the
       fallback -- confirm against ``spidercls_for_request``);
    3. ``DefaultSpider``.

    Other options read: ``opts.code`` (forwarded to ``Shell``) and
    ``opts.no_redirect`` (negated into the ``redirect`` argument).
    """
    target = args[0] if args else None
    if target:
        # first argument may be a local file
        target = guess_scheme(target)

    loader = self.crawler_process.spider_loader
    if opts.spider:
        spider_class = loader.load(opts.spider)
    elif target:
        spider_class = spidercls_for_request(
            loader, Request(target), DefaultSpider, log_multiple=True
        )
    else:
        spider_class = DefaultSpider

    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_crawler(spider_class)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    self._start_crawler_thread()

    Shell(crawler, update_vars=self.update_vars, code=opts.code).start(
        url=target, redirect=not opts.no_redirect
    )
def do_expected(self):
    """Check that ``guess_scheme(args[0])`` starts with ``args[1]``.

    ``args`` is a free variable resolved from an outer scope: ``args[0]``
    is the input handed to ``guess_scheme`` and ``args[1]`` is the prefix
    the result must start with.

    Raises:
        AssertionError: if the guessed URL does not start with ``args[1]``;
            the message names the input, the result and the expected prefix.
    """
    url = guess_scheme(args[0])
    assert url.startswith(args[1]), (
        f"Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`"
    )
def start_requests(self):
    """Yield the single request that downloads the seeds file.

    ``self.seeds_url`` is passed through ``guess_scheme`` and requested with
    ``self._parse_seeds`` as the callback, ``dont_filter=True`` and
    ``dont_obey_robotstxt`` set in the request meta.

    Before yielding, the ``scrapy.spidermiddlewares.depth`` logger is set to
    ``INFO`` and ``self.on_offdomain_request_dropped`` is connected to the
    ``offdomain_request_dropped`` signal.

    Raises:
        ValueError: if ``self.seeds_url`` is ``None`` (raised when the
            generator is first advanced).
    """
    if self.seeds_url is None:
        raise ValueError("Please pass seeds_url to the spider. It should "
                         "be a text file with urls, one per line.")
    seeds_location = guess_scheme(self.seeds_url)

    # don't log DepthMiddleware messages
    # see https://github.com/scrapy/scrapy/issues/1308
    depth_logger = logging.getLogger("scrapy.spidermiddlewares.depth")
    depth_logger.setLevel(logging.INFO)

    # increase response count on filtered out requests
    self.crawler.signals.connect(
        self.on_offdomain_request_dropped,
        offdomain_request_dropped,
    )

    seeds_request = scrapy.Request(
        seeds_location,
        self._parse_seeds,
        dont_filter=True,
        meta={'dont_obey_robotstxt': True},
    )
    yield seeds_request
def do_expected(self):
    """Skip this test case unconditionally.

    ``args`` is a free variable resolved from an outer scope; ``args[2]``
    is passed to ``unittest.SkipTest`` as the skip reason.

    Raises:
        unittest.SkipTest: always.
    """
    # The check this case would perform if it were not skipped is
    #     guess_scheme(args[0]).startswith(args[1])
    # Those statements used to follow the raise, where they could never run.
    raise unittest.SkipTest(args[2])
def do_expected(self):
    """Check that ``guess_scheme(args[0])`` starts with ``args[1]``.

    ``args`` is a free variable resolved from an outer scope: ``args[0]``
    is the input handed to ``guess_scheme`` and ``args[1]`` is the prefix
    the result must start with.

    Raises:
        AssertionError: if the guessed URL does not start with ``args[1]``;
            the message names the input, the result and the expected prefix.
    """
    url = guess_scheme(args[0])
    assert url.startswith(args[1]), (
        f'Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`'
    )
def do_expected(self):
    """Assert that the scheme guessed for ``args[0]`` has the expected prefix.

    ``args`` is a free variable resolved from an outer scope. ``args[0]`` is
    fed to ``guess_scheme`` and the result must start with ``args[1]``;
    otherwise an ``AssertionError`` describing the mismatch is raised.
    """
    url = guess_scheme(args[0])
    prefix_ok = url.startswith(args[1])
    assert prefix_ok, (
        f'Wrong scheme guessed: for `{args[0]}` got `{url}`, expected `{args[1]}...`'
    )
def do_expected(self):
    """Check that ``guess_scheme(args[0])`` starts with ``args[1]``.

    ``args`` is a free variable resolved from an outer scope: ``args[0]``
    is the input handed to ``guess_scheme`` and ``args[1]`` is the prefix
    the result must start with.

    Raises:
        AssertionError: if the guessed URL does not start with ``args[1]``;
            the message names the input, the result and the expected prefix.
    """
    url = guess_scheme(args[0])
    # ``!s`` keeps the explicit str() conversion the previous
    # ``'{0!s}'.format(...)`` call applied to each value.
    assert url.startswith(args[1]), (
        f'Wrong scheme guessed: for `{args[0]!s}` got `{url!s}`, expected `{args[1]!s}...`'
    )