def test_six_range(self):
    """Check SequenceExclude membership against a ``six.moves.range``.

    Values outside the wrapped range are reported as contained; a value
    inside the range is not.
    """
    import six.moves

    excluded = six.moves.range(10**3, 10**6)
    container = SequenceExclude(excluded)
    # One value below the range and one above it.
    for outside in (10**2, 10**7):
        self.assertIn(outside, container)
    # A value inside the range.
    self.assertNotIn(10**4, container)
def run(self, args, opts):
    """Fetch the single URL given in ``args`` and print the response.

    Args:
        args: positional command arguments; must be exactly one URL.
        opts: parsed options; ``no_redirect`` and ``spider`` are read here
            and ``opts`` is forwarded to ``_print_response`` via
            ``cb_kwargs``.

    Raises:
        UsageError: if ``args`` is not exactly one value accepted by
            ``is_url``.
    """
    if not (len(args) == 1 and is_url(args[0])):
        raise UsageError()

    target = args[0]
    request = Request(
        target,
        callback=self._print_response,
        cb_kwargs={"opts": opts},
        dont_filter=True,
    )

    # By default, let the framework handle redirects, i.e. the command
    # handles all status codes except 3xx.
    if opts.no_redirect:
        request.meta["handle_httpstatus_all"] = True
    else:
        request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))

    fallback_cls = DefaultSpider
    loader = self.crawler_process.spider_loader
    # An explicitly named spider wins; otherwise look one up for the
    # request, with DefaultSpider passed as the third argument.
    if opts.spider:
        spidercls = loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(loader, request, fallback_cls)

    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
def run(self, args, opts):
    """Fetch the single URL given in ``args`` and print the response.

    Args:
        args: positional command arguments; must be exactly one URL.
        opts: parsed options; ``no_redirect`` and ``spider`` are read here
            and ``opts`` is passed along to ``_print_response``.

    Raises:
        UsageError: if ``args`` is not exactly one value accepted by
            ``is_url``.
    """
    # Argument validation.
    if not (len(args) == 1 and is_url(args[0])):
        raise UsageError()

    # Build the request; its callback prints the response using the
    # command options.
    request = Request(
        args[0],
        callback=lambda x: self._print_response(x, opts),
        dont_filter=True,
    )

    # By default, let the framework handle redirects, i.e. the command
    # handles all status codes except 3xx.
    if opts.no_redirect:
        # With no_redirect set, every status code is handled by the command.
        request.meta['handle_httpstatus_all'] = True
    else:
        # Otherwise handle every status code except those in 300-399.
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))

    # Start from the bundled default spider class.
    fallback_cls = DefaultSpider
    loader = self.crawler_process.spider_loader
    # Use the spider named in the options if one was given; otherwise look
    # up a spider matching the request, with the default as third argument.
    if opts.spider:
        spidercls = loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(loader, request, fallback_cls)

    # Only start_requests needs to be supplied: it yields the one request.
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    # Start the crawl.
    self.crawler_process.start()
def test_range_step(self):
    """Check SequenceExclude against a range with a step of 3.

    None of the stepped values are reported as contained, while the
    values of ``range(10, 20)`` skipped by the step are.
    """
    stepped = range(10, 20, 3)
    container = SequenceExclude(stepped)

    # Every value produced by the stepped range is excluded.
    reported_in = [value for value in range(10, 20, 3) if value in container]
    self.assertEqual([], reported_in)

    # Over the full 10..19 span only the skipped values remain.
    reported_in = [value for value in range(10, 20) if value in container]
    self.assertEqual([11, 12, 14, 15, 17, 18], reported_in)
def test_set(self):
    """Anything that is not in the supplied sequence will evaluate as 'in' the container."""
    seq = {-3, "test", 1.1}
    d = SequenceExclude(seq)
    self.assertIn(0, d)
    self.assertIn("foo", d)
    self.assertIn(3.14, d)
    self.assertIn(set("bar"), d)

    # A tuple is hashable, so testing it against a set-backed container
    # does not raise; it is absent from seq and is therefore reported as
    # contained. (The previous form passed the *result* of the membership
    # test to assertRaises, which only "passed" because a bool is not
    # callable.)
    self.assertIn((0, 1, 2), d)

    # The supplied sequence is a set, so a membership check with an
    # unhashable list raises TypeError.
    self.assertRaises(TypeError, d.__contains__, ['a', 'b', 'c'])

    for v in [-3, "test", 1.1]:
        self.assertNotIn(v, d)
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    """Schedule a request, wait for its result and update the shell vars.

    Args:
        request_or_url: a ``Request`` instance, used as given, or a value
            that ``any_to_uri`` converts into the URL of a new request.
        spider: passed to ``self._schedule``; replaced by the spider it
            returns when scheduling succeeds.
        redirect: for a newly built request, whether 3xx statuses are
            left out of the handled status list.
        **kwargs: extra keyword arguments for the newly built ``Request``.
    """
    if not isinstance(request_or_url, Request):
        request = Request(any_to_uri(request_or_url), dont_filter=True, **kwargs)
        # Status handling is configured only on requests built here.
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    else:
        request = request_or_url

    # Stays None if the request is ignored.
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider
        )
    except IgnoreRequest:
        pass

    self.populate_vars(response, request, spider)
def test_stringset_seq(self):
    """Characters in the excluded set are filtered out of ``"abcdefg"``."""
    excluded = set("cde")
    container = SequenceExclude(excluded)
    kept = [ch for ch in "abcdefg" if ch in container]
    self.assertEqual("abfg", "".join(kept))
def test_range(self):
    """Values outside ``range(10, 20)`` are contained; a value inside is not."""
    container = SequenceExclude(range(10, 20))
    # 5 lies below the range; 20 is the exclusive upper bound.
    for outside in (5, 20):
        self.assertIn(outside, container)
    self.assertNotIn(15, container)
def test_list(self):
    """Values absent from the list are contained; a listed value is not."""
    container = SequenceExclude([1, 2, 3])
    for absent in (0, 4):
        self.assertIn(absent, container)
    self.assertNotIn(2, container)
def _handle_statuses(self, allow_redirects):
    """Set ``self.handle_httpstatus_list`` from the redirect setting.

    When ``allow_redirects`` is truthy the attribute becomes a
    ``SequenceExclude`` over ``range(300, 400)``; otherwise it is ``None``.
    """
    statuses = SequenceExclude(range(300, 400)) if allow_redirects else None
    self.handle_httpstatus_list = statuses