def test_six_range(self):
     """Check exclusion semantics against a ``six.moves.range`` sequence.

     Values outside the excluded range must be reported as contained,
     while a value inside the range must not.
     """
     import six.moves

     excluded = SequenceExclude(six.moves.range(10**3, 10**6))
     # One probe below the excluded range and one above it.
     for outside in (10**2, 10**7):
         self.assertIn(outside, excluded)
     # A probe that lies inside the excluded range.
     self.assertNotIn(10**4, excluded)
Пример #2
0
    def run(self, args, opts):
        """Request the single URL in ``args`` and print the response.

        The request is built with ``self._print_response`` as its callback
        (receiving ``opts`` through ``cb_kwargs``) and is handed to
        ``self.crawler_process`` as the only start request.

        Raises:
            UsageError: if ``args`` does not hold exactly one item, or
                ``is_url`` rejects that item.
        """
        if not (len(args) == 1 and is_url(args[0])):
            raise UsageError()

        url = args[0]
        request = Request(
            url,
            callback=self._print_response,
            cb_kwargs={"opts": opts},
            dont_filter=True,
        )

        if opts.no_redirect:
            # Redirects are not delegated: the command handles every status.
            request.meta["handle_httpstatus_all"] = True
        else:
            # By default, let the framework handle redirects,
            # i.e. the command handles all codes except 3xx.
            redirect_codes = range(300, 400)
            request.meta["handle_httpstatus_list"] = SequenceExclude(
                redirect_codes)

        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            # An explicitly named spider takes precedence.
            spidercls = spider_loader.load(opts.spider)
        else:
            # Otherwise look one up for the request, passing DefaultSpider
            # as the default.
            spidercls = spidercls_for_request(
                spider_loader, request, DefaultSpider)

        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Пример #3
0
 def run(self, args, opts):
     """Request the single URL in ``args`` and print the response.

     The response is forwarded, together with ``opts``, to
     ``self._print_response``. The request is handed to
     ``self.crawler_process`` as the only start request.

     Raises:
         UsageError: if ``args`` does not hold exactly one item, or
             ``is_url`` rejects that item.
     """
     # Argument validation: exactly one positional argument, and it must
     # be a URL.
     if not (len(args) == 1 and is_url(args[0])):
         raise UsageError()

     # Output callback: passes the response and the options to the printer.
     def cb(x):
         return self._print_response(x, opts)

     # Build the request object.
     request = Request(args[0], callback=cb, dont_filter=True)

     if opts.no_redirect:
         # Redirects are not delegated: every status code is handled here.
         request.meta['handle_httpstatus_all'] = True
     else:
         # By default, let the framework handle redirects, i.e. the command
         # handles all codes except those in range(300, 400).
         request.meta['handle_httpstatus_list'] = SequenceExclude(
             range(300, 400))

     # Spider loader of the crawler process.
     spider_loader = self.crawler_process.spider_loader
     if opts.spider:
         # A spider was named in the options: crawl with that one.
         spidercls = spider_loader.load(opts.spider)
     else:
         # Otherwise look for a spider matching the request, passing the
         # bundled DefaultSpider as the default.
         spidercls = spidercls_for_request(
             spider_loader, request, DefaultSpider)

     # Only start_requests needs to be supplied to fetch the given URL.
     self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
     # Start crawling.
     self.crawler_process.start()
    def test_range_step(self):
        """Check exclusion semantics against a range that uses a step."""
        excluded = SequenceExclude(range(10, 20, 3))

        # No member of the stepped range may be reported as contained.
        leaked = [n for n in range(10, 20, 3) if n in excluded]
        self.assertEqual([], leaked)

        # Across the full span, exactly the values skipped by the step are
        # reported as contained.
        contained = [n for n in range(10, 20) if n in excluded]
        self.assertEqual([11, 12, 14, 15, 17, 18], contained)
    def test_set(self):
        """Anything that is not in the supplied sequence will evaluate as 'in' the container."""
        seq = {-3, "test", 1.1}
        d = SequenceExclude(seq)
        self.assertIn(0, d)
        self.assertIn("foo", d)
        self.assertIn(3.14, d)
        self.assertIn(set("bar"), d)

        # A tuple is hashable, so probing with one does not raise; it is not
        # a member of the supplied set and therefore counts as contained.
        # (Passing the already-evaluated ``(0, 1, 2) in d`` to assertRaises
        # would only "pass" because calling a bool raises TypeError.)
        self.assertIn((0, 1, 2), d)

        # The supplied sequence is a set, so probing with an unhashable list
        # is expected to raise TypeError. The callable and its argument are
        # passed separately so that assertRaises performs the call itself.
        self.assertRaises(TypeError, d.__contains__, ['a', 'b', 'c'])

        for v in [-3, "test", 1.1]:
            self.assertNotIn(v, d)
Пример #6
0
 def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
     """Schedule a request and populate the variables with its outcome.

     ``request_or_url`` may be a ready-made ``Request``, which is used
     as is; ``redirect`` and ``kwargs`` are then not applied. Anything
     else is converted with ``any_to_uri`` and wrapped in a new
     ``Request`` built with ``dont_filter=True`` and ``kwargs``.

     If scheduling raises ``IgnoreRequest``, the variables are populated
     with a ``None`` response.
     """
     if not isinstance(request_or_url, Request):
         request = Request(
             any_to_uri(request_or_url), dont_filter=True, **kwargs)
         if redirect:
             # Handle every status here except those in range(300, 400).
             request.meta['handle_httpstatus_list'] = SequenceExclude(
                 range(300, 400))
         else:
             request.meta['handle_httpstatus_all'] = True
     else:
         request = request_or_url

     try:
         response, spider = threads.blockingCallFromThread(
             reactor, self._schedule, request, spider)
     except IgnoreRequest:
         # No response is available; ``spider`` keeps the caller's value.
         response = None
     self.populate_vars(response, request, spider)
 def test_stringset_seq(self):
     """Check exclusion semantics against a set of single characters."""
     excluded = SequenceExclude(set("cde"))
     # Keep only the letters the container reports as contained.
     kept = [letter for letter in "abcdefg" if letter in excluded]
     self.assertEqual("abfg", "".join(kept))
 def test_range(self):
     """Check exclusion semantics against a plain ``range``."""
     excluded = SequenceExclude(range(10, 20))
     # 5 lies below the range and 20 is its exclusive upper bound.
     for outside in (5, 20):
         self.assertIn(outside, excluded)
     # 15 lies inside the excluded range.
     self.assertNotIn(15, excluded)
 def test_list(self):
     """Check exclusion semantics against a list."""
     excluded = SequenceExclude([1, 2, 3])
     # Values absent from the list are reported as contained.
     for absent in (0, 4):
         self.assertIn(absent, excluded)
     # A value present in the list is not.
     self.assertNotIn(2, excluded)
Пример #10
0
 def _handle_statuses(self, allow_redirects):
     """Set ``self.handle_httpstatus_list`` from ``allow_redirects``.

     When ``allow_redirects`` is truthy the attribute becomes a
     ``SequenceExclude`` over ``range(300, 400)``; otherwise it is ``None``.
     """
     self.handle_httpstatus_list = (
         SequenceExclude(range(300, 400)) if allow_redirects else None
     )