Example #1
0
    def next_requests(self):
        """Pop up to ``self.redis_batch_size`` URLs from the Redis task
        queue and yield a ``Request`` for each.

        Uses ``spop`` when ``self.use_set`` is true (queue is a Redis
        set), otherwise ``lpop`` (queue is a Redis list).  Stops early
        when the queue is drained, then logs how many requests were read.
        """
        fetch_one = self.redis_con.spop if self.use_set else self.redis_con.lpop
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.task_queue)
            if not data:
                # Queue exhausted -- nothing left to schedule this batch.
                break
            url = data.decode()
            # Request(...) always returns a truthy object, so the former
            # ``if req:`` guard was dead code and has been removed.
            yield Request(url)
            found += 1

        crawler_logger.info('Read {} requests from {}'.format(
            found, self.task_queue))
Example #2
0
 def next_requests_process(self, task_queue):
     """Pop up to ``self.redis_batch_size`` proxy addresses from
     *task_queue* and yield one ``Request`` per (url, proxy) pair.

     For every proxy fetched, a request is yielded for each URL in
     ``self.urls``, carrying the proxy in ``meta['proxy']`` and routed
     to ``self.parse`` / ``self.parse_error``.
     """
     fetch_one = self.redis_con.spop if self.use_set else self.redis_con.lpop
     found = 0
     while found < self.redis_batch_size:
         data = fetch_one(task_queue)
         if not data:
             # Queue drained -- stop this batch early.
             break
         proxy_url = data.decode()
         # BUG FIX: the original incremented ``found`` once per yielded
         # request, so the batch limit was reached len(self.urls) times
         # too early and the 'ip proxies' log line over-reported.
         # ``found`` now counts proxies consumed, matching both the
         # batch-size semantics and the log message.
         found += 1
         for url in self.urls:
             yield Request(url,
                           meta={'proxy': proxy_url},
                           callback=self.parse,
                           errback=self.parse_error)
     crawler_logger.info('Read {} ip proxies from {}'.format(
         found, task_queue))
Example #3
0
    def next_requests(self):
        """Pop up to ``self.redis_batch_size`` URLs from the Redis task
        queue and yield a JS-rendering ``SplashRequest`` for each.

        Uses ``spop`` when ``self.use_set`` is true (Redis set queue),
        otherwise ``lpop`` (Redis list queue).  Stops early once the
        queue is empty, then logs how many requests were read.
        """
        fetch_one = self.redis_con.spop if self.use_set else self.redis_con.lpop
        found = 0
        while found < self.redis_batch_size:
            data = fetch_one(self.task_queue)
            if not data:
                # Queue exhausted -- end this batch.
                break
            url = data.decode()
            # SplashRequest(...) always returns a truthy object, so the
            # former ``if req:`` guard was dead code and has been removed.
            yield SplashRequest(url,
                                args={
                                    'await': 2,    # seconds Splash waits after page load
                                    'timeout': 90  # hard render timeout (seconds)
                                },
                                endpoint='render.html')
            found += 1

        crawler_logger.info('Read {} requests from {}'.format(
            found, self.task_queue))