def next_requests(self):
    """Pop up to ``redis_batch_size`` URLs from the Redis task queue and
    yield a ``Request`` for each.

    Uses SPOP when ``use_set`` is true (queue stored as a Redis set),
    LPOP otherwise (queue stored as a list). Stops early once the queue
    is drained, then logs how many requests were actually read.
    """
    fetch_one = self.redis_con.spop if self.use_set else self.redis_con.lpop
    found = 0
    while found < self.redis_batch_size:
        data = fetch_one(self.task_queue)
        if not data:
            # Queue drained before the batch was filled.
            break
        # Redis returns bytes; decode to get the URL string.
        # The original guarded `yield` with `if req:`, but a freshly
        # constructed Request is always truthy -- the check was dead code.
        yield Request(data.decode())
        found += 1
    crawler_logger.info('Read {} requests from {}'.format(
        found, self.task_queue))
def next_requests_process(self, task_queue):
    """Consume proxy addresses from *task_queue* and yield one request
    per (proxy, url) combination.

    Each entry popped from Redis is treated as a proxy URL; every URL in
    ``self.urls`` is then scheduled through that proxy. At most
    ``redis_batch_size`` proxies are consumed per call, and the number
    of proxies actually read is logged.
    """
    pop = self.redis_con.spop if self.use_set else self.redis_con.lpop
    count = 0
    while count < self.redis_batch_size:
        raw = pop(task_queue)
        if not raw:
            # Nothing left in the queue; stop before filling the batch.
            break
        proxy = raw.decode()
        for target in self.urls:
            yield Request(target,
                          meta={'proxy': proxy},
                          callback=self.parse,
                          errback=self.parse_error)
        count += 1
    crawler_logger.info('Read {} ip proxies from {}'.format(
        count, task_queue))
def next_requests(self):
    """Pop up to ``redis_batch_size`` URLs from the Redis task queue and
    yield a ``SplashRequest`` (``render.html`` endpoint) for each.

    Uses SPOP when ``use_set`` is true, LPOP otherwise, and stops early
    once the queue is drained.
    """
    fetch_one = self.redis_con.spop if self.use_set else self.redis_con.lpop
    found = 0
    while found < self.redis_batch_size:
        data = fetch_one(self.task_queue)
        if not data:
            # Queue drained before the batch was filled.
            break
        url = data.decode()
        # NOTE(review): Splash's HTTP API documents 'wait' (seconds to
        # pause after page load), not 'await' -- confirm this argument
        # name is intentional, otherwise the delay is silently ignored.
        # The original guarded `yield` with `if req:`, but a freshly
        # constructed request object is always truthy -- dead code removed.
        yield SplashRequest(url, args={
            'await': 2,
            'timeout': 90
        }, endpoint='render.html')
        found += 1
    crawler_logger.info('Read {} requests from {}'.format(
        found, self.task_queue))