def next_request(self):
    """Pop one queued item from a randomly chosen Redis queue and build a Request.

    Every key matching ``self.queue_name`` is looked up, one of them is
    picked at random, and the member at rank 0 of that sorted set is
    removed and decoded as JSON.

    Returns:
        The populated ``Request``, or ``None`` when no key matches or the
        chosen queue yields no item.
    """
    queues = self.redis_conn.keys(self.queue_name)
    if not queues:
        return None

    queue = random.choice(queues)
    self.logger.info(
        "length of queue %s is %s" % (queue, self.redis_conn.zcard(queue)))

    if self.settings.get("CUSTOM_REDIS"):
        # The connection object provides its own zpop in this configuration.
        item = self.redis_conn.zpop(queue)
    else:
        # Read and remove the rank-0 member inside one MULTI transaction.
        pipe = self.redis_conn.pipeline()
        pipe.multi()
        pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
        result, _removed = pipe.execute()
        # The queue may have been emptied between keys() and this pop, in
        # which case zrange returns an empty list; do not index into it.
        item = result[0] if result else None

    if not item:
        return None

    item = json.loads(item)
    try:
        req = Request(item['url'])
    except ValueError:
        # Request rejected the URL as given (presumably a missing scheme);
        # retry with an explicit http:// prefix.
        req = Request('http://' + item['url'])

    # Callback/errback are stored as method names and resolved on the spider.
    cb = item.get('callback')
    if cb and self.spider:
        req.callback = getattr(self.spider, cb)
    eb = item.get('errback')
    if eb and self.spider:
        req.errback = getattr(self.spider, eb)

    # When the payload has a 'meta' entry, that mapping is used from here on;
    # otherwise the whole payload doubles as the meta mapping.
    meta = item['meta'] if 'meta' in item else item

    # defaults not in schema
    meta.setdefault('curdepth', 0)
    meta.setdefault('retry_times', 0)

    for key, value in meta.items():
        req.meta[key] = value

    useragent = meta.get('useragent')
    if useragent is not None:
        req.headers['User-Agent'] = useragent

    cookie = meta.get('cookie')
    if cookie is not None:
        if isinstance(cookie, dict):
            req.cookies = cookie
        elif isinstance(cookie, (str, bytes)):
            # `basestring` does not exist on Python 3; accept text or bytes.
            req.cookies = parse_cookie(cookie)

    return req
def next_request(self):
    """Pop one pickled item from ``self.queue_name`` and build a Request.

    At most one item is handed out per ``self.request_interval`` seconds
    (measured from ``self.last_acs_time``). The popped payload is also kept
    on ``self.present_item``.

    Returns:
        The populated ``Request``, or ``None`` when the call is rate-limited
        or the queue yields no item.
    """
    queue = self.queue_name
    self.logger.info(
        "length of queue %s is %s" % (queue, self.redis_conn.zcard(queue)))

    # Rate limit: the previous hand-out is still within the interval.
    if self.last_acs_time > time.time() - self.request_interval:
        return None

    if self.settings.getbool("CUSTOM_REDIS"):
        # The connection object provides its own zpop in this configuration.
        raw = self.redis_conn.zpop(queue)
    else:
        # Read and remove the rank-0 member inside one MULTI transaction.
        pipe = self.redis_conn.pipeline()
        pipe.multi()
        pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
        popped, _removed = pipe.execute()
        raw = popped[0] if popped else None

    if not raw:
        return None

    self.last_acs_time = time.time()
    # NOTE(review): pickle.loads can execute arbitrary code; this is only
    # safe if everything written to the Redis queue is trusted - confirm.
    item = pickle.loads(raw)
    self.present_item = item

    request_kwargs = {
        "headers": item.get("headers", {}),
        "body": item.get("body"),
        "method": item.get("method") or "GET",
    }
    try:
        req = Request(item['url'], **request_kwargs)
    except ValueError:
        # Request rejected the URL as given (presumably a missing scheme);
        # retry with an explicit http:// prefix.
        req = Request('http://' + item['url'], **request_kwargs)

    # Callback/errback are stored as method names and resolved on the spider.
    for hook in ('callback', 'errback'):
        if hook in item:
            method_name = item[hook]
            if method_name and self.spider:
                setattr(req, hook, getattr(self.spider, method_name))

    # When the payload has a 'meta' entry, that mapping is used from here on;
    # otherwise the whole payload doubles as the meta mapping.
    meta = item['meta'] if 'meta' in item else item

    # defaults not in schema
    for field in ('curdepth', 'retry_times'):
        if field not in meta:
            meta[field] = 0

    for key in meta.keys():
        req.meta[key] = meta[key]

    if 'useragent' in meta and meta['useragent'] is not None:
        req.headers['User-Agent'] = meta['useragent']

    if 'cookie' in meta and meta['cookie'] is not None:
        cookie = meta['cookie']
        if isinstance(cookie, dict):
            req.cookies = cookie
        elif isinstance(cookie, (str, bytes)):
            req.cookies = parse_cookie(cookie)

    return req