class SchedulerServer(object):
    '''
    Scheduler server: sends requests to clients and accepts the results
    they send back.
    '''
    def __init__(self, settings):
        self.settings = settings
        self.mq_class = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        self.mqs = PriorityQueue(self.priority)
        self.status = ScheduleStatus()

    def has_pending_requests(self):
        return len(self) > 0

    def push_queue_request(self, request):
        self._mq_push(request)
        self.status.add_push_queue()

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.status.add_pop_queue()
        return request

    def __len__(self):
        return len(self.mqs)

    def _mq_push(self, request):
        self.mqs.push(request, -request.priority)

    def priority(self, priority):
        return self.mq_class()
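queuelib's PriorityQueue always pops from the numerically smallest priority first, which is why _mq_push above negates request.priority: a higher Scrapy priority becomes a smaller number and is served first. A minimal sketch of the factory protocol PriorityQueue expects (push/pop/close/__len__); SimpleMemoryQueue is illustrative, not part of the original code:

from queuelib import PriorityQueue

class SimpleMemoryQueue:
    """Bare-bones FIFO queue satisfying PriorityQueue's factory protocol."""
    def __init__(self):
        self.items = []
    def push(self, obj):
        self.items.append(obj)
    def pop(self):
        if self.items:
            return self.items.pop(0)
    def close(self):
        self.items = []
    def __len__(self):
        return len(self.items)

# The factory is called once per priority value, on first use.
pq = PriorityQueue(lambda priority: SimpleMemoryQueue())
pq.push('low', -1)     # Scrapy-style: priority 1 pushed as -1
pq.push('high', -10)   # priority 10 pushed as -10
assert pq.pop() == 'high'   # smallest stored priority (-10) pops first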
def open(self, spider):
    # Open the scheduler: bind the spider and instantiate mqs/dqs.
    self.spider = spider
    # PriorityQueue comes from queuelib (see the import section).
    # mqs = memory queues; every entry in the priority queue is one memory
    # queue (scrapy.squeues.LifoMemoryQueue by default).
    self.mqs = PriorityQueue(self._newmq)
    # When JOBDIR is set, self.dqdir is a subdirectory of it:
    # self.dqdir = settings['JOBDIR'] + '/requests.queue'
    # _dqdir() creates that directory while resolving the path.
    # self._dq() returns a PriorityQueue of disk queues and reads any
    # previously persisted state back into memory.
    self.dqs = self._dq() if self.dqdir else None
    # Return the dupefilter's open() result. RFPDupeFilter does not
    # override open(); its base class BaseDupeFilter.open() is just `pass`,
    # so with the default scheduler and dupefilter this returns None.
    return self.df.open()
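The disk queues therefore only exist when JOBDIR is set; this is Scrapy's standard pause/resume mechanism. Enabling it is a one-setting change (the path below is just an example, any writable directory works):

# settings.py
JOBDIR = 'crawls/myspider-1'

or equivalently on the command line: scrapy crawl myspider -s JOBDIR=crawls/myspider-1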
def __init__(self, crawler, dupefilter, jobdir=None, dqclass=None,
             mqclass=None, logunser=False, stats=None, run_as_daemon=False):
    self.df = dupefilter
    self.dqdir = self._dqdir(jobdir)
    self.dqclass = dqclass
    self.mqclass = mqclass
    self.logunser = logunser
    self.stats = stats
    self.run_as_daemon = run_as_daemon
    self.dqs = None
    self.mqs = PriorityQueue(self._newmq)
    # Scheduler.__init__(self, dupefilter, jobdir, dqclass, mqclass, logunser, stats)
    self.redis_handler = redis_handler('localhost', 6379, 0)
    self.redis_handler.connect_db()
    crawler.signals.connect(self.enqueue, signal=signals.request_scheduled)
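For reference, the request_scheduled wiring above follows Scrapy's usual signal pattern: handlers receive the scheduled request and the spider. A minimal standalone sketch (the ScheduledRequestLogger class and its logging are illustrative, not part of the original code):

from scrapy import signals

class ScheduledRequestLogger:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.request_scheduled,
                                signal=signals.request_scheduled)
        return ext

    def request_scheduled(self, request, spider):
        # called every time the scheduler accepts a request
        spider.logger.debug('scheduled: %s', request.url)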
"""
PyPI: https://pypi.python.org/pypi/queuelib
GitHub: https://github.com/scrapy/queuelib
Documentation: none
"""
from queuelib import PriorityQueue, FifoDiskQueue

if __name__ == "__main__":
    import string
    import random
    import time

    def randstr():
        res = list()
        for _ in range(8):
            res.append(random.choice(string.ascii_letters))
        return "".join(res).encode("utf-8")

    tasks = [(randstr(), random.randint(1, 5)) for i in range(1000)]

    st = time.perf_counter()  # wall-clock timer (time.clock was removed in Python 3.8)
    qfactory = lambda priority: FifoDiskQueue('queue-dir-%s' % priority)
    pq = PriorityQueue(qfactory)
    for task in tasks:
        pq.push(*task)
    for _ in range(len(pq)):
        pq.pop()
    print("queuelib takes %.6f sec." % (time.perf_counter() - st,))
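One caveat: the benchmark creates a queue-dir-<priority> directory per priority. Depending on the queuelib version, emptied disk queues may be cleaned up on close; if any directories remain (for example after an interrupted run), a small cleanup sketch:

import glob
import shutil

for d in glob.glob('queue-dir-*'):
    shutil.rmtree(d)   # remove the benchmark's on-disk queues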
class Scheduler(object):
    # A Scheduler instance has three main attributes/data structures:
    # 1. self.dupefilter - the request-deduplication instance.
    # 2. self.dqs - the disk queue manager: one request queue per priority,
    #    keyed by priority.
    # 3. self.mqs - the memory queue manager, organized the same way.
    # __init__ is invoked via the from_crawler classmethod, i.e. you normally
    # need a crawler to instantiate the scheduler conveniently.
    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        # Read from settings; when True, requests that fail to serialize
        # are logged (see _dqpush).
        self.logunser = logunser
        # The crawler's stats collector, used for the scheduler/* counters below.
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        # DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        # SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        # SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        # LOG_UNSERIALIZABLE_REQUESTS: False
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        # len(self) calls __len__ below: the combined length of the memory
        # and disk queues.
        return len(self) > 0

    def __len__(self):
        # If self.dqs (the disk queue) exists, include it; otherwise just mqs.
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def open(self, spider):
        # Open the scheduler: bind the spider and instantiate mqs/dqs.
        self.spider = spider
        # mqs = memory queues; every entry in the priority queue is one
        # memory queue (scrapy.squeues.LifoMemoryQueue by default).
        self.mqs = PriorityQueue(self._newmq)
        # When JOBDIR is set, self.dqdir = settings['JOBDIR'] + '/requests.queue';
        # _dqdir() creates that directory while resolving the path, and
        # self._dq() returns a PriorityQueue of disk queues, reading any
        # persisted state back into memory.
        self.dqs = self._dq() if self.dqdir else None
        # RFPDupeFilter does not override open(); BaseDupeFilter.open() is
        # just `pass`, so with the defaults this returns None.
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            # PriorityQueue.close() closes every queue in dqs and returns
            # the priorities that still hold requests.
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        # mqs needs no handling.
        # self.df.close(): if JOBDIR is set, closes the requests.seen file;
        # otherwise it is a no-op.
        return self.df.close(reason)

    def enqueue_request(self, request):
        # If filtering is enabled and the dupefilter has already seen this
        # request, do nothing except log it.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        # A request that enters the disk queue never enters the memory queue:
        # try disk first, fall back to memory.
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        # Dequeue from the memory queue first; fall back to the disk queue.
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    # Priority-queue enqueue; the target queue is selected by request.priority.
    def _dqpush(self, request):
        # If self.dqs is None (no disk queue instantiated), return immediately.
        if self.dqs is None:
            return
        try:
            # Put the request into the queue for its priority.
            # request_to_dict() converts a Request instance into a dict:
            # d = {'url': request.url.decode('ascii'),  # urls are safe (safe_url_string)
            #      'callback': cb,
            #      'errback': eb,
            #      'method': request.method,
            #      'headers': dict(request.headers),
            #      'body': request.body,
            #      'cookies': request.cookies,
            #      'meta': request.meta,
            #      '_encoding': request._encoding,
            #      'priority': request.priority,
            #      'dont_filter': request.dont_filter}
            reqd = request_to_dict(request, self.spider)
            # The dict is enqueued by priority.
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
                             {'request': request, 'reason': e},
                             exc_info=True, extra={'spider': self.spider})
            return
        else:
            # No error: report success.
            return True

    def _mqpush(self, request):
        # Memory queue: enqueue the request directly, no dict conversion.
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    # Only called when self.dqdir is set.
    def _dq(self):
        # self.dqdir = settings['JOBDIR'] + '/requests.queue'
        # activef = settings['JOBDIR'] + '/requests.queue' + '/active.json'
        activef = join(self.dqdir, 'active.json')
        # If the file exists, open it and load its JSON contents into prios.
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        # Otherwise prios is an empty tuple.
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
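_dqpush/_dqpop rely on this dict round-trip so that requests can be pickled by the disk queues. A quick sketch, assuming a Scrapy version where scrapy.utils.reqser is still available (newer releases moved this functionality onto Request.to_dict):

from scrapy import Request
from scrapy.utils.reqser import request_to_dict, request_from_dict

req = Request('http://example.com', priority=5)
d = request_to_dict(req)      # plain dict: url, method, headers, meta, ...
req2 = request_from_dict(d)   # rebuilt Request
assert req2.url == req.url and req2.priority == 5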
import os
import json
import logging
from os.path import join, exists

from queuelib import PriorityQueue
from scrapy.utils.reqser import request_to_dict, request_from_dict
from scrapy.utils.misc import load_object
from scrapy.utils.job import job_dir

logger = logging.getLogger(__name__)


class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
                             {'request': request, 'reason': e},
                             exc_info=True, extra={'spider': self.spider})
            return
        else:
            return True

    def _mqpush(self, request):
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
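The close()/_dq() pair above is what makes crawls resumable: on shutdown, PriorityQueue.close() returns the priorities that still hold requests and they are dumped to active.json; on the next run they are passed back as startprios. A self-contained sketch of that cycle (the demo-queue-* directory names and local active.json path are illustrative, not the scheduler's real layout):

import json
from queuelib import PriorityQueue, FifoDiskQueue

qfactory = lambda prio: FifoDiskQueue('demo-queue-%s' % prio)

pq = PriorityQueue(qfactory)
pq.push(b'request-a', -10)
prios = pq.close()                   # e.g. [-10]
with open('active.json', 'w') as f:  # mirrors Scheduler.close()
    json.dump(prios, f)

with open('active.json') as f:       # mirrors Scheduler._dq()
    startprios = json.load(f)
pq = PriorityQueue(qfactory, startprios=startprios)
assert pq.pop() == b'request-a'      # the request survived the "restart"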
def push(self, request, priority=0):
    slot = request.get('meta', {}).get('scheduler_slot', None)
    if slot not in self.pqueues:
        self.pqueues[slot] = PriorityQueue(self.qfactory)
        self._slots.append(slot)
    self.pqueues[slot].push(request, priority)
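This push() partitions requests (plain dicts here) by meta['scheduler_slot'], keeping one PriorityQueue per slot. The matching pop is not part of the excerpt; below is a self-contained sketch of the pattern with a hypothetical round-robin pop across slots (SlotQueues and MemQueue are illustrative names):

from collections import deque
from queuelib import PriorityQueue

class MemQueue:
    """Tiny in-memory queue satisfying PriorityQueue's factory protocol."""
    def __init__(self):
        self.q = deque()
    def push(self, obj):
        self.q.append(obj)
    def pop(self):
        return self.q.popleft() if self.q else None
    def close(self):
        pass
    def __len__(self):
        return len(self.q)

class SlotQueues:
    def __init__(self):
        self.pqueues = {}   # slot -> PriorityQueue
        self._slots = []    # insertion order, used for round-robin

    def push(self, request, priority=0):
        slot = request.get('meta', {}).get('scheduler_slot', None)
        if slot not in self.pqueues:
            self.pqueues[slot] = PriorityQueue(lambda prio: MemQueue())
            self._slots.append(slot)
        self.pqueues[slot].push(request, priority)

    def pop(self):
        # assumption: rotate through slots so no single slot starves the rest
        for _ in range(len(self._slots)):
            slot = self._slots.pop(0)
            self._slots.append(slot)
            request = self.pqueues[slot].pop()
            if request is not None:
                return request

sq = SlotQueues()
sq.push({'url': 'http://a.example', 'meta': {'scheduler_slot': 'a'}})
sq.push({'url': 'http://b.example', 'meta': {'scheduler_slot': 'b'}})
assert sq.pop()['meta']['scheduler_slot'] != sq.pop()['meta']['scheduler_slot']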
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                        level=log.ERROR, spider=self.spider, request=request, reason=e)
            return
        else:
            return True  # as in the other versions of this method above
class Slot(object):

    def __init__(self, key, scheduler):
        self.scheduler = scheduler
        self.df = scheduler.dupefilter
        self.dqdir = self._dqdir(scheduler.jobdir, key)
        self.dqclass = scheduler.dqclass
        self.mqclass = scheduler.mqclass
        self.logunser = scheduler.logunser
        self.stats = scheduler.stats
        self.total_concurrency = scheduler.total_concurrency
        self.domain_concurrency = scheduler.domain_concurrency
        self.ip_concurrency = scheduler.ip_concurrency
        self.spider = None
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        self.df.open()  # __init__ must not return a value

    def close(self, reason):
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def has_pending_requests(self):
        return len(self) > 0

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return len(self)

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.meta['priority'])
        except ValueError as e:  # non serializable request
            if self.logunser:
                log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                        level=log.ERROR, spider=self.spider, request=request, reason=e)
            return
        else:
            return True

    def _mqpush(self, request):
        self.mqs.push(request, -request.meta['priority'])

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                    spider=self.spider, queuesize=len(q))
        return q

    def _dqdir(self, jobdir, key):
        if jobdir:
            dqdir = join(jobdir, 'requests.queue.%s' % key)
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
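Compared with the stock scheduler, each Slot persists its disk queues under its own key-suffixed directory. A small illustration of the resulting layout (the jobdir path and slot keys are hypothetical):

from os.path import join

jobdir = 'crawls/job1'                       # hypothetical JOBDIR
for key in ('example.com', 'example.org'):   # hypothetical slot keys
    dqdir = join(jobdir, 'requests.queue.%s' % key)
    print(join(dqdir, 'p0'))                 # one queue per priority, plus active.json
# crawls/job1/requests.queue.example.com/p0
# crawls/job1/requests.queue.example.org/p0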