Пример #1
0
class SchedulerServer():
    '''
    Schedule server.

    Sends requests to a client and accepts the result of each request
    sent back by the client.

    NOTE(review): the ``open()`` method below references ``self.dqdir``,
    ``self._newmq`` and ``self.df``, none of which are defined anywhere in
    this class -- it appears to be pasted from scrapy's core Scheduler;
    confirm before relying on it.
    '''
    def __init__(self, settings):
        self.settings = settings
        # Memory-queue class loaded from the SCHEDULER_MEMORY_QUEUE setting.
        self.mq_class = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        # Priority queue whose per-priority queue factory is the bound
        # method ``self.priority`` defined below.
        self.mqs = PriorityQueue(self.priority)
        self.status = ScheduleStatus()

    def has_pending_requests(self):
        # len(self) delegates to __len__ (length of the memory queues).
        return len(self) > 0

    def push_queue_request(self, request):
        # Enqueue the request and record the push in the status counters.
        self._mq_push(request)
        self.status.add_push_queue()

    def next_request(self):
        # Pop the next request (may be falsy when empty) and record the
        # pop in the status counters.
        request = self.mqs.pop()
        if request:
            self.status.add_pop_queue()
        return request

    def __len__(self):
        return len(self.mqs)

    def _mq_push(self, request):
        # Priority is negated, matching the scrapy scheduler convention.
        self.mqs.push(request, -request.priority)

    def priority(self, priority):
        # Queue factory used by PriorityQueue: one mq_class() per priority.
        return self.mq_class()
    def open(self, spider):
        # Open the scheduler: bind the spider, build mqs (and dqs).
        self.spider = spider

        # PriorityQueue: see the imports.
        # mqs = memory queues; each item in the priority queue is one
        # memory queue (e.g. scrapy.squeues.LifoMemoryQueue).
        self.mqs = PriorityQueue(self._newmq)

        # When JOBDIR is set, self.dqdir is a subdirectory of JOBDIR:
        #   self.dqdir = setting['JOBDIR'] + '/request.queue'
        # (computing the path also creates the directory).
        # self._dq(): returns a disk-backed PriorityQueue, reloading any
        # previously persisted file contents into memory.
        self.dqs = self._dq() if self.dqdir else None

        # Returns the dupefilter's open() result.
        # RFPDupeFilter does not implement open(); its base class
        # BaseDupeFilter.open() is just `pass`, so with the default
        # scheduler and dupefilter this returns None.
        return self.df.open()
Пример #3
0
 def _dq(self):
     """Rebuild the disk-backed priority queue from JOBDIR's active.json.

     Starts from the persisted priority list when the file exists,
     otherwise from an empty set of priorities; logs when resuming a
     non-empty crawl.
     """
     activef = join(self.dqdir, 'active.json')
     prios = ()
     if exists(activef):
         with open(activef) as f:
             prios = json.load(f)
     q = PriorityQueue(self._newdq, startprios=prios)
     if q:
         logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                     {'queuesize': len(q)}, extra={'spider': self.spider})
     return q
Пример #4
0
 def _dq(self):
     # Rebuild the disk-backed priority queue from JOBDIR's active.json.
     # NOTE(review): uses the legacy pre-1.0 scrapy ``log.msg`` API.
     activef = join(self.dqdir, 'active.json')
     # Load persisted priorities when the state file exists...
     if exists(activef):
         with open(activef) as f:
             prios = json.load(f)
     # ...otherwise start with no priorities.
     else:
         prios = ()
     q = PriorityQueue(self._newdq, startprios=prios)
     if q:
         log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                 spider=self.spider, queuesize=len(q))
     return q
Пример #5
0
 def __init__(self, crawler, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None,run_as_daemon=False):
     # Scheduler variant that also mirrors scheduling into Redis: besides
     # the usual dupefilter/queue state it opens a Redis connection
     # (localhost:6379, db 0) and connects ``self.enqueue`` to the
     # ``request_scheduled`` signal.
     # NOTE(review): ``run_as_daemon`` is stored but its effect is not
     # visible in this snippet -- confirm against the rest of the class.
     self.df = dupefilter
     self.dqdir = self._dqdir(jobdir)
     self.dqclass = dqclass
     self.mqclass = mqclass
     self.logunser = logunser
     self.stats = stats
     self.run_as_daemon = run_as_daemon
     self.dqs = None
     self.mqs = PriorityQueue(self._newmq)
     #Scheduler.__init__(self, dupefilter, jobdir, dqclass, mqclass, logunser, stats)
     self.redis_handler = redis_handler('localhost', 6379, 0)
     self.redis_handler.connect_db()
     crawler.signals.connect(self.enqueue, signal=signals.request_scheduled)
Пример #6
0
 def __init__(self, key, scheduler):
     """Per-slot state copied from the parent *scheduler*.

     Bug fix: the original ended with ``return self.df.open()``;
     returning a non-None value from ``__init__`` raises TypeError at
     instantiation, so the dupefilter is now opened without returning
     its result.
     """
     self.scheduler = scheduler
     self.df = scheduler.dupefilter
     self.dqdir = self._dqdir(scheduler.jobdir, key)
     self.dqclass = scheduler.dqclass
     self.mqclass = scheduler.mqclass
     self.logunser = scheduler.logunser
     self.stats = scheduler.stats
     self.total_concurrency = scheduler.total_concurrency
     self.domain_concurrency = scheduler.domain_concurrency
     self.ip_concurrency = scheduler.ip_concurrency
     self.spider = None
     self.mqs = PriorityQueue(self._newmq)
     self.dqs = self._dq() if self.dqdir else None
     self.df.open()
    def _dq(self):
        # self.dqdir == setting['JOBDIR'] + '/request.queue'
        # activef   == setting['JOBDIR'] + '/request.queue' + '/active.json'
        activef = join(self.dqdir, 'active.json')

        # If the state file exists, open it and load its JSON content
        # into prios.
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)

        # If the file does not exist, prios is an empty tuple.
        else:
            prios = ()

        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)},
                        extra={'spider': self.spider})
        return q
Пример #8
0
 def __init__(self, settings):
     """Keep *settings*, create the status tracker, load the memory-queue
     class from SCHEDULER_MEMORY_QUEUE and build the priority queue whose
     per-priority factory is the bound ``self.priority`` method."""
     self.settings = settings
     self.status = ScheduleStatus()
     self.mq_class = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     self.mqs = PriorityQueue(self.priority)
PyPI: https://pypi.python.org/pypi/queuelib
Github: https://github.com/scrapy/queuelib
Documentation: none
"""

from queuelib import PriorityQueue, FifoDiskQueue

if __name__ == "__main__":
    # Micro-benchmark: push/pop 1000 random byte strings through a
    # disk-backed priority queue.
    import random
    import string
    import time

    def randstr():
        """Return a random 8-letter ASCII byte string."""
        return "".join(
            random.choice(string.ascii_letters) for _ in range(8)
        ).encode("utf-8")

    tasks = [(randstr(), random.randint(1, 5)) for _ in range(1000)]

    # Bug fix: time.clock() was removed in Python 3.8;
    # time.perf_counter() is the documented replacement for benchmarking.
    st = time.perf_counter()

    # One FifoDiskQueue directory per priority level.
    qfactory = lambda priority: FifoDiskQueue('queue-dir-%s' % priority)
    pq = PriorityQueue(qfactory)
    for task in tasks:
        pq.push(*task)
    for _ in range(len(pq)):
        pq.pop()

    print("queuelib takes %.6f sec." % (time.perf_counter() - st,))
class Scheduler(object):
    # A Scheduler instance owns three main attributes / data structures:
    # 1. self.df  - the dupe-filter instance (duplicate request filtering).
    # 2. self.dqs - disk queue manager: keyed by priority, one request
    #               queue per priority level.
    # 3. self.mqs - memory queue manager, same layout as self.dqs.

    # __init__ is normally reached through the classmethod from_crawler,
    # i.e. a crawler is usually needed to instantiate the scheduler.
    def __init__(self,
                 dupefilter,
                 jobdir=None,
                 dqclass=None,
                 mqclass=None,
                 logunser=False,
                 stats=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass

        # Read from settings; when True, _dqpush logs requests that fail
        # to serialize for the disk queue (see _dqpush below).
        self.logunser = logunser

        # The crawler's stats collector; used below to count
        # enqueued/dequeued requests.
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings

        #DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        #SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])

        #SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])

        #LOG_UNSERIALIZABLE_REQUESTS: False
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')

        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        # len(self) calls __len__ below: combined length of the memory
        # and disk queues.
        return len(self) > 0

    def __len__(self):
        # If self.dqs (the disk queue) exists, add its length to the
        # memory queue's; otherwise return the memory queue length only.
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def open(self, spider):
        # Open the scheduler: bind the spider, instantiate mqs and dqs.
        self.spider = spider

        # PriorityQueue: see the imports.
        # mqs = memory queues; each item in the priority queue is one
        # memory queue (e.g. scrapy.squeues.LifoMemoryQueue).
        self.mqs = PriorityQueue(self._newmq)

        # When JOBDIR is set, self.dqdir is a subdirectory of JOBDIR:
        #   self.dqdir = setting['JOBDIR'] + '/request.queue'
        # (computing the path also creates the directory).
        # self._dq(): returns a disk-backed PriorityQueue, reloading any
        # persisted file contents into memory.
        self.dqs = self._dq() if self.dqdir else None

        # Returns the dupefilter's open() result.
        # RFPDupeFilter does not implement open(); its base class
        # BaseDupeFilter.open() is just `pass`, so with the default
        # scheduler and dupefilter this returns None.
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            # PriorityQueue.close() collects the active priorities and
            # closes every queue held by dqs.
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        # mqs needs no persistence.

        # self.df.close(): when JOBDIR is set this closes the
        # request.seen file; otherwise it is a no-op.
        return self.df.close(reason)

    def enqueue_request(self, request):
        # Filtering is enabled and the dupefilter has already seen this
        # request: do nothing but log it.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False

        # A request that enters the disk queue never enters the memory
        # queue: try disk first, fall back to memory.
        dqok = self._dqpush(request)
        if dqok:
            # Count the enqueue in the crawler stats.
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory',
                                 spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        # Dequeue from the memory queue first; if it is empty, fall back
        # to the disk queue.
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory',
                                 spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk',
                                     spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    # Priority-queue insert; the target queue is selected by the
    # request's priority attribute.
    def _dqpush(self, request):
        # self.dqs is None means no disk queue was created; bail out.
        if self.dqs is None:
            return
        try:
            # Put the request into the queue for its priority.
            # request_to_dict converts a Request instance into a dict:
            # d = {'url': request.url.decode('ascii'), # urls should be safe (safe_string_url)
            #'callback': cb,
            #'errback': eb,
            #'method': request.method,
            #'headers': dict(request.headers),
            #'body': request.body,
            #'cookies': request.cookies,
            #'meta': request.meta,
            #'_encoding': request._encoding,
            #'priority': request.priority,
            #'dont_filter': request.dont_filter,}
            reqd = request_to_dict(request, self.spider)
            # The dict is enqueued under the negated priority.
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                logger.error(
                    "Unable to serialize request: %(request)s - reason: %(reason)s",
                    {
                        'request': request,
                        'reason': e
                    },
                    exc_info=True,
                    extra={'spider': self.spider})
            return
        else:
            # No error: report success.
            return True

    def _mqpush(self, request):
        # Memory queue: enqueue the request directly, no dict conversion.
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    # Only called when self.dqdir is set.
    def _dq(self):
        # self.dqdir == setting['JOBDIR'] + '/request.queue'
        # activef   == setting['JOBDIR'] + '/request.queue' + '/active.json'
        activef = join(self.dqdir, 'active.json')

        # If the state file exists, load its JSON content into prios.
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)

        # Otherwise prios is an empty tuple.
        else:
            prios = ()

        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)},
                        extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
Пример #11
0
class Scheduler(object):
    """Request scheduler: a dupefilter plus a memory priority queue and,
    when a JOBDIR is configured, a disk-backed priority queue."""

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None):
        """Store collaborators; the queues themselves are built in open()."""
        self.df = dupefilter
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats
        self.dqdir = self._dqdir(jobdir)

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor reading classes and flags from settings."""
        s = crawler.settings
        df_cls = load_object(s['DUPEFILTER_CLASS'])
        return cls(
            df_cls.from_settings(s),
            job_dir(s),
            load_object(s['SCHEDULER_DISK_QUEUE']),
            load_object(s['SCHEDULER_MEMORY_QUEUE']),
            s.getbool('LOG_UNSERIALIZABLE_REQUESTS'),
            crawler.stats,
        )

    def has_pending_requests(self):
        """True when either queue still holds requests."""
        return len(self) != 0

    def open(self, spider):
        """Bind *spider*, build the queues, open the dupefilter."""
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        if self.dqdir:
            self.dqs = self._dq()
        else:
            self.dqs = None
        return self.df.open()

    def close(self, reason):
        """Persist active disk-queue priorities, then close the dupefilter."""
        if self.dqs:
            active = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(active, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Enqueue *request* unless filtered; disk first, memory fallback."""
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        if self._dqpush(request):
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        """Pop the next request: memory queue first, then disk."""
        request = self.mqs.pop()
        bucket = 'scheduler/dequeued/memory'
        if not request:
            request = self._dqpop()
            bucket = 'scheduler/dequeued/disk'
        if request:
            self.stats.inc_value(bucket, spider=self.spider)
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        total = len(self.mqs)
        if self.dqs:
            total += len(self.dqs)
        return total

    def _dqpush(self, request):
        """Try to persist *request* to disk; True on success, None otherwise."""
        if self.dqs is None:
            return
        try:
            self.dqs.push(request_to_dict(request, self.spider),
                          -request.priority)
        except ValueError as e: # non serializable request
            if self.logunser:
                logger.error("Unable to serialize request: %(request)s - reason: %(reason)s",
                             {'request': request, 'reason': e},
                             exc_info=True, extra={'spider': self.spider})
            return
        return True

    def _mqpush(self, request):
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if not self.dqs:
            return None
        d = self.dqs.pop()
        return request_from_dict(d, self.spider) if d else None

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Rebuild the disk priority queue from JOBDIR's active.json."""
        activef = join(self.dqdir, 'active.json')
        prios = ()
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        """Create (if needed) and return JOBDIR/requests.queue, else None."""
        if not jobdir:
            return None
        dqdir = join(jobdir, 'requests.queue')
        if not exists(dqdir):
            os.makedirs(dqdir)
        return dqdir
Пример #12
0
 def push(self, request, priority=0):
     """Route *request* into the per-slot priority queue named by
     request['meta']['scheduler_slot'] (None when absent), creating the
     slot's PriorityQueue and registering the slot on first use."""
     slot = request.get('meta', {}).get('scheduler_slot', None)
     if slot not in self.pqueues:
         self._slots.append(slot)
         self.pqueues[slot] = PriorityQueue(self.qfactory)
     self.pqueues[slot].push(request, priority)
Пример #13
0
class Scheduler(object):
    """Scrapy-style request scheduler.

    Holds a dupefilter (``self.df``), a memory priority queue
    (``self.mqs``) and, when a JOBDIR is configured, a disk-backed
    priority queue (``self.dqs``).
    """
    def __init__(self,
                 dupefilter,
                 jobdir=None,
                 dqclass=None,
                 mqclass=None,
                 logunser=False,
                 stats=None):
        # Collaborators only; the queues themselves are created in open().
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: load classes and flags from settings."""
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        """True when either queue still holds requests."""
        return len(self) > 0

    def open(self, spider):
        """Bind *spider*, build mqs/dqs, return the dupefilter's open()."""
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        """Persist active disk-queue priorities, then close the dupefilter."""
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        """Enqueue *request* unless filtered; disk first, memory fallback."""
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory',
                                 spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        """Pop the next request: memory queue first, then disk."""
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory',
                                 spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk',
                                     spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        # Combined length of disk and memory queues.
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        """Try to persist *request* to disk; True on success, None otherwise."""
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                logger.error(
                    "Unable to serialize request: %(request)s - reason: %(reason)s",
                    {
                        'request': request,
                        'reason': e
                    },
                    exc_info=True,
                    extra={'spider': self.spider})
            return
        else:
            return True

    def _mqpush(self, request):
        # Memory queue takes the request object directly.
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        # Pop a dict from disk and rebuild the Request, if any.
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        # Factory for a single memory queue (one per priority).
        return self.mqclass()

    def _newdq(self, priority):
        # Factory for a single disk queue directory (one per priority).
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Rebuild the disk priority queue from JOBDIR's active.json."""
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)},
                        extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        """Create (if needed) and return JOBDIR/requests.queue, else None."""
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
Пример #14
0
# NOTE(review): this pasted example is Python 2 code (``except ValueError, e``)
# and is truncated -- the trailing ``else:`` at the end has no body, so the
# class as pasted does not parse under any Python version.  Kept
# byte-identical; comments only.
class Scheduler(object):

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None, logunser=False, stats=None):
        # Collaborators only; queues are created in open().
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: load classes and flags from settings.
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser, crawler.stats)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        # Bind the spider, build the queues, open the dupefilter.
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        # Persist active disk-queue priorities, then close the dupefilter.
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        # Unlike the other variants in this file, this one returns None
        # (not False/True) in every branch.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)

    def next_request(self):
        # Memory queue first, then disk.
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError, e: # non serializable request
            if self.logunser:
                log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                        level=log.ERROR, spider=self.spider,
                        request=request, reason=e)
            return
        else:
Пример #15
0
class Slot(object):
    """Per-key scheduler slot: its own dupefilter view, memory priority
    queue and optional disk queue under JOBDIR/requests.queue.<key>.

    Bug fixes over the pasted original:
    * ``__init__`` no longer ``return``s ``self.df.open()`` -- returning a
      non-None value from ``__init__`` raises TypeError at instantiation.
    * ``_dq()``: the loaded priority list was bound to a misspelled
      variable (``priiios``) and never used, causing a NameError whenever
      active.json existed; it is now bound to ``prios``.
    """

    def __init__(self, key, scheduler):
        self.scheduler = scheduler
        self.df = scheduler.dupefilter
        self.dqdir = self._dqdir(scheduler.jobdir, key)
        self.dqclass = scheduler.dqclass
        self.mqclass = scheduler.mqclass
        self.logunser = scheduler.logunser
        self.stats = scheduler.stats
        self.total_concurrency = scheduler.total_concurrency
        self.domain_concurrency = scheduler.domain_concurrency
        self.ip_concurrency = scheduler.ip_concurrency
        self.spider = None
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        # Open the dupefilter; __init__ must not return its result.
        self.df.open()

    def close(self, reason):
        """Persist active disk-queue priorities, then close the dupefilter."""
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def has_pending_requests(self):
        """True when either queue still holds requests."""
        return len(self) > 0

    def enqueue_request(self, request):
        """Enqueue *request* (disk first, memory fallback).

        Returns the new total queue length, or None when the request was
        filtered out by the dupefilter.
        """
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)

        return len(self)

    def next_request(self):
        """Pop the next request: memory queue first, then disk queue."""
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        """Try to persist *request* to disk; True on success, None otherwise.

        NOTE: this variant reads the priority from request.meta['priority']
        (not request.priority) -- raises KeyError when the key is absent.
        """
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.meta['priority'])
        except ValueError as e: # non serializable request
            if self.logunser:
                log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s",
                        level=log.ERROR, spider=self.spider,
                        request=request, reason=e)
            return
        else:
            return True

    def _mqpush(self, request):
        self.mqs.push(request, -request.meta['priority'])

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()

    def _newdq(self, priority):
        return self.dqclass(join(self.dqdir, 'p%s' % priority))

    def _dq(self):
        """Rebuild the disk priority queue from this slot's active.json."""
        activef = join(self.dqdir, 'active.json')
        if exists(activef):
            with open(activef) as f:
                prios = json.load(f)
        else:
            prios = ()
        q = PriorityQueue(self._newdq, startprios=prios)
        if q:
            log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                    spider=self.spider, queuesize=len(q))
        return q

    def _dqdir(self, jobdir, key):
        """Create (if needed) and return JOBDIR/requests.queue.<key>."""
        if jobdir:
            dqdir = join(jobdir, 'requests.queue.%s'%key)
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir
Пример #16
0
# NOTE(review): Python 2 snippet (``except ValueError, e``) and truncated --
# the trailing ``else:`` has no body, so the class as pasted does not parse.
# Kept byte-identical; comments only.
class Scheduler(object):
    def __init__(self,
                 dupefilter,
                 jobdir=None,
                 dqclass=None,
                 mqclass=None,
                 logunser=False,
                 stats=None):
        # Collaborators only; queues are created in open().
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: load classes and flags from settings.
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS')
        return cls(dupefilter, job_dir(settings), dqclass, mqclass, logunser,
                   crawler.stats)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        # Bind the spider, build the queues, open the dupefilter.
        self.spider = spider
        self.mqs = PriorityQueue(self._newmq)
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        # Persist active disk-queue priorities, then close the dupefilter.
        if self.dqs:
            prios = self.dqs.close()
            with open(join(self.dqdir, 'active.json'), 'w') as f:
                json.dump(prios, f)
        return self.df.close(reason)

    def enqueue_request(self, request):
        # Returns None in every branch in this variant.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory',
                                 spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)

    def next_request(self):
        # Memory queue first, then disk.
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory',
                                 spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk',
                                     spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError, e:  # non serializable request
            if self.logunser:
                log.msg(
                    format=
                    "Unable to serialize request: %(request)s - reason: %(reason)s",
                    level=log.ERROR,
                    spider=self.spider,
                    request=request,
                    reason=e)
            return
        else:
Пример #17
0
 def open(self, spider):
     """Bind *spider*, build the in-memory priority queue and the optional
     disk queue, then pass through the dupefilter's open() result."""
     self.spider = spider
     self.mqs = PriorityQueue(self._newmq)
     if self.dqdir:
         self.dqs = self._dq()
     else:
         self.dqs = None
     return self.df.open()
Пример #18
0
 def open(self, spider):
     """Called when the spider is opened.

     Builds the memory priority queue, the disk queue when a JOBDIR is
     configured (``self.dqdir`` set), and returns whatever the
     dupefilter's open() returns.
     """
     self.spider = spider
     self.mqs = PriorityQueue(self._newmq)
     self.dqs = self._dq() if self.dqdir else None
     return self.df.open()