Exemplo n.º 1
0
class Scheduler(object):
    """Redis-based scheduler.

    Keeps a request queue and a duplicate filter, both created in
    ``open()`` from the Redis client given to the constructor.
    """

    def __init__(self, redis_cli, persist, queue_key):
        """Store the configuration; queue and dupefilter are built in open().

        Args:
            redis_cli: Client object handed to the queue and the dupefilter.
            persist: When falsy, ``close()`` clears the dupefilter and queue.
            queue_key: Initial queue key; ``open()`` replaces it with the
                per-spider key returned by ``get_queue_key()``.
        """
        self.redis_cli = redis_cli
        self.persist = persist
        self.queue_key = queue_key
        # Defined up front so __len__() and close() are safe before open().
        self.spider = None
        self.queue = None
        self.df = None

    def __len__(self):
        """Return the number of queued requests (0 before ``open()``)."""
        if self.queue is None:
            return 0
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from a settings object.

        Reads ``SCHEDULER_PERSIST`` and ``SCHEDULER_QUEUE_KEY`` from
        ``settings``, falling back to the module-level defaults, and uses
        ``CyeRedis.getInstance()`` as the Redis client.
        """
        redis_cli = CyeRedis.getInstance()
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        return cls(redis_cli, persist, queue_key)

    def open(self, spider):
        """Attach ``spider`` and create its queue and dupefilter.

        The queue and dupefilter keys are derived from the spider and logged
        through ``spider.log``; if the queue already holds requests, a
        "Resuming crawl" message is logged as well.
        """
        self.spider = spider
        self.queue_key = self.get_queue_key(spider)
        self.queue = SpiderQueue(self.redis_cli, spider, self.queue_key)
        dupefilter_key = self.get_dupefilter_key(spider)
        self.df = RFPDupeFilter(self.redis_cli, dupefilter_key)

        # Every log call below goes through the spider, so skip them all
        # when there is none (the resume message used to be unguarded).
        if spider is None:
            return

        spider.log("Queue key of redis (%s)" % self.queue_key, log.INFO)
        spider.log("Dupefilter key of redis (%s)" % dupefilter_key, log.INFO)

        # notice if there are requests already in the queue
        pending = len(self.queue)
        if pending:
            spider.log("Resuming crawl (%d requests scheduled)" % pending)

    def close(self, reason):
        """Clear the dupefilter and the queue unless ``persist`` is set.

        ``reason`` is accepted for interface compatibility and is not used.
        """
        if self.persist:
            return
        # Either attribute is still None when close() runs without open().
        if self.df is not None:
            self.df.clear()
        if self.queue is not None:
            self.queue.clear()

    def enqueue_request(self, request):
        """Push ``request`` onto the queue unless the dupefilter has seen it.

        Requests with ``dont_filter`` set bypass the dupefilter check.
        """
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return whatever the queue's ``pop()`` yields."""
        return self.queue.pop()

    def has_pending_requests(self):
        """Return True when at least one request is queued."""
        return len(self) > 0

    def get_queue_key(self, spider):
        """Return the redis queue key for the given spider, as ``str``."""
        qkey = QUEUE_KEY % {
            'namespace': spider.namespace,
            'spider': spider.name,
        }
        return str(qkey)

    def get_dupefilter_key(self, spider):
        """Return the redis dupefilter key for the given spider."""
        return DUPEFILTER_KEY % {
            'namespace': spider.namespace,
            'spider': spider.name,
        }
Exemplo n.º 2
0
    def open(self, spider):
        """Attach ``spider`` and create its queue and dupefilter.

        The queue and dupefilter keys are derived from the spider and logged
        through ``spider.log``; if the queue already holds requests, a
        "Resuming crawl" message is logged as well.
        """
        self.spider = spider
        self.queue_key = self.get_queue_key(spider)
        self.queue = SpiderQueue(self.redis_cli, spider, self.queue_key)
        dupefilter_key = self.get_dupefilter_key(spider)
        self.df = RFPDupeFilter(self.redis_cli, dupefilter_key)

        # Every log call below goes through the spider, so skip them all
        # when there is none (the resume message used to be unguarded).
        if spider is None:
            return

        spider.log("Queue key of redis (%s)" % self.queue_key, log.INFO)
        spider.log("Dupefilter key of redis (%s)" % dupefilter_key, log.INFO)

        # notice if there are requests already in the queue
        pending = len(self.queue)
        if pending:
            spider.log("Resuming crawl (%d requests scheduled)" % pending)
Exemplo n.º 3
0
 def open(self, spider):
     """Attach ``spider`` and create its queue and dupefilter.

     The queue and dupefilter keys are derived from the spider and logged
     through ``spider.log``; if the queue already holds requests, a
     "Resuming crawl" message is logged as well.
     """
     self.spider = spider
     self.queue_key = self.get_queue_key(spider)
     self.queue = SpiderQueue(self.redis_cli, spider, self.queue_key)
     dupefilter_key = self.get_dupefilter_key(spider)
     self.df = RFPDupeFilter(self.redis_cli, dupefilter_key)

     # Every log call below goes through the spider, so skip them all
     # when there is none (the resume message used to be unguarded).
     if spider is None:
         return

     spider.log("Queue key of redis (%s)" % self.queue_key, log.INFO)
     spider.log("Dupefilter key of redis (%s)" % dupefilter_key, log.INFO)

     # notice if there are requests already in the queue
     pending = len(self.queue)
     if pending:
         spider.log("Resuming crawl (%d requests scheduled)" % pending)
Exemplo n.º 4
0
class Scheduler(object):
    """Redis-based scheduler.

    Keeps a request queue and a duplicate filter, both created in
    ``open()`` from the Redis client given to the constructor.
    """

    def __init__(self, redis_cli, persist, queue_key):
        """Store the configuration; queue and dupefilter are built in open().

        Args:
            redis_cli: Client object handed to the queue and the dupefilter.
            persist: When falsy, ``close()`` clears the dupefilter and queue.
            queue_key: Initial queue key; ``open()`` replaces it with the
                per-spider key returned by ``get_queue_key()``.
        """
        self.redis_cli = redis_cli
        self.persist = persist
        self.queue_key = queue_key
        # Defined up front so __len__() and close() are safe before open().
        self.spider = None
        self.queue = None
        self.df = None

    def __len__(self):
        """Return the number of queued requests (0 before ``open()``)."""
        if self.queue is None:
            return 0
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from a settings object.

        Reads ``SCHEDULER_PERSIST`` and ``SCHEDULER_QUEUE_KEY`` from
        ``settings``, falling back to the module-level defaults, and uses
        ``CyeRedis.getInstance()`` as the Redis client.
        """
        redis_cli = CyeRedis.getInstance()
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        return cls(redis_cli, persist, queue_key)

    def open(self, spider):
        """Attach ``spider`` and create its queue and dupefilter.

        The queue and dupefilter keys are derived from the spider and logged
        through ``spider.log``; if the queue already holds requests, a
        "Resuming crawl" message is logged as well.
        """
        self.spider = spider
        self.queue_key = self.get_queue_key(spider)
        self.queue = SpiderQueue(self.redis_cli, spider, self.queue_key)
        dupefilter_key = self.get_dupefilter_key(spider)
        self.df = RFPDupeFilter(self.redis_cli, dupefilter_key)

        # Every log call below goes through the spider, so skip them all
        # when there is none (the resume message used to be unguarded).
        if spider is None:
            return

        spider.log("Queue key of redis (%s)" % self.queue_key, log.INFO)
        spider.log("Dupefilter key of redis (%s)" % dupefilter_key, log.INFO)

        # notice if there are requests already in the queue
        pending = len(self.queue)
        if pending:
            spider.log("Resuming crawl (%d requests scheduled)" % pending)

    def close(self, reason):
        """Clear the dupefilter and the queue unless ``persist`` is set.

        ``reason`` is accepted for interface compatibility and is not used.
        """
        if self.persist:
            return
        # Either attribute is still None when close() runs without open().
        if self.df is not None:
            self.df.clear()
        if self.queue is not None:
            self.queue.clear()

    def enqueue_request(self, request):
        """Push ``request`` onto the queue unless the dupefilter has seen it.

        Requests with ``dont_filter`` set bypass the dupefilter check.
        """
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return whatever the queue's ``pop()`` yields."""
        return self.queue.pop()

    def has_pending_requests(self):
        """Return True when at least one request is queued."""
        return len(self) > 0

    def get_queue_key(self, spider):
        """Return the redis queue key for the given spider, as ``str``."""
        qkey = QUEUE_KEY % {
            'namespace': spider.namespace,
            'spider': spider.name,
        }
        return str(qkey)

    def get_dupefilter_key(self, spider):
        """Return the redis dupefilter key for the given spider."""
        return DUPEFILTER_KEY % {
            'namespace': spider.namespace,
            'spider': spider.name,
        }