Example #1
    def open(self, spider):
        self.spider = spider
        self.setup()
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        self.spider.name + ':dupefilter', self.rfp_timeout)
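
This open() hook builds the per-spider dupefilter under the Redis key '<spider.name>:dupefilter'. Below is a minimal sketch of how that dupefilter is then consulted, using only the constructor and methods that appear in these examples; the connection details, key name, and import path are illustrative assumptions.

import redis
from scrapy.http import Request

from dupefilter import RFPDupeFilter  # import path is an assumption; adjust to your project

redis_conn = redis.Redis(host='localhost', port=6379)
dupefilter = RFPDupeFilter(redis_conn, 'linkspider:dupefilter', 600)  # illustrative key and timeout

req = Request('http://example.com/page')
if not dupefilter.request_seen(req):   # first time this request fingerprint is seen
    print('schedule it')
else:
    print('drop the duplicate')

dupefilter.clear()                     # wipe the fingerprints, as close() does when not persisting
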
Example #2
# imports needed to run this example
import random
import time

import redis
from scrapy.http import Request

# Redis-backed helpers defined elsewhere in the project; these import
# paths are assumptions -- adjust them to your own module layout
from dupefilter import RFPDupeFilter
from redis_queue import RedisPriorityQueue


class DistributedScheduler(object):
    '''
    Scrapy request scheduler that utilizes Priority Queues
    to moderate scrape requests within a distributed scrapy
    cluster
    '''
    redis_conn = None  # the redis connection
    queue = None  # the queue to use for crawling
    spider = None  # the spider using this scheduler
    queue_class = None  # the class to use for the queue
    dupefilter = None  # the redis dupefilter
    item_retries = 0  # the number of extra tries to get an item

    def __init__(self, server, persist, timeout, retries):
        '''
        Initialize the scheduler
        '''
        self.redis_conn = server
        self.persist = persist
        self.rfp_timeout = timeout
        self.item_retries = retries

    def setup(self):
        '''
        Deferred initialization, called from open(); spider.name
        is not set at __init__ time, and keeping this separate
        also makes it easy to mock out in tests
        '''
        self.queue = RedisPriorityQueue(self.redis_conn,
                                        self.spider.name + ":queue")

    @classmethod
    def from_settings(cls, settings):
        server = redis.Redis(host=settings.get('REDIS_HOST'),
                             port=settings.get('REDIS_PORT'))
        persist = settings.get('SCHEDULER_PERSIST', True)
        timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
        retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)

        return cls(server, persist, timeout, retries)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        self.spider = spider
        self.setup()
        self.dupefilter = RFPDupeFilter(self.redis_conn,
                                        self.spider.name + ':dupefilter', self.rfp_timeout)

    def close(self, reason):
        if not self.persist:
            self.dupefilter.clear()
            self.queue.clear()

    def is_blacklisted(self, appid, crawlid):
        '''
        Checks the redis blacklist for crawls that should not be propagated,
        either because they have expired or been stopped
        @return: True if the appid crawlid combo is blacklisted
        '''
        key_check = '{appid}||{crawlid}'.format(appid=appid,
                                                crawlid=crawlid)
        redis_key = self.spider.name + ":blacklist"
        return self.redis_conn.sismember(redis_key, key_check)

    def enqueue_request(self, request):
        '''
        Pushes a request from the spider back into the queue
        '''
        if not request.dont_filter and self.dupefilter.request_seen(request):
            return
        req_dict = self.request_to_dict(request)

        if not self.is_blacklisted(req_dict['meta']['appid'],
                                   req_dict['meta']['crawlid']):
            key = "{sid}:queue".format(sid=req_dict['meta']['spiderid'])
            curr_time = time.time()

            # insert if crawl never expires (0) or time < expires
            if req_dict['meta']['expires'] == 0 or \
                    curr_time < req_dict['meta']['expires']:
                self.queue.push(req_dict, req_dict['meta']['priority'])

    def request_to_dict(self, request):
        '''
        Convert a Request object to a dict;
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
        }
        return req_dict

    def find_item(self):
        '''
        Finds an item from the queue
        '''
        count = 0

        while count <= self.item_retries:
            item = self.queue.pop()
            if item:
                # very basic limiter
                time.sleep(1)
                return item
            # we want the spiders to get slightly out of sync
            # with each other for better performance
            time.sleep(random.random())
            count = count + 1

        return None

    def next_request(self):
        '''
        Logic to handle getting a new url request
        '''

        item = self.find_item()
        if item:
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'meta' in item:
                item = item['meta']

            # defaults
            if "attrs" not in item:
                item["attrs"] = {}
            if "allowed_domains" not in item:
                item["allowed_domains"] = ()
            if "allow_regex" not in item:
                item["allow_regex"] = ()
            if "deny_regex" not in item:
                item["deny_regex"] = ()
            if "deny_extensions" not in item:
                item["deny_extensions"] = None
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "maxdepth" not in item:
                item["maxdepth"] = 0
            if "priority" not in item:
                item['priority'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0
            if "expires" not in item:
                item['expires'] = 0

            for key in ('attrs', 'allowed_domains', 'curdepth', 'maxdepth',
                        'appid', 'crawlid', 'spiderid', 'priority', 'retry_times',
                        'expires', 'allow_regex', 'deny_regex', 'deny_extensions'):
                req.meta[key] = item[key]

            return req

        return None

    def has_pending_requests(self):
        '''
        We never want to say we have pending requests;
        if this returns True, Scrapy sometimes hangs
        '''
        return False
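
To plug this scheduler into a Scrapy project, Scrapy's SCHEDULER setting points at the class and the remaining settings feed from_settings(). A minimal settings.py sketch, assuming the class lives in a module called crawling.distributed_scheduler (adjust the path to your own layout):

# settings.py -- the scheduler module path below is an assumption
SCHEDULER = 'crawling.distributed_scheduler.DistributedScheduler'

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

SCHEDULER_PERSIST = True         # keep the Redis queue and dupefilter between runs
DUPEFILTER_TIMEOUT = 600         # passed to RFPDupeFilter as rfp_timeout
SCHEDULER_ITEM_RETRIES = 3       # extra attempts find_item() makes on an empty queue

A running crawl can also be stopped or expired externally by adding the string '{appid}||{crawlid}' to the '<spider.name>:blacklist' set in Redis (for example with SADD); is_blacklisted() then keeps any further requests for that crawl out of the queue.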