예제 #1
0
    def next_request(self):
        """Pop one queued entry from Redis and turn it into a ``Request``.

        The keys returned by ``self.redis_conn.keys(self.queue_name)`` are
        treated as sorted-set queues; one is picked at random and its rank-0
        member is removed, either through the connection's ``zpop`` (when the
        ``CUSTOM_REDIS`` setting is truthy) or through a MULTI pipeline of
        ``zrange`` + ``zremrangebyrank``. The member is decoded as JSON.

        If the decoded entry has a ``'meta'`` key, that sub-dict (instead of
        the entry itself) supplies the request meta, user agent and cookie.

        Returns:
            The populated ``Request``, or ``None`` when there is no queue key
            or nothing could be popped.
        """
        queues = self.redis_conn.keys(self.queue_name)
        if not queues:
            return None

        queue = random.choice(queues)
        self.logger.info("length of queue %s is %s" %
                         (queue, self.redis_conn.zcard(queue)))

        item = None
        if self.settings.get("CUSTOM_REDIS"):
            item = self.redis_conn.zpop(queue)
        else:
            pipe = self.redis_conn.pipeline()
            pipe.multi()
            pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
            result, _removed = pipe.execute()
            # zrange yields an empty list when the sorted set has no members
            # at execution time; indexing it unconditionally would raise
            # IndexError instead of reporting "nothing to do".
            if result:
                item = result[0]

        if not item:
            return None

        item = json.loads(item)

        try:
            req = Request(item['url'])
        except ValueError:
            # Retry once with an explicit scheme prepended to the stored URL.
            req = Request('http://' + item['url'])

        # Callback / errback are stored as method names and resolved on the
        # spider; they are only attached when a spider is present.
        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                req.callback = getattr(self.spider, cb)

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                req.errback = getattr(self.spider, eb)

        # From here on only the meta payload (when provided) is consulted.
        meta = item['meta'] if 'meta' in item else item

        # defaults not in schema
        if 'curdepth' not in meta:
            meta['curdepth'] = 0

        if "retry_times" not in meta:
            meta['retry_times'] = 0

        req.meta.update(meta)

        if 'useragent' in meta and meta['useragent'] is not None:
            req.headers['User-Agent'] = meta['useragent']

        if 'cookie' in meta and meta['cookie'] is not None:
            cookie = meta['cookie']
            if isinstance(cookie, dict):
                req.cookies = cookie
            # NOTE(review): ``basestring`` only exists on Python 2; this
            # branch raises NameError on Python 3 unless the module defines
            # a compatibility alias - confirm the target interpreter.
            elif isinstance(cookie, basestring):
                req.cookies = parse_cookie(cookie)
        return req
예제 #2
0
    def next_request(self):
        """Build the next ``Request`` from the head of ``self.queue_name``.

        The queue's cardinality is logged on every call. When fewer than
        ``self.request_interval`` seconds have elapsed since
        ``self.last_acs_time`` the queue is left untouched and ``None`` is
        returned. Otherwise the rank-0 member is removed (via ``zpop`` when
        the ``CUSTOM_REDIS`` setting is true, else via a MULTI pipeline of
        ``zrange`` + ``zremrangebyrank``), unpickled, stored in
        ``self.present_item`` and converted into a ``Request``.

        If the unpickled entry has a ``'meta'`` key, that sub-dict (instead
        of the entry itself) supplies the request meta, user agent and cookie.

        NOTE(review): ``pickle.loads`` can execute arbitrary code; this is
        only safe if everything that writes to the queue is trusted - confirm.

        Returns:
            The populated ``Request``, or ``None`` when throttled or when
            nothing was popped.
        """
        queue_key = self.queue_name
        queue_length = self.redis_conn.zcard(queue_key)
        self.logger.info("length of queue %s is %s" % (queue_key, queue_length))

        # Throttle: not enough time has passed since the last successful pop.
        if time.time() - self.request_interval < self.last_acs_time:
            return None

        raw = None
        if self.settings.getbool("CUSTOM_REDIS"):
            raw = self.redis_conn.zpop(queue_key)
        else:
            txn = self.redis_conn.pipeline()
            txn.multi()
            txn.zrange(queue_key, 0, 0).zremrangebyrank(queue_key, 0, 0)
            popped, _removed = txn.execute()
            if popped:
                raw = popped[0]

        if not raw:
            return None

        self.last_acs_time = time.time()
        payload = pickle.loads(raw)
        self.present_item = payload

        request_kwargs = {
            "method": payload.get("method") or "GET",
            "body": payload.get("body"),
            "headers": payload.get("headers", {}),
        }
        try:
            req = Request(payload['url'], **request_kwargs)
        except ValueError:
            # Retry once with an explicit scheme prepended to the stored URL.
            req = Request('http://' + payload['url'], **request_kwargs)

        # Handlers are stored as method names and resolved on the spider;
        # they are only attached when a spider is present.
        for hook in ('callback', 'errback'):
            handler_name = payload.get(hook)
            if handler_name and self.spider:
                setattr(req, hook, getattr(self.spider, handler_name))

        # From here on only the meta payload (when provided) is consulted.
        meta = payload['meta'] if 'meta' in payload else payload

        # defaults not in schema
        for field in ('curdepth', 'retry_times'):
            if field not in meta:
                meta[field] = 0

        req.meta.update(meta)

        user_agent = meta.get('useragent')
        if user_agent is not None:
            req.headers['User-Agent'] = user_agent

        cookie = meta.get('cookie')
        if isinstance(cookie, dict):
            req.cookies = cookie
        elif isinstance(cookie, (str, bytes)):
            req.cookies = parse_cookie(cookie)

        return req