Example #1
import sys
import argparse
import traceback


class RedisFeed(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):

        self.name = "redis_feed"
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.success_rate = 0, 0, 0

        # Pick the project's custom redis client or the standard redis-py client.
        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    @classmethod
    def parse_args(cls):

        parser = argparse.ArgumentParser(description="usage: %prog [options]")
        parser.add_argument('-rh',
                            "--redis-host",
                            dest="host",
                            type=str,
                            default="127.0.0.1",
                            help="Redis host to feed in. ")
        parser.add_argument('-rp',
                            "--redis-port",
                            dest="port",
                            type=int,
                            default=6379,
                            help="Redis port to feed in. ")
        parser.add_argument('-u',
                            '--url',
                            type=str,
                            help="The url to crawl, a list of products. ")
        parser.add_argument('-uf',
                            '--urls-file',
                            type=str,
                            help="The urlsfile to crawl, single product. ")
        parser.add_argument('-c',
                            '--crawlid',
                            required=True,
                            type=str,
                            help="An unique Id for a crawl task. ")
        parser.add_argument('-s',
                            '--spiderid',
                            required=True,
                            type=str,
                            help="The website you wanna crawl. ")
        parser.add_argument('-p',
                            '--priority',
                            type=int,
                            default=100,
                            help="Feed in the task queue with priority. ")
        parser.add_argument('--custom',
                            action="store_true",
                            help="Use the custom redis whether or not. ")
        return cls(**vars(parser.parse_args()))

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)

        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # Item crawl: feed one parse_item request per url in the file.
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse_item", "priority":%s}' % (
                        url.strip("\357\273\277\r\n"), self.crawlid,
                        self.spiderid, self.priority)
                    self.failed_count += self.feed(self.get_name(), json_req)
                    success_rate, failed_rate = self.show_process_line(
                        lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
                # Let the crawl bookkeeping expire after two days.
                self.redis_conn.expire("crawlid:%s" % self.crawlid,
                                       2 * 24 * 60 * 60)
        # Category crawl: feed the urls passed on the command line.
        else:
            # Multiple urls are expected to be separated by five spaces.
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                json_req = '{"url":"%s","crawlid":"%s","spiderid":"%s","callback":"parse","priority":%s}' % (
                    url.strip(),
                    self.crawlid,
                    self.spiderid,
                    self.priority,
                )
                self.failed_count += self.feed(self.get_name(), json_req)
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (sucess_rate, failed_rate))

    def get_name(self):
        return "{sid}:item:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):

        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError

        try:
            # redis-py < 3.0 zadd signature: zadd(name, value, score); the
            # priority is negated so higher priorities sort (and pop) first.
            # redis-py >= 3.0 would be zadd(queue_name, {req: -self.priority}).
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):

        per = count / 100  # redraw step: roughly every 1% of the total
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc:
            self.inc += per

            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, end="")
                # ANSI escapes: green background for the success portion,
                # red background for the failed portion.
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)

        return success_rate, failed_rate
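
A minimal usage sketch for the feeder above, assuming the class lives in a
script run from the command line (the file name, crawl id, and spider id
below are made up for illustration):

# Hypothetical entry point for RedisFeed.
if __name__ == "__main__":
    feeder = RedisFeed.parse_args()
    feeder.start()

# Example invocation:
#   python redis_feed.py -c crawl-001 -s demo_spider -uf urls.txt -p 120
# Each line of urls.txt becomes one member of demo_spider:item:queue, e.g.:
#   {"url":"http://example.com/p/1","crawlid":"crawl-001","spiderid":"demo_spider","callback":"parse_item", "priority":120}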
Example #2
import json
import random

from scrapy.http import Request

# Project-local helpers assumed importable from the surrounding package:
# Logger, P22P3Encoder, parse_cookie, enqueue_request_method_wrapper,
# next_request_method_wrapper.


class Scheduler(Logger):
    # The item currently being processed; used when handling exceptions.
    present_item = None

    def __init__(self, crawler):

        self.settings = crawler.settings
        self.set_logger(crawler)
        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis
        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:*:queue"
        self.queues = {}

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open(self, spider):

        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):

        # Scrapy header keys are bytes; decode them so the dict serializes to JSON.
        headers = dict([(item[0].decode("ascii"), item[1])
                        for item in request.headers.items()])
        req_dict = {
            'url': request.url,
            'method': request.method,
            'headers': headers,
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'dont_filter': request.dont_filter,
            'callback': None if request.callback is None else request.callback.__name__,
            'errback': None if request.errback is None else request.errback.__name__,
        }
        return req_dict

    @enqueue_request_method_wrapper
    def enqueue_request(self, request):

        req_dict = self.request_to_dict(request)
        key = "{sid}:item:queue".format(sid=req_dict['meta']['spiderid'])
        self.redis_conn.zadd(key, json.dumps(req_dict, cls=P22P3Encoder),
                             -int(req_dict["meta"]["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue".format(
            id=req_dict['meta']['crawlid'], url=req_dict['url']))

    @next_request_method_wrapper
    def next_request(self):

        queues = self.redis_conn.keys(self.queue_name)

        if queues:
            queue = random.choice(queues)
            self.logger.info("length of queue %s is %s" %
                             (queue, self.redis_conn.zcard(queue)))

            item = None
            if self.settings.get("CUSTOM_REDIS"):
                item = self.redis_conn.zpop(queue)
            else:
                # Atomically pop the highest-priority member: read and remove
                # rank 0 inside one MULTI/EXEC transaction.
                pipe = self.redis_conn.pipeline()
                pipe.multi()
                pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
                result, count = pipe.execute()
                # Added in 1.1.8: guard against an empty queue.
                if result:
                    item = result[0]

            if item:
                item = json.loads(item)
                self.present_item = item
                headers = item.get("headers", {})
                body = item.get("body")
                if item.get("method"):
                    method = item.get("method")
                else:
                    method = "GET"

                try:
                    req = Request(item['url'],
                                  method=method,
                                  body=body,
                                  headers=headers)
                except ValueError:
                    # Request() rejects urls without a scheme; retry with http://.
                    req = Request('http://' + item['url'],
                                  method=method,
                                  body=body,
                                  headers=headers)

                if 'callback' in item:
                    cb = item['callback']
                    if cb and self.spider:
                        cb = getattr(self.spider, cb)
                        req.callback = cb

                if 'errback' in item:
                    eb = item['errback']
                    if eb and self.spider:
                        eb = getattr(self.spider, eb)
                        req.errback = eb

                if 'meta' in item:
                    # From here on, operate on the request's meta dict.
                    item = item['meta']

                # defaults not in schema
                if 'curdepth' not in item:
                    item['curdepth'] = 0

                if "retry_times" not in item:
                    item['retry_times'] = 0

                for key in item.keys():
                    req.meta[key] = item[key]

                if 'useragent' in item and item['useragent'] is not None:
                    req.headers['User-Agent'] = item['useragent']

                if 'cookie' in item and item['cookie'] is not None:
                    if isinstance(item['cookie'], dict):
                        req.cookies = item['cookie']
                    elif isinstance(item['cookie'], (str, bytes)):
                        req.cookies = parse_cookie(item['cookie'])

                return req

    def close(self, reason):
        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):
        return False
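
The pipeline trick in next_request is worth isolating: zrange plus
zremrangebyrank inside one MULTI/EXEC transaction acts as an atomic pop of
the lowest-scored member (the highest priority, since scores are negated) on
Redis versions that predate ZPOPMIN. A standalone sketch, assuming a local
redis and redis-py >= 3.0 (hence the mapping-style zadd):

import json

from redis import Redis

conn = Redis("127.0.0.1", 6379)

# Lower score pops first, so priorities are negated on the way in.
conn.zadd("demo:item:queue",
          {json.dumps({"url": "http://example.com/a"}): -100,
           json.dumps({"url": "http://example.com/b"}): -10})

# Atomic pop: read rank 0 and remove it inside one transaction.
pipe = conn.pipeline()
pipe.multi()
pipe.zrange("demo:item:queue", 0, 0).zremrangebyrank("demo:item:queue", 0, 0)
result, count = pipe.execute()
if result:
    print(json.loads(result[0]))  # {'url': 'http://example.com/a'}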
Example #3
import json
import random

import tldextract
from scrapy.http import Request

# Python 2 variant of the scheduler above: note func_name and basestring
# below. Project-local helpers (Logger, parse_cookie) are assumed importable.


class Scheduler(Logger):

    def __init__(self, crawler):

        self.settings = crawler.settings
        self.set_logger(crawler)

        if self.settings.get("CUSTOM_REDIS"):
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(self.settings.get("REDIS_HOST"),
                                self.settings.get("REDIS_PORT"))
        self.queue_name = "%s:*:queue"
        self.queues = {}
        self.extract = tldextract.extract

    @classmethod
    def from_crawler(cls, crawler):

        return cls(crawler)

    def open(self, spider):

        self.spider = spider
        self.queue_name = self.queue_name % spider.name
        spider.set_redis(self.redis_conn)
        spider.set_logger(self.logger)

    def request_to_dict(self, request):

        req_dict = {
            'url': request.url.decode('ascii'),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            'callback': None if request.callback is None else request.callback.func_name,
            'errback': None if request.errback is None else request.errback.func_name,
        }
        return req_dict

    def enqueue_request(self, request):

        req_dict = self.request_to_dict(request)
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)
        self.redis_conn.zadd(key, json.dumps(req_dict), -int(req_dict["priority"]))
        self.logger.debug("Crawlid: '{id}' Url: '{url}' added to queue"
                          .format(id=req_dict['meta']['crawlid'],
                                  url=req_dict['url']))

    def next_request(self):

        queues = self.redis_conn.keys(self.queue_name)

        if queues:
            queue = random.choice(queues)
            self.logger.info("length of queue %s is %s" %
                             (queue, self.redis_conn.zcard(queue)))

            if self.settings.get("CUSTOM_REDIS"):
                item = self.redis_conn.zpop(queue)
            else:
                pipe = self.redis_conn.pipeline()
                pipe.multi()
                pipe.zrange(queue, 0, 0).zremrangebyrank(queue, 0, 0)
                result, count = pipe.execute()
                # Guard against a queue emptied by a concurrent consumer.
                item = result[0] if result else None

            if item:
                item = json.loads(item)

                try:
                    req = Request(item['url'])
                except ValueError:
                    req = Request('http://' + item['url'])

                if 'callback' in item:
                    cb = item['callback']
                    if cb and self.spider:
                        cb = getattr(self.spider, cb)
                        req.callback = cb

                if 'errback' in item:
                    eb = item['errback']
                    if eb and self.spider:
                        eb = getattr(self.spider, eb)
                        req.errback = eb

                if 'meta' in item:
                    item = item['meta']

                # defaults not in schema
                if 'curdepth' not in item:
                    item['curdepth'] = 0

                if "retry_times" not in item:
                    item['retry_times'] = 0

                for key in item.keys():
                    req.meta[key] = item[key]

                if 'useragent' in item and item['useragent'] is not None:
                    req.headers['User-Agent'] = item['useragent']

                if 'cookie' in item and item['cookie'] is not None:
                    if isinstance(item['cookie'], dict):
                        req.cookies = item['cookie']
                    elif isinstance(item['cookie'], basestring):
                        req.cookies = parse_cookie(item['cookie'])
                return req


    def close(self, reason):

        self.logger.info("Closing Spider", {'spiderid': self.spider.name})

    def has_pending_requests(self):

        return False
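
The only functional difference from the previous scheduler is the queue key:
tldextract splits each url into subdomain, domain, and suffix, so requests
are bucketed per registered domain. A quick sketch of that behavior (the
spider id is made up):

import tldextract

ex_res = tldextract.extract("http://shop.example.co.uk/item/42")
# ex_res.subdomain == 'shop', ex_res.domain == 'example', ex_res.suffix == 'co.uk'

key = "{sid}:{dom}.{suf}:queue".format(
    sid="demo_spider", dom=ex_res.domain, suf=ex_res.suffix)
print(key)  # demo_spider:example.co.uk:queue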
Example #4
import sys
import pickle
import traceback

# Request is assumed to be a picklable request type from the surrounding
# project (note the string callback passed below).


class SpiderFeeder(object):
    def __init__(self, crawlid, spiderid, url, urls_file, priority, port, host,
                 custom):
        self.crawlid = crawlid
        self.spiderid = spiderid
        self.url = url
        self.urls_file = urls_file
        self.priority = priority
        self.port = port
        self.host = host
        self.custom = custom
        self.inc = 0
        self.failed_count, self.failed_rate, self.success_rate = 0, 0, 0

        if self.custom:
            from custom_redis.client import Redis
        else:
            from redis import Redis

        self.redis_conn = Redis(host=self.host, port=self.port)
        self.clean_previous_task(self.crawlid)

    def clean_previous_task(self, crawlid):
        failed_keys = self.redis_conn.keys("failed_download_*:%s" % crawlid)
        for fk in failed_keys:
            self.redis_conn.delete(fk)
        self.redis_conn.delete("crawlid:%s" % crawlid)
        self.redis_conn.delete("crawlid:%s:model" % crawlid)

    def start(self):
        success_rate, failed_rate = 0, 0
        # Item crawl: feed one pickled request per url in the file.
        if self.urls_file:
            with open(self.urls_file) as f:
                lst = f.readlines()
                lines_count = len(lst)
                for index, url in enumerate(lst):
                    req = Request(url=url.strip("\357\273\277\r\n"),
                                  callback="parse_item",
                                  meta={
                                      "crawlid": self.crawlid,
                                      "spiderid": self.spiderid,
                                      "priority": self.priority
                                  })
                    self.failed_count += self.feed(self.get_name(),
                                                   pickle.dumps(req))
                    success_rate, failed_rate = \
                        self.show_process_line(
                            lines_count, index + 1, self.failed_count)
                self.redis_conn.hset("crawlid:%s" % self.crawlid,
                                     "total_pages", lines_count)
        # Category crawl: feed the urls passed on the command line.
        else:
            url_list = self.url.split("     ")
            lines_count = len(url_list)

            for index, url in enumerate(url_list):
                req = Request(url=url.strip(),
                              callback="parse",
                              meta={
                                  "crawlid": self.crawlid,
                                  "spiderid": self.spiderid,
                                  "priority": self.priority
                              })
                self.failed_count += self.feed(self.get_name(),
                                               pickle.dumps(req))
                success_rate, failed_rate = self.show_process_line(
                    lines_count, index + 1, self.failed_count)
        print("\ntask feed complete. sucess_rate:%s%%, failed_rate:%s%%" %
              (success_rate, failed_rate))

    def get_name(self):
        return "{sid}:request:queue".format(sid=self.spiderid)

    def feed(self, queue_name, req):
        if self.custom:
            from custom_redis.client.errors import RedisError
        else:
            from redis import RedisError
        try:
            self.redis_conn.zadd(queue_name, req, -self.priority)
            return 0
        except RedisError:
            traceback.print_exc()
            return 1

    def show_process_line(self, count, num, failed):
        per = count / 100
        success = num - failed
        success_rate = success * 100.0 / count
        failed_rate = failed * 100.0 / count
        str_success_rate = "%.2f%%  " % success_rate
        str_failed_rate = "%.2f%%  " % failed_rate

        if num >= self.inc:
            self.inc += per
            if sys.platform == "win32":
                import ctypes
                std_out_handle = ctypes.windll.kernel32.GetStdHandle(-11)
                color_ctl = ctypes.windll.kernel32.SetConsoleTextAttribute
                color_ctl(std_out_handle, 2)
                print("\r", str_success_rate, "")
                color_ctl(std_out_handle, 32)
                print(int(success_rate * 30 / 100) * ' ', "")
                if int(failed_rate):
                    color_ctl(std_out_handle, 64)
                    print(int(failed_rate * 30 / 100) * ' ', "")
                color_ctl(std_out_handle, 0)
                color_ctl(std_out_handle, 4)
                print(str_failed_rate, "")
                color_ctl(std_out_handle, 7)
            else:
                print("\r", str_success_rate, end="")
                print(
                    "%s%s" %
                    (int(success_rate * 50 / 100) * '\033[42m \033[0m',
                     int(failed_rate * 50 / 100) * '\033[41m \033[0m'),
                    str_failed_rate)
        return success_rate, failed_rate
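
Unlike the JSON payloads of example #1, this feeder pickles whole request
objects. A minimal sketch of the round trip the consumer would perform,
using an illustrative stand-in class rather than the project's actual
Request type:

import pickle

class DemoRequest(object):
    # Illustrative stand-in for the project's picklable request type.
    def __init__(self, url, callback, meta):
        self.url, self.callback, self.meta = url, callback, meta

req = DemoRequest(url="http://example.com/p/1",
                  callback="parse_item",
                  meta={"crawlid": "crawl-001", "priority": 100})

payload = pickle.dumps(req)       # what feed() pushes into the zset
restored = pickle.loads(payload)  # what the scheduler pops back out
print(restored.url, restored.callback)  # http://example.com/p/1 parse_item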