Example #1
File: spider.py  Project: Catcherman/ark
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    def __init__(self,
                 concurrent_num=10,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=['send_url_to_celery'],
                 depth=10,
                 max_url_num=3000000,
                 internal_timeout=20,
                 spider_timeout=6 * 3600,
                 crawler_mode=1,
                 same_origin=True,
                 dynamic_parse=True):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(
            maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(
            maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'a', 'base', 'iframe', 'frame', 'object', 'frameset'
        ]
        self.ignore_ext = [
            'cab', 'ico', 'swf', 'rar', 'zip', 'tar', 'gz', 'js', '7z', 'bz2',
            'iso', 'nrg', 'uif', 'exe', 'rpm', 'deb', 'dmg', 'jar', 'jad',
            'bin', 'apk', 'run', 'msi', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf',
            'doc', 'docx', 'odf', 'rtf', 'odt', 'mkv', 'avi', 'mp4', 'flv',
            'WebM', 'mov', 'wmv', '3gp', 'mpg', 'mpeg', 'mp3', 'wav', 'ss3',
            'ogg', 'mp4a', 'wma', 'png', 'jpeg', 'jpg', 'xpm', 'gif', 'tiff',
            'css', 'bmp', 'svg', 'exif', 'thmx', 'xml', 'txt'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the crawler
        self.custom_headers = {'User-Agent': random.choice(USER_AGENTS)}
        self.custom_headers.update(custom_headers)  # merge caller-supplied headers so the argument is not silently dropped

    def _start_fetcher(self):
        '''
        Start the fetchers (downloaders).
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawlers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point: start the workers and wait for the spider to finish.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait until the stop event is set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(
            self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    # URL similarity check; see urlfilter.py
                    if not self.check_url_similar(link):
                        continue
                    # URL duplicate check; see urlfilter.py
                    if not self.check_url_repeat(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # max collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
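
A minimal usage sketch for this variant. It assumes the surrounding module exposes a feed_url() seeding method (referenced in the comments of Example #4) and that the Spider's dependencies (Fetcher, UrlCache, WebKit, the plugin classes) are importable from the same package; treat it as an illustration, not the project's actual driver code.

    # Hypothetical driver; feed_url() and the constructor arguments shown here
    # are assumptions based on the examples in this listing.
    if __name__ == "__main__":
        spider = Spider(concurrent_num=10,
                        depth=5,
                        crawler_mode=1,         # gevent model
                        dynamic_parse=False)    # skip WebKit parsing
        spider.feed_url("http://example.com/")  # seed the crawl
        spider.start()                          # blocks until the stop event is set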
Example #2
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

    def _start_fetcher(self):
        '''
        Start the fetchers (downloaders).
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawlers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point: start the workers and wait for the spider to finish.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait until the stop event is set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # max collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)
                self.crawler_queue.task_done()
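
All five variants hand work back and forth through a fetcher queue (URLs to download) and a crawler queue (downloaded pages to parse). The following self-contained sketch shows that two-queue handoff in isolation using gevent primitives only; the queue names mirror the examples, but the worker bodies are placeholders rather than the projects' Fetcher/HtmlAnalyzer code.

    # Illustrative only: a stripped-down fetcher/crawler pair linked by two queues.
    import gevent
    from gevent import pool
    from gevent.queue import JoinableQueue

    fetcher_queue = JoinableQueue()   # URLs waiting to be downloaded
    crawler_queue = JoinableQueue()   # downloaded pages waiting to be parsed

    def fetcher():
        while True:
            url = fetcher_queue.get()
            try:
                crawler_queue.put((url, "<html>...</html>"))  # stand-in for a real download
            finally:
                fetcher_queue.task_done()

    def crawler():
        while True:
            url, html = crawler_queue.get()
            try:
                pass  # extract links from html here and push new URLs into fetcher_queue
            finally:
                crawler_queue.task_done()

    workers = pool.Pool(4)
    workers.spawn(fetcher)
    workers.spawn(crawler)
    fetcher_queue.put("http://example.com/")
    fetcher_queue.join()
    crawler_queue.join()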
Example #3
File: vulcan.py  Project: kjabko/python
class Spider(object):
    """爬虫主类"""

    logger = logging.getLogger("spider")

    def __init__(
        self,
        concurrent_num=20,
        crawl_tags=[],
        depth=3,
        max_url_num=300,
        internal_timeout=60,
        spider_timeout=6 * 3600,
        crawler_mode=0,
        same_origin=True,
        dynamic_parse=False,
    ):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ["a", "base", "iframe", "frame", "object"]
        self.ignore_ext = ["js", "css", "png", "jpg", "gif", "bmp", "svg", "exif", "jpeg", "exe", "rar", "zip"]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

    def _start_fetcher(self):
        """
        启动下载器
        """
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        """
        启动爬取器
        """
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        """
        主启动函数
        """
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait until the stop event is set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        """
        爬行器主函数
        """
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # max collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)
                self.crawler_queue.task_done()
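
Each start() method arms a gevent Timeout so that joining the worker pools cannot hang forever. The pattern is easy to get wrong (the timer must be cancelled in a finally block or it will fire later in an unrelated spot), so here it is on its own, independent of this project; gevent.sleep(5) simply stands in for the pool joins.

    # Minimal illustration of the gevent.Timeout guard used in start().
    import gevent
    from gevent import Timeout

    timer = Timeout(2)        # raise Timeout in the current greenlet after 2 seconds
    timer.start()
    try:
        gevent.sleep(5)       # stand-in for fetcher_pool.join() / crawler_pool.join()
    except Timeout:
        print("internal timeout triggered")
    finally:
        timer.cancel()        # always disarm the timer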
Example #4
File: vulcan.py  Project: Flygend/vulcan
class Spider(object):
    """爬虫主类"""
    logger = logging.getLogger("spider")

    #   Essentially the constructor
    def __init__(self,
                 concurrent_num=20,
                 crawl_tags=[],
                 custom_headers={},
                 plugin=[],
                 depth=3,
                 max_url_num=300,
                 internal_timeout=60,
                 spider_timeout=6 * 3600,
                 crawler_mode=0,
                 same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
        """

        #   logging setup
        self.logger.setLevel(logging.DEBUG)  #   log level
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")  #   log format
        hd = logging.StreamHandler()
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout  #   timeout for internal calls
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  #   crawler model
        self.concurrent_num = concurrent_num  #   number of concurrent crawlers and fetchers

        #   the fetchers always use the gevent model
        self.fetcher_pool = pool.Pool(self.concurrent_num)

        #   crawler model setup
        #   the crawler parses URLs out of HTML and feeds them to the fetcher;
        #   the fetcher downloads HTML and feeds it back to the crawler
        if self.crawler_mode == 0:
            #   thread pool model
            self.crawler_pool = threadpool.ThreadPool(
                min(50, self.concurrent_num))
        else:
            #   gevent model
            self.crawler_pool = pool.Pool(self.concurrent_num)

        #   fetcher and crawler work independently of each other, linked only through queues
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num *
                                              10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = [
            'a', 'base', 'iframe', 'frame', 'object'
        ]  #   default list of tags whose URLs are collected while crawling
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]  #   URL extensions ignored while crawling
        self.crawl_tags = list(
            set(self.default_crawl_tags) | set(crawl_tags))  #   tags actually collected (defaults plus custom)
        self.same_origin = same_origin  #   restrict to the same origin
        self.depth = depth  #   crawl depth limit
        self.max_url_num = max_url_num  #   maximum number of URLs to collect
        self.dynamic_parse = dynamic_parse  #   whether to use WebKit dynamic parsing

        #   if dynamic parsing is enabled
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the crawler
        #   custom HTTP headers
        self.custom_headers = custom_headers

    def _start_fetcher(self):
        '''
        Start the fetchers (downloaders).
        '''
        for i in xrange(self.concurrent_num):  #   concurrent_num: degree of concurrency
            fetcher = Fetcher(self)  #   instantiate a fetcher
            self.fetcher_pool.start(fetcher)  #   start() kicks off _run()

    def _start_crawler(self):
        '''
        Start the crawlers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)  #   spawn the self.crawler() function

    def start(self):
        '''
        Main entry point: start the workers and wait for the spider to finish.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        #   start the fetchers and crawlers
        self._start_fetcher()  #   the initial crawl URL was already seeded in main via spider.feed_url(url)
        self._start_crawler()

        self.stopped.wait()  # wait until the stop event is set

        #   wait for the fetchers and crawlers to finish
        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)  #   take one URL entry from the crawler queue
            except queue.Empty, e:  #   queue is empty
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:  #   everything has been processed
                    self.stop()
                else:  #   the fetchers still have unfinished work
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1  #   current depth

                #   build the URL list
                link_generator = HtmlAnalyzer.extract_links(
                    url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:  #   WebKit dynamic parsing
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))  #   deduplicate

                #   iterate over the extracted URL list
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # max collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    #   order adjusted here: enqueue for download first
                    self.fetcher_queue.put(url, block=True)
                    self.fetcher_cache.insert(url)  #   then record it in the fetcher cache

                #   plugin section (not the focus here)
                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
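
The plugin convention used in Examples #1 and #4 is name-based: each entry in the plugin list is looked up with eval(plugin_name)() and the resulting object gets start(url_data) called once per crawled page. Below is a hedged sketch of a class that would fit this convention; the name send_url_to_celery mirrors Example #1's default plugin list, but the body is purely illustrative and not the projects' actual plugin code.

    # Hypothetical plugin: any class whose name appears in the plugin list and
    # that exposes start(url_data) satisfies the convention above.
    class send_url_to_celery(object):
        def start(self, url_data):
            # url_data carries .url, .html and .depth, as used by crawler() above
            print("would hand off %s for asynchronous processing" % url_data.url)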
Example #5
File: spider.py  Project: BoyceYang/wsbs
class Spider(object):
    """
        concurrent_num    : 并行crawler和fetcher数量
        crawl_tags        : 爬行时收集URL所属标签列表
        custom_headers    : 自定义HTTP请求头
        plugin            : 自定义插件列表
        depth             : 爬行深度限制
        max_url_num       : 最大收集URL数量
        internal_timeout  : 内部调用超时时间
        spider_timeout    : 爬虫超时时间
        crawler_mode      : 爬取器模型(0:多线程模型,1:gevent模型)
        same_origin       : 是否限制相同域下
        dynamic_parse     : 是否使用WebKit动态解析
    """
    def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={}, plugin=[], depth=3, max_url_num=300,
                 internal_timeout=60, spider_timeout=6*3600, crawler_mode=0, same_origin=True, dynamic_parse=False,
                 spider_type='img'):
        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode             # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)                  # number of concurrent fetcher workers

        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # self.crawler_pool = []
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num*10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num*10000)

        self.fetcher_cache = UrlCache()                     # URL cache setup
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = ['js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg', 'exe', 'rar', 'zip']

        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))       # union of the two sets
        self.same_origin = same_origin
        self.depth = depth                              # crawl depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()
        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin   # register the plugins used by the crawler
        self.custom_headers = custom_headers
        self.unspider_url_list = []
        self.spider_type = spider_type

    def _start_fetcher(self):
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)
        # self.crawler_pool = [gevent.spawn(self.crawler) for _ in xrange(self.concurrent_num)]

    def start(self):
        logging.info("spider starting...")

        if self.crawler_mode == 0:
            logging.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            logging.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()      # wait until the stop event is set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)

            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()

        except Timeout:
            logging.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()

        unspider_url_list = list(set(self.unspider_url_list))          # deduplicate the list of unvisited URLs
        unspider_url_list.sort(key=self.unspider_url_list.index)
        redis_key = IMG_UNSPIDER_URL_KEY         # redis_key selects the type of data the spider stores

        try:
            for url_link in unspider_url_list:
                with global_redis.pipeline() as pipe:
                    pipe.lpush(redis_key, url_link).ltrim(redis_key, 0, 100).expire(redis_key, 72000).execute()
        except Exception:
            logging.error("store unspider url error!!")

        logging.info("crawler_cache:%s fetcher_cache:%s" % (len(self.crawler_cache), len(self.fetcher_cache)))
        logging.info("spider process quit.")

    def crawler(self, _dep=None):
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()         # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth+1
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]

                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:     # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:   # max collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:     # dynamically invoke each plugin registered at init
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
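
None of the examples include check_url_usable(), check_url_similar() or check_url_repeat(), so how the ignore_ext list is applied is not shown. A plausible extension filter, written as an assumption rather than any project's actual code, might look like this:

    # Hypothetical helper: decide whether a link's extension is on the ignore list.
    from urlparse import urlparse   # Python 2; use urllib.parse on Python 3

    def has_ignored_extension(link, ignore_ext):
        path = urlparse(link).path
        ext = path.rsplit('.', 1)[-1].lower() if '.' in path else ''
        # Lowercase both sides so mixed-case entries such as 'WebM' still match.
        return ext in set(e.lower() for e in ignore_ext)

    # e.g. has_ignored_extension("http://example.com/a/logo.PNG", ['png', 'jpg']) -> True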