class Spider(object):
    """Main spider class."""

    logger = logging.getLogger("spider")

    def __init__(self, concurrent_num=10, crawl_tags=[], custom_headers={},
                 plugin=['send_url_to_celery'], depth=10, max_url_num=3000000,
                 internal_timeout=20, spider_timeout=6 * 3600, crawler_mode=1,
                 same_origin=True, dynamic_parse=True):
        """
        concurrent_num   : number of crawler and fetcher workers run in parallel
        crawl_tags       : tags from which URLs are collected while crawling
        custom_headers   : custom HTTP request headers
        plugin           : list of custom plugins
        depth            : crawl depth limit
        max_url_num      : maximum number of URLs to collect
        internal_timeout : timeout for internal calls
        spider_timeout   : overall spider timeout
        crawler_mode     : crawler model (0: multi-thread, 1: gevent)
        same_origin      : whether to restrict crawling to the same origin
        dynamic_parse    : whether to use WebKit for dynamic parsing
        """
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]

        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object', 'frameset']
        self.ignore_ext = [
            'cab', 'ico', 'swf', 'rar', 'zip', 'tar', 'gz', 'js', '7z', 'bz2',
            'iso', 'nrg', 'uif', 'exe', 'rpm', 'deb', 'dmg', 'jar', 'jad',
            'bin', 'apk', 'run', 'msi', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf',
            'doc', 'docx', 'odf', 'rtf', 'odt', 'mkv', 'avi', 'mp4', 'flv',
            'WebM', 'mov', 'wmv', '3gp', 'mpg', 'mpeg', 'mp3', 'wav', 'ss3',
            'ogg', 'mp4a', 'wma', 'png', 'jpeg', 'jpg', 'xpm', 'gif', 'tiff',
            'css', 'bmp', 'svg', 'exif', 'thmx', 'xml', 'txt'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()

        self.crawler_stopped = event.Event()
        self.plugin_handler = plugin  # register the plugins used by the crawler
        self.custom_headers = {'User-Agent': random.choice(USER_AGENTS)}

    def _start_fetcher(self):
        '''
        Start the fetcher (downloader) workers.
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawler workers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pools
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1

                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))

                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    # URL similarity check; see urlfilter.py
                    if not self.check_url_similar(link):
                        continue
                    # URL duplicate check; see urlfilter.py
                    if not self.check_url_repeat(link):
                        continue

                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init time
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
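A minimal driver sketch for the variant above. It assumes the Spider and its dependencies (Fetcher, UrlCache, WebKit, HtmlAnalyzer) are importable from the surrounding project; the feed_url() seed helper is only mentioned in a later variant's comments and is treated here as an assumption.

# Hypothetical driver (a sketch, not part of the original source).
if __name__ == '__main__':
    spider = Spider(concurrent_num=10,
                    depth=5,
                    max_url_num=10000,
                    crawler_mode=1,       # gevent model
                    dynamic_parse=False)  # skip WebKit if it is not available
    spider.feed_url("http://example.com/")  # assumed seed helper; a later variant's comments mention spider.feed_url(url) in main
    spider.start()                          # blocks until the stop event is set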
class Spider(object):
    """Main spider class."""

    logger = logging.getLogger("spider")

    def __init__(self, concurrent_num=20, crawl_tags=[], depth=3,
                 max_url_num=300, internal_timeout=60, spider_timeout=6 * 3600,
                 crawler_mode=0, same_origin=True, dynamic_parse=False):
        """
        concurrent_num   : number of crawler and fetcher workers run in parallel
        crawl_tags       : tags from which URLs are collected while crawling
        depth            : crawl depth limit
        max_url_num      : maximum number of URLs to collect
        internal_timeout : timeout for internal calls
        spider_timeout   : overall spider timeout
        crawler_mode     : crawler model (0: multi-thread, 1: gevent)
        same_origin      : whether to restrict crawling to the same origin
        dynamic_parse    : whether to use WebKit for dynamic parsing
        """
        self.logger.setLevel(logging.DEBUG)
        hd = logging.StreamHandler()
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 100)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))
        self.same_origin = same_origin
        self.depth = depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()

        self.crawler_stopped = event.Event()

    def _start_fetcher(self):
        '''
        Start the fetcher (downloader) workers.
        '''
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        '''
        Start the crawler workers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)

    def start(self):
        '''
        Main entry point.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pools
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1

                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))

                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue

                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                self.crawler_queue.task_done()
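The Fetcher side is not shown in any of these listings. Based only on the queue wiring in __init__ (fetcher_queue in, crawler_queue out) and a later variant's comment that the fetcher downloads HTML and feeds it back to the crawler, its worker loop plausibly looks like the sketch below; the class name and the download helper are hypothetical.

# Sketch of the fetcher loop implied by the queue wiring (not the project's actual Fetcher).
class FetcherSketch(object):

    def __init__(self, spider):
        self.spider = spider

    def _run(self):
        while not self.spider.stopped.isSet():
            try:
                url_data = self.spider.fetcher_queue.get(block=False)  # URL queued by the crawler
            except queue.Empty:
                gevent.sleep()
                continue
            url_data.html = self.download(url_data.url)           # hypothetical HTTP fetch
            self.spider.crawler_cache.insert(url_data)            # record it on the crawler side
            self.spider.crawler_queue.put(url_data, block=True)   # hand the HTML back to the crawler
            self.spider.fetcher_queue.task_done()

    def download(self, url):
        raise NotImplementedError  # placeholder; the real fetcher wraps an HTTP client and custom_headers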
class Spider(object):
    """Main spider class."""

    logger = logging.getLogger("spider")

    # effectively the constructor
    def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={},
                 plugin=[], depth=3, max_url_num=300, internal_timeout=60,
                 spider_timeout=6 * 3600, crawler_mode=0, same_origin=True,
                 dynamic_parse=False):
        """
        concurrent_num   : number of crawler and fetcher workers run in parallel
        crawl_tags       : tags from which URLs are collected while crawling
        custom_headers   : custom HTTP request headers
        plugin           : list of custom plugins
        depth            : crawl depth limit
        max_url_num      : maximum number of URLs to collect
        internal_timeout : timeout for internal calls
        spider_timeout   : overall spider timeout
        crawler_mode     : crawler model (0: multi-thread, 1: gevent)
        same_origin      : whether to restrict crawling to the same origin
        dynamic_parse    : whether to use WebKit for dynamic parsing
        """
        # logging setup
        self.logger.setLevel(logging.DEBUG)  # log level
        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")  # log format
        hd = logging.StreamHandler()
        hd.setFormatter(formatter)
        self.logger.addHandler(hd)

        self.stopped = event.Event()
        self.internal_timeout = internal_timeout  # timeout for internal calls
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num  # number of parallel crawler and fetcher workers

        # fetchers use the gevent model
        self.fetcher_pool = pool.Pool(self.concurrent_num)

        # crawler model selection
        # the crawler parses URLs out of the HTML and feeds them to the fetcher;
        # the fetcher downloads the HTML and feeds it back to the crawler
        if self.crawler_mode == 0:
            # thread-pool model
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            # gevent model
            self.crawler_pool = pool.Pool(self.concurrent_num)

        # fetcher and crawler work independently of each other and are linked through queues
        # self.fetcher_queue = queue.JoinableQueue(maxsize=self.concurrent_num*100)
        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']  # default tags from which URLs are collected
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]  # URL extensions ignored while crawling
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))  # tags from which URLs are collected
        self.same_origin = same_origin  # whether to restrict to the same origin
        self.depth = depth  # crawl depth limit
        self.max_url_num = max_url_num  # maximum number of URLs to collect
        self.dynamic_parse = dynamic_parse  # whether to use WebKit for dynamic parsing
        # if dynamic parsing is enabled
        if self.dynamic_parse:
            self.webkit = WebKit()

        self.crawler_stopped = event.Event()

        self.plugin_handler = plugin  # register the plugins used by the crawler
        # custom HTTP headers
        self.custom_headers = custom_headers

    def _start_fetcher(self):
        '''
        Start the fetcher (downloader) workers.
        '''
        for i in xrange(self.concurrent_num):  # concurrent_num: number of parallel workers
            fetcher = Fetcher(self)  # instantiate a fetcher
            self.fetcher_pool.start(fetcher)  # start() kicks off _run()

    def _start_crawler(self):
        '''
        Start the crawler workers.
        '''
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)  # spawn self.crawler()

    def start(self):
        '''
        Main entry point.
        '''
        self.logger.info("spider starting...")

        if self.crawler_mode == 0:
            self.logger.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            self.logger.info("crawler run in gevent mode.")

        # start the fetchers and crawlers
        self._start_fetcher()  # the initial URL has already been seeded via spider.feed_url(url) in main
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        # wait for the fetchers and crawlers to finish
        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            self.logger.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()
        self.logger.info("crawler_cache:%s fetcher_cache:%s" %
                         (len(self.crawler_cache), len(self.fetcher_cache)))
        self.logger.info("spider process quit.")

    def crawler(self, _dep=None):
        '''
        Main crawler loop.
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pools
                url_data = self.crawler_queue.get(block=False)  # take one URL from the crawl queue
            except queue.Empty, e:  # queue is empty
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    # everything has been processed
                    self.stop()
                else:
                    # the fetchers are not finished yet
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1  # current depth

                # build the URL list
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:  # WebKit dynamic parsing
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))  # de-duplicate

                # iterate over the extracted URL list
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue

                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    # order adjusted here: the URL should be put on the download queue first
                    self.fetcher_queue.put(url, block=True)
                    self.fetcher_cache.insert(url)  # then add it to the cache of URLs handed to the fetcher

                # plugin section; not the focus for now
                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init time
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
class Spider(object):
    """
    concurrent_num   : number of crawler and fetcher workers run in parallel
    crawl_tags       : tags from which URLs are collected while crawling
    custom_headers   : custom HTTP request headers
    plugin           : list of custom plugins
    depth            : crawl depth limit
    max_url_num      : maximum number of URLs to collect
    internal_timeout : timeout for internal calls
    spider_timeout   : overall spider timeout
    crawler_mode     : crawler model (0: multi-thread, 1: gevent)
    same_origin      : whether to restrict crawling to the same origin
    dynamic_parse    : whether to use WebKit for dynamic parsing
    """

    def __init__(self, concurrent_num=20, crawl_tags=[], custom_headers={},
                 plugin=[], depth=3, max_url_num=300, internal_timeout=60,
                 spider_timeout=6 * 3600, crawler_mode=0, same_origin=True,
                 dynamic_parse=False, spider_type='img'):
        self.stopped = event.Event()
        self.internal_timeout = internal_timeout
        self.internal_timer = Timeout(internal_timeout)

        self.crawler_mode = crawler_mode  # crawler model
        self.concurrent_num = concurrent_num
        self.fetcher_pool = pool.Pool(self.concurrent_num)  # number of workers to run
        if self.crawler_mode == 0:
            self.crawler_pool = threadpool.ThreadPool(min(50, self.concurrent_num))
        else:
            self.crawler_pool = pool.Pool(self.concurrent_num)
            # self.crawler_pool = []

        self.fetcher_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)
        self.crawler_queue = threadpool.Queue(maxsize=self.concurrent_num * 10000)

        self.fetcher_cache = UrlCache()  # URL caches
        self.crawler_cache = UrlCache()

        self.default_crawl_tags = ['a', 'base', 'iframe', 'frame', 'object']
        self.ignore_ext = [
            'js', 'css', 'png', 'jpg', 'gif', 'bmp', 'svg', 'exif', 'jpeg',
            'exe', 'rar', 'zip'
        ]
        self.crawl_tags = list(set(self.default_crawl_tags) | set(crawl_tags))  # union of the two tag lists
        self.same_origin = same_origin
        self.depth = depth  # crawl depth
        self.max_url_num = max_url_num
        self.dynamic_parse = dynamic_parse
        if self.dynamic_parse:
            self.webkit = WebKit()

        self.crawler_stopped = event.Event()
        self.plugin_handler = plugin  # register the plugins used by the crawler
        self.custom_headers = custom_headers
        self.unspider_url_list = []
        self.spider_type = spider_type

    def _start_fetcher(self):
        for i in xrange(self.concurrent_num):
            fetcher = Fetcher(self)
            self.fetcher_pool.start(fetcher)

    def _start_crawler(self):
        for _ in xrange(self.concurrent_num):
            self.crawler_pool.spawn(self.crawler)
        # self.crawler_pool = [gevent.spawn(self.crawler) for _ in xrange(self.concurrent_num)]

    def start(self):
        logging.info("spider starting...")

        if self.crawler_mode == 0:
            logging.info("crawler run in multi-thread mode.")
        elif self.crawler_mode == 1:
            logging.info("crawler run in gevent mode.")

        self._start_fetcher()
        self._start_crawler()

        self.stopped.wait()  # wait for the stop event to be set

        try:
            self.internal_timer.start()
            self.fetcher_pool.join(timeout=self.internal_timer)
            if self.crawler_mode == 1:
                self.crawler_pool.join(timeout=self.internal_timer)
            else:
                self.crawler_pool.join()
        except Timeout:
            logging.error("internal timeout triggered")
        finally:
            self.internal_timer.cancel()

        self.stopped.clear()
        if self.dynamic_parse:
            self.webkit.close()

        unspider_url_list = list(set(self.unspider_url_list))  # de-duplicate the list of unvisited URLs
        unspider_url_list.sort(key=self.unspider_url_list.index)
        redis_key = IMG_UNSPIDER_URL_KEY  # redis_key selects the data type the spider collects
        try:
            for url_link in unspider_url_list:
                with global_redis.pipeline() as pipe:
                    pipe.lpush(redis_key, url_link).ltrim(redis_key, 0, 100).expire(redis_key, 72000).execute()
        except:
            logging.info("store unspider url error!!")
            pass

        logging.info("crawler_cache:%s fetcher_cache:%s" %
                     (len(self.crawler_cache), len(self.fetcher_cache)))
        logging.info("spider process quit.")

    def crawler(self, _dep=None):
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pools
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty, e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1

                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))

                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue

                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at init time
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception, e:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
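check_url_usable(), check_url_similar() and check_url_repeat() are referenced by every variant but defined elsewhere in the project (the comments point at urlfilter.py). As a rough, assumed illustration of what the usability check could cover, based only on the same_origin and ignore_ext attributes set in __init__:

# Assumed illustration only: the real check_url_usable lives in urlfilter.py per the comments above.
from urlparse import urlparse  # Python 2, matching the rest of the code


def check_url_usable_sketch(spider, link, base_url):
    parsed = urlparse(link)
    # only follow http(s) or relative links
    if parsed.scheme not in ('http', 'https', ''):
        return False
    # drop URLs whose extension is in the ignore list
    ext = parsed.path.rsplit('.', 1)[-1].lower() if '.' in parsed.path else ''
    if ext in [e.lower() for e in spider.ignore_ext]:
        return False
    # optionally restrict to the same origin as the seed URL
    if spider.same_origin and parsed.netloc and parsed.netloc != urlparse(base_url).netloc:
        return False
    return True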