# -*- coding: utf-8 -*- """ CrawlSpider(class scrapy.spiders.CrawlSpider) Spider类: 只爬取start_urls列表中的网页 CrawlSpider类: 定义了一些规则(rule)来提供跟进link的机制,从爬取的网页中获取link并继续爬取 LinkExtractors类: 用于提取链接,该类的extract_links()方法接收一个Response对象,返回一个scrapy.link.Link对象 该类需要实例化一次,并且extract_links()方法会根据不同的response调用多次提取链接 class scrapy.linkextractors.LinkExtractor( allow = (), # 满足括号中'正则表达式'的值会被提取,如果为空则全部匹配(常用) deny = (), # 与这个正则表达式匹配的URL一定不提取 allow_domains = (), # 会被提取的链接的domains(常用) deny_domains = (), # 一定不会被提取链接的domains deny_extensions = None, restrict_xpaths = (), # 使用xpath表达式,和allow共同作用过滤链接 tags = ('a','area'), attrs = ('href',), canonicalize = True, unique = True, process_value = None ) rules: 包含一个或多个Rule对象,每个Rule对爬取网站的动作定义了特定操作 class scrapy.spiders.Rule( link_extractor, # 是一个LinkExtractor对象,定义需要提取的链接 callback = None, # 从link_extractor中每获取到新的链接时,指定回调函数处理response响应的数据 cb_kwargs = None,
def test_crawl_spider(self):
    """CrawlSpider must derive from both Spider and BaseSpider.

    Checks the relationship at the class level (issubclass) and again at
    the instance level (isinstance) for a freshly constructed spider.
    """
    # Class-level inheritance checks, in the same order as the bases below.
    for base in (Spider, BaseSpider):
        assert issubclass(CrawlSpider, base)
    # Instance-level checks; a new spider is built per assertion, matching
    # the original test's two separate constructions.
    for base in (Spider, BaseSpider):
        assert isinstance(CrawlSpider(name='foo'), base)
def test_crawl_spider(self):
    """Verify CrawlSpider is a Spider, both as a class and as an instance."""
    # NOTE(review): an earlier definition with this same name appears above;
    # if both sit in one class, this one shadows it -- confirm that is intended.
    assert issubclass(CrawlSpider, Spider)
    spider = CrawlSpider(name="foo")
    assert isinstance(spider, Spider)