Example #1
# -*- coding: utf-8 -*-
"""
CrawlSpider(class scrapy.spiders.CrawlSpider)
Spider类:
    只爬取start_urls列表中的网页
CrawlSpider类:
    定义了一些规则(rule)来提供跟进link的机制,从爬取的网页中获取link并继续爬取

LinkExtractors类:
    用于提取链接,该类的extract_links()方法接收一个Response对象,返回一个scrapy.link.Link对象
    该类需要实例化一次,并且extract_links()方法会根据不同的response调用多次提取链接
class scrapy.linkextractors.LinkExtractor(
    allow = (),                 # only URLs matching these regular expressions are extracted; if empty, all URLs match (commonly used)
    deny = (),                  # URLs matching these regular expressions are never extracted; takes precedence over allow
    allow_domains = (),         # domains whose links will be extracted (commonly used)
    deny_domains = (),          # domains whose links will never be extracted
    deny_extensions = None,
    restrict_xpaths = (),       # XPath expressions; applied together with allow to filter links
    tags = ('a', 'area'),
    attrs = ('href',),
    canonicalize = True,
    unique = True,
    process_value = None
)
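
    usage (a minimal sketch; the allow pattern and the response variable are illustrative):
        from scrapy.linkextractors import LinkExtractor
        link_extractor = LinkExtractor(allow=(r'/page/\d+',))
        links = link_extractor.extract_links(response)   # -> list of scrapy.link.Link objects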

rules:
    a list of one or more Rule objects; each Rule defines a specific action for crawling the site
class scrapy.spiders.Rule(
    link_extractor,             # a LinkExtractor object defining which links to extract
    callback = None,            # callback invoked with the response of each link extracted by link_extractor
    cb_kwargs = None,           # dict of keyword arguments passed to the callback
    follow = None,              # whether to follow links from these responses; defaults to True when callback is None, False otherwise
    process_links = None,       # callable used to filter or modify the extracted links
    process_request = None      # callable used to filter or modify each Request generated by this rule
)
"""
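
Tying the pieces together, a minimal CrawlSpider sketch (the domain, start URL,
and allow patterns below are placeholders, not taken from the original):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']       # placeholder domain
    start_urls = ['http://example.com/']    # placeholder start page

    rules = (
        # no callback: follow defaults to True, so pagination links are followed
        Rule(LinkExtractor(allow=(r'/page/\d+',))),
        # with a callback: follow defaults to False, so item pages are only parsed
        Rule(LinkExtractor(allow=(r'/item/\d+',)), callback='parse_item'),
    )

    def parse_item(self, response):
        # never override parse() in a CrawlSpider; it implements the rule logic
        yield {'url': response.url, 'title': response.css('title::text').get()}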
Example #2
import unittest

from scrapy.spiders import BaseSpider, CrawlSpider, Spider  # BaseSpider: deprecated alias for Spider in older Scrapy


class CrawlSpiderTest(unittest.TestCase):
    def test_crawl_spider(self):
        assert issubclass(CrawlSpider, Spider)
        assert issubclass(CrawlSpider, BaseSpider)
        assert isinstance(CrawlSpider(name='foo'), Spider)
        assert isinstance(CrawlSpider(name='foo'), BaseSpider)
Example #3
class CrawlSpiderTest(unittest.TestCase):  # imports as in Example #2, minus the BaseSpider alias (dropped in later Scrapy versions)
    def test_crawl_spider(self):
        assert issubclass(CrawlSpider, Spider)
        assert isinstance(CrawlSpider(name="foo"), Spider)