class Dgtle(RedisCrawlSpider):
    # Spider name
    name = "dgtle"
    # Allowed domains: the spider may only crawl within these
    allowed_domains = ["dgtle.com"]
    # Redis queue key holding the start URLs (the first batch of requests)
    redis_key = 'dgtle:start_urls'
    # start_urls = ['http://www.dgtle.com/']

    rules = (
        # Follow category listing links such as '/portal.php?mod=list&catid=NN'
        Rule(LxmlLinkExtractor(allow=(r'/portal\.php\?mod=list&catid=\d{2}', )),
             follow=True),
        # Parse article pages such as '/article-123-1.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article[\d-]+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/div[3]/h2/a/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/div[3]/div/div[1]/i/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/div[4]/div[1]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class WangyiNewsSpider(CrawlSpider):
    name = '163_news'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/']

    article_extract = LxmlLinkExtractor(
        allow=(r'/\d{2}/\d{4}/\d{2}/[a-zA-Z0-9_]+\.html',
               r'photoview/[a-zA-Z0-9]+/\d+\.html',
               r'/\d+/\d+/[A-Z0-9]+\.html',
               r'/photo/[A-Z0-9]+/\d+\.html',
               r'/\d+/\d/[a-zA-Z0-9_]+\.html'),
        allow_domains=('news.163.com', ))
    follow_extract = LxmlLinkExtractor(allow_domains=('news.163.com', ))

    rules = (Rule(article_extract, follow=True, callback='parse_article'),
             Rule(follow_extract, follow=True, callback='parse_follow'))

    a_count = 0
    f_count = 0

    def parse_article(self, response):
        self.a_count += 1
        print('article: ' + str(self.a_count) + ' ' + response.url)
        sel = Selector(response)
        # Example article: http://news.163.com/17/0117/14/CB07N4J4000187VE.html
        news_1_div = sel.xpath(
            '//div[@id="epContentLeft"]/div[@id="post_body"]')

    def parse_follow(self, response):
        self.f_count += 1
        print('follow: ' + str(self.f_count) + ' ' + response.url)
class InfoQ(RedisCrawlSpider):
    # Spider name
    name = "infoq"
    # Allowed domains: the spider may only crawl within these
    allowed_domains = ["infoq.com"]
    # Redis queue key holding the start URLs (the first batch of requests)
    redis_key = 'infoq:start_urls'
    # start_urls = ['http://www.infoq.com/cn/']

    rules = (
        # Follow section pages such as 'infoq.com/cn/<section>/...'
        Rule(LxmlLinkExtractor(allow=(r'infoq\.com/cn/[a-z]+/.+', )),
             follow=True),
        # Parse news pages such as 'infoq.com/cn/news/YYYY/MM/...' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'infoq\.com/cn/news/\d{4}/\d{2}/.+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="content"]/h1/text()').extract()[0].strip()
        # Build 'YYYY-MM' from the URL path segments
        item['pub_time'] = response.url.split(
            '/')[-3] + '-' + response.url.split('/')[-2]
        item['content_code'] = response.xpath(
            '//*[@id="content"]/div[2]/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Zaker(RedisCrawlSpider):
    # Spider name
    name = "zaker"
    # Allowed domains
    allowed_domains = ["myzaker.com"]
    # Redis queue key holding the start URLs
    redis_key = 'zaker:start_urls'
    # start_urls = ['https://www.myzaker.com/']

    rules = (
        # Follow the selected channels; the original '[13|5|4|1039]' was a
        # character class, not the intended alternation, so use a group
        Rule(LxmlLinkExtractor(allow=(r'/channel/(13|5|4|1039)', )),
             follow=True),
        # Parse article pages such as '/article/<id>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//div[@id="content"]/div/div/div/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//div[@id="article"]/div[1]/div/a/span[3]/text()').extract(
            )[0].strip()
        item['content_code'] = response.xpath(
            '//div[@class="article_content"]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Guokr(RedisCrawlSpider):
    # Spider name
    name = "guokr"
    # Allowed domains
    allowed_domains = ["guokr.com"]
    # Redis queue key holding the start URLs
    redis_key = 'guokr:start_urls'
    # start_urls = ['http://www.guokr.com/scientific/']

    rules = (
        # Follow channel pages such as '/scientific/channel/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/scientific/channel/[a-z]+/', )),
             follow=True),
        # Parse article pages such as '/article/<id>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article/\d+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="articleTitle"]/text()').extract()[0].strip()
        # NOTE: the publication time is hard-coded rather than extracted
        item['pub_time'] = '2017-08-27'
        item['content_code'] = response.xpath(
            '//*[@id="articleContent"]/div/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Techreview(RedisCrawlSpider):
    # Spider name
    name = "techreview"
    # Allowed domains
    allowed_domains = ["technologyreview.com"]
    # Redis queue key holding the start URLs
    redis_key = 'techreview:start_urls'
    # start_urls = ['https://www.technologyreview.com/']

    rules = (
        # Follow topic pages such as '/topic/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/topic/.+/', )), follow=True),
        # Parse article pages such as '/s/NNNNNN/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/s/\d{6}/[a-z0-9-]+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[1]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[2]/ul/li[2]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/main/section/article/div/div[2]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class mobilephoneSpider(CrawlSpider):
    name = "mobilephone"
    allowed_domains = ["zol.com.cn"]
    start_urls = [
        'http://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html'
    ]

    rules = (
        Rule(
            LxmlLinkExtractor(
                # only allow these parameter pages
                allow=(r'detail\.zol\.com\.cn/\d*?/\d*?/param\.shtml', ),
                deny=(),
            ),
            # whether links should be followed from each response extracted
            # with this rule
            follow=False,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
            # NOTE: CrawlSpider uses 'parse' internally, so a custom callback
            # name would be safer here
            callback='parse'),
        Rule(
            LxmlLinkExtractor(
                allow=(r'detail\.zol\.com\.cn/cell_phone/index\d*?\.shtml', ),
                deny=(),
            ),
            follow=False,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
            callback='parse_price'),
        Rule(
            LxmlLinkExtractor(
                # this format is wrong; compare
                # http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_2_0_1.html
                allow=(r'/cell_phone_index/subcate57_\d*?_list_1[_\d]*?\.html', ),
                deny=('digital', 'notebook', 'tablepc', 'gps',
                      'keyboards_mouse', 'desktop_pc', 'gpswatch', 'zsyxj',
                      'motherboard', 'vga', 'cpu', 'hard_drives', 'menmery',
                      'case', 'power', 'cooling_product', 'solid_state_drive',
                      'dvdrw', 'sound_card', 'diy_host', 'usb-hub', 'speaker',
                      'mb_chip'),
            ),
            follow=True,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
        )
    )
class Tmtpost(RedisCrawlSpider):
    # Spider name
    name = "tmtpost"
    # Allowed domains
    allowed_domains = ["tmtpost.com"]
    # Redis queue key holding the start URLs
    redis_key = "tmtpost:start_urls"
    # start_urls = ['http://www.tmtpost.com/']

    rules = (
        # Follow column pages such as '/column/<id>'
        Rule(LxmlLinkExtractor(allow=(r'/column/\d+', )), follow=True),
        # Parse article pages such as 'tmtpost.com/<id>.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'tmtpost\.com/\d+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//span[@class="time"]/text()').extract()[0][:10]
        item['content_code'] = response.xpath('//article/div[2]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class FTchinese(RedisCrawlSpider):
    # Spider name
    name = "ftchinese"
    # Allowed domains
    allowed_domains = ["ftchinese.com"]
    # Redis queue key holding the start URLs
    redis_key = 'ftchinese:start_urls'
    # start_urls = ['http://www.ftchinese.com/']

    rules = (
        # Follow channel pages such as '/channel/<name>.html'
        Rule(LxmlLinkExtractor(allow=(r'/channel/.+\.html', )), follow=True),
        # Parse story pages such as '/story/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/story/.+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/h1/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/div[5]/span[1]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/div[6]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Centerwatch(CrawlSpider):
    name = 'centerwatch'
    allowed_domains = ['centerwatch.com']
    start_urls = [
        "http://www.centerwatch.com/drug-information/fda-approved-drugs/therapeutic-areas"
    ]

    rules = (
        Rule(LxmlLinkExtractor(
            restrict_xpaths=('.//li/a[contains(@id, "ctl00")]'))),
        Rule(LxmlLinkExtractor(
            restrict_xpaths=('//div[@id="ctl00_BodyContent_AreaDetails"]')),
            callback='parse_drug'),
    )

    def parse_drug(self, response):
        page = response.xpath('//div[@class="row"]')[3]
        summary_cols = page.xpath('.//div[@id="SummaryColumn"]/div/div/p')
        drug = Drug(
            name=page.xpath('.//h1/text()').extract_first(),
            company=summary_cols[1].xpath('./a/text()').extract_first(),
            approval_status=summary_cols[3].xpath('./text()').extract_first(),
            specific_treatment=summary_cols[5].xpath(
                './text()').extract_first(),
            therapeutic_areas=summary_cols[7].xpath('./a/text()').extract())
        yield drug
class MoiveSpider(CrawlSpider):
    """Crawl the Douban movie Top 250 list and its linked subject pages."""
    name = 'moive'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/top250']

    rules = [
        # Pagination links look like '?start=25&filter=' (no spaces around '=')
        Rule(LxmlLinkExtractor(allow=(r'\?start=\d+.*'))),
        Rule(LxmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        sel = Selector(response)
        item = MoviceItem()
        # '//*div[...]' is not valid XPath; '//div[...]' is used instead
        item['name'] = sel.xpath(
            "//div[@id='content']/h1/span[1]/text()").extract()
        item['year'] = sel.xpath("//*[@id='content']/h1/span[2]/text()").re(
            r'\((\d+)\)')
        item['score'] = sel.xpath(
            "//div[@class='clearfix']/strong/text()").extract()
        item['director'] = sel.xpath(
            "//div[@id='info']/span[1]/a/text()").extract()
        item['classification'] = sel.xpath(
            "//span[@property='v:genre']/text()").extract()
        item['actor'] = sel.xpath(
            "//span[@class='actor']//a/text()").extract()
        return item
def parse_data(self, response):
    item = CrawlerItem()
    # Extract and normalize the page title
    title = response.css("head title::text").extract_first().strip()
    if title.endswith(' | University of Illinois at Chicago'):
        title = title[:-36]
    soup = BeautifulSoup(response.text, "html.parser")
    for div in soup.find_all("div", {'class': 'browser-stripe'}):
        div.decompose()
    # Extract the visible page content
    contents = soup.findAll(text=True)
    visible_texts = filter(tag_visible, contents)
    item['content'] = " ".join(t.strip() for t in visible_texts)
    # Collect outgoing links within uic.edu, skipping the login host
    outlinks = []
    le = LxmlLinkExtractor(allow_domains=('uic.edu'),
                           deny_domains=('login.uic.edu'),
                           unique=True,
                           canonicalize=True)
    for link in le.extract_links(response):
        outlinks.append(link.url)
    if title != 'UIC Directory' and title != 'Search Help' \
            and 'uic.edu' in response.request.url:
        item['title'] = title
        item['url'] = response.request.url
        item['outlinks'] = outlinks
        yield item
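# tag_visible is referenced by parse_data above but not defined in this
# snippet. A minimal sketch of the usual BeautifulSoup visibility filter it
# appears to assume (hypothetical, not the author's confirmed version):
from bs4.element import Comment

def tag_visible(element):
    # Skip text nodes inside non-rendered tags and HTML comments
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta',
                               '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True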
class NewsFeedSpider(CrawlSpider):
    name = 'urls'
    # 'domains' and 'URLS' are assumed to be defined at module level
    allowed_domains = [domain.strip() for domain in domains]
    start_urls = URLS
    content_types = ['text/xml', 'application/xml', 'rss', 'xml']

    rules = (
        Rule(LxmlLinkExtractor(
            allow=(r'.*\.xml$', r'.*\.atom$', r'.*\.rss$', r'.*\.feed$',
                   r'.*\.feeds$'), ),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(
            allow=(r'.*xml.*', r'.*rss.*', r'.*feed.*', r'.*feeds.*'), ),
             callback='parse_item',
             follow=True),
        Rule(LxmlLinkExtractor(allow=(r'.*', ), )),
    )

    def parse_item(self, response):
        # Headers are bytes; decode before substring-matching content types
        cts = response.headers.get('Content-Type', b'').decode('utf-8',
                                                               'ignore')
        for ct in self.content_types:
            if ct in cts:
                print(response.url)
                url = FeedUrl()
                url['url'] = response.url
                yield url
class technode(RedisCrawlSpider):
    # Spider name
    name = "technode"
    # Allowed domains
    allowed_domains = ["technode.com"]
    # Redis queue key holding the start URLs
    redis_key = "technode:start_urls"
    # start_urls = ['http://cn.technode.com/']

    rules = (
        # Follow category pages such as '/post/category/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/post/category/[a-z-]+/', )),
             follow=True),
        # Parse post pages such as '/post/YYYY-MM-DD/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/post/[\d-]{10}/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*/header/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*/header/div/time/text()').extract()[0]
        item['content_code'] = response.xpath(
            '//*[@id="inner-wrap"]/div[5]/div/div/div/div/div/div[1]/article'
        ).extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class LinkSpider(CrawlSpider):
    """
    TODO find a way to keep referer and title of broken href
    """
    name = 'link_spider'
    allow_domains = []
    start_urls = []
    handle_httpstatus_list = [404, 500, 403, 401, 400]
    broken_links = set()

    rules = (
        Rule(LxmlLinkExtractor(allow=(), allow_domains=['qrpay.ai']),
             callback='parse_obj',
             follow=True),
        Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=False),
    )

    def parse_obj(self, response):
        # response.status is an int, so compare against ints only
        if response.status not in (200, 301, 302):
            # start_domain and redis_store are assumed to be defined elsewhere
            print(self.start_domain)
            item = BrokenItem()
            item['url'] = response.url
            item['referer'] = response.request.headers.get(
                'Referer', b'').decode('utf-8')
            item['status'] = response.status
            # TODO put into pipeline
            link = json.dumps(dict(item))
            if link not in self.broken_links:
                self.broken_links.add(link)
                redis_store.lpush('broken_links_%s' % (self.start_domain),
                                  link)
            return item
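# redis_store (and start_domain) are used by LinkSpider above but not defined
# in this snippet. A minimal sketch of the assumed module-level setup, with
# placeholder connection values:
from redis import StrictRedis

redis_store = StrictRedis(host='localhost', port=6379, db=0)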
class Techqq(RedisCrawlSpider):
    # Spider name
    name = "techqq"
    # Allowed domains
    allowed_domains = ["tech.qq.com"]
    # Redis queue key holding the start URLs
    redis_key = "techqq:start_urls"
    # start_urls = ['http://tech.qq.com/']

    rules = (
        # Follow listing pages such as '/web/<name>.htm' and '/<name>.html'
        Rule(LxmlLinkExtractor(allow=(r'/web/[a-z]+\.htm', r'/[a-z]+\.html')),
             follow=True),
        # Parse article pages such as '/a/YYYYMMDD/<id>.htm' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/a/\d{8}/\d+\.htm', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="Cnt-Main-Article-QQ"]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Aitists(RedisCrawlSpider):
    # Spider name
    name = "aitists"
    # Allowed domains
    allowed_domains = ["aitists.com", "mp.weixin.qq.com"]
    # Redis queue key holding the start URLs
    redis_key = 'aitists:start_urls'
    # start_urls = ['http://www.aitists.com/']

    rules = (
        # Follow category pages such as '/category/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/category/.+', )), follow=True),
        # Parse WeChat article pages such as 'mp.weixin.qq.com/s/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'com/s/.+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="activity-name"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="post-date"]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="js_content"]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Economist(RedisCrawlSpider):
    # Spider name
    name = "economist"
    # Allowed domains
    allowed_domains = ["economist.com"]
    # Redis queue key holding the start URLs
    redis_key = 'economist:start_urls'
    # start_urls = ['https://www.economist.com/']

    rules = (
        # Follow section pages such as '/sections/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/sections/.+', )), follow=True),
        # Parse article pages such as '/news/<section>/<slug>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/news/.+/.+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//h1/span[2]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//time[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//main/div/div[1]/div/article/div[1]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Donews(RedisCrawlSpider):
    # Spider name
    name = "donews"
    # Allowed domains
    allowed_domains = ["donews.com"]
    # Redis queue key holding the start URLs
    redis_key = 'donews:start_urls'
    # start_urls = ['http://www.donews.com/', 'http://www.donews.com/idonews/']

    rules = (
        # Follow section index pages such as 'donews.com/<section>/index'
        Rule(LxmlLinkExtractor(allow=(r'donews\.com/[a-z]+/index', )),
             follow=True),
        # Parse article detail pages with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'donews\.com/news/detail/\d/\d+\.html',
                                      r'/article/detail/\d+/\d+\.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="main"]/div[2]/h2/text()').extract()[0]
        item['pub_time'] = response.xpath(
            '//*[@id="main"]/div[2]/div[1]/p/span[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '//*[@id="main"]/div[2]/div[2]').extract()[0]
        # Yield each item to the pipelines
        yield item
def parse(self, response):
    links = LxmlLinkExtractor(allow=()).extract_links(response)
    links = [str(link.url) for link in links]
    links.append(str(response.url))
    for link in links:
        yield scrapy.Request(url=link, callback=self.parse_link)
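# parse_link is referenced above but not shown. A hypothetical minimal stub,
# assuming the spider only needs to record each visited URL:
def parse_link(self, response):
    # Replace with real per-page extraction logic
    self.logger.info('visited %s', response.url)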
class Githubtrendingrepocrawler(CrawlSpider):  # 1
    name = 'GithubTrendingRepoCrawler'  # 2
    start_urls = ['http://github.com/trending/']  # 2

    # 3
    rules = (
        # Extract links from this path only; 'repo-list' must be quoted inside
        # the XPath, restrict_xpaths should select elements rather than @href,
        # and allow_domains takes bare domains, not URLs
        Rule(LxmlLinkExtractor(
            restrict_xpaths=['//ol[@id="repo-list"]//h3/a'],
            allow_domains=['github.com']),
            callback='parse'),
        # Links should match this pattern and create new requests
        Rule(LxmlLinkExtractor(allow=r'https://github.com/[\w-]+/[\w-]+$',
                               allow_domains=['github.com']),
             callback='parse_product_page'),
        # # Recursive Rule
        # Rule(
        #     LxmlLinkExtractor(allow=r'https://github.com/[\w-]+/[\w-]+$',
        #                       allow_domains=['github.com']),
        #     callback='parse_product_page', follow=True
        # ),
    )

    # 4
    def parse_product_page(self, response):
        item = PageContentItem()
        item['url'] = response.url
        item['content'] = response.css('article').get()
        yield item
def parsePreDesigne(self, response):
    # Sections before the "double-storey" marker hold single-storey designs;
    # sections after it hold double-storey designs
    singleLinks = LxmlLinkExtractor(
        allow=(r'/pre-designed-home-range/[\w-]+/$', ),
        restrict_xpaths='//div[@id="double-storey"]/preceding-sibling::section'
    ).extract_links(response)
    doubleLinks = LxmlLinkExtractor(
        allow=(r'/pre-designed-home-range/[\w-]+/$', ),
        restrict_xpaths='//div[@id="double-storey"]/following-sibling::section'
    ).extract_links(response)
    for link in singleLinks:
        meta = {'storey': 1}
        yield Request(link.url, callback=self.parseItem, dont_filter=True,
                      meta=meta)
    for link in doubleLinks:
        meta = {'storey': 0}
        yield Request(link.url, callback=self.parseItem, dont_filter=True,
                      meta=meta)
class icsspider(CrawlSpider):
    download_delay = 3
    retry_times = 10
    name = 'ics'
    start_urls = ['http://ics.cnvd.org.cn/?max=20&offset=400']
    # start_urls = ['http://www.cnvd.org.cn/flaw/list.htm?max=20&offset=3240']
    allowed_domains = ['ics.cnvd.org.cn', 'www.cnvd.org.cn']

    rules = (
        Rule(LxmlLinkExtractor(allow=(r'/\?max=\d+', ),
                               restrict_xpaths=("//a[@class='nextLink']", ))),
        Rule(LxmlLinkExtractor(allow=('/flaw/show/', ),
                               restrict_xpaths=("//tbody[@id='tr']", )),
             follow=True,
             callback='parse_item'))

    def parse_item(self, response):
        sel = Selector(response)
        try:
            item = ics()
            item['cnvd'] = ''.join(sel.xpath(
                "//table[@class='gg_detail']/tbody/tr[1]/td[2]/text()"
            ).extract()).strip()
            yield item
        except Exception:
            # 'except exceptions:' referenced an undefined name; catch Exception
            print('url: %s failed to parse' % response.url)
class Finance_ifeng(RedisCrawlSpider):
    # Spider name
    name = "finance_ifeng"
    # Allowed domains
    allowed_domains = ["ifeng.com"]
    # Redis queue key holding the start URLs
    redis_key = 'finance_ifeng:start_urls'
    # start_urls = ['http://finance.ifeng.com/',
    #               'http://tech.ifeng.com/',
    #               'http://finance.ifeng.com/stock/gstzgc/']

    rules = (
        # Follow channel pages such as 'ifeng.com/<channel>/'
        Rule(LxmlLinkExtractor(allow=(r'ifeng\.com/[a-z]+/', )), follow=True),
        # Parse article pages such as 'ifeng.com/a/YYYYMMDD/<id>.shtml' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'ifeng\.com/a/\d{8}/[\d_]+\.shtml', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="artical_topic"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="artical_sth"]/p/span[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="main_content"]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Cyzone(RedisCrawlSpider):
    # Spider name
    name = "cyzone"
    # Allowed domains
    allowed_domains = ["cyzone.cn"]
    # Redis queue key holding the start URLs
    redis_key = 'cyzone:start_urls'
    # start_urls = ['http://www.cyzone.cn/']

    rules = (
        # Follow category pages such as '/category/<id>/'
        Rule(LxmlLinkExtractor(allow=(r'/category/\d+/', )), follow=True),
        # Parse article pages such as '/a/YYYYMMDD/<id>.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/a/\d{8}/\d+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="article"]/div[1]/div[1]/h1/text()').extract()[0].strip()
        # The date segment of the URL, e.g. 'YYYYMMDD'
        item['pub_time'] = response.url.split("/")[-2]
        item['content_code'] = response.xpath(
            '//*[@id="article"]/div[1]/div[2]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class NewsFeedSpider(CrawlSpider):
    name = 'sites'
    # 'domains' is assumed to be defined at module level
    allowed_domains = [domain.strip() for domain in domains]
    start_urls = ['http://www.nytimes.com/services/xml/rss/index.html']  # URLS

    rules = (
        # '.*xml.*', '.*rss.*', '.*feed.*', '.*feeds.*'
        Rule(LxmlLinkExtractor(
            allow=(r'.*\.xml$', r'.*\.atom$', r'.*\.rss$', r'.*\.feed$',
                   r'.*\.feeds$'), ),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(allow=(r'.*', ), )),
    )

    def parse_item(self, response):
        page = feedparser.parse(response.body)
        item = FeedUrl()
        item['url'] = response.url
        print(response.url)
        return item
class Dsj36(RedisCrawlSpider):
    # Spider name
    name = "36dsj"
    # Allowed domains
    allowed_domains = ["36dsj.com"]
    # Redis queue key holding the start URLs
    redis_key = '36dsj:start_urls'
    # start_urls = ['http://www.36dsj.com/']

    rules = (
        # Follow category pages such as '/archives/category/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/archives/category/[a-z-]+', )),
             follow=True),
        # Parse article pages such as '/archives/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/archives/\d+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/section/div/div/header/h1/a/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/section/div/div/header/ul/li[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '/html/body/section/div/div/article').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class KejijieChannelsSpider(CrawlSpider):
    start_urls = ["http://www.kejilie.com/channelsubscribe.html"]
    name = 'kejilieChannels'
    allowed_domains = ['www.kejilie.com']

    Redis2Info = config.info["Redis2Info"]
    redis_db = StrictRedis(host=Redis2Info['host'],
                           port=Redis2Info['port'],
                           password=Redis2Info['pwd'],
                           db=Redis2Info['db'])

    rules = (
        Rule(LxmlLinkExtractor(
            allow=('http://www.kejilie.com/channeltype/.*', )),
            follow=True),
        Rule(LxmlLinkExtractor(
            allow=('http://www.kejilie.com/channel/.*', ),
            deny=("http://www.kejilie.com/channel/.*/feed", )),
            callback='parseChannel'))

    def parseChannel(self, response):
        info("-----------------kejiliechannels url:" + response.url)
        title = response.xpath(
            "//div[@class='am_news_list_all']//div[@class='am-titlebar am-titlebar-default mt-0']/h1/text()"
        ).extract_first()
        logo = response.xpath(
            "//div[@class='am_news_list_all']//div[@class='mt-10']/div[@class='am-fl']/img/@src"
        ).extract_first()
        self.redis_db.sadd(
            "kejiliechannels",
            json.dumps({
                'url': response.url,
                'title': title,
                'logo': logo
            }))
class Vcbeat(RedisCrawlSpider):
    # Spider name
    name = "vcbeat"
    # Allowed domains
    allowed_domains = ["vcbeat.net"]
    # Redis queue key holding the start URLs
    redis_key = "vcbeat:start_urls"
    # start_urls = ['http://vcbeat.net/', 'http://vcbeat.net/Series/seriesIndex']

    rules = (
        # Follow series pages such as '/seriesD/<id>'
        Rule(LxmlLinkExtractor(allow=(r'/seriesD/\d{1,2}', )), follow=True),
        # Parse article pages (URLs ending in '=') with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'vcbeat\.net/.+=', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="article_title"]/p/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="article_title"]/div/span[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '/html/body/div[7]/div/div[1]/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Syncedreview(RedisCrawlSpider):
    # Spider name
    name = "rsarxiv"
    # Allowed domains
    allowed_domains = ["rsarxiv.github.io"]
    # Redis queue key holding the start URLs
    redis_key = 'rsarxiv:start_urls'
    # start_urls = ['http://rsarxiv.github.io/']

    rules = (
        # Follow tag pages such as '/tags/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/tags/.+/', )), follow=True),
        # Parse post pages such as '/YYYY/MM/DD/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/\d{4}/\d{2}/\d{2}/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="wrapper"]/article/div/header/h1/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="wrapper"]/article/div/header/time/a/text()').extract(
            )[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="wrapper"]/article/div/div').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
def get_links(url, body):
    # For page URLs like '.../index.html', use the containing directory as
    # the allow prefix so sibling pages match
    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]
    response = HtmlResponse(url=start_url, body=body, encoding='utf8')
    link_extractor = LxmlLinkExtractor(
        allow=[start_url],  # the URL string is used as a regex prefix
        deny=[],
        tags=('a', ),
        attrs=('href', ),
        canonicalize=True)
    return link_extractor.extract_links(response)
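# A quick usage sketch for get_links() (hypothetical URL and body). Because
# allow=[start_url] treats the trimmed URL as a regular expression, only
# links under the page's directory are returned:
html = b'<a href="http://example.com/news/a.html">a</a>'
for link in get_links('http://example.com/news/index.html', html):
    print(link.url)  # -> http://example.com/news/a.html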
class LinkProcedure(BaseProcedure):
    """Link extractor built on scrapy's LxmlLinkExtractor.

    Usage: link <xpath> [<css>]
        xpath  string|array  passed to LxmlLinkExtractor's restrict_xpaths
        css    string|array  passed to LxmlLinkExtractor's restrict_css
    """

    def __init__(self, *args):
        xpath = args[0]
        css = len(args) >= 2 and args[1] or None
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath,
                                            restrict_css=css)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
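# A usage sketch for LinkProcedure, assuming the BaseProcedure interface shown
# above (the response body and XPath are illustrative):
from scrapy.http import HtmlResponse

response = HtmlResponse(url='http://example.com/',
                        body=b'<div id="list"><a href="/a.html">a</a></div>',
                        encoding='utf8')
proc = LinkProcedure('//div[@id="list"]')
print(proc.do(response))  # -> ['http://example.com/a.html']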