def parse(self, response): papers = response.xpath(".//div[@class='day']") from scrapy.shell import inspect_response inspect_response(response, self) for paper in papers: url = paper.xpath( ".//div[@class='postTitle']/a/@href").extract_first() title = paper.xpath( ".//div[@class='postTitle']/a/text()").extract_first() time = paper.xpath( ".//div[@class='dayTitle']/a/text()").extract_first() content = paper.xpath( ".//div[@class='postCon']/div/text()").extract_first() item = CnblogspiderItem(url=url, title=title, time=time, content=content) request = scrapy.Request(url=url, callback=self.parse_body) request.meta['item'] = item yield request next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>') if next_page: yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Parse the page: first select all posts
    papers = response.xpath(".//*[@class='day']")
    for paper in papers:
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        title = paper.xpath(
            ".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        content = paper.xpath(
            ".//*[@class='postCon']/div/text()").extract()[0]
        item = CnblogspiderItem(url=url, title=title, time=time,
                                content=content)
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item  # stash the item on the request
        # print(item)
        # yield turns parse into a generator that hands requests to the engine
        yield request
    # returns the link to the next page, if any
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    # next_page = Selector(response).xpath("//div[@id='nav_next_page']/a/@href").extract()
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Parse the page
    # First select all posts
    papers = response.xpath(".//*[@class='day']")
    # Extract data from each post
    for paper in papers:
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        title = paper.xpath(
            ".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        content = paper.xpath(
            ".//*[@class='postCon']/div/text()").extract()[0]
        # print(url, title, time, content)
        item = CnblogspiderItem(url=url, title=title, time=time,
                                content=content)
        # request = scrapy.Request(url=url, callback=self.parse_body)
        # request.meta['item'] = item  # stash the item on the request
        yield item
    # Pagination
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # print(response.body)
    # filename = "cnblog.html"
    # with open(filename, 'w') as f:
    #     f.write(response.body)
    for each in response.xpath(".//*[@class='day']"):
        item = CnblogspiderItem()
        url = each.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
        title = each.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
        time = each.xpath('.//*[@class="dayTitle"]/a/text()').extract()[0]
        content = each.xpath('.//*[@class="postCon"]/div/text()').extract()[0]
        item['url'] = url
        item['title'] = title
        item['time'] = time
        print(content)
        item['content'] = content
        yield item
    next_page = response.selector.re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Parse the page
    # First select all posts
    papers = response.xpath(".//*[@class='day']")
    for paper in papers:
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        title = paper.xpath(
            ".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        content = paper.xpath(
            ".//*[@class='c_b_p_desc']/text()").extract()[0]
        item = CnblogspiderItem(url=url, title=title, time=time,
                                content=content)
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item
        # yield item
        yield request
        # print(url, title, time)
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    print(next_page)
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Grab the whole block for each post first; this is better than extracting
    # each field into a separate flat list and reassembling the lists afterwards.
    papers = response.xpath('//div[@class="day"]')
    for paper in papers:
        url = paper.xpath('div[@class="postTitle"]/a/@href').extract()[0]
        title = paper.xpath(
            'div[@class="postTitle"]/a/text()').extract()[0]
        time = paper.xpath('div[@class="dayTitle"]/a/text()').extract()[0]
        content = paper.xpath(
            'div[@class="postCon"]/div/text()').extract()[0]
        # print(url, title, time, content)
        # Alternative item construction: pass the fields as keyword arguments.
        # You could also do item = CnblogspiderItem() and assign fields dict-style.
        item = CnblogspiderItem(
            url=url, title=title, time=time, content=content)
        yield item
    # print(response.text.find('下一页'))
    # Pagination via a regular expression. re.S is not strictly necessary here;
    # \s matches whitespace, \S matches non-whitespace, and *? keeps the match
    # non-greedy. You can use `scrapy shell <url>` to test XPath and regex
    # matches separately.
    next_page = re.search('<a href="(\S*?)">\s*下一页\s*</a>',
                          response.text, re.S)
    # print('=' * 80)
    # If there is a next page, follow it; later pages are crawled the same way.
    if next_page:
        yield scrapy.Request(url=next_page.group(1), callback=self.parse)
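# A quick standalone check of the pagination regex used above (a minimal sketch;
# the sample HTML string below is made up for illustration, modelled on the
# next-page link format quoted in the comments of a later variant).
import re

sample = '<a href="https://www.cnblogs.com/qiyeboy/default.html?page=2">下一页</a>'
match = re.search('<a href="(\S*?)">\s*下一页\s*</a>', sample, re.S)
if match:
    print(match.group(1))  # -> https://www.cnblogs.com/qiyeboy/default.html?page=2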
def parse(self, response):
    inner_link = None
    print("start_urls", self.start_urls)
    # all_urls = set()
    urls = set()
    all_urls = set()
    # print(self.start_urls[0])
    # All links within the page (src/href/url/ocde attributes of any element)
    all_link = response.xpath(
        "//*//@src | //*//@href | //*//@url | //*//@ocde").extract()
    # item = CnblogspiderItem()
    for link in all_link:
        all_urls.add(link)
        item = CnblogspiderItem()
        item['start_link'] = self.start_urls[0]
        item['hash_start_link'] = hashlib.md5(
            self.start_urls[0].encode('utf-8')).hexdigest()
        if not link.startswith('http'):
            inner_link = link
            print("111: " + str(inner_link) + "==>from: " + str(response))
            link = urljoin(self.start_urls[0], link)
        if urlparse(link).netloc != urlparse(self.start_urls[0]).netloc:
            # External link
            print("外链" + link + "==>from: " + str(response))
            out_link = link
            inner_link = str()
            item['link'] = link + str(response)
            # item['from_link'] = str(response)
            yield item
        else:
            # Internal link
            if inner_link:
                item['link'] = inner_link + "内链" + str(response)
            else:
                item['link'] = link + "内链" + str(response)
            urls.add(link)
            print("内链" + link + "==>from: " + str(response))
            # item['from_link'] = str(response)
            self.layer += 1
            print("layer", str(self.layer))
            yield item
    print("all_urls:" + str(all_urls))
    new_urls = urls - self.ALL_urls
    urls_list = list(new_urls)
    self.ALL_urls = self.ALL_urls.union(urls)
    # print("内链:" + str(urls_list))
    for url in urls_list:
        print("0000:" + url)
        url = response.urljoin(url)
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    pageList = response.xpath('//div[@class="day"]')
    for page in pageList:
        item = CnblogspiderItem()
        item["url"] = page.xpath('./div[2]/a/@href').extract()[0]
        item["title"] = page.xpath('./div[2]/a/text()').extract()[0]
        item["time"] = page.xpath('./div[1]/a/text()').extract()[0]
        item["content"] = page.xpath('./div[3]/div/text()').extract()[0]
        yield item
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    divDayList = response.xpath(r"//div[@class='day']")
    for divDay in divDayList:
        date = divDay.xpath(
            r"./div[@class='dayTitle'][1]/a/text()").extract()[0]
        title = divDay.xpath(
            r"./div[@class='postTitle'][1]/a/text()").extract()[0]
        url = divDay.xpath(
            r"./div[@class='postTitle'][1]/a/@href").extract()[0]
        item = CnblogspiderItem(date=date, title=title, url=url)
        yield item
    next_page = response.xpath(r"//a[text()='下一页'][1]/@href").extract()
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Parse the page.
    # First select all posts: every node in the response with class='day'.
    # Inspecting the page shows each post sits inside a <div class="day"> ... </div>,
    # so the result is a list of selectors, one per post.
    papers = response.xpath(".//*[@class='day']")
    # Debugging hook: uncomment to inspect the response of each crawl step.
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    # Extract data from each post.
    for paper in papers:
        # Extract the post's URL, title, time and summary.
        # '.' starts from paper, '//' matches at any depth, '*' matches any element;
        # take the @href of the <a> under the element with class='postTitle'.
        # .extract()[0] takes the first extracted string.
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        # The first text node of the <a> under the element with class='postTitle'.
        title = paper.xpath(
            ".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        # Extract the post summary.
        content = paper.xpath(
            ".//*[@class='c_b_p_desc']/text()").extract()[0]
        # Wrap the extracted data in an Item object; it behaves like a dict,
        # e.g. 'url' is a key and the extracted URL is its value.
        item = CnblogspiderItem(url=url, title=title, time=time,
                                content=content)
        # Having extracted the post's URL, issue a new request for that address.
        # Request takes the post URL and a callback that will parse the response.
        # The meta parameter carries data along with the request; it must be a dict,
        # and the callback reads it back from response.meta just like a dict.
        request = scrapy.Request(url=url, meta={'item': item},
                                 callback=self.parse_body)
        # yield turns parse into a generator: each iteration yields a request whose
        # callback (parse_body) eventually produces the finished item.
        yield request
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        # url is the page to request; callback names the method that parses the response.
        yield scrapy.Request(url=next_page[0], callback=self.parse)
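# Several of the variants above hand the partially-filled item to a second-level
# callback named parse_body via the request's meta dict. That callback is not
# shown in this section; a minimal sketch of what it could look like (the extra
# extraction step is left as a placeholder, since the fields it fills are not
# shown here):
def parse_body(self, response):
    item = response.meta['item']   # recover the item stashed on the request
    # ... extract any additional fields from the post page and add them to item ...
    yield item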
def parse(self, response):
    papers = response.xpath('.//*[@class="day"]')
    for paper in papers:
        url = paper.xpath('.//div[@class="postTitle"]/a/@href').extract()[0]
        title = paper.xpath('.//div[@class="postTitle"]/a/text()').extract()[0]
        time = paper.xpath('.//div[@class="dayTitle"]/a/text()').extract()[0]
        content = paper.xpath('.//div[@class="c_b_p_desc"]/text()').extract()[0]
        # print('\n')
        # print(title, url, time, content)
        item = CnblogspiderItem(url=url, title=title, time=time, content=content)
        yield item
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse(self, response):
    # Find all posts
    papers = response.xpath('//*[@class="day"]')
    # Extract each post's link, title and summary
    for paper in papers:
        url = paper.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
        title = paper.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
        summary = paper.xpath('.//*[@class="postCon"]/div/text()').extract()[0]
        item = CnblogspiderItem(url=url, title=title, summary=summary)
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item  # stash the item on the request
        yield request
    # Find the next-page link
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(next_page[0], callback=self.parse)
def parse(self, response):
    # Parse the page.
    # First select all posts: every node in the response with class='day'.
    # Each post sits inside a <div class="day"> ... </div>, so the result is a
    # list of selectors, one per post.
    papers = response.xpath(".//*[@class='day']")
    # Debugging hook: uncomment to inspect the response of each crawl step.
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    # Extract data from each post.
    for paper in papers:
        # Extract the post's URL, title, time and summary.
        # '.' starts from paper, '//' matches at any depth, '*' matches any element;
        # xpath() returns a selector list, extract() returns the matched strings,
        # and .extract()[0] takes the first one.
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        # The first text node of the <a> under the element with class='postTitle'.
        title = paper.xpath(".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        # Extract the post summary.
        content = paper.xpath(".//*[@class='c_b_p_desc']/text()").extract()[0]
        # Wrap the extracted data in an Item object; it behaves like a dict,
        # e.g. 'url' is a key and the extracted URL is its value.
        item = CnblogspiderItem(url=url, title=title, time=time, content=content)
        # Issue a new request for the post's URL and hand it to the second-level
        # parser parse_body. The meta parameter carries data along with the request
        # (it must be a dict); the callback reads it back from response.meta.
        request = scrapy.Request(url=url, meta={'item': item},
                                 callback=self.parse_body)
        # parse_body finishes filling in the item, so yielding the request here is
        # what ultimately produces the finished item; yield turns parse into a
        # generator that emits one request per post.
        yield request
    # Crawl the next page: first look for its link. The link looks like:
    # <a href="https://www.cnblogs.com/qiyeboy/default.html?page=2">下一页</a>
    # \S matches any non-whitespace character, * matches as many as possible.
    next_page = re.findall(u'<a href="(\S*)">下一页</a>', response.text)
    if next_page:
        # url is the page to request; callback names the method that parses the response.
        yield scrapy.Request(url=next_page[0], callback=self.parse)
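# The CnblogspiderItem these parse methods populate is defined elsewhere in the
# project (items.py). A minimal sketch of a matching definition, assuming only
# the four fields used by most of the variants above (some variants use other
# field names such as summary, abstract, date or link):
import scrapy

class CnblogspiderItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()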
def parse(self, response):
    days = response.xpath('.//*[@class="day"]')
    for day in days:
        postTitle = day.xpath(
            './/*[@class="postTitle"]/a/text()').extract()[0]
        time = day.xpath('.//*[@class="dayTitle"]/a/text()').extract()[0]
        content = day.xpath(
            './/*[@class="postCon"]/div/text()').extract()[0]
        url = day.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
        items = CnblogspiderItem(url=url, postTitle=postTitle, time=time,
                                 content=content)
        print(url, postTitle, time, content)
        yield items
    next_paper = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_paper:
        yield scrapy.Request(url=next_paper[0], callback=self.parse)
def parse(self, response):
    all_articles = response.xpath(".//*[@class='day']")
    for article in all_articles:
        title = article.xpath(
            './/*[@class="postTitle"]/a/text()').extract()[0]
        abstract = article.xpath(
            './/*[@class="postCon"]/div/text()').extract()[0]
        href = article.xpath(
            './/*[@class="postTitle"]/a/@href').extract()[0]
        day = article.xpath(
            './/*[@class="dayTitle"]/a/text()').extract()[0]
        # print(title, '\n', day, '\n', href, '\n', abstract, '\n')
        item = CnblogspiderItem(url=href, title=title, abstract=abstract,
                                day=day)
        request = scrapy.Request(url=href, callback=self.parse_body)
        request.meta['item'] = item
        yield request
    next_page = Selector(response).re(u'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
def parse_item(self, response):
    print(response)
    papers = response.xpath(".//div[@class='day']")
    # from scrapy.shell import inspect_response
    # inspect_response(response, self)
    for paper in papers:
        url = paper.xpath(
            ".//div[@class='postTitle']/a/@href").extract_first()
        title = paper.xpath(
            ".//div[@class='postTitle']/a/text()").extract_first()
        time = paper.xpath(
            ".//div[@class='dayTitle']/a/text()").extract_first()
        content = paper.xpath(
            ".//div[@class='postCon']/div/text()").extract_first()
        item = CnblogspiderItem(url=url, title=title, time=time,
                                content=content)
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item
        yield request