def parse(self, response):
    # Instantiate the item
    item = BlogspiderItem()
    item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
    item['url'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
    # Next, use urllib and re to fetch each post's comment and hit counts
    # First, the regex that extracts the URL holding the comment and hit counts
    pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
    # hcurl is the URL that stores the comment and hit counts
    hcurl = re.compile(pat1).findall(str(response.body))[0]
    # Pretend to be a browser
    headers2 = ("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers2]
    # Install the opener globally
    urllib.request.install_opener(opener)
    # data holds the hit and comment counts for every post on this list page
    data = urllib.request.urlopen(hcurl).read()
    # pat2 extracts each post's hit (read) count
    pat2 = r"click\d*?','(\d*?)'"
    # pat3 extracts each post's comment count
    pat3 = r"comment\d*?','(\d*?)'"
    # Extract the hit and comment counts and assign them to the item's hits and comment fields
    item["hits"] = re.compile(pat2).findall(str(data))
    item["comment"] = re.compile(pat3).findall(str(data))
    yield item
    # Extract the total number of blog list pages
    pat4 = "blog.hexun.com/p(.*?)/"
    # The regex returns a list; its second-to-last element is the total page count
    data2 = re.compile(pat4).findall(str(response.body))
    if len(data2) >= 2:
        totalurl = data2[-2]
    else:
        totalurl = 1
    # In normal runs the print on the next line can stay commented out; enable it while debugging
    # print("There are " + str(totalurl) + " pages in total")
    # Crawl each remaining blog list page in turn
    for i in range(2, int(totalurl) + 1):
        # Build the URL of the next list page to crawl
        nexturl = "http://" + str(self.uid) + ".blog.hexun.com/p" + str(i) + "/default.html"
        # Crawl the next page, again pretending to be a browser
        yield Request(nexturl, callback=self.parse, headers={
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400"})
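# --- Reference sketch (not from the original source) ---
# The parse() method above assumes "import re", "import urllib.request",
# "from scrapy.http import Request", and an import of BlogspiderItem from the
# project's items module at the top of the spider file. Because the snippets in
# this section assign several different item fields, here is a minimal items.py
# sketch inferred from those field names; the real project may define the item
# differently.
import scrapy


class BlogspiderItem(scrapy.Item):
    # Fields used by the Hexun list-page spider above
    name = scrapy.Field()
    url = scrapy.Field()
    hits = scrapy.Field()
    comment = scrapy.Field()
    # Fields used by the other spiders in this section
    title = scrapy.Field()
    link = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()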
def parse_item(self, response):
    title1 = response.xpath('//h2[@class="titName SG_txta"]/text()').extract()
    if len(title1) > 0:
        title = title1[0]
    else:
        title = "empty"
    print("Title:", title)
    url = response.url
    print("Link:", url)
    item = BlogspiderItem()
    # The original snippet stops after instantiating the item; the two
    # assignments and the yield below are an assumed completion.
    item['title'] = title
    item['url'] = url
    yield item
def parse_content(self, response):
    self.log('parse_content...')
    item = BlogspiderItem()
    title = response.xpath('//div[@class="articalTitle"]/h2/text()').extract()[0]
    self.log('title:' + title)
    url = response.url
    self.log('url:' + url)
    content = response.xpath('//div[@class="articalContent newfont_family"]//text()').extract()
    content = ''.join(content)
    self.log(content)
    item['title'] = title
    item['url'] = url
    item['content'] = content
    yield item
def parse(self, response):
    # Grab every article block on the list page
    papers = response.xpath(".//*[@class='day']")
    for paper in papers:
        url = paper.xpath(".//*[@class='postTitle2']/@href").extract()[0]
        title = paper.xpath(".//*[@class='postTitle2']/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        content = paper.xpath(".//*[@class='c_b_p_desc']/text()").extract()[0]
        item = BlogspiderItem(url=url, title=title, time=time, content=content)
        # request = scrapy.Request(url=url, meta={'item': item}, callback=self.parse_body)
        request = scrapy.Request(url=url, callback=self.parse_body)
        request.meta['item'] = item
        yield request
    # '下一页' is the site's "next page" link text
    next_page = Selector(response).re(r'<a href="(\S+?)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)
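# --- Reference sketch (not from the original source) ---
# The parse_body callback used above is not shown in this section. A minimal
# sketch of what it might do is given below: it picks the item back up from
# response.meta and fills in the full post text. The 'postBody' selector and
# the decision to overwrite the 'content' field are assumptions for
# illustration only.
def parse_body(self, response):
    # Retrieve the partially filled item stashed on the request
    item = response.meta['item']
    # Assumed selector: the real post page may use a different container class
    body = response.xpath(".//*[@class='postBody']//text()").extract()
    item['content'] = ''.join(body)
    yield item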
def parse(self, response):
    # List that could hold article info (unused in this version)
    items = []
    # Parse the whole page into a BeautifulSoup tree
    soup = BeautifulSoup(response.text, "lxml")
    # Grab the first article title
    first_title = soup.find('h1', class_='post-title').a.text.strip()
    # print('The first title is: %s' % (first_title))
    # Grab all article titles
    all_title = soup.find_all('h1', class_='post-title')
    for i in range(len(all_title)):
        # Wrap the data in a BlogspiderItem object (dict-like)
        item = BlogspiderItem()
        title = all_title[i].a.text.strip()
        link = all_title[i].a['href']
        # Fill the dict-like item
        item["title"] = title
        item["link"] = link
        # Follow the article link and pass the item along via the request
        yield scrapy.Request(url=link, meta={"item": item}, callback=self.parse2)
def parse(self, response):
    # Part 1: save the HTML to a local file
    # print(response.text)
    # filename = "index.html"
    # with open(filename, 'w', encoding="utf-8") as f:
    #     f.write(response.text)

    # Part 2: print the article titles
    # soup = BeautifulSoup(response.text, "lxml")
    # first_title = soup.find("h1", class_="post-title").a.text.strip()
    # print("The title of the first article is:", first_title)
    # title_list = soup.find_all("h1", class_="post-title")
    # for i in range(len(title_list)):
    #     title = title_list[i].a.text.strip()
    #     print('The title of article %s is: %s' % (i + 1, title))

    # Part 3: identical to Part 2 in the original, omitted here

    # Part 4: store the article content
    soup = BeautifulSoup(response.text, "lxml")
    title_list = soup.find_all("h1", class_="post-title")
    for i in range(len(title_list)):
        # Wrap the data in a BlogspiderItem object (dict-like)
        item = BlogspiderItem()
        title = title_list[i].a.text.strip()
        link = title_list[i].a["href"]
        # Fill the dict-like item
        item["title"] = title
        item["link"] = link
        # Follow the article link and pass the item along via the request
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
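# --- Reference sketch (not from the original source) ---
# The parse2 callback used by the two BeautifulSoup spiders above is not shown
# in this section. A minimal sketch of what it might do is given below: it
# takes the item passed through response.meta and fills in the article body.
# The 'post-content' class is an assumption for illustration; the real article
# page may use a different container.
def parse2(self, response):
    # Retrieve the item passed along from parse() via meta
    item = response.meta['item']
    soup = BeautifulSoup(response.text, "lxml")
    body = soup.find('div', class_='post-content')  # assumed container class
    item['content'] = body.get_text(strip=True) if body else ''
    yield item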
def parse(self, response):
    # Parse the list page here
    # First, grab every article block
    papers = response.xpath(".//*[@class='day']")
    # Extract the data of each article
    for paper in papers:
        url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
        title = paper.xpath(".//*[@class='postTitle']/a/text()").extract()[0]
        time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
        content = paper.xpath(".//*[@class='postCon']/div/text()").extract()[0]
        item = BlogspiderItem(url=url, title=title, time=time, content=content)
        request = scrapy.Request(url=url, callback=self.parse_body)
        # Stash the item on the request so parse_body can finish filling it
        request.meta['item'] = item
        yield request
    # '下一页' is the site's "next page" link text
    next_page = Selector(response).re(r'<a href="(\S*)">下一页</a>')
    if next_page:
        yield scrapy.Request(url=next_page[0], callback=self.parse)