Example #1
    def parse(self, response):
        # Instantiate the item
        item = BlogspiderItem()
        item['name'] = response.xpath("//span[@class='ArticleTitleText']/a/text()").extract()
        item['url'] = response.xpath("//span[@class='ArticleTitleText']/a/@href").extract()
        
        # Next, use urllib and re to fetch each post's comment and read counts
        # First, a regex that extracts the URL holding the comment and click counts
        
        pat1 = '<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)">'
        
        # hcurl is the URL that holds the comment and click counts
        hcurl = re.compile(pat1).findall(str(response.body))[0]
        
        # Pretend to be a browser by setting a User-Agent
        headers2 = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400")
        
        opener = urllib.request.build_opener()
        opener.addheaders = [headers2]
        # Install the opener globally
        urllib.request.install_opener(opener)
        # data holds the click and comment counts for every post on this list page
        data = urllib.request.urlopen(hcurl).read()
        
        # pat2 extracts each post's read (click) count
        pat2 = r"click\d*?','(\d*?)'"
        # pat3 extracts each post's comment count
        pat3 = r"comment\d*?','(\d*?)'"
        
        # Extract the read and comment counts and assign them to the item's hits and comment fields
        item["hits"] = re.compile(pat2).findall(str(data))
        item["comment"] = re.compile(pat3).findall(str(data))
        
        yield item

        # Extract the total number of blog list pages
        pat4 = "blog.hexun.com/p(.*?)/"
        # The regex returns a list; the second-to-last element is the total page count
        data2 = re.compile(pat4).findall(str(response.body))
        
        if len(data2) >= 2:
            totalurl = data2[-2]
        else:
            totalurl = 1
            
        # The print below can stay commented out in normal runs; uncomment it when debugging
        
        # print("Total pages: " + str(totalurl))
        
        # Loop over the remaining list pages and crawl the posts on each
        for i in range(2, int(totalurl)+1):
            # Build the URL of the next list page to crawl
            nexturl = "http://"+str(self.uid)+".blog.hexun.com/p"+str(i)+"/default.html"
            
            # Issue the next request, again pretending to be a browser
            yield Request(nexturl, callback=self.parse, headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400"})
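The parse() method above assumes a BlogspiderItem class (plus re, urllib.request, and scrapy's Request imports) that the snippet does not show. A minimal items.py sketch consistent with the fields used here; only the field names come from the snippet, everything else is an assumption:

# items.py -- hypothetical sketch, not taken from the project
import scrapy

class BlogspiderItem(scrapy.Item):
    name = scrapy.Field()     # post titles
    url = scrapy.Field()      # post links
    hits = scrapy.Field()     # read/click counts
    comment = scrapy.Field()  # comment counts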
Example #2
 def parse_item(self, response):
     title1 = response.xpath(
         '//h2[@class="titName SG_txta"]/text()').extract()
     if len(title1) > 0:
         title = title1[0]
     else:
         title = "空"
     print("标题:", title)
     url = response.url
     print("链接:", url)
     item = BlogspiderItem()
Example #3
File: blog.py  Project: VIP-G/Scrapy
    def parse_content(self, response):
        self.log('parse_content...')
        item = BlogspiderItem()
        title = response.xpath(
            '//div[@class="articalTitle"]/h2/text()').extract()[0]
        self.log('title:' + title)
        url = response.url
        self.log('url:' + url)
        content = response.xpath(
            '//div[@class="articalContent   newfont_family"]//text()').extract(
            )
        content = ''.join(content)
        self.log(content)
        item['title'] = title
        item['url'] = url
        item['content'] = content

        yield item
Example #4
 def parse(self, response):
     papers = response.xpath(".//*[@class='day']")
     for paper in papers:
         url = paper.xpath(".//*[@class='postTitle2']/@href").extract()[0]
         title = paper.xpath(
             ".//*[@class='postTitle2']/text()").extract()[0]
         time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
         content = paper.xpath(
             ".//*[@class='c_b_p_desc']/text()").extract()[0]
         item = BlogspiderItem(url=url,
                               title=title,
                               time=time,
                               content=content)
         #request = scrapy.Request(url=url,meta={'item':item},callback=self.parse_body)
         request = scrapy.Request(url=url, callback=self.parse_body)
         request.meta['item'] = item
         yield request
     # Follow the "下一页" (next page) link after all posts on this page have been yielded
     next_page = Selector(response).re(r'<a href="(\S+?)">下一页</a>')
     if next_page:
         yield scrapy.Request(url=next_page[0], callback=self.parse)
Example #5
    def parse(self, response):
        # List for storing article info
        items = []
        soup = BeautifulSoup(response.text, "lxml")  # Parse the whole page into HTML
        first_title = soup.find(
            'h1', class_='post-title').a.text.strip()  # Grab the first post title
        # print('The first title is: %s' % first_title)
        all_title = soup.find_all('h1', class_='post-title')  # Grab every post title
        for i in range(len(all_title)):
            # Wrap the data in a BlogspiderItem (dict-like object)
            item = BlogspiderItem()
            title = all_title[i].a.text.strip()
            link = all_title[i].a['href']

            # Fill the dict-style fields
            item["title"] = title
            item["link"] = link
            # Request the post's page and pass the item along via meta
            yield scrapy.Request(url=link,
                                 meta={"item": item},
                                 callback=self.parse2)
Example #6
    def parse(self, response):
        # Part 1: save the HTML to a local file
        # print (response.text)
        # filename = "index.html"
        # with open(filename, 'w', encoding="utf-8") as f:
        #     f.write(response.text)

        # Part 2: print the post titles
        # soup = BeautifulSoup(response.text, "lxml")
        # first_title = soup.find("h1", class_= "post-title").a.text.strip()
        # print ("第一篇文章的标题是:", first_title)
        # for i in range(len(title_list)):
        #     title = title_list[i].a.text.strip()
        #     print('Post %s title: %s' % (i + 1, title))

        # Part 4: store the article content
        soup = BeautifulSoup(response.text, "lxml")
        title_list = soup.find_all("h1", class_="post-title")
        for i in range(len(title_list)):
            # Wrap the data in a BlogspiderItem (dict-like object)
            item = BlogspiderItem()
            title = title_list[i].a.text.strip()
            link = title_list[i].a["href"]
            # Fill the dict-style fields
            item["title"] = title
            item["link"] = link
            # Request the post's page and pass the item along via meta
            yield scrapy.Request(url=link,
                                 meta={'item': item},
                                 callback=self.parse2)
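Examples #5 and #6 hand each item off to a parse2 callback that the snippets do not include. A minimal sketch of what it might look like, assuming the spider's existing BeautifulSoup import and a content field on BlogspiderItem (the "post-content" selector and the field name are assumptions, not taken from the project):

    def parse2(self, response):
        # Recover the partially filled item passed along via meta
        item = response.meta["item"]
        # Assumed: the post body lives in a div with class "post-content"
        soup = BeautifulSoup(response.text, "lxml")
        body = soup.find("div", class_="post-content")
        item["content"] = body.get_text(strip=True) if body else ""
        yield item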
Example #7
 def parse(self, response):
     # Parse the listing page here
     # First, grab all the posts
     papers = response.xpath(".//*[@class='day']")
         # Extract data from each post
     for paper in papers:
         url = paper.xpath(".//*[@class='postTitle']/a/@href").extract()[0]
         title = paper.xpath(
             ".//*[@class='postTitle']/a/text()").extract()[0]
         time = paper.xpath(".//*[@class='dayTitle']/a/text()").extract()[0]
         content = paper.xpath(
             ".//*[@class='postCon']/div/text()").extract()[0]
         item = BlogspiderItem(url=url,
                               title=title,
                               time=time,
                               content=content)
         request = scrapy.Request(url=url, callback=self.parse_body)
         request.meta['item'] = item  # stash the item on the request
         yield request
     # Follow the "下一页" (next page) link
     next_page = Selector(response).re(r'<a href="(\S*)">下一页</a>')
     if next_page:
         yield scrapy.Request(url=next_page[0], callback=self.parse)
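Examples #4 and #7 both defer the full post body to a parse_body callback that is not shown. A plausible sketch, assuming the cnblogs post body sits in the element with id "cnblogs_post_body" (the selector and the reuse of the content field are assumptions):

    def parse_body(self, response):
        # The item built in parse() arrives via the request's meta dict
        item = response.meta['item']
        # Assumed selector for the full post body
        body = response.xpath("//*[@id='cnblogs_post_body']//text()").extract()
        item['content'] = ''.join(body)
        yield item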