def parse(self, response):
        """Collect all post titles, links and author names on the page into one item.

        Each field holds the complete list of matches returned by its XPath
        query, so the single yielded item describes the whole listing page.
        """
        # (item field, XPath query) pairs, evaluated in this order.
        field_queries = (
            ('title', '//a[@class="titlelnk"]/text()'),
            ('link', '//a[@class="titlelnk"]/@href'),
            ('name', '//div[@class="post_item_foot"]/a/text()'),
        )

        item = CnblogItem()
        for field, query in field_queries:
            item[field] = response.xpath(query).extract()

        yield item
    def parse(self, response, **kwargs):
        """Yield one item per post on the listing page, then follow the next page.

        For every ``div`` under ``#post_list`` a ``CnblogItem`` is filled with
        the author, author link, publish date, comment/view/digg counters,
        title, title link and summary. Fields whose node is missing are set to
        ``None`` instead of raising, so one malformed post does not abort the
        page. After all posts have been yielded, the "Next >" link (if any) is
        followed once with this method as the callback.
        """
        for post in response.xpath('//div[@id="post_list"]/div'):
            item = CnblogItem()
            item["post_author"] = post.xpath(".//div[@class='post_item_foot']/a/text()").extract_first()
            item["author_link"] = post.xpath(".//div[@class='post_item_foot']/a/@href").extract_first()

            # The date is read from the second text node of the footer; guard
            # against footers that do not have one.
            foot_texts = post.xpath(".//div[@class='post_item_foot']/text()").extract()
            if len(foot_texts) > 1:
                item["post_date"] = foot_texts[1].strip().replace('发布于 ', '')
            else:
                item["post_date"] = None

            item["comment_num"] = post.xpath(".//span[@class='article_comment']/a/text()").extract_first()
            item["view_num"] = post.xpath(".//span[@class='article_view']/a/text()").extract_first()
            item["title"] = post.xpath(".//h3/a/text()").extract_first()
            item["title_link"] = post.xpath(".//h3/a/@href").extract_first()

            # When the summary has several text nodes the second one is used
            # (presumably the first precedes an embedded image -- TODO confirm);
            # an empty summary yields None instead of an IndexError.
            summary_parts = post.xpath(".//p[@class='post_item_summary']/text()").extract()
            if len(summary_parts) > 1:
                item["item_summary"] = summary_parts[1].strip()
            elif summary_parts:
                item["item_summary"] = summary_parts[0].strip()
            else:
                item["item_summary"] = None

            item["digg_num"] = post.xpath(".//span[@class='diggnum']/text()").extract_first()
            yield item

        # Pagination is handled once per page, after the posts, rather than
        # being re-evaluated for every post inside the loop.
        next_href = response.xpath('.//a[text()="Next >"]/@href').extract_first()
        if next_href is not None:
            # urljoin resolves the href against the URL of this response.
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
示例#3
0
 def parse(self, response):
     """Yield a single item holding every post title and link on the page.

     Both fields are lists: one entry per ``a.titlelnk`` element found.
     """
     item = CnblogItem()

     all_titles = response.xpath('//a[@class="titlelnk"]/text()').extract()
     item['title'] = all_titles

     all_links = response.xpath('//a[@class="titlelnk"]/@href').extract()
     item['link'] = all_links

     yield item
示例#4
0
 def parse(self, response):
     """Yield one item (title and link) per article in ``#post_list``.

     After all articles have been yielded, a completion message is printed
     using the ``page`` value stored in ``response.meta``.
     """
     for article in response.xpath('//*[@id="post_list"]/article'):
         # A fresh item per article: reusing a single instance would let
         # later iterations overwrite the fields of items already yielded.
         item = CnblogItem()
         item['title'] = article.xpath('.//div[@class="post-item-text"]/a/text()').extract_first()
         item['link'] = article.xpath('.//div[@class="post-item-text"]/a/@href').extract_first()
         yield item
     print('第' + str(response.meta.get('page')) + '页完成')
示例#5
0
 def parse_item(self, response):
     """Yield one item (title and link) per article in ``#post_list``."""
     for article in response.xpath('//*[@id="post_list"]/article'):
         # A fresh item per article: reusing a single instance would let
         # later iterations overwrite the fields of items already yielded.
         item = CnblogItem()
         item['title'] = article.xpath(
             './/div[@class="post-item-text"]/a/text()').extract_first()
         item['link'] = article.xpath(
             './/div[@class="post-item-text"]/a/@href').extract_first()
         yield item
 def parse(self, response):
     """Yield one item per ``day`` block: post URL, title, day title and content.

     Blocks without a post link are skipped, and other missing nodes become
     ``None``, so one incomplete block no longer aborts the whole page with
     an IndexError.
     """
     for day in response.xpath('.//*[@class="day"]'):
         url = day.xpath('.//*[@class="postTitle"]/a/@href').extract_first()
         if url is None:
             # No post link in this block, so there is nothing to record.
             continue
         title = day.xpath('.//*[@class="postTitle"]/a/text()').extract_first()
         posted = day.xpath('.//*[@class="dayTitle"]/a/text()').extract_first()
         # NOTE(review): the original read `content` with the same XPath as
         # the title; that is preserved here -- confirm it is intended.
         content = title
         print(url, title, posted, content)
         yield CnblogItem(url=url, title=title, time=posted, content=content)
示例#7
0
    def parse(self, response):
        """Yield one item per article, then request the next page if allowed.

        After the articles of this page, a completion message is printed for
        page ``self.page - 1``. While ``self.page`` does not exceed the
        ``MAX_PAGE`` setting, the URL built from ``self.url.format(self.page)``
        is requested with this method as callback and ``self.page`` is
        incremented.
        """
        for article in response.xpath('//*[@id="post_list"]/article'):
            # A fresh item per article: reusing a single instance would let
            # later iterations overwrite the fields of items already yielded.
            item = CnblogItem()
            item['title'] = article.xpath('.//div[@class="post-item-text"]/a/text()').extract_first()
            item['link'] = article.xpath('.//div[@class="post-item-text"]/a/@href').extract_first()
            yield item
        print('第' + str(self.page-1) + '页完成')

        if self.page <= self.settings.get('MAX_PAGE'):
            next_url = self.url.format(self.page)
            self.page += 1
            # dont_filter=True bypasses the duplicate-request filter.
            yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
示例#8
0
 def detail(self, response):
     """Fill a CnblogItem from a post detail page and extract its keywords.

     Reads the URL, the UTF-8 encoded title, the ``genre`` value from
     ``response.meta`` (stored as ``sort``) and the full text of the post
     body, then joins the tags returned by ``jieba.analyse.extract_tags``
     into a comma-separated ``keywords`` field. ``topK`` is a name defined
     outside this method. Any exception raised while filling the item is
     printed and swallowed.

     NOTE(review): in the visible code the item is neither yielded nor
     returned -- confirm whether that is intended.
     """
     # print(...) with one argument and `except ... as` are valid on both
     # Python 2.6+ and Python 3, unlike the statement forms used before.
     print("detail")
     item = CnblogItem()
     try:
         item['url'] = response.url
         item['title'] = response.xpath(
             '//*[@id="cb_post_title_url"]/text()').extract()[0].encode(
                 'utf-8')
         item['sort'] = response.meta["genre"]
         body = response.xpath('//*[@id="cnblogs_post_body"]')
         # string(.) concatenates all descendant text of the post body.
         item['article'] = body.xpath('string(.)').extract()[0]
         tags = jieba.analyse.extract_tags(item['article'], topK=topK)
         item['keywords'] = ','.join(tags)
     except Exception as e:
         print("%s" % e)
    def parse(self, response):
        """Yield all titles and links of the page, then request the next page.

        One item is produced per page; its ``title`` and ``link`` fields are
        lists with one entry per ``a.titlelnk`` element. While ``self.offset``
        is below 10 it is incremented and the page at
        ``self.url + str(self.offset)`` is requested with this method as the
        callback.
        """
        item = CnblogItem()
        item['title'] = response.xpath(
            '//a[@class="titlelnk"]/text()').extract()
        item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()

        yield item

        print("第{0}页爬取完成".format(self.offset))
        if self.offset < 10:  # last page number to crawl
            self.offset += 1
            # Build and request the next page only while pages remain;
            # previously the last page re-requested itself and the crawl
            # stopped only because of the duplicate-request filter.
            next_url = self.url + str(self.offset)
            print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
示例#10
0
    def parse_page(self, response):
        """Yield one item describing a single post page.

        Every field is the list of all matches of its XPath query: title,
        author, time, comment, read_num and the text of the paragraphs inside
        the post body (``content``).
        """
        # (item field, XPath query) pairs, evaluated in this order.
        field_queries = (
            ('title', '//h1[@class="postTitle"]/a/text()'),
            ('author', '//div[@class="postDesc"]/a[1]/text()'),
            ('time', '//div[@class="postDesc"]/span[@id="post-date"]/text()'),
            ('comment',
             '//div[@class="postDesc"]/span[@id="post_comment_count"]/text()'),
            ('read_num',
             '//div[@class="postDesc"]/span[@id="post_view_count"]/text()'),
            ('content', '//div[@id="cnblogs_post_body"]//p/text()'),
        )

        item = CnblogItem()
        for field, query in field_queries:
            item[field] = response.xpath(query).extract()
        yield item
示例#11
0
	def parse_blog(self, response):
		"""Yield one CnblogItem per ``post_item`` block on the listing page.

		Each item carries title, url, summary, id, recommand, comment and
		view. Missing or malformed nodes produce ``None`` for that field
		instead of raising, so one incomplete post does not abort the rest of
		the page.
		"""
		def stripped(text):
			# Strip surrounding whitespace, passing a missing value through.
			return text.strip() if text is not None else None

		def count_in_parens(text):
			# Return what sits between the first "(" and the next ")",
			# or None when the text is missing or has no "(".
			if text is None:
				return None
			pieces = text.strip().split('(')
			if len(pieces) < 2:
				return None
			return pieces[1].split(')')[0]

		for blog in response.xpath('//div[@class="post_item"]'):
			item = CnblogItem()
			item['title'] = stripped(blog.xpath('.//a[@class = "titlelnk"]/text()').extract_first())
			item['url'] = blog.xpath('.//a[@class = "titlelnk"]/@href').extract_first()
			# Take the last text node, which does not include the image element.
			summary_parts = blog.xpath('.//p[@class = "post_item_summary"]/text()').extract()
			item['summary'] = summary_parts[-1].strip() if summary_parts else None
			item['id'] = blog.xpath('.//div[@class = "post_item_foot"]/a/text()').extract_first()
			item['recommand'] = blog.xpath('.//span/text()').extract_first()
			item['comment'] = count_in_parens(
				blog.xpath('.//div[@class = "post_item_foot"]/span[1]/a/text()').extract_first())
			item['view'] = count_in_parens(
				blog.xpath('.//div[@class = "post_item_foot"]/span[2]/a/text()').extract_first())

			yield item
示例#12
0
    def detail(self, response):
        """Fill a CnblogItem from a post detail page, then release the task.

        Reads the URL, the UTF-8 encoded title, the ``genre`` value from
        ``response.meta`` (stored as ``sort``) and the full text of the post
        body, and joins the tags returned by ``jieba.analyse.extract_tags``
        into a comma-separated ``keywords`` field. Afterwards the entry named
        by ``response.meta["task"]`` is deleted through ``zk`` and ``work_co``
        is decreased by 8. ``topK``, ``zk`` and ``work_co`` are names defined
        outside this method.
        """
        # work_co is rebound below; without this declaration the augmented
        # assignment makes it a local name and raises UnboundLocalError.
        # Assumes work_co is a module-level counter -- TODO confirm.
        global work_co

        # print(...) with one argument is valid on both Python 2 and Python 3.
        print("detail")
        item = CnblogItem()

        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="cb_post_title_url"]/text()').extract()[0].encode('utf-8')
        item['sort'] = response.meta["genre"]
        body = response.xpath('//*[@id="cnblogs_post_body"]')
        # string(.) concatenates all descendant text of the post body.
        item['article'] = body.xpath('string(.)').extract()[0]
        tags = jieba.analyse.extract_tags(item['article'], topK=topK)
        item['keywords'] = ','.join(tags)

        # NOTE(review): the item is not yielded (the yield was commented out
        # in the original) -- confirm whether it should be emitted.
        zk.delete(response.meta["task"])
        work_co -= 8