def parse(self, response):
    item = CnblogItem()
    item['title'] = response.xpath('//a[@class="titlelnk"]/text()').extract()
    item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
    item['name'] = response.xpath('//div[@class="post_item_foot"]/a/text()').extract()
    yield item
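Every snippet in this section fills a CnblogItem that is declared elsewhere in the project's items.py. As context, here is a minimal sketch of such a declaration, using the field names from the snippet above; the other examples each assume their own field set, declared the same way:

import scrapy

class CnblogItem(scrapy.Item):
    # Fields used by the first example; other examples in this section
    # assume additional fields declared in the same way.
    title = scrapy.Field()
    link = scrapy.Field()
    name = scrapy.Field()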
def parse(self, response, **kwargs):
    divLst = response.xpath('//div[@id="post_list"]/div')
    for div in divLst:
        item = CnblogItem()
        item["post_author"] = div.xpath(".//div[@class='post_item_foot']/a/text()").extract_first()
        item["author_link"] = div.xpath(".//div[@class='post_item_foot']/a/@href").extract_first()
        # Strip the "发布于 " ("published on") prefix from the date text.
        item["post_date"] = div.xpath(".//div[@class='post_item_foot']/text()").extract()[1].strip().replace('发布于 ', '')
        item["comment_num"] = div.xpath(".//span[@class='article_comment']/a/text()").extract_first()
        item["view_num"] = div.xpath(".//span[@class='article_view']/a/text()").extract_first()
        item["title"] = div.xpath(".//h3/a/text()").extract_first()
        item["title_link"] = div.xpath(".//h3/a/@href").extract_first()
        # Skip the leading text node (image/whitespace) when the summary
        # has more than one text node.
        summary_lst = div.xpath(".//p[@class='post_item_summary']/text()").extract()
        if len(summary_lst) > 1:
            item["item_summary"] = summary_lst[1].strip()
        else:
            item["item_summary"] = summary_lst[0].strip()
        item["digg_num"] = div.xpath(".//span[@class='diggnum']/text()").extract_first()
        yield item
    # Follow the pagination link until there is no "Next >" anchor.
    nexturl = response.xpath('.//a[text()="Next >"]/@href').extract_first()
    if nexturl is not None:
        nexturl = 'https://www.cnblogs.com' + nexturl
        yield scrapy.Request(nexturl, callback=self.parse)
def parse(self, response):
    item = CnblogItem()  # newly added
    item['title'] = response.xpath(
        '//a[@class="titlelnk"]/text()').extract()  # modified
    item['link'] = response.xpath(
        '//a[@class="titlelnk"]/@href').extract()  # modified
    yield item  # newly added
def parse(self, response):
    article_list = response.xpath('//*[@id="post_list"]/article')
    for article in article_list:
        # Create a fresh item per article; reusing one instance across
        # yields would make every yielded item share the same data.
        item = CnblogItem()
        item['title'] = article.xpath('.//div[@class="post-item-text"]/a/text()').extract_first()
        item['link'] = article.xpath('.//div[@class="post-item-text"]/a/@href').extract_first()
        yield item
    print('Page {} done'.format(response.meta.get('page')))
def parse_item(self, response):
    article_list = response.xpath('//*[@id="post_list"]/article')
    for article in article_list:
        # Instantiate the item inside the loop so each yield carries its own data.
        item = CnblogItem()
        item['title'] = article.xpath(
            './/div[@class="post-item-text"]/a/text()').extract_first()
        item['link'] = article.xpath(
            './/div[@class="post-item-text"]/a/@href').extract_first()
        yield item
def parse(self, response):
    papers = response.xpath('.//*[@class="day"]')
    for paper in papers:
        url = paper.xpath('.//*[@class="postTitle"]/a/@href').extract()[0]
        title = paper.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
        time = paper.xpath('.//*[@class="dayTitle"]/a/text()').extract()[0]
        # Note: this reuses the title xpath, so 'content' duplicates 'title'.
        content = paper.xpath('.//*[@class="postTitle"]/a/text()').extract()[0]
        print(url, title, time, content)
        item = CnblogItem(url=url, title=title, time=time, content=content)
        yield item
def parse(self, response):
    article_list = response.xpath('//*[@id="post_list"]/article')
    for article in article_list:
        item = CnblogItem()
        item['title'] = article.xpath('.//div[@class="post-item-text"]/a/text()').extract_first()
        item['link'] = article.xpath('.//div[@class="post-item-text"]/a/@href').extract_first()
        yield item
    print('Page {} done'.format(self.page - 1))
    # Keep requesting pages until the MAX_PAGE setting is reached.
    if self.page <= self.settings.get('MAX_PAGE'):
        new_url = self.url.format(self.page)
        self.page += 1
        yield scrapy.Request(new_url, callback=self.parse, dont_filter=True)
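The pagination logic above depends on a url template, a page counter, and a MAX_PAGE setting defined elsewhere in the spider. A minimal sketch of the assumed surroundings (the template, start page, and cap are placeholders, not the original project's values):

import scrapy

class CnblogSpider(scrapy.Spider):
    name = 'cnblog'
    # Hypothetical pagination template; the original spider's URL may differ.
    url = 'https://www.cnblogs.com/sitehome/p/{}'
    start_urls = [url.format(1)]
    page = 2  # next page to request

    custom_settings = {
        'MAX_PAGE': 10,  # assumed cap on the number of pages to crawl
    }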
def detail(self, response):
    print("detail")
    item = CnblogItem()
    try:
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="cb_post_title_url"]/text()').extract()[0]
        item['sort'] = response.meta["genre"]
        # Take the full text of the post body, then extract keywords with jieba.
        data = response.xpath('//*[@id="cnblogs_post_body"]')
        item['article'] = data.xpath('string(.)').extract()[0]
        tags = jieba.analyse.extract_tags(item['article'], topK=topK)
        item['keywords'] = ','.join(tags)
    except Exception as e:
        print("%s" % e)
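This detail() callback (like the similar one at the end of the section) relies on a jieba import and a module-level topK that sit outside the excerpt. A minimal sketch of the assumed setup; the keyword count is a placeholder:

import jieba.analyse

topK = 10  # number of keywords extract_tags() keeps per article; value assumed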
def parse(self, response):
    item = CnblogItem()
    item['title'] = response.xpath(
        '//a[@class="titlelnk"]/text()').extract()  # select titles with xpath
    item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
    yield item
    print("Page {0} done".format(self.offset))
    if self.offset < 10:  # how many pages to crawl
        self.offset += 1
        url2 = self.url + str(self.offset)  # build the next page URL
        print(url2)
        yield scrapy.Request(url=url2, callback=self.parse)
def parse_page(self, response):
    item = CnblogItem()
    item['title'] = response.xpath(
        '//h1[@class="postTitle"]/a/text()').extract()
    item['author'] = response.xpath(
        '//div[@class="postDesc"]/a[1]/text()').extract()
    item['time'] = response.xpath(
        '//div[@class="postDesc"]/span[@id="post-date"]/text()').extract()
    item['comment'] = response.xpath(
        '//div[@class="postDesc"]/span[@id="post_comment_count"]/text()'
    ).extract()
    item['read_num'] = response.xpath(
        '//div[@class="postDesc"]/span[@id="post_view_count"]/text()'
    ).extract()
    item['content'] = response.xpath(
        '//div[@id="cnblogs_post_body"]//p/text()').extract()
    yield item
def parse_blog(self, response):
    for blog in response.xpath('//div[@class="post_item"]'):
        item = CnblogItem()
        item['title'] = blog.xpath('.//a[@class="titlelnk"]/text()').extract_first().strip()
        item['url'] = blog.xpath('.//a[@class="titlelnk"]/@href').extract_first()
        # Take the last text node, which excludes the image element.
        item['summary'] = blog.xpath('.//p[@class="post_item_summary"]/text()').extract()[-1].strip()
        item['id'] = blog.xpath('.//div[@class="post_item_foot"]/a/text()').extract_first()
        item['recommand'] = blog.xpath('.//span/text()').extract_first()
        # Pull the counts out of the parenthesised text in the footer spans.
        item['comment'] = blog.xpath('.//div[@class="post_item_foot"]/span[1]/a/text()').extract_first().strip().split('(')[1].split(')')[0]
        item['view'] = blog.xpath('.//div[@class="post_item_foot"]/span[2]/a/text()').extract_first().strip().split('(')[1].split(')')[0]
        yield item
def detail(self, response):
    # work_co is assumed to be a module-level counter shared across callbacks.
    global work_co
    print("detail")
    item = CnblogItem()
    item['url'] = response.url
    item['title'] = response.xpath(
        '//*[@id="cb_post_title_url"]/text()').extract()[0]
    item['sort'] = response.meta["genre"]
    data = response.xpath('//*[@id="cnblogs_post_body"]')
    item['article'] = data.xpath('string(.)').extract()[0]
    tags = jieba.analyse.extract_tags(item['article'], topK=topK)
    item['keywords'] = ','.join(tags)
    # yield item  # yielding is disabled in the original
    # Remove the finished task node and decrement the shared work counter.
    zk.delete(response.meta["task"])
    work_co -= 8
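The ZooKeeper client and work counter used above are likewise defined at module level outside the excerpt. A sketch of what that setup could look like with kazoo; the host string and counter value are assumptions:

from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')  # hypothetical ensemble address
zk.start()
work_co = 0  # shared counter decremented as detail tasks finish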