def parse_long_comment(self, response):
    """Extract the long-form comment block from a book review page.

    The originating request stores the book URL in
    ``response.meta['bookurl']``; it is passed through as ``bookid``.
    """
    selector = Selector(response)
    item = DoubanspiderItem()
    item['bookid'] = response.meta['bookurl']
    # BUG FIX: extract() returns [] when the review block is missing, and
    # the original indexed [0] unconditionally, raising IndexError.
    comments = selector.xpath('//*[@id="link-report"]/div').extract()
    item['comment'] = comments[0].encode('utf-8') if comments else ''
    yield item
def process_music(self, response):
    """Scrape one music detail page: basic metadata plus up to five short
    remarks and up to three long reviews.

    Long-review bodies are fetched synchronously through
    ``self.get_long_remark_content``; the raw page is also dumped to
    ``feiyunzhixia.html`` for debugging.
    """
    music_name = response.xpath(
        '//*[@id="wrapper"]/h1/span/text()').extract_first()
    info = response.xpath('//*[@id="info"]')
    music_type = info.re_first(r'流派\S*\s*(..)')
    music_poster = response.xpath(
        '//*[@id="mainpic"]/span/a/img/@src').get()

    item = DoubanspiderItem()
    item['music_name'] = music_name
    item['music_url'] = response.url
    item['music_type'] = music_type
    item['music_poster'] = [music_poster]

    # Short remarks: at most 5, stop early when the li[i] node is absent.
    short_remarks_list = []
    for i in range(1, 6):
        if len(response.xpath('//*[@id="comments"]/ul/li[%i]' % i)) == 0:
            break
        short_remarks_dict = {}
        short_remarks_dict['id'] = response.xpath(
            '//*[@id="comments"]/ul/li[%i]/div/h3/span[2]/a/text()' % i).get()
        short_remarks_dict['content'] = response.xpath(
            '//*[@id="comments"]/ul/li[%i]/div/p/span/text()' % i).get()
        short_remarks_dict['star_number'] = response.xpath(
            '//*[@id="comments"]/ul/li[%i]/div/h3/span[2]/span[1]/@class' % i
        ).re_first(r'allstar(.)')
        # BUG FIX: the original formatted this XPath with the literal 1
        # ('% 1') instead of the loop index, so every short remark reused
        # the first comment's useful-count.
        short_remarks_dict['useful_number'] = response.xpath(
            '//*[@id="comments"]/ul/li[%i]/div[@class="comment"]/h3[1]/span[1]/span[1]/text()' % i).get()
        short_remarks_list.append(short_remarks_dict)
    item['short_remarks'] = short_remarks_list

    # Long reviews: at most 3, same early-exit pattern.
    long_remarks_list = []
    for i in range(1, 4):
        if len(response.xpath(
                '//div[@class="review-list "]/div[%d]' % i)) == 0:
            break
        long_remarks_dict = {}
        long_remarks_dict['id'] = response.xpath(
            '//div[@class="review-list "]/div[%d]//header[1]/a[2]/text()' % i).extract()
        long_remarks_dict['star_number'] = response.xpath(
            '//div[@class="review-list "]/div[%d]//header[1]/span[1]/@class' % i
        ).re_first(r'allstar(.)')
        long_remarks_dict['useful_number'] = response.xpath(
            '//div[@class="review-list "]/div[%d]//a[@title="有用"]/span[1]/text()' % i
        ).re_first(r'\s*([0-9]*)')
        content_url = response.xpath(
            '//div[@class="review-list "]/div[%d]//div[@class="main-bd"]/h2[1]/a[1]/@href' % i).get()
        long_remarks_dict['content'] = self.get_long_remark_content(
            content_url)
        long_remarks_list.append(long_remarks_dict)
    item['long_remarks'] = long_remarks_list

    # Debug dump of the raw page body.
    with open('feiyunzhixia.html', 'wb') as f:
        f.write(response.body)
    return item
def parse(self, response):
    """Yield one item per movie entry on the listing page, then follow
    the "next page" link relative to ``self.start_urls[0]``.
    """
    item = DoubanspiderItem()
    selector = Selector(response)
    for sel in selector.xpath('//div[@class="info"]'):
        item['chinese_title'] = sel.xpath(
            'div[@class="hd"]/a/span/text()').extract()[0]
        item['other_title'] = sel.xpath(
            'div[@class="hd"]/a/span/text()').extract()[1]
        item['link'] = sel.xpath('div[@class="hd"]/a/@href').extract()[0]
        item['star'] = sel.xpath('div[2]/div/span/text()').extract()[0]
        # NOTE(review): 'num' uses the exact same XPath and index as
        # 'star', so both fields carry the same value — confirm the
        # intended index for the vote count.
        item['num'] = sel.xpath('div[2]/div/span/text()').extract()[0]
        item['actor'] = sel.xpath(
            'string(//*[@id="content"]/div/div/ol/li/div/div[2]/div/p[1]/text()[1])'
        ).extract()[0]
        yield item
    # BUG FIX: extract()[0] raised IndexError on the last page (no next
    # link), so the falsiness guard below could never run; extract_first()
    # returns None safely instead.
    next_link = selector.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_link:
        print(next_link)
        yield Request(self.start_urls[0] + next_link, self.parse)
def parse(self, response):
    """Emit an item for each movie on this Top-250 page, then request the
    next page by bumping ``self.offset`` in steps of 25 (up to 225).
    """
    item = DoubanspiderItem()
    for movie in response.xpath("//div[@class='info']"):
        # Title, description block, and rating are always present.
        item['title'] = movie.xpath(
            ".//span[@class='title'][1]/text()").extract()[0]
        item['bd'] = movie.xpath(
            ".//div[@class='bd']/p/text()").extract()[0]
        item['star'] = movie.xpath(
            ".//div[@class='star']/span[@class='rating_num']/text()"
        ).extract()[0]
        # The one-line quote is optional on some entries.
        quote_texts = movie.xpath(
            ".//p[@class='quote']/span/text()").extract()
        if quote_texts:
            item['quote'] = quote_texts[0]
        yield item
    if self.offset <= 225:
        self.offset += 25
        yield scrapy.Request(self.url + str(self.offset),
                             callback=self.parse)
def parse(self, response):
    """Build one item per <li> entry in the grid view, then follow the
    relative "next" link appended to the Top-250 base URL.
    """
    for entry in response.xpath("//ol[@class='grid_view']/li"):
        douban_item = DoubanspiderItem()
        douban_item['movie_num'] = entry.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = entry.xpath(
            ".//div[@class='hd']/a/span[1]/text()").extract_first()
        # Whitespace-collapse each description line; the last line wins,
        # matching the original per-line overwrite.
        for raw_line in entry.xpath(
                ".//div[@class='info']/div[@class='bd']/p/text()").extract():
            douban_item['movie_introduce'] = "".join(raw_line.split())
        douban_item['movie_star'] = entry.xpath(
            ".//div[@class='item']//span[@class='rating_num']/text()"
        ).extract_first()
        douban_item['movie_eval'] = entry.xpath(
            ".//div[@class='item']//div[@class='star']/span[4]/text()"
        ).extract_first()
        douban_item['movie_image_url'] = entry.xpath(
            ".//div[@class='pic']//img/@src").extract_first()
        yield douban_item
    next_link = response.xpath(
        "//span[@class='next']/link/@href").extract()
    if next_link:
        yield scrapy.Request("https://movie.douban.com/top250" + next_link[0],
                             callback=self.parse)
def parse(self, response):
    """Yield one item per short comment on the page, then follow the last
    pagination button (the "next" link).
    """
    item = DoubanspiderItem()
    for comment in response.css('div.comment'):
        item['author'] = comment.css(
            'span.comment-info > a::text').extract_first()
        item['vote'] = comment.css(
            'span.comment-vote > span.vote-count::text').extract_first()
        item['comment'] = comment.css(
            'p.comment-content::text').extract_first()
        yield item
    # BUG FIX: extract()[-1] raised IndexError when no pagination buttons
    # exist, and since extract() yields strings the original
    # `is not None` guard could never be False; test for a non-empty
    # list instead.
    page_links = response.css('li.p a.page-btn::attr("href")').extract()
    if page_links:
        yield response.follow(page_links[-1], self.parse)
def parse(self, response):
    """Load the page's ``data-ip`` span attribute into an item via an
    ItemLoader and yield the populated item.
    """
    loader = ItemLoader(item=DoubanspiderItem(), response=response)
    loader.add_xpath('title', '//span/@data-ip')
    yield loader.load_item()
def parse(self, response):
    """For each tag-page link on the index, schedule a request to the
    corresponding book-list page, carrying deep-copied state in meta.
    """
    item = DoubanspiderItem()
    # All tag-page hrefs from the index table.
    tag_links = response.xpath(
        '//div[@class=""]//tbody/tr/td/a/@href').extract()
    for link in tag_links:
        item['book_title'] = link
        url = 'https://book.douban.com' + urllib.parse.quote(link)
        # Deep-copy both payloads so each scheduled request owns an
        # independent snapshot of the mutable item/dict.
        meta = {
            'item1': copy.deepcopy(item),
            'baseurl': copy.deepcopy({'url': url}),
        }
        yield scrapy.Request(url=url,
                             meta=meta,
                             callback=self.get_booklinks,
                             dont_filter=True)
def parse(self, response):
    """Yield one item per movie info block, then advance ``self.start``
    by 25 (up to 225) and request the next listing page.
    """
    item = DoubanspiderItem()
    for m in response.xpath("//div[@class='info']"):
        title = m.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()').extract()
        content = m.xpath('div[@class="bd"]/p/text()').extract()
        score = m.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        info = m.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()').extract()
        item['title'] = title[0]
        item['content'] = ';'.join(content)
        item['score'] = score[0]
        # BUG FIX: some entries have no one-line quote, so info is [] and
        # the original info[0] raised IndexError, aborting the whole page
        # (the sibling Top-250 spider in this file already guards this).
        item['info'] = info[0] if info else ''
        yield item
    if self.start <= 225:
        self.start += 25
        yield scrapy.Request(self.url + str(self.start) + self.end,
                             callback=self.parse)
def parse(self, response):
    """Parse one JSON chart page; for non-excerpt pages yield a numbered
    item per book, then follow the next page number in the URL path.

    NOTE(review): pagination is unconditional — presumably an upstream
    mechanism (dupe filter / error page) stops it; confirm.
    """
    payload = json.loads(response.body)
    res = payload['res']
    item = DoubanspiderItem()
    if res['kind_str'] != 'excerpt':
        item['sort_name'] = res['payload']['title']
        # 1-based position within this page's subject list.
        for position, book in enumerate(res['subjects'], start=1):
            item['number'] = position
            item['title'] = book['title']
            item['rating'] = book['rating']
            yield item
    # The last URL path segment is the page number; request page + 1.
    next_page = int(response.url.split('/')[-1]) + 1
    yield scrapy.Request(response.urljoin(str(next_page)),
                         callback=self.parse)
def parse(self, response):
    # Python 2 spider: yield one item per short comment on this page,
    # then paginate up to the module-level `max_page` constant.
    selector = Selector(response)
    # What we extract here is a list of comment <li> nodes.
    comments = selector.xpath('//*[@id="comments"]/ul/li')
    bookid = response.meta['book_id']
    # print comments
    for comment in comments:
        print comment
        item = DoubanspiderItem()
        item['bookid'] = 'https://book.douban.com/subject/' + str(bookid)
        print item['bookid']
        try:
            # /text() pulls out just the text content of the node.
            item['comment'] = comment.xpath('div[2]/p/text()').extract()[0].encode('utf-8')
            print item['comment']
        except:
            # NOTE(review): bare except also hides unrelated errors; the
            # intent is a best-effort fallback when the comment is missing.
            item['comment'] = "没有评论信息"
            print "没有评论信息"
        yield item
        # Throttle slightly between items to be polite to the server.
        time.sleep(0.05)
    # Follow the next comments page while below max_page (module global).
    if response.meta['page'] < max_page:
        page = response.meta['page'] + 1
        url = 'https://book.douban.com/subject/{}/comments/hot?p={}'.format(bookid, page)
        print 'page:', page
        yield scrapy.Request(url=url, headers={'user-agent': 'Mozilla/5.0'}, meta={'book_id': bookid, 'page': page})
def parse_item(self, response):
    """Extract movie detail fields from a subject page into one item.

    Every field keeps the raw extract() list except ``year``, which is
    pulled out of the parenthesised heading via a regex.
    """
    selector = Selector(response)
    item = DoubanspiderItem()
    # Plain text-extraction fields: one XPath each.
    field_paths = (
        ('name', '//*[@id="content"]/h1/span[1]/text()'),
        ('score', '//*[@id="interest_sectl"]/div/div[2]/strong/text()'),
        ('director', '//*[@id="info"]/span[1]/span/a/text()'),
        ('celebrity', '//*[@id="info"]/span[2]/span/a/text()'),
        ('classification', '//span[@property="v:genre"]/text()'),
        ('actor', '//*[@id="info"]/span[3]//span/a/text()'),
        ('date', '//span[@property="v:initialReleaseDate"]/text()'),
        ('len_time', '//span[@property="v:runtime"]/text()'),
    )
    for field, path in field_paths:
        item[field] = selector.xpath(path).extract()
    # Year is the number in parentheses after the title.
    item['year'] = selector.xpath(
        '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    return item
def parse_item(self, response):
    """Populate a movie-review item from a review page via ItemLoader.

    The movie id is the first run of digits in the review URL; the full
    review body is joined from every text node under #review-content.
    """
    loader = ItemLoader(item=DoubanspiderItem(), response=response)
    loader.add_xpath('movie_name', '//header/a[2]/text()')
    loader.add_value('movie_id', re.search('[0-9]+', response.url).group())
    loader.add_xpath('comment_rate',
                     '//span[contains(@class,"main-title-rating")]/@title')
    loader.add_xpath('comment_head', '//span[@property="v:summary"]/text()')
    loader.add_xpath('comment_data',
                     '//div[@id="review-content"]//text()', Join())
    loader.add_value('comment_url', response.url)
    loader.add_xpath('people_name',
                     '//div[@id="review-content"]/@data-author')
    return loader.load_item()