def parse(self, response):
    # Grab every movie entry (div.item) on the current page
    movie_items = response.xpath('//div[@class="item"]')
    # Walk through each entry and pack the scraped fields into an item object
    for item in movie_items:
        # Create an empty item for this movie
        movie = DoubanmoiveItem()
        movie['rank'] = item.xpath('div[@class="pic"]/em/text()').extract()
        movie['title'] = item.xpath(
            'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'
        ).extract()
        movie['rating_num'] = item.xpath(
            'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()
        movie['comments'] = item.xpath(
            'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
        ).extract()
        # if movie['comments']:
        #     movie['comments'] = movie['comments'][0].strip()
        movie['imageLink'] = item.xpath(
            'div[@class="pic"]/a/img/@src').extract()
        yield movie
    # Follow the link to the next page, if there is one
    next_page = response.xpath(
        '//span[@class="next"]/a/@href').extract_first()
    if next_page:
        request_url = response.urljoin(next_page)
        print(request_url)
        yield Request(request_url, self.parse)
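# DoubanmoiveItem is not defined in these snippets; a minimal sketch of the item
# class the parse() above assumes, declaring only the fields it actually fills
# (such a class would normally live in the project's items.py).
import scrapy

class DoubanmoiveItem(scrapy.Item):
    rank = scrapy.Field()        # ranking number from the <em> tag
    title = scrapy.Field()       # title text of the movie
    rating_num = scrapy.Field()  # score shown inside the star block
    comments = scrapy.Field()    # the one-line "inq" quote
    imageLink = scrapy.Field()   # poster image URL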
def parse(self, response):
    # print(response.body)
    selector = scrapy.Selector(response)
    books = selector.xpath('//div[@class="doulist-collect"]')
    for each in books:
        # Build a fresh item per entry (reusing one item across iterations
        # would yield the same object over and over)
        items = DoubanmoiveItem()
        # Relative paths (.//) keep each query scoped to the current entry
        title = each.xpath('.//div[@class="title"]/a/text()').extract()
        rate = each.xpath(
            './/div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract()
        author = each.xpath('.//div[@class="abstract"]/text()').extract()
        title = ','.join(title).replace(' ', '').replace('\n', '')
        rate = ','.join(rate).replace(' ', '').replace('\n', '')
        author = ','.join(author).replace(' ', '').replace('\n', '')
        items['title'] = title
        items['rate'] = rate
        items['author'] = author
        # print(title, rate, author)
        yield items
    nextpage = selector.xpath('//span[@class="next"]/link/@href').extract()
    print(nextpage)
    if nextpage:
        # Avoid shadowing the built-in next(); resolve relative paging URLs
        next_page = response.urljoin(nextpage[0])
        print(next_page)
        yield scrapy.http.Request(next_page, callback=self.parse)
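# The whitespace clean-up above could also live in an item pipeline instead of the
# spider; a sketch under that assumption. DoubanBookPipeline is a hypothetical name
# and would need to be registered under ITEM_PIPELINES in settings.py.
class DoubanBookPipeline:
    def process_item(self, item, spider):
        # Strip spaces and newlines from the text fields filled by the spider
        for key in ('title', 'rate', 'author'):
            if item.get(key):
                item[key] = item[key].replace(' ', '').replace('\n', '')
        return item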
def parse(self, response):
    selector = Selector(response)
    movies = selector.xpath('//div[@class="info"]')
    for movie in movies:
        item = DoubanmoiveItem()
        # A title can be split across several spans; join them into one string
        title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
        complete_title = ''.join(title)
        minfo = movie.xpath('div[@class="bd"]/p/text()').extract()
        star = movie.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract_first(default='')
        # Not every movie has a quote, so fall back to an empty string
        quote = movie.xpath('div[@class="bd"]/p/span/text()').extract_first(default='')
        item['title'] = complete_title
        item['info'] = ' '.join(minfo).replace(' ', '').replace('\n', '')
        item['star'] = star
        item['quote'] = quote
        yield item
    nextPage = response.xpath('//span[@class="next"]/link/@href').extract()
    if nextPage:
        yield Request(self.url + str(nextPage[0]), callback=self.parse)
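# The paging above relies on a self.url attribute that the snippet never defines;
# a sketch of the spider attributes it assumes. The spider name and URL match the
# scrapy commands quoted in the last snippet; the rest is illustrative.
import scrapy

class DoubanmoiveSpider(scrapy.Spider):
    name = 'doubanmoive'
    url = 'https://movie.douban.com/top250'
    start_urls = [url]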
def parse(self, response):
    # Parse the list page with BeautifulSoup instead of Scrapy selectors
    bs_in = BeautifulSoup(response.text, 'html.parser')
    title_list = bs_in.find_all('div', attrs={'class': 'hd'})
    for titleItem in title_list:
        item = DoubanmoiveItem()
        title = titleItem.find('a').find('span').text
        link = titleItem.find('a').get('href')
        item['title'] = title
        item['link'] = link
        # Hand the partially filled item to the detail-page callback via meta
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
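# parse2 is not included in the snippet above; a minimal sketch of what the
# detail-page callback might look like, assuming it only retrieves the item passed
# through meta and fills in whatever detail fields the project defines.
def parse2(self, response):
    item = response.meta['item']
    # ... extract detail-page fields here before yielding ...
    yield item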
def parse_item(self, response):
    sel = Selector(response)
    item = DoubanmoiveItem()
    item['name'] = sel.xpath(
        '//*[@id="content"]/h1/span[1]/text()').extract()
    item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(
        r'\((\d+)\)')
    item['score'] = sel.xpath(
        '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
    item['director'] = sel.xpath(
        '//*[@id="info"]/span[1]/a/text()').extract()
    item['classification'] = sel.xpath(
        '//span[@property="v:genre"]/text()').extract()
    item['actor'] = sel.xpath(
        '//*[@id="info"]/span[3]/a[1]/text()').extract()
    return item
def parse_item(self, response):
    sel = Selector(response)
    item = DoubanmoiveItem()
    item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
    item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
    item['score'] = sel.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()
    item['director'] = sel.xpath('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
    item['actor'] = sel.xpath('//*[@id="info"]/span[3]/span[2]/a[@rel="v:starring"]/text()').extract()
    return item

# Handy commands for testing and running the spider:
#   scrapy parse --spider=doubanmoive -d 3 "http://movie.douban.com/top250"
#   scrapy view "https://movie.douban.com/top250"
#   scrapy crawl doubanmoive -o items.json
# XPath of the score element: //*[@id="interest_sectl"]/div[1]/div[2]/strong
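# parse_item() is the usual callback name for a CrawlSpider rule; a sketch of how
# the rules might be wired up under that assumption. The link patterns below are
# illustrative guesses, not taken from the original project.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class DoubanmoiveSpider(CrawlSpider):
    name = 'doubanmoive'
    start_urls = ['https://movie.douban.com/top250']
    rules = (
        # Follow the Top 250 paging links
        Rule(LinkExtractor(allow=r'\?start=\d+')),
        # Parse each movie detail page with parse_item()
        Rule(LinkExtractor(allow=r'/subject/\d+/'), callback='parse_item'),
    )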