예제 #1
0
    def parse(self, response):
        """Yield a ``DoubanmoiveItem`` for each movie entry, then follow the next page.

        Every field is stored as the list returned by ``.extract()``; no
        stripping or unwrapping is done here.  After all entries have been
        yielded, the "next" link (if any) is resolved against the response
        URL, printed, and requested with this method as the callback.
        """
        # (item field, XPath relative to one ``div.item`` node), in the order
        # the fields are filled in.
        field_paths = (
            ('rank', 'div[@class="pic"]/em/text()'),
            ('title',
             'div[@class="info"]/div[@class="hd"]/a/span[@class="title"][1]/text()'),
            ('rating_num',
             'div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'),
            ('comments',
             'div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'),
            ('imageLink', 'div[@class="pic"]/a/img/@src'),
        )

        # One item per movie node found on the current page.
        for entry in response.xpath('//div[@class="item"]'):
            record = DoubanmoiveItem()
            for field, path in field_paths:
                record[field] = entry.xpath(path).extract()
            yield record

        # Pagination: stop when the page has no "next" link.
        href = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if not href:
            return
        following_url = response.urljoin(href)
        print(following_url)
        yield Request(following_url, self.parse)
예제 #2
0
 def parse(self, response):
     """Yield one item per ``doulist-collect`` node, then request the next page.

     A new ``DoubanmoiveItem`` is created for every matched node so that
     items already yielded are not overwritten by later iterations.  The
     ``title``, ``rate`` and ``author`` fields each hold the extracted text
     nodes joined with commas, with spaces and newlines removed.

     If a next-page ``href`` is found it is printed and requested with this
     method as the callback.
     """
     def compact(parts):
         # Join the extracted text nodes and drop spaces and newlines.
         return ','.join(parts).replace(' ', '').replace('\n', '')

     selector = scrapy.Selector(response)
     for each in selector.xpath('//div[@class="doulist-collect"]'):
         # NOTE(review): these expressions start with `//`, so they are
         # evaluated against the whole document rather than relative to
         # `each`. Kept as in the original; confirm whether `.//` was meant.
         title = each.xpath('//div[@class="title"]/a/text()').extract()
         rate = each.xpath(
             '//div[@class="rating"]/span[@class="rating_nums"]/text()'
         ).extract()
         author = each.xpath('//div[@class="abstract"]/text()').extract()

         # Fresh item per iteration: reusing a single instance would make
         # every yielded item the same (mutated) object.
         item = DoubanmoiveItem()
         item['title'] = compact(title)
         item['rate'] = compact(rate)
         item['author'] = compact(author)
         yield item

     nextpage = selector.xpath('//span[@class="next"]/link/@href').extract()
     # Single-argument print(...) behaves the same on Python 2 and Python 3.
     print(nextpage)
     if nextpage:
         next_url = nextpage[0]  # avoid shadowing the builtin `next`
         print(next_url)
         yield scrapy.http.Request(next_url, callback=self.parse)
예제 #3
0
    def parse(self, response):
        """Yield one ``DoubanmoiveItem`` per ``div.info`` node, then follow the next page.

        Fields written on each item:
          * ``title`` - all title ``<span>`` text nodes concatenated.
          * ``info``  - the ``bd`` paragraph text with spaces and newlines removed.
          * ``star``  - the first ``rating_num`` text, or ``''`` if absent.
          * ``quote`` - the first quote ``<span>`` text, or ``''`` if absent.

        When a next-page ``href`` is present, a request for
        ``self.url + href`` is yielded with this method as the callback.
        """
        selector = Selector(response)
        for movie in selector.xpath('//div[@class="info"]'):
            title_parts = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            info_parts = movie.xpath('div[@class="bd"]/p/text()').extract()
            star_texts = movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            quote_texts = movie.xpath('div[@class="bd"]/p/span/text()').extract()

            item = DoubanmoiveItem()
            item['title'] = ''.join(title_parts)
            item['info'] = ''.join(info_parts).replace(' ', '').replace('\n', '')
            # Take the first extracted string as a whole; indexing it again
            # would keep only its first character. Fall back to '' because
            # an entry may have no matching node (e.g. no quote).
            item['star'] = star_texts[0] if star_texts else ''
            item['quote'] = quote_texts[0] if quote_texts else ''
            yield item

        next_page = response.xpath('//span[@class="next"]/link/@href').extract()
        if next_page:
            yield Request(self.url + str(next_page[0]), callback=self.parse)
예제 #4
0
 def parse(self, response):
     """Yield a detail-page request for every ``div.hd`` title block.

     For each block, the text of the first ``<span>`` inside its first
     ``<a>`` becomes the item ``title`` and the anchor's ``href`` becomes
     ``link``.  The partially filled item travels to ``self.parse2`` through
     the request's ``meta`` under the key ``'item'``.

     Blocks without an anchor, a title span, or an ``href`` are skipped,
     since there is nothing to follow for them.
     """
     soup = BeautifulSoup(response.text, 'html.parser')
     for title_block in soup.find_all('div', attrs={'class': 'hd'}):
         anchor = title_block.find('a')
         if anchor is None:
             # find() returns None when no tag matches.
             continue
         title_span = anchor.find('span')
         link = anchor.get('href')
         if title_span is None or not link:
             continue

         item = DoubanmoiveItem()
         item['title'] = title_span.text
         item['link'] = link
         yield scrapy.Request(url=link,
                              meta={'item': item},
                              callback=self.parse2)
예제 #5
0
 def parse_item(self, response):
     """Build and return a ``DoubanmoiveItem`` from a single response.

     ``year`` holds the digits captured from a parenthesised number in the
     second heading span; every other field holds the list returned by
     ``.extract()`` for its XPath.
     """
     page = Selector(response)
     movie = DoubanmoiveItem()

     # Run every lookup first, then fill the item in a fixed field order.
     name = page.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
     year = page.xpath('//*[@id="content"]/h1/span[2]/text()').re(
         r'\((\d+)\)')
     score = page.xpath(
         '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
     director = page.xpath('//*[@id="info"]/span[1]/a/text()').extract()
     genres = page.xpath('//span[@property="v:genre"]/text()').extract()
     actor = page.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()

     movie['name'] = name
     movie['year'] = year
     movie['score'] = score
     movie['director'] = director
     movie['classification'] = genres
     movie['actor'] = actor
     return movie
예제 #6
0
    def parse_item(self, response):
        """Populate a ``DoubanmoiveItem`` from a single response and return it.

        ``name`` and the four table-driven fields store the ``.extract()``
        list for their XPath; ``year`` stores the regex captures of a
        parenthesised number taken from the second heading span.
        """
        doc = Selector(response)
        record = DoubanmoiveItem()

        record['name'] = doc.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        record['year'] = doc.xpath(
            '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')

        # Remaining fields are plain text extractions, filled in this order.
        extracted_fields = (
            ('score',
             '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'),
            ('director', '//*[@id="info"]/span[1]/span[2]/a/text()'),
            ('classification', '//span[@property="v:genre"]/text()'),
            ('actor',
             '//*[@id="info"]/span[3]/span[2]/a[@rel="v:starring"]/text()'),
        )
        for field, path in extracted_fields:
            record[field] = doc.xpath(path).extract()
        return record



# scrapy parse --spider=doubanmoive  -d 3 "http://movie.douban.com/top250"
# scrapy view  "https://movie.douban.com/top250"
# scrapy crawl doubanmoive -o items.json


# //*[@id="interest_sectl"]/div[1]/div[2]/strong