Пример #1
0
 def parse_long_comment(self, response):
     """Yield one item carrying the full comment body of a book page.

     The originating book URL is forwarded by the previous request in
     ``response.meta['bookurl']``.
     """
     item = DoubanspiderItem()
     item['bookid'] = response.meta['bookurl']
     # First div under #link-report holds the long-comment text.
     comment_nodes = Selector(response).xpath('//*[@id="link-report"]/div')
     item['comment'] = comment_nodes.extract()[0].encode('utf-8')
     yield item
Пример #2
0
    def process_music(self, response):
        """Parse a Douban music subject page into one DoubanspiderItem.

        Collects name/url/type/poster plus up to 5 short remarks and up to
        3 long remarks; long-remark bodies are fetched separately via
        ``self.get_long_remark_content``.
        """
        item = DoubanspiderItem()
        item['music_name'] = response.xpath(
            '//*[@id="wrapper"]/h1/span/text()').extract_first()
        item['music_url'] = response.url
        # Genre line inside the #info block (two characters after the label).
        item['music_type'] = response.xpath(
            '//*[@id="info"]').re_first(r'流派\S*\s*(..)')
        item['music_poster'] = [response.xpath(
            '//*[@id="mainpic"]/span/a/img/@src').get()]

        # Short remarks: at most the first 5 <li> entries under #comments.
        short_remarks_list = []
        for i in range(1, 6):
            li = '//*[@id="comments"]/ul/li[%i]' % i
            if len(response.xpath(li)) == 0:
                break
            short_remarks_list.append({
                'id': response.xpath(li + '/div/h3/span[2]/a/text()').get(),
                'content': response.xpath(li + '/div/p/span/text()').get(),
                'star_number': response.xpath(
                    li + '/div/h3/span[2]/span[1]/@class').re_first(r'allstar(.)'),
                # BUG FIX: the original formatted this path with the literal
                # 1 ("% 1"), so every remark reported li[1]'s useful count.
                'useful_number': response.xpath(
                    li + '/div[@class="comment"]/h3[1]/span[1]/span[1]/text()').get(),
            })
        item['short_remarks'] = short_remarks_list

        # Long remarks: at most the first 3 review entries.
        long_remarks_list = []
        for i in range(1, 4):
            div = '//div[@class="review-list  "]/div[%d]' % i
            if len(response.xpath(div)) == 0:
                break
            # The full review body lives on its own page.
            content_url = response.xpath(
                div + '//div[@class="main-bd"]/h2[1]/a[1]/@href').get()
            long_remarks_list.append({
                'id': response.xpath(div + '//header[1]/a[2]/text()').extract(),
                'star_number': response.xpath(
                    div + '//header[1]/span[1]/@class').re_first(r'allstar(.)'),
                'useful_number': response.xpath(
                    div + '//a[@title="有用"]/span[1]/text()').re_first(r'\s*([0-9]*)'),
                'content': self.get_long_remark_content(content_url),
            })
        item['long_remarks'] = long_remarks_list

        # Debug dump of the raw page body; consider removing in production.
        with open('feiyunzhixia.html', 'wb') as f:
            f.write(response.body)

        return item
Пример #3
0
 def parse(self, response):
     """Parse one Top-250 listing page, yielding one item per movie.

     After all movies on the page are emitted, follows the "next page"
     link (a relative href appended to ``start_urls[0]``) when present.
     """
     selector = Selector(response)
     for sel in selector.xpath('//div[@class="info"]'):
         # BUG FIX: a single item was reused across yields, so queued items
         # all shared (and overwrote) the same data — create one per movie.
         item = DoubanspiderItem()
         item['chinese_title'] = sel.xpath(
             'div[@class="hd"]/a/span/text()').extract()[0]
         item['other_title'] = sel.xpath(
             'div[@class="hd"]/a/span/text()').extract()[1]
         item['link'] = sel.xpath('div[@class="hd"]/a/@href').extract()[0]
         # NOTE(review): 'star' and 'num' use the identical XPath and index,
         # so both fields hold the same value — confirm the intended indices.
         item['star'] = sel.xpath('div[2]/div/span/text()').extract()[0]
         item['num'] = sel.xpath('div[2]/div/span/text()').extract()[0]
         item['actor'] = sel.xpath(
             'string(//*[@id="content"]/div/div/ol/li/div/div[2]/div/p[1]/text()[1])'
         ).extract()[0]
         yield item

     # BUG FIX: pagination used to run inside the movie loop (scheduling the
     # same next-page request once per movie) and indexed extract()[0]
     # unconditionally, which raised IndexError on the last page.
     next_links = selector.xpath('//span[@class="next"]/a/@href').extract()
     if next_links:
         print(next_links[0])
         yield Request(self.start_urls[0] + next_links[0], self.parse)
Пример #4
0
    def parse(self, response):
        """Yield one item per movie on a Top-250 page, then page forward.

        Pagination advances ``self.offset`` by 25 up to 225 and re-requests
        ``self.url + offset`` with this method as callback.
        """
        movies = response.xpath("//div[@class='info']")

        for each in movies:
            # BUG FIX: the item was created once outside the loop, so a movie
            # without a quote inherited the previous movie's stale 'quote'
            # (and all queued items shared one mutable object).
            item = DoubanspiderItem()
            # Title
            item['title'] = each.xpath(
                ".//span[@class='title'][1]/text()").extract()[0]
            # Info line (director / year / genre)
            item['bd'] = each.xpath(
                ".//div[@class='bd']/p/text()").extract()[0]
            # Rating
            item['star'] = each.xpath(
                ".//div[@class='star']/span[@class='rating_num']/text()"
            ).extract()[0]
            # One-line quote — not every movie has one.
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if quote:
                item['quote'] = quote[0]
            yield item

        if self.offset <= 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
Пример #5
0
    def parse(self, response):
        """Yield one item per movie entry, then follow the "next" link."""
        for entry in response.xpath("//ol[@class='grid_view']/li"):
            item = DoubanspiderItem()
            item['movie_num'] = entry.xpath(
                ".//div[@class='item']//em/text()").extract_first()
            item['movie_name'] = entry.xpath(
                ".//div[@class='hd']/a/span[1]/text()").extract_first()
            # Each text fragment overwrites the last (whitespace collapsed),
            # so the final fragment of the description wins.
            for fragment in entry.xpath(
                    ".//div[@class='info']/div[@class='bd']/p/text()").extract():
                item['movie_introduce'] = "".join(fragment.split())
            item['movie_star'] = entry.xpath(
                ".//div[@class='item']//span[@class='rating_num']/text()"
            ).extract_first()
            item['movie_eval'] = entry.xpath(
                ".//div[@class='item']//div[@class='star']/span[4]/text()"
            ).extract_first()
            item['movie_image_url'] = entry.xpath(
                ".//div[@class='pic']//img/@src").extract_first()
            yield item

        # Schedule the next listing page when a "next" link exists.
        links = response.xpath("//span[@class='next']/link/@href").extract()
        if links:
            yield scrapy.Request("https://movie.douban.com/top250" + links[0],
                                 callback=self.parse)
Пример #6
0
    def parse(self, response):
        """Yield one item per short comment, then follow the last page button.

        Fixes two defects: a single shared item was mutated across yields,
        and ``extract()[-1]`` raised IndexError when no page buttons exist
        (the ``is not None`` guard afterwards could never fire).
        """
        for comment in response.css('div.comment'):
            # Fresh item per comment so queued items do not share state.
            item = DoubanspiderItem()
            item['author'] = comment.css(
                'span.comment-info > a::text').extract_first()
            item['vote'] = comment.css(
                'span.comment-vote > span.vote-count::text').extract_first()
            item['comment'] = comment.css(
                'p.comment-content::text').extract_first()
            yield item

        # The last .page-btn link is the "next" button; it is absent on the
        # final page, which previously crashed the callback.
        page_links = response.css('li.p a.page-btn::attr("href")').extract()
        if page_links:
            yield response.follow(page_links[-1], self.parse)
Пример #7
0
    def parse(self, response):
        """Load and yield a single item whose 'title' is taken from span/@data-ip."""
        loader = ItemLoader(item=DoubanspiderItem(), response=response)
        loader.add_xpath('title', '//span/@data-ip')
        yield loader.load_item()


#    def get_cookie(self):
#        with open ('cookie.txt','r') as f:
#            cookies = {}
#            for line in f.read().split(';'):
#                name,value = line.strip().split('=',1)
#                cookies[name] = value
#        print(cookies)
#        return cookies
Пример #8
0
    def parse(self, response):
        """For each tag link on the index page, request its book-list page.

        Each request carries a deep-copied item and base-url dict in
        ``meta`` so concurrently scheduled callbacks never share mutable
        state; ``get_booklinks`` handles the responses.
        """
        item = DoubanspiderItem()
        # All tag-page hrefs from the table of categories.
        tag_links = response.xpath(
            '//div[@class=""]//tbody/tr/td/a/@href').extract()

        for link in tag_links:
            item['book_title'] = link
            target = 'https://book.douban.com' + urllib.parse.quote(link)
            yield scrapy.Request(
                url=target,
                meta={'item1': copy.deepcopy(item),
                      'baseurl': copy.deepcopy({'url': target})},
                callback=self.get_booklinks,
                dont_filter=True,
            )
Пример #9
0
    def parse(self, response):
        """Yield one item per movie block, then request the next offset page.

        Pagination advances ``self.start`` by 25 up to 225.
        """
        for m in response.xpath("//div[@class='info']"):
            # BUG FIX: a single item was reused across yields, so queued
            # items all shared (and overwrote) the same data.
            item = DoubanspiderItem()
            title = m.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = m.xpath('div[@class="bd"]/p/text()').extract()
            score = m.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = m.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # BUG FIX: unconditional [0] indexing raised IndexError whenever
            # a field was missing (the quote frequently is), aborting the
            # whole page; fall back to None instead.
            item['title'] = title[0] if title else None
            item['content'] = ';'.join(content)
            item['score'] = score[0] if score else None
            item['info'] = info[0] if info else None
            yield item

        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)
Пример #10
0
    def parse(self, response):
        """Parse one JSON page of a book ranking, then schedule the next page.

        Pages whose ``kind_str`` is 'excerpt' carry no subject list and are
        skipped; pagination always continues to the next numeric URL suffix.
        """
        page = json.loads(response.body)
        tag = page['res']['kind_str']

        if tag != 'excerpt':
            sort_name = page['res']['payload']['title']
            for n, book in enumerate(page['res']['subjects'], start=1):
                # BUG FIX: a single item was mutated and yielded repeatedly,
                # so every queued item ended up describing the last book.
                item = DoubanspiderItem()
                item['sort_name'] = sort_name
                item['number'] = n
                item['title'] = book['title']
                item['rating'] = book['rating']
                yield item

        # NOTE(review): no termination condition here — crawling stops only
        # when the next-page request eventually fails. Confirm intended.
        next_page = int(response.url.split('/')[-1]) + 1
        yield scrapy.Request(response.urljoin(str(next_page)), callback=self.parse)
Пример #11
0
 def parse(self, response):
     selector = Selector(response)
     comments = selector.xpath('//*[@id="comments"]/ul/li')   # 这里提取出来的是个列表
     bookid = response.meta['book_id']
     # print comments
     for comment in comments:
         print comment
         item = DoubanspiderItem()
         item['bookid'] = 'https://book.douban.com/subject/' + str(bookid)
         print item['bookid']
         try:
             item['comment'] = comment.xpath('div[2]/p/text()').extract()[0].encode('utf-8')  # /text()可以只把文本出来
             print item['comment']
         except:
             item['comment'] = "没有评论信息"
             print "没有评论信息"
         yield item
         time.sleep(0.05)
     if response.meta['page'] < max_page:
         page = response.meta['page'] + 1
         url = 'https://book.douban.com/subject/{}/comments/hot?p={}'.format(bookid, page)
         print 'page:', page
         yield scrapy.Request(url=url, headers={'user-agent': 'Mozilla/5.0'}, meta={'book_id': bookid, 'page': page})
Пример #12
0
 def parse_item(self, response):
     """Extract movie detail fields from a subject page into one item."""
     sel = Selector(response)
     item = DoubanspiderItem()

     # Fields that are plain .extract() results, keyed by their XPath.
     plain_fields = {
         'name': '//*[@id="content"]/h1/span[1]/text()',
         'score': '//*[@id="interest_sectl"]/div/div[2]/strong/text()',
         'director': '//*[@id="info"]/span[1]/span/a/text()',
         'celebrity': '//*[@id="info"]/span[2]/span/a/text()',
         'classification': '//span[@property="v:genre"]/text()',
         'actor': '//*[@id="info"]/span[3]//span/a/text()',
         'date': '//span[@property="v:initialReleaseDate"]/text()',
         'len_time': '//span[@property="v:runtime"]/text()',
     }
     for field, xpath in plain_fields.items():
         item[field] = sel.xpath(xpath).extract()

     # The year sits in parentheses beside the title, e.g. "(1994)".
     item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(
         r'\((\d+)\)')
     return item
Пример #13
0
    def parse_item(self, response):
        """Populate a movie-review item via an ItemLoader and return it."""
        loader = ItemLoader(item=DoubanspiderItem(), response=response)
        loader.add_xpath('movie_name', '//header/a[2]/text()')
        # The numeric movie id is the first run of digits in the review URL.
        loader.add_value('movie_id', re.search('[0-9]+', response.url).group())
        loader.add_xpath('comment_rate',
                         '//span[contains(@class,"main-title-rating")]/@title')
        loader.add_xpath('comment_head', '//span[@property="v:summary"]/text()')
        # Join() concatenates the review body's text nodes into one string.
        loader.add_xpath('comment_data', '//div[@id="review-content"]//text()',
                         Join())
        loader.add_value('comment_url', response.url)
        loader.add_xpath('people_name',
                         '//div[@id="review-content"]/@data-author')
        return loader.load_item()