示例#1
0
    def parse_reviews(self, response):
        """Follow every review on a review-list page, then the next list page.

        Yields one ``Request`` per review link found on the page (handled by
        ``parse_replies``) and, when the page exposes a "next" link, one
        ``Request`` for the following list page (handled by this method).
        """
        _setDNSCache()

        for item in response.xpath('//div[@class="main review-item"]'):
            href = item.xpath('div/h2/a/@href').get()
            # ``get()`` returns None when the link is missing; skip the item
            # rather than building a Request without a URL.
            if href:
                yield Request(url=response.urljoin(href),
                              callback=self.parse_replies)

        # Pagination is done once per page, after the loop. It used to sit
        # behind an unconditional ``break`` inside the loop and never ran.
        # The raw href is tested before joining, because joining a missing
        # href with the page URL yields the page URL itself, which is always
        # truthy.
        next_href = response.xpath('//span[@class="next"]/link/@href').get()
        if next_href:
            yield Request(url=response.urljoin(next_href),
                          callback=self.parse_reviews)
示例#2
0
    def parse_comments(self, response):
        """Extract the short comments on a page and follow the next page.

        Yields one ``DoubanCommentItem`` per ``div.comment-item`` element and
        then, if an ``a.next`` link exists, a ``Request`` for that page that
        is handled by this method again. Text fields whose node is missing
        are stored as an empty string instead of aborting the whole page.
        """
        _setDNSCache()

        for item in response.xpath('//div[@class="comment-item"]'):
            # A fresh item per comment: yielding one shared, mutated item
            # would let later iterations overwrite data that consumers of an
            # earlier yield still hold a reference to.
            comment = DoubanCommentItem()

            # ID of the movie the comment belongs to (5th '/'-separated
            # piece of the page URL).
            comment['movie_id'] = response.url.split('/')[4]
            # Unique ID of the comment.
            comment['comment_id'] = item.xpath(
                'div[@class="comment"]/h3/span[@class="comment-vote"]/input/@value'
            ).get(default='').strip()
            # Number of "useful" votes.
            comment['votes'] = item.xpath(
                'div[@class="comment"]/h3/span[@class="comment-vote"]/span/text()'
            ).get(default='').strip()
            # Rating: the class of the second span in the info header. When
            # that span is the timestamp, the comment carries no rating.
            rating = item.xpath(
                'div[@class="comment"]/h3/span[@class="comment-info"]/span[2]/@class'
            ).get(default='').strip()
            comment['rating'] = 'none' if rating == 'comment-time' else rating
            # Comment time (kept as a list of matched title attributes).
            comment['comment_time'] = item.xpath(
                'div[@class="comment"]/h3/span[@class="comment-info"]/span[@class="comment-time "]/@title'
            ).getall()
            # Comment text.
            comment['content'] = item.xpath(
                'div[@class="comment"]/p/span[@class="short"]/text()'
            ).get(default='').strip()
            # Commenter's display name.
            comment['user_name'] = item.xpath(
                'div[@class="avatar"]/a/@title').get(default='')
            # Commenter's id: 5th '/'-separated piece of the profile link.
            profile_parts = item.xpath(
                'div[@class="avatar"]/a/@href').get(default='').split('/')
            comment['user_id'] = (
                profile_parts[4] if len(profile_parts) > 4 else '')

            yield comment

        # Follow the next page if there is one. Testing the raw href avoids
        # both the bare ``except`` and re-requesting the current page when
        # the link is absent.
        next_href = response.xpath('//a[@class="next"]/@href').get()
        if next_href:
            yield Request(url=response.urljoin(next_href),
                          callback=self.parse_comments)
示例#3
0
    def parse_review_details(self, response):
        """Extract one review's metadata, body text and vote counts.

        Yields a single ``DoubanReviewItem``. Values whose source node is
        missing are stored as ``None`` (``'none'`` for the rating) instead of
        raising ``AttributeError`` / ``IndexError`` and losing the item.
        """
        _setDNSCache()

        def url_piece(url, index=4):
            """Return the ``index``-th '/'-separated piece of ``url`` or None."""
            pieces = url.split('/') if url else []
            return pieces[index] if len(pieces) > index else None

        def second_token(label):
            """Return the second space-separated token of ``label`` or None."""
            tokens = label.strip().split(' ') if label else []
            return tokens[1] if len(tokens) > 1 else None

        review = DoubanReviewItem()
        article = response.xpath('//div[@class="article"]')

        review['review_title'] = article.xpath('h1/span/text()').get()
        review['review_id'] = url_piece(response.url)

        # NOTE: the leading '//' makes this query (and the '//' queries
        # below) search the whole document, not only ``article`` / ``main``.
        main = article.xpath('//div[@class="main"]')
        review['user_name'] = main.xpath(
            'header[@class="main-hd"]/a[1]/span/text()').get()
        review['user_id'] = url_piece(main.xpath(
            'header[@class="main-hd"]/a[1]/@href').get())
        review['movie_id'] = url_piece(main.xpath(
            'header[@class="main-hd"]/a[2]/@href').get())

        # A review may have no rating. Then the first header span is either
        # absent or is the ``main-meta`` timestamp span read just below;
        # both cases are reported as 'none'.
        rating_class = main.xpath(
            'header[@class="main-hd"]/span[1]/@class').get()
        rating = rating_class.split(' ')[0] if rating_class else 'none'
        review['rating'] = 'none' if rating == 'main-meta' else rating
        review['comment_time'] = main.xpath(
            'header[@class="main-hd"]/span[@class="main-meta"]/text()').get()

        # Body: for each child node take its own text if it has any,
        # otherwise its serialized form; drop line breaks and empty pieces.
        fragments = []
        for node in main.xpath('//div[@id="link-report"]/div/node()'):
            text = node.xpath('text()').get()
            if text:
                fragment = text.strip()
            else:
                fragment = node.get().strip()
                if fragment == '<br>':
                    continue
            if fragment:
                fragments.append(fragment)
        review['content'] = fragments

        # The vote counts are the second space-separated token of each
        # button label.
        review['votes'] = second_token(main.xpath(
            '//div[@class="main-panel-useful"]/button[1]/text()').get())
        review['useless_votes'] = second_token(main.xpath(
            '//div[@class="main-panel-useful"]/button[2]/text()').get())
        review['forwards'] = main.xpath(
            '//span[@class="rec-num"]/text()').get()
        yield review
示例#4
0
    def parse_movies(self, response):
        """Scrape a movie page into a single ``DoubanMovieItem`` and yield it.

        Fields are filled from two query tables: one evaluated relative to
        the ``div#info`` block, one against the whole page. Each entry is
        ``(field, xpath, many)``; ``many`` selects ``getall()`` (list of all
        matches) over ``get()`` (first match or ``None``).
        """
        _setDNSCache()

        info_queries = (
            ('director', 'span[1]/span[@class="attrs"]/a/text()', True),
            ('author', 'span[2]/span[@class="attrs"]/a/text()', True),
            ('actors', 'span[@class="actor"]/span[@class="attrs"]/a/text()', True),
            ('movie_type', 'span[@property="v:genre"]/text()', True),
            ('official_website', 'span[@class="pl" and text()="官方网站:"]/following-sibling::a/text()', False),
            ('region_made', 'span[@class="pl" and text()="制片国家/地区:"]/following-sibling::text()', False),
            ('language', 'span[@class="pl" and text()="语言:"]/following-sibling::text()', False),
            ('date_published', 'span[@property="v:initialReleaseDate"]/text()', False),
            ('movie_length', 'span[@property="v:runtime"]/text()', False),
            ('alias', 'span[@class="pl"][6]/following-sibling::text()', False),
        )
        page_queries = (
            ('votes', '//span[@property="v:votes"]/text()', False),
            ('average_rating', '//div[@class="rating_self clearfix"]/strong/text()', False),
            ('stars5_ratings', '//div[@class="ratings-on-weight"]/div[@class="item"][1]/span[@class="rating_per"]/text()', False),
            ('stars4_ratings', '//div[@class="ratings-on-weight"]/div[@class="item"][2]/span[@class="rating_per"]/text()', False),
            ('stars3_ratings', '//div[@class="ratings-on-weight"]/div[@class="item"][3]/span[@class="rating_per"]/text()', False),
            ('stars2_ratings', '//div[@class="ratings-on-weight"]/div[@class="item"][4]/span[@class="rating_per"]/text()', False),
            ('stars1_ratings', '//div[@class="ratings-on-weight"]/div[@class="item"][5]/span[@class="rating_per"]/text()', False),
            ('description', '//span[@property="v:summary"]/text()', True),
            ('recommendations', '//div[@class="recommendations-bd"]/dl/dd/a/text()', True),
            ('labels', '//div[@class="tags-body"]/a/text()', True),
            ('collections', '//div[@class="subject-others-interests-ft"]/a[1]/text()', False),
            ('wishes', '//div[@class="subject-others-interests-ft"]/a[2]/text()', False),
        )

        movie = DoubanMovieItem()
        # The movie id is the 5th '/'-separated piece of the page URL.
        movie['movie_id'] = response.url.split('/')[4]

        # Page title with surrounding whitespace removed and its last four
        # characters cut off (presumably a site-name suffix -- confirm).
        page_title = response.xpath('//title/text()').get()
        movie['movie_name'] = page_title.strip()[:-4].rstrip()

        info_block = response.xpath('//div[@id="info"]')
        for scope, table in ((info_block, info_queries),
                             (response, page_queries)):
            for field, query, many in table:
                matches = scope.xpath(query)
                movie[field] = matches.getall() if many else matches.get()

        yield movie
示例#5
0
    def parse_replies(self, response):
        """Extract replies from a review page rendered through ``self.driver``.

        Loads ``response.url`` in the Selenium driver, yields one
        ``DoubanReplyItem`` for the top-level comment and one for each of its
        nested replies (with ``reply_to`` set to the parent's id), then
        yields a ``Request`` for the next page when an ``a.next`` link with
        an href exists.
        """
        _setDNSCache()
        self.driver.get(response.url)
        self.driver.implicitly_wait(5)

        comment_main = 'div[@class="comment-item-body"]/div[@class="comment-main"]'
        header_xpath = comment_main + '/div[@class="meta-header"]'
        content_xpath = comment_main + '/div[@class="comment-content"]/span'

        def build_reply(element, movie_id, parent_id):
            """Build a new item from one comment/reply element."""
            # A fresh item per reply: yielding one shared, mutated item would
            # let later replies overwrite data held by earlier consumers.
            reply = DoubanReplyItem()
            # ID of the movie the reply belongs to.
            reply['movie_id'] = movie_id
            # ID of the reply.
            reply['reply_id'] = element.get_attribute('data-cid')
            # Reply time.
            reply['reply_time'] = element.find_element_by_xpath(
                header_xpath + '/time').text
            # Reply text: non-empty span texts, line-break markers dropped.
            # (The emptiness check used to compare the ``strip`` method
            # itself to '' and therefore never filtered anything.)
            texts = (span.text.strip()
                     for span in element.find_elements_by_xpath(content_xpath))
            reply['content'] = [text for text in texts
                                if text and text != '<br>']
            # Replier's display name.
            reply['user_name'] = element.find_element_by_xpath(
                header_xpath + '/a').text
            # Replier's id: 5th '/'-separated piece of the profile link.
            reply['user_id'] = element.find_element_by_xpath(
                header_xpath + '/a[1]').get_attribute("href").split('/')[4]
            # ID of the reply being answered ('' when answering the review).
            reply['reply_to'] = parent_id
            return reply

        try:
            comments = self.driver.find_elements_by_xpath(
                '//div[@class="item comment-item"]')
        except Exception:
            comments = []

        for item in comments:
            top_level = build_reply(item, response.url.split('/')[4], '')
            parent_id = top_level['reply_id']
            yield top_level

            # Nested replies are best-effort: a comment without a reply list
            # or with unexpected markup is skipped. ``Exception`` (not a
            # bare ``except``) is caught so that ``GeneratorExit`` raised at
            # the ``yield`` below still closes the generator.
            try:
                reply_list = item.find_element_by_xpath(
                    'div[@class="reply-list"]')
                try:
                    # Expand the remaining replies if a control button exists.
                    get_more = reply_list.find_element_by_xpath(
                        'div[@class="replies-list-control"]/button')
                    get_more.click()
                    self.driver.implicitly_wait(5)
                except Exception:
                    pass

                for child in reply_list.find_elements_by_xpath(
                        'div[@class="item reply-item"]'):
                    yield build_reply(
                        child, child.get_attribute('data-target_id'),
                        parent_id)
            except Exception:
                pass
            # NOTE(review): only the first top-level comment on each page is
            # processed. Kept as-is because clicking the button above may
            # invalidate the remaining elements in ``comments``; confirm
            # whether this limit is intended.
            break

        # Follow the next page only when the link exists and has an href;
        # joining a missing href would just re-request the current page.
        try:
            next_href = self.driver.find_element_by_xpath(
                '//a[@class="next"]').get_attribute("href")
        except Exception:
            next_href = None

        if next_href:
            yield Request(url=response.urljoin(next_href),
                          callback=self.parse_replies)