Python TripAdvisorReviewItem示例，hotel_sentiment.items.TripAdvisorReviewItem Python示例

示例#1

0

显示文件

文件： tripadvisor_spider_moreinfo.py 项目： xl2602/nlp-final-project

    def parse_review(self, response):
        """Extract the reviewer's name and location from a review page.

        Only ``user_name`` and ``reviewer_location`` are populated.  A field
        whose XPath matches nothing is left unset instead of raising
        ``IndexError``, so a page without a reviewer block still yields an
        item.

        Args:
            response: the downloaded review page; must provide ``xpath()``.

        Returns:
            A ``TripAdvisorReviewItem`` carrying whichever of the two fields
            could be found.
        """
        item = TripAdvisorReviewItem()

        user_names = response.xpath(
            '//div[@class="username mo"]/span[starts-with(@class,"expand_inline scrname")]/text()'
        ).extract()
        if user_names:
            item['user_name'] = user_names[0]

        locations = response.xpath('//div[@class="location"]/text()').extract()
        if locations:
            # Drop the first and last character of the raw text node
            # (presumably surrounding whitespace/newlines -- TODO confirm).
            item['reviewer_location'] = locations[0][1:-1]

        return item

示例#2

0

显示文件

    def parse_review(self, response):
        """Build a ``TripAdvisorReviewItem`` from a single review page.

        Review title, content, star rating, city, hotel name/URL and the
        hotel's aggregate rating are treated as required: a missing element
        raises ``IndexError``.  The reviewer location and the hotel class,
        street address and locality are optional and are only set when the
        page contains them.

        Args:
            response: the downloaded review page; must provide ``xpath()``
                and ``urljoin()``.

        Returns:
            The populated ``TripAdvisorReviewItem``.

        Raises:
            IndexError: if a required element is missing from the page.
        """
        item = TripAdvisorReviewItem()

        # Strip the quotes (first and last char) around the title.
        item['title'] = response.xpath(
            '//div[@class="quote"]/text()')[0].extract()[1:-1]
        item['content'] = response.xpath(
            '//div[@class="entry"]/p/text()').extract()[0]
        item['review_stars'] = response.xpath(
            '//span[@class="rate sprite-rating_s rating_s"]/img/@alt'
        ).extract()[0]

        # Optional: presumably absent when the review has no reviewer
        # profile block -- do not abort the whole item in that case.
        reviewer_location = response.xpath('//div[@class="location"]/text()')
        if reviewer_location:
            item['reviewer_location'] = reviewer_location[0].extract()[1:-1]

        # Third breadcrumb link from the end.
        item['city'] = response.xpath(
            '//li[starts-with(@class,"breadcrumb_item")]/a/span/text()'
        )[-3].extract()

        locationcontent = response.xpath(
            '//div[starts-with(@class,"locationContent")]')
        item['hotel_name'] = locationcontent.xpath(
            './/div[starts-with(@class,"surContent")]/a/text()')[0].extract()
        # The href may be relative; resolve it against the page URL.
        item['hotel_url'] = response.urljoin(
            locationcontent.xpath(
                './/div[starts-with(@class,"surContent")]/a/@href'
            )[0].extract())

        # Optional hotel details, set only when present.  'hotel_classs'
        # (sic) is the key the original code uses and must stay as-is.
        optional_fields = (
            ('hotel_classs',
             './/span[starts-with(@class,"star")]/span/img/@alt'),
            ('hotel_address',
             './/span[starts-with(@class,"street-address")]/text()'),
            ('hotel_locality',
             './/span[starts-with(@class,"locality")]/text()'),
        )
        for field, query in optional_fields:
            found = locationcontent.xpath(query)
            if found:
                item[field] = found[0].extract()

        item['hotel_review_stars'] = locationcontent.xpath(
            './/div[starts-with(@class,"userRating")]/div/span/img/@alt'
        )[0].extract()
        item['hotel_review_qty'] = locationcontent.xpath(
            './/div[starts-with(@class,"userRating")]/div/a/text()'
        )[0].extract()

        return item

示例#3

0

显示文件

    def parse_review(self, response):
        """Build a ``TripAdvisorReviewItem`` from a tripadvisor.jp review page.

        Every field is best-effort: a field whose XPath matches nothing is
        left unset, and a field the item class does not accept is skipped
        (and logged at debug level) rather than aborting the item.

        Example ``start_urls`` entry (Kyoto hotel listing):
            'https://www.tripadvisor.jp/Hotels-g298564-Kyoto_Kyoto_Prefecture_Kinki-Hotels.html'

        Example of the fields scraped from one review:
            Hotel Name        : Gion Hatanaka
            Review Title      : text of the page's ``<h1 id="HEADING">``
            Review URL        : 'https://www.tripadvisor.jp/ShowUserReviews-g298564-d1071044-r577749431-Gion_Hatanaka-Kyoto_Kyoto_Prefecture_Kinki.html'
            Reviewer Name     : leo868
            Reviewer Location : Oita City, Oita Prefecture
            Content           : Good location, handy for visiting Higashiyama
                                early or late to avoid the crowds ... (truncated)
            Tips              : The standard annex room has no view but was
                                quiet and relaxing ... (truncated)
            Purpose           : May 2018, family trip
            Datetime          : date and time at which this method ran
            Date              : date only

        Args:
            response: the downloaded review page; must provide ``xpath()``
                and ``url``.

        Returns:
            The ``TripAdvisorReviewItem`` with every field that was found.
        """
        import logging

        item = TripAdvisorReviewItem()

        def first_match(query):
            """Return the first string selected by *query*, or None."""
            matches = response.xpath(query).extract()
            return matches[0] if matches else None

        # Take one timestamp so 'datetime' and 'date' can never disagree
        # (two separate now() calls could straddle midnight).
        logged_at = datetime.datetime.now()

        # (field, value) pairs in the order the fields are written.
        extracted = (
            ('hotel_name', first_match(
                '//*[@id="CHECK_RATES_CONT"]/div/div[1]/div/span/text()')),
            ('review_title', first_match('//h1[@id="HEADING"]/text()')),
            ('review_url', response.url),
            ('reviewer_name', first_match(
                '//div[@class="memberOverlayLink"]/div[2]/div[1]/text()')),
            ('reviewer_location', first_match(
                '//div[@class="userLoc"]/strong/text()')),
            ('content', first_match(
                '//div[@class="rev_wrap ui_columns is-multiline"]/div[2]/div[3]/div/p/span/text()'
            )),
            ('tips', first_match(
                '//div[@class="reviewItem inlineRoomTip"]/text()')),
            ('purpose', first_match(
                '//div[@class="recommend-titleInline noRatings"]/text()')),
            ('datetime', logged_at.strftime('%Y-%m-%d %H:%M:%S')),
            ('date', logged_at.strftime('%Y-%m-%d')),
        )

        for field, value in extracted:
            if value is None:
                # Nothing on the page for this field; leave it unset.
                continue
            try:
                item[field] = value
            except KeyError:
                # Presumably a scrapy.Item, which raises KeyError for fields
                # it does not declare.  The original skipped these silently;
                # stay best-effort but leave a trace.
                logging.getLogger(__name__).debug(
                    'item does not accept field %r; skipped', field)

        return item

示例#4

0

显示文件

文件： tripadvisor_spider_moreinfo.py 项目： nilansharora/hotel-reviewing

    def parse_review(self, response):
        """Build a ``TripAdvisorReviewItem`` from a single review page.

        Review title, content, star rating, city, hotel name/URL and the
        hotel's aggregate rating are treated as required: a missing element
        raises ``IndexError``.  Reviewer details and the hotel class, street
        address and locality are optional.

        Args:
            response: the downloaded review page; must provide ``xpath()``
                and ``urljoin()``.

        Returns:
            The populated ``TripAdvisorReviewItem``.

        Raises:
            IndexError: if a required element is missing from the page.
        """
        item = TripAdvisorReviewItem()

        # Strip the quotes (first and last char) around the title.
        item['title'] = response.xpath(
            '//div[@class="quote"]/text()')[0].extract()[1:-1]
        # Get all of the lines for just this review (the first "entry" div).
        item['content'] = '\n'.join(
            line.strip() for line in response.xpath(
                '(//div[@class="entry"])[1]//p/text()').extract()
        )
        item['review_stars'] = response.xpath(
            '//span[@class="rate sprite-rating_s rating_s"]/img/@alt'
        ).extract()[0]

        try:
            item['reviewer_id'] = response.xpath(
                '//div[@class="memberOverlayLink"]/@id').extract()[0]
            item['reviewer_name'] = response.xpath(
                '//div[contains(@class, "username")]/span/text()').extract()[0]
            # Last token of the badge's class attribute.
            item['reviewer_level'] = response.xpath(
                '//div[contains(@class, "levelBadge")]/@class'
            ).extract()[0].split()[-1]
            item['reviewer_location'] = response.xpath(
                '//div[@class="location"]/text()')[0].extract()[1:-1]
        except IndexError:
            # Not all reviews have a logged in reviewer.  The first missing
            # element stops the remaining reviewer fields from being set;
            # fields assigned before it are kept.
            pass

        # Third breadcrumb link from the end.
        item['city'] = response.xpath(
            '//li[starts-with(@class,"breadcrumb_item")]/a/span/text()'
        )[-3].extract()

        locationcontent = response.xpath(
            '//div[starts-with(@class,"locationContent")]')
        item['hotel_name'] = locationcontent.xpath(
            './/div[starts-with(@class,"surContent")]/a/text()')[0].extract()
        # The href may be relative; resolve it against the page URL.
        item['hotel_url'] = response.urljoin(
            locationcontent.xpath(
                './/div[starts-with(@class,"surContent")]/a/@href'
            )[0].extract())

        # Optional hotel details, set only when present.  'hotel_classs'
        # (sic) is the key the original code uses and must stay as-is.
        hotelclass = locationcontent.xpath(
            './/span[starts-with(@class,"star")]/span/img/@alt')
        if hotelclass:
            item['hotel_classs'] = hotelclass[0].extract()

        hoteladdress = locationcontent.xpath(
            './/span[starts-with(@class,"street-address")]/text()')
        if hoteladdress:
            item['hotel_address'] = hoteladdress[0].extract()

        hotellocality = locationcontent.xpath(
            './/span[starts-with(@class,"locality")]/text()')
        if hotellocality:
            item['hotel_locality'] = hotellocality[0].extract()

        item['hotel_review_stars'] = locationcontent.xpath(
            './/div[starts-with(@class,"userRating")]/div/span/img/@alt'
        )[0].extract()
        item['hotel_review_qty'] = locationcontent.xpath(
            './/div[starts-with(@class,"userRating")]/div/a/text()'
        )[0].extract()

        return item