def parse_review(self, response):
    """Scrape the reviewer's display name and location from a review page.

    Only ``user_name`` and ``reviewer_location`` are populated by this
    variant of the callback.

    Args:
        response: Page response exposing ``xpath()``.

    Returns:
        A ``TripAdvisorReviewItem`` with the two reviewer fields set.

    Raises:
        IndexError: If either XPath query matches nothing.
    """
    item = TripAdvisorReviewItem()

    name_nodes = response.xpath(
        '//div[@class="username mo"]'
        '/span[starts-with(@class,"expand_inline scrname")]/text()')
    item['user_name'] = name_nodes[0].extract()

    location_nodes = response.xpath('//div[@class="location"]/text()')
    raw_location = location_nodes[0].extract()
    # The first and last characters of the text node are discarded.
    item['reviewer_location'] = raw_location[1:-1]

    return item
def parse_review(self, response):
    """Scrape one review page into a ``TripAdvisorReviewItem``.

    Collects the review itself (title, content, star rating, reviewer
    location, city breadcrumb) and the details of the hotel shown in the
    ``locationContent`` box (name, URL, rating, review count and, when
    present, class, street address and locality).

    Args:
        response: Page response exposing ``xpath()`` and ``urljoin()``.

    Returns:
        The populated item.

    Raises:
        IndexError: If any mandatory query matches nothing, or the page
            has fewer than three breadcrumb entries.
    """
    review = TripAdvisorReviewItem()

    def first(nodes):
        # First match as a string; IndexError when nothing matched.
        return nodes[0].extract()

    quoted_title = first(response.xpath('//div[@class="quote"]/text()'))
    # Strip the quotes (first and last character).
    review['title'] = quoted_title[1:-1]
    review['content'] = first(
        response.xpath('//div[@class="entry"]/p/text()'))
    review['review_stars'] = first(response.xpath(
        '//span[@class="rate sprite-rating_s rating_s"]/img/@alt'))
    raw_location = first(response.xpath('//div[@class="location"]/text()'))
    review['reviewer_location'] = raw_location[1:-1]

    breadcrumbs = response.xpath(
        '//li[starts-with(@class,"breadcrumb_item")]/a/span/text()')
    # The city is read from the third breadcrumb counting from the end.
    review['city'] = breadcrumbs[-3].extract()

    hotel_box = response.xpath('//div[starts-with(@class,"locationContent")]')
    review['hotel_name'] = first(hotel_box.xpath(
        './/div[starts-with(@class,"surContent")]/a/text()'))
    relative_url = first(hotel_box.xpath(
        './/div[starts-with(@class,"surContent")]/a/@href'))
    review['hotel_url'] = response.urljoin(relative_url)

    # These three fields are only set when the page provides them.
    # NOTE: 'hotel_classs' (three s) is the key exactly as this code has
    # always written it.
    optional_fields = (
        ('hotel_classs',
         './/span[starts-with(@class,"star")]/span/img/@alt'),
        ('hotel_address',
         './/span[starts-with(@class,"street-address")]/text()'),
        ('hotel_locality',
         './/span[starts-with(@class,"locality")]/text()'),
    )
    for field, query in optional_fields:
        found = hotel_box.xpath(query)
        if found:
            review[field] = found[0].extract()

    review['hotel_review_stars'] = first(hotel_box.xpath(
        './/div[starts-with(@class,"userRating")]/div/span/img/@alt'))
    review['hotel_review_qty'] = first(hotel_box.xpath(
        './/div[starts-with(@class,"userRating")]/div/a/text()'))

    return review
def parse_review(self, response):
    """Scrape a tripadvisor.jp review page into a ``TripAdvisorReviewItem``.

    Every scraped field is optional: when its XPath query matches nothing
    the field is simply left unset. Unlike the earlier blanket
    ``except: pass``, any other error (for example a field name the item
    class rejects, or a response without ``xpath()``) now propagates
    instead of being silently discarded.

    Example, starting from the Kyoto hotel listing
    (Asia > Japan > Kinki > Kyoto Prefecture > Kyoto > Hotels)
    ``https://www.tripadvisor.jp/Hotels-g298564-Kyoto_Kyoto_Prefecture_Kinki-Hotels.html``
    and the review page
    ``https://www.tripadvisor.jp/ShowUserReviews-g298564-d1071044-r577749431-Gion_Hatanaka-Kyoto_Kyoto_Prefecture_Kinki.html``
    (sample values shown translated)::

        hotel_name        : Gion Hatanaka
        review_title      : the review's heading
        review_url        : the review page URL above
        reviewer_name     : leo868
        reviewer_location : Oita, Oita Prefecture
        content           : Great location, handy for sightseeing in
                            Higashiyama early or late to avoid the
                            crowds. The food is good ... (truncated)
        tips              : The standard annex room has no view but was
                            quiet and relaxing ... (truncated)
        purpose           : May 2018, family trip
        datetime          : local time this callback ran,
                            ``YYYY-MM-DD HH:MM:SS``
        date              : the date part of the same instant

    Args:
        response: Page response exposing ``xpath()`` and ``url``.

    Returns:
        The item, holding whichever fields could be extracted plus
        ``review_url``, ``datetime`` and ``date``.
    """
    item = TripAdvisorReviewItem()

    def set_first_text(field, query):
        # Store the first node matched by *query*; leave the field unset
        # when the page has no such node.
        matches = response.xpath(query).extract()
        if matches:
            item[field] = matches[0]

    # Fields are assigned in the same order as before so that the item's
    # key order is unchanged.
    set_first_text(
        'hotel_name',
        '//*[@id="CHECK_RATES_CONT"]/div/div[1]/div/span/text()')
    set_first_text('review_title', '//h1[@id="HEADING"]/text()')
    item['review_url'] = response.url
    set_first_text(
        'reviewer_name',
        '//div[@class="memberOverlayLink"]/div[2]/div[1]/text()')
    set_first_text(
        'reviewer_location', '//div[@class="userLoc"]/strong/text()')
    set_first_text(
        'content',
        '//div[@class="rev_wrap ui_columns is-multiline"]'
        '/div[2]/div[3]/div/p/span/text()')
    set_first_text(
        'tips', '//div[@class="reviewItem inlineRoomTip"]/text()')
    set_first_text(
        'purpose', '//div[@class="recommend-titleInline noRatings"]/text()')

    # Take one timestamp so 'datetime' and 'date' can never disagree
    # (two separate now() calls could straddle midnight).
    scraped_at = datetime.datetime.now()
    item['datetime'] = scraped_at.strftime('%Y-%m-%d %H:%M:%S')
    item['date'] = scraped_at.strftime('%Y-%m-%d')

    return item
def parse_review(self, response):
    """Scrape one review page into a ``TripAdvisorReviewItem``.

    Collects the review (title, full text, star rating, city breadcrumb),
    the reviewer's details when available, and the details of the hotel
    shown in the ``locationContent`` box.

    Args:
        response: Page response exposing ``xpath()`` and ``urljoin()``.

    Returns:
        The populated item. Reviewer fields are filled in order and
        stop at the first one the page lacks; ``hotel_classs``,
        ``hotel_address`` and ``hotel_locality`` are set only when
        present.

    Raises:
        IndexError: If a mandatory query (title, stars, hotel name/URL,
            hotel rating, review count) matches nothing, or the page has
            fewer than three breadcrumb entries.
    """
    item = TripAdvisorReviewItem()

    # Strip the quotes (first and last character).
    item['title'] = response.xpath(
        '//div[@class="quote"]/text()')[0].extract()[1:-1]

    # Get all of the lines for just this review (the first "entry" div).
    item['content'] = '\n'.join(
        line.strip()
        for line in response.xpath(
            '(//div[@class="entry"])[1]//p/text()').extract())

    item['review_stars'] = response.xpath(
        '//span[@class="rate sprite-rating_s rating_s"]/img/@alt'
    ).extract()[0]

    try:
        item['reviewer_id'] = response.xpath(
            '//div[@class="memberOverlayLink"]/@id').extract()[0]
        item['reviewer_name'] = response.xpath(
            '//div[contains(@class, "username")]/span/text()').extract()[0]
        # The level is read from the last CSS class on the badge element.
        item['reviewer_level'] = response.xpath(
            '//div[contains(@class, "levelBadge")]/@class'
        ).extract()[0].split()[-1]
        item['reviewer_location'] = response.xpath(
            '//div[@class="location"]/text()')[0].extract()[1:-1]
    except IndexError:
        # Not all reviews have a logged in reviewer, in which case the
        # queries above match nothing. Only that case is tolerated; a
        # bare ``except`` here used to hide every other error as well.
        pass

    # The city is read from the third breadcrumb counting from the end.
    item['city'] = response.xpath(
        '//li[starts-with(@class,"breadcrumb_item")]/a/span/text()'
    )[-3].extract()

    locationcontent = response.xpath(
        '//div[starts-with(@class,"locationContent")]')

    item['hotel_name'] = locationcontent.xpath(
        './/div[starts-with(@class,"surContent")]/a/text()')[0].extract()
    item['hotel_url'] = response.urljoin(
        locationcontent.xpath(
            './/div[starts-with(@class,"surContent")]/a/@href'
        )[0].extract())

    hotelclass = locationcontent.xpath(
        './/span[starts-with(@class,"star")]/span/img/@alt')
    if hotelclass:
        # NOTE: 'hotel_classs' (three s) is the key exactly as this code
        # has always written it.
        item['hotel_classs'] = hotelclass[0].extract()

    hoteladdress = locationcontent.xpath(
        './/span[starts-with(@class,"street-address")]/text()')
    if hoteladdress:
        item['hotel_address'] = hoteladdress[0].extract()

    hotellocality = locationcontent.xpath(
        './/span[starts-with(@class,"locality")]/text()')
    if hotellocality:
        item['hotel_locality'] = hotellocality[0].extract()

    item['hotel_review_stars'] = locationcontent.xpath(
        './/div[starts-with(@class,"userRating")]/div/span/img/@alt'
    )[0].extract()
    item['hotel_review_qty'] = locationcontent.xpath(
        './/div[starts-with(@class,"userRating")]/div/a/text()'
    )[0].extract()

    return item