def parse(self, response):
    """Crawl a TripAdvisor hotel listing page or an individual hotel page.

    Yields a Request for every hotel link found on the page and for the
    next listing page, and a HotelItem when ``response.url`` contains
    '/Hotel_Review'. Fields that cannot be located or parsed are set to
    None instead of raising, so one malformed page does not abort the
    callback.
    """
    url_start = 'http://www.tripadvisor.com'
    info_box = '//div[contains(@class, "wrap infoBox")]'
    hxs = HtmlXPathSelector(response)

    def text_at(xpath):
        # Cleaned string for the xpath; callers treat a falsy result as
        # "not found" (same convention as the original inline calls).
        return clean_parsed_string(get_parsed_string(hxs, xpath))

    def first_int(text):
        # Integer value of the first whitespace-separated token, e.g.
        # '1 review' -> 1 and '1,123 reviews' -> 1123. Returns None when
        # the text is empty or the token is not an integer.
        tokens = text.split() if text else []
        if not tokens:
            return None
        try:
            return int(tokens[0].replace(',', ''))
        except ValueError:
            return None

    # Parse the page for hotels and yield them if the page is a city page.
    # NOTE(review): the original indentation was lost; the next-page lookup
    # is assumed to belong to the listing branch -- confirm.
    hotel_urls = hxs.select(
        '//a[contains(@class, "property_title")]/@href').extract()
    if hotel_urls:
        for hotel_url in hotel_urls:
            yield Request(url_start + hotel_url, self.parse)

        # The next button is available both on the top and bottom of the
        # page; the bottom link is used here.
        next_page_url = text_at(
            '//div[contains(@id, "pager_bottom")]'
            '//a[contains(@class, "guiArw sprite-pageNext pid0")]/@href')
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

    # Anything below only applies to an individual hotel page.
    if '/Hotel_Review' not in response.url:
        return

    hi = HotelItem()
    hi['item_type'] = 'hotel'

    # Guard the match: a URL without a 'd<digits>' part must not crash.
    id_match = re.search('d[0-9]+', response.url)
    hi['hotel_id'] = id_match.group(0) if id_match else None

    hi['name'] = text_at('//h1[contains(@id, "HEADING")]/text()')
    hi['locality'] = text_at(
        info_box + '//span[contains(@property, "v:locality")]/text()')
    hi['region'] = text_at(
        info_box + '//span[contains(@property, "v:region")]/text()')
    hi['postal_code'] = text_at(
        info_box + '//span[contains(@property, "v:postal-code")]/text()')
    hi['country'] = text_at(
        info_box + '//span[contains(@property, "v:country-name")]/text()')

    rating_string = text_at(
        '//div[contains(@rel, "v:rating")]'
        '//img[contains(@class, "sprite-ratings")]/@alt')
    review_count = text_at(
        '//div[contains(@class, "rs rating")]'
        '//span[contains(@property, "v:count")]/text()')
    price_range = text_at('//span[contains(@property, "v:pricerange")]/text()')

    # The price range is stored as the length of the scraped string.
    hi['price_range'] = len(price_range) if price_range else None

    # Some review counts are written '1 review' instead of just '1123',
    # and large counts may carry thousands separators.
    hi['review_count'] = first_int(review_count)

    # The decimal point is escaped (the original '.' matched any character)
    # and the fraction is optional so '4 of 5 stars' parses as 4.0.
    rating_match = None
    if rating_string:
        rating_match = re.search(r'[0-9]+(?:\.[0-9]+)?', rating_string)
    hi['rating'] = float(rating_match.group(0)) if rating_match else None

    hi['url'] = response.url

    # Single-argument print() behaves the same on Python 2 and 3.
    print(hi['name'])
    yield hi
def parse(self, response):
    """Crawl a TripAdvisor location page.

    A page that lists sub-locations yields a Request per sub-location plus
    a Request for the next listing page when one is linked. A page without
    sub-locations yields a single CityItem describing the page itself.
    """
    url_start = 'http://www.tripadvisor.com'
    hxs = HtmlXPathSelector(response)

    # The locations may or may not contain sub-locations
    sub_urls = hxs.select(
        '//div[contains(@class, "rolluptopdestlink")]/a/@href').extract()

    # If the page contains sub-locations, request all of them and follow
    # the next page if available; no item is emitted for such a page.
    if sub_urls:
        for sub_url in sub_urls:
            yield Request(url_start + sub_url, self.parse)

        next_page_url = clean_parsed_string(get_parsed_string(
            hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)
        return

    # No sub-locations are present: the page itself is a city.
    url = response.url
    heading = clean_parsed_string(get_parsed_string(
        hxs, '//h1[contains(@class, "header")]/text()'))
    # Keep only the part of the heading that precedes the word 'Hotels'.
    city = heading.split('Hotels')[0].strip() if heading else None

    # Guard the match: a URL without a 'g<digits>' part must not crash.
    geo_match = re.search('g[0-9]+', url)
    geo_id = geo_match.group(0) if geo_match else None

    # Single-argument print() behaves the same on Python 2 and 3.
    print(city)

    ci = CityItem()
    ci['item_type'] = 'city'
    ci['city'] = city
    ci['geo_id'] = geo_id
    ci['url'] = url
    yield ci
def parse(self, response):
    """Crawl the reviews of a TripAdvisor hotel.

    On a page that contains a 'basic_review first' block, yields a Request
    for the first review's link. Otherwise yields one ReviewItem per
    'review extended' block and a Request for the next page when one is
    linked. Fields that cannot be located or parsed are set to None
    instead of raising, so one malformed review does not abort the rest of
    the page or the pagination request.
    """
    url_start = 'http://www.tripadvisor.com'
    hxs = HtmlXPathSelector(response)

    def first_int(text):
        # Integer value of the first whitespace-separated token, e.g.
        # '4 of 5 stars' -> 4 and '1,234 reviews' -> 1234. Returns None
        # when the text is empty or the token is not an integer.
        tokens = text.split() if text else []
        if not tokens:
            return None
        try:
            return int(tokens[0].replace(',', ''))
        except ValueError:
            return None

    def review_date(text):
        # 'Reviewed March 5, 2013' -> '2013-03-05'. Returns None when the
        # 'Reviewed' prefix is missing or the remainder is not an absolute
        # '%B %d, %Y' date.
        if not text:
            return None
        parts = text.split('Reviewed')
        if len(parts) < 2:
            return None
        try:
            parsed = time.strptime(parts[1].strip(), '%B %d, %Y')
        except ValueError:
            return None
        return time.strftime('%Y-%m-%d', parsed)

    # The default hotels page contains the reviews but they are shrunk and
    # need a click on 'more' to view the complete content. An alternate way
    # is to follow one of the reviews in the page.
    review_url = clean_parsed_string(get_parsed_string(
        hxs, '//div[contains(@class, "basic_review first")]//a/@href'))
    if review_url:
        yield Request(url_start + review_url, self.parse)
        # NOTE(review): the original indentation was lost; pagination is
        # assumed to belong only to the extended-review branch -- confirm.
        return

    # The hotel id depends only on the URL, so compute it once per page and
    # guard the match so a URL without 'd<digits>' does not crash.
    id_match = re.search('d[0-9]+', response.url)
    hotel_id = id_match.group(0) if id_match else None

    # The page is not a basic review page: parse the full reviews.
    for raw_review in hxs.select('//div[contains(@class, "review extended")]'):

        def field(xpath, node=raw_review):
            # Cleaned string for an xpath relative to this review block.
            return clean_parsed_string(get_parsed_review_element(node, xpath))

        ri = ReviewItem()
        ri['item_type'] = 'review'
        ri['hotel_id'] = hotel_id
        ri['review_id'] = field('@id')
        ri['review_date'] = review_date(
            field('div//span[contains(@class, "ratingDate")]/text()'))
        ri['reviewer_type'] = None  # TODO: Try to find the info and insert here
        ri['summary'] = field('div//div[contains(@class, "quote")]/text()')
        ri['reviewer_name'] = field(
            'div//div[contains(@class, "username mo")]/span/text()')
        ri['reviewer_rcount'] = first_int(field(
            'div//div[contains(@class, "totalReviewBadge")]'
            '//span[contains(@class, "badgeText")]/text()'))

        reviewer_locality = field(
            'div//div[contains(@class, "member_info")]'
            '//div[contains(@class, "location")]/text()')
        ri['reviewer_locality'] = (
            reviewer_locality.title() if reviewer_locality else None)

        ri['content'] = field('div//div[contains(@class, "entry")]//p')
        ri['rating'] = first_int(field(
            'div//div[contains(@class, "rating reviewItemInline")]//img/@alt'))
        ri['recommendations'] = raw_review.select(
            'div//li[contains(@class, "recommend-answer")]').extract()

        # A single pre-formatted argument keeps print() identical on
        # Python 2 and 3.
        print('%s:%s:%s' % (
            ri['review_id'], ri['reviewer_name'], ri['review_date']))
        yield ri

    # Find the next page link if available and yield it
    next_page_url = clean_parsed_string(get_parsed_string(
        hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
    if next_page_url:
        yield Request(url_start + next_page_url, self.parse)
def parse(self, response):
    """Crawl the reviews of a TripAdvisor hotel.

    On a page that contains a 'basic_review first' block, yields a Request
    for the first review's link. Otherwise yields one ReviewItem per
    'review extended' block and a Request for the next page when one is
    linked. Fields that cannot be located or parsed are set to None
    instead of raising, so one malformed review does not abort the rest of
    the page or the pagination request.
    """
    url_start = 'http://www.tripadvisor.com'
    hxs = HtmlXPathSelector(response)

    def first_int(text):
        # Integer value of the first whitespace-separated token, e.g.
        # '4 of 5 stars' -> 4 and '1,234 reviews' -> 1234. Returns None
        # when the text is empty or the token is not an integer.
        tokens = text.split() if text else []
        if not tokens:
            return None
        try:
            return int(tokens[0].replace(',', ''))
        except ValueError:
            return None

    def review_date(text):
        # 'Reviewed March 5, 2013' -> '2013-03-05'. Returns None when the
        # 'Reviewed' prefix is missing or the remainder is not an absolute
        # '%B %d, %Y' date.
        if not text:
            return None
        parts = text.split('Reviewed')
        if len(parts) < 2:
            return None
        try:
            parsed = time.strptime(parts[1].strip(), '%B %d, %Y')
        except ValueError:
            return None
        return time.strftime('%Y-%m-%d', parsed)

    # The default hotels page contains the reviews but they are shrunk and
    # need a click on 'more' to view the complete content. An alternate way
    # is to follow one of the reviews in the page.
    review_url = clean_parsed_string(
        get_parsed_string(
            hxs, '//div[contains(@class, "basic_review first")]//a/@href'))
    if review_url:
        yield Request(url_start + review_url, self.parse)
        # NOTE(review): the original indentation was lost; pagination is
        # assumed to belong only to the extended-review branch -- confirm.
        return

    # The hotel id depends only on the URL, so compute it once per page and
    # guard the match so a URL without 'd<digits>' does not crash.
    id_match = re.search('d[0-9]+', response.url)
    hotel_id = id_match.group(0) if id_match else None

    # The page is not a basic review page: parse the full reviews.
    raw_reviews = hxs.select('//div[contains(@class, "review extended")]')
    for raw_review in raw_reviews:

        def field(xpath, node=raw_review):
            # Cleaned string for an xpath relative to this review block.
            return clean_parsed_string(get_parsed_review_element(node, xpath))

        ri = ReviewItem()
        ri['item_type'] = 'review'
        ri['hotel_id'] = hotel_id
        ri['review_id'] = field('@id')
        ri['review_date'] = review_date(
            field('div//span[contains(@class, "ratingDate")]/text()'))
        ri['reviewer_type'] = None  # TODO: Try to find the info and insert here
        ri['summary'] = field('div//div[contains(@class, "quote")]/text()')
        ri['reviewer_name'] = field(
            'div//div[contains(@class, "username mo")]/span/text()')
        ri['reviewer_rcount'] = first_int(field(
            'div//div[contains(@class, "totalReviewBadge")]'
            '//span[contains(@class, "badgeText")]/text()'))

        reviewer_locality = field(
            'div//div[contains(@class, "member_info")]'
            '//div[contains(@class, "location")]/text()')
        ri['reviewer_locality'] = (
            reviewer_locality.title() if reviewer_locality else None)

        ri['content'] = field('div//div[contains(@class, "entry")]//p')
        ri['rating'] = first_int(field(
            'div//div[contains(@class, "rating reviewItemInline")]//img/@alt'))
        ri['recommendations'] = raw_review.select(
            'div//li[contains(@class, "recommend-answer")]').extract()

        # A single pre-formatted argument keeps print() identical on
        # Python 2 and 3.
        print('%s:%s:%s' % (
            ri['review_id'], ri['reviewer_name'], ri['review_date']))
        yield ri

    # Find the next page link if available and yield it
    next_page_url = clean_parsed_string(
        get_parsed_string(
            hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
    if next_page_url:
        yield Request(url_start + next_page_url, self.parse)
def parse(self, response):
    """Crawl a TripAdvisor hotel listing page or an individual hotel page.

    Yields a Request for every hotel link found on the page and for the
    next listing page, and a HotelItem when ``response.url`` contains
    '/Hotel_Review'. Fields that cannot be located or parsed are set to
    None instead of raising, so one malformed page does not abort the
    callback.
    """
    url_start = 'http://www.tripadvisor.com'
    info_box = '//div[contains(@class, "wrap infoBox")]'
    hxs = HtmlXPathSelector(response)

    def text_at(xpath):
        # Cleaned string for the xpath; callers treat a falsy result as
        # "not found" (same convention as the original inline calls).
        return clean_parsed_string(get_parsed_string(hxs, xpath))

    def first_int(text):
        # Integer value of the first whitespace-separated token, e.g.
        # '1 review' -> 1 and '1,123 reviews' -> 1123. Returns None when
        # the text is empty or the token is not an integer.
        tokens = text.split() if text else []
        if not tokens:
            return None
        try:
            return int(tokens[0].replace(',', ''))
        except ValueError:
            return None

    # Parse the page for hotels and yield them if the page is a city page.
    # NOTE(review): the original indentation was lost; the next-page lookup
    # is assumed to belong to the listing branch -- confirm.
    hotel_urls = hxs.select(
        '//a[contains(@class, "property_title")]/@href').extract()
    if hotel_urls:
        for hotel_url in hotel_urls:
            yield Request(url_start + hotel_url, self.parse)

        # The next button is available both on the top and bottom of the
        # page; the bottom link is used here.
        next_page_url = text_at(
            '//div[contains(@id, "pager_bottom")]'
            '//a[contains(@class, "guiArw sprite-pageNext pid0")]/@href')
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

    # Anything below only applies to an individual hotel page.
    if '/Hotel_Review' not in response.url:
        return

    hi = HotelItem()
    hi['item_type'] = 'hotel'

    # Guard the match: a URL without a 'd<digits>' part must not crash.
    id_match = re.search('d[0-9]+', response.url)
    hi['hotel_id'] = id_match.group(0) if id_match else None

    hi['name'] = text_at('//h1[contains(@id, "HEADING")]/text()')
    hi['locality'] = text_at(
        info_box + '//span[contains(@property, "v:locality")]/text()')
    hi['region'] = text_at(
        info_box + '//span[contains(@property, "v:region")]/text()')
    hi['postal_code'] = text_at(
        info_box + '//span[contains(@property, "v:postal-code")]/text()')
    hi['country'] = text_at(
        info_box + '//span[contains(@property, "v:country-name")]/text()')

    rating_string = text_at(
        '//div[contains(@rel, "v:rating")]'
        '//img[contains(@class, "sprite-ratings")]/@alt')
    review_count = text_at(
        '//div[contains(@class, "rs rating")]'
        '//span[contains(@property, "v:count")]/text()')
    price_range = text_at('//span[contains(@property, "v:pricerange")]/text()')

    # The price range is stored as the length of the scraped string.
    hi['price_range'] = len(price_range) if price_range else None

    # Some review counts are written '1 review' instead of just '1123',
    # and large counts may carry thousands separators.
    hi['review_count'] = first_int(review_count)

    # The decimal point is escaped (the original '.' matched any character)
    # and the fraction is optional so '4 of 5 stars' parses as 4.0.
    rating_match = None
    if rating_string:
        rating_match = re.search(r'[0-9]+(?:\.[0-9]+)?', rating_string)
    hi['rating'] = float(rating_match.group(0)) if rating_match else None

    hi['url'] = response.url

    # Single-argument print() behaves the same on Python 2 and 3.
    print(hi['name'])
    yield hi