def parseDetail(self, response):
    """Build ``TripadvisorItem`` records from a detail-page response.

    Yields one item per ``div.ui_link`` node found on the page, with
    ``is_bookable`` set to True and that node's title/price stored under
    ``activity``/``price``. When no such node exists, a single item is
    yielded with ``is_bookable`` False and empty ``activity``/``price``.

    Extracted text is passed through ``self.validateStr`` and the locality
    text through ``self.validateCityUrl``; both helpers are defined outside
    this block. ``email`` is always stored as an empty string.
    """
    clean = self.validateStr

    page_url = response.request.url
    heading = clean(
        response.xpath("//h1[@id='HEADING']/text()").extract_first())

    # Every fragment is cleaned and followed by a single space; the joined
    # string is then cleaned once more as a whole.
    address_parts = response.xpath(
        "//div[@class='detail_section address']/span/text()").extract()
    full_address = clean(
        ''.join(clean(part) + ' ' for part in address_parts))

    locality = self.validateCityUrl(
        response.xpath("//span[@class='locality']/text()").extract_first())

    phone_parts = response.xpath(
        "//div[@class='blEntry phone']/span/text()").extract()
    full_phone = clean(
        ''.join(clean(part) + ' ' for part in phone_parts))

    review_count = clean(
        response.xpath("//span[@property='count']/text()").extract_first())
    overall_rating = clean(
        response.xpath(
            "//span[@class='overallRating']/text()").extract_first())

    offers = response.xpath("//div[@class='ui_link']")
    bookable = bool(offers)

    def build_item(activity_label, activity_price):
        # Fields are assigned in a fixed order so every yielded item has
        # the same key layout regardless of which branch produced it.
        record = TripadvisorItem()
        for key, value in (
                ('url', page_url),
                ('thing_to_do', heading),
                ('address', full_address),
                ('city', locality),
                ('phone_number', full_phone),
                ('email', ''),
                ('num_review', review_count),
                ('num_rating', overall_rating),
                ('is_bookable', bookable),
                ('activity', activity_label),
                ('price', activity_price),
        ):
            record[key] = value
        return record

    if not offers:
        # Nothing bookable on the page: emit one record without an offer.
        yield build_item('', '')
        return

    for offer in offers:
        offer_title = clean(
            offer.xpath(
                ".//div[@class='MultiTourOffer__title_container--3SBSu']//span[@class='MultiTourOffer__title--4PROg']/text()"
            ).extract_first())
        offer_price = clean(
            offer.xpath(
                ".//div[@class='MultiTourOffer__price_container--1yJni']//span[@class='fromPrice']/text()"
            ).extract_first())
        yield build_item(offer_title, offer_price)
    # TODO: "show more" button actions are not handled here.
def parse_each_attraction(self, response):
    """Scrape one attraction page and yield a single ``TripadvisorItem``.

    Fields set on the item:

    * ``PlaceTitle`` - concatenated, stripped text of ``h1#HEADING``.
    * ``PlaceURL`` - ``response.url``.
    * ``TotalReviews`` - text of the first ``a.more`` link up to the first
      ``"R"``, or ``""`` when the link is absent.
    * ``PlaceCategory`` / ``Ranking`` - stripped string value of the first
      ``div.separator`` / ``div.slim_ranking``, or ``""``.
    * ``StreetAddress``, ``AddressLocality``, ``AddressRegion``,
      ``PostCode``, ``Description`` - lists of extracted text nodes.
    * ``PhoneNumber``, ``LengthOfVisit``, ``Fee``, ``AverageRating``,
      ``Email`` - strings, ``""`` when the page lacks the expected data.
    * ``OpeningHours`` - ``"<day> <hours> , "`` repeated for each
      day/hours pair.

    Positional lookups are guarded, so a page with fewer nodes than
    expected produces empty fields instead of raising ``IndexError`` and
    losing the whole item.
    """
    trip_item = TripadvisorItem()

    trip_item['PlaceTitle'] = "".join(
        response.xpath("//h1[@id='HEADING']/text()").extract()).strip()
    trip_item['PlaceURL'] = response.url

    # Run the query once and reuse the result for both the check and the read.
    more_links = response.xpath('//a[@class="more"]/text()').extract()
    # Keeps the text before the first "R" (presumably the "Reviews" label;
    # TODO confirm against live markup).
    trip_item['TotalReviews'] = (
        more_links[0].split("R")[0] if more_links else "")

    # string(.) on the matched node gives its full text content without
    # serializing the node and parsing it again.
    separators = response.xpath("//div[@class='separator']")
    if separators:
        trip_item['PlaceCategory'] = separators[0].xpath(
            "string(.)").extract_first().strip()
    else:
        trip_item['PlaceCategory'] = ""

    street = response.xpath(
        '//span[@class="street-address"]/text()').extract()
    # The leading "//" is required: without it the path is evaluated
    # relative to the root node and never matches a nested span.
    extended = response.xpath(
        '//span[@class="extended-address"]/text()').extract()
    trip_item['StreetAddress'] = street + extended

    trip_item['AddressLocality'] = response.xpath(
        '//span[@class="locality"]/span[@property="addressLocality"]/text()'
    ).extract()
    trip_item['AddressRegion'] = response.xpath(
        '//span[@class="locality"]/span[@property="addressRegion"]/text()'
    ).extract()
    trip_item['PostCode'] = response.xpath(
        '//span[@class="locality"]/span[@property="postalCode"]/text()'
    ).extract()

    phone_texts = response.xpath(
        '//div[@class="phoneNumber"]/text()').extract()
    if phone_texts:
        phone_parts = phone_texts[0].split(":")
        # Normally the number follows a "label:" prefix; if there is no
        # colon, fall back to the whole text rather than raising.
        raw_phone = phone_parts[1] if len(phone_parts) > 1 else phone_parts[0]
        trip_item['PhoneNumber'] = raw_phone.strip()
    else:
        # Always populate the field, like the other optional fields.
        trip_item['PhoneNumber'] = ""

    rankings = response.xpath("//div[@class='slim_ranking']")
    if rankings:
        trip_item['Ranking'] = rankings[0].xpath(
            "string(.)").extract_first().strip()
    else:
        trip_item['Ranking'] = ""

    details = response.xpath(
        "//div[@class='details_wrapper']/div[@class='detail']/text()"
    ).extract()
    # The visit length is read from the third-from-last text node and the
    # fee from the last one; only index positions that actually exist.
    trip_item['LengthOfVisit'] = (
        details[-3].strip() if len(details) >= 3 else "")
    trip_item['Fee'] = details[-1].strip() if details else ""

    trip_item['Description'] = response.xpath(
        "//div[@class='listing_details']/p/text()").extract()

    ratings = response.xpath(
        "//div[@class='valueCount fr part']/text()").extract()
    # The average rating is taken from the third matching text node.
    trip_item['AverageRating'] = ratings[2] if len(ratings) > 2 else ""

    onclick_values = response.xpath(
        "//div[@class='taLnk fl']/@onclick").extract()
    email_value = ""
    if onclick_values:
        onclick_args = onclick_values[0].split(",")
        # The value is read from the seventh comma-separated piece of the
        # onclick attribute; skip it when the attribute is shorter.
        if len(onclick_args) > 6:
            email_value = onclick_args[6].strip("/'")
    trip_item['Email'] = email_value

    days = [
        d.strip()
        for d in response.xpath("//span[@class='days']/text()").extract()
        if d
    ]
    hours = [
        h.strip()
        for h in response.xpath("//span[@class='hours']/text()").extract()
        if h
    ]
    # Pairing through a dict keeps a single entry per distinct day label.
    days_hours = dict(zip(days, hours))
    trip_item['OpeningHours'] = "".join(
        day + " " + hour + " , " for day, hour in days_hours.items())

    yield trip_item