Пример #1
0
    def parse(self, response):
        """Yield one ``AirbnbItem`` per listing in the JSON response body.

        Listings are read from the first section of the first explore tab.
        The country is read once from the response's geography metadata and
        copied onto every item. ``service_fee`` is the difference between the
        quoted rate with service fee and the plain rate.
        """
        payload = json.loads(response.body)
        first_section = payload.get('explore_tabs')[0].get('sections')[0]
        listings = first_section.get('listings')
        geography = payload.get('metadata').get('geography')
        country = str(geography.get('country'))

        # Item fields copied unchanged from the listing details:
        # (item field, key in the listing details).
        copied_fields = (
            ('lat', 'lat'),
            ('lon', 'lon'),
            ('bedrooms', 'bedrooms'),
            ('bathrooms', 'bathrooms'),
            ('max_occupancy', 'person_capacity'),
        )

        for entry in listings:
            info = entry.get('listing')
            quote = entry.get('pricing_quote')

            item = AirbnbItem()
            item['listing_id'] = str(info.get('id'))

            base_amount = quote.get('rate').get('amount')
            total_amount = quote.get('rate_with_service_fee').get('amount')

            item['title'] = str(info.get('name'))
            item['country'] = country
            item['city'] = str(info.get('localized_city'))
            for field, source_key in copied_fields:
                item[field] = info.get(source_key)
            item['service_fee'] = total_amount - base_amount

            yield item
Пример #2
0
    def parse_room_second(self, response):
        """Handle a room response and queue its similar-listings lookup.

        When the request ``meta`` carries a truthy ``parse`` flag, the body is
        decoded as JSON and an ``AirbnbItem`` is yielded from its
        ``nightly_price`` together with the ``name``, ``owner``, ``owner_id``
        and ``id`` values passed through ``meta``. In every case a
        ``UserItem`` is yielded for each guest link matched on the page, and
        finally a request to the similar-listings API is yielded for the room
        id, with ``parse_similar`` as its callback.
        """
        # ``response.meta`` raises AttributeError when the response is not
        # tied to a request. Default the flag so the check below cannot hit
        # an unbound name.
        flag = None
        try:
            flag = response.meta.get('parse')
        except AttributeError as e:
            self.logger.info(e)

        if flag:
            self.logger.info('From First {}'.format(response.url))
            data = json.loads(response.text)
            price = data['nightly_price']  # not exactly
            name = response.meta.get('name')
            owner = response.meta.get('owner')
            owner_id = response.meta.get('owner_id')
            room_id = response.meta.get('id')
            yield AirbnbItem({'room_id': room_id, 'name': name, 'owner': owner, 'owner_id': owner_id,
                              'price': price})
            # Prefer the id embedded in the URL for the follow-up request;
            # keep the id from ``meta`` when the URL has no 3-10 digit run.
            url_ids = re.findall(r'\d{3,10}', response.url)
            if url_ids:
                room_id = url_ids[0]
        else:
            self.logger.info('From similar {}'.format(response.url))
            room_id = response.url.split('/')[-1]

        guest_list = response.css('div.show-inline-block > div:nth-child(2) > div > div.name > a')
        for one in guest_list:
            name = one.css('span::text').extract_first()
            guest_id = one.css('::attr(href)').extract_first()
            yield UserItem({'name': name, 'user_id': guest_id})

        self.logger.info('id = {}'.format(room_id))
        f = furl('https://zh.airbnb.com/api/v2/similar_listings?')
        arg = {'key': 'd306zoyjsyarp7ifhu67rjxn52tv0t20', 'currency': 'CNY', 'locale': 'zh',
               '_format': 'for_listing_card', 'filter_instant_book': 'false', 'listing_id': str(room_id)}
        f.add(args=arg)
        yield scrapy.FormRequest(url=f.url, callback=self.parse_similar)
Пример #3
0
    def parse_listing_contents(self, response):
        """Yield an ``AirbnbItem`` built from the page's bootstrap metadata.

        The ``_bootstrap-room_options`` meta tag's ``content`` attribute is
        parsed as JSON. Most fields come from its ``airEventData`` object;
        ``host_id`` and ``nightly_price`` come from the top level. When the
        tag is absent, only ``url`` is set on the item.
        """
        item = AirbnbItem()

        bootstrap = response.xpath(
            '//meta[@id="_bootstrap-room_options"]/@content').extract()
        if bootstrap:
            room_options = json.loads(bootstrap[0])
            event_data = room_options['airEventData']

            # (item field, source dict, source key), in population order.
            # response_rate is not collected.
            field_sources = (
                ('rev_count', event_data, 'visible_review_count'),
                ('amenities', event_data, 'amenities'),
                ('host_id', room_options, 'hostId'),
                ('hosting_id', event_data, 'hosting_id'),
                ('room_type', event_data, 'room_type'),
                ('price', event_data, 'price'),
                ('bed_type', event_data, 'bed_type'),
                ('person_capacity', event_data, 'person_capacity'),
                ('cancel_policy', event_data, 'cancel_policy'),
                ('rating_communication', event_data, 'communication_rating'),
                ('rating_cleanliness', event_data, 'cleanliness_rating'),
                ('rating_checkin', event_data, 'checkin_rating'),
                ('satisfaction_guest', event_data,
                 'guest_satisfaction_overall'),
                ('instant_book', event_data, 'instant_book_possible'),
                ('accuracy_rating', event_data, 'accuracy_rating'),
                ('response_time', event_data, 'response_time_shown'),
                ('nightly_price', room_options, 'nightly_price'),
            )
            for field, source, key in field_sources:
                item[field] = source[key]

        item['url'] = response.url
        yield item
Пример #4
0
 def parse_meetup(self, response):
     """Yield an ``AirbnbItem`` with the page's title, link and description.

     ``title`` and ``description`` are the lists of text nodes matched by
     their XPath queries; ``link`` is the response URL.
     """
     page = Selector(response)
     entry = AirbnbItem()

     title_nodes = page.xpath('//h1[@itemprop="name"]/text()')
     entry['title'] = title_nodes.extract()
     entry['link'] = response.url

     description_nodes = page.xpath(
         '//div[@id="past-event-description-wrap"]//text()')
     entry['description'] = description_nodes.extract()

     yield entry
Пример #5
0
 def parse_similar(self, response):
     """Yield an item and a follow-up room request per similar listing.

     The response body is JSON with a ``similar_listings`` array. For each
     entry an ``AirbnbItem`` is yielded, then a request for the room page
     with ``meta={'parse': False}`` and ``parse_room_second`` as callback.
     """
     payload = json.loads(response.text)
     for entry in payload['similar_listings']:
         listing = entry['listing']
         room_id = listing['id']
         name = listing['name']
         host = listing['primary_host']

         yield AirbnbItem({
             'room_id': room_id,
             'name': name,
             'owner': host['first_name'],
             'owner_id': host['id'],
             'price': entry['pricing_quote']['rate']['amount_formatted'],
         })

         room_url = 'https://zh.airbnb.com/rooms/' + str(room_id)
         yield scrapy.Request(
             url=room_url,
             meta={'parse': False},
             callback=self.parse_room_second)
Пример #6
0
    def parse_locations(self, response):
        """Return an ``AirbnbItem`` scraped from a listing page via XPath.

        Most fields hold the full list of matched strings. ``R_Hostname``
        takes the second match and ``R_Roomtype`` the first, so both raise
        ``IndexError`` when the page has too few matches.
        """
        page = Selector(response)

        def texts(query):
            # Every string matched by an XPath query, as a list.
            return page.xpath(query).extract()

        def labelled(label):
            # Text of the <strong> whose data-reactid contains "<label>=2.2".
            return texts(
                '//strong[contains(@data-reactid, "%s=2.2")]/text()' % label)

        site = AirbnbItem()

        site['R_Hostname'] = texts(
            '//a[contains(@href, "#host-profile")]/text()')[1]
        site['R_Hostprofile'] = texts(
            '//div[contains(@data-reactid, ".agcwfvnqbk.2.0.0.0.2.1")]/a/@href')
        site['R_Listname'] = texts('//h1[@id = "listing_name"]/text()')
        site['R_Reviews'] = texts('//span[@itemprop = "reviewCount"]/text()')

        # (item field, label inside data-reactid) for the summary section.
        summary_fields = (
            ('S_Accommodates', 'Accommodates'),
            ('S_Bedrooms', 'Bedrooms'),
            ('S_Bathrooms', 'Bathrooms'),
            ('S_Numbeds', 'Beds'),
            ('S_Bedtype', 'Bed type'),
            ('S_Checkin', 'Check In'),
            ('S_Checkout', 'Check Out'),
        )
        for field, label in summary_fields:
            site[field] = labelled(label)

        site['A_Availability'] = texts(
            '//div[@class = "col-md-6"]/strong/text()')
        # NOTE: it is unclear why this layout-class selector matches the
        # value rating.
        site['R_Value'] = texts('//div[@class = "col-sm-8"]/div/span/text()')
        # TODO: replace this layout-class selector with something sturdier.
        site['R_Roomtype'] = texts('//div[@class = "col-sm-3"]/text()')[0]

        site['A_Cleaningfee'] = labelled('Cleaning Fee')

        return site
Пример #7
0
    def parse(self, response):
        """Yield an ``AirbnbItem`` per listing card, then follow pagination.

        Each card is split into three parallel selections (bottom media
        block, top description, image panel) that are zipped together. After
        all cards are processed, a request for the next results page is
        yielded once if a ``next_page`` link exists, with this method as its
        callback.
        """
        def greatest_line(text):
            # Lexicographically greatest line of ``text``, stripped.
            return max(text.splitlines()).strip()

        page = Selector(response)
        listing_item = page.css(
            'div.col-sm-12.row-space-2.col-md-6').css('div.listing')
        items_top = listing_item.css('div.panel-image.listing-img').css(
            'div.listing-description')
        items_bottom = listing_item.css(
            'div.panel-body.panel-card-section').css('div.media')
        items_image = listing_item.css('div.panel-image.listing-img')

        for bottom, top, image in zip(items_bottom, items_top, items_image):
            item = AirbnbItem()
            item['name'] = bottom.css('a').css('h3::text').extract()[0].strip()
            # Expects exactly two links: the host profile, then the room.
            item['user_url'], item['room_url'] = bottom.xpath(
                'a/@href').extract()

            type_and_reviews = bottom.css(
                'div.text-muted.listing-location.text-truncate').css(
                    'a::text').extract()
            item['room_type'] = greatest_line(type_and_reviews[0])
            if len(type_and_reviews) == 2:
                # The review text is stored without its first two characters.
                item['reviews'] = greatest_line(type_and_reviews[1])[2:]

            item['summary'] = top.css('div.summary').css(
                'p::text').extract()[0].strip()
            item['address'] = top.css('p.address').css('p::text').extract()[0]
            item['image'] = image.css('a.media-photo.media-cover').css(
                'img').xpath('@src').extract()[0]

            # The price overlay holds both the amount and the currency mark.
            overlay = image.css(
                'a.link-reset.panel-overlay-bottom-left.panel-overlay-label.panel-overlay-listing-label'
            ).css('div')
            # TODO: add cookie to get consistent prices
            item['price'] = int(overlay.css(
                'span.h3.text-contrast.price-amount::text').extract()[0])
            item['coin'] = ''.join(
                overlay.css('sup.h6.text-contrast::text').extract())
            yield item

        # Pagination is handled once per page, after the listing loop, so the
        # request is not repeated per item and is still issued when the page
        # has no listing cards.
        next_page = page.xpath(
            '//li[contains(@class, "next_page")]').xpath('a/@href')
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield Request(url, self.parse)
Пример #8
0
	def parse(self, response):
		"""Yield one loaded ``AirbnbItem`` per entry of the JSON search results.

		The response body is decoded as JSON and the entries are read from
		``results_json.search_results``. ``host_url`` and ``room_url`` are
		built from the host id and room id. The loader is printed before the
		item is loaded, using the call form of ``print`` so the statement is
		valid on both Python 2 and Python 3.
		"""
		payload = json.loads(response.body_as_unicode())
		for result in payload['results_json']['search_results']:
			loader = ItemLoader(item=AirbnbItem(), response=response)

			listing = result['listing']
			host = listing['primary_host']
			pricing = result['pricing_quote']
			rate = pricing['rate']
			host_id = host['id']
			room_id = listing['id']

			# (item field, value) pairs in the order they are added.
			values = (
				('bedrooms', listing['bedrooms']),
				('beds', listing['beds']),
				('name', listing['name']),
				('person_capacity', listing['person_capacity']),
				('primary_host', host['first_name']),
				('host_id', host_id),
				('host_url', "https://zh.airbnb.com/users/show/{}".format(host_id)),
				('property_type', listing['property_type']),
				('room_id', room_id),
				('room_url', "https://zh.airbnb.com/rooms/{}".format(room_id)),
				('is_new_listing', listing['is_new_listing']),
				('public_address', listing['public_address']),
				('room_type', listing['room_type']),
				('star_rating', listing['star_rating']),
				('reviews_count', listing['reviews_count']),
				('guests', pricing['guests']),
				('amount', rate['amount']),
				('currency', rate['currency']),
			)
			for field, value in values:
				loader.add_value(field, value)

			print(loader)
			yield loader.load_item()
Пример #9
0
    def parse_listing_contents(self, response):
        """Yield an ``AirbnbItem`` built from the page's bootstrap metadata.

        The ``_bootstrap-room_options`` meta tag's ``content`` attribute is
        parsed as JSON. ``host_id`` and ``nightly_price`` come from its top
        level; the remaining fields come from its ``airEventData`` object.
        When the tag is absent, only ``url`` is set on the item.
        """
        item = AirbnbItem()

        meta_contents = response.xpath(
            '//meta[@id="_bootstrap-room_options"]/@content').extract()

        if meta_contents:
            room_options = json.loads(meta_contents[0])
            event_data = room_options['airEventData']

            item['host_id'] = room_options['hostId']
            # These item fields share their name with the airEventData key.
            same_named_fields = (
                'hosting_id',
                'room_type',
                'price',
                'bed_type',
                'person_capacity',
                'listing_lat',
                'listing_lng',
            )
            for field in same_named_fields:
                item[field] = event_data[field]
            item['nightly_price'] = room_options['nightly_price']

        item['url'] = response.url
        yield item
Пример #10
0
    def parse_details(self, response):
        """Yield an ``AirbnbItem`` scraped from a room page, mostly by regex.

        The room id comes from the response URL, the rating and review count
        from the rating button's ``aria-label``, the price from
        ``response.meta['price']``, and the remaining fields from regex
        searches over ``response.text``. Any field whose pattern does not
        match is set to ``''`` (``numRooms`` to ``0``) instead of raising, in
        line with how the room id, rating and review count were already
        handled. Rating fields are only looked up when a review count was
        found; otherwise they are all ``''``.

        Raises:
            KeyError: if ``price`` is missing from ``response.meta``.
        """
        print("-" * 50)

        def first_group(pattern, text, default=''):
            # Group 1 of the first match of ``pattern`` in ``text``, or
            # ``default`` when there is no match.
            match = re.search(pattern, text)
            return match.group(1) if match else default

        item = AirbnbItem()

        # Room id from the URL.
        room_id = first_group(r'rooms/([0-9]*)\?location', str(response.url))

        # Rating and review count both live in the button's aria-label.
        aria_label = str(
            response.xpath(
                '//button[@class="_ff6jfq"]/@aria-label').extract_first())
        rating = first_group('Rated ([0-5](.[0-9])?) out of 5', aria_label)
        num_reviews = first_group('from ([0-9]*) reviews', aria_label)

        price = response.meta['price']
        body = response.text

        # ---- Overview ----
        # shortDesc, isSuperhost and numBaths are not extracted.
        item['roomID'] = room_id
        item['numReviews'] = num_reviews
        item['price'] = price

        # ---- Host ----
        item['numHostReviews'] = response.xpath(
            '//span[@class="_e296pg"]/span[@class="_1uhfauip"]/text()'
        ).extract_first()

        # ---- Numbers of rooms/beds/guests ----
        item['numBeds'] = first_group(
            '"bed_label":"(.).*","bedroom_label"', body)
        item['numRooms'] = first_group(
            '"bedroom_label":"([0-9][0-9]?).*","guest_label"', body,
            default=0)
        # Try the loose guest-label pattern first, then the "N guest" form.
        num_guests = first_group(
            '"guest_label":".{1,8}([0-9][0-9]?).{1,8}",', body, default=None)
        if num_guests is None:
            num_guests = first_group(
                '"guest_label":"([0-9][0-9]?) guest.*', body)
        item['numGuests'] = num_guests

        # ---- Types of rooms/baths/beds ----
        item['bathType'] = first_group(
            '"bathroom_label":"[0-9].?[0-9]? (.*)","bed_label"', body)
        # Bedroom label with a leading digit first, then a short free-form one.
        bedroom_type = first_group(
            '"bedroom_label":"[0-9] (.*)","guest_label"', body, default=None)
        if bedroom_type is None:
            bedroom_type = first_group(
                '"bedroom_label":"(..?.?.?.?.?.?.?.?.?.?.?)","guest_label"',
                body)
        item['bedroomType'] = bedroom_type
        item['bedType'] = first_group(
            '"bed_label":"[0-9] (.*)","bedroom_label"', body)

        # ---- Coordinates ----
        # NOTE(review): this pattern only matches a two-digit latitude and a
        # negative two-digit longitude; other locations get ''.
        coordinates = re.search(
            '"listing_lat":([0-9]{2}.[0-9]*),"listing_lng":(-[0-9]{2}.[0-9]*),',
            body)
        item['latitude'] = coordinates.group(1) if coordinates else ''
        item['longitude'] = coordinates.group(2) if coordinates else ''

        # ---- Ratings ----
        # (item field, pattern). 'value' reads value_rating; it was previously
        # filled from the cleanliness_rating pattern.
        rating_patterns = (
            ('accuracy', '"accuracy_rating":([0-9][0-9]?),"'),
            ('communication', '"communication_rating":([0-9][0-9]?),"'),
            ('cleanliness', '"cleanliness_rating":([0-9][0-9]?),"'),
            ('location', '"location_rating":([0-9][0-9]?),"'),
            ('checkin', '"checkin_rating":([0-9][0-9]?),"'),
            ('value', '"value_rating":([0-9][0-9]?),"'),
            ('guestSatisfaction',
             '"guest_satisfaction_overall":([0-9][0-9][0-9]?),"'),
        )
        # Ratings are only looked up when the listing has reviews.
        has_reviews = bool(num_reviews)
        item['rating'] = rating if has_reviews else ''
        for field, pattern in rating_patterns:
            item[field] = first_group(pattern, body) if has_reviews else ''

        yield item