Пример #1
0
 def parse_common(self, response):
     """Start an item loader holding the fields common to both parsers.

     The page URL and the ``h1.title`` text are registered on a fresh
     ``ApartmentItem`` loader. The loader itself is returned (it is not
     loaded into an item here), so the caller can keep adding fields.
     """
     self.shutdown_on_error()
     loader = ItemLoader(ApartmentItem(), response=response)
     # Registration order is kept: url first, then title.
     loader.add_value('url', response.url)
     loader.add_css('title', 'h1.title::text')
     return loader
Пример #2
0
    def parse_item(self, response):
        """Extract a single apartment item from a flat detail page.

        Every field except ``url`` is read through an XPath expression. The
        three extra price fields share one data-table lookup that differs
        only in the German row label it searches for.

        @url https://www.immowelt.de/expose/2GT7W4N
        @returns items 1 1
        @scrapes url title address rooms size cold_rent_price warm_rent_price additional_price heating_price description
        """
        self.shutdown_on_error()
        loader = ItemLoader(ApartmentItem(), response=response)
        loader.add_value('url', response.url)

        # Common prefix of the "hardfact" boxes (size and cold rent).
        hardfact = '//div[contains(@class, "hardfacts")]/div[contains(@class, "hardfact")]'
        plain_fields = (
            ('title', '//h1/text()'),
            ('address', '//div[@class="location"]/span[@class="no_s"]/text()'),
            ('rooms', '//div[contains(@class, "quickfacts")]//div[@class="hardfact rooms"]/text()[1]'),
            ('size', hardfact + '[2]/text()[1]'),
            ('cold_rent_price', hardfact + '[1]/strong/text()'),
            ('description', '//div[contains(@class, "section_label")][starts-with(normalize-space(.), "Objekt")]'
                            '/following-sibling::div/child::p/text()'),
        )
        for field, xpath in plain_fields:
            loader.add_xpath(field, xpath)

        # Value cell that follows the label cell whose text starts with "{}".
        datarow_xpath = (
            '//div[contains(@class, "datatable")]/div[contains(@class, "datarow")]'
            '/div[contains(@class, "datalabel")][starts-with(normalize-space(.), "{}")]'
            '/following-sibling::div[contains(@class, "datacontent")]/text()'
        )
        labelled_prices = (
            ('warm_rent_price', 'Warmmiete'),
            ('additional_price', 'Nebenkosten'),
            ('heating_price', 'Heizkosten'),
        )
        for field, label in labelled_prices:
            loader.add_xpath(field, datarow_xpath.format(label))

        yield loader.load_item()
Пример #3
0
    def parse_item(self, response):
        """Parse a page with an apartment.

        Title, availability, description and address come straight from
        CSS/XPath selectors. Rent, size and room count are looked up in the
        meta-data table, whose header cells are paired with its value cells.

        @url https://www.city-wohnen.de/eng/berlin/32608-furnished-apartment-berlin-friedrichshain-pettenkoferstrasse
        @returns items 1 1
        @scrapes url title availability description neighborhood address warm_rent size rooms
        """
        self.shutdown_on_error()
        item = ItemLoader(ApartmentItem(), response=response)
        item.add_value('url', response.url)

        item.add_css('title', 'div.text_data > h2::text')
        item.add_css('availability', 'div.row > div.text_data > p::text')
        item.add_css('description', 'div.object_details div.col_left p::text')

        # extract_first() yields None instead of raising IndexError when the
        # node is missing, so one absent element no longer discards the whole
        # item. A None value is passed on as-is, exactly like the
        # features.get(...) lookups below.
        item.add_value(
            'neighborhood',
            response.css(
                'div.object_meta div.container div.text_data p strong::text'
            ).extract_first())
        item.add_xpath('address', "//li[@class='map']/a/@href")

        # Pair the table's header texts with its cell texts positionally.
        # NOTE(review): this assumes every <th> and <td> contributes exactly
        # one text node; otherwise the pairs shift -- confirm on the page.
        keys = response.css(
            'div.object_meta table.object_meta_data th::text').extract()
        values = response.css(
            'div.object_meta table.object_meta_data td::text').extract()
        features = dict(zip(keys, values))
        item.add_value('warm_rent', features.get('Rent'))
        item.add_value('size', features.get('Size'))
        item.add_value('rooms', features.get('Room/s'))

        return item.load_item()
Пример #4
0
    def parse_item(self, response):
        """Extract a single apartment item from a flat detail page.

        Apart from url, title and address, every field is the text of the
        element carrying a known ``id`` attribute.

        @url https://www.immonet.de/angebot/32437621?drop=sel&related=false
        @returns items 1 1
        @scrapes url title address rooms size cold_rent_price warm_rent_price additional_price description
        @scrapes equipment location
        """
        self.shutdown_on_error()
        loader = ItemLoader(ApartmentItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('title', '//h1/text()')
        loader.add_xpath(
            'address',
            '//div[contains(@class, "row")]//span[@id = "infobox-static-address"]/text()')

        # (item field, id of the element holding its text)
        fields_by_element_id = (
            ('rooms', 'equipmentid_1'),
            ('size', 'areaid_1'),
            ('cold_rent_price', 'priceid_2'),
            ('warm_rent_price', 'priceid_4'),
            ('additional_price', 'priceid_20'),
            ('heating_price', 'priceid_5'),
            ('description', 'objectDescription'),
            ('equipment', 'ausstattung'),
            ('location', 'locationDescription'),
            ('other', 'otherDescription'),
        )
        by_id_xpath = '//*[@id="{}"]/text()'
        for field, element_id in fields_by_element_id:
            loader.add_xpath(field, by_id_xpath.format(element_id))

        yield loader.load_item()
Пример #5
0
    def parse_item(self, response):
        """Parse an ad page, with an apartment.

        Fields that need post-processing (address, neighborhood, rooms) are
        extracted by hand; each of them is skipped when its source node is
        missing, instead of raising IndexError and losing the whole item.

        @url https://www.berlinovo.de/en/apartment/2-room-suite-house-heinrich-heine-stra-e-18-24-berlin-mitte
        @returns items 1 1
        @scrapes url title description location address other neighborhood rooms
        """
        self.shutdown_on_error()
        item = ItemLoader(ApartmentItem(), response=response)
        item.add_value('url', response.url)
        item.add_css('title', 'h1.title::text')

        # NOTE(review): in the three contains(@class, ...) predicates of this
        # method the class name is NOT quoted, so XPath reads it as a child
        # element name rather than a string. On a page without such elements
        # it evaluates to '' and the predicate is always true. The
        # expressions are left unchanged because the positional steps after
        # them may have been tuned against that behaviour -- confirm against
        # the live page before quoting.
        item.add_xpath(
            'description',
            '//div[contains(@class, field-name-body)]/div/div[4]/div/div/p/text()'
        )
        item.add_xpath(
            'location',
            '//div[contains(@class, field-name-field-position)]/div/div[5]/div[2]/div/text()'
        )

        # Common prefix of all rows inside the details block.
        details = '//*[@id="block-views-aktuelle-wohnung-block-3"]/div/div/div/div/div'

        # First text node is used as the zip code, second one as the street.
        zipcode = (response.xpath(
            details + '[3]/div/span/text()[1]').extract_first() or '').strip()
        street = (response.xpath(
            details + '[3]/div/span/text()[2]').extract_first() or '').strip()
        # With both parts present this equals u'{}, {}'.format(street, zipcode);
        # a missing part no longer leaves a dangling ", " in the address.
        address_parts = [part for part in (street, zipcode) if part]
        if address_parts:
            item.add_value('address', u', '.join(address_parts))

        item.add_xpath('equipment', details + '[18]/div/div/ul/li/span/text()')
        item.add_xpath('warm_rent', details + '[5]/span[2]/text()')
        item.add_xpath('other', details + '/span/text()')

        # The neighborhood is whatever follows the last "Berlin-" in the title.
        page_title = response.css('#page-title::text').extract_first()
        if page_title is not None:
            item.add_value('neighborhood',
                           page_title.strip().split('Berlin-')[-1])

        # The room count is the first run of digits in the rooms description.
        room_text = ' '.join(response.xpath(
            details +
            '[contains(@class, views-field-field-rooms-description)]/div/text()'
        ).extract())
        room_match = re.search(r'[0-9]+', room_text)
        if room_match:
            item.add_value('rooms', room_match.group())

        yield item.load_item()
Пример #6
0
    def parse(self, response):
        """Parse the items from the main list, then start requests to get more details.

        The number of rooms are only available on the list; maybe on purpose, to make scraping harder.

        Each ``<figure>`` on the list page yields a partial ``ApartmentItem``
        (url, rooms, size, address). Afterwards everything produced by the
        parent class' ``parse`` is yielded unchanged.

        A figure without a link is skipped and a missing rooms/size node
        leaves that field unset. Previously either case raised IndexError,
        which also prevented the parent's requests from being yielded.
        """
        for figure in response.xpath('//figure'):
            # Relative selectors query the figure in place; no need to
            # serialise it and re-parse the HTML.
            href = figure.xpath('.//a/@href').extract_first()
            if href is None:
                continue

            item = ApartmentItem()
            item['url'] = response.urljoin(href)

            rooms = figure.xpath('.//p/span[@class="rooms"]/text()').extract_first()
            if rooms is not None:
                item['rooms'] = rooms

            size = figure.xpath('.//p/span[@class="areaSize"]/text()').extract_first()
            if size is not None:
                item['size'] = size

            # normalize-space() returns a string, '' when there is no <h3>.
            item['address'] = figure.xpath('normalize-space(.//h3)').extract_first(default='')
            yield item

        yield from super().parse(response)
Пример #7
0
    def parse_item(self, response):
        """Parse a page with an apartment.

        Most fields are read from the response with XPath. The address is
        not part of the page itself: it is fetched with an extra blocking
        HTTP request to ``<url>/karte`` and cut out of that page with
        ``self.ADDRESS_REGEX``. If that request fails, times out, or yields
        no address fragment, the item is returned without an address.

        @url https://www.akelius.de/en/search/apartments/osten/berlin/2.7037.16
        @returns items 1 1
        @scrapes url title warm_rent_price size availability cold_rent_price description address
        """
        import logging

        self.shutdown_on_error()
        item = ItemLoader(ApartmentItem(), response=response)
        item.add_value('url', response.url)
        item.add_xpath('title', '//h2/text()')
        item.add_xpath(
            'warm_rent_price',
            '//h2/following-sibling::p[starts-with(normalize-space(.), "Total rent")]/text()'
        )
        item.add_xpath('size', '//h2//following-sibling::p[2]/text()')
        item.add_xpath(
            'location',
            '//h3[starts-with(normalize-space(.), "Location")]/following-sibling::div//span/text()'
        )
        item.add_xpath('availability', '//h2//following-sibling::p[4]/text()')
        item.add_xpath(
            'cold_rent_price',
            '//h3[starts-with(normalize-space(.), "Apartment")]/following-sibling::div[1]/p[2]/span/text()'
        )
        item.add_xpath(
            'description',
            '//h3[starts-with(normalize-space(.), "Building")]/following-sibling::div//span/text()'
        )

        # The map is shown with JavaScript; get the HTML
        # and use a regex to extract the part of the script with the address.
        map_url = response.url + '/karte'
        try:
            # requests waits indefinitely unless a timeout is given; this call
            # blocks the callback, so a stalled server would stall the crawl.
            map_response = requests.get(map_url, timeout=10)
        except requests.RequestException as error:
            logging.getLogger(__name__).warning(
                'Could not fetch the map page %s: %s', map_url, error)
            map_response = None

        if map_response is not None and map_response.status_code == 200:
            html_string = ''.join(self.ADDRESS_REGEX.findall(
                map_response.text))

            # Only build a tree when the regex actually matched something;
            # an empty document cannot be parsed into a usable root.
            if html_string.strip():
                root = etree.fromstring(html_string, etree.HTMLParser())
                if root is not None:
                    # Extract the address from the HTML.
                    item.add_value('address',
                                   ', '.join(root.xpath('//p/text()')))

        return item.load_item()
Пример #8
0
    def parse_item(self, response):
        """Extract a single apartment item from an ad page.

        ``self.DIV_PRE_MAPPING`` is iterated as field name -> CSS class of
        the ``<pre>`` element holding that field. The full address text is
        split on ``self.CITY``: the part up to and including the city goes
        to ``address``, whatever follows goes to ``neighborhood``. When the
        city does not occur, the whole text is stored as the address.

        @url https://www.immobilienscout24.de/expose/93354819
        @returns items 1 1
        @scrapes url title address neighborhood cold_rent_price warm_rent_price rooms
        """
        self.shutdown_on_error()
        loader = ItemLoader(ApartmentItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_css('title', 'h1#expose-title::text')

        pre_xpath = "//div/pre[contains(@class, '{}')]/text()"
        for field, css_class in self.DIV_PRE_MAPPING.items():
            loader.add_xpath(field, pre_xpath.format(css_class))

        address_texts = response.xpath(
            "//span[@data-qa='is24-expose-address']/div//text()").extract()
        full_address = ''.join(address_texts).strip()
        pieces = full_address.split(self.CITY)
        if len(pieces) > 1:
            # Put the city name back onto the street/zip part, then tidy up.
            street_zip = pieces[0] + self.CITY
            street_zip = street_zip.strip(' ,')
            street_zip = street_zip.replace(' (zur Karte) ', '')
            loader.add_value('address', street_zip)
            loader.add_value('neighborhood', ''.join(pieces[1:]).strip(' ,'))
        else:
            loader.add_value('address', full_address)

        loader.add_css('cold_rent_price', 'div.is24qa-kaltmiete::text')
        loader.add_css('warm_rent_price', 'dd.is24qa-gesamtmiete::text')
        loader.add_css('rooms', 'div.is24qa-zi::text')
        loader.add_xpath(
            'size', '//div[contains(@class, "is24qa-flaeche ")]/text()')
        loader.add_xpath(
            'active',
            '//div[contains(@class, "status-message")]'
            '/h3[starts-with(normalize-space(.), "Angebot")]/text()')
        yield loader.load_item()
Пример #9
0
    def parse_item(self, response):
        """Extract a single apartment item from an expose page.

        Numeric facts are read from info-table rows selected by their ``id``
        suffix; free-text sections are the paragraphs following an ``<h2>``
        whose text starts with a given German heading.

        @url http://www.merkur-berlin.de/?page_id=39&showExpose=1&exposeID=926C081BECA043C9BE7756469D94722F
        @returns items 1 1
        @scrapes url title address rooms size warm_rent description location
        """
        self.shutdown_on_error()
        loader = ItemLoader(ApartmentItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('title', '//h4[@class="entry-title"]/text()')
        loader.add_xpath('address', '//address/text()')

        # (item field, suffix of the table row id "infotable_<suffix>")
        table_fields = (
            ('rooms', 'Rooms'),
            ('size', 'AreaLiving'),
            ('warm_rent', 'PriceWarmmiete'),
            ('cold_rent', 'Price'),
        )
        table_xpath = ('//div[@class="infotables"]//tr[@id="infotable_{info}"]'
                       '/td[@class="infotable_value"]/text()')
        for field, info in table_fields:
            loader.add_xpath(field, table_xpath.format(info=info))

        # (item field, start of the section heading)
        section_fields = (
            ('description', 'Objekt'),
            ('equipment', 'Ausstattung'),
            ('location', 'Lage'),
            ('other', 'Mehr Angebote'),
        )
        section_xpath = ('//div[@class="infoblock"]/h2[starts-with(normalize-space(.), "{h2}")]'
                         '/following-sibling::p/text()')
        for field, h2 in section_fields:
            loader.add_xpath(field, section_xpath.format(h2=h2))

        return loader.load_item()