Пример #1
0
	def parse_detail(self, response):
		"""Scrape one detail page and yield a single populated ``RealestateItem``.

		Every value is picked by its position in the list of strings returned
		by an XPath query. A page on which one of those lists is shorter than
		expected raises ``IndexError`` and no item is yielded.

		:param response: the downloaded detail page.
		"""
		def texts(query):
			# Run one XPath query and return all matched strings.
			return response.xpath(query).extract()

		# Each shared query is evaluated once and then indexed, instead of
		# re-running the same XPath for every field that reads from it.
		scalars = texts('//div[@class="scalar__value"]/span/text()')
		grades = texts('//ol[@class="ordered__list__bucket"]//div/text()')
		degrees = texts('//ul[@class="breakdown-facts breakdown-facts--national"]//div/text()')

		state = texts('//div[@class="profile__bucket--3"]//span/text()')[0]
		city = texts('//div[@class="blank__bucket"]//a/text()')[0]
		# NOTE: this query uses the descendant axis (//span) while the other
		# scalar fields use the child axis (/span), so it is kept separate.
		population = texts('//div[@class="scalar__value"]//span/text()')[0]
		# The two review fields keep the full list of matches, not one value.
		number_of_reviews = texts('//div[@class="review__stars review__stars--white"]//span/text()')
		review_ratings = texts('//div[@class="review__stars review__stars--white"]//span/@class')

		item = RealestateItem()

		item['State'] = state
		item['City'] = city
		item['Median_Home_Value'] = scalars[1]
		item['Median_Rent'] = scalars[2]
		item['Population'] = population
		item['Area_Feel'] = scalars[3]
		item['Crime_Safty'] = grades[3]
		item['Diversity'] = grades[11]
		item['PublicSchool_level'] = grades[1]
		item['Children_percent'] = scalars[5]
		item['Master_Degree'] = degrees[1]
		item['Bachelor_Degree'] = degrees[4]
		item['Associate_Degree'] = degrees[7]
		item['Jobs_Level'] = grades[13]
		item['Median_salary'] = scalars[4]
		item['Cost_of_Living'] = grades[17]
		item['Weather'] = grades[15]
		item['NightLife'] = grades[7]
		item['Outdoor_Activity'] = grades[21]
		item['Numberof_reviews'] = number_of_reviews
		item['Review_ratings'] = review_ratings

		yield item
Пример #2
0
    def parse(self, response):
        """Convert one page of JSON search results into items and follow pagination.

        The response body is parsed as JSON. One ``RealestateItem`` is yielded
        per entry of ``results``; when a record is malformed the error is
        printed and the fields filled in so far are still yielded. A request
        for the next page is yielded while ``pagination.current`` is below
        ``pagination.pageCount``.

        :param response: response whose body is the JSON search payload.
        """
        json_data = json.loads(response.body)
        for data in json_data['results']:
            # Created outside the ``try`` so the handler below always yields
            # this record's own item, never an unbound or previous one.
            item = RealestateItem()
            try:
                item['online'] = 1
                item['website'] = self.name
                item[
                    'website_logo'] = 'https://media2.nexity.fr/nfr2016/picto/nexity-logo.svg'
                listing = data['0']
                item['url'] = 'https://www.nexity.fr/' + listing['url']
                item['pieces'] = listing['nb_piece']
                item['description'] = listing['description'].replace(
                    '<br>', '')
                item['title'] = listing['visuel_alt']

                if 'surface' in listing:
                    item['size'] = listing['surface']

                if 'location' in item['url']:
                    item['rent_buy'] = 'rent'
                else:
                    item['rent_buy'] = 'buy'
                item['type'] = 'appartment'
                item['city'] = listing['ville'].lower()
                item['district'] = listing['code_postal']
                item['price'] = data['c_prix_min']
                item['images'] = ','.join(
                    img['direct'] for img in listing['photos'])
                item['deposit'] = listing['depot_garantie']
                if 'etage' in listing:
                    item['floor'] = listing['etage']
            except Exception as e:
                # Best effort: report the problem and still emit the partial
                # item. ``str()`` keeps the handler itself from raising when
                # the first argument is not a string or there is none.
                detail = e.args[0] if e.args else e
                print("err: " + str(detail))

            self.count += 1
            print("Total Count: " + str(self.count))
            yield item

        pagination = json_data['pagination']
        total = int(pagination['pageCount'])
        current = int(pagination['current'])
        if current < total:
            next_page_url = 'https://www.nexity.fr/ws-rest/offre/biens/moteur.json?1=1&types_bien=Appartement,Maison/Villa&type_commercialisation=Location&pageNumber={}&pageSize=12&anciennete=0&locations=104&sortField=prix&sortOrder=asc&withPartners=1'.format(
                current + 1)
            yield Request(next_page_url, self.parse)
Пример #3
0
    def final_parse(self, response):
        """Build one rental ``RealestateItem`` from a JSON listing payload.

        The response body is parsed as JSON and its fields are copied onto
        the item. ``pieces`` and ``images`` are only set when a value could
        be extracted.

        :param response: response whose body is the JSON listing document.
        """
        json_data = json.loads(response.body)
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item[
            'website_logo'] = 'https://dc0r5opm7495b.cloudfront.net/assets/logos/logo_white.fr-d5e56db342eda1a81e02b633d1a339a708e5ed1d823ffa8bdd16db6eab5cc405.png'
        item['url'] = json_data['full_url']
        desc = json_data['description']
        if desc:
            desc = desc.replace('<br/>', '\n')
        item['description'] = desc
        item['title'] = json_data['listing_title_string']
        item['price'] = json_data['cost_total_rent']
        item['size'] = json_data['lodging_surface']
        item['type'] = json_data['lodging_type_string']
        item['deposit'] = json_data['cost_caution']
        item['other_charges'] = json_data['cost_charges']
        item['city'] = json_data['address_city']
        address_list = [json_data['address_city']]
        if json_data['address_street']:
            address_list.append(json_data['address_street'])
        item['address'] = ' '.join(address_list)

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        # ``or ''`` keeps a null size string from raising inside re.findall,
        # matching how the other nullable fields above are treated.
        pieces = re.findall(r'[\d.,]+', json_data['lodging_size_string'] or '')
        if pieces:
            item['pieces'] = pieces[0]

        # Prefer the large rendition of each picture, else the medium one.
        images = []
        for img in json_data['pictures']:
            if 'image_large' in img:
                images.append(img['image_large'])
            elif 'image_medium' in img:
                images.append(img['image_medium'])
        if images:
            item['images'] = ','.join(images)

        item['rent_buy'] = 'rent'

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #4
0
    def final_parse(self, response):
        """Build one ``RealestateItem`` from a listing's HTML page.

        Title, description, price, address and images come from XPath
        queries. The postal code and city are cut out of the raw page text
        after the ``"postalCode":`` and ``"addressLocality":`` markers.

        :param response: the downloaded listing page.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item[
            'website_logo'] = 'https://en.location-etudiant.fr/images/logo.png'
        item['url'] = response.url
        item['description'] = response.xpath(
            '//p[@itemprop="description"]/text()').extract_first()
        item['title'] = response.xpath(
            '//h1[@itemprop="name"]/text()').extract_first()

        price = response.xpath('//div[@class="aPartirDe"]/span/text()').re(
            r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        # Use the decoded text: str() of the bytes body gives its b'...' repr
        # on Python 3, which turns accented characters into escape sequences.
        page = response.text

        # Take what follows the last occurrence of each marker. When a marker
        # is absent the field is left unset instead of storing page markup.
        _, found, tail = page.rpartition('"postalCode":')
        if found:
            item['district'] = tail.split(',')[0].replace('"', '')
        _, found, tail = page.rpartition('"addressLocality":')
        if found:
            item['city'] = tail.split(',')[0].replace(
                '"', '').strip().split(' ')[0]
        item['address'] = response.xpath(
            '//span[@itemprop="addressLocality"]/text()').extract_first()

        # Thumbnails are site-relative; swap the size parameters in the URL.
        thumbnails = response.xpath(
            '//div[@class="photoVignette"]/img/@src').extract()
        item['images'] = ','.join(
            'https://www.location-etudiant.fr' + src.replace(
                'h=81&w=81', 'h=410&w=525')
            for src in thumbnails)

        if 'location' in response.url:
            item['rent_buy'] = 'rent'
        else:
            item['rent_buy'] = 'buy'

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #5
0
    def parse(self, response):
        """Yield one item carrying the page's name, price and URL.

        ``name`` and ``price`` are the concatenation of every text node
        matched by their XPath query (an empty string when nothing matches).
        """
        selector = Selector(response)
        name_parts = selector.xpath(
            '//section[@id="_mg_listing_detail"]//h1/text()').extract()
        price_parts = selector.xpath(
            '///section[@id="_mg_listing_detail"]/div/div[2]/div[2]/p[3]/text()'
        ).extract()

        item = RealestateItem()
        item['name'] = ''.join(name_parts)
        item['price'] = ''.join(price_parts)
        item['url'] = response.url

        yield item
 def parse_items(self, response):
     """
     Log the page URL, then yield one item per result ``<article>``.

     Each field holds the first match of its XPath query relative to the
     article, or ``None`` when the query matches nothing.
     """
     self.logger.info('Item Page %s', response.url)

     # (item field, XPath relative to the result article), in fill order.
     field_queries = (
         ('url', './/a[contains(@rel,"listingName")]/@href'),
         ('address', './/a[contains(@rel,"listingName")]/text()'),
         ('priceText', './/p[@class="priceText"]/text()'),
         ('bedrooms', './/dd[1]/text()'),
         ('bathrooms', './/dd[2]/text()'),
         ('cars', './/dd[3]/text()'),
     )
     for card in response.xpath('.//article[contains(@class,"resultBody")]'):
         item = RealestateItem()
         for field, query in field_queries:
             item[field] = card.xpath(query).extract_first()
         yield item
Пример #7
0
    def final_parse(self, response):
        """Build one ``RealestateItem`` from a listing's HTML page.

        Most values come from XPath queries. The city, district and size are
        parsed out of the last text node of the page heading. Optional fields
        are only set when a value was found.

        :param response: the downloaded listing page.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://static.paruvendu.fr/2018073108/communfo/img/structuresite/home/logoparuvendufr2016.png'
        item['url'] = response.url
        item['description'] = ''
        title = ' '.join(
            response.xpath('//h1[@class="auto2012_dettophead1txt1"]//text()').
            extract()).replace('\n', '').replace('\r', '').strip()
        item['title'] = title

        price = response.xpath('//div[@id="autoprix"]/text()').re(r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        # The type is the third-from-last segment of the URL.
        item['type'] = response.url.split('/')[-3]

        # The last text node of the heading carries size and location; guard
        # against a page without that heading instead of raising IndexError.
        heading_texts = response.xpath(
            '//h1[@class="auto2012_dettophead1txt1"]/text()').extract()
        addr_text = heading_texts[-1].strip() if heading_texts else ''
        if addr_text:
            addr = addr_text.split('\n')[-1].strip().split(' ')
            item['city'] = addr[0]

            district = re.findall(r'[\d]+', addr_text)
            if len(district) > 1:
                try:
                    item['district'] = int(district[-1])
                except ValueError:
                    pass

        images = response.xpath(
            '//div[@class="imdet15-ConteneurMiniGlob"]//img/@src').extract()
        if images:
            # Swap the thumbnail size in each URL for the large rendition.
            item['images'] = ','.join(
                img.replace('88x88', '1000x1000') for img in images)

        desc = ''.join(
            response.xpath(
                '//div[@class="im12_txt_ann im12_txt_ann_auto"]/text()').
            extract()).strip()
        if desc != '':
            item['description'] = desc

        if addr_text:
            # First word of the heading's first line holds the size figure.
            first_word = addr_text.split('\n')[0].strip().split(' ')[0]
            size = re.findall(r'[\d.,]+', first_word)
            if size:
                item['size'] = size[0]

        pieces = response.xpath(
            '//h1[@class="auto2012_dettophead1txt1"]/strong/text()').re(
                r'[\d]+')
        if pieces:
            item['pieces'] = pieces[0]

        options = response.xpath('//div[@class="im11_hd_det"]')
        for option in options:
            # A block without a <span> label yields None; treat it as empty
            # so the substring tests below cannot raise TypeError.
            key = option.xpath('./span/text()').extract_first() or ''
            if 'Dont charges/mois' in key:
                other_charges = option.xpath('./strong/text()').re(r'[\d.,]+')
                if other_charges:
                    item['other_charges'] = other_charges[0]
            elif 'Dépôt de garantie' in key:
                deposit = option.xpath('./strong/text()').re(r'[\d.,]+')
                if deposit:
                    item['deposit'] = ''.join(deposit)
            elif 'Honoraires' in key:
                agency_fee = option.xpath('./strong/text()').re(r'[\d.,]+')
                if agency_fee:
                    item['agency_fee'] = ''.join(agency_fee)

        agency_name = response.xpath(
            '//div[@class="contact16-lheig"]/strong/text()').extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip().replace('\n', ' ')

        # Fixed attribute name: the query previously tested "@clas", which
        # no element carries, so the logo was never found.
        agency_logo = response.xpath(
            '//div[@class="im11_blc_visite_R"]/a/img/@src').extract_first()
        if agency_logo:
            item['agency_logo'] = agency_logo

        agency_address = response.xpath(
            '//div[@class="contact16-lheig contact16-lname"]/span/text()'
        ).extract_first()
        if agency_address:
            item['agency_address'] = agency_address.strip()
        else:
            agency_address = response.xpath(
                '//div[@class="contact16-adr"]/text()').extract()
            if agency_address:
                item['agency_address'] = '\n'.join(agency_address)

        li_attrs = response.xpath('//ul[@class="imdet15-infoscles"]/li')
        for li_attr in li_attrs:
            furnished = li_attr.xpath('./strong/text()').extract_first()
            if 'Meublé' == furnished:
                item['furnished'] = 1

        floors = response.xpath(
            '//div[@class="im11_col_enr"]/dd/text()').extract()
        for text_floor in floors:
            if 'étage' in text_floor or 'Etage' in text_floor:
                # Store the number under 'floor'. It used to be written to
                # 'rent_buy', which is unconditionally overwritten below.
                numbers = re.findall(r'[\d]+', text_floor)
                if numbers:
                    item['floor'] = numbers[-1]

        if 'location' in response.url:
            item['rent_buy'] = 'rent'
        else:
            item['rent_buy'] = 'buy'

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #8
0
    def final_parse(self, response):
        """Build one rental ``RealestateItem`` from a listing's HTML page.

        Values come from XPath queries over the page: the breadcrumb (city
        and district), the pictogram list, the agency block, the description
        paragraphs and the characteristics list. Optional fields are only
        set when a value was found.

        :param response: the downloaded listing page.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://www.fnaim.fr/uploads/Image/6e/SIT_FNAIM_637_SIT_FNAIM_537_LOGOFNAIM-SSBASELINE-AGENCE.png'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        if title:
            item['title'] = title.strip()

        price = response.xpath(
            '//span[@itemprop="price"]/text()').extract_first()
        if price:
            item['price'] = price.replace(',', '.').replace(' ', '')

        images = response.xpath('//div[@id="carousel"]//img/@src').extract()
        if images:
            item['images'] = ','.join(images)

        crumbs = response.xpath(
            '//p[@id="chemin"]/span//span[@itemprop="title"]/text()').extract()
        # The fourth breadcrumb is read as "<city> <district>". Shorter
        # breadcrumbs or a crumb without a space are skipped, not raised on.
        if len(crumbs) > 3:
            crumb_words = crumbs[3].split(' ')
            item['city'] = crumb_words[0]
            if len(crumb_words) > 1:
                district = ''.join(re.findall(r'[\d.,]+', crumb_words[1]))
                try:
                    item['district'] = int(district)
                except ValueError:
                    pass

        area = response.xpath('//li[@class="picto surface"]/b/text()').re(
            r'[\d.,]+')
        if area:
            item['size'] = area[0].replace(',', '.')

        pieces = response.xpath('//li[@class="picto pieces"]/b/text()').re(
            r'[\d.,]+')
        if pieces:
            item['pieces'] = pieces[0]

        agency_name = response.xpath(
            '//div[@class="libelle"]/a/text()').extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip()

        agency_address = response.xpath(
            '//div[@class="coordonnees"]/p/text()').extract()
        if agency_address:
            item['agency_address'] = ' '.join(
                addr.strip() for addr in agency_address)
        item['agency_logo'] = response.xpath(
            '//a[@class="visuel"]/img/@src').extract_first()

        # Fee lines are the description paragraphs without the itemprop.
        fee_lines = response.xpath(
            '//div[@class="description"]/p[not(@itemprop="description")]/text()'
        ).extract()
        for line in fee_lines:
            if 'provision pour charges' in line:
                field = 'other_charges'
            elif 'Honoraires charge locataire' in line:
                field = 'agency_fee'
            elif 'Dépôt de garantie' in line:
                field = 'deposit'
            else:
                continue
            amount = ''.join(re.findall(r'[\d.,]+', line))
            item[field] = amount.replace(',', '.')

        descs = response.xpath(
            '//div[@class="description"]/p[@itemprop="description"]/text()'
        ).extract_first()
        if descs:
            item['description'] = descs.strip()

        characteristics = response.xpath(
            '//div[@class="caracteristique tab-left"]/ul/li')
        for row in characteristics:
            label = row.xpath('./label/text()').extract_first()
            if not label:
                continue
            value = row.xpath('./text()')
            if 'Type d’habitation' in label:
                type1 = value.extract_first()
                if type1:
                    item['type'] = type1.strip()
            elif 'Surface habitable' in label:
                area = value.re(r'[\d.,]+')
                if area:
                    item['size'] = area[0].replace(',', '.')
            elif 'Meublé' in label:
                furnished = value.extract_first()
                if furnished:
                    item['furnished'] = 1 if 'Oui' == furnished.strip() else 0
            elif 'chambre' in label:
                rooms = value.re(r'[\d.,]+')
                if rooms:
                    item['rooms'] = rooms[0]
            elif 'construction' in label:
                construction_year = value.re(r'[\d.,]+')
                if construction_year:
                    item['construction_year'] = construction_year[0]
            elif 'Nombre d’étages:' in label:
                # Recognised, but no item field is filled from this row.
                continue
            elif 'Étage' in label:
                floors = value.re(r'[\d.,]+')
                if floors:
                    item['floor'] = floors[0]

        item['rent_buy'] = "rent"

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #9
0
    def final_parse(self, response):
        """Build one rental ``RealestateItem`` from a listing's HTML page.

        Values come from XPath queries over the heading, the complementary
        price block, the agency block and the feature list. Optional fields
        are only set when a value was found.

        :param response: the downloaded listing page.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://header-figaroimmobilier.figarocms.com/img/logo-figimmo.4f72456.svg'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath(
            '//div[@id="contenu"]/div/h1/text()').extract_first()
        if title:
            item['title'] = title.strip()

        images = response.xpath(
            '//div[@class="item js-img-popup"]/a/@href').extract()
        if images:
            item['images'] = ','.join(images)

        location = response.xpath(
            '//div[@id="contenu"]/div/h1/span/text()').extract_first()
        if location:
            # Second word is used as the city and the third, with its
            # ordinal suffix removed, as the district. A heading of one word
            # is skipped instead of raising IndexError.
            locations = location.strip().split(' ')
            if len(locations) > 1:
                item['city'] = locations[1]
            if len(locations) > 2:
                district = locations[2].replace('me', '').replace(
                    'è', '').replace('er', '')
                try:
                    item['district'] = int(district)
                except ValueError:
                    pass

        price = response.xpath(
            '//div[@id="js-complements-infos"]//span[@class="price"]/text()'
        ).re(r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        other_charges = response.xpath(
            '//div[@id="js-complements-infos"]//span[@class="charges"]/text()'
        ).re(r'[\d.,]+')
        if other_charges:
            item['other_charges'] = ''.join(other_charges).replace(',', '.')

        deposit = response.xpath(
            '//div[@id="js-complements-infos"]//span[@class="garantie"]/span[@class="value"]/text()'
        ).re(r'[\d.,]+')
        if deposit:
            item['deposit'] = ''.join(deposit).replace(',', '.')

        agency_fee = response.xpath(
            '//div[@id="js-complements-infos"]//span[@class="honoraires"]/span[@class="value"]/text()'
        ).re(r'[\d.,]+')
        if agency_fee:
            item['agency_fee'] = ''.join(agency_fee).replace(',', '.')

        agency_name = response.xpath(
            '//div[@class="container-agency-infos "]/span[@class="agency-name"]/text()'
        ).extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip()

        agency_blocks = response.xpath('//div[@class="agency-location"]')
        if agency_blocks:
            # Only the first address block is used.
            address_lines = agency_blocks[0].xpath('./text()').extract()
            if address_lines:
                item['agency_address'] = ' '.join(
                    line.strip() for line in address_lines)

        agency_logo = response.xpath(
            '//a[@class="agencyInformation"]/img/@src').extract_first()
        if agency_logo:
            item['agency_logo'] = agency_logo

        descs = response.xpath(
            '//p[@id="js-clicphone-description"]/text()').extract_first()
        if descs:
            item['description'] = descs.strip()

        type1 = response.xpath(
            '//div[@id="js-container-secondary-infos"]//ul[@class="unstyled flex"]/li/text()'
        ).extract()
        # The type is the second text node; skip when there is none.
        if len(type1) > 1:
            item['type'] = type1[1].strip()

        features = response.xpath(
            '//div[@class="container-features"]/ul[@class="list-features"]/li')
        for feature in features:
            label = feature.xpath('./text()').extract_first()
            if not label:
                continue
            numbers = feature.xpath('./text()').re(r'[\d.,]+')
            if 'm²' in label:
                if numbers:
                    item['size'] = numbers[0].replace(',', '.')
            elif 'pièce' in label:
                if numbers:
                    item['pieces'] = numbers[0]
            elif 'chambre' in label:
                if numbers:
                    item['rooms'] = numbers[0]
            elif 'salle de bain' in label or 'Toilettes:' in label:
                # Recognised so these rows are not mistaken for a later
                # category, but nothing is stored. The bathroom branch used
                # to write the (possibly unbound) bedroom variable back into
                # item['rooms'].
                pass
            elif 'étage' in label:
                if numbers:
                    item['floor'] = numbers[0]

            # Only a row that mentions 'Meublé' decides the furnished flag.
            # Previously every row overwrote it, so any unrelated feature
            # without 'Non' marked the listing as furnished.
            # NOTE(review): assumes a negative row contains 'Non' - confirm
            # against the live page markup.
            if 'Meublé' in label:
                item['furnished'] = 0 if 'Non' in label else 1

        item['rent_buy'] = "rent"

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #10
0
    def final_parse(self, response):
        """Build one rental ``RealestateItem`` from a listing's HTML page.

        Nothing is yielded when the page has no title. Otherwise values are
        taken from XPath queries and from markers in the raw page text
        (``"description":"``, ``Property type:``, ``Floor area:``,
        ``Floor:``). Optional fields are only set when a value was found.

        :param response: the downloaded listing page.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        # Stored under 'website_logo' like the sibling parsers do: the value
        # is an image URL, not the site address.
        item['website_logo'] = 'https://d26q4asbryw2nm.cloudfront.net/2390803/bundles/sahapp/favicon/largetile.png'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath('//div[@class="property-title-section"]/h1/text()').extract_first()
        if not title:
            # Pages without a title produce no item.
            return
        item['title'] = title

        price = response.xpath('//span[@class="rentable-unit-price"]/text()').re(r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        # First guess for the city: the first path segment of the URL.
        item['city'] = response.url.split('https://www.spotahome.com/')[-1].split('/')[0]

        district = response.xpath('//div[@class="property-title-section"]/h1/text()').re(r'[\d.,]+')
        if district:
            try:
                item['district'] = int(district[-1].replace(',', ''))
            except ValueError:
                pass
        addr = response.xpath('//section[@class="l-main-section l-property-main-section"]/div/div[@class="breadcrumb"]/span/text()').extract()
        if addr:
            item['address'] = addr[-1]

        # A structured address, when present, overrides city and district.
        addr = response.xpath('//div[@itemprop="address"]/p/text()').extract_first()
        if addr:
            item['city'] = addr.split(' ')[0]

            district = response.xpath('//div[@itemprop="address"]/p/text()').re(r'[\d.,]+')
            if len(district) > 1:
                item['district'] = district[-1]

        images = response.xpath('//meta[@itemprop="image"]/@content').extract()
        new_imgs = [img.strip() for img in images if img.strip()]
        if new_imgs:
            item['images'] = ','.join(new_imgs)

        area = response.xpath('//div[@class="left-panel"]//div[@class="btn btn-default btn-rounded btn-top-cover-default bold btn-shadow"]/text()').re(r'[\d.,]+')
        if area:
            item['size'] = area[-1].replace(',', '.')

        furnished = response.xpath('//div[@class="AvailableRoomFeatures"]/text()').extract_first()
        if furnished == 'Furnished':
            item['furnished'] = 1

        # The description is the first '"description":"' value that starts
        # with '<p>'; its paragraphs become lines.
        new_desc = []
        for d in response.text.split('"description":"'):
            if d[:3] == '<p>':
                d = d.split('"')[0]
                d = d.strip().replace('</p>', '')
                for dd in d.split('<p>'):
                    dd = dd.strip()
                    if dd:
                        new_desc.append(dd)
                break
        if new_desc:
            item['description'] = '\n'.join(new_desc)

        if 'Property type:' in response.text:
            type1 = response.text.split('Property type:')[-1].split('</li>')[0]
            if type1:
                item['type'] = type1.strip()

        # Raw strings below: '\d' in a plain literal is an invalid escape.
        if 'Floor area:' in response.text:
            area = response.text.split('Floor area:')[-1].split('</li>')[0]
            area = re.findall(r'[\d.,]+', area)
            if area:
                item['size'] = area[0].replace(',', '.')
        if 'Floor:' in response.text:
            floor = response.text.split('Floor:')[-1].split('</li>')[0]
            floor = re.findall(r'[\d.,]+', floor)
            if floor:
                item['floor'] = floor[0]

        self.count += 1
        print("Total Count: " + str(self.count))
        item['rent_buy'] = 'rent'
        yield item
Пример #11
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The site logo URL stored on the item points at century21.fr. The
        method reads title, price, images, description, location and the
        characteristics list with XPath, increments ``self.count`` and
        yields the populated item.

        :param response: page response; must provide ``xpath``, ``urljoin``
            and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://www.century21.fr/theme/generic/css/images/logo_century21-header.png'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath('//h1[@class="h1_page"]//text()').extract_first()
        if title:
            item['title'] = title

        price = response.xpath('//div[@id="focusAnnonceV2"]/section/span[@class="yellow"]/b/text()').re(r'[\d.,]+')
        if price:
            # Joining and replacing on a list of strings cannot raise, so no
            # try/except is needed here.
            item['price'] = ''.join(price).replace(',', '.')

        images = response.xpath('//div[@class="zone-galerie"]/div/a//img/@src').extract()
        if images:
            # Image sources may be relative; make them absolute.
            item['images'] = ','.join(response.urljoin(img) for img in images)

        desc = response.xpath('//div[@class="desc-fr"]/p/text()').extract_first()
        if desc:
            item['description'] = desc.strip()

        # The title is split on ' - ' and the city / district are read from
        # the third-last and second-last parts. A missing title used to raise
        # AttributeError, and a title with fewer than three parts either
        # raised IndexError or silently wrapped around to the wrong part.
        if title:
            title_parts = title.split(' - ')
            if len(title_parts) >= 3:
                item['city'] = title_parts[-3]
                try:
                    item['district'] = int(title_parts[-2])
                except ValueError:
                    # Second-last part is not a number: leave district unset.
                    pass

        characteristics_tds = response.xpath('//div[@class="col-gauche-slide"]/div/ul/li')
        item['furnished'] = 0
        for td in characteristics_tds:
            label = ''.join(td.xpath('./span/text()').extract())
            if not label:
                continue
            if 'Location meublée' in label:
                item['furnished'] = 1
            elif 'Nombre de pièces' in label or 'Type d\'appartement' in label:
                # Both labels feed the same 'pieces' field.
                pieces = td.xpath('./text()').extract_first()
                if pieces:
                    item['pieces'] = pieces.strip()
            elif 'Surface totale' in label:
                area = td.xpath('./text()').re(r'[\d.,]+')
                if area:
                    item['size'] = area[0].replace(',', '.')
            elif 'Année construction' in label:
                construction_year = td.xpath('./text()').re(r'[\d.,]+')
                if construction_year:
                    item['construction_year'] = construction_year[0]
            elif 'Honoraires charge locataire' in label:
                agency_fee = td.xpath('./text()').re(r'[\d.,]+')
                if agency_fee:
                    item['agency_fee'] = agency_fee[0].replace(',', '.')
            elif 'Dépôt de garantie' in label:
                deposit = td.xpath('./text()').re(r'[\d.,]+')
                if deposit:
                    item['deposit'] = deposit[0].replace(',', '.')
            elif 'Détail du loyer' in label:
                other_charges = td.xpath('./ul/li/text()').re(r'[\d.,]+')
                if other_charges:
                    item['other_charges'] = other_charges[0].replace(',', '.')

        # Breadcrumb entries are compared as whole strings (list membership).
        rent = "buy"
        tt = response.xpath('//div[@id="filAriane"]//span[@itemprop="title"]/text()').extract()
        if 'Location Appartement' in tt:
            item['type'] = 'Appartement'
        if 'Location' in tt:
            rent = 'rent'
        item['rent_buy'] = rent

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #12
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The site logo URL stored on the item points at avendrealouer.fr.
        Title-derived fields (type, district, city), images, price,
        description, the characteristics table, the pricing list and the
        agency details are read with XPath. ``self.count`` is incremented
        and the populated item is yielded.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['url'] = response.url
        item['description'] = ''

        title = response.xpath(
            '//div[@class="fd-title"]/h1/span[@class="mainh1"]/text()'
        ).extract_first()
        if title:
            title = title.strip()
            item['title'] = title

            # Everything below is derived from the title. It used to run even
            # when the title was missing (AttributeError on None) or too
            # short (IndexError / wrap-around index).
            words = title.split(' ')
            if len(words) > 1:
                item['type'] = words[1]
            # District: last run of digits found after the last 'Paris'.
            district_digits = re.findall(r'\d+', title.split('Paris')[-1])
            if district_digits:
                item['district'] = int(district_digits[-1])
            if len(words) >= 4:
                item['city'] = words[-4]

        images = response.xpath(
            '//div[@id="bxSliderContainer"]//img[contains(@id, "media")]/@src'
        ).extract()
        if images:
            item['images'] = ','.join(images)

        price = response.xpath('//span[@id="fd-price-val"]/text()').re(
            r'[\d.,]+')
        if price:
            item['price'] = ''.join(price)

        descs = response.xpath(
            '//div[@id="propertyDesc"]/text()').extract_first()
        if descs:
            item['description'] = descs.strip()

        item['furnished'] = 0
        characteristics_tds = response.xpath(
            '//div[@class="property-description-characteristics"]/table//td')
        for td in characteristics_tds:
            spans = td.xpath('./span/text()').extract()
            if len(spans) <= 1:
                continue
            label = spans[0]
            if 'Surface:' in label:
                area = td.xpath('./span[2]/text()').re(r'[\d.,]+')
                if area:
                    item['size'] = area[0]
            elif 'Pièce(s):' in label:
                pieces = td.xpath('./span[2]/text()').re(r'[\d.,]+')
                if pieces:
                    item['pieces'] = pieces[0]
            elif 'Chambre(s):' in label:
                rooms = td.xpath('./span[2]/text()').re(r'[\d.,]+')
                if rooms:
                    item['rooms'] = rooms[0]
            elif 'Salle(s)' in label or 'Toilettes:' in label:
                # Bathroom / toilet counts are recognised but not stored.
                # The branch is kept so such cells never fall through to
                # the labels tested below.
                pass
            elif 'Nombre d\'étages:' in label:
                floors = td.xpath('./span[2]/text()').re(r'[\d.,]+')
                if floors:
                    item['floor'] = floors[0]
            elif 'Construit en:' in label:
                construction_year = td.xpath(
                    './span[2]/text()').extract_first()
                if construction_year:
                    item['construction_year'] = construction_year
            elif 'Meublé' == label:
                item['furnished'] = 1

        pricing_data_spans = response.xpath(
            '//div[@class="pricing-data"]/ul/li/span')
        for span in pricing_data_spans:
            text = span.xpath('./text()').extract_first()
            if not text:
                continue
            if 'Loyer mensuel:' in text:
                # Monthly rent is recognised but not stored. The branch is
                # kept so this span is not matched by a later label.
                pass
            elif 'Charges mensuelles:' in text:
                other_charges = span.xpath('./text()').re(r'[\d.,]+')
                if other_charges:
                    item['other_charges'] = ''.join(other_charges)
            elif 'Honoraires à la charge du locataire:' in text:
                # Only the part before the first '(' is scanned for digits.
                agency_fee = re.findall(r'[\d.,]+', text.split('(')[0])
                if agency_fee:
                    item['agency_fee'] = ''.join(agency_fee)
            elif 'Dépôt de garantie:' in text:
                deposit = re.findall(r'[\d.,]+', text.split('(')[0])
                if deposit:
                    item['deposit'] = ''.join(deposit)

        agency_name = response.xpath(
            '//div[@class="agency-title"]/span/@title').extract_first()
        if agency_name:
            item['agency_name'] = agency_name

        agency_address = response.xpath(
            '//div[@class="agency-address"]/span/text()').extract_first()
        if agency_address:
            item['agency_address'] = agency_address
        agency_logo = response.xpath(
            '//div[@class="agency-logo"]/img/@src').extract_first()
        if agency_logo:
            item['agency_logo'] = agency_logo

        # A URL containing 'location' marks a rental; anything else is 'buy'.
        rent = "buy"
        if 'location' in str(response.url):
            rent = "rent"
        item['rent_buy'] = rent

        self.count += 1
        print("Total Count: " + str(self.count))
        item[
            'website_logo'] = 'https://www.avendrealouer.fr/Content/Default/Images/57x57-logoAVAL.png'
        yield item
Пример #13
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The site logo URL stored on the item points at pap.fr. Title, price,
        breadcrumb-derived rent flag and type, the tag list (pieces, size,
        rooms), location, images and description are read with XPath.
        ``self.count`` is incremented and the item is yielded.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://www.pap.fr/images/logos/logo-pap.png'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath(
            '//h1[@class="item-title"]/span[@class="h1"]/text()'
        ).extract_first()
        if title:
            item['title'] = title

        price = response.xpath(
            '//h1[@class="item-title"]/span[@class="item-price"]/text()').re(
                r'[\d.,]+')
        if price:
            # Dots are removed from the joined digits (not converted).
            item['price'] = ''.join(price).replace('.', '')

        breadcrumb = response.xpath(
            '//a[@itemprop="item"]/span[@itemprop="name"]/text()'
        ).extract_first()
        # A missing breadcrumb node used to raise AttributeError on None.
        if breadcrumb:
            words = breadcrumb.split(' ')
            # 'rent_buy' is only set when the first word is 'Location'.
            if words[0] == 'Location':
                item['rent_buy'] = 'rent'
            kind = words[-1].strip()
            if kind:
                item['type'] = kind

        characteristics_tds = response.xpath(
            '//ul[@class="item-tags"]/li/strong')
        for td in characteristics_tds:
            label = td.xpath('./text()').extract_first()
            if not label:
                continue
            if 'pièce' in label:
                pieces = td.xpath('./text()').re(r'[\d.,]+')
                if pieces:
                    item['pieces'] = pieces[0]
            elif 'm²' in label:
                area = td.xpath('./text()').re(r'[\d.,]+')
                if area:
                    item['size'] = area[0].replace(',', '.')
            elif 'chambre' in label:
                rooms = td.xpath('./text()').re(r'[\d.,]+')
                if rooms:
                    item['rooms'] = rooms[0]

        addr = response.xpath(
            '//div[@class="item-description margin-bottom-30"]/h2/text()'
        ).extract_first()
        if addr:
            item['city'] = addr.strip().split(' ')[0]

            district = response.xpath(
                '//div[@class="item-description margin-bottom-30"]/h2/text()'
            ).re(r'[\d.,]+')
            if district:
                try:
                    item['district'] = int(district[-1])
                except ValueError:
                    # The match may contain '.' or ',': leave district unset.
                    pass

        images = response.xpath(
            '//div[@class="owl-thumbs"]/a/img/@src').extract()
        if images:
            item['images'] = ','.join(images)

        desc = response.xpath(
            '//div[@class="margin-bottom-30"]/p/text()').extract()
        # Keep only non-blank paragraphs, stripped, one per line.
        paragraphs = [d.strip() for d in desc if d.strip()]
        if paragraphs:
            item['description'] = '\n'.join(paragraphs)

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #14
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The URL prefix stripped below is flatlooker.com. Nothing is yielded
        (and ``self.count`` is not incremented) when the page has no title
        heading. Otherwise price, city and district come from the document
        ``<title>``, and type, images, address, size, furnished flag,
        description, charges, deposit, agency fee and floor are read from
        the page body.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding at most one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath(
            '//div[@class="col-md-8 hidden-xs hidden-sm"]/h3/text()'
        ).extract_first()
        if not title:
            return
        item['title'] = title

        page_title = response.xpath('head/title/text()').extract_first()
        if page_title:
            parts = page_title.split(' - ')
            # Last part carries the price, the one before it the city.
            price = ''.join(re.findall(r'[\d.,]+', parts[-1]))
            item['price'] = price.replace(',', '.')
            item['city'] = parts[len(parts) - 2]
            try:
                item['district'] = int(parts[len(parts) - 3])
            except (IndexError, ValueError):
                # Too few parts or non-numeric part: leave district unset.
                pass

        type1 = response.url.replace('https://www.flatlooker.com/',
                                     '').split('/')[0]
        if type1:
            item['type'] = 'appartement' if 'appartement' in type1 else type1

        images = response.xpath(
            'body/div[@class="container-fluid"]/img/@src').extract()
        if images:
            item['images'] = ','.join(images)

        addr = response.xpath(
            '//div[@class="left-panel"]/h4[@class="orange bold"]/text()'
        ).extract_first()
        if addr:
            item['address'] = addr.strip()

        area = response.xpath(
            '//div[@class="left-panel"]//div[@class="btn btn-default btn-rounded btn-top-cover-default bold btn-shadow"]/text()'
        ).re(r'[\d.,]+')
        if area:
            # First number found is stored as pieces, the last one as size.
            item['pieces'] = area[0]
            item['size'] = area[-1].replace(',', '.')

        furnished = response.xpath(
            '//div[@class="left-panel"]/div[@class="flex-vcenter"]/p/text()'
        ).extract_first()
        # The node may be absent; "'Non' in None" used to raise TypeError.
        if furnished is not None:
            item['furnished'] = 0 if 'Non' in furnished else 1

        descs = response.xpath(
            '//div[@id="annonce"]/div[2]//div[@class="block-with-text"]//text()'
        ).extract()
        lines = [d.strip() for d in descs if d.strip()]
        if lines:
            item['description'] = '\n'.join(lines)

        other_charges = response.xpath(
            '//span[@id="valueChargeRentProperty"]/text()').re(r'[\d.,]+')
        if other_charges:
            item['other_charges'] = ''.join(other_charges).replace(',', '.')

        for row in response.xpath('//table[@id="table-essentials"]//tr'):
            cells = row.xpath('./td/text()').extract()
            if len(cells) < 2:
                # No value cell next to the label (used to raise IndexError).
                continue
            if '\nDépôt de garantie\n' in cells:
                field = 'deposit'
            elif '\nHonoraires de location\n' in cells:
                field = 'agency_fee'
            else:
                continue
            value = cells[1].strip()
            if value:
                item[field] = ''.join(re.findall(r'[\d.,]+', value))

        cells = response.xpath(
            '//table[@id="table-mesure"]/tbody//td/text()').extract()
        # Pair every cell with its successor. A label in the very last cell
        # has no value and is skipped instead of raising IndexError.
        for label, value in zip(cells, cells[1:]):
            if 'Étage' in label:
                digits = re.findall(r'[\d.,]+', value.strip())
                if digits:
                    item['floor'] = digits[0]

        # 'rent_buy' is only set when the heading mentions 'Location'.
        if 'Location' in title:
            item['rent_buy'] = 'rent'
        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #15
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The site logo URL stored on the item points at fr.foncia.com.
        Description, title, price, type, district, images, fee/deposit/
        construction-year rows, size/pieces/rooms, agency details and
        address are read from ``response``. ``self.count`` is incremented
        and the item is yielded.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        # NOTE(review): '[email protected]' looks like an e-mail-obfuscation
        # artifact inside the file name; confirm the real logo URL.
        item[
            'website_logo'] = 'https://fr.foncia.com/bundles/fonciainternet/images/logos/[email protected]'
        item['url'] = response.url
        # May be None when the description paragraph is absent.
        item['description'] = response.xpath(
            '//div[@class="OfferDetails-content"]/p/text()').extract_first()
        # A page without <title> used to raise AttributeError on None.
        page_title = response.xpath('//title/text()').extract_first() or ''
        item['title'] = page_title.replace('- Foncia', '').strip()

        price = response.xpath('//p[@class="OfferTop-price"]/text()').re(
            r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        # NOTE(review): this stores only the SECOND CHARACTER of the
        # third-from-last '/'-separated URL part, which is what the original
        # code did. That is almost certainly not the intended property type;
        # confirm whether the whole URL part or a breadcrumb entry was meant.
        url_parts = response.url.split('/')
        url_segment = url_parts[-3] if len(url_parts) >= 3 else ''
        if len(url_segment) > 1:
            item['type'] = url_segment[1]
        item['city'] = 'Paris'
        item['district'] = response.xpath(
            '//p[@class="OfferTop-loc"]/@data-gtm-zipcode').extract_first()

        images = response.xpath(
            '//ul[@class="OfferSlider-main-slides"]//img/@src').extract()
        item['images'] = ','.join(images)

        other_tags = response.xpath('//*[@class="List List--data"]/li')
        for li in other_tags:
            key = ''.join(li.xpath('./span//text()').extract())
            if 'Honoraires charge' in key:
                item['agency_fee'] = ''.join(
                    li.xpath('./strong/text()').re(r'[\d.,]+'))
            elif 'Dépôt de garantie' in key:
                item['deposit'] = ''.join(
                    li.xpath('./strong/text()').re(r'[\d.,]+'))
            elif 'Année de construction' in key:
                item['construction_year'] = li.xpath(
                    './strong/text()').extract_first()

        attrs = response.xpath(
            '//div[@class="MiniData-row MiniData-row--bg"]/p/text()').extract(
            )
        for txt_attr in attrs:
            # endswith() tolerates an empty text node; the ``[0]`` lookups
            # are guarded so a label without digits no longer raises.
            if txt_attr.endswith('m'):
                found = re.findall(r'[\d.,]+', txt_attr)
                field = 'size'
            elif 'pièce' in txt_attr:
                found = re.findall(r'[\d]+', txt_attr)
                field = 'pieces'
            elif 'chambre' in txt_attr:
                found = re.findall(r'[\d]+', txt_attr)
                field = 'rooms'
            else:
                continue
            if found:
                item[field] = found[0]

        agency_name = response.xpath(
            '//p[@class="OfferContact-address OfferContact-address--center rwd--noMobile rwd--noTablet"]/a/strong/text()'
        ).extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip().replace('\n', ' ')

        agency_address = ''.join(
            response.xpath(
                '//p[@class="OfferContact-address OfferContact-address--center rwd--noMobile rwd--noTablet"]/a/text()'
            ).extract()).replace(' ', '').strip().replace('\n', ' ')
        if agency_address:
            item['agency_address'] = agency_address.strip()

        address = response.xpath(
            '//p[@data-behat="adresseBien"]/text()').extract_first()
        if address:
            item['address'] = address.strip().replace('  ',
                                                      '').replace('\n', ' ')

        if 'location' in response.url:
            item['rent_buy'] = 'rent'
        else:
            item['rent_buy'] = 'buy'

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #16
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from an ad-view detail page.

        Title, first gallery image, type, price, publication date, size,
        room count, furnished flag, description and location are read via
        ``data-qa-id`` based XPath expressions. ``self.count`` is
        incremented and the item is yielded.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['description'] = ''

        title = response.xpath('//h1[@class="_1KQme"]/text()').extract_first()
        if title:
            item['title'] = title.strip()

        # Only the first gallery image is kept.
        images = response.xpath(
            '//div[@data-qa-id="adview_gallery_container"]//img/@src'
        ).extract_first()
        if images:
            item['images'] = images

        if 'colocations' in response.url:
            item['type'] = 'Appartement'
        else:
            item['type'] = response.xpath(
                '//div[@data-qa-id="adview_price"]/span/text()').extract_first(
                )
        item['url'] = response.url

        price = response.xpath(
            '//div[@data-qa-id="adview_price"]/div/span/text()').extract_first(
            )
        if price:
            try:
                item['price'] = float(price.replace(' ', ''))
            except ValueError:
                # Price text is not a plain number: leave it unset.
                pass

        # Digit groups are read in the order day, month, year[, hour, minute].
        # The original length checks (> 3 and > 0) were off by one and
        # indexed past the end of the list for 4, 1 or 2 groups.
        date_parts = [
            int(part) for part in response.xpath(
                '//div[@data-qa-id="adview_date"]/text()').re(r'\d+')
        ]
        if len(date_parts) >= 5:
            day, month, year, hour, minute = date_parts[:5]
            item["date_added"] = datetime.datetime(year, month, day, hour,
                                                   minute)
        elif len(date_parts) >= 3:
            day, month, year = date_parts[:3]
            item["date_added"] = datetime.datetime(year, month, day)

        # A dedicated criteria entry, when present, overrides the type
        # chosen above.
        type1 = response.xpath(
            '//div[@data-qa-id="criteria_item_real_estate_type"]/div/div[@class="_3Jxf3"]/text()'
        ).extract_first()
        if type1:
            item['type'] = type1

        area = response.xpath(
            '//div[@data-qa-id="criteria_item_square"]/div/div[@class="_3Jxf3"]/text()'
        ).re(r'[\d.,]+')
        if area:
            item['size'] = area[0]

        rooms_count = response.xpath(
            '//div[@data-qa-id="criteria_item_rooms"]/div/div[@class="_3Jxf3"]/text()'
        ).re(r'[\d.,]+')
        if rooms_count:
            item['pieces'] = rooms_count[0]

        furnished = response.xpath(
            '//div[@data-qa-id="criteria_item_furnished"]/div/div[@class="_3Jxf3"]/text()'
        ).extract_first()
        if furnished:
            item['furnished'] = 0 if 'Non' in furnished else 1

        descs = response.xpath(
            '//div[@data-qa-id="adview_description_container"]/div/span[@class="_2wB1z"]/text()'
        ).extract()
        if descs:
            item['description'] = '\n'.join(descs)

        locations = response.xpath(
            '//div[@data-qa-id="adview_location_informations"]/span/text()'
        ).extract()
        if locations:
            # First entry is the city; with several entries, the last one is
            # tried as a numeric district.
            item['city'] = locations[0]
            if len(locations) > 1:
                try:
                    item['district'] = int(locations[-1])
                except ValueError:
                    pass

        rent = "buy"
        if 'location' in str(response.url):
            rent = "rent"
        item['rent_buy'] = rent

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #17
0
    def final_parse(self, response):
        """Build a ``RealestateItem`` from a listing detail page.

        The site logo URL stored on the item points at orpi.com. Title,
        price, type, location, images, description, the characteristics
        list and the fee list are read with XPath. Every item is marked as
        ``'rent'``. ``self.count`` is incremented and the item is yielded.

        :param response: page response; must provide ``xpath`` and ``url``.
        :return: generator yielding exactly one item.
        """
        item = RealestateItem()

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://www.orpi.com/mstile-310x310.png'
        item['url'] = response.url
        item['description'] = ''
        title = response.xpath(
            '//div[@class="synopsis-textcell"]/h1/span//text()').extract()
        if title:
            item['title'] = ''.join(title)

        price = response.xpath('//span[@class="price"]/text()').re(r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        type1 = response.xpath(
            '//span[@class="c-vignette__type"]/text()').extract_first()
        if type1:
            type1 = type1.split(' ')[0]
            if type1:
                item['type'] = type1

        addr = response.xpath(
            '//span[@class="c-vignette__address"]/text()').extract_first()
        if addr:
            addr = addr.split(' ')
            item['city'] = addr[0]
            if len(addr) > 1:
                try:
                    item['district'] = int(addr[1])
                except ValueError:
                    pass

        images = response.xpath(
            '//ul[@class="estate-carousel-nav-dots show-for-large js-estate-carousel-nav"]/li/img/@src'
        ).extract()
        if images:
            item['images'] = ','.join(images)

        desc = response.xpath(
            '//div[@class="estateNeighborhood gutters brd-rg estateDescription"]//div[@class="paragraphs-textcell"]/p/text()'
        ).extract_first()
        if desc:
            item['description'] = desc

        # When an <address> element exists it overrides the city/district
        # taken from the vignette above. The district is stored as a string
        # here, not converted to int.
        address = response.xpath(
            '//address[@class="address"]/text()').extract_first()
        if address:
            address = address.split(' ')
            item['city'] = address[0]
            if len(address) > 1:
                item['district'] = address[1]

        characteristics_tds = response.xpath(
            '//ul[@class="dotted-list dotted-list--ocom"]/li')
        for td in characteristics_tds:
            label = td.xpath('./mark[1]/text()').extract_first()
            if not label:
                continue
            if 'Nombre de pièce(s)' in label:
                pieces = td.xpath('./mark[2]/text()').extract_first()
                if pieces:
                    item['pieces'] = pieces.strip()
            elif 'Surface' in label:
                area = td.xpath('./mark[2]/text()').re(r'[\d.,]+')
                if area:
                    item['size'] = area[0].replace(',', '.')
            elif 'Nombre de chambre(s)' in label:
                rooms = td.xpath('./mark[2]/text()').re(r'[\d.,]+')
                if rooms:
                    item['rooms'] = rooms[0]
            elif 'Année de construction' in label:
                construction_year = td.xpath(
                    './mark[2]/text()').extract_first()
                if construction_year:
                    item['construction_year'] = construction_year
            elif 'Nombre d\'étages de l\'immeuble' in label:
                # NOTE(review): the building's floor count is stored under
                # the 'toilettes' key, as in the original code. This looks
                # like a wrong field name; confirm against RealestateItem.
                total_floors = td.xpath('./mark[2]/text()').extract_first()
                if total_floors:
                    item['toilettes'] = total_floors
            elif 'Nombre de salle(s) de bain/d’eau' in label:
                bath_rooms = td.xpath('./mark[2]/text()').extract_first()
                if bath_rooms:
                    item['bath_rooms'] = bath_rooms
            elif 'Étage' in label:
                # Take the first run of digits. The original indexed [0] on
                # the extracted string and so kept only its first character
                # (e.g. '1' for '12', or leading whitespace).
                floors = td.xpath('./mark[2]/text()').re(r'\d+')
                if floors:
                    item['floor'] = floors[0]

        # Ordered (marker, field) pairs: the first marker contained in the
        # entry text decides the field, mirroring an if/elif chain.
        fee_fields = (
            ('Dépôt', 'deposit'),
            ('Honoraires TTC à la charge du locataire', 'agency_fee'),
            ('d\'honoraires d\'état des lieux', 'other_agency_fee'),
            ('Provisions pour charges', 'other_charges'),
        )
        fee_entries = response.xpath(
            '//div[@class="onusBlock onusBlock--ocom"]/ul/li')
        for entry in fee_entries:
            text = entry.xpath('./text()').extract_first()
            if not text:
                continue
            for marker, field in fee_fields:
                if marker in text:
                    digits = entry.xpath('./text()').re(r'[\d.,]+')
                    if digits:
                        item[field] = ''.join(digits).replace(',', '.')
                    break

        rent = "rent"
        item['rent_buy'] = rent

        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #18
0
    def final_parse(self, response):
        """Build a RealestateItem from an ommi.fr listing page.

        The listing data is read from the inline script that follows
        ``var __property = {};``: the script is cut into ``;``-separated
        statements and each recognised ``property.<name>`` statement is copied
        into the item. The page ``<title>``, when present, replaces the title
        taken from the script.

        Args:
            response: the downloaded listing page.

        Yields:
            The populated RealestateItem.
        """
        item = RealestateItem()

        def rhs(statement):
            # Right-hand side of a `name = value` statement, backslashes removed.
            return statement.split('=')[-1].replace('\\', '').replace(
                ';', '').strip()

        # str.split() always returns at least one element, so this chain
        # cannot raise and needs no try/except around it.
        script = str(response.body).split('var __property = {};')[-1]
        properties = script.split('__property.serverUrl')[0].split(';')

        main_image = ''
        for prop in properties:
            if 'property.title' in prop:
                item['title'] = rhs(prop)
            elif 'property.rooms' in prop:
                item['pieces'] = prop.split('=')[-1].replace(';', '').strip()
            elif 'property.description' in prop:
                # Drop the closing quote, collapse doubled backslashes and
                # decode the remaining escape sequences.
                # NOTE(review): presumably this undoes the escaping added by
                # str() on the raw body - confirm.
                raw = prop.split('=')[-1].strip()[:-1]
                item['description'] = raw.replace('\\\\', "\\").replace(
                    '"', '').encode('utf-8').decode('unicode-escape')
            elif 'property.size' in prop:
                item['size'] = rhs(prop)
            elif 'property.loyer' in prop:
                item['price'] = rhs(prop)
            elif 'property.charges' in prop:
                item['other_charges'] = rhs(prop)
            elif 'property.main_photo' in prop:
                main_image = rhs(prop)
            elif 'property.photos_all' in prop:
                datas = rhs(prop).replace(']', '').replace('[', '').replace(
                    '"', '')
                # The image prefix is the main photo's directory with '/'
                # turned into '_'. It relies on property.main_photo having
                # been seen earlier; otherwise main_image is still ''.
                directory = main_image.replace(main_image.split('/')[-1], '')
                prifix = ('https://www.ommi.fr/image/by/w/900/h/900/i/' +
                          directory.replace('/', '_').replace('"', ''))
                item['images'] = ','.join(
                    [prifix + img for img in datas.split(',')])
            elif 'property.short_address' in prop:
                address = rhs(prop)
                if address and 'Paris' in address:
                    item['city'] = 'Paris'
                    district = None
                    try:
                        # Second word with its ordinal suffix and quotes
                        # stripped ('er' before 'e').
                        district = int(
                            address.split(' ')[1].replace('er', '').replace(
                                'e', '').replace('"', ''))
                    except (IndexError, ValueError):
                        # Fall back to the first run of digits in the address.
                        digits = re.findall(r'[\d]+', address)
                        if digits:
                            district = int(digits[0])
                    if district is not None:
                        item['district'] = district

        item['url'] = response.url
        title = response.xpath('//title/text()').extract_first()
        if title:
            item['title'] = title.split('-')[0].strip()

        # First dash-separated token of the last URL segment.
        listing_type = response.url.split('/')[-1].split('-')[0]
        item['type'] = listing_type
        rent = "rent"
        if 'achat' in str(response.url):
            rent = "buy"
        item['rent_buy'] = rent

        item['online'] = 1
        item['website'] = 'ommi'
        self.count += 1
        print(self.count)
        yield item
Пример #19
0
    def final_parse(self, response):
        """Build a RealestateItem from a lesiteimmo.com listing page.

        Optional page elements (description, heading, attribute rows) are
        skipped when absent instead of raising, so one missing node does not
        discard the whole listing.

        Args:
            response: the downloaded listing page.

        Yields:
            The populated RealestateItem.
        """
        item = RealestateItem()

        def first_amount(text):
            # First run of digits/separators/whitespace with spaces removed,
            # or None when the text contains no such run.
            found = re.findall(r'[\d.,\s]+', text)
            return found[0].replace(' ', '') if found else None

        item['online'] = 1
        item['website'] = self.name
        item['website_logo'] = 'https://www.lesiteimmo.com/images/logo.svg?id=409b2bdfb76416b8f554'
        item['url'] = response.url
        description = response.xpath('//div[@itemprop="description"]/text()').extract_first()
        if description is not None:
            item['description'] = description.strip()
        item['title'] = response.xpath('//title/text()').extract_first()

        price = response.xpath('//span[@class="text-xl font-medium"]/span[@class="value"]/text()').re(r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        type1 = response.xpath('//h1[@itemprop="name"]/text()').extract_first()
        if type1 is not None:
            # The first word of the heading is used as the property type.
            item['type'] = type1.strip().split(' ')[0]

        item['city'] = 'Paris'
        # Trailing dash-separated token of the second-to-last URL segment.
        item['district'] = response.url.split('/')[-2].split('-')[-1]

        images = response.xpath('//div[contains(@class,"bg-cover h-64 lg:h-128 w-full")]/@style').extract()
        # Each style attribute embeds the picture as url('...').
        image_urls = [img.split("url('")[-1].split("')")[0] for img in images]
        item['images'] = ','.join(image_urls)

        agency_name = response.xpath('//div[@class="font-medium text-grey-darkest"]/text()').extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip().replace('\n', ' ')

        agency_address = response.xpath('//div[@class="text-grey"]/text()').extract_first()
        if agency_address:
            item['agency_address'] = agency_address.strip()

        agency_logo = response.xpath('//div[@class="mb-2"]/img/@src').extract_first()
        if agency_logo:
            item['agency_logo'] = agency_logo

        attr_tags = response.xpath('//div[@class="p-4 flex flex-wrap justify-start items-start"]//div[@class="flex w-full p-2 bg-grey-lightest"]')
        for div in attr_tags:
            key = div.xpath('./div[@class="w-2/3 text-grey-darker mr-2"]/text()').extract_first()
            val = div.xpath('./div[@class="w-1/3 text-grey text-right"]/text()').extract_first()
            if key is None or val is None:
                # A row without a label or a value carries nothing usable.
                continue
            if 'Étage' in key:
                digits = re.findall(r'[\d]+', val)
                if digits:
                    item['floor'] = digits[0]
            elif 'Nbre. de chambres' in key:
                item['rooms'] = val.strip()
            elif 'Adresse' == key:
                item['address'] = val.strip()
            elif 'Nb. de pièces' in key:
                item['pieces'] = val.strip()
            elif 'Charges' == key:
                amount = first_amount(val)
                if amount is not None:
                    item['other_charges'] = amount
            elif 'Dépôt de garantie' in key:
                amount = first_amount(val)
                if amount is not None:
                    item['deposit'] = amount
            elif 'Honoraires' == key:
                amount = first_amount(val)
                if amount is not None:
                    item['agency_fee'] = amount
            elif 'Année de construction' in key:
                item['construction_year'] = val.strip()
            elif 'Surface habitable' in key:
                amount = first_amount(val)
                if amount is not None:
                    item['size'] = amount

        # NOTE(review): the original derived rent/buy from the URL and then
        # unconditionally overwrote it with 'rent'. The effective value is
        # kept; confirm whether sale listings can reach this callback.
        item['rent_buy'] = 'rent'
        self.count += 1
        print("Total Count: " + str(self.count))

        yield item
Пример #20
0
    def final_parse(self, response):
        """Scrape one listing detail page into a RealestateItem and yield it.

        Every field except ``online``, ``website``, ``url`` and ``description``
        is set only when the matching node is found on the page.

        Args:
            response: the downloaded listing page.

        Yields:
            The populated RealestateItem.
        """
        listing = RealestateItem()

        def numeric_parts(xpath):
            # All digit/separator runs found in the text nodes matched by xpath.
            return response.xpath(xpath).re(r'[\d.,]+')

        def joined_amount(parts):
            # Concatenate the runs and use '.' as the decimal separator.
            return ''.join(parts).replace(',', '.')

        listing['online'] = 1
        listing['website'] = self.name
        listing['url'] = response.url
        listing['description'] = ''

        heading = response.xpath('//table[@class="licom-breadcrumb"]//td/h1/text()').extract_first()
        if heading:
            listing['title'] = heading

        price_parts = numeric_parts('//div[@itemprop="price"]/h2/text()')
        if price_parts:
            try:
                listing['price'] = joined_amount(price_parts)
            except:
                pass

        kind = (response.xpath('//div[@class="col-xs-3 offer-type"]/p/text()').extract_first() or '').strip()
        if kind:
            listing['type'] = kind

        address_xpath = '//div[@itemprop="address"]/p/text()'
        address_text = response.xpath(address_xpath).extract_first()
        if address_text:
            # The first space-separated word of the address is the city.
            listing['city'] = address_text.split(' ')[0]

            numbers = response.xpath(address_xpath).re(r'[\d]+')
            # The district is the last number, used only when there are
            # at least two numbers in the address.
            if len(numbers) > 1:
                try:
                    listing['district'] = int(numbers[-1])
                except:
                    pass

        photo_urls = response.xpath('//div[@class="carousel-content noSlider"]//img/@src').extract()
        if photo_urls:
            listing['images'] = ','.join(photo_urls)

        label_fee = numeric_parts('//span[@class="lbl-agencyfees"]/text()')
        if label_fee:
            listing['agency_fee'] = joined_amount(label_fee)

        meta_description = response.xpath('//div[@class="offer-description-text"]/meta/@content').extract_first()
        if meta_description:
            listing['description'] = meta_description

        surface = numeric_parts('//span[@class="offer-area-number"]/text()')
        if surface:
            listing['size'] = surface[0].replace(',', '.')

        room_counts = numeric_parts('//span[@class="offer-rooms-number"]/text()')
        if room_counts:
            listing['pieces'] = room_counts[0]

        # The last entry overrides the agency fee read from the label above
        # whenever the dedicated span is present.
        amount_fields = (
            ('other_charges', '//span[@id="valueChargeRentProperty"]/text()'),
            ('deposit', '//span[@id="valueDepotRentGarantee"]/text()'),
            ('agency_fee', '//span[@id="valueFeesRentAgency"]/text()'),
        )
        for field, xpath in amount_fields:
            parts = numeric_parts(xpath)
            if parts:
                listing[field] = joined_amount(parts)

        seller = response.xpath('//span[@itemprop="seller"]/text()').extract_first()
        if seller:
            listing['agency_name'] = seller

        address_lines = response.xpath('//p[@class="agency-infos size_12 nomargin"]//span/text()').extract()
        if address_lines:
            listing['agency_address'] = '\n'.join(address_lines)

        for row in response.xpath('//ul[@itemprop="description"]/li'):
            label = row.xpath('./div[1]/text()').extract_first()
            if not label:
                continue
            value = row.xpath('./div[2]/text()').extract_first()
            if 'Nombre d\'étages de l\'immeuble' in label:
                # NOTE(review): the building's floor count is stored under
                # 'toilettes' - confirm this field reuse is intended.
                if value:
                    listing['toilettes'] = value
            elif 'Nombre de salle de bain' in label:
                if value:
                    listing['bath_rooms'] = value
            elif 'Etage du bien' in label:
                if value:
                    listing['floor'] = value.replace('e', '')
            elif 'Meublé' in label:
                if value == 'Oui':
                    listing['furnished'] = 1

        if response.xpath('//li[@class="columns current"]/a/span/text()').extract_first() == 'Location':
            listing['rent_buy'] = 'rent'

        self.count += 1
        print("Total Count: " + str(self.count))

        yield listing
Пример #21
0
    def parse(self, response):
        """Parse one page of the bienici.com search JSON and follow pagination.

        For each ad a RealestateItem is pre-filled from the search payload and
        a Request for the ad's detail JSON is yielded, with the item passed to
        ``self.final_parse`` through ``meta['item']``. If filling the item
        fails, the partially filled item is yielded directly instead.

        Args:
            response: the search API response, whose JSON body has the keys
                ``realEstateAds``, ``total`` and ``from``.

        Yields:
            Detail Requests (or partial items on error), then at most one
            Request for the next result page.
        """
        json_data = json.loads(response.body)
        for data in json_data['realEstateAds']:
            # Created outside the try so the error branch always has an item.
            item = RealestateItem()
            try:
                item['online'] = 1
                item['website'] = self.name
                item['website_logo'] = 'https://www.bienici.com/cacheForever/45ee97a38fe6a64ae66' \
                                       'c7a2310cf2192ec35f538/logos/logo_bienici.svg'
                city_slug = data['city'].strip().replace(' ', '-')
                if 'roomsQuantity' in data:
                    item['url'] = 'https://www.bienici.com/annonce/location/{}/appartement/{}pie' \
                                  'ces/{}?q=%2Frecherche%2Flocation%2Fparis-75000'\
                        .format(city_slug, data['roomsQuantity'], data['id'])
                    item['pieces'] = data['roomsQuantity']
                    item['rooms'] = data['roomsQuantity']
                else:
                    # This URL has only two placeholders; the original also
                    # passed the missing roomsQuantity and always raised.
                    item['url'] = 'https://www.bienici.com/annonce/location/{}/appartement/{}?q=%' \
                                  '2Frecherche%2Flocation%2Fparis-75000'\
                        .format(city_slug, data['id'])
                item['description'] = data['description']
                item['title'] = data['title']

                if 'surfaceArea' in data:
                    item['size'] = data['surfaceArea']

                if item['title'] == "":
                    # Raises (and falls through to the error branch) when the
                    # room count or surface was not set above.
                    item['title'] = "Appartement {} pièces {} m²".format(
                        item['pieces'], item['size'])

                item['rent_buy'] = data['adType']
                item['city'] = data['city'].strip().split(' ')[0]
                item['district'] = data['postalCode']
                item['price'] = data['price']

                if 'floor' in data:
                    item['floor'] = data['floor']

                if 'agencyRentalFee' in data:
                    item['agency_fee'] = data['agencyRentalFee']

                if 'safetyDeposit' in data:
                    item['deposit'] = data['safetyDeposit']

                # Flag as furnished only when the value is truthy, not merely
                # because the key is present.
                if data.get('isFurnished'):
                    item['furnished'] = 1

                if 'yearOfConstruction' in data:
                    item['construction_year'] = data['yearOfConstruction']

                item['images'] = ','.join([img['url'] for img in data['photos']])

                # NOTE(review): the access token is hard-coded in the URL.
                yield Request(
                    'https://www.bienici.com/realEstateAd.json?id={}&access_token=2lWi9yZU%2FR%'
                    '2FuoEAybaCQI7Q0CMe3RD5aquaK7rLs63Y%3D%3A5b543410ac93c7009bfa3572'
                    .format(data['id']),
                    self.final_parse,
                    meta={'item': item})
            except Exception as e:
                # Best effort: report the problem and emit what was collected.
                # args may be empty or hold a non-string, so format it rather
                # than concatenating.
                print("err: {}".format(e.args[0] if e.args else e))
                self.count += 1
                print("Total Count: " + str(self.count))
                yield item

        total = int(json_data['total'])
        current = int(json_data['from'])
        if current < total:
            next_from = current + 24
            page = current // 24 + 1
            next_page_url = 'https://www.bienici.com/realEstateAds.json?filters=%7B%22size%22%3A24%2C%22fr' \
                            'om%22%3A{}%2C%22filterType%22%3A%22rent%22%2C%22propertyType%22%3A%5B%22house%' \
                            '22%2C%22flat%22%5D%2C%22page%22%3A{}%2C%22resultsPerPage%22%3A24%2C%22maxAuth' \
                            'orizedResults%22%3A2400%2C%22sortBy%22%3A%22relevance%22%2C%22sortOrder%22%3A%' \
                            '22desc%22%2C%22onTheMarket%22%3A%5Btrue%5D%2C%22showAllModels%22%3Afalse%2C%22z' \
                            'oneIdsByTypes%22%3A%7B%22zoneIds%22%3A%5B%22-7444%22%5D%7D%7D&extensionType=exte' \
                            'ndedIfNoResult&leadingCount=2'.format(next_from, page)
            # Exactly one request per next page. The original followed this
            # with a second, unfiltered request for the same URL, and hit an
            # unbound variable once the last page was reached.
            yield Request(next_page_url, self.parse)
Пример #22
0
    def parse(self, response):
        """Parse a laforet.com results page and follow each listing.

        Every ``<li>`` of the compact results list carries a ``data-json``
        attribute; its fields pre-fill a RealestateItem that is handed to
        ``self.final_parse`` through ``meta['item']``. Optional fields are
        copied only when present in the payload.

        Args:
            response: the downloaded results page.

        Yields:
            One Request per listing, then a Request for the next results page
            when a "Next" link exists.
        """
        # (item field, payload key) pairs copied verbatim when the key exists.
        optional_fields = (
            ('size', 'surface'),
            ('pieces', 'roomsQuantity'),
            ('deposit', 'deposit'),
            ('agency_fee', 'fees'),
        )

        url_tags = response.xpath('//ul[@class="results-compact"]/li')
        for tag in url_tags:
            data_json = tag.xpath('./@data-json').extract_first()
            if data_json is None:
                # Not a listing entry: nothing to parse.
                continue
            data_json = json.loads(data_json)

            item = RealestateItem()
            item['title'] = data_json['title'] +' ' + data_json['title_city']
            item['online'] = 1
            item['website'] = self.name
            item['website_logo'] = 'http://www.laforet.com/sites/default/themes/laforet/logo.png'

            if 'description' in data_json:
                item['description'] = data_json['description']

            try:
                item['price'] = data_json['price'].replace(' ', '')
            except (KeyError, AttributeError):
                # Price missing or not a string: leave the field unset.
                pass

            # 'title' and 'title_city' were already read as strings above, so
            # these cannot fail.
            item['type'] = data_json['title'].split(' ')[0].lower()
            item['city'] = data_json['title_city'].split(' ')[0]

            if 'postalCode' in data_json:
                item['district'] = data_json['postalCode']
            else:
                # Fall back to the last number in the city label.
                digits = re.findall(r'[\d]+', data_json['title_city'])
                if digits:
                    item['district'] = digits[-1]

            # Sibling pictures are addressed by swapping the last character
            # before '.jpg' for b, c and d.
            first_image = data_json['imageUrl']
            stem = first_image.split('.jpg')[0][0:-1]
            item['images'] = ','.join(
                [first_image] + [stem + suffix + '.jpg' for suffix in 'bcd'])

            for field, key in optional_fields:
                if key in data_json:
                    item[field] = data_json[key]

            item['rent_buy'] = 'rent'
            item['url'] = response.urljoin(data_json['url'])

            yield Request(item['url'], callback=self.final_parse, meta={'item': item})

        next_page_url = response.xpath('//*[@aria-label="Next"]/a[@aria-label="Next"]/@href').extract_first()

        if next_page_url:
            yield Request(response.urljoin(next_page_url), callback=self.parse, dont_filter=True)
Пример #23
0
    def final_parse(self, response):
        """Scrape a seloger.com listing page, then request its characteristics.

        Fills a new RealestateItem with the title, type, room/surface figures,
        location, agency details and pictures found on the page. The price is
        not extracted in this callback.

        Args:
            response: the downloaded listing page.

        Yields:
            A Request for the listing's ``caracteristique_bien.json`` document,
            with the item in ``meta['item']`` and ``self.final_attr`` as the
            callback. Nothing is yielded when no id can be read from the URL.
        """
        item = RealestateItem()
        title = response.xpath('//h1[@class="detail-title title1"]/text()').extract_first()
        if title:
            title = title.strip()
        item['title'] = title

        item['type'] = response.xpath('//h2[@class="c-h2"]/text()').extract_first()
        item['url'] = response.url

        rent = "rent"
        if 'achat' in str(response.url):
            rent = "buy"
        item['rent_buy'] = rent

        rooms = 0
        pieces = 0
        size = 0

        for detail_li in response.xpath('//ul[@class="criterion"]/li'):
            detail = detail_li.xpath('./text()').extract_first()
            if detail is None:
                # An entry without direct text has nothing to classify.
                continue
            # The leading token of each criterion is its figure.
            if 'pièce' in detail:
                pieces = detail.split(" ")[0]
            if 'chambre' in detail:
                rooms = detail.split(" ")[0]
            if 'm²' in detail:
                size = detail.split(" ")[0].replace(",", ".")
        item['rooms'] = rooms
        item['pieces'] = pieces
        item['size'] = size

        location = response.xpath('//p[@class="localite"]/text()').extract_first()
        if location and 'Paris' in location:
            city = 'Paris'
            words = location.split(' ')
            # The second word holds the district, with its 'ème' suffix.
            parisian_district = words[1].replace('ème', '') if len(words) > 1 else None
        else:
            city = location
            parisian_district = None
        item['city'] = city
        item['parisian_district'] = parisian_district
        try:
            item['district'] = int(parisian_district)
        except (TypeError, ValueError):
            # No district, or not a plain number: leave the field unset.
            pass

        item['agency_name'] = response.xpath('//a[@class="agence-link"]/@title').extract_first()

        agency_address = response.xpath('//div[@class="agence-adresse fi fi-map-pin"]/text()').extract_first()
        if agency_address is not None:
            agency_address = agency_address.strip()
        item['agency_address'] = agency_address

        item['agency_phone'] = response.xpath('//a[@class="bub-phone tagClick"]/@data-phone').extract_first()

        # The postal code is the first address token accepted by RepresentsInt.
        postal_tokens = []
        if agency_address:
            postal_tokens = [tok for tok in agency_address.split(" ") if RepresentsInt(tok)]
        item['agency_postal_code'] = postal_tokens[0] if postal_tokens else None
        item['agency_logo'] = response.xpath('//img[@class="agence-logo-img"]/@src').extract_first()

        # Pictures come from the JSON in each slide's data-lazy attribute
        # (with "//" removed from the URL) and from plain <img> slides.
        lazy_slides = response.xpath('//div[contains(@class, "carrousel_slide")]/div/@data-lazy').extract()
        pic = [json.loads(img_data)['url'].replace("//", "") for img_data in lazy_slides]
        pic.extend(response.xpath('//div[contains(@class, "carrousel_slide")]/img/@src').extract())

        item['images'] = ",".join(pic)
        item['online'] = 1
        item['website'] = 'seloger'
        item['website_logo'] = 'https://static-seloger.com/z/produits/sl/homepage/assets/img/bandeau_app/sl_logo_152x152_thumb.png'

        # The listing id is the last URL segment up to its first dot.
        listing_id = str(response.url).split('/')[-1].split('.')[0]
        if listing_id:
            url = 'https://www.seloger.com/detail,json,caracteristique_bien.json?idannonce=' + listing_id
            yield Request(url, callback=self.final_attr, meta={'item':item}, dont_filter=True)
Пример #24
0
    def final_parse(self, response):
        """Build a RealestateItem from a meilleursagents.com listing page.

        Optional nodes and table rows are skipped when missing or when they
        contain no number, instead of raising and discarding the listing.

        Args:
            response: the downloaded listing page.

        Yields:
            The populated RealestateItem.
        """
        item = RealestateItem()

        def first_match(selectors, pattern):
            # First regex match over the selected text nodes, or None.
            found = selectors.re(pattern)
            return found[0] if found else None

        def fill_from_row(key, values, rules):
            # Apply the first rule whose marker occurs in key; elif semantics.
            for marker, field, pattern in rules:
                if marker in key:
                    value = first_match(values, pattern)
                    if value is not None:
                        item[field] = value
                    return

        item['online'] = 1
        item['website'] = self.name
        item[
            'website_logo'] = 'https://static.meilleursagents.com/3.4.0/img/www/logo-meilleursagents-std.png'
        item['url'] = response.url
        item['description'] = response.xpath(
            '//p[@class="tjustify chapo-small"]/text()').extract_first()
        title = response.xpath(
            '//h1[@class="margin-none"]/text()').extract_first()
        if title is not None:
            item['title'] = title.strip()

        price = response.xpath('//div[@class="h2"]/strong/text()').re(
            r'[\d.,]+')
        if price:
            item['price'] = ''.join(price).replace(',', '.')

        # The first word of the first table cell is used as the property type.
        first_cell = response.xpath(
            '//table[@class="table table-striped chapo-small pull-left"]//tr/td/text()'
        ).extract_first()
        listing_type = None
        if first_cell is not None:
            listing_type = first_cell.split(' ')[0]
            item['type'] = listing_type

        muted = response.xpath('//div[@class="muted"]/text()')
        muted_text = muted.extract_first()
        if muted_text is not None:
            item['city'] = muted_text.strip().split(' ')[0]
        district = first_match(muted, r'[\d]+')
        if district is not None:
            item['district'] = district

        images = response.xpath(
            '//div[@id="realtor_listing_carousel_pictures"]/a/@href').extract()
        # The picture links lack a scheme, so 'https:' is prepended.
        item['images'] = ','.join(['https:' + img for img in images])

        agency_name = response.xpath(
            '//h4[@class="tcenter margin-bottom"]/a/text()').extract_first()
        if agency_name:
            item['agency_name'] = agency_name.strip()

        # A studio counts as one room unless a "pièces" row below says otherwise.
        if "Studio" == listing_type:
            item['pieces'] = 1

        # The original ran this loop twice with identical results; one pass
        # after the studio default yields the same final values.
        summary_rules = (
            ('pièces', 'pieces', r'[\d]+'),
            ('Surface de', 'size', r'[\d.,]+'),
            ('chambre', 'rooms', r'[\d]+'),
            ('Etage', 'floor', r'[\d]+'),
            ('Construit en', 'construction_year', r'[\d]+'),
        )
        for row in response.xpath('//div[not(@id)]/table/tr'):
            cells = row.xpath('./td//text()')
            fill_from_row(''.join(cells.extract()), cells, summary_rules)

        # In the details table the first cell is the label, the second the value.
        detail_rules = (
            ('Charges locatives', 'other_charges', r'[\d]+'),
            ('Dépôt de garantie', 'deposit', r'[\d.,]+'),
            ('Honoraires charge', 'agency_fee', r'[\d]+'),
        )
        for row in response.xpath('//div[@id="details"]/table//tr'):
            key = ''.join(row.xpath('./td[1]//text()').extract())
            fill_from_row(key, row.xpath('./td[2]//text()'), detail_rules)

        # NOTE(review): the original derived rent/buy from the URL and then
        # unconditionally overwrote it with 'rent'. The effective value is
        # kept; confirm whether sale listings can reach this callback.
        item['rent_buy'] = 'rent'
        self.count += 1
        print("Total Count: " + str(self.count))

        yield item