def next_page(self):
    """Return the URL of the next results page, or None when there is none.

    The paging parameters are read from the query string stored in the
    ``data-rest-search-request`` attribute of the ``js-data`` div, and the
    total number of results from its ``data-result-count`` attribute.

    :return: the current page URL with its ``page=`` parameter incremented,
             or None if a paging parameter is missing or the current page
             is already the last one.
    """
    js_datas = CleanText(
        '//div[@id="js-data"]/@data-rest-search-request'
    )(self).split('?')[-1].split('&')

    # Only the two next() lookups can raise StopIteration; keep the try
    # block limited to them.
    try:
        results_per_page = next(
            x for x in js_datas if 'resultsPerPage' in x
        ).split('=')[-1]
        current_page_number = next(
            x for x in js_datas if 'currentPageNumber' in x
        ).split('=')[-1]
    except StopIteration:
        return None

    result_count = CleanText(
        '(//div[@id="js-data"]/@data-result-count)[1]'
    )(self)

    # Integer ceiling division: exact for any size, and still correct when
    # "/" is a floor division (Python 2 without __future__ division), where
    # math.ceil(a / b) would silently drop the last, partially filled page.
    total_page_number = -(-int(result_count) // int(results_per_page))

    following_page = int(current_page_number) + 1
    if following_page <= total_page_number:
        return self.page.url.replace(
            'page=%s' % current_page_number,
            'page=%d' % following_page
        )
    return None
class item(ItemElement):
    """Build a shared-flat ``Housing`` from one offer node of a result list.

    All XPath expressions are relative to the offer node.
    """
    klass = Housing

    # The header id carries a "header-offer-" prefix that is stripped before
    # the value is namespaced with "colocation-".
    obj_id = Format(
        'colocation-%s',
        CleanText('./div/header/@id', replace=[('header-offer-', '')])
    )
    obj_type = POSTS_TYPES.SHARING
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL
    obj_title = CleanText(
        CleanHTML('./div/header/section/p[@class="property-type"]/span/@title')
    )
    obj_area = CleanDecimal(
        './div/header/section/p[@class="offer-attributes"]/a/'
        'span[@class="offer-area-number"]',
        default=0
    )
    obj_cost = CleanDecimal(
        './div/header/section/p[@class="price"]',
        default=0
    )
    obj_currency = Currency(
        './div/header/section/p[@class="price"]'
    )
    obj_utilities = UTILITIES.UNKNOWN
    obj_text = CleanText(
        './div/div[@class="content-offer"]/section[has-class("content-desc")]/p/span[has-class("offer-text")]/@title',
        default=NotLoaded
    )
    # The pattern captures a dd/mm/yyyy date, so it must be parsed day-first;
    # otherwise e.g. 03/04/2019 is read as March 4th. The pattern is a raw
    # string so that "\d" is not an invalid string escape.
    obj_date = Date(
        Regexp(
            CleanText('./div/header/section/p[has-class("update-date")]'),
            r".*(\d{2}/\d{2}/\d{4}).*"
        ),
        dayfirst=True
    )
    obj_location = CleanText(
        '(./div/div[@class="content-offer"]/section[has-class("content-desc")]/p)[1]/span/@title',
        default=NotLoaded
    )
class get_housing(ItemElement):
    """Fill a ``Housing`` from the microdata of a single advert page."""
    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@itemprop="name"]')
    obj_location = CleanText('//span[@class="informations-localisation"]')
    obj_cost = CleanDecimal('//span[@itemprop="price"]')
    obj_currency = Currency('//span[@itemprop="price"]')
    obj_text = CleanHTML('//div[@itemprop="description"]')
    obj_url = BrowserURL('housing', _id=Env('_id'))
    # The area is the number placed right before " m2" in the advert title.
    obj_area = CleanDecimal(
        Regexp(
            CleanText('//h1[@itemprop="name"]'),
            r'(.*?)(\d*) m2(.*?)',
            '\\2'
        ),
        default=NotAvailable
    )
    obj_price_per_meter = PricePerMeterFilter()

    def obj_photos(self):
        """Return one HousingPhoto per thumbnail image of the advert."""
        thumbnails = XPath(
            '//a[@class="thumbnail-link"]/img[@itemprop="image"]'
        )(self)
        # Each thumbnail src embeds a second http:// URL after the
        # thbr.figarocms.net prefix; the regexp keeps that embedded URL.
        return [
            HousingPhoto(
                Regexp(
                    CleanText('./@src'),
                    r'http://thbr\.figarocms\.net.*(http://.*)'
                )(thumbnail)
            )
            for thumbnail in thumbnails
        ]

    def obj_details(self):
        """Return the feature list and the energy block as a dict."""
        details = {}
        for row in XPath('//div[@class="features clearfix"]/ul/li')(self):
            label = CleanText('./span[@class="name"]')(row)
            content = CleanText('./span[@class="value"]')(row)
            # Entries with an empty label or an empty value are dropped.
            if content and label:
                details[label] = content

        dpe_label = CleanText('//div[@class="title-dpe clearfix"]')(self)
        dpe_value = CleanText('//div[@class="energy-consumption"]')(self)
        if dpe_value and dpe_label:
            details[dpe_label] = dpe_value
        return details
def obj_utilities(self):
    """Tell whether charges are included, based on the price label text.

    :return: UTILITIES.INCLUDED when the price text contains
             "charges comprises" (case-insensitive), else UTILITIES.EXCLUDED.
    """
    price_label = CleanText(
        './/strong[has-class("TeaserOffer-price-num")]'
    )(self)
    charges_included = "charges comprises" in price_label.lower()
    return UTILITIES.INCLUDED if charges_included else UTILITIES.EXCLUDED
def obj_DPE(self):
    """Return the energy class read from the DPE block of the page.

    The ``data-class`` attribute has its "DPE" marker removed and the first
    remaining character is used as the attribute name on ``ENERGY_CLASS``.

    :return: the matching ENERGY_CLASS member, or NotAvailable when the
             attribute is missing, holds nothing besides the marker, or
             names an unknown class.
    """
    raw_value = CleanText(
        '//ul[@class="energyInfosDPE"]//li[@class="energyInfos"]/span/@data-class',
        default=""
    )(self)
    # Slicing (rather than indexing with [0]) keeps this safe when nothing is
    # left after removing the marker, e.g. data-class="DPE".
    letter = raw_value.replace("DPE", "").strip()[:1]
    if not letter:
        return NotAvailable
    return getattr(ENERGY_CLASS, letter, NotAvailable)
class item(ItemElement):
    """Build a ``City`` from one suggestion node.

    Both the id and the name are the city label followed by its zip code,
    separated by a single space.
    """
    klass = City

    obj_id = Format(
        '%s %s',
        CleanText('./span[has-class("city")]'),
        CleanText('./span[@class="zipcode"]'),
    )
    # Same "<city> <zipcode>" text as the id, built with its own filters.
    obj_name = Format(
        '%s %s',
        CleanText('./span[has-class("city")]'),
        CleanText('./span[@class="zipcode"]'),
    )
def obj_GES(self):
    """Return the greenhouse-gas class read from the GES block of the page.

    The ``data-class`` attribute has its "GES" marker removed and the first
    remaining character is used as the attribute name on ``ENERGY_CLASS``.

    :return: the matching ENERGY_CLASS member, or NotAvailable when the
             attribute is missing, holds nothing besides the marker, or
             names an unknown class.
    """
    raw_value = CleanText(
        '//ul[@class="energyInfosGES"]//li[@class="energyInfos"]/span/@data-class',
        default=""
    )(self)
    # Slicing (rather than indexing with [0]) keeps this safe when nothing is
    # left after removing the marker, e.g. data-class="GES".
    letter = raw_value.replace("GES", "").strip()[:1]
    if not letter:
        return NotAvailable
    return getattr(ENERGY_CLASS, letter, NotAvailable)
def obj_details(self):
    """Return the advertised fees as ``{"fees": <text>}``.

    The fee text is expected to look like "<label>: <amount>"; the part
    after the first colon is kept. When the text has no colon, the whole
    text is kept instead of raising IndexError.

    :return: a one-entry dict, or NotLoaded when no fee element is found.
    """
    charges = CleanText('.//span[@class="price-fees"]', default=None)(self)
    if not charges:
        return NotLoaded

    parts = charges.split(":")
    fees = parts[1] if len(parts) > 1 else parts[0]
    return {
        "fees": fees.strip()
    }
def obj_photos(self):
    """Return the listing thumbnail as a one-element photo list.

    :return: ``[HousingPhoto(url)]``, or NotLoaded when the image has no
             ``data-src`` value.
    """
    url = CleanText('./div[has-class("default-img")]/img/@data-src')(self)
    if not url:
        return NotLoaded

    url = unquote(url)
    # When another http:// URL is embedded past the first three characters,
    # keep only that embedded URL, cut at the last "?" if there is one.
    start = url.find("http://", 3)
    if start != -1:
        query_at = url.rfind("?")
        end = None if query_at == -1 else query_at
        url = url[start:end]
    return [HousingPhoto(url)]
def __init__(self, *args, **kwargs):
    """Parse the page, then expose the JSON payloads embedded in its scripts.

    After construction:
    - ``self.htmldoc`` is the HTML document built by ``HTMLPage``;
    - ``self.api_content`` is the decoded ``__NEXT_DATA__`` script;
    - ``self.doc`` is the decoded ``window.FLUX_STATE`` payload taken from
      the fourth body script, or an empty dict when that text is empty.
    """
    HTMLPage.__init__(self, *args, **kwargs)

    html = self.doc
    flux_state = CleanText(
        '(//body/script)[4]',
        replace=[('window.FLUX_STATE = ', '')]
    )(html) or '{}'
    next_data = CleanText('(//body/script[@id="__NEXT_DATA__"])')(html)

    # Keep the HTML tree reachable before self.doc is replaced by JSON data.
    self.htmldoc = html
    self.api_content = json.loads(next_data)
    self.doc = json.loads(flux_state)
def obj_details(self):
    """Return the feature list and the energy block as a dict.

    Each feature row contributes a name/value pair, followed by one pair
    made of the DPE title and the energy consumption text. Pairs with an
    empty name or an empty value are left out.
    """
    pairs = [
        (
            CleanText('./span[@class="name"]')(row),
            CleanText('./span[@class="value"]')(row),
        )
        for row in XPath('//div[@class="features clearfix"]/ul/li')(self)
    ]
    pairs.append((
        CleanText('//div[@class="title-dpe clearfix"]')(self),
        CleanText('//div[@class="energy-consumption"]')(self),
    ))
    # Later pairs overwrite earlier ones sharing the same name.
    return {key: value for key, value in pairs if value and key}
def obj_details(self):
    """Return the agency fees, when advertised, under the "Honoraires" key.

    The fee text is expected to look like "<label>: <amount>"; the part
    after the first colon is kept. When the text has no colon, the whole
    text is kept instead of raising IndexError.

    :return: a dict that is empty when no fee element is found.
    """
    details = {}

    honoraires = CleanText(
        self.offer_details_wrapper + '/div/div/p[@class="offer-agency-fees"]',
        default=None
    )(self)
    if honoraires:
        parts = honoraires.split(":")
        amount = parts[1] if len(parts) > 1 else parts[0]
        details["Honoraires"] = "{} (TTC, en sus)".format(amount.strip())

    return details
def obj_utilities(self):
    """Map the ``charges_included`` detail to a UTILITIES value.

    :return: UTILITIES.INCLUDED when the detail reads exactly "Oui",
             UTILITIES.EXCLUDED otherwise ("Non" is assumed when the
             detail is absent).
    """
    charges_included = CleanText(
        PopDetail('charges_included', default='Non'),
        default=NotAvailable
    )(self)
    return (
        UTILITIES.INCLUDED if charges_included == "Oui"
        else UTILITIES.EXCLUDED
    )
def obj_cost(self):
    """Return the cost, taking the lower bound when a range is displayed.

    A price written as "de <low> à <high>" yields <low>. When that pattern
    gives 0, the whole price text is parsed instead, with NotAvailable as
    the fallback.
    """
    range_floor = CleanDecimal(
        Regexp(
            CleanText(self.price_selector, default=''),
            r'de (.*) à .*',
            default=0
        )
    )(self)

    if range_floor == 0:
        # No usable "de ... à ..." range: parse the plain price.
        return CleanDecimal(self.price_selector, default=NotAvailable)(self)
    return range_floor
def obj_phone(self):
    """Return the phone number shown on the listing, if fully displayed.

    :return: the phone text with its "Téléphoner : " prefix removed, or
             NotLoaded when the element is missing or the number is
             truncated with "...".
    """
    phone = CleanText(
        './div/div/ul/li[has-class("js-clickphone")]',
        replace=[('Téléphoner : ', '')],
        default=NotLoaded
    )(self)

    # The filter may hand back its NotLoaded default, which is a sentinel
    # and not text; test for it before running a substring check on it.
    if phone is NotLoaded or '...' in phone:
        return NotLoaded
    return phone
def obj_photos(self):
    """Return the teaser image as a one-element photo list.

    :return: ``[HousingPhoto(url)]``, or an empty list when the image is
             the site's own placeholder or has an empty ``src``.
    """
    url = CleanText(
        Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src'))(self)

    # If the used photo is a default no photo, the src is on the same
    # domain, i.e. it is a path starting with "/". startswith() also copes
    # with an empty src, where url[0] would raise IndexError.
    if not url or url.startswith('/'):
        return []
    return [HousingPhoto(url)]
def obj_type(self):
    """Derive the post type from the advert's ``category_id``.

    Category 11 is a shared flat and category 10 a rental (furnished when
    the ``furnished`` detail reads "meublé"). Any other category, including
    one that cannot be read as an integer, is reported as a sale.
    """
    try:
        category = int(Dict('category_id')(self))
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: a null/None value.
        category = None

    if category == 11:
        return POSTS_TYPES.SHARING

    if category == 10:
        furnished = CleanText(PopDetail('furnished', default=' '))(self)
        if furnished.lower() == u'meublé':
            return POSTS_TYPES.FURNISHED_RENT
        return POSTS_TYPES.RENT

    return POSTS_TYPES.SALE
def parse(self, el):
    """Fill ``rooms``, ``bedrooms`` and ``area`` in ``self.env`` from tags.

    The three entries default to NotAvailable and are overwritten by the
    matching ``item-tags`` list entries found under ``el``. Tags that
    describe none of the three are ignored.

    :param el: the element whose ``item-tags`` list is inspected.
    """
    tags = el.xpath('.//ul[has-class("item-tags")]/li')

    self.env['rooms'] = NotAvailable
    self.env['bedrooms'] = NotAvailable
    self.env['area'] = NotAvailable

    for item in tags:
        label = CleanText('.')(item)
        lowered = label.lower()
        if 'chambre' in lowered:
            name = 'bedrooms'
            value = CleanDecimal('./strong')(item)
        elif 'pièce' in lowered:
            name = 'rooms'
            value = CleanDecimal('./strong')(item)
        elif ' m²' in label and 'le m²' not in label:
            # "le m²" marks a per-square-meter figure, not a surface.
            name = 'area'
            value = CleanDecimal(
                Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(item)
        else:
            # Unrelated tag: without this, `value` would be unbound on the
            # first iteration, or a stale value from the previous tag would
            # be stored under the tag's raw text.
            continue
        self.env[name] = value
def obj_utilities(self):
    """Tell whether charges are included, based on the price labels.

    :return: UTILITIES.INCLUDED when the text of any of the price elements
             contains "CC", UTILITIES.UNKNOWN otherwise.
    """
    price_text = CleanText(
        './div/div/span[@class="price-label"]|'
        './div/div[@class="item-price-pdf"]|'
        './div/div/span[@class="item-price"]'
    )(self)
    return UTILITIES.INCLUDED if "CC" in price_text else UTILITIES.UNKNOWN
def __init__(self, *args, **kwargs):
    """Parse the page, then replace ``self.doc`` with its embedded JSON.

    The payload is the string literal passed to ``JSON.parse`` in the
    ``window["initialData"]`` assignment, located right before the
    ``window["tags"]`` assignment in the page scripts.
    """
    HTMLPage.__init__(self, *args, **kwargs)

    extract_payload = Regexp(
        CleanText('//script'),
        r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);window\[\"tags\"\]"
    )
    escaped = extract_payload(self.doc)

    # The payload is a JavaScript string literal: resolve its backslash
    # escapes first.
    unescaped, _ = codecs.unicode_escape_decode(escaped)
    # NOTE(review): the surrogatepass round trip presumably exists to cope
    # with surrogate code points left by the unescaping -- confirm.
    normalized = unescaped.encode('utf-8', 'surrogatepass').decode('utf-8')

    self.doc = json.loads(normalized)
def obj_type(self):
    """Derive the post type from keywords found in the housing URL.

    "colocation" is tested before "location" because the former contains
    the latter. Rentals are furnished when the "meublé" criterion of the
    description list reads "oui".

    :return: a POSTS_TYPES member, or NotAvailable when the URL contains
             none of the known keywords.
    """
    url = BrowserURL('housing', _id=Env('_id'))(self)

    if 'colocation' in url:
        return POSTS_TYPES.SHARING

    if 'location' in url:
        furnished = False
        for criterion in XPath('//ul[@itemprop="description"]/li')(self):
            label = CleanText('./span[has-class("criteria-label")]')(criterion)
            if label.lower() == "meublé":
                answer = CleanText(
                    './span[has-class("criteria-value")]'
                )(criterion)
                furnished = answer.lower() == 'oui'
        return (
            POSTS_TYPES.FURNISHED_RENT if furnished else POSTS_TYPES.RENT
        )

    if 'vente' in url:
        return POSTS_TYPES.SALE

    return NotAvailable
def obj_house_type(self):
    """Map the lowercased main-features label to a HOUSE_TYPES value.

    :return: the matching HOUSE_TYPES member, HOUSE_TYPES.OTHER for any
             label that is not listed.
    """
    label = CleanText('.//h2[@class="offerMainFeatures"]/div')(self).lower()
    known_types = {
        "appartement": HOUSE_TYPES.APART,
        "maison": HOUSE_TYPES.HOUSE,
        "terrain": HOUSE_TYPES.LAND,
        "parking": HOUSE_TYPES.PARKING,
    }
    return known_types.get(label, HOUSE_TYPES.OTHER)
def obj_house_type(self):
    """Map the lowercased ``name`` microdata of the offer to a HOUSE_TYPES value.

    :return: the matching HOUSE_TYPES member, HOUSE_TYPES.OTHER for any
             value that is not listed.
    """
    label = CleanText(
        './/div[has-class("offer-details-caracteristik")]/meta[@itemprop="name"]/@content'
    )(self).lower()
    known_types = {
        "appartement": HOUSE_TYPES.APART,
        "maison": HOUSE_TYPES.HOUSE,
        "terrain": HOUSE_TYPES.LAND,
        "parking": HOUSE_TYPES.PARKING,
    }
    return known_types.get(label, HOUSE_TYPES.OTHER)
def obj_details(self):
    """Return the availability date and the price mentions of a teaser.

    :return: a dict with "dispo" (parsed from the ``data-dispo`` attribute,
             with today's ISO date as the attribute default) and
             "priceMentions" (text of the price-mentions span).
    """
    today = datetime.date.today().isoformat()
    available_from = Date(
        Attr('.//span[boolean(@data-dispo)]', 'data-dispo', default=today)
    )(self)
    price_mentions = CleanText(
        './/span[has-class("TeaserOffer-price-mentions")]'
    )(self)

    return {
        "dispo": available_from,
        "priceMentions": price_mentions,
    }
def parse(self, el):
    """Fill ``rooms``, ``bedrooms`` and ``area`` in ``self.env`` from tags.

    The three entries default to NotLoaded. Each tag of the title's
    ``item-tags`` list is a bedroom count when it mentions "chambre", a
    room count when it mentions "pièce", and is otherwise read as the area.

    :param el: the element whose ``item-tags`` list is inspected.
    """
    tags = el.xpath(
        './div/a[has-class("item-title")]/ul[has-class("item-tags")]/li'
    )

    for key in ('rooms', 'bedrooms', 'area'):
        self.env[key] = NotLoaded

    for tag in tags:
        lowered = CleanText('.')(tag).lower()
        if 'chambre' in lowered:
            self.env['bedrooms'] = CleanDecimal('.')(tag)
        elif 'pièce' in lowered:
            self.env['rooms'] = CleanDecimal('.')(tag)
        else:
            # Anything else is treated as the surface: keep the number
            # placed before the first space.
            self.env['area'] = CleanDecimal(
                Regexp(CleanText('.'), r'(\d*\.*\d*) .*'))(tag)
def obj_details(self):
    """Collect the free-form details of an offer page into a dict.

    The result may contain:
    - "dispo": the availability date;
    - "priceMentions" and "agency": text blocks, when present;
    - one nested dict per detail category, mapping a label to its value
      (``List--data`` entries) or to True (``List--bullet`` entries);
    - "electric_consumption": the figure read from the DPE image URL.
    """
    details = {}

    # The date is the dd/mm/yyyy value following a space in the "dispo"
    # paragraph; today's ISO date is passed as the Regexp default.
    dispo = Date(
        Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
               r'.* (\d\d\/\d\d\/\d\d\d\d)',
               default=datetime.date.today().isoformat()))(self)
    if dispo is not None:
        details["dispo"] = dispo

    priceMentions = CleanText('//p[has-class("OfferTop-mentions")]',
                              default=None)(self)
    if priceMentions is not None:
        details["priceMentions"] = priceMentions

    agency = CleanText('//p[has-class("OfferContact-address")]',
                       default=None)(self)
    if agency is not None:
        details["agency"] = agency

    # One column per category; columns without a title are skipped.
    for item in self.xpath(
            '//div[has-class("OfferDetails-columnize")]/div'):
        category = CleanText(
            './h3[has-class("OfferDetails-title--2")]',
            default=None)(item)
        if not category:
            continue

        details[category] = {}

        # Label/value rows.
        for detail_item in item.xpath(
                './/ul[has-class("List--data")]/li'):
            detail_title = CleanText(
                './/span[has-class("List-data")]')(detail_item)
            detail_value = CleanText('.//*[has-class("List-value")]')(
                detail_item)
            details[category][detail_title] = detail_value

        # Bullet rows carry no value: their presence is recorded as True.
        for detail_item in item.xpath(
                './/ul[has-class("List--bullet")]/li'):
            detail_title = CleanText('.')(detail_item)
            details[category][detail_title] = True

    # The consumption figure is the first numeric path segment of the DPE
    # image URL. A missing image or a non-matching URL is not an error.
    # NOTE(review): this block is placed after the category loop on the
    # assumption that it runs once per page (absolute XPath, applied to
    # self) -- confirm against the original layout.
    try:
        electric_consumption = CleanDecimal(
            Regexp(
                Attr('//div[has-class("OfferDetails-content")]//img',
                     'src'),
                r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
        details["electric_consumption"] = (
            '{} kWhEP/m².an'.format(electric_consumption))
    except (RegexpError, XPathNotFound):
        pass

    return details
def obj_details(self):
    """Collect the creation date, agency fees and criteria of an offer.

    The result contains:
    - "creationDate": the dd/mm/yyyy date following "Mis en ligne: " in the
      description notes, parsed day-first;
    - "Honoraires": the agency fees, when advertised;
    - one entry per criterion of the description list (label -> value).
    """
    details = {}

    # Backslashes are doubled because this is not a raw literal: the
    # resulting pattern is unchanged, but "\d" alone is an invalid string
    # escape that newer Python versions warn about.
    details["creationDate"] = Date(
        Regexp(
            CleanText(
                '//div[@class="offer-description-notes"]'
            ),
            u'.*Mis en ligne: (\\d{2}/\\d{2}/\\d{4}).*'
        ),
        dayfirst=True
    )(self)

    honoraires = CleanText(
        '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]',
        default=None
    )(self)
    if honoraires:
        # Expected shape is "<label>: <amount>"; fall back to the whole
        # text when there is no colon instead of raising IndexError.
        parts = honoraires.split(":")
        amount = parts[1] if len(parts) > 1 else parts[0]
        details["Honoraires"] = "{} (TTC, en sus)".format(amount.strip())

    for li in XPath('//ul[@itemprop="description"]/li')(self):
        label = CleanText('./span[has-class("criteria-label")]')(li)
        value = CleanText('./span[has-class("criteria-value")]')(li)
        details[label] = value

    return details
def obj_house_type(self):
    """Map the lowercased ``real_estate_type`` detail to a HOUSE_TYPES value.

    :return: the matching HOUSE_TYPES member, HOUSE_TYPES.OTHER for any
             value that is not listed.
    """
    estate_type = CleanText(
        PopDetail('real_estate_type'),
        default=' '
    )(self).lower()
    known_types = {
        'parking': HOUSE_TYPES.PARKING,
        'appartement': HOUSE_TYPES.APART,
        'maison': HOUSE_TYPES.HOUSE,
        'terrain': HOUSE_TYPES.LAND,
    }
    return known_types.get(estate_type, HOUSE_TYPES.OTHER)
def obj_location(self):
    """Return "<locality> (<postal code>)" from the embedded JSON-LD script.

    :return: the formatted location, or NotLoaded when the script is
             missing, cannot be decoded, or has no usable address.
    """
    script = CleanText('./script')(self)

    try:
        # Should be standard JSON+LD data, but the site is known to emit a
        # stray trailing "}". raw_decode() parses the leading JSON document
        # and ignores whatever follows it, so this works whether or not the
        # extra brace is separated from the document by whitespace
        # (rstrip('}') would also eat the document's own closing braces).
        data, _ = json.JSONDecoder().raw_decode(script.strip())
    except ValueError:
        return NotLoaded

    if not data:
        return NotLoaded

    try:
        address = data['address']
        return '%s (%s)' % (
            address['addressLocality'],
            address['postalCode']
        )
    except (KeyError, TypeError):
        # TypeError: the payload or its address is not a mapping.
        return NotLoaded
def __init__(self, *args, **kwargs):
    """Parse the page, then replace ``self.doc`` with its embedded JSON.

    The payload is the string literal passed to ``JSON.parse`` in the
    ``window["initialData"]`` assignment of the page scripts. Afterwards
    ``self.doc`` is a dict with two entries:
    - "advert": the payload's ``advert.mainAdvert`` object;
    - "agency": the payload's ``agency`` object.
    Each falls back to an empty dict when absent.
    """
    HTMLPage.__init__(self, *args, **kwargs)

    json_content = Regexp(
        CleanText('//script'),
        r"window\[\"initialData\"\] = JSON.parse\(\"({.*})\"\);"
    )(self.doc)
    # The payload is a JavaScript string literal: resolve its backslash
    # escapes first.
    json_content = codecs.unicode_escape_decode(json_content)[0]
    # NOTE(review): the surrogatepass round trip presumably exists to cope
    # with surrogate code points left by the unescaping -- confirm.
    json_content = json_content.encode('utf-8', 'surrogatepass').decode('utf-8')

    # Decode once and reuse the result for both entries.
    data = json.loads(json_content)
    self.doc = {
        "advert": data.get('advert', {}).get('mainAdvert', {}),
        "agency": data.get('agency', {})
    }