def obj_photos(self):
    """Return the offer slider images wrapped as ``HousingPhoto`` objects.

    The ``640/480`` segment of each image ``src`` is replaced with
    ``800/600`` before wrapping.
    """
    return [
        HousingPhoto(Attr('.', 'src')(img).replace('640/480', '800/600'))
        for img in self.xpath('//div[has-class("OfferSlider")]//img')
    ]
def update_origin_account(self, origin_account):
    """Copy transfer metadata from the page onto ``origin_account``.

    Each ``<li>`` under ``ul#idCptFrom`` carries a JSON payload in its
    ``data-comptecomplet`` attribute. The first entry whose label and IBAN
    match the account supplies the private ``_*`` attributes. If no entry
    matches, a warning is logged and the account is left untouched.
    """
    # (attribute on the account, key in the JSON payload), in assignment order.
    fields = (
        ('_currency_code', 'codeDevise'),
        ('_formatted_iban', 'ibanFormateCompte'),
        ('_min_amount', 'montantMin'),
        ('_max_amount', 'montantMax'),
        ('_decimal_code', 'codeDecimal'),
        ('_manage_counter', 'guichetGestionnaire'),
        ('_account_title', 'intituleCompte'),
        ('_bic', 'bicCompte'),
        ('_id_service', 'idPrestation'),
        ('_product_code', 'codeProduit'),
        ('_underproduct_code', 'codeSousProduit'),
    )

    for node in self.doc.xpath('//ul[@id="idCptFrom"]//li'):
        # get all account data
        raw = Attr('.', 'data-comptecomplet')(node)
        # NOTE(review): as written this replace is a no-op; it may have been
        # meant to unescape an HTML entity -- confirm against the repository.
        payload = json.loads(raw.replace('"', '"'))

        same_label = origin_account.label == CleanText().filter(payload['libelleCompte'])
        if not (same_label and origin_account.iban == payload['ibanCompte']):
            continue

        for attr_name, key in fields:
            setattr(origin_account, attr_name, payload[key])
        return

    # some accounts are not able to do transfer
    self.logger.warning('Account %s not found on transfer page', origin_account.label)
def parse(self, el):
    """Fill ``self.env`` from the HTML fragment embedded in ``el.text``.

    The fragment is taken from the ``html`` group of ``self.extract``,
    unescaped, parsed with lxml, and then scanned row by row for the
    date, water level, flow and trend values.
    """
    raw = self.extract.match(el.text).group("html")
    # Undo the escaping of quotes, newlines and slashes in the embedded markup.
    raw = raw.replace('\\"', '"').replace('\\n', '').replace('\\/', '/')
    parsed = lxml.html.fromstring(raw)
    self.env['name'] = CleanText('.//span[@class="popUpTitleBold"]')(parsed)
    self.env['object'] = CleanText('.//span[@class="popUpTitleNormal"]')(parsed).strip(' /')
    url = Attr('.//div[@class="popUpMsDiagramm"]/img', 'src')(parsed)
    # The id is the second '_'-separated component of the diagram image URL.
    self.env['id'] = url.split('_')[1]
    for tr in parsed.xpath('.//tr'):
        td = tr.xpath('.//td')
        if len(td) == 1 and "Datum" in td[0].text:
            # Second and third whitespace-separated words of the cell.
            l = td[0].text.split()[1:3]
            self.env['datetime'] = "%s %s" % (l[0], l[1])
        elif len(td) == 2:
            if "Wasserstand" in td[0].text:
                self.env['levelvalue'] = td[1].text.split()[0]
            elif "Durchfluss" in td[0].text:
                self.env['flowvalue'] = td[1].text.split()[0]
            elif "Tendenz" in td[0].text:
                try:
                    # Last path component of the trend image URL.
                    self.env['forecast'] = Attr('img', 'src')(td[1]).split("/")[-1]
                except ParseError:
                    self.env['forecast'] = None
    # TODO
    self.env['alarm'] = None
def get_history_jid(self):
    """Return the second ``:``-separated part of the ``index:j_id…`` span id.

    Returns ``None`` when more than one ``index:panelASV`` span is present.
    """
    asv_panels = self.doc.xpath('//span[@id="index:panelASV"]')
    if len(asv_panels) > 1:
        # Assurance Vie, we do not support this kind of account.
        return None

    element_id = Attr('//span[starts-with(@id, "index:j_id")]', 'id')(self.doc)
    return element_id.split(':')[1]
def obj_DPE(self):
    """Return the energy class encoded in an ``energy-rank-X`` CSS class.

    The ``class`` attribute of the ``energy-rank`` element is split into
    tokens; the first token starting with ``energy-rank-`` supplies the
    letter, which is upper-cased and looked up on ``ENERGY_CLASS``.

    Returns ``NotAvailable`` when the element is missing, when it has no
    ``energy-rank-X`` token, or when the letter is not an attribute of
    ``ENERGY_CLASS``.
    """
    css_classes = Attr(
        '//div[has-class("energy-box")]//div[has-class("energy-rank")]',
        'class',
        default=""
    )(self)

    prefix = "energy-rank-"
    # Fall back to "" rather than indexing an empty list: an element that only
    # has the bare "energy-rank" class used to raise IndexError here.
    rank = next(
        (token.replace(prefix, "").upper()
         for token in css_classes.split()
         if token.startswith(prefix)),
        ""
    )
    return getattr(ENERGY_CLASS, rank, NotAvailable)
def obj_photos(self):
    """Return the offer picture as a single-element ``HousingPhoto`` list.

    The ``400x267`` size marker is replaced with ``800x600`` and the URL is
    resolved against the page URL. An empty ``src`` yields an empty list.
    """
    src = Attr(
        './/div[has-class("offer-picture")]//img',
        'src'
    )(self)
    if not src:
        return []

    # Ensure URL is absolute
    absolute = urljoin(self.page.url, src.replace('400x267', '800x600'))
    return [HousingPhoto(absolute)]
def obj_photos(self):
    """Return the photo referenced by ``data-img``, or ``NotLoaded``.

    The attribute is URL-unquoted. If an ``http://`` URL is embedded past
    the third character, only that URL is kept, up to the last ``?``.
    """
    raw = Attr(
        '.',
        'data-img',
        default=None
    )(self)
    if not raw:
        return NotLoaded

    url = unquote(raw)
    if "http://" in url[3:]:
        query_start = url.rfind("?")
        # No "?" found: slice to the end of the string.
        stop = None if query_start == -1 else query_start
        url = url[url.find("http://", 3):stop]
    return [HousingPhoto(url)]
def login(self, login, password):
    """Validate ``login`` against the form's own rules, then submit the form.

    The maximum length and the validation regex are read from the
    ``data-val-*`` attributes of the ``Email`` input.

    Raises:
        BrowserIncorrectPassword: if ``login`` is longer than the advertised
            maximum or does not match the validation regex. The message is
            the matching ``data-val-*`` text from the page.
    """
    email_xpath = '//input[@id="Email"]'

    # The attribute is read as text, so convert it before comparing with
    # len(login): comparing an int with a str raises TypeError on Python 3
    # and is always False on Python 2, which disabled the check.
    maxlength = int(Attr(email_xpath, 'data-val-maxlength-max')(self.doc))
    regex = Attr(email_xpath, 'data-val-regex-pattern')(self.doc)
    # their regex is: ^([\w\-+\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([\w-]+\.)+))([a-zA-Z]{2,15}|[0-9]{1,3})(\]?)$
    # but it is not very good, we escape - inside [] to avoid bad character range Exception
    # Raw strings have the same runtime value without invalid-escape warnings.
    regex = regex.replace(r'[\w-+\.]', r'[\w\-+\.]')

    if len(login) > maxlength:  # actually it's 60 char
        raise BrowserIncorrectPassword(Attr(email_xpath, 'data-val-maxlength')(self.doc))

    if not re.match(regex, login):
        raise BrowserIncorrectPassword(Attr(email_xpath, 'data-val-regex')(self.doc))

    form = self.get_form(xpath='//form[contains(@action, "/Login/Login")]')
    form['Email'] = login
    form['Password'] = password
    form.submit()
def obj_level(self):
    """Map the CSS classes of the current line's ``div`` to a level constant.

    Markers are tested in priority order and the first one present wins.
    Returns ``None`` when no known marker is present.
    """
    xpath = u'//*[@class="lignes"]//div[@id="%s"]' % self.env[u'line']
    css_classes = Attr(xpath, attr='class')(self).split()

    # Ordered from most to least severe, "_trav" variant first.
    levels = (
        (u"perturb_critique_trav", CRITICAL_AND_WORK),
        (u"perturb_critique", CRITICAL),
        (u"perturb_alerte_trav", ALERT_AND_WORK),
        (u"perturb_alerte", ALERT),
        (u"perturb_normal_trav", NORMAL_AND_WORK),
        (u"perturb_normal", NORMAL),
    )
    for marker, level in levels:
        if marker in css_classes:
            return level
    return None
def iter_internal_recipients(self):
    """Yield one ``Recipient`` per ``<li>`` under ``ul#idCmptToInterne``.

    Each entry is built from the JSON payload in its ``data-comptecomplet``
    attribute. Nothing is yielded when the list is absent from the page.
    """
    if not self.doc.xpath('//ul[@id="idCmptToInterne"]'):
        return

    for node in self.doc.xpath('//ul[@id="idCmptToInterne"]/li'):
        raw = Attr('.', 'data-comptecomplet')(node)
        # NOTE(review): as written this replace is a no-op; it may have been
        # meant to unescape an HTML entity -- confirm against the repository.
        payload = json.loads(raw.replace('"', '"'))

        recipient = Recipient()
        recipient.category = 'Interne'
        recipient.id = recipient.iban = payload['ibanCompte']
        recipient.label = payload['libelleCompte']
        recipient.enabled_at = date.today()
        recipient._formatted_iban = payload['ibanFormateCompte']
        recipient._account_title = payload['intituleCompte']
        recipient._bic = payload['bicCompte']
        # These private fields are set to empty strings.
        recipient._ref = ''
        recipient._code_origin = ''
        recipient._created_date = ''
        yield recipient
class item(ItemElement):
    """Declarative description of one ``Housing`` result in an offer list.

    Every ``obj_*`` attribute or method supplies the value of the matching
    field. XPaths are relative to the current teaser element.
    """
    klass = Housing

    # "<type>:<data-reference>", with the type taken from the environment.
    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('.//span[boolean(@data-reference)]', 'data-reference'))

    def obj_url(self):
        """Return the teaser link resolved against the browser's BASEURL."""
        return urljoin(
            self.page.browser.BASEURL,
            Link('.//h3[has-class("TeaserOffer-title")]/a')(self))

    obj_title = CleanText('.//h3[has-class("TeaserOffer-title")]')
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                './/div[has-class("MiniData")]//p[@data-behat="surfaceDesBiens"]'
            ),
            r'(\d*\.*\d*) .*'))
    obj_cost = CleanDecimal(
        Regexp(
            CleanText('.//strong[has-class("TeaserOffer-price-num")]'),
            r'([\d \.]*) .*'))
    obj_price_per_meter = PricePerMeterFilter()
    # Single character that follows the numeric part of the price text.
    obj_currency = Regexp(
        CleanText('.//strong[has-class("TeaserOffer-price-num")]'),
        r'[\d \.]* (.) .*')
    obj_location = CleanText('.//p[has-class("TeaserOffer-loc")]')
    obj_text = CleanText('.//p[has-class("TeaserOffer-description")]')

    def obj_photos(self):
        """Return a one-element list holding the teaser image ``src``."""
        return [
            Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src')(self)
        ]

    # Evaluated once, when the class body is executed.
    obj_date = datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price text mentions "charges comprises"."""
        price = CleanText(
            './/strong[has-class("TeaserOffer-price-num")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbPiecesDesBiens"]'
    )
    obj_bedrooms = NotAvailable

    def obj_details(self):
        """Return the availability date and the price mentions as a dict."""
        return {
            # Defaults to today's ISO date when no data-dispo span is found.
            "dispo": Date(
                Attr('.//span[boolean(@data-dispo)]', 'data-dispo',
                     default=datetime.date.today().isoformat()))(self),
            "priceMentions": CleanText(
                './/span[has-class("TeaserOffer-price-mentions")]')(
                    self)
        }
def obj_photos(self):
    """Return a one-element list holding the teaser image ``src``."""
    teaser_src = Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src')(self)
    return [teaser_src]
def obj_photos(self):
    """Return the ``src`` of every slider thumbnail image."""
    thumbnails = self.xpath('//li[has-class("OfferSlider-thumbs-item")]/img')
    return [Attr('.', 'src')(img) for img in thumbnails]
def obj_photos(self):
    """Return the item image as a ``HousingPhoto`` list, or an empty list."""
    src = Attr('./div[@class="item_image"]/span/span/img', 'src', default=None)(self)
    return [HousingPhoto(src)] if src else []
def condition(self):
    """Tell whether ``data-fact_ligne`` equals the ``subid`` in the env.

    A missing attribute is treated as an empty string.
    """
    line_id = Attr('.', 'data-fact_ligne', default='')(self)
    expected = self.env['subid']
    return expected == line_id
def next_page(self):
    """Return the current page URL with ``page`` set from the next-page link.

    The value comes from the ``data`` attribute of ``a#next-page``.
    """
    next_value = Attr('//a[@id="next-page"]', 'data')(self)
    current_url = self.page.url
    return add_qs(current_url, page=next_value)
def obj_code(self):
    """Extract a code from the ``onclick`` of the link in the label cell.

    The code is the quoted run that follows a comma and whitespace, cut at
    the first ``'`` or ``_``. Returns ``NotAvailable`` when nothing matches.
    """
    label_cell = TableCell('label')(self)[0]
    anchors = label_cell.xpath('.//a')
    onclick = Attr(None, 'onclick').filter(anchors)

    found = re.search(',\s+\'([^\'_]+)', onclick)
    if found:
        return found.group(1)
    return NotAvailable
def next_page(self):
    """Build the POST request for the next history page.

    Returns ``None`` when the page has no link titled "suivant".
    """
    link_id = Attr('//a[@title="suivant"]', 'id', default=None)(self.page.doc)
    if not link_id:
        return None

    form = self.page.get_history_form(link_id)
    return requests.Request("POST", form.url, data=dict(form))
def get_performance_url(self):
    """Return the ``data-href`` of the link in the first presentation tab.

    Returns ``None`` when no such link exists.
    """
    first_tab_href = Attr('(//li[@role="presentation"])[1]//a', 'data-href', default=None)
    return first_tab_href(self.doc)
def go_start(self):
    """Submit the history form for the "debut"/"precedent" link, if any."""
    link_id = Attr('//a[@title="debut" or @title="precedent"]', 'id', default=None)(self.doc)
    if not link_id:
        return

    self.get_history_form(link_id).submit()
def get_multi(self):
    """Return the ``value`` of every option of the ``ComboEntreprise`` select."""
    values = []
    for option in self.doc.xpath('//select[@class="ComboEntreprise"]/option'):
        values.append(Attr('.', 'value')(option))
    return values
def parse(self, el):
    """Populate ``self.env`` with valuation and code data for one investment.

    Sets ``unitvalue``, ``vdate``, ``_link`` and ``asset_category``, then
    tries to resolve ``code``/``code_type`` either from the fund detail URL
    or from the page reached through the fund link. Several branches
    return early once the code is known or known to be unavailable.
    """
    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), '^([\d,]+)$', default=None)(span)
        if vdate is None:
            # Dates whose parent div mentions a maturity are skipped.
            vdate = None if any(x in CleanText('./parent::div')(span) for x in ["échéance", "Maturity"]) else \
                Regexp(CleanText('.'), '^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable
    self.env['_link'] = None
    self.env['asset_category'] = NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        form = self.page.get_form('//div[@id="operation"]//form')
        # The fund id is whatever follows the first '-' of the pdf link id.
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if 'hsbc.fr' in self.page.browser.BASEURL:
            # Special space for HSBC, does not contain any information related to performances.
            m = re.search(r'fundid=(\w+).+SH=(\w+)', CleanText('//complete', default='')(page.doc))
            if m:
                # had to put full url to skip redirections.
                page = page.browser.open('https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()).page

        elif not self.page.browser.history.is_here():
            url = page.get_invest_url()

            if empty(url):
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            # URLs used in browser.py to access investments performance history:
            if url.startswith('https://optimisermon.epargne-retraite-entreprises'):
                # This URL can be used to access the BNP Wealth API to fetch investment performance and ISIN code
                self.env['_link'] = url
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return
            elif (url.startswith('http://sggestion-ede.com/product') or
                  url.startswith('https://www.lyxorfunds.com/part') or
                  url.startswith('https://www.societegeneralegestion.fr') or
                  url.startswith('http://www.etoile-gestion.com/productsheet')):
                self.env['_link'] = url

            # Try to fetch ISIN code from URL with re.match
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                if is_isin_valid(match.group(1)):
                    self.env['code_type'] = Investment.CODE_TYPE_ISIN
                else:
                    self.env['code_type'] = Investment.CODE_TYPE_AMF
                return

            # Try to fetch ISIN code from URL with re.search
            m = re.search(r'&ISIN=([^&]+)', url)
            m = m or re.search(r'&isin=([^&]+)', url)
            m = m or re.search(r'&codeIsin=([^&]+)', url)
            m = m or re.search(r'lyxorfunds\.com/part/([^/]+)', url)
            if m:
                self.env['code'] = m.group(1)
                if is_isin_valid(m.group(1)):
                    self.env['code_type'] = Investment.CODE_TYPE_ISIN
                else:
                    self.env['code_type'] = Investment.CODE_TYPE_AMF
                return

            useless_urls = (
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                'http://docfinder.is.bnpparibas-ip.com/',
                # The AXA website displays performance graphs but everything is calculated using JS scripts.
                # There is an API but it only contains risk data and performances per year, not 1-3-5 years.
                'https://epargne-salariale.axa-im.fr/fr/',
                # Redirection to the Rothschild Gestion website, which doesn't exist anymore...
                'https://www.rothschildgestion.com',
                # URL to the Morningstar website does not contain any useful information
                'http://doc.morningstar.com',
            )
            for useless_url in useless_urls:
                if url.startswith(useless_url):
                    self.env['code'] = NotAvailable
                    self.env['code_type'] = NotAvailable
                    return

            if url.startswith('http://fr.swisslife-am.com/fr/'):
                self.page.browser.session.cookies.set('location', 'fr')
                self.page.browser.session.cookies.set('prof', 'undefined')
            try:
                page = self.page.browser.open(url).page
            except HTTPNotFound:
                # Some pages lead to a 404 so we must avoid unnecessary crash
                self.logger.warning('URL %s was not found, investment details will be skipped.', url)

    if isinstance(page, CodePage):
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
        self.env['asset_category'] = page.get_asset_category()
    else:
        # The page is not handled and does not have a get_code method.
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
        self.env['asset_category'] = NotAvailable
def get_history_jid(self):
    """Return the second ``:``-separated part of the ``index:j_id…`` element id."""
    element_id = Attr('//*[starts-with(@id, "index:j_id")]', 'id')(self.doc)
    parts = element_id.split(':')
    return parts[1]
def get_iframe(self):
    """Return the ``src`` of the ``iframePartenaire`` iframe."""
    frames = self.doc.xpath('//iframe[@id="iframePartenaire"]')
    src_filter = Attr(None, 'src')
    return src_filter.filter(frames)
def load_virtual(self, phonenumber):
    """Return one character of the ``onclick`` of the line for ``phonenumber``.

    A line matches when the text after its last ``-`` (stripped) equals
    ``phonenumber``. The result is the second character following the first
    ``(`` of that line's ``onclick``. Returns ``None`` if no line matches.
    """
    for line in self.doc.xpath('//div[@class="infosLigne pointer"]'):
        shown_number = CleanText('.')(line).split("-")[-1].strip()
        if shown_number == phonenumber:
            after_paren = Attr('.', 'onclick')(line).split('(')[1]
            return after_paren[1]
    return None
def obj_label(self):
    """Return ``data-nomcontrat``, or the element's text when it is empty or missing."""
    return Attr('.', 'data-nomcontrat', default=None)(self) or CleanText('.')(self)
def get_dropdown_menu(self, account_id):
    """Return the ``idCptSelect`` option value matching ``account_id``.

    An option matches when ``account_id`` appears in its text with spaces
    removed. Returns ``None`` when no option matches.
    """
    # Get the 'idCptSelect' in a drop-down menu that corresponds the current account
    options = self.doc.xpath('//select[@id="idCptSelect"]//option[@value]')
    for option in options:
        label = CleanText('.', replace=[(' ', '')])(option)
        if account_id in label:
            return Attr('.', 'value')(option)
    return None
class item(ItemElement):
    """Declarative description of one ``Housing`` result in an offer list.

    Every ``obj_*`` attribute or method supplies the value of the matching
    field. XPaths are relative to the current teaser element.
    """
    klass = Housing

    # "<type>:<data-reference>", with the type taken from the environment.
    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('.//span[boolean(@data-reference)]', 'data-reference'))
    obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Return the house type whose slug appears as a segment of the URL.

        Returns ``NotLoaded`` when no slug from ``QUERY_HOUSE_TYPES`` matches.
        """
        url = self.obj_url(self)
        for house_type, types in QUERY_HOUSE_TYPES.items():
            for type in types:
                if ('/%s/' % type) in url:
                    return house_type
        return NotLoaded

    # NOTE(review): obj_url is assigned a second time here with the same filter.
    obj_url = AbsoluteLink('.//h3[has-class("TeaserOffer-title")]/a')
    obj_title = CleanText('.//h3[has-class("TeaserOffer-title")]')
    obj_area = CleanDecimal(Regexp(CleanText(
        './/div[has-class("MiniData")]//p[@data-behat="surfaceDesBiens"]'
    ), r'(\d*\.*\d*) .*', default=NotAvailable), default=NotAvailable)
    obj_cost = CleanDecimal(
        './/strong[has-class("TeaserOffer-price-num")]',
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency(
        './/strong[has-class("TeaserOffer-price-num")]')
    obj_location = CleanText('.//p[has-class("TeaserOffer-loc")]')
    obj_text = CleanText('.//p[has-class("TeaserOffer-description")]')

    def obj_photos(self):
        """Return the teaser photo, or an empty list for a site-relative src."""
        url = CleanText(
            Attr('.//a[has-class("TeaserOffer-ill")]/img', 'src'))(self)
        # If the used photo is a default no photo, the src is on the same domain.
        if url[0] == '/':
            return []
        else:
            return [HousingPhoto(url)]

    # Evaluated once, when the class body is executed.
    obj_date = datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price text mentions "charges comprises"."""
        price = CleanText(
            './/strong[has-class("TeaserOffer-price-num")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbPiecesDesBiens"]',
        default=NotLoaded)
    obj_bedrooms = CleanDecimal(
        './/div[has-class("MiniData")]//p[@data-behat="nbChambresDesBiens"]',
        default=NotLoaded)

    def obj_details(self):
        """Return the availability date and the price mentions as a dict."""
        return {
            # Defaults to today's ISO date when no data-dispo span is found.
            "dispo": Date(
                Attr('.//span[boolean(@data-dispo)]', 'data-dispo',
                     default=datetime.date.today().isoformat()))(self),
            "priceMentions": CleanText(
                './/span[has-class("TeaserOffer-price-mentions")]')(
                    self)
        }
class item(ItemElement):
    """Item whose id is the ``data-trend-name`` attribute of the element."""
    klass = BaseObject

    obj_id = Attr('.', 'data-trend-name')
class get_housing(ItemElement):
    """Declarative description of a ``Housing`` read from an offer detail page.

    Every ``obj_*`` attribute or method supplies the value of the matching
    field. XPaths are absolute, i.e. evaluated on the whole document.
    """
    klass = Housing

    # "<type>:<data-property-reference>", with the type taken from the environment.
    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('//div[boolean(@data-property-reference)]',
             'data-property-reference'))
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_type(self):
        """Map the ``type`` environment value (and the URL) to a post type."""
        type = Env('type')(self)
        if type == 'location':
            # Furnished rentals are recognised from the page URL.
            if 'appartement-meuble' in self.page.url:
                return POSTS_TYPES.FURNISHED_RENT
            else:
                return POSTS_TYPES.RENT
        elif type == 'achat':
            return POSTS_TYPES.SALE
        else:
            return NotAvailable

    def obj_url(self):
        """Return the URL of the current page."""
        return self.page.url

    def obj_house_type(self):
        """Return the house type whose slug appears as a segment of the URL.

        Returns ``NotAvailable`` when no slug from ``QUERY_HOUSE_TYPES`` matches.
        """
        url = self.obj_url()
        for house_type, types in QUERY_HOUSE_TYPES.items():
            for type in types:
                if ('/%s/' % type) in url:
                    return house_type
        return NotAvailable

    obj_title = CleanText('//h1[has-class("OfferTop-title")]')
    obj_area = CleanDecimal(Regexp(CleanText(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][1]'),
        r'(\d*\.*\d*) .*', default=NotAvailable), default=NotAvailable)
    obj_cost = CleanDecimal('//span[has-class("OfferTop-price")]',
                            default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    obj_currency = Currency('//span[has-class("OfferTop-price")]')
    obj_location = Format('%s - %s',
                          CleanText('//p[@data-behat="adresseBien"]'),
                          CleanText('//p[has-class("OfferTop-loc")]'))
    obj_text = CleanText('//div[has-class("OfferDetails-content")]/p[1]')
    obj_phone = Regexp(Link('//a[has-class("OfferContact-btn--tel")]'),
                       r'tel:(.*)')

    def obj_photos(self):
        """Return the slider images, with ``640/480`` swapped for ``800/600``."""
        photos = []
        for photo in self.xpath('//div[has-class("OfferSlider")]//img'):
            photo_url = Attr('.', 'src')(photo)
            photo_url = photo_url.replace('640/480', '800/600')
            photos.append(HousingPhoto(photo_url))
        return photos

    # Evaluated once, when the class body is executed.
    obj_date = datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price text mentions "charges comprises"."""
        price = CleanText('//p[has-class("OfferTop-price")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][2]',
        default=NotAvailable)
    obj_bedrooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][3]',
        default=NotAvailable)

    def obj_DPE(self):
        """Derive the energy class from the number in the DPE image URL.

        Returns ``NotAvailable`` when the image or the number cannot be found.
        """
        try:
            electric_consumption = CleanDecimal(
                Regexp(
                    Attr('//div[has-class("OfferDetails-content")]//img',
                         'src'),
                    r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
        except (RegexpError, XPathNotFound):
            electric_consumption = None

        DPE = ""
        if electric_consumption is not None:
            # Thresholds: <=50 A, <=90 B, <=150 C, <=230 D, <=330 E, <=450 F, else G.
            if electric_consumption <= 50:
                DPE = "A"
            elif 50 < electric_consumption <= 90:
                DPE = "B"
            elif 90 < electric_consumption <= 150:
                DPE = "C"
            elif 150 < electric_consumption <= 230:
                DPE = "D"
            elif 230 < electric_consumption <= 330:
                DPE = "E"
            elif 330 < electric_consumption <= 450:
                DPE = "F"
            else:
                DPE = "G"
            return getattr(ENERGY_CLASS, DPE, NotAvailable)
        return NotAvailable

    def obj_details(self):
        """Collect availability, price mentions, agency and detail sections.

        Each titled section of the page becomes a nested dict under its
        title; bullet items are stored with the value ``True``.
        """
        details = {}
        # Defaults to today's ISO date when the page shows no date.
        dispo = Date(
            Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
                   r'.* (\d\d\/\d\d\/\d\d\d\d)',
                   default=datetime.date.today().isoformat()))(self)
        if dispo is not None:
            details["dispo"] = dispo

        priceMentions = CleanText('//p[has-class("OfferTop-mentions")]',
                                  default=None)(self)
        if priceMentions is not None:
            details["priceMentions"] = priceMentions

        agency = CleanText('//p[has-class("OfferContact-address")]',
                           default=None)(self)
        if agency is not None:
            details["agency"] = agency

        for item in self.xpath(
                '//div[has-class("OfferDetails-columnize")]/div'):
            category = CleanText(
                './h3[has-class("OfferDetails-title--2")]',
                default=None)(item)
            # Sections without a title are skipped.
            if not category:
                continue
            details[category] = {}
            for detail_item in item.xpath(
                    './/ul[has-class("List--data")]/li'):
                detail_title = CleanText(
                    './/span[has-class("List-data")]')(detail_item)
                detail_value = CleanText('.//*[has-class("List-value")]')(
                    detail_item)
                details[category][detail_title] = detail_value
            for detail_item in item.xpath(
                    './/ul[has-class("List--bullet")]/li'):
                detail_title = CleanText('.')(detail_item)
                details[category][detail_title] = True

        try:
            electric_consumption = CleanDecimal(
                Regexp(
                    Attr('//div[has-class("OfferDetails-content")]//img',
                         'src'),
                    r'https://dpe.foncia.net\/(\d+)\/.*'))(self)
            details["electric_consumption"] = (
                '{} kWhEP/m².an'.format(electric_consumption))
        except (RegexpError, XPathNotFound):
            # No DPE image or no number in its URL: leave the key out.
            pass

        return details
def get_origin_account_id(self, origin):
    """Return the first ``data-acct-number`` that is contained in ``origin.id``.

    Raises ``IndexError`` when no internal account matches.
    """
    account_divs = self.doc.xpath('//div[@id="internalAccounts"]//div[@data-acct-number]')
    matching = []
    for div in account_divs:
        if Attr('.', 'data-acct-number')(div) in origin.id:
            matching.append(Attr('.', 'data-acct-number')(div))
    return matching[0]
def obj_thumbnail(self):
    """Build a ``Thumbnail`` from the ``background-image`` of the recipe span.

    The ``background-image:url(`` prefix is removed and trailing ``)`` and
    ``;`` characters are stripped.
    """
    style = Attr('.//a[has-class("bu_cuisine_recette_img")]/span', 'style')(self)
    without_prefix = style.replace("background-image:url(", "")
    return Thumbnail(without_prefix.rstrip(");"))
def able_to_transfer(self, origin):
    """Return the internal-account divs for ``origin`` that are not disabled.

    A div is kept when its ``data-acct-number`` is contained in
    ``origin.id`` and its ``class`` attribute does not contain "disabled".
    """
    usable = []
    for div in self.doc.xpath('//div[@id="internalAccounts"]//div[@data-acct-number]'):
        if Attr('.', 'data-acct-number')(div) not in origin.id:
            continue
        if 'disabled' in div.attrib['class']:
            continue
        usable.append(div)
    return usable
class item(ItemElement):
    """Declarative description of one ``Housing`` result in an offer list.

    Every ``obj_*`` attribute or method supplies the value of the matching
    field. Most XPaths are built on ``offer_details_wrapper``.
    """
    # Common XPath prefix of the block holding the offer details.
    offer_details_wrapper = (
        './div/div/div[has-class("offer-details-wrapper")]')
    klass = Housing

    # "<type prefix>-<element id without 'header-offer-'>".
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')]))
    obj_type = Env('query_type')
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL

    def obj_house_type(self):
        """Map the lower-cased offer type text to a ``HOUSE_TYPES`` value."""
        house_type = CleanText('.//p[has-class("offer-type")]')(
            self).lower()
        if house_type == "appartement":
            return HOUSE_TYPES.APART
        elif house_type == "maison":
            return HOUSE_TYPES.HOUSE
        elif house_type == "terrain":
            return HOUSE_TYPES.LAND
        elif house_type == "parking":
            return HOUSE_TYPES.PARKING
        else:
            return HOUSE_TYPES.OTHER

    obj_title = Attr(
        offer_details_wrapper + '/div/div/p[@class="offer-type"]/a',
        'title')
    # The detail URL is built from the element id.
    obj_url = Format(
        "http://www.logic-immo.com/%s.htm",
        CleanText('./@id',
                  replace=[('header-offer-', 'detail-location-')]))
    obj_area = CleanDecimal(
        (offer_details_wrapper +
         '/div/div/div[has-class("offer-details-second")]' +
         '/div/h3[has-class("offer-attributes")]/span' +
         '/span[has-class("offer-area-number")]'),
        default=NotAvailable)
    obj_rooms = CleanDecimal(
        (offer_details_wrapper +
         '/div/div/div[has-class("offer-details-second")]' +
         '/div/h3[has-class("offer-attributes")]' +
         '/span[has-class("offer-rooms")]' +
         '/span[has-class("offer-rooms-number")]'),
        default=NotAvailable)
    obj_price_per_meter = PricePerMeterFilter()
    # Keeps the part of the price text that precedes a currency symbol.
    obj_cost = CleanDecimal(Regexp(
        CleanText((offer_details_wrapper +
                   '/div/div/p[@class="offer-price"]/span'),
                  default=NotAvailable),
        '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
        default=NotAvailable), default=NotAvailable)
    obj_currency = Currency(
        offer_details_wrapper + '/div/div/p[has-class("offer-price")]/span')
    obj_utilities = UTILITIES.UNKNOWN
    # dd/mm/yyyy date found in the "offer-update" paragraph.
    obj_date = Date(
        Regexp(
            CleanText(
                './div/div/div[has-class("offer-picture-more")]/div/p[has-class("offer-update")]'
            ),
            ".*(\d{2}/\d{2}/\d{4}).*"))
    obj_text = CleanText(
        offer_details_wrapper +
        '/div/div/div/p[has-class("offer-description")]/span')
    obj_location = CleanText(offer_details_wrapper +
                             '//div[has-class("offer-places-block")]')

    def obj_photos(self):
        """Return the offer picture, enlarged and made absolute."""
        photos = []
        url = Attr('.//div[has-class("offer-picture")]//img', 'src')(self)
        if url:
            url = url.replace('400x267', '800x600')
            url = urljoin(self.page.url, url)  # Ensure URL is absolute
            photos.append(HousingPhoto(url))
        return photos

    def obj_details(self):
        """Return a dict holding the agency fees when the page shows them."""
        details = {}
        honoraires = CleanText(
            (self.offer_details_wrapper +
             '/div/div/p[@class="offer-agency-fees"]'),
            default=None)(self)
        if honoraires:
            # Keep the text that follows the first ':' of the fees paragraph.
            details["Honoraires"] = ("{} (TTC, en sus)".format(
                honoraires.split(":")[1].strip()))
        return details
class get_housing(ItemElement):
    """Declarative description of a ``Housing`` read from an offer detail page.

    Every ``obj_*`` attribute or method supplies the value of the matching
    field. XPaths are absolute, i.e. evaluated on the whole document.
    """
    klass = Housing

    # "<type>:<data-property-reference>", with the type taken from the environment.
    obj_id = Format(
        '%s:%s',
        Env('type'),
        Attr('//div[boolean(@data-property-reference)]',
             'data-property-reference'))

    def obj_url(self):
        """Return the URL of the current page."""
        return self.page.url

    obj_title = CleanText('//h1[has-class("OfferTop-title")]')
    obj_area = CleanDecimal(
        Regexp(
            CleanText(
                '//div[has-class("MiniData")]//p[has-class("MiniData-item")][1]'
            ),
            r'(\d*\.*\d*) .*'))
    obj_cost = CleanDecimal(
        Regexp(CleanText('//p[has-class("OfferTop-price")]'),
               r'([\d \.]*) .*'))
    obj_price_per_meter = PricePerMeterFilter()
    # Single character that follows the numeric part of the price text.
    obj_currency = Regexp(CleanText('//p[has-class("OfferTop-price")]'),
                          r'[\d \.]* (.) .*')
    obj_location = Format('%s - %s',
                          CleanText('//p[@data-behat="adresseBien"]'),
                          CleanText('//p[has-class("OfferTop-loc")]'))
    obj_text = CleanText('//div[has-class("OfferDetails-content")]/p[1]')
    obj_phone = Regexp(Link('//a[has-class("OfferContact-btn--tel")]'),
                       r'tel:(.*)')

    def obj_photos(self):
        """Return the ``src`` of every slider thumbnail image."""
        photos = []
        for photo in self.xpath(
                '//li[has-class("OfferSlider-thumbs-item")]/img'):
            photos.append(Attr('.', 'src')(photo))
        return photos

    # Evaluated once, when the class body is executed.
    obj_date = datetime.date.today()

    def obj_utilities(self):
        """Return INCLUDED when the price text mentions "charges comprises"."""
        price = CleanText('//p[has-class("OfferTop-price")]')(self)
        if "charges comprises" in price.lower():
            return UTILITIES.INCLUDED
        else:
            return UTILITIES.EXCLUDED

    obj_rooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][2]')
    obj_bedrooms = CleanDecimal(
        '//div[has-class("MiniData")]//p[has-class("MiniData-item")][3]',
        default=NotAvailable)

    def obj_details(self):
        """Collect availability, mentions, agency, detail sections and DPE.

        Each titled section of the page becomes a nested dict under its
        title; bullet items are stored with the value ``True``. The energy
        consumption is the number in the DPE image URL, and the ``DPE``
        letter is derived from it. Both are ``NotAvailable`` when the URL
        carries no number.
        """
        details = {
            # Defaults to today's ISO date when the page shows no date.
            "dispo": Date(
                Regexp(CleanText('//p[has-class("OfferTop-dispo")]'),
                       r'.* (\d\d\/\d\d\/\d\d\d\d)',
                       default=datetime.date.today().isoformat()))(self),
            "priceMentions": CleanText(
                '//p[has-class("OfferTop-mentions")]')(self),
            "agency": CleanText(
                '//p[has-class("OfferContact-address")]')(self)
        }

        for item in self.xpath(
                '//div[has-class("OfferDetails-columnize")]/div'):
            category = CleanText(
                './h3[has-class("OfferDetails-title--2")]')(item)
            details[category] = {}
            for detail_item in item.xpath(
                    './/ul[has-class("List--data")]/li'):
                detail_title = CleanText(
                    './/span[has-class("List-data")]')(detail_item)
                detail_value = CleanText('.//*[has-class("List-value")]')(
                    detail_item)
                details[category][detail_title] = detail_value
            for detail_item in item.xpath(
                    './/ul[has-class("List--bullet")]/li'):
                detail_title = CleanText('.')(detail_item)
                details[category][detail_title] = True

        # Apply the whole filter chain to ``self``. Previously ``(self)`` was
        # applied to the inner Regexp, so this name held a CleanDecimal filter
        # object instead of a number: it was never None, and it was compared
        # with the integer thresholds below.
        # NOTE(review): relies on CleanDecimal returning its default for a
        # None input -- confirm against the filters module in use.
        electric_consumption = CleanDecimal(
            Regexp(Attr('//div[has-class("OfferDetails-content")]//img',
                        'src'),
                   r'https://dpe.foncia.net\/(\d+)\/.*',
                   default=None),
            default=None)(self)

        if electric_consumption is not None:
            details["electric_consumption"] = '{} kWhEP/m².an'.format(
                electric_consumption)
            # Thresholds: <=50 A, <=90 B, <=150 C, <=230 D, <=330 E, <=450 F, else G.
            if electric_consumption <= 50:
                details["DPE"] = "A"
            elif electric_consumption <= 90:
                details["DPE"] = "B"
            elif electric_consumption <= 150:
                details["DPE"] = "C"
            elif electric_consumption <= 230:
                details["DPE"] = "D"
            elif electric_consumption <= 330:
                details["DPE"] = "E"
            elif electric_consumption <= 450:
                details["DPE"] = "F"
            else:
                details["DPE"] = "G"
        else:
            details["electric_consumption"] = NotAvailable
            details["DPE"] = NotAvailable

        return details
class item(ItemElement): klass = Subscription # TODO: Handle energy type obj_label = CleanText(CleanHTML('.')) obj_id = Attr('./input', 'value')
def parse(self, el):
    """Populate ``self.env`` with valuation and code data for one investment.

    Sets ``unitvalue`` and ``vdate``, then tries to resolve ``code`` and
    ``code_type`` either from the fund sheet URL or from the page reached
    through the fund link. Several branches return early once the code is
    known or known to be unavailable.
    """
    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), '^([\d,]+)$', default=None)(span)
        if vdate is None:
            # Dates whose parent div mentions a maturity are skipped.
            vdate = None if any(x in CleanText('./parent::div')(span) for x in [u"échéance", "Maturity"]) else \
                Regexp(CleanText('.'), '^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        form = self.page.get_form('//div[@id="operation"]//form')
        # The fund id is whatever follows the first '-' of the pdf link id.
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if "hsbc.fr" in self.page.browser.BASEURL:
            # special space for HSBC
            m = re.search('fundid=(\w+).+SH=(\w+)', CleanText('//complete', default="")(page.doc))
            if m:
                # had to put full url to skip redirections.
                page = page.browser.open('https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()).page

        elif "consulteroperations" not in self.page.browser.url:  # not on history
            url = Regexp(CleanText('//complete'), r"openUrlFichesFonds\('(.*?)',true|false\).*", default=NotAvailable)(page.doc)

            if url is NotAvailable:
                # redirection to a useless graphplot page with url like /portal/salarie-sg/fichefonds?idFonds=XXX&source=/portal/salarie-sg/monepargne/mesavoirs
                # or on bnp, look for plot display function in a script
                assert CleanText('//redirect/@url')(page.doc) or CleanText('//script[contains(text(), "afficherGraphique")]')(page.doc)
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            useless_urls = (
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                'http://docfinder.is.bnpparibas-ip.com/',
                # Redirection to a useless page with url like "https://epargne-salariale.axa-im.fr/fr/"
                'https://epargne-salariale.axa-im.fr/fr/',
            )

            for useless_url in useless_urls:
                if url.startswith(useless_url):
                    self.env['code'] = NotAvailable
                    self.env['code_type'] = NotAvailable
                    return

            # The code may be embedded directly in a cpr-am.fr URL.
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                self.env['code_type'] = Investment.CODE_TYPE_ISIN
                return

            if url.startswith('http://fr.swisslife-am.com/fr/'):
                self.page.browser.session.cookies.set('location', 'fr')
                self.page.browser.session.cookies.set('prof', 'undefined')
            page = self.page.browser.open(url).page

    try:
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
    # Handle page is None and page has not get_code method
    except AttributeError:
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
def get_typelist(self):
    """Return a one-entry dict parsed from the ``comptes/scripts`` script.

    The script referenced by the page is downloaded, and the two groups
    captured after ``synthesecomptes`` become the key and the value.
    """
    scripts = self.doc.xpath('//script[contains(@src, "comptes/scripts")]')
    script_url = Attr(None, 'src').filter(scripts)
    content = self.browser.open(script_url).content
    found = re.search('synthesecomptes[^\w]+([^:]+)[^\w]+([^"]+)', content)
    return {found.group(1): found.group(2)}
def obj__link(self):
    """Return the account link, suffixed with ``mouvements`` for ``/budget`` links.

    As a side effect, ``browser.webid`` is set from the 32-character path
    segment of the link when it is not set yet.
    """
    link = Attr('.//a[@class="account--name"]', 'href', default=NotAvailable)(self)

    if not self.page.browser.webid:
        found = re.search('\/([^\/|?|$]{32})(\/|\?|$)', link)
        self.page.browser.webid = found.group(1)

    if link.startswith('/budget'):
        return '%smouvements' % link
    return link
class item(Transaction.TransactionElement):
    """Transaction row whose details come from an asynchronously loaded page.

    The row's ``href`` is loaded under the name ``details``; the ``Async``
    filters below read from that page.
    """
    load_details = Attr('.', 'href', default=None) & AsyncLoad

    def obj_type(self):
        """Return the type of the first pattern matching the operation nature.

        Returns ``Transaction.TYPE_UNKNOWN`` when the details page has no
        such text or when no pattern matches.
        """
        type = Async(
            'details',
            CleanText(
                u'//td[contains(text(), "Nature de l\'opération")]/following-sibling::*[1]'
            ))(self)
        if not type:
            return Transaction.TYPE_UNKNOWN
        for pattern, _type in Transaction.PATTERNS:
            match = pattern.match(type)
            if match:
                return _type
                break
        return Transaction.TYPE_UNKNOWN

    def condition(self):
        """Keep rows with a date column, at least 3 cells and a non-``tableTr`` class."""
        return self.parent.get_colnum('date') is not None and \
            len(self.el.findall('td')) >= 3 and \
            self.el.get('class') and \
            'tableTr' not in self.el.get('class')

    def validate(self, obj):
        """Complete ``obj`` from the details page; always returns ``True``."""
        if obj.category == 'RELEVE CB':
            obj.type = Transaction.TYPE_CARD_SUMMARY
            obj.deleted = True

        # Label from the details page, falling back to the current raw label.
        raw = Async(
            'details',
            CleanText(
                u'//td[contains(text(), "Libellé")]/following-sibling::*[1]|//td[contains(text(), "Nom du donneur")]/following-sibling::*[1]',
                default=obj.raw))(self)
        if raw:
            # Replace the label when one text contains the other or when the
            # row label has no space; otherwise concatenate both.
            if obj.raw in raw or raw in obj.raw or ' ' not in obj.raw:
                obj.raw = raw
                obj.label = raw
            else:
                obj.label = '%s %s' % (obj.raw, raw)
                obj.raw = '%s %s' % (obj.raw, raw)

        if not obj.date:
            # No date on the row: read dates and amount from the details page.
            obj.date = Async(
                'details',
                Date(CleanText(
                    u'//td[contains(text(), "Date de l\'opération")]/following-sibling::*[1]',
                    default=u''), default=NotAvailable))(self)
            obj.rdate = obj.date
            obj.vdate = Async(
                'details',
                Date(CleanText(
                    u'//td[contains(text(), "Date de valeur")]/following-sibling::*[1]',
                    default=u''), default=NotAvailable))(self)
            obj.amount = Async(
                'details',
                CleanDecimal(
                    u'//td[contains(text(), "Montant")]/following-sibling::*[1]',
                    replace_dots=True, default=NotAvailable))(self)

        # ugly hack to fix broken html
        if not obj.amount:
            obj.amount = Async(
                'details',
                CleanDecimal(
                    u'//td[contains(text(), "Montant")]/following-sibling::*[1]',
                    replace_dots=True, default=NotAvailable))(self)

        return True
def history_tabs_urls(self):
    """Return the ``href`` of every tab link whose text contains "Débit le"."""
    urls = []
    for tab in self.doc.xpath('//ul//a[contains(text(), "Débit le")]'):
        urls.append(Attr('.', 'href')(tab))
    return urls
def get_login(self, phonenumber):
    """Return the ``login`` attribute of the first div listing ``phonenumber``.

    Raises ``IndexError`` when no div has a child div containing the number.
    """
    matches = self.doc.xpath(
        '//div[div[contains(text(), $phone)]]', phone=phonenumber)
    first_match = matches[0]
    return Attr('.', 'login')(first_match)
def has_next(self):
    """Tell whether the last page number is greater than the current one.

    When either value is empty, that empty value is returned as is.
    """
    current = Attr('//input[@id="numPage"]', 'value', default='')(self.doc)
    end = CleanText('//td[@id="numPageBloc"]/b[@class="contenu3-lien"]', replace=[('/', '')])(self.doc)

    if not end:
        return end
    if not current:
        return current
    return int(end) > int(current)
def get_number(self):
    """Return the ``data-main-video`` attribute of the ``player`` div."""
    main_video = Attr('//div[@id="player"]', 'data-main-video')
    return main_video(self.doc)