def get_session_storage(self):
    """Return the pair passed to ``sessionStorage.setItem`` in the page scripts.

    The argument list captured from the JavaScript call is split on its first
    comma only, so commas inside the value are preserved.

    :return: ``(key, value)`` where ``value`` is the JSON-decoded second
             argument of the call.
    """
    # Raw string: "\(" is an invalid escape sequence in a normal literal.
    # The dot is escaped so that it only matches a literal ".".
    session_content = Regexp(
        CleanText('//script[@type="text/javascript"]'),
        r'sessionStorage\.setItem\((.*)\)'
    )(self.doc)
    # Both arguments are stripped of surrounding quotes, then of whitespace.
    key, value = [part.strip("'").strip() for part in session_content.split(",", 1)]
    return key, json.loads(value)
def obj_type(self):
    """Guess the account type from its label, then from its section title.

    Each key of the table below is a regular expression searched in the
    lower-cased text. The section title is only extracted when the label
    did not match anything.

    :return: an ``Account.TYPE_*`` constant (``TYPE_UNKNOWN`` if no match).
    """
    patterns = {'comptes? bancaires?': Account.TYPE_CHECKING,
                'livrets?': Account.TYPE_SAVINGS,
                'epargnes? logement': Account.TYPE_SAVINGS,
                "autres produits d'epargne": Account.TYPE_SAVINGS,
                'comptes? titres? et pea': Account.TYPE_MARKET,
                'compte-titres': Account.TYPE_MARKET,
                'assurances? vie et retraite': Account.TYPE_LIFE_INSURANCE,
                u'prêt': Account.TYPE_LOAN,
                u'crédits?': Account.TYPE_LOAN,
                'plan d\'epargne en actions': Account.TYPE_PEA
                }

    def lookup(text):
        # match with/without plural in type
        for pattern, account_type in patterns.items():
            if re.search(pattern, text.lower()):
                return account_type
        return None

    # first trying to match with label
    found = lookup(Field('label')(self))
    if found is not None:
        return found

    # then by type
    section_title = Regexp(CleanText('../../preceding-sibling::div[@class="avoirs"][1]/span[1]'),
                           r'(\d+) (.*)', '\\2')(self)
    found = lookup(section_title)
    if found is not None:
        return found

    return Account.TYPE_UNKNOWN
def filter(self, el):
    """Convert the duration found in the text of ``el[0]`` to a timedelta.

    The duration is whatever follows the last ``|`` of the text. Two forms
    are handled: ``N'`` (a number of minutes) and ``H:M``.

    :param el: sequence whose first item holds the text to parse.
    :return: :class:`datetime.timedelta` built from the hours and minutes.
    """
    duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])
    if duration.endswith("'"):
        hours, minutes = 0, int(duration[:-1])
    else:
        # Build a real list: on Python 3 map() returns an iterator that
        # cannot be indexed.
        parts = [int(part) for part in duration.split(':')]
        hours, minutes = parts[0], parts[1]
    return timedelta(hours=hours, minutes=minutes)
def get_cards(self):
    """Return the card operation links found in the ``onclick`` of table rows.

    A row is kept when the quoted part of its ``onclick`` attribute starts
    with ``/outil/UWCB/UWCBEncours`` and contains ``listeOperations``.

    :return: list of link strings, in document order.
    """
    cards = []
    # iter() replaces getiterator(), which is deprecated in the ElementTree
    # and lxml APIs.
    for tr in self.doc.iter('tr'):
        link = Regexp(CleanText('./@onclick'), "'(.*)'", default=None)(tr)
        if link is None:
            continue
        if link.startswith('/outil/UWCB/UWCBEncours') and 'listeOperations' in link:
            cards.append(link)
    return cards
def obj_id(self):
    """Return the video id taken from its link, or a hash-based fallback.

    When the ``href`` of the child ``<a>`` does not yield an id, the id is
    ``vid_`` followed by the MD5 hex digest of the element text (non-ASCII
    characters are dropped before hashing).
    """
    from_link = Regexp(CleanText('./a/@href'), '//www.france.tv/(.*)/', default=None)(self)
    if from_link:
        return from_link

    text = CleanText('.')(self)
    digest = hashlib.md5(text.encode('ascii', 'ignore')).hexdigest()
    return u'vid_%s' % digest
def obj_rdate(self):
    """Return the real date of the transaction.

    Order of preference: the ``rdate`` already set on the object, a date
    found in the raw label (``dd/mm/yy`` or six digits), then the ``date``
    field.
    """
    if self.obj.rdate:
        # Transaction.Raw may have already set it
        return self.obj.rdate

    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    s = Regexp(Field('raw'), r' (\d{2}/\d{2}/\d{2}) | (?!NUM) (\d{6}) ', default=NotAvailable)(self)
    if not s:
        return Field('date')(self)

    digits = s.replace('/', '')
    # Sometimes the user enters an invalid date 16/17/19 for example
    return Date(dayfirst=True, default=NotAvailable).filter(
        '%s-%s-%s' % (digits[:2], digits[2:4], digits[4:]))
def on_load(self):
    """Follow the anti-robot redirection when the website serves one.

    Nothing is done when the page has no ``head/meta`` ``name`` attribute.
    Otherwise the browser is sent either to the iframe source or to the URL
    found in the hex-encoded JavaScript.
    """
    import binascii

    # website may have identify us as a robot, if it happens login form won't be available
    try:
        attr = Attr('head/meta', 'name')(self.doc)
    except AttributeNotFound:
        # website have identify us as a human ;)
        return

    # sometimes robots is uppercase and there is an iframe
    # sometimes it's lowercase and there is an encoded javascript
    if attr == 'ROBOTS':
        self.browser.location(Attr('//iframe', 'src')(self.doc))
    elif attr == 'robots':
        hexa_code = Regexp(CleanText('head/script[contains(text(), "function")]'), r'var b="(.*?)"')(self.doc)
        # str.decode("hex") only exists on Python 2; unhexlify works on both
        # major versions. The result is decoded so it can be searched with a
        # text pattern.
        code = binascii.unhexlify(hexa_code).decode('utf-8', 'replace')
        url = re.search(r'xhr.open\("GET","(.*?)"', code).group(1)
        self.browser.location(url)
def parse(self, el):
    """Collect the advisor and agency contact details into ``self.env``.

    Always sets ``phone`` and ``name``; ``mobile`` is set when the advisor
    number starts with 06 or 07, and ``agency`` when an advisor name exists.
    """
    # we have two kinds of page and sometimes we don't have any advisor
    agency_phone = CleanText('//span/a[contains(@href, "rendezVous")]', replace=[(' ', '')], default=NotAvailable)(self) or \
        CleanText('//div[has-class("lbp-numero")]/span', replace=[(' ', '')], default=NotAvailable)(self)

    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    advisor_phone = Regexp(
        CleanText('//div[h3[contains(text(), "conseil")]]//span[2]', replace=[(' ', '')], default=""),
        r'(\d+)', default="")(self)

    if advisor_phone.startswith(("06", "07")):
        self.env['phone'] = agency_phone
        self.env['mobile'] = advisor_phone
    else:
        self.env['phone'] = advisor_phone or agency_phone

    agency = CleanText('//div[h3[contains(text(), "Bureau")]]/div[not(@class)][1]')(self) or NotAvailable
    name = CleanText('//div[h3[contains(text(), "conseil")]]//span[1]', default=None)(self) or \
        CleanText('//div[@class="lbp-font-accueil"]/div[2]/div[1]/span[1]', default=None)(self)
    if name:
        self.env['name'] = name
        self.env['agency'] = agency
    else:
        # no advisor name: the agency is used as the name
        self.env['name'] = agency
class get_video(ItemElement):
    """Build a ``RmllVideo`` from the meta tags and scripts of a video page."""

    klass = RmllVideo

    # id: the part of the og:url permalink after "/permalink/"
    obj_id = CleanHTML('/html/head/meta[@property="og:url"]/@content') \
        & CleanText() \
        & Regexp(pattern=r'.*/permalink/(.+)/$')
    obj_title = Format(u'%s', CleanHTML('/html/head/meta[@name="DC.title"]/@content') & CleanText())
    obj_description = Format(
        u'%s',
        CleanHTML('/html/head/meta[@property="og:description"]/@content') & CleanText())

    def obj_thumbnail(self):
        """Return a ``Thumbnail`` for the og:image URL, or None without one."""
        image_url = NormalizeThumbnail(
            CleanText('/html/head/meta[@property="og:image"]/@content'))(self)
        if not image_url:
            return None
        thumb = Thumbnail(image_url)
        thumb.url = thumb.id
        return thumb

    obj_duration = CleanText('/html/head/script[not(@src)]') \
        & Regexp(pattern=r'media_duration: ([^,.]+),?.*,', default='') \
        & Duration(default=NotAvailable)

    def obj_url(self):
        """Return the first sharing link ending in mp4 or webm, else None."""
        candidates = XPath(
            '//div[@id="tab_sharing_content"]/div/div/div[@class="paragraph"]/div[@class="share"]/a[@target="_blank"]/@href'
        )(self)
        for candidate in candidates:
            extension = str(candidate).rsplit('.', 1)[-1]
            self.logger.debug("Link:%s Ext:%s", candidate, extension)
            if extension in ('mp4', 'webm'):
                return unicode(candidate)
        return None
def iter_pocket(self, label):
    """Yield the ``Pocket`` rows of the schedule table that match ``label``.

    Rows with two cells or fewer are skipped. Rows with fewer than six cells
    reuse the availability date and condition of the last full row. A pocket
    is yielded only when the code in parentheses at the end of its label is
    contained in the first word of ``label``.

    :param label: label whose first word selects the pockets; a first word
                  containing ``PEI`` is handled as ``PEE``.
    """
    date_available, condition = 0, 0
    for tr in self.doc.xpath(u'//table[@summary="Liste des échéances"]/tbody/tr'):
        tds = tr.findall('td')
        if len(tds) <= 2:
            continue

        pocket = Pocket()
        i = 0
        if len(tds) < 6:
            # no date cell on this row: inherit from the previous full row
            pocket.availability_date = date_available
            pocket.condition = condition
        else:
            # the first cell holds the date, so the other cells are shifted
            i += 1
            # Raw string: "\d" and "\/" are invalid escape sequences in a
            # normal literal.
            pocket.availability_date = Date(
                Regexp(CleanText(tds[0]), r'([\d\/]+)', default=NotAvailable),
                default=NotAvailable)(tr)
            date_available = pocket.availability_date

            pocket.condition = Pocket.CONDITION_DATE if pocket.availability_date is not NotAvailable else \
                self.CONDITIONS.get(CleanText(tds[0])(tr).lower().split()[0], Pocket.CONDITION_UNKNOWN)
            condition = pocket.condition

        pocket.label = CleanText(tds[i])(tr)
        pocket.quantity = CleanDecimal(tds[i + 3], replace_dots=True)(tr)
        pocket.amount = CleanDecimal(tds[i + 4], replace_dots=True)(tr)

        if 'PEI' in label.split()[0]:
            label = 'PEE'
        if Regexp(CleanText(tds[i]), r'\(([\w]+).*\)$')(tr) not in label.split()[0]:
            continue

        yield pocket
class item(ItemElement):
    """Build a ``SensCritiquenCalendarEvent`` from one programme entry."""

    klass = SensCritiquenCalendarEvent

    def condition(self):
        """Keep every entry, or only the one whose id equals ``env['_id']``."""
        if not ('_id' in self.env and self.env['_id']):
            return True

        entry_id = Format(u'%s#%s#%s',
                          Regexp(Link('.'), '/film/(.*)'),
                          FormatDate("%Y%m%d%H%M", Date('div[@class="elgr-guide-details"]/div[@class="elgr-data-diffusion"]')),
                          CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]))(self)
        return entry_id == self.env['_id']

    def validate(self, obj):
        """Accept ``obj`` when it fits the requested dates or an id is set."""
        if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
            if not self.env['date_to']:
                # no upper bound requested
                return True
            if empty(obj.end_date):
                if obj.start_date < self.env['date_to']:
                    return True
            elif obj.end_date <= self.env['date_to']:
                return True

        return '_id' in self.env

    obj_id = Format(u'%s#%s#%s',
                    Regexp(Link('.'), '/film/(.*)'),
                    FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')),
                    CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]))
    obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
    obj_summary = Format('%s - %s',
                         Regexp(CleanText('./div/img/@alt'), '^Affiche(.*)'),
                         CleanText('./div/span[@class="d-offset"]'))
class get_housing(ItemElement):
    """Build a ``Housing`` object from an advert detail page."""

    klass = Housing

    obj_id = Env('_id')
    obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
    obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
    # currency symbol found in the price, euro by default
    obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
                          '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
    # "\d" was an invalid escape sequence in this normal literal; the
    # backslash is now doubled, which keeps the pattern string unchanged.
    obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
                                   '(.*?)(\\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
    obj_location = CleanText('//div[@class="text-annonce"]/h2')
    obj_text = CleanText(CleanHTML('//div[@class="text-annonce-container"]/p'))
    obj_station = CleanText('//div[@class="metro"]')
    obj_phone = CleanText('(//span[@class="telephone hide-tel"])[1]')
    obj_url = BrowserURL('housing', _id=Env('_id'))

    def obj_details(self):
        """Return a dict of the footer label/value pairs plus the energy class."""
        details = dict()
        for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
            key = CleanText('./span[@class="label"]')(item)
            # the value is the item text with its label removed
            value = CleanText('.', replace=[(key, '')])(item)
            if value and key:
                details[key] = value

        key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
        value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
                       CleanText('//div[@class="classe-energie-content"]/div/@class',
                                 replace=[('-', ' ')]))(self)
        if value and key:
            details[key] = value
        return details

    def obj_photos(self):
        """Return one ``HousingPhoto`` per showcase thumbnail."""
        return [HousingPhoto(u'%s' % img)
                for img in XPath('//div[@class="showcase-thumbnail"]/img/@src')(self)]
class item(ItemElement):
    """Build a ``Housing`` object from one entry of the search results."""

    klass = Housing

    obj_id = QueryValue(
        Attr('.//div[has-class("presentationItem")]/h2/a', 'href'),
        'idter')
    obj_url = AbsoluteLink('.//h2/a')
    obj_type = POSTS_TYPES.SALE
    obj_advert_type = ADVERT_TYPES.PROFESSIONAL
    obj_house_type = HOUSE_TYPES.LAND
    obj_title = CleanText('.//div[@class="presentationItem"]/h2/a')
    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    obj_area = CleanDecimal(
        Regexp(CleanText('.//div[@class="presentationItem"]/h3'),
               r'surface de (\d+) m²'))
    obj_cost = CleanDecimal(
        CleanText('.//div[@class="presentationItem"]/h3/span[1]',
                  replace=[(".", ""), (" €", "")]))
    obj_currency = Currency.get_currency(u'€')
    obj_date = Date(
        CleanText(
            './/div[@class="presentationItem"]//span[@class="majItem"]',
            replace=[("Mise à jour : ", "")]))
    obj_text = CleanText('.//div[@class="presentationItem"]/p')
    obj_phone = CleanText(
        './/div[@class="divBoutonContact"]/div[@class="phone-numbers-bloc"]/p[1]/strong'
    )

    def obj_photos(self):
        """Return a one-item list with the first usable photo, else ``[]``.

        Empty ``data-src`` values are skipped, and an empty list (never
        None) is returned when the entry has no usable image.
        """
        for photo in self.xpath(
                './/div[has-class("photoItemListe")]/img/@data-src'):
            if photo:
                return [HousingPhoto(BASE_URL + '/' + photo)]
        return []

    obj_utilities = UTILITIES.UNKNOWN
class item(ItemElement):
    """Build a ``GaugeMeasure`` from one row of measurements."""

    klass = GaugeMeasure

    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    verif = re.compile(r"\d\d.\d\d.\d+ \d\d:\d\d")

    # "dd.mm.yyyy HH:MM" is rewritten as "yyyy-mm-dd HH:MM" before parsing
    obj_date = DateTime(
        Regexp(CleanText('.'), r'(\d+)\.(\d+)\.(\d+) (\d+):(\d+)',
               r'\3-\2-\1 \4:\5'))

    sensor_types = [u'Level', u'Flow']

    def obj_level(self):
        """Return the value of the current sensor, or NotAvailable.

        The child element read is the position of ``env['sensor'].name`` in
        ``sensor_types``, plus one.
        """
        index = self.sensor_types.index(self.env['sensor'].name) + 1
        try:
            return float(self.el[index].text_content())
        except ValueError:
            # the cell does not hold a number
            return NotAvailable
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from an advert detail page."""

    klass = BaseJobAdvert

    obj_description = Join('\n', '//div[@id="annonce-detail"]/p[@class="text"]', textCleaner=CleanHTML)
    obj_id = Env('_id')
    obj_url = BrowserURL('advert_page', _id=Env('_id'))
    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    obj_publication_date = Date(
        Regexp(CleanText('//div[@id="annonce-detail"]/p[@class="infos"]'),
               r'(\d{2}/\d{2}/\d{4})', default=NotAvailable),
        default=NotAvailable)
    obj_title = CleanText('//div[@id="annonce"]/div/div/h1')
    obj_society_name = CleanText('//section[@class="entp-resume"]/h1/a')
    obj_contract_type = CleanText('//dl[@class="infos-annonce"]/dt[span[@class="picto picto-contrat-grey"]]/following-sibling::dd[1]')
    obj_place = CleanText('//dl[@class="infos-annonce"]/dt[span[@class="picto picto-geolocalisation-grey"]]/following-sibling::dd[1]')
    # the "Salaire : " prefix is removed from the paragraph before the infos
    obj_pay = CleanText('//div[@id="annonce-detail"]/p[@class="infos"]/preceding-sibling::p[1]',
                        replace=[('Salaire : ', '')])
class item(ItemElement):
    """Build a ``Message`` from one entry of the stream."""

    klass = Message

    # replacements applied to the message text, shared by title and content
    _TEXT_REPLACEMENTS = [('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]

    obj_id = Regexp(Link('./div/div/small/a', default=''),
                    '/.+/status/(.+)', default=None)
    obj_title = Regexp(CleanText('./div/p', replace=_TEXT_REPLACEMENTS),
                       '(.{50}|.+).+')
    obj_content = CleanText('./div/p', replace=_TEXT_REPLACEMENTS)
    obj_sender = Regexp(Link('./div/div/small/a', default=''),
                        '/(.+)/status/.+', default=None)
    obj_date = DatetimeFromTimestamp(
        Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span',
             'data-time'))

    def validate(self, obj):
        """Reject the entries for which no id could be extracted."""
        has_id = obj.id is not None
        return has_id
class item(ItemElement):
    """Build an ``Investment`` from one table row."""

    klass = Investment

    def condition(self):
        """Skip the rows whose quantity is not available."""
        quantity = Field('quantity')(self)
        return quantity is not NotAvailable

    obj_label = CleanText('./th')
    obj_quantity = CleanDecimal(TableCell('quantity'), default=NotAvailable)
    obj_unitvalue = CleanDecimal(TableCell('unitvalue'))
    obj_valuation = CleanDecimal(TableCell('valuation'))
    # the cell value is divided by 100
    obj_portfolio_share = Eval(
        lambda percent: percent / 100,
        CleanDecimal(TableCell('portfolio_share')))
    # the code comes from the "isin=" parameter or the PDF name of the link
    obj_code = Regexp(Link('./th/a'), r'isin=(\w+)|/(\w+)\.pdf')
    obj_code_type = Investment.CODE_TYPE_ISIN
class item(ItemElement):
    """Build a ``Transaction`` whose amount is the sum of its investments."""

    klass = Transaction

    obj_date = Date(
        Regexp(CleanText('.//div[1]'), r'(\d{2}\/\d{2}\/\d{4})'),
        dayfirst=True)
    # label: preceding <h3> title followed by the date of the operation
    obj_label = Format(
        '%s %s',
        CleanText('./preceding::h3[1]'),
        Regexp(CleanText('./div[1]'), r'(\d{2}\/\d{2}\/\d{4})'))

    def obj_amount(self):
        """Return the sum of the valuations of the investments."""
        return sum(investment.valuation for investment in Field('investments')(self))

    def obj_investments(self):
        """Return one ``Investment`` per row of the table following the item."""

        def build(row):
            investment = Investment()
            investment.label = CleanText('./td[1]')(row)
            # the amount is read from the <p> of the cell when there is one
            investment.valuation = Coalesce(
                CleanDecimal.French('./td[2]/p', default=NotAvailable),
                CleanDecimal.French('./td[2]'))(row)
            return investment

        rows = self.xpath('./following-sibling::div[1]//tbody/tr')
        return [build(row) for row in rows]
class item(ItemElement):
    """Build a ``Housing`` object from one offer of the results list."""

    klass = Housing

    # id: part of env['type'] before the last "-", then the offer number
    obj_id = Format(
        '%s-%s',
        Regexp(Env('type'), '(.*)-.*'),
        CleanText('./@id', replace=[('header-offer-', '')]))
    obj_title = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-type"]/span/@title'
    )
    obj_area = CleanDecimal(
        CleanText(
            './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h3/a/span[@class="offer-area-number"]',
            default=NotAvailable))
    obj_cost = CleanDecimal(
        Regexp(
            CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
                default=NotAvailable),
            '(.*) [%s%s%s]' % (u'€', u'$', u'£'),
            default=NotAvailable),
        default=Decimal(0))
    obj_currency = Regexp(
        CleanText(
            './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
            default=NotAvailable),
        '.* ([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    obj_date = Date(
        Regexp(
            CleanText(
                './div/div/div[has-class("offer-picture-more")]/div/p[@class="offer-update"]'
            ),
            r".*(\d{2}/\d{2}/\d{4}).*"))
    obj_text = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/div/p[has-class("offer-description")]/span'
    )
    obj_location = CleanText(
        './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h2'
    )
def iter_investments(self, account):
    """Yield one ``Investment`` per row returned by ``iter_invest_rows``.

    :param account: account the investments belong to; it is stored on each
                    investment as ``_account``.
    """
    ratio_pattern = r'([+-]?[\d\s]+[\d,]+)\s*%'

    for row, elem_repartition, elem_pocket, elem_diff in self.iter_invest_rows(account=account):
        inv = Investment()
        inv._account = account
        inv._el_pocket = elem_pocket
        inv.label = CleanText('.//td[1]')(row)

        _url = Link('.//td[1]/a', default=None)(row)
        # If _url is None, self.absurl returns the BASEURL, so we need to set the value manually.
        inv._url = self.absurl(_url) if _url else None

        inv.valuation = MyDecimal('.//td[2]')(row)

        # On all Cmes children the row shows percentages and the popup shows absolute values in currency.
        # On Cmes it is mirrored, the popup contains the percentage.
        is_mirrored = '%' in row.text_content()

        if not is_mirrored:
            inv.diff = MyDecimal('.//td[3]')(row)
            if elem_diff is not None:
                inv.diff_ratio = Eval(
                    lambda x: x / 100,
                    MyDecimal(Regexp(CleanText('.'), ratio_pattern)))(elem_diff)
        elif elem_diff is not None:
            # The popup element is only read once it is known to exist; it
            # used to be read before the None check.
            inv.diff = MyDecimal('.')(elem_diff)
            inv.diff_ratio = Eval(
                lambda x: x / 100,
                MyDecimal(Regexp(CleanText('.//td[3]'), ratio_pattern)))(row)

        yield inv
class item(ItemElement):
    """Build an ``Advisor`` from the ``data`` entry of the JSON document."""

    klass = Advisor

    obj_name = Format('%s %s %s', Dict('data/civilite'), Dict('data/prenom'), Dict('data/nom'))
    # Raw string: "\w" is an invalid escape sequence in a normal literal.
    # The lookahead makes the match start on the first word character.
    obj_email = Regexp(Dict('data/mail'), r'(?=\w)(.*)', default=NotAvailable)
    # spaces are removed from the phone numbers
    obj_phone = CleanText(Dict('data/telephone'), replace=[(' ', '')])
    obj_mobile = CleanText(Dict('data/mobile'), replace=[(' ', '')])
    obj_fax = CleanText(Dict('data/fax'), replace=[(' ', '')])
    obj_agency = Dict('data/agence')
    obj_address = Format('%s %s %s',
                         Dict('data/adresseAgence'),
                         Dict('data/codePostalAgence'),
                         Dict('data/villeAgence'))
class item(ItemElement):
    """Build a ``Housing`` object from one flat-sharing offer of the list."""

    klass = Housing

    obj_id = Format(
        'colocation-%s',
        CleanText('./div/header/@id', replace=[('header-offer-', '')]))
    obj_title = CleanText(
        CleanHTML(
            './div/header/section/p[@class="property-type"]/span/@title'
        ))
    obj_area = CleanDecimal(
        './div/header/section/p[@class="offer-attributes"]/a/span[@class="offer-area-number"]',
        default=0)
    obj_cost = CleanDecimal('./div/header/section/p[@class="price"]',
                            default=0)
    # currency symbol found in the price, euro by default
    obj_currency = Regexp(
        CleanText('./div/header/section/p[@class="price"]',
                  default=NotAvailable),
        '.* ([%s%s%s])' % (u'€', u'$', u'£'),
        default=u'€')
    obj_text = CleanText(
        './div/div[@class="content-offer"]/section[has-class("content-desc")]/p/span[has-class("offer-text")]/@title'
    )
    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    obj_date = Date(
        Regexp(
            CleanText(
                './div/header/section/p[has-class("update-date")]'),
            r".*(\d{2}/\d{2}/\d{4}).*"))
    obj_location = CleanText(
        '(./div/div[@class="content-offer"]/section[has-class("content-desc")]/p)[1]/span/@title'
    )
def obj_type(self):
    """Return the account type matching the label or, failing that, the section.

    The keys of ``types`` are regular expressions searched in the
    lower-cased text. The section title is only read when the label matched
    nothing.

    :return: an ``Account.TYPE_*`` constant (``TYPE_UNKNOWN`` if no match).
    """
    types = {
        'comptes? bancaires?': Account.TYPE_CHECKING,
        "plan d'epargne populaire": Account.TYPE_SAVINGS,
        'livrets?': Account.TYPE_SAVINGS,
        'epargnes? logement': Account.TYPE_SAVINGS,
        "autres produits d'epargne": Account.TYPE_SAVINGS,
        'compte relais': Account.TYPE_SAVINGS,
        'comptes? titres? et pea': Account.TYPE_MARKET,
        'compte-titres': Account.TYPE_MARKET,
        'assurances? vie': Account.TYPE_LIFE_INSURANCE,
        'prêt': Account.TYPE_LOAN,
        'crédits?': Account.TYPE_LOAN,
        'plan d\'epargne en actions': Account.TYPE_PEA,
        'comptes? attente': Account.TYPE_CHECKING,
        'perp': Account.TYPE_PERP,
        'assurances? retraite': Account.TYPE_PERP,
    }
    no_match = object()

    def first_match(text):
        # match with/without plural in type
        return next(
            (atype for atypetxt, atype in types.items()
             if re.findall(atypetxt, text.lower())),
            no_match)

    # first trying to match with label
    by_label = first_match(Field('label')(self))
    if by_label is not no_match:
        return by_label

    # then by type
    section_title = Regexp(
        CleanText('../../preceding-sibling::div[@class="avoirs"][1]/span[1]'),
        r'(\d+) (.*)', '\\2')(self)
    by_section = first_match(section_title)
    if by_section is not no_match:
        return by_section

    return Account.TYPE_UNKNOWN
class item(ItemElement):
    """Build an ``Account`` from one row of the summary table."""

    klass = Account

    # account type by first word of the label
    TYPE = {
        'Livret': Account.TYPE_SAVINGS,
        'Compte': Account.TYPE_CHECKING,
        'PEA': Account.TYPE_PEA,
        'PEA-PME': Account.TYPE_PEA,
        'Compte-titres': Account.TYPE_MARKET,
        'Assurance-vie': Account.TYPE_LIFE_INSURANCE,
        'Crédit': Account.TYPE_LOAN,
    }

    # id: first run of digits in the title/text blocks
    obj_id = CleanText('./td//div[contains(@class, "-synthese-title") or contains(@class, "-synthese-text")]') \
        & Regexp(pattern=r'(\d+)')
    obj_label = CleanText('./td//div[contains(@class, "-synthese-title")]')
    obj_balance = MyDecimal('./td//div[contains(@class, "-synthese-num")]',
                            replace_dots=True)
    obj_currency = FrenchTransaction.Currency(
        './td//div[contains(@class, "-synthese-num")]')
    obj_type = Map(Regexp(Field('label'), r'^([^ ]*)'), TYPE,
                   default=Account.TYPE_UNKNOWN)

    def obj_url(self):
        """Return the ``data-href`` of the row resolved against the page URL."""
        relative = CleanText('./@data-href')(self)
        return urljoin(self.page.url, relative)

    obj__card_balance = CleanDecimal(
        './td//div[@class="synthese-encours"][last()]/div[2]',
        default=None)

    def condition(self):
        """Skip the rows that contain a ``chart`` cell."""
        chart_cells = self.el.xpath('./td[@class="chart"]')
        return not chart_cells
def __init__(self, page):
    """Download the keyboard image of ``page`` and map every button to a cell.

    The image URL is searched in the styles, then in the scripts, then in
    the ``imageCVS`` image (a smaller image size is used in that last case).
    Cells are laid out left to right, four per row.
    """
    img_url = (
        Regexp(CleanText('//style'), r'background:url\((.*?)\)', default=None)(page.doc)
        or Regexp(CleanText('//script'), r'IMG_ALL = "(.*?)"', default=None)(page.doc)
    )
    size = 252
    if not img_url:
        img_url = page.doc.xpath('//img[@id="imageCVS"]')[0].attrib['src']
        size = 146

    cell_width = cell_height = size // 4
    left, top = 0, 0
    coords = {}
    buttons = page.doc.xpath('//div[@id="imageclavier"]//button')
    for position, _button in enumerate(buttons):
        coords['%02d' % position] = (
            left + 4,
            top + 4,
            left + cell_width - 8,
            top + cell_height - 8,
        )
        if (left + cell_width + 1) < size:
            # next cell on the same row
            left += cell_width + 1
        else:
            # row is full: go back to the first column of the next row
            top += cell_height + 1
            left = 0

    data = page.browser.open(img_url).content
    VirtKeyboard.__init__(self, BytesIO(data), coords, self.color)
    self.check_symbols(self.symbols, page.browser.responses_dirname)
class get_profile(ItemElement):
    """Build a ``Person`` from the personal details form."""

    klass = Person

    # each value is the own text of the cell holding the matching <strong>
    obj_email = CleanText(
        '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "e-mail")]/parent::td',
        children=False)
    obj_phone = CleanText(
        '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "mobile")]/parent::td',
        children=False)
    # Raw string: "\/" is an invalid escape sequence in a normal literal.
    # Only the text before the first "/" is kept.
    obj_address = Regexp(
        CleanText(
            '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "adresse fiscale")]/parent::td',
            children=False),
        r'^(.*?)\/')
class get_video(ItemElement):
    """Build a ``BaseVideo`` from the JSON assigned to ``window.initials``."""

    klass = BaseVideo

    obj_nsfw = True
    obj_ext = 'mp4'
    obj_title = Attr('//meta[@property="og:title"]', 'content')
    obj_id = Env('id')
    # JSON object found in the inline script, decoded once and reused below
    obj__props = Eval(
        json.loads,
        Regexp(
            RawText('//script[contains(text(),"window.initials =")]'),
            r'window.initials = (.*);\n'))
    obj_duration = Base(Field('_props'), Dict('videoModel/duration'))
    obj_url = Base(Field('_props'), Dict('videoModel/mp4File'))

    def obj__page(self):
        """Return the URL of the page the video was parsed from."""
        page_url = self.page.url
        return page_url
class get_job_advert(ItemElement):
    """Build a ``BaseJobAdvert`` from an advert detail page."""

    klass = BaseJobAdvert

    obj_id = Env('id')
    obj_url = BrowserURL('advert_page', id=Env('id'))
    obj_title = CleanText('//title')
    obj_job_name = CleanText('//title')
    obj_society_name = CleanText('//div[2]/div[@class="col-md-9"]/h4[1]')
    # the "Ajoutée le" prefix is removed before parsing the French date
    obj_publication_date = Date(
        CleanText('//div[2]/div[@class="col-md-9"]/small',
                  replace=[(u'Ajoutée le', '')]),
        parse_func=parse_french_date)
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal
    # literal. The trailing parenthesised part is dropped from the place.
    obj_place = Regexp(CleanText('//div[2]/div[@class="col-md-9"]/h4[2]'),
                       r'(.*) \(.*\)')
    obj_description = CleanHTML('//div[4]/div[@class="col-md-9"]')
class item(ItemElement):
    """Build a ``Bill`` from one entry of the JSON list of operations."""

    klass = Bill

    def obj_url(self):
        """Return the PDF path found in ``sOperation``, made absolute."""
        return urljoin(self.page.url, Regexp(Dict('sOperation'), r'"(/.*\.pdf)')(self))

    # Bill number taken from the file name. The dot before "pdf" is escaped
    # so that it only matches a literal ".".
    _num = Regexp(Field('url'), r'facture_(\d+)\.pdf')

    obj_id = Format('%s_%s', Env('subid'), _num)
    obj_date = Eval(datetime.fromtimestamp, Dict('sTimestamp'))
    obj_label = Format('Facture %s', _num)
    obj_price = CleanDecimal(Dict('fMontant'))
    obj_currency = Currency(Dict('sMontant'))
    obj_type = 'bill'
    obj_format = 'pdf'
class item(ItemElement):
    """Build a ``Transaction`` from one row of the operation detail table."""

    klass = Transaction

    obj_amount = MyDecimal('./th[@scope="rowgroup"][2]')
    # label and date come from the page-level "smltitle" paragraphs
    obj_label = CleanText('(//p[contains(@id, "smltitle")])[2]')
    obj_raw = Transaction.Raw(Field('label'))
    obj_date = Date(
        Regexp(CleanText('(//p[contains(@id, "smltitle")])[1]'),
               r'(\d{1,2}/\d{1,2}/\d+)'),
        dayfirst=True)

    def obj__account_label(self):
        """Return the account label, translated by ``ACCOUNTS_SPE_LABELS``.

        The label found in the row is returned unchanged when the page table
        has no entry for it.
        """
        row_label = CleanText('./th[@scope="rowgroup"][1]')(self)
        special_labels = self.page.ACCOUNTS_SPE_LABELS
        return special_labels.get(row_label, row_label)
class get_transfer(ItemElement):
    """Build a ``Transfer`` from the transfer summary page."""

    klass = Transfer

    obj_amount = CleanDecimal('//p[@class="tabTxt tabTxt2"]/strong[1]',
                              replace_dots=True)
    obj_exec_date = Date(CleanText('//p[@class="tabTxt tabTxt2"]/strong[2]'),
                         dayfirst=True)
    # the label is displayed between French quotation marks
    obj_label = Regexp(CleanText('//p[@class="tabTxt tabTxt2"]/strong[3]'),
                       u'« (.*) »')
    obj_account_id = Regexp(
        CleanText('//div[@class="transAction"]/div[@class="inner"]/div[@class="first"]//small'),
        r'N°(\w+)')
    obj_recipient_id = Regexp(
        CleanText('//div[@class="transAction"]/div[@class="inner"]/div[not(@class="first")]//small'),
        r'N°(\w+)',
        default=None)

    def obj_recipient_iban(self):
        """Return the IBAN of the recipient when it has no account number.

        None is returned when ``recipient_id`` could be extracted.
        """
        if Field('recipient_id')(self) is not None:
            return None

        displayed = CleanText(
            '//div[@class="transAction"]/div[@class="inner"]/div[not(@class="first")]//span[@class="tabTxt"]'
        )(self)
        return displayed.replace(' ', '')
class item(ItemElement):
    """Build a ``BaseJobAdvert`` from one row of the search results."""

    klass = BaseJobAdvert

    # Raw string: "\?" is an invalid escape sequence in a normal literal.
    # The id is "<d or empty>#<path>", both parts taken from the job link.
    obj_id = Regexp(
        Link('./td/div/div[@class="jobTitleContainer"]/a'),
        r'http://offre-(d?)emploi.monster.fr:80/(.*?)(.aspx|\?).*',
        '\\1#\\2')
    obj_society_name = CleanText(
        './td/div/div[@class="companyContainer"]/div/a')
    obj_title = CleanText('./td/div/div[@class="jobTitleContainer"]/a')
    obj_publication_date = MonsterDate(
        CleanText('td/div/div[@class="fnt20"]'))
    obj_place = CleanText(
        './td/div/div[@class="jobLocationSingleLine"]/a/@title',
        default=NotAvailable)
class get_video(ItemElement):
    """Build a ``YoupornVideo`` from a video page."""

    klass = YoupornVideo

    obj_author = CleanText('//div[has-class("submitByLink")]')
    # NOTE: date extraction is disabled; the former filter was
    # Date('//div[@id="stats-date"]')
    obj_duration = NotAvailable
    obj_ext = 'mp4'
    obj_id = Env('id')
    # the rating is the integer percentage displayed on the page
    obj_rating = CleanText('//div[@class="videoRatingPercentage"]') \
        & Regexp(pattern=r'(\d+)%') \
        & Type(type=int)
    obj_rating_max = 100
    obj_thumbnail = NotAvailable
    obj_title = CleanText('//h1')

    def obj_url(self):
        """Return the JSON-decoded ``videoUrl`` value found in the page text."""
        quoted_url = re.search('videoUrl":(".*?")', self.page.text).group(1)
        return loads(quoted_url)
def populate(self, accounts):
    """Complete ``accounts`` from the navigation menu and append the cards.

    For every account, each link of every ``li.nav-category`` entry is read:

    * a link whose label contains ``CARTE`` and whose balance is truthy
      becomes a new card ``Account``, added to ``cards`` if not already there;
    * a link whose label and balance equal those of the current account
      fills that account's ``type`` (when unset), ``_history_page`` and
      ``_webid``.

    ``accounts`` is modified in place (the cards are appended at the end);
    nothing is returned.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source -- confirm that `account._webid` belongs to the whole matching
    # branch and not only to its final `else`.
    cards = []
    for account in accounts:
        for li in self.doc.xpath('//li[@class="nav-category"]'):
            # title of the menu category, used to look the account type up
            title = CleanText().filter(li.xpath('./h3'))
            for a in li.xpath('./ul/li//a'):
                label = CleanText().filter(
                    a.xpath('.//span[@class="nav-category__name"]'))
                balance_el = a.xpath(
                    './/span[@class="nav-category__value"]')
                balance = CleanDecimal(
                    replace_dots=True,
                    default=NotAvailable).filter(balance_el)
                if 'CARTE' in label and balance:
                    acc = Account()
                    acc.balance = balance
                    acc.label = label
                    acc.currency = FrenchTransaction.Currency().filter(
                        balance_el)
                    acc._link = Link().filter(a.xpath('.'))
                    acc._history_page = acc._link
                    # id: what follows the last "=" of the link
                    acc.id = acc._webid = Regexp(
                        pattern='([^=]+)$').filter(Link().filter(
                            a.xpath('.')))
                    acc.type = Account.TYPE_CARD
                    # the menu is read again for every account, so a card
                    # may already have been collected
                    if not acc in cards:
                        cards.append(acc)
                elif account.label == label and account.balance == balance:
                    if not account.type:
                        account.type = AccountsPage.ACCOUNT_TYPES.get(
                            title, Account.TYPE_UNKNOWN)
                    if account.type == Account.TYPE_LOAN:
                        account._history_page = None
                    elif account.type in (Account.TYPE_LIFE_INSURANCE,
                                          Account.TYPE_MARKET):
                        # the link without its trailing slash
                        account._history_page = re.sub(
                            '/$', '', Link().filter(a.xpath('.')))
                    elif '/compte/cav' in a.attrib[
                            'href'] or not 'titulaire' in self.url:
                        account._history_page = self.browser.other_transactions
                    else:
                        account._history_page = self.browser.budget_transactions
                    account._webid = Attr(
                        None, 'data-account-label').filter(
                            a.xpath(
                                './/span[@class="nav-category__name"]'))
    accounts.extend(cards)
def get_list(self):
    """Return the accounts listed in the ``ecli`` table of the page.

    Header rows (class ``entete``) set the type applied to the rows that
    follow them. Rows whose first cell holds no link are skipped.

    :return: list of ``Account`` objects, in table order.
    """
    current_type = Account.TYPE_UNKNOWN
    found = []

    rows = self.doc.xpath('//div[@class="finance"]/form/table[@class="ecli"]/tr')
    for tr in rows:
        if tr.attrib.get('class', '') == 'entete':
            header = tr.find('th').text.strip()
            current_type = self.ACCOUNT_TYPES.get(header, Account.TYPE_UNKNOWN)
            continue

        tds = tr.findall('td')
        a = tds[0].find('a')

        # Skip accounts that can't be accessed
        if a is None:
            continue

        balance = tds[-1].text.strip()

        account = Account()
        # all the text pieces of the first cell, with whitespace and bullets
        # collapsed to single spaces
        account.label = u' '.join(txt.strip() for txt in tds[0].itertext())
        account.label = re.sub(u'[ \xa0\u2022\r\n\t]+', u' ', account.label).strip()
        account.id = Regexp(pattern=u'N° ((.*?) |(.*))').filter(account.label).strip()
        account.type = current_type

        if balance:
            account.balance = Decimal(FrenchTransaction.clean_amount(balance))
            account.currency = account.get_currency(balance)

        if 'onclick' not in a.attrib:
            account._link = a.attrib['href'].strip()
        else:
            m = re.search(r"javascript:submitForm\(([\w_]+),'([^']+)'\);", a.attrib['onclick'])
            if m:
                account._link = m.group(2)
            else:
                self.logger.warning('Unable to find link for %r', account.label)
                account._link = None

        found.append(account)

    return found
def get_not_rounded_valuations(self):
    """Return the valuation of every investment, keyed by its link text.

    All the pages of the detail table are fetched when a page count is
    displayed; otherwise only the current page is read.

    :return: dict mapping the text of the first-cell link to the decimal
             found in the seventh cell of the row.
    """

    def prepare_url(url, fields):
        """Return ``url`` with the query parameters of ``fields`` overridden."""
        components = urlparse(url)
        # dict membership and items() work on Python 2 and 3, unlike
        # iterkeys()/iteritems()
        query_pairs = [(f, v) for (f, v) in parse_qsl(components.query) if f not in fields]
        query_pairs.extend(fields.items())

        new_query_str = urlencode(query_pairs)
        new_components = (
            components.scheme,
            components.netloc,
            components.path,
            components.params,
            new_query_str,
            components.fragment
        )
        return urlunparse(new_components)

    try:
        # range() needs an int, not the Decimal returned by CleanDecimal
        page_count = int(CleanDecimal(
            Regexp(
                CleanText(u'(//table[form[contains(@name, "detailCompteTitresForm")]]//tr[1])[1]/td[3]/text()'),
                r'\/(.*)'))(self.doc))
    except RegexpError:
        # no multiple page
        pages = [self]
    else:
        pages = [
            self.browser.open(
                prepare_url(self.browser.url, {
                    'action': '11',
                    'idCptSelect': '1',
                    'numPage': i
                })).page
            for i in range(1, page_count + 1)
        ]

    not_rounded_valuations = {}
    for page in pages:
        for inv in page.doc.xpath(u'//table[contains(., "Détail du compte")]//tr[2]//table/tr[position() > 1]'):
            if len(inv.xpath('.//td')) > 2:
                valuation = CleanDecimal('.//td[7]/text()', replace_dots=True)(inv)
                not_rounded_valuations[CleanText('.//td[1]/a/text()')(inv)] = valuation

    return not_rounded_valuations
def parse(self, el):
    """Load the first product of the ``ava_data`` JavaScript object.

    The object literal is turned into JSON by quoting its bare keys and
    dropping a JavaScript comment; the result is stored in
    ``self.house_json_datas``.
    """
    raw_object = Regexp(CleanText('//script'), "var ava_data = ({.+?});")(self)

    # quote the bare keys, in the same order as they are listed here
    for bare_key in ("logged", "lengthcarrousel", "products"):
        raw_object = raw_object.replace(bare_key, '"%s"' % bare_key)

    raw_object = raw_object.replace("// // ANNONCES_SIMILAIRE / RECO", "")

    decoded = json.loads(raw_object)
    self.house_json_datas = decoded['products'][0]
class item(ItemElement):
    """Build a ``SongLyrics`` (without its content) from one song link."""

    klass = SongLyrics

    obj_title = CleanText('.', default=NotAvailable)
    # artist: text following "Paroles " in the current breadcrumb entry
    obj_artist = Regexp(
        CleanText('//div[has-class("breadcrumb")]//span[has-class("breadcrumb-current")]'),
        'Paroles (.*)')
    obj_content = NotLoaded

    def obj_id(self):
        """Return ``<artist part>|<song part>`` built from the link.

        The two parts are the last two path segments of the ``href``; the
        ``paroles-`` text is removed from the last one.
        """
        segments = CleanText('./@href')(self).split('/')
        artist_part = segments[-2]
        song_part = segments[-1].replace('paroles-', '')
        return '%s|%s' % (artist_part, song_part)
def parse(self, el):
    """Fill ``self.env`` with ``unitvalue``, ``vdate``, ``code`` and ``code_type``.

    The unit value and its date are read from the spans of the label cell.
    The code is looked up on the fund page reached through the operation
    form, when the row holds both a fund detail link and a PDF link;
    ``code`` and ``code_type`` are NotAvailable whenever it cannot be found.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source -- confirm in particular that the final `browser.open(url)` runs
    # for every remaining URL and not only for the swisslife-am one.

    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), '^([\d,]+)$', default=None)(span)
        if vdate is None:
            # a date found in a maturity block is not a valuation date
            vdate = None if any(x in CleanText('./parent::div')(span) for x in [u"échéance", "Maturity"]) else \
                Regexp(CleanText('.'), '^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        # submit the operation form for this fund
        form = self.page.get_form('//div[@id="operation"]//form')
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if "hsbc.fr" in self.page.browser.BASEURL:
            # special space for HSBC
            m = re.search('fundid=(\w+).+SH=(\w+)', CleanText('//complete', default="")(page.doc))

            if m:
                # had to put full url to skip redirections.
                page = page.browser.open('https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()).page

        elif "consulteroperations" not in self.page.browser.url:
            # not on history
            url = Regexp(CleanText('//complete'), r"openUrlFichesFonds\('(.*?)',true|false\).*", default=NotAvailable)(page.doc)

            if url is NotAvailable:
                # redirection to a useless graphplot page with url like /portal/salarie-sg/fichefonds?idFonds=XXX&source=/portal/salarie-sg/monepargne/mesavoirs
                # or on bnp, look for plot display function in a script
                assert CleanText('//redirect/@url')(page.doc) or CleanText('//script[contains(text(), "afficherGraphique")]')(page.doc)
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            useless_urls = (
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                'http://docfinder.is.bnpparibas-ip.com/',
                # Redirection to a useless page with url like "https://epargne-salariale.axa-im.fr/fr/"
                'https://epargne-salariale.axa-im.fr/fr/',
            )

            for useless_url in useless_urls:
                if url.startswith(useless_url):
                    self.env['code'] = NotAvailable
                    self.env['code_type'] = NotAvailable
                    return

            # on these URLs the code is part of the address itself
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                self.env['code_type'] = Investment.CODE_TYPE_ISIN
                return

            if url.startswith('http://fr.swisslife-am.com/fr/'):
                # cookies set before opening this website
                self.page.browser.session.cookies.set('location', 'fr')
                self.page.browser.session.cookies.set('prof', 'undefined')

            page = self.page.browser.open(url).page

    try:
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
    # Handle page is None and page has not get_code method
    except AttributeError:
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
def iter_payment_details(self, sub):
    """Yield one ``Detail`` per non-empty row of every benefit table.

    Nothing is yielded when the page heading does not match the expected
    "le <date> pour un montant de" sentence. The ids are
    ``<sub._id>.<YYYYMMDD>.<line number>``, the line number restarting at 1
    for each beneficiary.

    :param sub: object whose ``_id`` attribute prefixes the detail ids.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source -- confirm the position of the `det.price` assignments and of the
    # final `line`/`yield` statements.
    id_str = self.doc.xpath('//div[@class="entete container"]/h2')[0].text.strip()
    m = re.match('.*le (.*) pour un montant de.*', id_str)
    if m:
        blocs_benes = self.doc.xpath('//span[contains(@id,"nomBeneficiaire")]')
        blocs_prestas = self.doc.xpath('//table[@id="tableauPrestation"]')
        i = 0
        last_bloc = len(blocs_benes)
        # the beneficiary at index i is read together with the table at index i
        for i in range(0, last_bloc):
            bene = blocs_benes[i].text;
            id_str = m.group(1)
            id_date = datetime.strptime(id_str, '%d/%m/%Y').date()
            id = sub._id + "." + datetime.strftime(id_date, "%Y%m%d")
            table = blocs_prestas[i].xpath('.//tr')
            line = 1
            last_date = None
            for tr in table:
                tds = tr.xpath('.//td')
                if len(tds) == 0:
                    continue

                det = Detail()

                # TO TEST: daily allowances ("indemnités journalières") -- no similar case could be tested on the new version of the website
                if len(tds) == 4:
                    date_str = Regexp(pattern=r'.*<br/>(\d+/\d+/\d+)\).*').filter(tds[0].text)
                    det.id = id + "." + str(line)
                    det.label = tds[0].xpath('.//span')[0].text.strip()

                    # empty cells count as zero
                    jours = tds[1].text
                    if jours is None:
                        jours = '0'

                    montant = tds[2].text
                    if montant is None:
                        montant = '0'

                    price = tds[3].text
                    if price is None:
                        price = '0'

                    if date_str is None or date_str == '':
                        # no date on this row: reuse the last one seen
                        det.infos = u''
                        det.datetime = last_date
                    else:
                        det.infos = '%s (%sj) * %s€' % (date_str, re.sub(r'[^\d,-]+', '', jours), re.sub(r'[^\d,-]+', '', montant))
                        det.datetime = datetime.strptime(date_str.split(' ')[3], '%d/%m/%Y').date()
                        last_date = det.datetime
                    det.price = Decimal(re.sub('[^\d,-]+', '', price).replace(',', '.'))

                if len(tds) == 5:
                    date_str = Regexp(pattern=r'\w*(\d{2})/(\d{2})/(\d{4}).*', template='\\1/\\2/\\3', default="").filter("".join(tds[0].itertext()))
                    det.id = id + "." + str(line)
                    det.label = '%s - %s' % (bene, tds[0].xpath('.//span')[0].text.strip())

                    # empty cells count as zero
                    paye = tds[1].text
                    if paye is None:
                        paye = '0'

                    base = tds[2].text
                    if base is None:
                        base = '0'

                    tdtaux = tds[3].xpath('.//span')[0].text
                    if tdtaux is None:
                        taux = '0'
                    else:
                        taux = tdtaux.strip()

                    tdprice = tds[4].xpath('.//span')[0].text
                    if tdprice is None:
                        price = '0'
                    else:
                        price = tdprice.strip()

                    if date_str is None or date_str == '':
                        # no date on this row: reuse the last one seen
                        det.infos = u''
                        det.datetime = last_date
                    else:
                        det.infos = u' Payé %s€ / Base %s€ / Taux %s%%' % (re.sub(r'[^\d,-]+', '', paye), re.sub(r'[^\d,-]+', '', base), re.sub('[^\d,-]+', '', taux))
                        det.datetime = datetime.strptime(date_str, '%d/%m/%Y').date()
                        last_date = det.datetime
                    det.price = Decimal(re.sub('[^\d,-]+', '', price).replace(',', '.'))
                line = line + 1
                yield det
def obj_label(self):
    """Return the card label without its trailing `` -xxx`` part.

    Any dash left at the end of the captured text is removed, then the
    trailing whitespace.
    """
    extract = Regexp(
        CleanText('.//div[@class="c-card-ghost__top-label"]'),
        pattern=r'^(.*?)(?: -[^-]*)?$')
    captured = extract(self)
    without_dashes = captured.rstrip('-')
    return without_dashes.rstrip()
def obj_split_path(self):
    """Return ``[<creative site id>, <link path>]`` for this entry.

    The path is what follows the two-character first segment of the
    ``href`` (``accueil`` when the link has no such part), with every ``/``
    replaced by ``^``.
    """
    # Raw string: "\w" is an invalid escape sequence in a normal literal.
    _id = Regexp(CleanText('./@href'), r'/\w{2}/(.*)', default=u'accueil')(self)
    return [SITE.CREATIVE.get('id'), _id.replace('/', '^')]
def get_params(self):
    """Return the Algolia application id and API key found in the scripts.

    Both captured values are joined with ``|`` by the filter template, then
    split again on ``|``.

    :return: list of strings.
    """
    pattern = '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*'
    joined = Regexp(CleanText('//script'), pattern, '\\1|\\2')(self.doc)
    return joined.split('|')
def obj_url(self):
    """Return the torrent download URL without its query string, in https."""
    # Raw string: "\?" is an invalid escape sequence in a normal literal.
    url = Regexp(
        AbsoluteLink('//div[has-class("torrentinfo")]//div[has-class("dltorrent")]//a[text()="Download torrent"]'),
        r'(^.*)\?.*', '\\1')(self)
    return url.replace('http://', 'https://')
def obj_url(self):
    """Return the URL of the torrent entry without its query string, in https."""
    # Raw string: "\?" is an invalid escape sequence in a normal literal.
    url = Regexp(
        AbsoluteLink('.//div[has-class("tt-name")]/a[1]'),
        r'(^.*)\?.*', '\\1')(self)
    return url.replace('http://', 'https://')
def obj_rdate(self):
    """Return the real date found in the raw label, else the ``date`` field.

    The raw label is searched for ``dd/mm/yy`` or for six digits; the digits
    are parsed day first.
    """
    # Raw string: "\d" is an invalid escape sequence in a normal literal.
    s = Regexp(Field('raw'), r' (\d{2}/\d{2}/\d{2}) | (?!NUM) (\d{6}) ', default=NotAvailable)(self)
    if not s:
        return Field('date')(self)

    digits = s.replace('/', '')
    return Date(dayfirst=True).filter('%s-%s-%s' % (digits[:2], digits[2:4], digits[4:]))
def obj_split_path(self):
    """Return the cinema site id followed by the segments of the link path.

    The path is what follows the two-character first segment of the
    ``href`` of the child link.
    """
    # Raw string: "\w" is an invalid escape sequence in a normal literal.
    _id = Regexp(CleanText('./a/@href'), r'/\w{2}/(.*)')(self)
    return [SITE.CINEMA.get('id')] + _id.split('/')