def filter(self, url):
    """Return the single value of the ``self.querykey`` parameter of *url*.

    The query string of *url* is parsed; a missing (or empty) key is handed
    to ``self.default_or_raise`` wrapped in a ``ParseError``.

    Raises:
        ParseError: when the key appears more than once in the query string.
    """
    key = self.querykey
    values = parse_qs(urlparse(url).query).get(key)
    if not values:
        return self.default_or_raise(ParseError('Key %s not found' % key))
    if len(values) > 1:
        raise ParseError('More than one value for key %s' % key)
    return values[0]
def filter(self, txt):
    """Convert *txt* with ``self.type_func``.

    Empty input, input whose length does not exceed ``self.minlen`` (unless
    ``self.minlen`` is ``False``) and a ``ValueError`` raised by the
    conversion are all reported through ``self.default_or_raise`` with a
    ``ParseError``.
    """
    rejected = empty(txt) or (
        self.minlen is not False and len(txt) <= self.minlen
    )
    if rejected:
        return self.default_or_raise(ParseError('Unable to parse %r' % txt))

    try:
        converted = self.type_func(txt)
    except ValueError as e:
        return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
    return converted
def iter_accounts(self):
    """Yield one ``Account`` per account row of the statement page.

    Logs in first when ``self.islogged`` is false.  When the downloaded page
    has no top-level table and its title says the user is not identified,
    the login is replayed and the page is fetched one more time.

    Raises:
        ParseError: when no top-level table is found (after the single
            retry, or immediately when the page title is anything else).
    """
    releve_url = "https://www.cmb.fr/domiweb/prive/particulier/releve/0-releve.act"

    def load_page():
        # Download the page, parse it as HTML and hand back the tree
        # together with its top-level tables.
        content = self.browser.open(releve_url).content
        document = etree.parse(StringIO(content), etree.HTMLParser())
        return document, document.xpath('/html/body/table')

    if not self.islogged:
        self.login()

    tree, tables = load_page()
    if len(tables) == 0:
        page_title = tree.xpath('/html/head/title')[0].text
        if page_title != u"Utilisateur non identifié":
            raise ParseError()
        # The site no longer recognises the user: log in again, retry once.
        self.login()
        tree, tables = load_page()
        if len(tables) == 0:
            raise ParseError()

    ignored_classes = ('LnTit', 'LnTot', 'LnMnTiers', None)
    for row in tree.xpath('/html/body//table[contains(@class, "Tb")]/tr'):
        if row.get('class', None) in ignored_classes:
            continue

        account = Account()
        cells = row.xpath('td')
        anchor = cells[1].xpath('a')[0]
        account.label = unicode(anchor.text).strip()

        # The three arguments of the javascript link identify the account.
        found = match(r"javascript:releve\((.*),'(.*)','(.*)'\)", anchor.get('href'))
        if not found:
            continue
        valeur, valeur2, cmbtype = found.groups()
        account.id = unicode(valeur + valeur2 + cmbtype)
        account._cmbvaleur = valeur
        account._cmbvaleur2 = valeur2
        account._cmbtype = cmbtype

        balance_text = u''.join(chunk.strip() for chunk in cells[2].itertext())
        account.balance = Decimal(balance_text.replace(',', '.').replace(u"\xa0", ''))

        spans = cells[4].xpath('a/span')
        if len(spans):
            coming_text = spans[0].text.replace(' ', '').replace(',', '.')
            account.coming = Decimal(coming_text.replace(u"\xa0", ''))
        else:
            account.coming = NotAvailable

        yield account
def filter(self, txt):
    """Parse *txt* with ``self.parse_func`` after applying the translations.

    Every ``(search, repl)`` pair of ``self.translations`` is applied with
    ``search.sub`` before parsing.  Empty input, ``ValueError`` and
    ``TypeError`` are reported through ``self.default_or_raise`` with a
    ``ParseError``.
    """
    if empty(txt) or txt == '':
        return self.default_or_raise(ParseError('Unable to parse %r' % txt))

    try:
        for search, repl in (self.translations or ()):
            txt = search.sub(repl, txt)
        parsed = self.parse_func(txt, dayfirst=self.dayfirst, fuzzy=self.fuzzy)
    except (ValueError, TypeError) as e:
        return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
    return parsed
def login(self, username, password):
    """Log in through the Facebook OAuth dialog and store the access token.

    Submits the login form, follows the redirect by hand, confirms the
    authorization form, then stores the token found in the response text in
    ``self.access_token`` and the result of ``self.request('/me')`` in
    ``self.info``.

    Raises:
        BrowserIncorrectPassword: when the form submission yields no
            ``Location`` header, or the next page contains an error block.
        ParseError: when no access token can be found in the final response.
    """
    oauth_url = (
        'https://www.facebook.com/v2.6/dialog/oauth?redirect_uri=fb464891386855067%3A%2F%2Fauthorize%2F&display=touch&state=%7B%22challenge%22%3A%22IUUkEUqIGud332lfu%252BMJhxL4Wlc%253D%22%2C%220_auth_logger_id%22%3A%2230F06532-A1B9-4B10-BB28-B29956C71AB1%22%2C%22com.facebook.sdk_client_state%22%3Atrue%2C%223_method%22%3A%22sfvc_auth%22%7D&scope=user_birthday%2Cuser_photos%2Cuser_education_history%2Cemail%2Cuser_relationship_details%2Cuser_friends%2Cuser_work_history%2Cuser_likes&response_type=token%2Csigned_request&default_audience=friends&return_scopes=true&auth_type=rerequest&client_id='
        + self.CLIENT_ID
        + '&ret=login&sdk=ios&logger_id=30F06532-A1B9-4B10-BB28-B29956C71AB1&ext=1470840777&hash=AeZqkIcf-NEW6vBd'
    )
    self.location(oauth_url)

    login_form = HTMLPage(self, self.response).get_form()
    login_form['email'] = username
    login_form['pass'] = password
    # The redirect is followed manually so its absence can be detected.
    login_form.submit(allow_redirects=False)
    if 'Location' not in self.response.headers:
        raise BrowserIncorrectPassword()
    self.location(self.response.headers['Location'])

    page = HTMLPage(self, self.response)
    if page.doc.xpath('//td/div[has-class("s")]'):
        raise BrowserIncorrectPassword(
            CleanText('//td/div[has-class("s")]')(page.doc))

    confirm_form = page.get_form(nr=0, submit='//input[@name="__CONFIRM__"]')
    confirm_form.submit()

    token = re.search('access_token=([^&]+)&', self.response.text)
    if not token:
        raise ParseError('Unable to find access_token')
    self.access_token = token.group(1)
    self.info = self.request('/me')
def login(self, username, password):
    """Log in through the Facebook OAuth popup and store the access token.

    Submits the login form with the extra header and cookies set below,
    follows the redirect by hand and extracts the token from the first
    ``<script>`` element of the resulting page into ``self.access_token``.

    Raises:
        BrowserIncorrectPassword: when the form submission yields no
            ``Location`` header, or the next page contains an error block.
        ParseError: when no access token can be found, including when the
            page has no ``<script>`` element or the first one is empty.
    """
    self.location('https://www.facebook.com/v2.9/dialog/oauth?app_id=484681304938818&auth_type=rerequest&channel_url=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df33dd8340f36618%26domain%3Dwww.okcupid.com%26origin%3Dhttps%253A%252F%252Fwww.okcupid.com%252Ff5818a5f355be8%26relation%3Dopener&client_id=484681304938818&display=popup&domain=www.okcupid.com&e2e=%7B%7D&fallback_redirect_uri=https%3A%2F%2Fwww.okcupid.com%2Flogin&locale=en_US&origin=1&redirect_uri=https%3A%2F%2Fstaticxx.facebook.com%2Fconnect%2Fxd_arbiter.php%3Fversion%3D44%23cb%3Df2ce4ca90b82cb4%26domain%3Dwww.okcupid.com%26origin%3Dhttps%253A%252F%252Fwww.okcupid.com%252Ff5818a5f355be8%26relation%3Dopener%26frame%3Df3f40f304ac5e9&response_type=token%2Csigned_request&scope=email%2Cuser_birthday%2Cuser_photos&sdk=joey&version=v2.9')
    page = HTMLPage(self, self.response)
    form = page.get_form('//form[@id="login_form"]')
    form['email'] = username
    form['pass'] = password

    self.session.headers['cookie-installing-permission'] = 'required'
    self.session.cookies['wd'] = '640x1033'
    self.session.cookies['act'] = '1563018648141%2F0'

    # The redirect is followed manually so its absence can be detected.
    form.submit(allow_redirects=False)
    if 'Location' not in self.response.headers:
        raise BrowserIncorrectPassword()
    self.location(self.response.headers['Location'])

    page = HTMLPage(self, self.response)
    if len(page.doc.xpath('//td/div[has-class("s")]')) > 0:
        raise BrowserIncorrectPassword(CleanText('//td/div[has-class("s")]')(page.doc))

    # A missing or empty <script> is reported as a ParseError like any other
    # "token not found" case, instead of leaking IndexError/TypeError.
    scripts = page.doc.xpath('//script')
    script = scripts[0].text if scripts else None
    m = re.search('access_token=([^&]+)&', script or '')
    if m:
        self.access_token = m.group(1)
    else:
        raise ParseError('Unable to find access_token')
def get_history(self, account, coming=False):
    """Yield the transactions of *account*, page by page.

    Args:
        account: account to read; must be consultable and not a loan.
        coming: when true only transactions flagged as coming are yielded
            and iteration stops at the first non-coming one; when false
            only non-coming transactions are yielded.

    Raises:
        NotImplementedError: for loan accounts or non-consultable accounts.
        ParseError: when the same transaction id is returned twice.
    """
    if account.type is Account.TYPE_LOAN or not account._consultable:
        raise NotImplementedError()

    if account._univers != self.current_univers:
        self.move_to_univers(account._univers)

    def clean(text):
        # Collapse whitespace runs and drop NUL characters.
        return re.sub(r'\s+', ' ', text).replace('\x00', '')

    page_size = 50  # shared by the request size and the offset step
    today = date.today()
    seen = set()
    offset = 0
    next_page = True
    while next_page:
        operation_list = self._make_api_call(
            account=account,
            start_date=date(day=1, month=1, year=2000),
            end_date=date.today(),
            offset=offset,
            max_length=page_size,
        )

        transactions = []
        for op in reversed(operation_list):
            t = Transaction()
            t.id = op['id']
            if op['id'] in seen:
                raise ParseError('There are several transactions with the same ID, probably an infinite loop')
            seen.add(t.id)

            d = date.fromtimestamp(op.get('dateDebit', op.get('dateOperation'))/1000)
            # sometimes they put "null" elements...
            op['details'] = [clean(i) for i in op['details'] if i]
            label = clean(op['libelle'])
            raw = ' '.join([label] + op['details'])
            vdate = date.fromtimestamp(op.get('dateValeur', op.get('dateDebit', op.get('dateOperation')))/1000)
            t.parse(d, raw, vdate=vdate)
            t.amount = Decimal(str(op['montant']))
            t.rdate = date.fromtimestamp(op.get('dateOperation', op.get('dateDebit'))/1000)
            if 'categorie' in op:
                t.category = op['categorie']
            t.label = label
            t._coming = op['intraday']
            if t._coming:
                # coming transactions have a random uuid id (inconsistent between requests)
                t.id = ''
            t._coming |= (t.date > today)

            if t.type == Transaction.TYPE_CARD and account.type == Account.TYPE_CARD:
                t.type = Transaction.TYPE_DEFERRED_CARD

            transactions.append(t)

        # Transactions are unsorted
        for t in sorted_transactions(transactions):
            if coming == t._coming:
                yield t
            elif coming and not t._coming:
                # coming transactions are at the top of history
                self.logger.debug('stopping coming after %s', t)
                return

        next_page = bool(transactions)
        offset += page_size

        assert offset < 30000, 'the site may be doing an infinite loop'
def obj_url(self):
    """Return the stream URL, without its query string, for this element.

    The method stored in ``self.obj._method`` is used when available in
    ``self.el['request']['files']``; otherwise the first available method is
    taken.  For ``hls`` the entry's own ``url`` is used; otherwise the
    entry is indexed by ``self.obj._quality``, falling back to index 0 when
    that index is out of range.

    Raises:
        ParseError: when no stream method is available at all.
        ValueError: when the selected method has no stream.
    """
    files = self.el['request']['files']
    if not files:
        raise ParseError(
            'Unable to detect any stream method for id: %r (available: %s)'
            % (int(Field('id')(self)), list(files)))

    # Choosen method is not available, we choose an other one
    method = self.obj._method
    if method not in files:
        # next(iter(...)) works on Python 2 and 3; keys()[0] only on 2.
        method = next(iter(files))

    streams = files[method]
    if not streams:
        raise ValueError('There is no url available for id: %r' % (int(Field('id')(self))))

    # stream is single for hls, just return the url
    stream = streams['url'] if method == 'hls' else None
    # ...but a list for progressive
    # we assume the list is sorted by quality with best first
    if not stream:
        quality = self.obj._quality
        stream = streams[quality]['url'] if quality < len(streams) else streams[0]['url']

    return stream.split('?')[0]
def login(self, username, password):
    """Log in through the Facebook OAuth dialog and store the access token.

    Submits the login form, follows the redirect by hand, confirms the
    authorization form, then stores the token found in the response text in
    ``self.access_token`` and the result of ``self.request('/me')`` in
    ``self.info``.

    Raises:
        BrowserIncorrectPassword: when the form submission yields no
            ``Location`` header, or the next page contains an error block.
        ParseError: when no access token can be found in the final response.
    """
    oauth_url = 'https://www.facebook.com/dialog/oauth?client_id=%s&redirect_uri=fbconnect://success&scope=email,user_birthday,user_friends,public_profile,user_photos,user_likes&response_type=token' % self.CLIENT_ID
    self.location(oauth_url)

    login_form = HTMLPage(self, self.response).get_form('//form[@id="login_form"]')
    login_form['email'] = username
    login_form['pass'] = password
    # The redirect is followed manually so its absence can be detected.
    login_form.submit(allow_redirects=False)
    if 'Location' not in self.response.headers:
        raise BrowserIncorrectPassword()
    self.location(self.response.headers['Location'])

    page = HTMLPage(self, self.response)
    if page.doc.xpath('//td/div[has-class("s")]'):
        raise BrowserIncorrectPassword(CleanText('//td/div[has-class("s")]')(page.doc))

    confirm_form = page.get_form(nr=0, submit='//input[@name="__CONFIRM__"]')
    confirm_form.submit()

    token = re.search('access_token=([^&]+)&', self.response.text)
    if not token:
        raise ParseError('Unable to find access_token')
    self.access_token = token.group(1)
    self.info = self.request('/me')
def build_doc(self, content):
    """Return the result of ``decompress_pdf(content)``.

    Raises:
        ParseError: when ``decompress_pdf`` fails with an ``OSError``; the
            message hints that mupdf-tools may not be installed.
    """
    try:
        return decompress_pdf(content)
    except OSError as e:
        raise ParseError(u'Make sure mupdf-tools is installed (%s)' % e)
def obj__formats(self):
    """Return the media URLs found in the player data, grouped by extension.

    The JSON argument of the ``buildPlayer(...)`` call is extracted from the
    page scripts.  The result maps each extension (as computed by
    ``determine_ext``) to a ``{quality: url}`` dict.  Media without a URL
    and media of type ``application/vnd.lumberjack.manifest`` are skipped.
    An empty dict is returned when no player data is found.

    Raises:
        ParseError: when the player data carries an ``error`` entry.
    """
    formats = {}

    player = Regexp(CleanText('//script'), r'.*buildPlayer\((.*}})\);.*', default=None)(self)
    if not player:
        # Previously `formats` was left unbound here and the return crashed.
        return formats

    info = json.loads(player)
    if info.get('error') is not None:
        raise ParseError(info['error']['title'])

    metadata = info.get('metadata')
    for quality, media_list in metadata['qualities'].items():
        for media in media_list:
            media_url = media.get('url')
            if not media_url:
                continue
            if media.get('type') == 'application/vnd.lumberjack.manifest':
                continue
            ext = determine_ext(media_url)
            # Accumulate qualities per extension; the former code replaced
            # the whole dict whenever a new quality showed up.
            formats.setdefault(ext, {})[quality] = media_url

    return formats
def __call__(self, item):
    """Fill ``item.obj`` fields from the raw label and return that label.

    The raw label comes from the parent filter.  ``rdate`` defaults to
    ``date`` when not loaded, ``category``/``label`` are first split on the
    separator, then the first entry of ``patterns`` matching the label sets
    ``type`` and may override ``label``, ``category`` and ``rdate`` from its
    named groups (``text``, ``category``, ``dd``, ``mm``, ``yy``, ``HH``,
    ``MM``).

    Raises:
        ParseError: when the date components found in the label do not form
            a valid date.
    """
    raw = super(Filter, self).__call__(item)
    if item.obj.rdate is NotLoaded:
        item.obj.rdate = item.obj.date
    item.obj.category = NotAvailable
    if ' ' in raw:
        # Text before the separator is the category, text after is the label.
        item.obj.category, useless, item.obj.label = [part.strip() for part in raw.partition(' ')]
    else:
        item.obj.label = raw

    for pattern, _type in patterns:
        m = pattern.match(raw)
        if m:
            args = m.groupdict()

            def inargs(key):
                """Tell whether *key* is a named group of the match with a non-None value."""
                return args.get(key, None) is not None

            item.obj.type = _type
            if inargs('text'):
                item.obj.label = args['text'].strip()
            if inargs('category'):
                item.obj.category = args['category'].strip()

            # Set date from information in raw label.
            if inargs('dd') and inargs('mm'):
                # A day of '00' is treated as the first of the month.
                dd = int(args['dd']) if args['dd'] != '00' else 1
                mm = int(args['mm'])

                if inargs('yy'):
                    yy = int(args['yy'])
                else:
                    # No year in the label: start from the year of the
                    # transaction date.
                    d = item.obj.date
                    try:
                        d = d.replace(month=mm, day=dd)
                    except ValueError:
                        # Day/month invalid in that year: try the year before.
                        d = d.replace(year=d.year-1, month=mm, day=dd)

                    yy = d.year
                    # A date after the transaction date is moved one year back.
                    if d > item.obj.date:
                        yy -= 1

                # Two-digit years are taken as 20xx.
                if yy < 100:
                    yy += 2000

                try:
                    if inargs('HH') and inargs('MM'):
                        item.obj.rdate = datetime.datetime(yy, mm, dd, int(args['HH']), int(args['MM']))
                    else:
                        item.obj.rdate = datetime.date(yy, mm, dd)
                except ValueError as e:
                    raise ParseError('Unable to parse date in label %r: %s' % (raw, e))

            # Only the first matching pattern is applied.
            break

    return raw
def login(self, login, passwd):
    """Submit the authentication form; return ``True`` on success.

    The password is converted through the virtual keyboard, XOR-ed with the
    one-character seed read after ``var aleatoire = `` in the page scripts,
    base64-encoded and posted with the identifier.

    Returns:
        ``False`` when the virtual keyboard cannot be built or when the
        submission raises ``BrowserUnavailable``; ``True`` otherwise.

    Raises:
        ParseError: when the seed variable is not found in any script.
    """
    try:
        keyboard = LCLVirtKeyboard(self)
    except VirtKeyboardError as err:
        self.logger.exception(err)
        return False

    password = keyboard.get_string_code(passwd)

    marker = "var aleatoire = "
    seed = -1
    for script in self.doc.findall("//script"):
        if not script.text:
            continue
        position = script.text.find(marker)
        if position == -1:
            continue
        # Skip one character after the marker, then read a single one.
        start = position + len(marker) + 1
        seed = int(script.text[start:start + 1])
        break
    if seed == -1:
        raise ParseError("Variable 'aleatoire' not found")

    form = self.get_form('//form[@id="formAuthenticate"]')
    form['identifiant'] = login
    form['postClavierXor'] = base64.b64encode(self.myXOR(password, seed))
    try:
        form['identifiantRouting'] = self.browser.IDENTIFIANT_ROUTING
    except AttributeError:
        # The browser defines no routing identifier: leave the field alone.
        pass

    try:
        form.submit()
    except BrowserUnavailable:
        # Login is not valid
        return False
    return True
def transfer(self, account, recipient, amount, reason):
    """Perform a transfer from *account* to the recipient whose id is *recipient*.

    The recipient object is looked up among ``self.get_recipients(account)``,
    the transfer page is opened and filled, then the validation page is
    visited, confirmed with ``self.password`` and visited again.

    Returns:
        The result of ``self.page.recap()``.

    Raises:
        TransferError: when the recipient is unknown or the validation page
            is not reached.
        ParseError: when the recap is empty.
    """
    # Automatically get the good transfer page
    self.logger.debug('Search %s' % recipient)
    for destination in self.get_recipients(account):
        self.logger.debug('Found %s ' % destination.id)
        if destination.id == recipient:
            recipient = destination
            break
    else:
        raise TransferError('Recipient not found')

    self.transferpage.open(data=self.page.buildonclick(recipient, account))
    self.page.transfer(recipient, amount, reason)
    self.valtransferpage.go()
    if not self.valtransferpage.is_here():
        raise TransferError("Invalid transfer (no confirmation page)")

    self.page.confirm(self.password)
    self.valtransferpage.go()
    # The first recap is only consumed to check it is not empty; a fresh
    # one is returned to the caller.
    if len(list(self.page.recap())) == 0:
        raise ParseError('Unable to find confirmation')
    return self.page.recap()
def filter(self, txt):
    """Build a ``self.klass`` instance from the named groups found in *txt*.

    ``self.kwargs`` maps each keyword argument of ``self.klass`` to the name
    of a regexp group; group values are converted with ``int`` and a missing
    or empty group counts as 0.  When ``self._regexp`` does not match, a
    ``ParseError`` is handed to ``self.default_or_raise``.
    """
    found = self._regexp.search(txt)
    if not found:
        return self.default_or_raise(ParseError('Unable to find time in %r' % txt))

    groups = found.groupdict()
    arguments = {key: int(groups[index] or 0) for key, index in self.kwargs.items()}
    return self.klass(**arguments)
def get_link(self, name):
    """Return the link registered under *name* in the page scripts.

    Each ``<script>`` text is searched, in document order, for
    ``["<name>",'<link>'``; the first link found is returned.

    Raises:
        ParseError: when no script contains such an entry.
    """
    matches = (
        re.search(r"""\["%s",'([^']+)'""" % name, node.text or '', flags=re.MULTILINE)
        for node in self.doc.xpath('//script')
    )
    found = next((m for m in matches if m), None)
    if found is None:
        raise ParseError('Link %r not found' % name)
    return found.group(1)
def login(self, login, password):
    """Fill and submit the login form using the virtual keyboard code.

    Args:
        login: identifier written to the ``identifiant`` field.
        password: password translated by the virtual keyboard into the code
            written to the ``code`` field.

    Raises:
        ParseError: when the translated code is not 10 characters long.
    """
    vk = VirtKeyboard(self)

    form = self.get_form('//form[@id="formulaire-login"]')
    code = vk.get_string_code(password)
    # Raised explicitly rather than asserted: assert statements are removed
    # when Python runs with -O, which would let a bad code be submitted.
    if len(code) != 10:
        raise ParseError("Wrong number of character.")
    form['identifiant'] = login
    form['code'] = code
    form.submit()
def to_python(self, m):
    """Convert MatchObject to python value.

    The name of the first named group with a non-None value selects the
    conversion: ``int``/``float`` and ``str`` go through ``literal_eval``,
    ``bool`` compares with ``'true'`` and ``None`` yields ``None``.  Any
    other case returns ``self.default`` when it is truthy.

    Raises:
        ParseError: when ``self.need_type`` is set and differs from the
            matched group name, or when nothing could be converted and
            there is no default.
    """
    for t, v in m.groupdict().items():
        if v is not None:
            break
    else:
        # No group captured anything: do not reuse the last group name.
        t = v = None

    if self.need_type and t != self.need_type:
        raise ParseError('Value with type %s not found' % self.need_type)

    if t in ('int', 'float'):
        return literal_eval(v)
    if t == 'str':
        value = literal_eval(v)
        # Byte strings are decoded; text strings are returned unchanged.
        if isinstance(value, bytes):
            value = value.decode('utf-8')
        return value
    if t == 'bool':
        return v == 'true'
    if t == 'None':
        return
    if self.default:
        return self.default
    raise ParseError('Unable to parse %r value' % m.group(0))
def get_date(self):
    """Return the date following ``au`` in the "Période d'emploi" cell.

    The cell is looked up in ``self.doc`` first and in ``self._doc2`` when
    the first document has none.

    Raises:
        ParseError: when neither document contains the cell.
    """
    cell_xpath = '//td[text()="Période d\'emploi"]/following-sibling::td'
    cells = self.doc.xpath(cell_xpath) or self._doc2.xpath(cell_xpath)
    if not cells:
        raise ParseError()

    extract = Date(Regexp(CleanText("."), r"au (\d{2}\/\d{2}\/\d{4})"), dayfirst=True)
    return extract(cells[0])
def parse(self, el):
    """Prepare ``self.env`` (``id``, ``balance``, ``coming``) for one account row.

    Rows whose link starts with ``POR_SyntheseLst`` or carries neither a
    ``rib`` nor a ``webid`` parameter are skipped.  A row whose id is
    already known in ``self.parent.objects`` is treated as a card: its
    balance is added to that account's ``coming`` and the row is skipped.

    Raises:
        SkipItem: for the skipped rows described above.
        ParseError: when neither the second nor the third cell holds a
            decimal balance.
    """
    link = el.xpath('./td[1]/a')[0].get('href', '')
    if link.startswith('POR_SyntheseLst'):
        raise SkipItem()

    params = parse_qs(urlparse(link).query)
    if 'rib' not in params and 'webid' not in params:
        raise SkipItem()

    # The balance is the first of the two cells that parses as a decimal.
    for cell in el.xpath('./td[2] | ./td[3]'):
        try:
            balance = CleanDecimal('.', replace_dots=True)(cell)
            break
        except InvalidOperation:
            pass
    else:
        raise ParseError('Unable to find balance for account %s' % CleanText('./td[1]/a')(el))

    id_key = 'rib' if 'rib' in params else 'webid'
    account_id = params[id_key][0]

    # Handle cards
    if account_id in self.parent.objects:
        account = self.parent.objects[account_id]
        if not account.coming:
            account.coming = Decimal('0.0')
        account.coming += balance
        account._card_links.append(link)
        raise SkipItem()

    self.env['id'] = account_id

    # Handle real balances
    page = self.page.browser.open(link).page
    if page:
        coming = page.find_amount(u"Opérations à venir")
        accounting = page.find_amount(u"Solde comptable")
    else:
        coming = accounting = None

    if accounting is not None:
        if accounting + (coming or Decimal('0')) != balance:
            self.page.logger.warning('%s + %s != %s' % (accounting, coming, balance))
        balance = accounting

    self.env['balance'] = balance
    self.env['coming'] = coming or NotAvailable
def get_history(self, account):
    """Yield the transactions of *account*, fetched 50 at a time.

    Pages are requested from 2000-01-01 up to today until an empty page is
    returned; each page is yielded sorted by ``rdate``, newest first.

    Raises:
        NotImplementedError: when the account is not consultable.
        ParseError: when the same transaction id is returned twice.
    """
    if not account._consultable:
        raise NotImplementedError()

    if account._univers != self.current_univers:
        self.move_to_univers(account._univers)

    def to_date(milliseconds):
        # Timestamps are divided by 1000 before conversion.
        return date.fromtimestamp(milliseconds / 1000)

    known_ids = set()
    offset = 0
    next_page = True
    while next_page:
        response = self.api_open(
            '/transactionnel/services/applications/operations/get/%(number)s/%(nature)s/00/%(currency)s/%(startDate)s/%(endDate)s/%(offset)s/%(limit)s' % {
                'number': account._number,
                'nature': account._nature,
                'currency': account.currency,
                'startDate': '2000-01-01',
                'endDate': date.today().strftime('%Y-%m-%d'),
                'offset': offset,
                'limit': 50,
            })
        offset += 50

        operations = response.json()['content']['operations']
        # An empty page ends the pagination.
        next_page = len(operations) > 0

        transactions = []
        for op in reversed(operations):
            t = Transaction()
            if op['id'] in known_ids:
                raise ParseError(
                    'There are several transactions with the same ID, probably an infinite loop'
                )
            t.id = op['id']
            known_ids.add(t.id)

            t.amount = Decimal(str(op['montant']))
            t.date = to_date(op.get('dateDebit', op.get('dateOperation')))
            t.rdate = to_date(op.get('dateOperation', op.get('dateDebit')))
            t.vdate = to_date(op.get('dateValeur', op.get('dateDebit', op.get('dateOperation'))))
            if 'categorie' in op:
                t.category = op['categorie']
            t.label = op['libelle']
            t.raw = ' '.join([op['libelle']] + op['details'])
            transactions.append(t)

        # Transactions are unsorted
        transactions.sort(key=lambda tr: tr.rdate, reverse=True)
        for t in transactions:
            yield t
def __call__(self, item):
    """Guess a full date from the (day, month) pair selected on *item*.

    The selected value may be a two-element sequence or a string split on
    ``/`` or ``-``.  The pair is passed to the date guesser's
    ``guess_date`` together with ``self.kwargs``.

    Raises:
        ParseError: when the selected value does not have exactly two parts.
    """
    values = self.select(self.selector, item)

    guesser = self.date_guesser
    # In case Env() is used to give date_guesser.
    if isinstance(guesser, _Filter):
        guesser = self.select(guesser, item)

    if isinstance(values, basestring):
        values = re.split('[/-]', values)
    if len(values) != 2:
        raise ParseError('Unable to take (day, month) tuple from %r' % values)

    day, month = (int(part) for part in values)
    return guesser.guess_date(day, month, **self.kwargs)
def login(self, login, password):
    """Fill and submit the pinpad login form using the virtual keyboard code.

    Args:
        login: identifier written to the ``identifiant`` field.
        password: password translated by the virtual keyboard into the code
            written to the ``codePinpad`` field.

    Raises:
        ParseError: when the translated code is not 10 characters long.
    """
    vk = VirtKeyboard(self)

    form = self.get_form('//form[@id="formulaire-login"]')
    code = vk.get_string_code(password)
    # Raised explicitly rather than asserted: assert statements are removed
    # when Python runs with -O, which would let a bad code be submitted.
    if len(code) != 10:
        raise ParseError("Wrong number of character.")
    form['identifiant'] = login
    form['codePinpad'] = code
    form['task'] = 'Login'
    form['process'] = 'Login'
    form['eventid'] = 'suivant'
    form['modeCodeSecret'] = 'pinpad'
    form['personneIdentifiee'] = 'N'
    form.submit()
def obj_url(self):
    """Return the URL of the preferred codec/quality for this element.

    Codecs are preferred in the order ``h264``, ``vp8``, ``vp6``; the ``hd``
    entry is used when the chosen codec has one, ``sd`` otherwise.

    Raises:
        ParseError: when none of the known codecs is available.
    """
    files = self.el['request']['files']

    codec = None
    for candidate in ('h264', 'vp8', 'vp6'):
        if candidate in files:
            codec = candidate
            break
    if not codec:
        raise ParseError(
            'Unable to detect available codec for id: %r' % int(Field('id')(self)))

    quality = 'hd' if 'hd' in files[codec] else 'sd'
    return files[codec][quality]['url']
def filter(self, text):
    """Convert *text* to a ``Decimal``.

    The text is first cleaned by the parent filter.  When
    ``self.replace_dots`` is set, the thousands separator is removed and the
    decimal separator replaced by a dot (separators come from the tuple, or
    default to ``'.'`` and ``','``).  Every character other than digits,
    ``-`` and ``.`` is then dropped.  When ``self.sign`` is set the value is
    multiplied by ``self.sign(cleaned_text)``.

    Empty input is reported with a ``ParseError`` and conversion failures
    with the ``InvalidOperation`` itself, both through
    ``self.default_or_raise``.
    """
    if empty(text):
        return self.default_or_raise(ParseError('Unable to parse %r' % text))

    cleaned = super(CleanDecimal, self).filter(text)
    numeric = cleaned
    if self.replace_dots:
        separators = self.replace_dots if type(self.replace_dots) is tuple else ('.', ',')
        thousands_sep, decimal_sep = separators
        numeric = numeric.replace(thousands_sep, '').replace(decimal_sep, '.')

    try:
        value = Decimal(re.sub(r'[^\d\-\.]', '', numeric))
        if self.sign:
            value *= self.sign(cleaned)
    except InvalidOperation as e:
        return self.default_or_raise(e)
    return value
def get_content(self, _id):
    """Fetch the article or comment designated by *_id*.

    ``self.parse_id`` turns *_id* into a ``(url, id)`` pair.  On a comment
    page the comment is returned; on a content page the comment numbered in
    a trailing ``#comment-<n>`` fragment of the URL is returned, or the
    article when there is no such fragment.  The parsed id, when not None,
    is stored on the returned object.

    Returns:
        The content object, or ``None`` when no URL could be derived.

    Raises:
        ParseError: when the browser lands on neither kind of page.
    """
    url, _id = self.parse_id(_id)

    if url is None:
        return None

    self.location(url)

    if self.comment.is_here():
        content = self.page.get_comment()
    elif self.content.is_here():
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        m = re.match(r'.*#comment-(\d+)$', url)
        if m:
            content = self.page.get_comment(int(m.group(1)))
        else:
            content = self.page.get_article()
    else:
        raise ParseError('Not on a content or comment page (%r)' % self.page)

    if _id is not None:
        content.id = _id
    return content
def iter_history(self, account):
    """Yield one ``Transaction`` per row of the account's movements table.

    Logs in first when ``self.islogged`` is false.  When the downloaded page
    has no top-level table and its title says the user is not identified,
    the login is replayed and the page is fetched one more time.

    Raises:
        ParseError: when no top-level table is found (after the single
            retry, or immediately when the page title is anything else).
    """
    if not self.islogged:
        self.login()

    page_name = "10-releve.act" if account._cmbtype == 'D' else "2-releve.act"
    page = (
        "https://www.cmb.fr/domiweb/prive/particulier/releve/"
        + page_name
        + "?noPageReleve=1&indiceCompte="
        + account._cmbvaleur
        + "&typeCompte="
        + account._cmbvaleur2
        + "&deviseOrigineEcran=EUR"
    )

    def load_page():
        # Download the page, parse it as HTML and hand back the tree
        # together with its top-level tables.
        content = self.browser.open(page).content
        document = etree.parse(StringIO(content), etree.HTMLParser())
        return document, document.xpath('/html/body/table')

    tree, tables = load_page()
    if len(tables) == 0:
        page_title = tree.xpath('/html/head/title')[0].text
        if page_title != u"Utilisateur non identifié":
            raise ParseError()
        # The site no longer recognises the user: log in again, retry once.
        self.login()
        tree, tables = load_page()
        if len(tables) == 0:
            raise ParseError()

    i = 0
    for table in tables:
        if table.get('id') != "tableMouvements":
            continue
        for row in table.getiterator('tr'):
            if row.get('class') in ('LnTit', 'LnTot'):
                continue

            operation = Transaction(i)
            cells = row.xpath('td')

            # The date cell reads day/month/year.
            date_parts = [int(x) for x in cells[1].xpath('div')[0].text.split('/')]
            operation.date = date(*date_parts[::-1])

            label = cells[2].xpath('div')[0].xpath('a')[0].text.replace('\n', '')
            operation.raw = unicode(' '.join(label.split()))
            for pattern, _type, _label in self.LABEL_PATTERNS:
                mm = pattern.match(operation.raw)
                if mm:
                    operation.type = _type
                    operation.label = sub('[ ]+', ' ', _label % mm.groupdict()).strip()
                    break

            # The fourth cell is used, negated, when it holds exactly one
            # comma; otherwise the fifth cell is used as is.
            fourth = cells[3].text
            if fourth.count(',') == 1:
                operation.amount = -Decimal(fourth.replace(',', '.').replace(u'\xa0', ''))
            else:
                fifth = cells[4].text
                operation.amount = Decimal(fifth.replace(',', '.').replace(u'\xa0', ''))

            i += 1
            yield operation
def parse(self, el):
    """Prepare ``self.env`` (``id``, ``balance``, ``coming``, ``_is_webid``) for one account row.

    Rows whose link contains ``POR_SyntheseLst`` or carries neither a
    ``rib`` nor a ``webid`` parameter are skipped.  A row whose id is
    already known in ``self.parent.objects`` is a card: its page is stored
    in the browser's ``fleet_pages`` when it is a fleet page (or the id
    already has fleet pages), otherwise its balance is added to the known
    account's ``coming``; the row is then skipped.

    Raises:
        SkipItem: for the skipped rows described above.
        ParseError: when neither the second nor the third cell holds a
            decimal balance.
    """
    link = el.xpath('./td[1]/a')[0].get('href', '')
    if 'POR_SyntheseLst' in link:
        raise SkipItem()

    url = urlparse(link)
    p = parse_qs(url.query)
    if 'rib' not in p and 'webid' not in p:
        raise SkipItem()

    # The balance is the first of the two cells that parses as a decimal.
    for td in el.xpath('./td[2] | ./td[3]'):
        try:
            balance = CleanDecimal('.', replace_dots=True)(td)
        except InvalidOperation:
            continue
        else:
            break
    else:
        raise ParseError('Unable to find balance for account %s' % CleanText('./td[1]/a')(el))

    self.env['_is_webid'] = False
    if self.page.browser.is_new_website:
        account_id = CleanText(
            './td[1]/a/node()[contains(@class, "doux")]', replace=[(' ', '')])(el)
    elif 'rib' in p:
        account_id = p['rib'][0]
    else:
        account_id = p['webid'][0]
        self.env['_is_webid'] = True

    page = self.page.browser.open(link).page

    # Handle cards
    if account_id in self.parent.objects:
        fleet_pages = self.page.browser.fleet_pages
        # `page` may be missing (the balance code below guards for it too),
        # so it must not be dereferenced unconditionally.
        is_fleet = bool(page) and page.is_fleet()
        if is_fleet or account_id in fleet_pages:
            if page:
                fleet_pages.setdefault(account_id, []).append(page)
        else:
            account = self.parent.objects[account_id]
            if not account.coming:
                account.coming = Decimal('0.0')
            account.coming += balance
            account._card_links.append(link)
        raise SkipItem()

    self.env['id'] = account_id

    # Handle real balances
    coming = page.find_amount(u"Opérations à venir") if page else None
    accounting = page.find_amount(u"Solde comptable") if page else None

    if accounting is not None and accounting + (coming or Decimal('0')) != balance:
        self.page.logger.warning('%s + %s != %s' % (accounting, coming, balance))

    if accounting is not None:
        balance = accounting

    self.env['balance'] = balance
    self.env['coming'] = coming or NotAvailable
def filter(self, timestamp):
    """Convert *timestamp* to a ``date`` after dropping its last three characters.

    A value that cannot be sliced (``TypeError``) is reported as a missing
    element; a value whose remaining text is not a usable integer timestamp
    is reported as unparsable.  Both go through ``self.default_or_raise``
    with a ``ParseError``.
    """
    try:
        return dt.date.fromtimestamp(int(timestamp[:-3]))
    except TypeError:
        return self.default_or_raise(
            ParseError('Element %r not found' % self.selector))
    except (ValueError, OverflowError) as e:
        # e.g. an empty or non-numeric string: honour the configured
        # default instead of letting the raw error escape the filter.
        return self.default_or_raise(
            ParseError('Unable to parse timestamp %r: %s' % (timestamp, e)))
def __call__(self, item):
    """Return ``item.env[self.name]``.

    A missing key is reported through ``self.default_or_raise`` with a
    ``ParseError``.
    """
    try:
        value = item.env[self.name]
    except KeyError:
        missing = ParseError('Environment variable %s not found' % self.name)
        return self.default_or_raise(missing)
    return value