def obj_type(self):
    """Deduce the post type from the housing URL and the page content.

    The URL built from the 'housing' pattern is inspected for the keywords
    'colocation', 'location' and 'vente' (in that order). Rentals are
    further split on the "meublé" criterion, sales on the transaction type
    carried by the first contact button. NotAvailable is returned when no
    keyword is found.
    """
    housing_url = BrowserURL('housing', _id=Env('_id'))(self)

    # 'colocation' must be tested first since it also contains 'location'.
    if 'colocation' in housing_url:
        return POSTS_TYPES.SHARING

    if 'location' in housing_url:
        furnished = False
        for criterion in XPath('//ul[@itemprop="description"]/li')(self):
            label = CleanText('./div[has-class("criteria-label")]')(criterion)
            if label.lower() != "meublé":
                continue
            answer = CleanText('./div[has-class("criteria-value")]')(criterion)
            furnished = answer.lower() == 'oui'
        return POSTS_TYPES.FURNISHED_RENT if furnished else POSTS_TYPES.RENT

    if 'vente' in housing_url:
        transaction = Attr(
            '//button[has-class("offer-contact-vertical-phone")][1]',
            'data-offertransactiontype'
        )(self)
        # Transaction type '4' is mapped to a life annuity (viager) sale.
        return POSTS_TYPES.VIAGER if transaction == '4' else POSTS_TYPES.SALE

    return NotAvailable
def obj_photos(self):
    """Build the list of HousingPhoto objects announced by the document.

    The number of photos is read from the NbPhotos node and each photo URL
    from the matching LienImage<i> node (1-based). Thumbnail dimensions in
    the link are swapped for the larger ones, and links that do not start
    with 'http' are prefixed with the site host.
    """
    # range() only accepts integers; the decimal filter result is converted
    # explicitly so the loop bound is always a plain int.
    photo_count = int(CleanDecimal('//NbPhotos')(self))

    photos = []
    for i in range(1, photo_count + 1):
        img = CleanText(
            '//LienImage%s' % i,
            replace=[(u'w=69&h=52', u'w=786&h=481')]
        )(self)
        url = img if img.startswith('http') else u'http://www.entreparticuliers.com%s' % img
        photos.append(HousingPhoto(url))
    return photos
def parse(self, el):
    """Fill env['rooms'], env['bedrooms'] and env['area'] from the item tags.

    The three keys start as NotLoaded. Each tag is classified from its
    text: 'chambre' -> bedrooms, 'pièce' -> rooms, anything else -> area.
    """
    tags = el.xpath('.//div[@class="clearfix"]/ul[has-class("item-tags")]/li')

    for key in ('rooms', 'bedrooms', 'area'):
        self.env[key] = NotLoaded

    for tag in tags:
        lowered = CleanText('.')(tag).lower()
        if 'chambre' in lowered:
            key = 'bedrooms'
            amount = CleanDecimal('./strong')(tag)
        elif 'pièce' in lowered:
            key = 'rooms'
            amount = CleanDecimal('./strong')(tag)
        else:
            # Any other tag is taken as the area: keep the leading number.
            key = 'area'
            leading_number = Regexp(CleanText('.'), r'(\d*\.*\d*) .*')
            amount = CleanDecimal(leading_number)(tag)
        self.env[key] = amount
def find_account(self, acclabel, accowner): accowner = sorted(accowner.lower().split()) # first name and last name may not be ordered the same way on market site... # Check if history is present if CleanText(default=None).filter(self.doc.xpath('//body/p[contains(text(), "indisponible pour le moment")]')): return False ids = None for a in self.doc.xpath('//a[contains(@onclick, "indiceCompte")]'): self.logger.debug("get investment from onclick") label = CleanText('.')(a) owner = CleanText('./ancestor::tr/preceding-sibling::tr[@class="LnMnTiers"][1]')(a) owner = sorted(owner.lower().split()) if label == acclabel and owner == accowner: ids = list(re.search(r'indiceCompte[^\d]+(\d+).*idRacine[^\d]+(\d+)', Attr('.', 'onclick')(a)).groups()) ids.append(CleanText('./ancestor::td/preceding-sibling::td')(a)) self.logger.debug("assign value to ids: {}".format(ids)) return ids for a in self.doc.xpath('//a[contains(@href, "indiceCompte")]'): self.logger.debug("get investment from href") if CleanText('.')(a) == acclabel: ids = list(re.search(r'indiceCompte[^\d]+(\d+).*idRacine[^\d]+(\d+)', Attr('.', 'href')(a)).groups()) ids.append(CleanText('./ancestor::td/preceding-sibling::td')(a)) self.logger.debug("assign value to ids: {}".format(ids)) return ids
def parse(self, el):
    """Fill env with category, id, iban, bank_name and label of a recipient.

    The element's value attribute decides the layout of its text:
    a leading 'I' marks an internal recipient (looked up among the
    browser's accounts and loans), a '-' in second position marks the
    usual external layout, anything else is handled as a corner case.

    Raises SkipItem for an external recipient whose text starts with '- '.
    """
    value_attr = Attr('.', 'value')(self)
    internal = value_attr[0] == 'I'
    self.env['category'] = u'Interne' if internal else u'Externe'

    if internal:
        # TODO use after 'I'?
        # The identifier must contain at least one digit.
        _id = Regexp(CleanText('.'), r'- (\w+\d\w+)')(self)

        browser = self.page.browser
        accounts = list(browser.get_accounts_list()) + list(browser.get_loans_list())
        candidates = [acc for acc in accounts if _id in acc.id]
        assert len(candidates) == 1
        account = candidates[0]

        self.env['id'] = account.id
        self.env['iban'] = account.iban
        self.env['bank_name'] = u"Caisse d'Épargne"
        self.env['label'] = account.label
        return

    if value_attr[1] == '-':
        # Usual case
        full = CleanText('.')(self)
        if full.startswith('- '):
            self.logger.warning('skipping recipient without a label: %r', full)
            raise SkipItem()

        # <recipient name> - <account number or iban> - <bank name (optional)> <optional last dash>
        mtc = re.match('(?P<label>.+) - (?P<id>[^-]+) -(?P<bank> [^-]*)?-?$', full)
        assert mtc
        bank = mtc.group('bank')

        self.env['id'] = self.env['iban'] = mtc.group('id')
        self.env['bank_name'] = (bank and bank.strip()) or NotAvailable
        self.env['label'] = mtc.group('label')
        return

    # Corner case: the identifier comes first, then the label.
    mtc = re.match('(?P<id>.+) - (?P<label>[^-]+) -( [^-]*)?-?$', CleanText('.')(self))
    assert mtc
    self.env['id'] = mtc.group('id')
    self.env['iban'] = NotAvailable
    self.env['bank_name'] = NotAvailable
    self.env['label'] = mtc.group('label')
def parse(self, el):
    """Fill env['rooms'], env['bedrooms'] and env['area'] from the item tags.

    The three keys start as NotAvailable. Each tag is classified from its
    text: 'chambre' -> bedrooms, 'pièce' -> rooms, and a text containing
    ' m²' but not 'le m²' -> area. Tags matching none of these are ignored.
    """
    tags = el.xpath('.//ul[has-class("item-tags")]/li')

    self.env['rooms'] = NotAvailable
    self.env['bedrooms'] = NotAvailable
    self.env['area'] = NotAvailable

    for item in tags:
        text = CleanText('.')(item)
        lowered = text.lower()

        if 'chambre' in lowered:
            key = 'bedrooms'
            value = CleanDecimal('./strong')(item)
        elif 'pièce' in lowered:
            key = 'rooms'
            value = CleanDecimal('./strong')(item)
        elif ' m²' in text and 'le m²' not in text:
            key = 'area'
            value = CleanDecimal(
                Regexp(CleanText('.'), r'(\d*\.*\d*) .*')
            )(item)
        else:
            # Unrelated tag: without this guard the raw tag text would be
            # used as an env key together with a stale (or not yet
            # assigned) value from a previous iteration.
            continue

        self.env[key] = value
def obj_id(self):
    """Build the identifier from the link found in the second cell.

    The '.html' suffix is dropped, the href is split on '/', and the
    segments at index 2, 3 and 4 are joined with '|'.
    """
    link = CleanText('./td[2]/a/@href', default=NotAvailable)(self)
    segments = link.replace('.html', '').split('/')
    return '%s|%s|%s' % (segments[2], segments[3], segments[4])
def obj_size(self):
    """Return the size in bytes parsed from the heading above the file list.

    Only the part after the last comma is used, with ')' removed; letters
    are stripped to get the number, digits/dots/spaces to get the unit.
    """
    heading = CleanText('//div[has-class("files")]/../h5')(self)
    tail = heading.split(',')[-1].replace(')', '')

    number = float(re.sub(r'[A-Za-z]', '', tail))
    unit = re.sub(r'[.0-9 ]', '', tail).upper()
    return get_bytes_size(number, unit)
def obj_size(self):
    """Return the size in bytes parsed from the first "infosficher" span.

    The text is expected as "<number> <unit>"; a decimal comma is accepted
    and every 'O' of the upper-cased unit is turned into 'B'.
    """
    text = CleanText('(//div[@id="infosficher"]/span)[1]')(self)
    tokens = text.replace(',', '.').strip().split()

    number = float(tokens[0])
    unit = tokens[-1].upper().replace('O', 'B')
    return get_bytes_size(number, unit)
def obj_size(self):
    """Return the size in bytes parsed from the second cell of the row.

    The text is expected as "<number> <unit>"; a decimal comma is accepted.
    """
    tokens = CleanText('./td[2]')(self).replace(',', '.').split()

    number = float(tokens[0])
    unit = tokens[-1].upper()
    return get_bytes_size(number, unit)
def obj_size(self):
    """Return the size in bytes parsed from the "poid" div.

    The text is expected as "<number> <unit>"; a decimal comma is accepted
    and every 'O' of the upper-cased unit is turned into 'B'.
    """
    text = CleanText('./div[has-class("poid")]')(self)
    tokens = text.replace(',', '.').strip().split()

    number = float(tokens[0])
    unit = tokens[-1].upper().replace('O', 'B')
    return get_bytes_size(number, unit)
def obj_details(self):
    """Collect the details dictionary of the offer.

    Contains the publication date ("creationDate", parsed day first from
    the "Mis en ligne" note), the agency fees ("Honoraires") when the fees
    label is present, and one entry per criterion of the description list.
    """
    published = Date(
        Regexp(
            CleanText('//p[@class="offer-description-notes"]|//p[has-class("darkergrey")]'),
            u'.*Mis en ligne : (\d{2}/\d{2}/\d{4}).*'
        ),
        dayfirst=True
    )(self)
    details = {"creationDate": published}

    fees = CleanText(
        '//div[has-class("offer-price")]/span[has-class("lbl-agencyfees")]',
        default=None
    )(self)
    if fees:
        # Keep what follows the first colon of the fees label.
        amount = fees.split(":")[1].strip()
        details["Honoraires"] = "{} (TTC, en sus)".format(amount)

    for criterion in XPath('//ul[@itemprop="description"]/li')(self):
        key = CleanText('./div[has-class("criteria-label")]')(criterion)
        details[key] = CleanText('./div[has-class("criteria-value")]')(criterion)

    return details
def obj_GES(self):
    """Return the ENERGY_CLASS member for the greenhouse gas summary.

    The first character left after removing "GES" from the summary text is
    used as the attribute name on ENERGY_CLASS. NotAvailable is returned
    when the summary is missing, carries no letter, or the letter is not
    an ENERGY_CLASS attribute.
    """
    summary = CleanText(
        '//div[has-class("offer-energy-greenhouseeffect-summary")]//div[has-class("greenhouse-summary")]',
        default=""
    )(self)
    # Slicing instead of indexing: a summary holding only "GES" leaves an
    # empty string here, and [0] would raise IndexError.
    letter = summary.replace("GES", "").strip()[:1]
    return getattr(ENERGY_CLASS, letter, NotAvailable)
def obj_size(self):
    """Return the size in bytes parsed from the folder span.

    The size is the text located after the last ': ' and before the next
    ')', expected as "<number> <unit>"; a decimal comma is accepted.
    """
    folder_text = CleanText('//span[has-class("folder") or has-class("folderopen")]')(self)
    size_text = folder_text.split(': ')[-1].split(')')[0].strip()
    tokens = size_text.replace(',', '.').split()

    number = float(tokens[0])
    unit = tokens[-1].upper()
    return get_bytes_size(number, unit)
def obj_nb_person(self):
    """Return a one-element list with the serving information.

    The subtitle text is stripped of leading '/' characters and of every
    "pour" occurrence, then trimmed.
    """
    subtitle = CleanText(
        '//span[@class="bu_cuisine_title_3 bu_cuisine_title_3--subtitle"]'
    )(self)
    servings = subtitle.lstrip('/').replace("pour", "").strip()
    return [servings]
def obj_DPE(self):
    """Return the ENERGY_CLASS member for the energy performance summary.

    The first character left after removing "DPE" from the summary text is
    used as the attribute name on ENERGY_CLASS. NotAvailable is returned
    when the summary is missing, carries no letter, or the letter is not
    an ENERGY_CLASS attribute.
    """
    summary = CleanText(
        '//div[has-class("offer-energy-greenhouseeffect-summary")]//div[has-class("energy-summary")]',
        default=""
    )(self)
    # Slicing instead of indexing: a summary holding only "DPE" leaves an
    # empty string here, and [0] would raise IndexError.
    letter = summary.replace("DPE", "").strip()[:1]
    return getattr(ENERGY_CLASS, letter, NotAvailable)
def obj_label(self):
    """Return the label read from the 'label' column.

    Preference order: the text of the cell's <noscript> child when it is
    not empty; else the text following the <br> when its first word equals
    the first word of the whole cell text; else the whole cell text.
    """
    whole_text = CleanText(TableCell('label'))(self)
    cell = TableCell('label')(self)[0]

    after_break = CleanText(cell.xpath('./br/following-sibling::text()'))(self)
    if not after_break or after_break.split()[0] != whole_text.split()[0]:
        after_break = whole_text

    noscript_text = CleanText(cell.xpath('./noscript'))(self)
    return noscript_text or after_break
def obj_utilities(self):
    """Tell whether utilities are included, from the top price paragraph.

    INCLUDED when the text contains "charges comprises" (case-insensitive),
    EXCLUDED otherwise.
    """
    price_text = CleanText('//p[has-class("OfferTop-price")]')(self)
    included = "charges comprises" in price_text.lower()
    return UTILITIES.INCLUDED if included else UTILITIES.EXCLUDED
def obj_utilities(self):
    """Tell whether utilities are included, from the teaser price element.

    INCLUDED when the text contains "charges comprises" (case-insensitive),
    EXCLUDED otherwise.
    """
    price_text = CleanText('.//strong[has-class("TeaserOffer-price-num")]')(self)
    included = "charges comprises" in price_text.lower()
    return UTILITIES.INCLUDED if included else UTILITIES.EXCLUDED
def on_load(self):
    """Raise the error matching this page as soon as it is loaded.

    Raises BrowserUnavailable (with the first <h4> text, or the heading
    when that is empty) if the first <h1> starts with
    'Le service est moment'. Otherwise raises LoggedOut unless the
    browser's no_login flag is set.
    """
    heading = CleanText('//h1[1]')(self.doc)
    if heading.startswith('Le service est moment'):
        detail = CleanText('//h4[1]')(self.doc)
        raise BrowserUnavailable(detail or heading)

    if self.browser.no_login:
        return
    raise LoggedOut()
def obj_details(self):
    """Return {"fees": ...} from the price-fees span, or NotLoaded.

    The fees value is the trimmed text found after the first colon.
    """
    fees_text = CleanText('.//span[@class="price-fees"]', default=None)(self)
    if not fees_text:
        return NotLoaded
    return {"fees": fees_text.split(":")[1].strip()}
def next_page(self):
    """Return the URL of the next result page, or None on the last page.

    The total number of pages is obtained from the browser using the query
    string of the search request stored in the "js-data" div. The current
    page number is read from the last "page=<n>" parameter of the page URL.
    None is returned when the URL has no such parameter or when the current
    page is already the last one.
    """
    js_datas = CleanText('//div[@id="js-data"]/@data-rest-search-request')(self)
    total_page = self.page.browser.get_total_page(js_datas.split("?")[-1])

    url = self.page.url
    # \d+ captures the whole page number. A single optional digit misreads
    # pages >= 10 (the URL was then rewritten wrongly) and could capture
    # an empty string, which int() rejects.
    m = re.match(r'.*page=(\d+)', url)
    if not m:
        return None

    next_page = int(m.group(1)) + 1
    if next_page > total_page:
        return None

    # Rewrite only the digits that were matched, so that other parameters
    # ending in "page=<n>" are left untouched.
    return '%s%d%s' % (url[:m.start(1)], next_page, url[m.end(1):])
def obj_url(self):
    """Return the offer URL.

    The og:url meta content is used when present. Otherwise the URL is
    read from the _JobUrl JavaScript variable and, when it does not start
    with 'http', prefixed with the site host.
    """
    meta_url = CleanText('//meta[@property="og:url"]/@content', default=None)(self)
    if meta_url is not None:
        return meta_url

    js_url = JSVar(CleanText('//script'), var='_JobUrl')(self)
    if js_url.startswith('http'):
        return js_url
    # NOTE(review): the prefix carries no scheme, so the result still does
    # not start with 'http' — confirm this is intended.
    return 'www.adecco.fr%s' % js_url
def get_author(self):
    """Return the author's name without its leading 'Par ' prefix.

    An empty string is returned when an AttributeError is raised while
    reading the author element.
    """
    prefix = 'Par '
    try:
        name = CleanText('.')(self.get_element_author())
        if not name.startswith(prefix):
            return name
        return name.split(prefix, 1)[1]
    except AttributeError:
        return ''
def load_product_codes(self):
    """Return a {LABEL: isin_code} dictionary built from the page tables.

    For every table body, each row whose header cell has a scope attribute
    containing "row" contributes one entry: the upper-cased text of its
    first <th> mapped to the text of its first <td>. Rows missing either
    value are skipped.
    """
    # store ISIN codes in a dictionary with a (label: isin) fashion
    product_codes = {}
    for table in self.doc.xpath('//table/tbody'):
        # './/' keeps the lookup inside the current table body. An
        # expression starting with '//' is evaluated from the document
        # root, which made every table rescan all rows of the page.
        for row in table.xpath('.//tr[contains(./th/@scope,"row")]'):
            label = CleanText('./th[1]', default=None)(row)
            isin_code = CleanText('./td[1]', default=None)(row)
            if label and isin_code:
                product_codes[label.upper()] = isin_code
    return product_codes
def check_errors(self):
    """Raise AddRecipientBankError if the page shows a non-eligibility popin.

    A popin is considered shown when a script calls showDivJQInfo() with
    its identifier. The error message is the first sentence of the popin
    paragraphs.
    """
    # check if user can add new recipient
    for popin_id in ('popinClientNonEligible', 'popinClientNonEligibleBis'):
        shown = self.doc.xpath('//script[contains(text(), "showDivJQInfo(\'%s\')")]' % popin_id)
        if not shown:
            continue

        full_message = CleanText('//div[@id="%s"]//p' % popin_id)(self.doc)
        # Only the first sentence is kept: the rest of the message is long
        # and only holds recommendations.
        first_sentence = full_message.split('.')[0]
        raise AddRecipientBankError(message=first_sentence)
def obj_photos(self):
    """Return a one-photo list from the default image, or NotLoaded.

    The data-src value is URL-unquoted. When it embeds another "http://"
    URL past its third character, only that embedded URL is kept, cut at
    the last '?' if there is one.
    """
    src = CleanText('./div[has-class("default-img")]/img/@data-src')(self)
    if not src:
        return NotLoaded

    src = unquote(src)
    if "http://" in src[3:]:
        start = src.find("http://", 3)
        query_at = src.rfind("?")
        # No query string: slice up to the end of the string.
        end = None if query_at == -1 else query_at
        src = src[start:end]
    return [HousingPhoto(src)]
def obj_type(self):
    """Return the account type deduced from a text prefix.

    The 'type' column is matched against self.TYPE. When that column does
    not exist, the label field is matched against self.TYPE_BY_LABELS.
    The first key that the text starts with wins; Account.TYPE_UNKNOWN is
    returned when none does.
    """
    try:
        text = CleanText(TableCell('type'))(self)
    except ColumnNotFound:
        text = Field('label')(self)
        mapping = self.TYPE_BY_LABELS
    else:
        mapping = self.TYPE

    for prefix, account_type in mapping.items():
        if text.startswith(prefix):
            return account_type
    return Account.TYPE_UNKNOWN
def on_load(self):
    """Handle the message page: dismiss it or raise ActionNeeded.

    - Checkbook acknowledgement form: the 'Alert' form is submitted with
      the answer 'Non'.
    - "cddErrorMessage" paragraph: ActionNeeded is raised with its text,
      encoded to ASCII with replacement.
    - Anything else: ActionNeeded is raised with the form's first heading.
    """
    doc = self.doc

    if doc.xpath(u'//form//h1[1][contains(text(), "Accusé de reception du chéquier")]'):
        alert_form = self.get_form(name='Alert')
        alert_form['command'] = 'validateAlertMessage'
        alert_form['radioValide_1_2_40003039944'] = 'Non'
        alert_form.submit()
        return

    if doc.xpath(u'//p[@class="cddErrorMessage"]'):
        message = CleanText(u'//p[@class="cddErrorMessage"]')(doc)
        # TODO python2 handles unicode exceptions badly, fix when passing to python3
        raise ActionNeeded(message.encode('ascii', 'replace'))

    raise ActionNeeded(CleanText(u'//form//h1[1]')(doc))
def obj_details(self):
    """Return a details dictionary holding the energy information.

    The bold paragraph of the borderless box is split on ':'. Its first
    part is the key. With a second part, the value is
    "<second part> (<dpe letter>)". Without one, the value is the DPE
    letter alone, provided a letter was found in the pyramid image URL.
    An empty dictionary is returned when the paragraph is empty.
    """
    details = {}
    summary = CleanText('//div[@class="box box-noborder"]/p[@class="size_13 darkergrey bold"]')(self)
    if not summary:
        return details

    parts = summary.split(':')
    dpe = Regexp(
        CleanText('//div[@id="energy-pyramid"]/img/@src'),
        r'http://mmf.logic-immo.com/mmf/fr/static/dpe/dpe_(\w)_b.gif',
        default=""
    )(self)

    if len(parts) > 1:
        details[parts[0]] = '%s (%s)' % (parts[1], dpe)
    elif dpe:
        # The format string used to be stored as is ('%s'); the DPE letter
        # is what belongs here.
        details[parts[0]] = '%s' % dpe
    return details
def decode_paste(self, key):
    """Decrypt and decompress the paste stored in the "cipherdata" div.

    The div holds a JSON list; the 'data' entry of its first element is
    itself JSON and is decrypted with `key`. The result is base64-decoded
    and inflated with a negative window size (-MAX_WBITS).
    """
    cipher_json = CleanText('//div[@id="cipherdata"]')(self.doc)
    entries = json.loads(cipher_json)
    encrypted = json.loads(entries[0]['data'])

    decrypted = decrypt(key, encrypted)
    compressed = b64decode(decrypted)
    return decompress(compressed, -MAX_WBITS)
class item(ItemElement):
    """Item description producing a Subscription from a dictionary entry."""
    klass = Subscription

    # Both the label and the id are read from the 'nichandle' key.
    obj_label = CleanText(Dict('nichandle'))
    # Subscriber is "<firstname> <name>".
    obj_subscriber = Format("%s %s", CleanText(Dict('firstname')), CleanText(Dict('name')))
    obj_id = CleanText(Dict('nichandle'))
def get_expire(self):
    """Return the paste expiration as a datetime, or None when unset.

    The value is the 'expire_date' timestamp found in the 'meta' entry of
    the first element of the JSON stored in the "cipherdata" div.
    """
    entries = json.loads(CleanText('//div[@id="cipherdata"]')(self.doc))
    meta = entries[0]['meta']
    if 'expire_date' not in meta:
        return None
    return datetime.fromtimestamp(meta['expire_date'])
def on_load(self):
    """Raise ActionNeeded when the page reports a terminated WEB service.

    The message is the text of the cell mentioning the termination.
    """
    termination_cell = CleanText(
        u'//td[contains(text(), "Votre adhésion au service WEB est résiliée depuis le")]'
    )
    message = termination_cell(self.doc)
    if not message:
        return
    raise ActionNeeded(message)
def get_last_id(self):
    """Return the data-tweet-id of the last tweet block of the page."""
    tweet_blocks = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')
    last_block = tweet_blocks[-1]
    return CleanText('./@data-tweet-id')(last_block)
class item(ItemElement):
    """Item description producing an Event from a table row."""
    klass = Event

    # Each field is read from the cell whose headers attribute names it.
    obj_date = Date(CleanText('td[@headers="Date"]'))
    obj_activity = CleanText('td[@headers="Libelle"]')
    obj_location = CleanText('td[@headers="site"]')
def get_api_key(self):
    """Return the 'apiKey' JavaScript variable from the scripts, or None."""
    api_key = JSVar(CleanText('//script'), var='apiKey', default=None)
    return api_key(self.doc)
def obj_id(self):
    """Return "<product id>.<element id>" for this item."""
    product_id = Field('product')(self).id
    element_id = CleanText('./@id')(self)
    return u"%s.%s" % (product_id, element_id)
def get_trends_token(self):
    """Return the 'trendsCacheKey' entry of the JSON held by "init-data"."""
    init_data = json.loads(CleanText('//input[@id="init-data"]/@value')(self.doc))
    return init_data['trendsCacheKey']
def get_message(self):
    """Return the cleaned text of the headings and paragraphs of "div_text"."""
    message = CleanText('//div[@id="div_text"]/h1 | //div[@id="div_text"]/p')
    return message(self.doc)
def obj_shop(self):
    """Build the Shop of this row.

    The shop takes the item's id; its location and name come from the
    third and fourth cells.
    """
    shop = Shop(Field('id')(self))
    shop.name = CleanText('(./td)[4]')(self)
    shop.location = CleanText('(./td)[3]')(self)
    return shop
def obj_ingredients(self):
    """Return the list of ingredients, or None when there is none.

    The ingredients paragraph is split on '-'; everything before the first
    dash is dropped.
    """
    paragraph = CleanText(
        '//p[@class="m_content_recette_ingredients"]',
        default=''
    )(self)
    ingredients = paragraph.split('-')[1:]
    if ingredients:
        return ingredients
    return None
def obj_nb_person(self):
    """Return a one-element list with the number of persons, or NotAvailable.

    The number is captured from "(pour <n> personnes)" in the first span of
    the ingredients paragraph.
    """
    first_span = CleanText('//p[@class="m_content_recette_ingredients"]/span[1]')
    count = Regexp(first_span, '.*\(pour (\d+) personnes\)', default=0)(self)
    if not count:
        return NotAvailable
    return [count]
def next_page(self):
    """Return the href of the pagination "next page" link, or None."""
    next_link = CleanText(
        '//a[@id="ctl00_cphMainContent_m_ctrlSearchEngine_m_ctrlSearchListDisplay_m_ctrlSearchPagination_m_linkNextPage"]/@href',
        default=None
    )
    return next_link(self)
def obj_currency(self):
    """Return the currency deduced from the text of the location block."""
    location_text = CleanText('./article/div/div[@itemprop="location"]')(self)
    return Currency.get_currency(location_text)
def condition(self):
    """Return the element's id attribute text, or False when it has none."""
    element_id = CleanText('./@id', default=False)
    return element_id(self)
def get_error(self):
    """Return the cleaned text of the div(s) with class 'error'."""
    error_text = CleanText("//div[@class='error']")
    return error_text(self.doc)
def obj_duration(self):
    """Return the duration as a timedelta.

    The number of seconds is read from the link's data-duration attribute.
    """
    seconds = int(CleanText('./div/div/a/@data-duration')(self))
    return timedelta(seconds=seconds)
def obj_date(self):
    """Return today's datetime set to the displayed refresh time.

    The "refresh_time" span is expected to hold colon-separated integers;
    the first two are used as hour and minute. The other fields keep the
    values of the current local time.
    """
    refresh_text = CleanText(u'//span[@id="refresh_time"]')(self)
    parts = [int(part) for part in refresh_text.split(":")]

    # datetime.replace() returns a new object and leaves the original
    # untouched, so its result must be returned. It used to be discarded,
    # which made this method always answer with the current time.
    return datetime.datetime.now().replace(hour=parts[0], minute=parts[1])
def obj_thumbnail(self):
    """Return a Thumbnail whose id and url are the first image's URL text."""
    image_url = CleanText('//image[1]/url')(self)
    thumbnail = Thumbnail(image_url)
    # The url attribute mirrors the id given to the constructor.
    thumbnail.url = thumbnail.id
    return thumbnail
def get_iban(self):
    """Return the IBAN from the "ColonneCode" cell mentioning it.

    The "IBAN" word and all spaces are removed from the cell text.
    """
    iban = CleanText(
        '//td[contains(text(), "IBAN") and @class="ColonneCode"]',
        replace=[('IBAN', ''), (' ', '')]
    )
    return iban(self.doc)
def obj_duration(self):
    """Return the duration parsed from the first itunes:duration child."""
    itunes_ns = {'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd'}
    nodes = self.el.xpath('itunes:duration', namespaces=itunes_ns)
    first_node = nodes[0]
    return Duration(CleanText('.'))(first_node)
def get_france_culture_podcasts_url(self):
    """Return the identifier captured from the first matching podcast link.

    Links with class "podcast" are scanned in order; the value captured
    between "rss_" and ".xml" in the href is returned for the first link
    that yields a non-empty one. None is returned when no link matches.
    """
    feed_pattern = 'http://radiofrance-podcast.net/podcast09/rss_(.*).xml'
    for link in XPath('//a[@class="podcast"]')(self.doc):
        emission_id = Regexp(CleanText('./@href'), feed_pattern, default=None)(link)
        if not emission_id:
            continue
        return emission_id
    return None
def get_product_name(self):
    """Return the text of the sixth header cell of "tab_resultat", or ''."""
    sixth_header = CleanText('(//table[@id="tab_resultat"]/tr/th)[6]', default='')
    return sixth_header(self.doc)
def get_url(self):
    """Return the stream URL of the page.

    The liveUrl value found in the scripts is preferred; when it is
    missing or empty, the href of the "player" link is returned instead.
    """
    live_url = Regexp(
        CleanText('//script'),
        '.*liveUrl: \'(.*)\', timeshiftUrl.*',
        default=None
    )(self.doc)
    if live_url:
        return live_url
    return CleanText('//a[@id="player"][1]/@href')(self.doc)
class item(ItemElement):
    """Item description producing a Product from an input/label pair."""
    klass = Product

    # The id is the input's value attribute, the name is the label text.
    obj_id = CleanText('./input/@value')
    obj_name = CleanText('./label')
def get_token(self):
    """Return the value of the search form's "_token" input."""
    token = CleanText('//input[@id="recherche_recherchertype__token"]/@value')
    return token(self.doc)
def has_paste(self):
    """Tell whether the "cipherdata" div holds any text."""
    cipher_text = CleanText('//div[@id="cipherdata"]')(self.doc)
    return bool(cipher_text)
def get_min_position(self):
    """Return the data-min-position attribute of the stream container."""
    min_position = CleanText('//div[@class="stream-container "]/@data-min-position')
    return min_position(self.doc)
def check_double_auth(self):
    """Raise ActionNeeded when the page asks for an SMS code.

    The presence of the "codeSMS" input triggers the error. Its message is
    the text of the fifth div containing "Two-Factor".
    """
    if not self.doc.xpath('//input[@id="codeSMS"]'):
        return
    message = CleanText('(//div[contains(., "Two-Factor")])[5]')(self.doc)
    raise ActionNeeded(message)