def get_job_advert(self, url, advert):
    re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
    if advert is None:
        _id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
        advert = RegionsJobAdvert(_id)
    advert.url = u'%s' % url
    div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
    advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
    content = self.parser.select(div, 'p', method='xpath')
    next_is_date = False
    next_is_pay = False
    description = ''
    for p in content:
        if next_is_date:
            # the publication date is in the paragraph following the "date_ref" one
            date = p.text_content().strip()
            m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
            if m:
                dd = int(m.group(1))
                mm = int(m.group(2))
                yyyy = int(m.group(3))
                advert.publication_date = datetime.date(yyyy, mm, dd)
            next_is_date = False
        elif next_is_pay:
            advert.pay = html2text(self.parser.tostring(p))
            next_is_pay = False
        elif 'class' in p.attrib:
            if p.attrib['class'] == 'contrat_loc':
                _p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
                content_p = _p.text_content().strip().split('\r\n')
                for el in content_p:
                    splitted_el = el.split(':')
                    if len(splitted_el) == 2:
                        if splitted_el[0] == 'Entreprise':
                            advert.society_name = splitted_el[1]
                        elif splitted_el[0] == 'Contrat':
                            advert.contract_type = splitted_el[1]
                        elif splitted_el[0] == 'Localisation':
                            advert.place = splitted_el[1]
            elif p.attrib['class'] == 'date_ref':
                next_is_date = True
            elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
                next_is_pay = True
            else:
                description = description + html2text(self.parser.tostring(p))
        else:
            description = description + html2text(self.parser.tostring(p))
    advert.description = u'%s' % description
    return advert
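# A minimal standalone sketch (not part of the module above) of how the id
# regex decomposes an offer URL into the '<site>|<offer number>' id used by
# RegionsJobAdvert; the sample URL is made up for illustration.
import re

RE_ID = re.compile(r'http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation')

def extract_advert_id(url):
    m = RE_ID.search(url)
    if m is None:
        return None
    # group(1) is the regional site host, group(2) the offer number
    return u'%s|%s' % (m.group(1), m.group(2))

# extract_advert_id('http://www.centrejob.com/offre_emploi/detailoffre.aspx?numoffre=42&de=consultation')
# -> u'www.centrejob.com|42'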
def format_obj(self, obj, alias):
    result = u'%sTitle:%s %s\n' % (self.BOLD, self.NC, obj.title)
    result += u'%sDate:%s %s\n' % (self.BOLD, self.NC, obj.date.strftime('%Y-%m-%d %H:%M'))
    result += u'%sFrom:%s %s\n' % (self.BOLD, self.NC, obj.sender)
    if hasattr(obj, 'receivers') and obj.receivers:
        result += u'%sTo:%s %s\n' % (self.BOLD, self.NC, ', '.join(obj.receivers))
    if obj.flags & Message.IS_HTML:
        content = html2text(obj.content)
    else:
        content = obj.content
    result += '\n%s' % content
    if obj.signature:
        if obj.flags & Message.IS_HTML:
            signature = html2text(obj.signature)
        else:
            signature = obj.signature
        result += '\n-- \n%s' % signature
    return result
def format_obj(self, obj, alias):
    result = u'%s %s %s %s %s\n' % (self.colored(obj.project.name, 'blue', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.fullid, 'red', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.title, 'yellow', 'bold'))
    result += '\n%s\n\n' % obj.body
    result += self.format_key('Author', '%s (%s)' % (obj.author.name, obj.creation))
    result += self.format_attr(obj, 'status')
    result += self.format_attr(obj, 'version')
    result += self.format_attr(obj, 'category')
    result += self.format_attr(obj, 'assignee')
    if hasattr(obj, 'fields') and not empty(obj.fields):
        for key, value in obj.fields.iteritems():
            result += self.format_key(key.capitalize(), value)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\n%s\n' % self.colored('Attachments:', 'green')
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\n%s\n' % self.colored('History:', 'green')
        for u in obj.history:
            result += '%s %s %s %s\n' % (self.colored('*', 'red', 'bold'),
                                         self.colored(u.date, 'yellow', 'bold'),
                                         self.colored(u'—', 'cyan', 'bold'),
                                         self.colored(u.author.name, 'blue', 'bold'))
            for change in u.changes:
                result += ' - %s %s %s %s\n' % (self.colored(change.field, 'green'),
                                                change.last,
                                                self.colored('->', 'magenta'),
                                                change.new)
            if u.message:
                result += ' %s\n' % html2text(u.message).strip().replace('\n', '\n ')
    return result
def format_obj(self, obj, alias): result = u"%s%s - #%s - %s%s\n" % (self.BOLD, obj.project.name, obj.fullid, obj.title, self.NC) result += "\n%s\n\n" % obj.body result += "Author: %s (%s)\n" % (obj.author.name, obj.creation) if hasattr(obj, "status") and obj.status: result += "Status: %s\n" % obj.status.name if hasattr(obj, "version") and obj.version: result += "Version: %s\n" % obj.version.name if hasattr(obj, "category") and obj.category: result += "Category: %s\n" % obj.category if hasattr(obj, "assignee") and obj.assignee: result += "Assignee: %s\n" % (obj.assignee.name) if hasattr(obj, "attachments") and obj.attachments: result += "\nAttachments:\n" for a in obj.attachments: result += "* %s%s%s <%s>\n" % (self.BOLD, a.filename, self.NC, a.url) if hasattr(obj, "history") and obj.history: result += "\nHistory:\n" for u in obj.history: result += "* %s%s - %s%s\n" % (self.BOLD, u.date, u.author.name, self.NC) for change in u.changes: result += " - %s%s%s: %s -> %s\n" % (self.BOLD, change.field, self.NC, change.last, change.new) if u.message: result += html2text(u.message) return result
def get_job_advert(self, url, advert):
    re_id = re.compile('http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$', re.DOTALL)
    if advert is None:
        _id = u'%s/%s/%s' % (re_id.search(url).group(1), re_id.search(url).group(2), re_id.search(url).group(3))
        advert = AdeccoJobAdvert(_id)
    advert.contract_type = re_id.search(url).group(1)
    div = self.document.getroot().xpath("//div[@class='contain_MoreResults']")[0]
    date = u'%s' % self.parser.select(div, "div[@class='dateResult']", 1, method='xpath').text.strip()
    m = re.match('(\d{2})\s(.*?)\s(\d{4})', date)
    if m:
        dd = int(m.group(1))
        mm = MONTHS.index(m.group(2)) + 1
        yyyy = int(m.group(3))
        advert.publication_date = datetime.date(yyyy, mm, dd)
    title = self.parser.select(div, "h1", 1, method='xpath').text_content().strip()
    town = self.parser.select(div, "h1/span/span[@class='town']", 1, method='xpath').text_content()
    page_title = self.parser.select(div, "h1/span[@class='pageTitle']", 1, method='xpath').text_content()
    advert.title = u'%s' % title.replace(town, '').replace(page_title, '')
    spans = self.document.getroot().xpath("//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
    advert.job_name = u'%s' % spans[0].text
    advert.place = u'%s' % spans[1].text
    advert.pay = u'%s' % spans[2].text
    advert.contract_type = u'%s' % spans[3].text
    advert.url = url
    description = self.document.getroot().xpath("//div[@class='descriptionContainer']/p")[0]
    advert.description = html2text(self.parser.tostring(description))
    return advert
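# Sketch of the month lookup used above, assuming MONTHS is the usual French
# month-name list the module imports; index() + 1 maps a name to its 1-based
# month number, so '12 mars 2013' becomes datetime.date(2013, 3, 12).
MONTHS = [u'janvier', u'février', u'mars', u'avril', u'mai', u'juin',
          u'juillet', u'août', u'septembre', u'octobre', u'novembre', u'décembre']

# MONTHS.index(u'mars') + 1 -> 3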
def format_obj(self, obj, alias):
    result = u'%s%s - #%s - %s%s\n' % (self.BOLD, obj.project.name, obj.fullid, obj.title, self.NC)
    result += '\n%s\n\n' % obj.body
    result += 'Author: %s (%s)\n' % (obj.author.name, obj.creation)
    if hasattr(obj, 'status') and obj.status:
        result += 'Status: %s\n' % obj.status.name
    if hasattr(obj, 'version') and obj.version:
        result += 'Version: %s\n' % obj.version.name
    if hasattr(obj, 'category') and obj.category:
        result += 'Category: %s\n' % obj.category
    if hasattr(obj, 'assignee') and obj.assignee:
        result += 'Assignee: %s\n' % (obj.assignee.name)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\nAttachments:\n'
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\nHistory:\n'
        for u in obj.history:
            result += '* %s%s - %s%s\n' % (self.BOLD, u.date, u.author.name, self.NC)
            for change in u.changes:
                result += ' - %s%s%s: %s -> %s\n' % (self.BOLD, change.field, self.NC, change.last, change.new)
            if u.message:
                result += html2text(u.message)
    return result
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name, a[rel=author]', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    embed_page = self.browser.readurl('http://www.dailymotion.com/embed/video/%s' % video.id)
    m = re.search('var info = ({.*?}),[^{"]', embed_page)
    if not m:
        raise BrokenPageError('Unable to find information about video')
    info = json.loads(m.group(1))
    for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url',
                'stream_h264_ld_url']:
        if info.get(key):
            max_quality = key
            break
    else:
        raise BrokenPageError(u'Unable to extract video URL')
    video.url = info[max_quality]
    video.set_empty_fields(NotAvailable)
    return video
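# Standalone sketch of the embed-page scraping above: the 'var info = {...}'
# blob is plain JSON, so one lazy regex plus json.loads() is enough. The
# embed_page string here is a made-up miniature, not real Dailymotion output.
import json
import re

embed_page = 'var info = {"stream_h264_hd_url": "http://example.com/hd.mp4", "stream_h264_url": "http://example.com/sd.mp4"}, foo;'
m = re.search('var info = ({.*?}),[^{"]', embed_page)
if m:
    info = json.loads(m.group(1))
    # highest available quality wins, in the same order as the real code
    for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']:
        if info.get(key):
            print(info[key])  # -> http://example.com/hd.mp4
            break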
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
        # TODO support videos from anyclip, cf http://www.dailymotion.com/video/xkyjiv for example
        if 'id' in script.attrib and script.attrib['id'].startswith('container_player_') and \
           script.find('script') is not None:
            text = script.find('script').text
            mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text)
            if mobj is None:
                mobj = re.search('"sdURL":.*?"(.*?)"', urllib.unquote(text))
                mediaURL = mobj.group(1).replace("\\", "")
            else:
                mediaURL = urllib.unquote(mobj.group(1))
            video.url = mediaURL
    video.set_empty_fields(NotAvailable)
    return video
def get_event(self, url, event):
    event.url = url
    header = self.document.getroot().xpath("//div[@class='pvi-hero-product']")[0]
    title = self.parser.select(header, "div[@class='d-rubric-inner']/h1", 1, method="xpath").text.strip()
    year = self.parser.select(header, "div[@class='d-rubric-inner']/small", 1, method="xpath").text.strip()
    _infos = self.parser.select(header, "ul[@class='pvi-product-specs']/li", method="xpath")
    infos = ""
    for li in _infos:
        infos += u"- %s\n" % self.parser.tocleanstring(li)
    section = self.document.getroot().xpath("//section[@class='pvi-productDetails']")[0]
    _infos = self.parser.select(section, "ul/li", method="xpath")
    for li in _infos:
        infos += u"- %s\n" % self.parser.tocleanstring(li)
    _resume = self.parser.select(section, "p[@data-rel='full-resume']", method="xpath")
    if not _resume:
        _resume = self.parser.select(section, "p[@data-rel='small-resume']", method="xpath")
        if _resume:
            resume = html2text(self.parser.tostring(_resume[0]))
        else:
            resume = ""
    else:
        _id = self.parser.select(_resume[0], "button", 1, method="xpath").attrib["data-sc-product-id"]
        resume = self.browser.get_resume(url, _id)
    event.description = u"%s %s\n\n%s\n\n%s" % (title, year, infos, resume)
    return event
def filter(self, el):
    _resume = el[0].xpath("p[@data-rel='full-resume']")
    if not _resume:
        _resume = el[0].xpath("p[@data-rel='small-resume']")
    if _resume:
        resume = html2text(CleanText(_resume[0])(self))[6:]
        return resume
def get_job_advert(self, url, advert): job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0] if not advert: title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content() num_id = url.split('-')[-1] advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id) advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content() description_content = self.document.getroot().xpath('//span[@class="summary"]')[0] advert.description = html2text(self.parser.tostring(description_content)) advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() advert.url = url date = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip() now = datetime.datetime.now() number = re.search("\d+", date) if number: if 'heures' in date: date = now - datetime.timedelta(hours=int(number.group(0))) advert.publication_date = date elif 'jour' in date: date = now - datetime.timedelta(days=int(number.group(0))) advert.publication_date = date return advert
def read_renew(self, id): for tr in self.document.getroot().xpath('//tr[@class="patFuncEntry"]'): if len(tr.xpath('td/input[@value="%s"]' % id)) > 0: message = self.browser.parser.tostring(tr.xpath('td[@class="patFuncStatus"]')[0]) renew = Renew(id) renew.message = html2text(message).replace('\n', '') return renew
def set_video_metadata(self, video):
    head = self.parser.select(self.document.getroot(), 'head', 1)
    video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
    video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()
    url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
    # remove the useless anti-caching
    url = re.sub('\?\d+', '', url)
    video.thumbnail = BaseImage(url)
    video.thumbnail.url = video.thumbnail.id
    try:
        parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
    except BrokenPageError:
        # it's probably a live, np.
        video.duration = NotAvailable
    else:
        if len(parts) == 1:
            seconds = parts[0]
            hours = minutes = 0
        elif len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % parts)
        video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    try:
        video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
    except BrokenPageError:
        video.description = u''
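# Standalone sketch of the duration handling above (the same branching also
# appears in iter_videos() further down): an 'H:M:S', 'M:S' or plain-seconds
# string becomes a timedelta, and anything else is rejected.
import datetime

def parse_duration(text):
    parts = text.strip().split(':')
    if len(parts) == 1:
        hours = minutes = 0
        seconds = parts[0]
    elif len(parts) == 2:
        hours = 0
        minutes, seconds = parts
    elif len(parts) == 3:
        hours, minutes, seconds = parts
    else:
        raise ValueError('Unable to parse duration %r' % text)
    return datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))

# parse_duration('1:02:03') -> timedelta of 3723 seconds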
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name, a[rel=author]', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
        # TODO support videos from anyclip, cf http://www.dailymotion.com/video/xkyjiv for example
        if 'id' in script.attrib and script.attrib['id'].startswith('container_player_') and \
           script.find('script') is not None:
            text = script.find('script').text
            mobj = re.search(r'\s*var flashvars = (.*)', text)
            if mobj is None:
                raise BrokenPageError('Unable to extract video url')
            flashvars = urllib.unquote(mobj.group(1))
            for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                if key in flashvars:
                    max_quality = key
                    break
            mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            if mobj is None:
                raise BrokenPageError('Unable to extract video url')
            video.url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
    video.set_empty_fields(NotAvailable)
    return video
def do_status(self, line):
    """
    status

    Display status information about a backend.
    """
    if len(line) > 0:
        backend_name = line
    else:
        backend_name = None

    results = {}
    for backend, field in self.do('get_account_status', backends=backend_name, caps=ICapAccount):
        if backend.name in results:
            results[backend.name].append(field)
        else:
            results[backend.name] = [field]

    for name, fields in results.iteritems():
        print ':: %s ::' % name
        for f in fields:
            if f.flags & f.FIELD_HTML:
                value = html2text(f.value)
            else:
                value = f.value
            print '%s: %s' % (f.label, value)
        print ''
def fill_gallery(self, gallery): gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0] try: gallery.original_title = self.document.xpath("//h1[@id='gj']/text()")[0] except IndexError: gallery.original_title = None description_div = self.document.xpath("//div[@id='gd71']")[0] description_html = self.parser.tostring(description_div) gallery.description = html2text(description_html) cardinality_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0] gallery.cardinality = int(re.match(r"\d+", cardinality_string).group(0)) date_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0] gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M") rating_string = self.document.xpath("//td[@id='rating_label']/text()")[0] rating_match = re.search(r"\d+\.\d+", rating_string) if rating_match is None: gallery.rating = None else: gallery.rating = float(rating_match.group(0)) gallery.rating_max = 5 try: thumbnail_url = self.document.xpath("//div[@class='gdtm']/a/img/attribute::src")[0] except IndexError: thumbnail_style = self.document.xpath("//div[@class='gdtm']/div/attribute::style")[0] thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1) gallery.thumbnail = BaseImage(thumbnail_url) gallery.thumbnail.url = gallery.thumbnail.id
def get_thread_mails(self):
    mails = {
        'member': {},
        'messages': [],
    }
    try:
        mails['member']['pseudo'] = self.parser.tocleanstring(self.document.getroot().cssselect('div#message_heading div.username span.name')[0])
    except IndexError:
        mails['member']['pseudo'] = 'Unknown'
    for li in reversed(self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')):
        try:
            txt = self.parser.tostring(li.xpath('.//div[@class="message_body"]')[0])
        except IndexError:
            continue  # 'Match' message
        txt = html2text(txt).strip()
        m = re.search(r'(\d+), ', li.xpath('.//span[@class="timestamp"]//script')[0].text)
        assert m
        date = local2utc(datetime.fromtimestamp(int(m.group(1))))
        id_from = li.find('a').attrib['href'].split('/')[-1].split('?')[0]
        mails['messages'].append({
            'date': date,
            'message': unicode(txt),
            'id_from': unicode(id_from),
        })
    return mails
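# Standalone sketch of the timestamp scraping above: the page's inline script
# carries an epoch integer, grabbed by the r'(\d+), ' pattern. The script text
# below is a made-up example; local2utc is the module's own helper that shifts
# the naive local datetime to UTC.
import re
from datetime import datetime

script_text = 'threadDate(1330000000, "d MMMM");'
m = re.search(r'(\d+), ', script_text)
if m:
    date = datetime.fromtimestamp(int(m.group(1)))  # naive local time; local2utc() in the real code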
def iter_videos(self):
    for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
        _id = div.attrib.get('data-id', None)
        if _id is None:
            self.browser.logger.warning('Unable to find the ID of a video')
            continue
        video = DailymotionVideo(_id)
        video.title = unicode(self.parser.select(div, 'h3 a', 1).text).strip()
        video.author = unicode(self.parser.select(div, 'div.dmpi_user_login', 1).find('a').find('span').text).strip()
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip() or unicode()
        try:
            parts = self.parser.select(div, 'div.duration', 1).text.split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % self.parser.select(div, 'div.duration', 1).text)
            video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
        url = unicode(self.parser.select(div, 'img.dmco_image', 1).attrib['data-src'])
        # remove the useless anti-caching
        url = re.sub('\?\d+', '', url)
        # use the bigger thumbnail
        url = url.replace('jpeg_preview_medium.jpg', 'jpeg_preview_large.jpg')
        video.thumbnail = Thumbnail(unicode(url))
        rating_div = self.parser.select(div, 'div.small_stars', 1)
        video.rating_max = self.get_rate(rating_div)
        video.rating = self.get_rate(rating_div.find('div'))
        video.set_empty_fields(NotAvailable, ('url',))
        yield video
def parse_profile(self, profile, consts):
    if profile['online']:
        self.status = Contact.STATUS_ONLINE
        self.status_msg = u'online'
        self.status_msg = u'since %s' % profile['last_cnx']
    else:
        self.status = Contact.STATUS_OFFLINE
        self.status_msg = u'last connection %s' % profile['last_cnx']

    self.summary = html2text(profile.get('announce', '')).strip().replace('\n\n', '\n')
    if len(profile.get('shopping_list', '')) > 0:
        self.summary += u'\n\nLooking for:\n%s' % html2text(profile['shopping_list']).strip().replace('\n\n', '\n')

    for photo in profile['pics']:
        self.set_photo(photo.split('/')[-1],
                       url=photo + '/full',
                       thumbnail_url=photo + '/small',
                       hidden=False)

    self.profile = OrderedDict()
    if 'sex' in profile:
        for section, d in self.TABLE.iteritems():
            flags = ProfileNode.SECTION
            if section.startswith('_'):
                flags |= ProfileNode.HEAD
            if (section.startswith('+') and int(profile['sex']) != 1) or \
               (section.startswith('-') and int(profile['sex']) != 0):
                continue
            section = section.lstrip('_+-')
            s = ProfileNode(section, section.capitalize(), OrderedDict(), flags=flags)
            for key, builder in d.iteritems():
                try:
                    value = builder.get_value(profile, consts[int(profile['sex'])])
                except KeyError:
                    pass
                else:
                    s.value[key] = ProfileNode(key, key.capitalize().replace('_', ' '), value)
            self.profile[section] = s

    self._aum_profile = profile
def fill_special_advert(self, advert, div):
    advert.title = u'%s' % self.parser.select(div, 'div[@class="poste"]', 1, method='xpath').text
    description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(description))
    titresmenuG = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
    contract_type = self.parser.select(titresmenuG, '//span[@itemprop="employmentType"]', method='xpath')
    if len(contract_type) != 0:
        advert.contract_type = u'%s' % contract_type[0].text_content()
    return self.fill_advert(advert, titresmenuG)
def get_video(self, video=None):
    if not video:
        video = ArteLiveVideo(self.group_dict['id'])
    div = self.document.xpath('//div[@class="bloc-presentation"]')[0]
    description = self.parser.select(div,
                                     'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]',
                                     1, method='xpath')
    video.description = html2text(self.parser.tostring(description))
    json_url = self.document.xpath('//div[@class="video-container"]')[0].attrib['arte_vp_url']
    return json_url, video
def test_content(self):
    urls = ['http://www.lefigaro.fr/international/2011/10/24/01003-20111024ARTFIG00704-les-islamo-conservateurs-maitres-du-jeu-tunisien.php',
            'http://www.lefigaro.fr/international/2012/01/29/01003-20120129ARTFIG00191-floride-la-primaire-suspendue-a-l-humeur-des-hispaniques.php']
    for url in urls:
        thread = self.backend.get_thread(url)
        assert len(thread.root.content)
        assert '<script' not in thread.root.content
        assert 'object' not in thread.root.content
        assert 'BFM' not in thread.root.content
        assert 'AUSSI' not in thread.root.content
        # no funny tags means html2text does not crash
        assert len(html2text(thread.root.content))
def fill_normal_advert(self, advert, div):
    advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
    description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(description))
    jobsummary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]
    contract_type = self.parser.select(jobsummary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
    if len(contract_type) != 0:
        advert.contract_type = u'%s' % contract_type[0].text_content()
    society_name = self.parser.select(jobsummary, '//span[@itemprop="name"]', method='xpath')
    if len(society_name) != 0:
        advert.society_name = u'%s' % society_name[0].text_content()
    return self.fill_advert(advert, jobsummary)
def get_job_advert(self, url, advert): content = self.document.getroot().xpath('//div[@id="offre-body"]')[0] if not advert: _id = self.parser.select(content, 'div/div/ul/li/div[@class="value"]/span', 1, method='xpath').text advert = PopolemploiJobAdvert(_id) advert.title = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() advert.job_name = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() description = self.parser.select(content, 'p[@itemprop="description"]', 1, method='xpath') advert.description = html2text(self.parser.tostring(description)) society_name = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath') if society_name: advert.society_name = u'%s' % society_name[0].text advert.url = url place = u'%s' % self.parser.select(content, 'dl/dd/ul/li[@itemprop="addressRegion"]', 1, method='xpath').text advert.place = place.strip() contract_type = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="employmentType"]', 1, method='xpath').text advert.contract_type = contract_type.strip() experience = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="experienceRequirements"]', 1, method='xpath').text advert.experience = experience.strip() formation = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="qualifications"]', 1, method='xpath').text advert.formation = formation.strip() pay = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="baseSalary"]', 1, method='xpath').text advert.pay = pay.strip() return advert
def format_obj(self, obj, alias): if hasattr(obj, "message") and obj.message: message = obj.message else: message = u"%s (%s)" % (obj.shop.name, obj.shop.location) result = u"%s%s%s\n" % (self.BOLD, message, self.NC) result += u"ID: %s\n" % obj.fullid result += u"Product: %s\n" % obj.product.name result += u"Cost: %s%s\n" % (obj.cost, obj.currency) if hasattr(obj, "date") and obj.date: result += u"Date: %s\n" % obj.date.strftime("%Y-%m-%d") result += u"\n%sShop:%s\n" % (self.BOLD, self.NC) result += u"\tName: %s\n" % obj.shop.name if obj.shop.location: result += u"\tLocation: %s\n" % obj.shop.location if obj.shop.info: result += u"\n\t" + html2text(obj.shop.info).replace("\n", "\n\t").strip() return result
def format_obj(self, obj, alias):
    if hasattr(obj, 'message') and obj.message:
        message = obj.message
    else:
        message = u'%s (%s)' % (obj.shop.name, obj.shop.location)
    result = u'%s%s%s\n' % (self.BOLD, message, self.NC)
    result += u'ID: %s\n' % obj.fullid
    result += u'Product: %s\n' % obj.product.name
    result += u'Cost: %s%s\n' % (obj.cost, obj.currency)
    if hasattr(obj, 'date') and obj.date:
        result += u'Date: %s\n' % obj.date.strftime('%Y-%m-%d')
    result += u'\n%sShop:%s\n' % (self.BOLD, self.NC)
    result += u'\tName: %s\n' % obj.shop.name
    if obj.shop.location:
        result += u'\tLocation: %s\n' % obj.shop.location
    if obj.shop.info:
        result += u'\n\t' + html2text(obj.shop.info).replace('\n', '\n\t').strip()
    return result
def fill_gallery(self, gallery): gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0] try: gallery.original_title = self.document.xpath( "//h1[@id='gj']/text()")[0] except IndexError: gallery.original_title = None description_div = self.document.xpath("//div[@id='gd71']")[0] description_html = self.parser.tostring(description_div) gallery.description = html2text(description_html) cardinality_string = self.document.xpath( "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()" )[0] gallery.cardinality = int( re.match(r"\d+", cardinality_string).group(0)) date_string = self.document.xpath( "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()" )[0] gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M") rating_string = self.document.xpath( "//td[@id='rating_label']/text()")[0] rating_match = re.search(r"\d+\.\d+", rating_string) if rating_match is None: gallery.rating = None else: gallery.rating = float(rating_match.group(0)) gallery.rating_max = 5 try: thumbnail_url = self.document.xpath( "//div[@class='gdtm']/a/img/attribute::src")[0] except IndexError: thumbnail_style = self.document.xpath( "//div[@class='gdtm']/div/attribute::style")[0] thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1) gallery.thumbnail = Thumbnail(unicode(thumbnail_url))
def get_job_advert(self, url, advert):
    re_id_title = re.compile('/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)', re.DOTALL)
    if advert is None:
        _id = u'%s/%s' % (re_id_title.search(url).group(1), re_id_title.search(url).group(2))
        advert = ApecJobAdvert(_id)
        advert.title = re_id_title.search(url).group(2).replace('-', ' ')
    description = self.document.getroot().xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0]
    advert.description = html2text(self.parser.tostring(description))
    advert.job_name = advert.title
    trs = self.document.getroot().xpath("//table[@class='noFieldsTable']/tr")
    for tr in trs:
        th = self.parser.select(tr, 'th', 1, method='xpath')
        td = self.parser.select(tr, 'td', 1, method='xpath')
        if u'Date de publication' in u'%s' % th.text_content():
            advert.publication_date = dateutil.parser.parse(td.text_content()).date()
        elif u'Société' in u'%s' % th.text_content() and not advert.society_name:
            society_name = td.text_content()
            a = self.parser.select(td, 'a', method='xpath')
            if a:
                advert.society_name = u'%s' % society_name.replace(a[0].text_content(), '').strip()
            else:
                advert.society_name = society_name.strip()
        elif u'Type de contrat' in u'%s' % th.text_content():
            advert.contract_type = u'%s' % td.text_content().strip()
        elif u'Lieu' in u'%s' % th.text_content():
            advert.place = u'%s' % td.text_content()
        elif u'Salaire' in u'%s' % th.text_content():
            advert.pay = u'%s' % td.text_content()
        elif u'Expérience' in u'%s' % th.text_content():
            advert.experience = u'%s' % td.text_content()
    advert.url = url
    return advert
def get_torrent(self, id):
    table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
    h2 = table.xpath('.//h2')
    if len(h2) > 0:
        title = u''.join([txt.strip() for txt in h2[0].itertext()])
    else:
        title = self.browser.parser.select(table, 'div.title_text', 1).text
    torrent = Torrent(id, title)
    if '.' in id:
        torrentid = id.split('.', 1)[1]
    else:
        torrentid = id
    table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
    if len(table) == 0:
        table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
        is_table = False
    else:
        table = table[0]
        is_table = True
    for tr in table.findall('tr' if is_table else 'div'):
        if is_table and 'group_torrent' in tr.attrib.get('class', ''):
            tds = tr.findall('td')
            if not len(tds) == 5:
                continue
            url = tds[0].find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tds[1].text.split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tds[3].text)
            torrent.leechers = int(tds[4].text)
            break
        elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                and tr.attrib.get('class', '').endswith('pad'):
            url = tr.cssselect('a[title=Download]')[0].attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tr.cssselect('div.details_title strong')[-1].text.strip('()').split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
            torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
            break
    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None
    div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
    for box in div.cssselect('div.box'):
        title = None
        body = None
        title_t = box.cssselect('div.head')
        if len(title_t) > 0:
            title_t = title_t[0]
            if title_t.find('strong') is not None:
                title_t = title_t.find('strong')
            if title_t.text is not None:
                title = title_t.text.strip()
        body_t = box.cssselect('div.body,div.desc')
        if body_t:
            body = html2text(self.parser.tostring(body_t[-1])).strip()
        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (title, body)
    divs = self.document.getroot().cssselect('div#files_%s,div#filelist_%s,tr#torrent_%s td'
                                             % (torrentid, torrentid, torrentid))
    if divs:
        torrent.files = []
        for div in divs:
            table = div.find('table')
            if table is None:
                continue
            for tr in table:
                if tr.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(tr.find('td').text)
    return torrent
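# Hypothetical stand-in for the get_bytes_size() helper used above, to show
# the intended conversion of a ('1.4', 'GB') pair into a byte count; the real
# helper's unit table and rounding may differ.
def get_bytes_size(size, unit):
    units = {'B': 1, 'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4}
    return size * units[unit]

# get_bytes_size(1.4, 'GB') -> 1503238553.6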
def send_email(self, backend, mail):
    domain = self.config.get('domain')
    recipient = self.config.get('recipient')

    reply_id = ''
    if mail.parent:
        reply_id = u'<%s.%s@%s>' % (backend.name, mail.parent.full_id, domain)
    subject = mail.title
    sender = u'"%s" <%s@%s>' % (mail.sender.replace('"', '""') if mail.sender else '',
                                backend.name, domain)
    # assume that .date is an UTC datetime
    date = formatdate(time.mktime(utc2local(mail.date).timetuple()), localtime=True)
    msg_id = u'<%s.%s@%s>' % (backend.name, mail.full_id, domain)

    if self.config.get('html') and mail.flags & mail.IS_HTML:
        body = mail.content
        content_type = 'html'
    else:
        if mail.flags & mail.IS_HTML:
            body = html2text(mail.content)
        else:
            body = mail.content
        content_type = 'plain'

    if body is None:
        body = ''

    if mail.signature:
        if self.config.get('html') and mail.flags & mail.IS_HTML:
            body += u'<p>-- <br />%s</p>' % mail.signature
        else:
            body += u'\n\n-- \n'
            if mail.flags & mail.IS_HTML:
                body += html2text(mail.signature)
            else:
                body += mail.signature

    # Header class is smart enough to try US-ASCII, then the charset we
    # provide, then fall back to UTF-8.
    header_charset = 'ISO-8859-1'

    # We must choose the body charset manually
    for body_charset in 'US-ASCII', 'ISO-8859-1', 'UTF-8':
        try:
            body.encode(body_charset)
        except UnicodeError:
            pass
        else:
            break

    # Split real name (which is optional) and email address parts
    sender_name, sender_addr = parseaddr(sender)
    recipient_name, recipient_addr = parseaddr(recipient)

    # We must always pass Unicode strings to Header, otherwise it will
    # use RFC 2047 encoding even on plain ASCII strings.
    sender_name = str(Header(unicode(sender_name), header_charset))
    recipient_name = str(Header(unicode(recipient_name), header_charset))

    # Make sure email addresses do not contain non-ASCII characters
    sender_addr = sender_addr.encode('ascii')
    recipient_addr = recipient_addr.encode('ascii')

    # Create the message ('plain' stands for Content-Type: text/plain)
    msg = MIMEText(body.encode(body_charset), content_type, body_charset)
    msg['From'] = formataddr((sender_name, sender_addr))
    msg['To'] = formataddr((recipient_name, recipient_addr))
    msg['Subject'] = Header(unicode(subject), header_charset)
    msg['Message-Id'] = msg_id
    msg['Date'] = date
    if reply_id:
        msg['In-Reply-To'] = reply_id

    self.logger.info('Send mail from <%s> to <%s>' % (sender, recipient))
    if len(self.config.get('pipe')) > 0:
        p = subprocess.Popen(self.config.get('pipe'),
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        p.stdin.write(msg.as_string())
        p.stdin.close()
        if p.wait() != 0:
            self.logger.error('Unable to deliver mail: %s' % p.stdout.read().strip())
            return False
    else:
        # Send the message via SMTP to localhost:25
        try:
            smtp = SMTP(self.config.get('smtp'))
            smtp.sendmail(sender, recipient, msg.as_string())
        except Exception as e:
            self.logger.error('Unable to deliver mail: %s' % e)
            return False
        else:
            smtp.quit()
    return True
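# Standalone sketch of the body-charset fallback above: pick the first
# charset that can actually encode the text, ending at UTF-8, which always
# can; the sample strings are illustrative.
def choose_body_charset(body):
    for body_charset in ('US-ASCII', 'ISO-8859-1', 'UTF-8'):
        try:
            body.encode(body_charset)
        except UnicodeError:
            continue
        return body_charset

# choose_body_charset(u'hello')  -> 'US-ASCII'
# choose_body_charset(u'héllo')  -> 'ISO-8859-1'
# choose_body_charset(u'h\u2014llo') -> 'UTF-8'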
def get_value(self, profile, consts):
    return html2text(unicode(profile[self.key])).strip()
def get_torrent(self, id): table = self.browser.parser.select(self.document.getroot(), "div.thin", 1) h2 = table.xpath(".//h2") if len(h2) > 0: title = u"".join([txt.strip() for txt in h2[0].itertext()]) else: title = self.browser.parser.select(table, "div.title_text", 1).text torrent = Torrent(id, title) if "." in id: torrentid = id.split(".", 1)[1] else: torrentid = id table = self.browser.parser.select(self.document.getroot(), "table.torrent_table") if len(table) == 0: table = self.browser.parser.select(self.document.getroot(), "div.main_column", 1) is_table = False else: table = table[0] is_table = True for tr in table.findall("tr" if is_table else "div"): if is_table and "group_torrent" in tr.attrib.get("class", ""): tds = tr.findall("td") if not len(tds) == 5: continue url = tds[0].find("span").find("a").attrib["href"] m = self.TORRENTID_REGEXP.match(url) if not m: warning("ID not found") continue if m.group(1) != torrentid: continue torrent.url = self.format_url(url) size, unit = tds[1].text.split() torrent.size = get_bytes_size(float(size.replace(",", "")), unit) torrent.seeders = int(tds[3].text) torrent.leechers = int(tds[4].text) break elif ( not is_table and tr.attrib.get("class", "").startswith("torrent_widget") and tr.attrib.get("class", "").endswith("pad") ): url = tr.cssselect("a[title=Download]")[0].attrib["href"] m = self.TORRENTID_REGEXP.match(url) if not m: warning("ID not found") continue if m.group(1) != torrentid: continue torrent.url = self.format_url(url) size, unit = tr.cssselect("div.details_title strong")[-1].text.strip("()").split() torrent.size = get_bytes_size(float(size.replace(",", "")), unit) torrent.seeders = int(tr.cssselect("img[title=Seeders]")[0].tail) torrent.leechers = int(tr.cssselect("img[title=Leechers]")[0].tail) break if not torrent.url: warning("Torrent %s not found in list" % torrentid) return None div = self.parser.select(self.document.getroot(), "div.main_column", 1) for box in div.cssselect("div.box"): title = None body = None title_t = box.cssselect("div.head") if len(title_t) > 0: title_t = title_t[0] if title_t.find("strong") is not None: title_t = title_t.find("strong") if title_t.text is not None: title = title_t.text.strip() body_t = box.cssselect("div.body,div.desc") if body_t: body = html2text(self.parser.tostring(body_t[-1])).strip() if title and body: if torrent.description is NotLoaded: torrent.description = u"" torrent.description += u"%s\n\n%s\n" % (title, body) divs = self.document.getroot().cssselect( "div#files_%s,div#filelist_%s,tr#torrent_%s td" % (torrentid, torrentid, torrentid) ) if divs: torrent.files = [] for div in divs: table = div.find("table") if table is None: continue for tr in table: if tr.attrib.get("class", None) != "colhead_dark": torrent.files.append(tr.find("td").text) return torrent
def test_lefigaro(self):
    l = list(self.backend.iter_threads())
    assert len(l)
    thread = self.backend.get_thread(l[0].id)
    assert len(thread.root.content)
    assert len(html2text(thread.root.content))