def get_job_advert(self, url, advert):
    re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
    if advert is None:
        _id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
        advert = RegionsJobAdvert(_id)
    advert.url = u'%s' % url
    div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
    advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
    content = self.parser.select(div, 'p', method='xpath')
    next_is_date = False
    next_is_pay = False
    description = ''
    for p in content:
        if next_is_date:
            # the publication date is in the paragraph following the "date_ref" one
            date = p.text_content().strip()
            m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
            if m:
                dd = int(m.group(1))
                mm = int(m.group(2))
                yyyy = int(m.group(3))
                advert.publication_date = datetime.date(yyyy, mm, dd)
            next_is_date = False
        elif next_is_pay:
            advert.pay = html2text(self.parser.tostring(p))
            next_is_pay = False
        elif 'class' in p.attrib:
            if p.attrib['class'] == 'contrat_loc':
                _p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
                content_p = _p.text_content().strip().split('\r\n')
                for el in content_p:
                    splitted_el = el.split(':')
                    if len(splitted_el) == 2:
                        if splitted_el[0] == 'Entreprise':
                            advert.society_name = splitted_el[1]
                        elif splitted_el[0] == 'Contrat':
                            advert.contract_type = splitted_el[1]
                        elif splitted_el[0] == 'Localisation':
                            advert.place = splitted_el[1]
            elif p.attrib['class'] == 'date_ref':
                next_is_date = True
            elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
                next_is_pay = True
            else:
                description = description + html2text(self.parser.tostring(p))
        else:
            description = description + html2text(self.parser.tostring(p))
    advert.description = u'%s' % description
    return advert
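# A minimal standalone sketch (not part of the module above) of how the id
# regex decomposes an offer URL into the '<site>|<offer number>' id used by
# RegionsJobAdvert; the sample URL is made up for illustration.
import re

RE_ID = re.compile(r'http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation')

def extract_advert_id(url):
    m = RE_ID.search(url)
    if m is None:
        return None
    # group(1) is the regional site host, group(2) the offer number
    return u'%s|%s' % (m.group(1), m.group(2))

# extract_advert_id('http://www.centrejob.com/offre_emploi/detailoffre.aspx?numoffre=42&de=consultation')
# -> u'www.centrejob.com|42'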
def format_obj(self, obj, alias):
    result = u'%sTitle:%s %s\n' % (self.BOLD, self.NC, obj.title)
    result += u'%sDate:%s %s\n' % (self.BOLD, self.NC, obj.date.strftime('%Y-%m-%d %H:%M'))
    result += u'%sFrom:%s %s\n' % (self.BOLD, self.NC, obj.sender)
    if hasattr(obj, 'receivers') and obj.receivers:
        result += u'%sTo:%s %s\n' % (self.BOLD, self.NC, ', '.join(obj.receivers))
    if obj.flags & Message.IS_HTML:
        content = html2text(obj.content)
    else:
        content = obj.content
    result += '\n%s' % content
    if obj.signature:
        if obj.flags & Message.IS_HTML:
            signature = html2text(obj.signature)
        else:
            signature = obj.signature
        result += '\n-- \n%s' % signature
    return result
def format_obj(self, obj, alias):
    result = u'%s %s %s %s %s\n' % (self.colored(obj.project.name, 'blue', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.fullid, 'red', 'bold'),
                                    self.colored(u'—', 'cyan', 'bold'),
                                    self.colored(obj.title, 'yellow', 'bold'))
    result += '\n%s\n\n' % obj.body
    result += self.format_key('Author', '%s (%s)' % (obj.author.name, obj.creation))
    result += self.format_attr(obj, 'status')
    result += self.format_attr(obj, 'version')
    result += self.format_attr(obj, 'category')
    result += self.format_attr(obj, 'assignee')
    if hasattr(obj, 'fields') and not empty(obj.fields):
        for key, value in obj.fields.iteritems():
            result += self.format_key(key.capitalize(), value)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\n%s\n' % self.colored('Attachments:', 'green')
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\n%s\n' % self.colored('History:', 'green')
        for u in obj.history:
            result += '%s %s %s %s\n' % (self.colored('*', 'red', 'bold'),
                                         self.colored(u.date, 'yellow', 'bold'),
                                         self.colored(u'—', 'cyan', 'bold'),
                                         self.colored(u.author.name, 'blue', 'bold'))
            for change in u.changes:
                result += ' - %s %s %s %s\n' % (self.colored(change.field, 'green'),
                                                change.last,
                                                self.colored('->', 'magenta'),
                                                change.new)
            if u.message:
                result += ' %s\n' % html2text(u.message).strip().replace('\n', '\n ')
    return result
def format_obj(self, obj, alias): result = u"%s%s - #%s - %s%s\n" % (self.BOLD, obj.project.name, obj.fullid, obj.title, self.NC) result += "\n%s\n\n" % obj.body result += "Author: %s (%s)\n" % (obj.author.name, obj.creation) if hasattr(obj, "status") and obj.status: result += "Status: %s\n" % obj.status.name if hasattr(obj, "version") and obj.version: result += "Version: %s\n" % obj.version.name if hasattr(obj, "category") and obj.category: result += "Category: %s\n" % obj.category if hasattr(obj, "assignee") and obj.assignee: result += "Assignee: %s\n" % (obj.assignee.name) if hasattr(obj, "attachments") and obj.attachments: result += "\nAttachments:\n" for a in obj.attachments: result += "* %s%s%s <%s>\n" % (self.BOLD, a.filename, self.NC, a.url) if hasattr(obj, "history") and obj.history: result += "\nHistory:\n" for u in obj.history: result += "* %s%s - %s%s\n" % (self.BOLD, u.date, u.author.name, self.NC) for change in u.changes: result += " - %s%s%s: %s -> %s\n" % (self.BOLD, change.field, self.NC, change.last, change.new) if u.message: result += html2text(u.message) return result
def get_job_advert(self, url, advert):
    re_id = re.compile('http://www.adecco.fr/trouver-un-emploi/Pages/Details-de-l-Offre/(.*?)/(.*?).aspx\?IOF=(.*?)$', re.DOTALL)
    if advert is None:
        _id = u'%s/%s/%s' % (re_id.search(url).group(1), re_id.search(url).group(2), re_id.search(url).group(3))
        advert = AdeccoJobAdvert(_id)
    advert.contract_type = re_id.search(url).group(1)
    div = self.document.getroot().xpath("//div[@class='contain_MoreResults']")[0]
    date = u'%s' % self.parser.select(div, "div[@class='dateResult']", 1, method='xpath').text.strip()
    m = re.match('(\d{2})\s(.*?)\s(\d{4})', date)
    if m:
        dd = int(m.group(1))
        mm = MONTHS.index(m.group(2)) + 1
        yyyy = int(m.group(3))
        advert.publication_date = datetime.date(yyyy, mm, dd)
    title = self.parser.select(div, "h1", 1, method='xpath').text_content().strip()
    town = self.parser.select(div, "h1/span/span[@class='town']", 1, method='xpath').text_content()
    page_title = self.parser.select(div, "h1/span[@class='pageTitle']", 1, method='xpath').text_content()
    advert.title = u'%s' % title.replace(town, '').replace(page_title, '')
    spans = self.document.getroot().xpath("//div[@class='jobGreyContain']/table/tr/td/span[@class='value']")
    advert.job_name = u'%s' % spans[0].text
    advert.place = u'%s' % spans[1].text
    advert.pay = u'%s' % spans[2].text
    advert.contract_type = u'%s' % spans[3].text
    advert.url = url
    description = self.document.getroot().xpath("//div[@class='descriptionContainer']/p")[0]
    advert.description = html2text(self.parser.tostring(description))
    return advert
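# Sketch of the month lookup used above, assuming MONTHS is the usual French
# month-name list the module imports; index() + 1 maps a name to its 1-based
# month number, so '12 mars 2013' becomes datetime.date(2013, 3, 12).
MONTHS = [u'janvier', u'février', u'mars', u'avril', u'mai', u'juin',
          u'juillet', u'août', u'septembre', u'octobre', u'novembre', u'décembre']

# MONTHS.index(u'mars') + 1 -> 3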
def format_obj(self, obj, alias):
    result = u'%s%s - #%s - %s%s\n' % (self.BOLD, obj.project.name, obj.fullid, obj.title, self.NC)
    result += '\n%s\n\n' % obj.body
    result += 'Author: %s (%s)\n' % (obj.author.name, obj.creation)
    if hasattr(obj, 'status') and obj.status:
        result += 'Status: %s\n' % obj.status.name
    if hasattr(obj, 'version') and obj.version:
        result += 'Version: %s\n' % obj.version.name
    if hasattr(obj, 'category') and obj.category:
        result += 'Category: %s\n' % obj.category
    if hasattr(obj, 'assignee') and obj.assignee:
        result += 'Assignee: %s\n' % (obj.assignee.name)
    if hasattr(obj, 'attachments') and obj.attachments:
        result += '\nAttachments:\n'
        for a in obj.attachments:
            result += '* %s%s%s <%s>\n' % (self.BOLD, a.filename, self.NC, a.url)
    if hasattr(obj, 'history') and obj.history:
        result += '\nHistory:\n'
        for u in obj.history:
            result += '* %s%s - %s%s\n' % (self.BOLD, u.date, u.author.name, self.NC)
            for change in u.changes:
                result += ' - %s%s%s: %s -> %s\n' % (self.BOLD, change.field, self.NC, change.last, change.new)
            if u.message:
                result += html2text(u.message)
    return result
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name, a[rel=author]', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    embed_page = self.browser.readurl('http://www.dailymotion.com/embed/video/%s' % video.id)
    m = re.search('var info = ({.*?}),[^{"]', embed_page)
    if not m:
        raise BrokenPageError('Unable to find information about video')
    info = json.loads(m.group(1))
    for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url',
                'stream_h264_ld_url']:
        if info.get(key):
            max_quality = key
            break
    else:
        raise BrokenPageError(u'Unable to extract video URL')
    video.url = info[max_quality]
    video.set_empty_fields(NotAvailable)
    return video
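# Standalone sketch of the embed-page scraping above: the 'var info = {...}'
# blob is plain JSON, so one lazy regex plus json.loads() is enough. The
# embed_page string here is a made-up miniature, not real Dailymotion output.
import json
import re

embed_page = 'var info = {"stream_h264_hd_url": "http://example.com/hd.mp4", "stream_h264_url": "http://example.com/sd.mp4"}, foo;'
m = re.search('var info = ({.*?}),[^{"]', embed_page)
if m:
    info = json.loads(m.group(1))
    # highest available quality wins, in the same order as the real code
    for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url']:
        if info.get(key):
            print(info[key])  # -> http://example.com/hd.mp4
            break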
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
        # TODO support videos from anyclip, cf http://www.dailymotion.com/video/xkyjiv for example
        if 'id' in script.attrib and script.attrib['id'].startswith('container_player_') and \
           script.find('script') is not None:
            text = script.find('script').text
            mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text)
            if mobj is None:
                mobj = re.search('"sdURL":.*?"(.*?)"', urllib.unquote(text))
                mediaURL = mobj.group(1).replace("\\", "")
            else:
                mediaURL = urllib.unquote(mobj.group(1))
            video.url = mediaURL
    video.set_empty_fields(NotAvailable)
    return video
def get_event(self, url, event):
    event.url = url
    header = self.document.getroot().xpath("//div[@class='pvi-hero-product']")[0]
    title = self.parser.select(header, "div[@class='d-rubric-inner']/h1", 1, method="xpath").text.strip()
    year = self.parser.select(header, "div[@class='d-rubric-inner']/small", 1, method="xpath").text.strip()
    _infos = self.parser.select(header, "ul[@class='pvi-product-specs']/li", method="xpath")
    infos = ""
    for li in _infos:
        infos += u"- %s\n" % self.parser.tocleanstring(li)
    section = self.document.getroot().xpath("//section[@class='pvi-productDetails']")[0]
    _infos = self.parser.select(section, "ul/li", method="xpath")
    for li in _infos:
        infos += u"- %s\n" % self.parser.tocleanstring(li)
    _resume = self.parser.select(section, "p[@data-rel='full-resume']", method="xpath")
    if not _resume:
        _resume = self.parser.select(section, "p[@data-rel='small-resume']", method="xpath")
        if _resume:
            resume = html2text(self.parser.tostring(_resume[0]))
        else:
            resume = ""
    else:
        _id = self.parser.select(_resume[0], "button", 1, method="xpath").attrib["data-sc-product-id"]
        resume = self.browser.get_resume(url, _id)
    event.description = u"%s %s\n\n%s\n\n%s" % (title, year, infos, resume)
    return event
def filter(self, el):
    _resume = el[0].xpath("p[@data-rel='full-resume']")
    if not _resume:
        _resume = el[0].xpath("p[@data-rel='small-resume']")
    if _resume:
        resume = html2text(CleanText(_resume[0])(self))[6:]
        return resume
def get_job_advert(self, url, advert): job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0] if not advert: title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content() num_id = url.split('-')[-1] advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id) advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content() description_content = self.document.getroot().xpath('//span[@class="summary"]')[0] advert.description = html2text(self.parser.tostring(description_content)) advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() advert.url = url date = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip() now = datetime.datetime.now() number = re.search("\d+", date) if number: if 'heures' in date: date = now - datetime.timedelta(hours=int(number.group(0))) advert.publication_date = date elif 'jour' in date: date = now - datetime.timedelta(days=int(number.group(0))) advert.publication_date = date return advert
def read_renew(self, id): for tr in self.document.getroot().xpath('//tr[@class="patFuncEntry"]'): if len(tr.xpath('td/input[@value="%s"]' % id)) > 0: message = self.browser.parser.tostring(tr.xpath('td[@class="patFuncStatus"]')[0]) renew = Renew(id) renew.message = html2text(message).replace('\n', '') return renew
def set_video_metadata(self, video):
    head = self.parser.select(self.document.getroot(), 'head', 1)
    video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
    video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()
    url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
    # remove the useless anti-caching
    url = re.sub('\?\d+', '', url)
    video.thumbnail = BaseImage(url)
    video.thumbnail.url = video.thumbnail.id
    try:
        parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
    except BrokenPageError:
        # it's probably a live, np.
        video.duration = NotAvailable
    else:
        if len(parts) == 1:
            seconds = parts[0]
            hours = minutes = 0
        elif len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % parts)
        video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    try:
        video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
    except BrokenPageError:
        video.description = u''
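# Standalone sketch of the duration handling above (the same branching also
# appears in iter_videos() further down): an 'H:M:S', 'M:S' or plain-seconds
# string becomes a timedelta, and anything else is rejected.
import datetime

def parse_duration(text):
    parts = text.strip().split(':')
    if len(parts) == 1:
        hours = minutes = 0
        seconds = parts[0]
    elif len(parts) == 2:
        hours = 0
        minutes, seconds = parts
    elif len(parts) == 3:
        hours, minutes, seconds = parts
    else:
        raise ValueError('Unable to parse duration %r' % text)
    return datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))

# parse_duration('1:02:03') -> timedelta of 3723 seconds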
def get_video(self, video=None):
    if video is None:
        video = DailymotionVideo(self.group_dict['id'])
    div = self.parser.select(self.document.getroot(), 'div#content', 1)
    video.title = unicode(self.parser.select(div, 'span.title', 1).text).strip()
    video.author = unicode(self.parser.select(div, 'a.name, span.name, a[rel=author]', 1).text).strip()
    try:
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div#video_description', 1))).strip() or unicode()
    except BrokenPageError:
        video.description = u''
    for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
        # TODO support videos from anyclip, cf http://www.dailymotion.com/video/xkyjiv for example
        if 'id' in script.attrib and script.attrib['id'].startswith('container_player_') and \
           script.find('script') is not None:
            text = script.find('script').text
            mobj = re.search(r'\s*var flashvars = (.*)', text)
            if mobj is None:
                raise BrokenPageError('Unable to extract video url')
            flashvars = urllib.unquote(mobj.group(1))
            for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                if key in flashvars:
                    max_quality = key
                    break
            mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            if mobj is None:
                raise BrokenPageError('Unable to extract video url')
            video.url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
    video.set_empty_fields(NotAvailable)
    return video
def do_status(self, line):
    """
    status

    Display status information about a backend.
    """
    if len(line) > 0:
        backend_name = line
    else:
        backend_name = None

    results = {}
    for backend, field in self.do('get_account_status', backends=backend_name, caps=ICapAccount):
        if backend.name in results:
            results[backend.name].append(field)
        else:
            results[backend.name] = [field]

    for name, fields in results.iteritems():
        print ':: %s ::' % name
        for f in fields:
            if f.flags & f.FIELD_HTML:
                value = html2text(f.value)
            else:
                value = f.value
            print '%s: %s' % (f.label, value)
        print ''
def fill_gallery(self, gallery): gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0] try: gallery.original_title = self.document.xpath("//h1[@id='gj']/text()")[0] except IndexError: gallery.original_title = None description_div = self.document.xpath("//div[@id='gd71']")[0] description_html = self.parser.tostring(description_div) gallery.description = html2text(description_html) cardinality_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0] gallery.cardinality = int(re.match(r"\d+", cardinality_string).group(0)) date_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0] gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M") rating_string = self.document.xpath("//td[@id='rating_label']/text()")[0] rating_match = re.search(r"\d+\.\d+", rating_string) if rating_match is None: gallery.rating = None else: gallery.rating = float(rating_match.group(0)) gallery.rating_max = 5 try: thumbnail_url = self.document.xpath("//div[@class='gdtm']/a/img/attribute::src")[0] except IndexError: thumbnail_style = self.document.xpath("//div[@class='gdtm']/div/attribute::style")[0] thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1) gallery.thumbnail = BaseImage(thumbnail_url) gallery.thumbnail.url = gallery.thumbnail.id
def get_thread_mails(self):
    mails = {
        'member': {},
        'messages': [],
    }
    try:
        mails['member']['pseudo'] = self.parser.tocleanstring(self.document.getroot().cssselect('div#message_heading div.username span.name')[0])
    except IndexError:
        mails['member']['pseudo'] = 'Unknown'
    for li in reversed(self.document.xpath('//ul[@id="thread"]//li[contains(@id, "message_")]')):
        try:
            txt = self.parser.tostring(li.xpath('.//div[@class="message_body"]')[0])
        except IndexError:
            continue  # 'Match' message
        txt = html2text(txt).strip()
        m = re.search(r'(\d+), ', li.xpath('.//span[@class="timestamp"]//script')[0].text)
        assert m
        date = local2utc(datetime.fromtimestamp(int(m.group(1))))
        id_from = li.find('a').attrib['href'].split('/')[-1].split('?')[0]
        mails['messages'].append({
            'date': date,
            'message': unicode(txt),
            'id_from': unicode(id_from),
        })
    return mails
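# Standalone sketch of the timestamp scraping above: the page's inline script
# carries an epoch integer, grabbed by the r'(\d+), ' pattern. The script text
# below is a made-up example; local2utc is the module's own helper that shifts
# the naive local datetime to UTC.
import re
from datetime import datetime

script_text = 'threadDate(1330000000, "d MMMM");'
m = re.search(r'(\d+), ', script_text)
if m:
    date = datetime.fromtimestamp(int(m.group(1)))  # naive local time; local2utc() in the real code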
def iter_videos(self):
    for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
        _id = div.attrib.get('data-id', None)
        if _id is None:
            self.browser.logger.warning('Unable to find the ID of a video')
            continue
        video = DailymotionVideo(_id)
        video.title = unicode(self.parser.select(div, 'h3 a', 1).text).strip()
        video.author = unicode(self.parser.select(div, 'div.dmpi_user_login', 1).find('a').find('span').text).strip()
        video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip() or unicode()
        try:
            parts = self.parser.select(div, 'div.duration', 1).text.split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % self.parser.select(div, 'div.duration', 1).text)
            video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
        url = unicode(self.parser.select(div, 'img.dmco_image', 1).attrib['data-src'])
        # remove the useless anti-caching
        url = re.sub('\?\d+', '', url)
        # use the bigger thumbnail
        url = url.replace('jpeg_preview_medium.jpg', 'jpeg_preview_large.jpg')
        video.thumbnail = Thumbnail(unicode(url))
        rating_div = self.parser.select(div, 'div.small_stars', 1)
        video.rating_max = self.get_rate(rating_div)
        video.rating = self.get_rate(rating_div.find('div'))
        video.set_empty_fields(NotAvailable, ('url',))
        yield video
def parse_profile(self, profile, consts):
    if profile['online']:
        self.status = Contact.STATUS_ONLINE
        self.status_msg = u'online'
        self.status_msg = u'since %s' % profile['last_cnx']
    else:
        self.status = Contact.STATUS_OFFLINE
        self.status_msg = u'last connection %s' % profile['last_cnx']

    self.summary = html2text(profile.get('announce', '')).strip().replace('\n\n', '\n')
    if len(profile.get('shopping_list', '')) > 0:
        self.summary += u'\n\nLooking for:\n%s' % html2text(profile['shopping_list']).strip().replace('\n\n', '\n')

    for photo in profile['pics']:
        self.set_photo(photo.split('/')[-1],
                       url=photo + '/full',
                       thumbnail_url=photo + '/small',
                       hidden=False)

    self.profile = OrderedDict()
    if 'sex' in profile:
        for section, d in self.TABLE.iteritems():
            flags = ProfileNode.SECTION
            if section.startswith('_'):
                flags |= ProfileNode.HEAD
            if (section.startswith('+') and int(profile['sex']) != 1) or \
               (section.startswith('-') and int(profile['sex']) != 0):
                continue
            section = section.lstrip('_+-')
            s = ProfileNode(section, section.capitalize(), OrderedDict(), flags=flags)
            for key, builder in d.iteritems():
                try:
                    value = builder.get_value(profile, consts[int(profile['sex'])])
                except KeyError:
                    pass
                else:
                    s.value[key] = ProfileNode(key, key.capitalize().replace('_', ' '), value)
            self.profile[section] = s

    self._aum_profile = profile
def fill_special_advert(self, advert, div):
    advert.title = u'%s' % self.parser.select(div, 'div[@class="poste"]', 1, method='xpath').text
    description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(description))
    titresmenuG = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
    contract_type = self.parser.select(titresmenuG, '//span[@itemprop="employmentType"]', method='xpath')
    if len(contract_type) != 0:
        advert.contract_type = u'%s' % contract_type[0].text_content()
    return self.fill_advert(advert, titresmenuG)
def get_video(self, video=None):
    if not video:
        video = ArteLiveVideo(self.group_dict['id'])
    div = self.document.xpath('//div[@class="bloc-presentation"]')[0]
    description = self.parser.select(div,
                                     'div[@class="field field-name-body field-type-text-with-summary field-label-hidden bloc-rte"]',
                                     1, method='xpath')
    video.description = html2text(self.parser.tostring(description))
    json_url = self.document.xpath('//div[@class="video-container"]')[0].attrib['arte_vp_url']
    return json_url, video
def test_content(self):
    urls = ['http://www.lefigaro.fr/international/2011/10/24/01003-20111024ARTFIG00704-les-islamo-conservateurs-maitres-du-jeu-tunisien.php',
            'http://www.lefigaro.fr/international/2012/01/29/01003-20120129ARTFIG00191-floride-la-primaire-suspendue-a-l-humeur-des-hispaniques.php']
    for url in urls:
        thread = self.backend.get_thread(url)
        assert len(thread.root.content)
        assert '<script' not in thread.root.content
        assert 'object' not in thread.root.content
        assert 'BFM' not in thread.root.content
        assert 'AUSSI' not in thread.root.content
        # no funny tags means html2text does not crash
        assert len(html2text(thread.root.content))
def fill_normal_advert(self, advert, div):
    advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
    description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
    advert.description = html2text(self.parser.tostring(description))
    jobsummary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]
    contract_type = self.parser.select(jobsummary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
    if len(contract_type) != 0:
        advert.contract_type = u'%s' % contract_type[0].text_content()
    society_name = self.parser.select(jobsummary, '//span[@itemprop="name"]', method='xpath')
    if len(society_name) != 0:
        advert.society_name = u'%s' % society_name[0].text_content()
    return self.fill_advert(advert, jobsummary)
def get_job_advert(self, url, advert): content = self.document.getroot().xpath('//div[@id="offre-body"]')[0] if not advert: _id = self.parser.select(content, 'div/div/ul/li/div[@class="value"]/span', 1, method='xpath').text advert = PopolemploiJobAdvert(_id) advert.title = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() advert.job_name = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() description = self.parser.select(content, 'p[@itemprop="description"]', 1, method='xpath') advert.description = html2text(self.parser.tostring(description)) society_name = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath') if society_name: advert.society_name = u'%s' % society_name[0].text advert.url = url place = u'%s' % self.parser.select(content, 'dl/dd/ul/li[@itemprop="addressRegion"]', 1, method='xpath').text advert.place = place.strip() contract_type = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="employmentType"]', 1, method='xpath').text advert.contract_type = contract_type.strip() experience = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="experienceRequirements"]', 1, method='xpath').text advert.experience = experience.strip() formation = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="qualifications"]', 1, method='xpath').text advert.formation = formation.strip() pay = u'%s' % self.parser.select(content, 'dl/dd/span[@itemprop="baseSalary"]', 1, method='xpath').text advert.pay = pay.strip() return advert
def format_obj(self, obj, alias): if hasattr(obj, "message") and obj.message: message = obj.message else: message = u"%s (%s)" % (obj.shop.name, obj.shop.location) result = u"%s%s%s\n" % (self.BOLD, message, self.NC) result += u"ID: %s\n" % obj.fullid result += u"Product: %s\n" % obj.product.name result += u"Cost: %s%s\n" % (obj.cost, obj.currency) if hasattr(obj, "date") and obj.date: result += u"Date: %s\n" % obj.date.strftime("%Y-%m-%d") result += u"\n%sShop:%s\n" % (self.BOLD, self.NC) result += u"\tName: %s\n" % obj.shop.name if obj.shop.location: result += u"\tLocation: %s\n" % obj.shop.location if obj.shop.info: result += u"\n\t" + html2text(obj.shop.info).replace("\n", "\n\t").strip() return result
def format_obj(self, obj, alias):
    if hasattr(obj, 'message') and obj.message:
        message = obj.message
    else:
        message = u'%s (%s)' % (obj.shop.name, obj.shop.location)
    result = u'%s%s%s\n' % (self.BOLD, message, self.NC)
    result += u'ID: %s\n' % obj.fullid
    result += u'Product: %s\n' % obj.product.name
    result += u'Cost: %s%s\n' % (obj.cost, obj.currency)
    if hasattr(obj, 'date') and obj.date:
        result += u'Date: %s\n' % obj.date.strftime('%Y-%m-%d')
    result += u'\n%sShop:%s\n' % (self.BOLD, self.NC)
    result += u'\tName: %s\n' % obj.shop.name
    if obj.shop.location:
        result += u'\tLocation: %s\n' % obj.shop.location
    if obj.shop.info:
        result += u'\n\t' + html2text(obj.shop.info).replace('\n', '\n\t').strip()
    return result
def fill_gallery(self, gallery): gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0] try: gallery.original_title = self.document.xpath( "//h1[@id='gj']/text()")[0] except IndexError: gallery.original_title = None description_div = self.document.xpath("//div[@id='gd71']")[0] description_html = self.parser.tostring(description_div) gallery.description = html2text(description_html) cardinality_string = self.document.xpath( "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()" )[0] gallery.cardinality = int( re.match(r"\d+", cardinality_string).group(0)) date_string = self.document.xpath( "//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()" )[0] gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M") rating_string = self.document.xpath( "//td[@id='rating_label']/text()")[0] rating_match = re.search(r"\d+\.\d+", rating_string) if rating_match is None: gallery.rating = None else: gallery.rating = float(rating_match.group(0)) gallery.rating_max = 5 try: thumbnail_url = self.document.xpath( "//div[@class='gdtm']/a/img/attribute::src")[0] except IndexError: thumbnail_style = self.document.xpath( "//div[@class='gdtm']/div/attribute::style")[0] thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1) gallery.thumbnail = Thumbnail(unicode(thumbnail_url))
def get_job_advert(self, url, advert):
    re_id_title = re.compile('/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)', re.DOTALL)
    if advert is None:
        _id = u'%s/%s' % (re_id_title.search(url).group(1), re_id_title.search(url).group(2))
        advert = ApecJobAdvert(_id)
        advert.title = re_id_title.search(url).group(2).replace('-', ' ')
    description = self.document.getroot().xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0]
    advert.description = html2text(self.parser.tostring(description))
    advert.job_name = advert.title
    trs = self.document.getroot().xpath("//table[@class='noFieldsTable']/tr")
    for tr in trs:
        th = self.parser.select(tr, 'th', 1, method='xpath')
        td = self.parser.select(tr, 'td', 1, method='xpath')
        if u'Date de publication' in u'%s' % th.text_content():
            advert.publication_date = dateutil.parser.parse(td.text_content()).date()
        elif u'Société' in u'%s' % th.text_content() and not advert.society_name:
            society_name = td.text_content()
            a = self.parser.select(td, 'a', method='xpath')
            if a:
                advert.society_name = u'%s' % society_name.replace(a[0].text_content(), '').strip()
            else:
                advert.society_name = society_name.strip()
        elif u'Type de contrat' in u'%s' % th.text_content():
            advert.contract_type = u'%s' % td.text_content().strip()
        elif u'Lieu' in u'%s' % th.text_content():
            advert.place = u'%s' % td.text_content()
        elif u'Salaire' in u'%s' % th.text_content():
            advert.pay = u'%s' % td.text_content()
        elif u'Expérience' in u'%s' % th.text_content():
            advert.experience = u'%s' % td.text_content()
    advert.url = url
    return advert
def get_torrent(self, id):
    table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
    h2 = table.xpath('.//h2')
    if len(h2) > 0:
        title = u''.join([txt.strip() for txt in h2[0].itertext()])
    else:
        title = self.browser.parser.select(table, 'div.title_text', 1).text
    torrent = Torrent(id, title)
    if '.' in id:
        torrentid = id.split('.', 1)[1]
    else:
        torrentid = id
    table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
    if len(table) == 0:
        table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
        is_table = False
    else:
        table = table[0]
        is_table = True
    for tr in table.findall('tr' if is_table else 'div'):
        if is_table and 'group_torrent' in tr.attrib.get('class', ''):
            tds = tr.findall('td')
            if not len(tds) == 5:
                continue
            url = tds[0].find('span').find('a').attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tds[1].text.split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tds[3].text)
            torrent.leechers = int(tds[4].text)
            break
        elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') \
                and tr.attrib.get('class', '').endswith('pad'):
            url = tr.cssselect('a[title=Download]')[0].attrib['href']
            m = self.TORRENTID_REGEXP.match(url)
            if not m:
                warning('ID not found')
                continue
            if m.group(1) != torrentid:
                continue
            torrent.url = self.format_url(url)
            size, unit = tr.cssselect('div.details_title strong')[-1].text.strip('()').split()
            torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
            torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
            torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
            break
    if not torrent.url:
        warning('Torrent %s not found in list' % torrentid)
        return None
    div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
    for box in div.cssselect('div.box'):
        title = None
        body = None
        title_t = box.cssselect('div.head')
        if len(title_t) > 0:
            title_t = title_t[0]
            if title_t.find('strong') is not None:
                title_t = title_t.find('strong')
            if title_t.text is not None:
                title = title_t.text.strip()
        body_t = box.cssselect('div.body,div.desc')
        if body_t:
            body = html2text(self.parser.tostring(body_t[-1])).strip()
        if title and body:
            if torrent.description is NotLoaded:
                torrent.description = u''
            torrent.description += u'%s\n\n%s\n' % (title, body)
    divs = self.document.getroot().cssselect('div#files_%s,div#filelist_%s,tr#torrent_%s td'
                                             % (torrentid, torrentid, torrentid))
    if divs:
        torrent.files = []
        for div in divs:
            table = div.find('table')
            if table is None:
                continue
            for tr in table:
                if tr.attrib.get('class', None) != 'colhead_dark':
                    torrent.files.append(tr.find('td').text)
    return torrent
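# Hypothetical stand-in for the get_bytes_size() helper used above, to show
# the intended conversion of a ('1.4', 'GB') pair into a byte count; the real
# helper's unit table and rounding may differ.
def get_bytes_size(size, unit):
    units = {'B': 1, 'KB': 1024, 'MB': 1024 ** 2, 'GB': 1024 ** 3, 'TB': 1024 ** 4}
    return size * units[unit]

# get_bytes_size(1.4, 'GB') -> 1503238553.6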
def send_email(self, backend, mail):
    domain = self.config.get('domain')
    recipient = self.config.get('recipient')

    reply_id = ''
    if mail.parent:
        reply_id = u'<%s.%s@%s>' % (backend.name, mail.parent.full_id, domain)
    subject = mail.title
    sender = u'"%s" <%s@%s>' % (mail.sender.replace('"', '""') if mail.sender else '',
                                backend.name, domain)
    # assume that .date is an UTC datetime
    date = formatdate(time.mktime(utc2local(mail.date).timetuple()), localtime=True)
    msg_id = u'<%s.%s@%s>' % (backend.name, mail.full_id, domain)

    if self.config.get('html') and mail.flags & mail.IS_HTML:
        body = mail.content
        content_type = 'html'
    else:
        if mail.flags & mail.IS_HTML:
            body = html2text(mail.content)
        else:
            body = mail.content
        content_type = 'plain'

    if body is None:
        body = ''

    if mail.signature:
        if self.config.get('html') and mail.flags & mail.IS_HTML:
            body += u'<p>-- <br />%s</p>' % mail.signature
        else:
            body += u'\n\n-- \n'
            if mail.flags & mail.IS_HTML:
                body += html2text(mail.signature)
            else:
                body += mail.signature

    # Header class is smart enough to try US-ASCII, then the charset we
    # provide, then fall back to UTF-8.
    header_charset = 'ISO-8859-1'

    # We must choose the body charset manually
    for body_charset in 'US-ASCII', 'ISO-8859-1', 'UTF-8':
        try:
            body.encode(body_charset)
        except UnicodeError:
            pass
        else:
            break

    # Split real name (which is optional) and email address parts
    sender_name, sender_addr = parseaddr(sender)
    recipient_name, recipient_addr = parseaddr(recipient)

    # We must always pass Unicode strings to Header, otherwise it will
    # use RFC 2047 encoding even on plain ASCII strings.
    sender_name = str(Header(unicode(sender_name), header_charset))
    recipient_name = str(Header(unicode(recipient_name), header_charset))

    # Make sure email addresses do not contain non-ASCII characters
    sender_addr = sender_addr.encode('ascii')
    recipient_addr = recipient_addr.encode('ascii')

    # Create the message ('plain' stands for Content-Type: text/plain)
    msg = MIMEText(body.encode(body_charset), content_type, body_charset)
    msg['From'] = formataddr((sender_name, sender_addr))
    msg['To'] = formataddr((recipient_name, recipient_addr))
    msg['Subject'] = Header(unicode(subject), header_charset)
    msg['Message-Id'] = msg_id
    msg['Date'] = date
    if reply_id:
        msg['In-Reply-To'] = reply_id

    self.logger.info('Send mail from <%s> to <%s>' % (sender, recipient))
    if len(self.config.get('pipe')) > 0:
        p = subprocess.Popen(self.config.get('pipe'),
                             shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        p.stdin.write(msg.as_string())
        p.stdin.close()
        if p.wait() != 0:
            self.logger.error('Unable to deliver mail: %s' % p.stdout.read().strip())
            return False
    else:
        # Send the message via SMTP to localhost:25
        try:
            smtp = SMTP(self.config.get('smtp'))
            smtp.sendmail(sender, recipient, msg.as_string())
        except Exception as e:
            self.logger.error('Unable to deliver mail: %s' % e)
            return False
        else:
            smtp.quit()
    return True
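# Standalone sketch of the body-charset fallback above: pick the first
# charset that can actually encode the text, ending at UTF-8, which always
# can; the sample strings are illustrative.
def choose_body_charset(body):
    for body_charset in ('US-ASCII', 'ISO-8859-1', 'UTF-8'):
        try:
            body.encode(body_charset)
        except UnicodeError:
            continue
        return body_charset

# choose_body_charset(u'hello')  -> 'US-ASCII'
# choose_body_charset(u'héllo')  -> 'ISO-8859-1'
# choose_body_charset(u'h\u2014llo') -> 'UTF-8'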
def get_value(self, profile, consts):
    return html2text(unicode(profile[self.key])).strip()
def get_torrent(self, id): table = self.browser.parser.select(self.document.getroot(), "div.thin", 1) h2 = table.xpath(".//h2") if len(h2) > 0: title = u"".join([txt.strip() for txt in h2[0].itertext()]) else: title = self.browser.parser.select(table, "div.title_text", 1).text torrent = Torrent(id, title) if "." in id: torrentid = id.split(".", 1)[1] else: torrentid = id table = self.browser.parser.select(self.document.getroot(), "table.torrent_table") if len(table) == 0: table = self.browser.parser.select(self.document.getroot(), "div.main_column", 1) is_table = False else: table = table[0] is_table = True for tr in table.findall("tr" if is_table else "div"): if is_table and "group_torrent" in tr.attrib.get("class", ""): tds = tr.findall("td") if not len(tds) == 5: continue url = tds[0].find("span").find("a").attrib["href"] m = self.TORRENTID_REGEXP.match(url) if not m: warning("ID not found") continue if m.group(1) != torrentid: continue torrent.url = self.format_url(url) size, unit = tds[1].text.split() torrent.size = get_bytes_size(float(size.replace(",", "")), unit) torrent.seeders = int(tds[3].text) torrent.leechers = int(tds[4].text) break elif ( not is_table and tr.attrib.get("class", "").startswith("torrent_widget") and tr.attrib.get("class", "").endswith("pad") ): url = tr.cssselect("a[title=Download]")[0].attrib["href"] m = self.TORRENTID_REGEXP.match(url) if not m: warning("ID not found") continue if m.group(1) != torrentid: continue torrent.url = self.format_url(url) size, unit = tr.cssselect("div.details_title strong")[-1].text.strip("()").split() torrent.size = get_bytes_size(float(size.replace(",", "")), unit) torrent.seeders = int(tr.cssselect("img[title=Seeders]")[0].tail) torrent.leechers = int(tr.cssselect("img[title=Leechers]")[0].tail) break if not torrent.url: warning("Torrent %s not found in list" % torrentid) return None div = self.parser.select(self.document.getroot(), "div.main_column", 1) for box in div.cssselect("div.box"): title = None body = None title_t = box.cssselect("div.head") if len(title_t) > 0: title_t = title_t[0] if title_t.find("strong") is not None: title_t = title_t.find("strong") if title_t.text is not None: title = title_t.text.strip() body_t = box.cssselect("div.body,div.desc") if body_t: body = html2text(self.parser.tostring(body_t[-1])).strip() if title and body: if torrent.description is NotLoaded: torrent.description = u"" torrent.description += u"%s\n\n%s\n" % (title, body) divs = self.document.getroot().cssselect( "div#files_%s,div#filelist_%s,tr#torrent_%s td" % (torrentid, torrentid, torrentid) ) if divs: torrent.files = [] for div in divs: table = div.find("table") if table is None: continue for tr in table: if tr.attrib.get("class", None) != "colhead_dark": torrent.files.append(tr.find("td").text) return torrent
def test_lefigaro(self):
    l = list(self.backend.iter_threads())
    assert len(l)
    thread = self.backend.get_thread(l[0].id)
    assert len(thread.root.content)
    assert len(html2text(thread.root.content))