Python BeautifulSoup.ICantBelieveItsBeautifulSoup示例

编程语言: Python

类/类型: BeautifulSoup

方法/功能: ICantBelieveItsBeautifulSoup

hotexamples.com的示例: 3

Python BeautifulSoup.ICantBelieveItsBeautifulSoup - 已找到3个示例。以下是从开源项目中提取的、评价最高的 BeautifulSoup.ICantBelieveItsBeautifulSoup 真实 Python 示例（来自程序包 Tautulli）。您可以对示例进行评价，以帮助我们提高示例质量。

常用方法

显示隐藏

BeautifulSoup(30)

BeautifulStoneSoup(30)

SoupStrainer(28)

NavigableString(16)

BeautifulSOAP(7)

ICantBelieveItsBeautifulSoup(3)

RobustHTMLParser(1)

get_starttag_text(1)

示例#1

显示文件

文件： semail.py 项目： webjay/fskintra

def nicehtml(html):
    """Return a prettified unicode rendering of an HTML fragment.

    If the fragment contains an ``<li>`` tag that is not preceded by an
    ``<ul>``/``<ol>`` opening tag, ``</p><ul>`` is inserted right before
    that first ``<li>`` so the item ends up inside a list before the markup
    is handed to BeautifulSoup.

    The result is ``prettify()`` output decoded from UTF-8.
    """
    # \b makes sure we only match real list tags: without it '<li' also
    # matches e.g. '<link ...>' and a bogus '</p><ul>' would be injected.
    list_open = re.search(r'(?i)<[ou]l\b', html)
    first_item = re.search(r'(?i)<li\b', html)
    if first_item and (
            not list_open or first_item.start() < list_open.start()):
        # the first <li> is not wrapped in a list - open one just before it
        pos = first_item.start()
        html = html[:pos] + '</p><ul>' + html[pos:]
    soup = BeautifulSoup.ICantBelieveItsBeautifulSoup(html)
    return soup.prettify().decode('utf-8')

示例#2

显示文件

文件： semail.py 项目： webjay/fskintra

    def asEmail(self):
        if self._email:
            return self._email
        self.prepareMessage()
        hostname = socket.getfqdn()  # used below in a few places

        mpp = self.mp.copy()

        def wrapOrZap(key, title):
            val = self.mp.get(key, None)
            if val:
                mpp[key] = "<p class='%s' style='margin: 0;'>%s: %s</p>\n"
                mpp[key] %= (key, title, val)
            else:
                mpp[key] = ''

        wrapOrZap('sender', 'Fra')
        wrapOrZap('recipient', 'Til')
        wrapOrZap('cc', 'Kopi til')
        if mpp.get('time', None):
            mpp['ttime'] = u' ' + mpp['time']
        else:
            mpp['ttime'] = u''

        # create initial HTML version
        html = u'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
  <title>%(title)s</title>
</head>
<body style='font-family: Verdana,Arial,Helvetica'>
<h1>%(title)s</h1>
<div class='meta' style='background-color: #eaeaea; color: #000; padding: 5px; margin: 0 0 10px 0;'>
%(sender)s%(recipient)s%(cc)s  <p class='date' style='margin: 0;'>Dato: %(date)s%(ttime)s</p>
</div>
<div class='text'>
  %(nicehtml)s
</div>
</body>
</html>
'''
        html %= mpp
        html = BeautifulSoup.ICantBelieveItsBeautifulSoup(html)

        # first look for inline images (if any)
        # iimags: mapping from URL to (cid, binary string contents)
        iimgs = {}
        for imgtag in html.findAll('img'):
            if not imgtag.has_key('src'):
                continue  # ignore
            url = imgtag['src']
            if url.lower().startswith('data:'):
                # ignore 'inline' images
                continue
            elif not url:
                # ignore empty URLs
                continue
            if url not in iimgs:
                try:
                    data = surllib.skoleGetURL(url, False)
                except urllib2.URLError:
                    # could not fetch URL for some reason - ignore
                    continue
                # is this actually an image?
                if not imghdr.what('', data):
                    continue  # ignore
                cid = 'image%d-%f@%s' % (len(iimgs) + 1, time.time(), hostname)
                iimgs[url] = (cid, data)
            cid, _ = iimgs[url]

            imgtag['src'] = 'cid:' + cid

        # next, handle attachments
        # attachments: email attachments ready for attachment :)
        attachments = []
        for atag in html.findAll('a'):
            try:
                url = atag['href']
            except KeyError:
                atag.replaceWithChildren()  # kill the "broken" link
                continue
            url = atag['href']
            if 'Tilmelding/Oversigt.asp' in url:
                atag.replaceWithChildren()  # kill link
                continue
            if url.startswith('/') or config.HOSTNAME in url:  # onsite!
                data = None
                try:
                    data = surllib.skoleGetURL(url, False)
                except:
                    # unable to fetch URL
                    config.log(
                        u'%s: Kan ikke hente flg. URL: %s' %
                        (self.mp['title'] if self.mp['title'] else self, url))
                if data:
                    if atag.has_key('usefilename'):
                        usefilename = atag['usefilename']
                    else:
                        usefilename = None
                    eatt = generateMIMEAttachment(url, data, usefilename)
                    attachments.append(eatt)
                    atag.replaceWithChildren()  # kill the actual link

        # now, put the pieces together
        html = html.prettify().decode('utf-8')
        msgHtml = MIMEText(html, 'html', 'utf-8')
        if not iimgs and not attachments:
            # pure HTML version
            msg = msgHtml
        else:
            # inline images but no attachments
            #   multipart/related
            #     text/html with html text
            #     image/xxx with inline images
            # OR
            # email with inline images + attachment
            #   multipart/mixed
            #     text/html med html udgave
            #     image/gif med billede
            #     application/xxx with word document
            if attachments:
                msg = MIMEMultipart('mixed', type='text/html')
            else:
                msg = MIMEMultipart('related', type='text/html')
            del msgHtml['MIME-Version']
            msg.attach(msgHtml)

            # attach images if any
            for (url, (cid, data)) in iimgs.items():
                m = MIMEImage(data)
                m.add_header('Content-ID', '<%s>' % cid)
                fn = os.path.basename(url).encode('utf-8')
                m.add_header('Content-Disposition',
                             'inline',
                             filename=('utf-8', '', fn))

                del m['MIME-Version']
                msg.attach(m)

            # attach attachments if any
            for attachment in attachments:
                del attachment['MIME-Version']
                msg.attach(attachment)

        # now for the general headers
        dt = self.mp['date']
        if self.mp.get('time', None):
            dt += ' ' + self.mp['time']
        else:
            if dt == time.strftime('%d-%m-%Y'):  # today
                ts = time.strftime('%H:%M:%S')
                if ts > '12:00:00':
                    ts = '12:00:00'
                dt += ' ' + ts
            else:
                dt += ' 12:00:00'
        dt = time.strptime(dt, '%d-%m-%Y %H:%M:%S')
        dt = email.utils.formatdate(time.mktime(dt), True)
        msg['Received'] = ('from %s ([127.0.0.1] helo=%s) '
                           'by %s with smtp (fskintra) for %s; %s') % (
                               hostname, hostname, hostname, config.EMAIL, dt)
        msg['Date'] = dt

        title = self.mp['title']
        if self.mp['childname']:
            title = u'[%s] %s' % (self.mp['childname'], title)
        msg['Subject'] = Header(title, 'utf-8', 60)
        if 'sender' in self.mp and self.mp['sender']:
            sender = u'Skoleintra - %s' % self.mp['sender']
        else:
            sender = u'Skoleintra'
        sender = headerEncodeField(sender) + u' <%s>' % config.SENDER
        msg['From'] = sender
        msg['To'] = config.EMAIL

        # other tags just for ourselves
        keys = 'mid,md5'.split(',')
        for key in keys:
            if self.mp.get(key, None):
                kkey = 'X-skoleintra-%s' % key
                msg[kkey] = Header(self.mp[key], 'utf-8', header_name=kkey)

        self._email = msg
        return msg

示例#3

显示文件

    def asEmail(self):
        if self._email:
            return self._email
        self.prepareMessage()
        hostname = socket.getfqdn()  # used below in a few places

        mpp = self.mp.copy()

        def wrapOrZap(key, title):
            val = self.mp.get(key, None)
            if val:
                mpp[key] = "<p class='%s' style='margin: 0;'>%s: %s</p>\n"
                mpp[key] %= (key, title, val)
            else:
                mpp[key] = ''

        wrapOrZap('sender', 'Fra')
        wrapOrZap('recipient', 'Til')
        if mpp.get('time', None):
            mpp['ttime'] = u' ' + mpp['time']
        else:
            mpp['ttime'] = u''

        # create initial HTML version
        html = u'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
  <title>%(title)s</title>
</head>
<body style='font-family: Verdana,Arial,Helvetica'>
<h1>%(title)s</h1>
<div class='meta' style='background-color: #eaeaea; color: #000; padding: 5px; margin: 0 0 10px 0;'>
%(sender)s%(recipient)s  <p class='date' style='margin: 0;'>Dato: %(date)s%(ttime)s</p>
</div>
<div class='text'>
  %(nicehtml)s
</div>
</body>
</html>
'''
        html %= mpp
        html = BeautifulSoup.ICantBelieveItsBeautifulSoup(html)

        # first look for inline images (if any)
        # iimags: mapping from URL to (cid, binary string contents)
        iimgs = {}
        for imgtag in html.findAll('img'):
            url = imgtag['src']
            if url.lower().startswith('data:'):
                # ignore 'inline' images
                continue
            elif not url:
                # ignore empty URLs
                continue
            if url not in iimgs:
                try:
                    data = surllib.skoleGetURL(url, False)
                except urllib2.URLError, e:
                    # could not fetch URL for some reason - ignore
                    continue
                # is this actually an image?
                if not imghdr.what('', data):
                    continue  # ignore
                cid = 'image%d-%f@%s' % (len(iimgs) + 1, time.time(), hostname)
                iimgs[url] = (cid, data)
            cid, _ = iimgs[url]

            imgtag['src'] = 'cid:' + cid