def get_soup(self, src, url=None):
    """Parse raw downloaded HTML into a cleaned-up BeautifulSoup object.

    Applies the recipe's configured preprocessing: the
    ``preprocess_regexps`` substitutions, comment stripping, the
    ``prepreprocess_html_ext`` hook, ``keep_only_tags`` extraction and
    the ``remove_tags`` / ``remove_tags_before`` / ``remove_tags_after``
    filters.

    :param src: raw HTML (bytes or unicode)
    :param url: optional URL of the page, forwarded to
        ``self.preprocess_raw_html()``
    :return: result of ``self.preprocess_html_ext()`` on the cleaned soup
    """
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # The hook returned replacement markup: re-run the regexp
        # cleanup pipeline on it and re-parse
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Remove all siblings of tag in the direction given by next
        # ('nextSibling' or 'previousSibling'), walking up the tree
        # until the <body> element is reached.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # Record the following sibling *before* extracting, as
                # extract() clears the sibling pointers of the removed
                # node. (was: getattr(tag, next), which lagged one node
                # behind and re-extracted already-detached nodes)
                ns = getattr(after, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(
            self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(
            self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def get_soup(self, src, url=None):
    """Parse raw downloaded HTML into a cleaned-up BeautifulSoup object.

    Runs the recipe's preprocessing pipeline over ``src``: regexp
    substitutions, HTML comment removal, the ``prepreprocess_html_ext``
    hook, ``keep_only_tags`` extraction and the various remove_tags*
    filters.

    :param src: raw HTML (bytes or unicode)
    :param url: optional page URL, forwarded to ``preprocess_raw_html()``
    :return: result of ``self.preprocess_html_ext()`` on the cleaned soup
    """
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # Hook returned replacement markup: clean it and re-parse
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Strip every sibling of tag in the given direction
        # ('nextSibling'/'previousSibling'), repeating at each ancestor
        # level up to <body>.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # Must read the follower from `after` before extract()
                # detaches it and resets its sibling links. (Bug fix:
                # previously read from `tag`, which lagged one node and
                # re-extracted detached nodes.)
                ns = getattr(after, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def generate_annotation_html(self, bookmark):
    """Build the annotations block for a single bookmark.

    Returns a BeautifulSoup document whose single child is
    <div class="user_annotations"> ... </div>, containing the last-read
    position followed by the bookmark's notes/highlights.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    when = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    soup = BeautifulSoup()
    div = soup.new_tag('div')
    div['class'] = 'user_annotations'

    # Last-read location line: PDFs report pages, other formats Locations
    if bookmark.book_format == 'pdf':
        markup = _("%(time)s<br />Last page read: %(loc)d (%(pr)d%%)") % dict(
            time=strftime('%x', when.timetuple()),
            loc=bookmark.last_read_location,
            pr=bookmark.percent_read)
    else:
        markup = _("%(time)s<br />Last page read: Location %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', when.timetuple()),
            loc=bookmark.last_read_location,
            pr=bookmark.percent_read)
    div.append(BeautifulSoup('<span style="font-weight:bold">' + markup + '</span>').find('span'))
    div.append(soup.new_tag('br'))

    if bookmark.user_notes:
        notes = bookmark.user_notes
        rendered = []
        # Render annotations sorted by location; highlighted text is
        # italicized, note text is left as-is
        for loc in sorted(notes):
            entry = notes[loc]
            if entry['text']:
                body = (entry['text'] if entry['type'] == 'Note'
                        else '<i>%s</i>' % entry['text'])
                rendered.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type'],
                        text=body))
            elif bookmark.book_format == 'pdf':
                rendered.append(
                    _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type']))
            else:
                rendered.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type']))
        for item in rendered:
            div.append(BeautifulSoup('<span>' + item + '</span>').find('span'))

    soup.insert(0, div)
    return soup
def to_HTML(self, header=''):
    '''
    Generate HTML with user-specified CSS, element order

    Renders self.annotations (sorted by self._annotation_sorter) into
    the ANNOTATIONS_HEADER skeleton, one <div class="annotation"> per
    annotation group, with the Text/Note/Timestamp elements ordered and
    styled according to the 'appearance_css' plugin preference.

    :param header: unused; kept for interface compatibility
    :return: the rendered document as a unicode string
    '''
    # Retrieve CSS prefs
    from calibre_plugins.annotations.appearance import default_elements
    stored_css = plugin_prefs.get('appearance_css', default_elements)

    # Defaults guard against stored prefs missing an element, which
    # previously raised NameError below
    note_style = text_style = ts_style = ''
    elements = []
    for element in stored_css:
        elements.append(element['name'])
        if element['name'] == 'Note':
            note_style = re.sub('\n', '', element['css'])
        elif element['name'] == 'Text':
            text_style = re.sub('\n', '', element['css'])
        elif element['name'] == 'Timestamp':
            ts_style = re.sub('\n', '', element['css'])

    # Additional CSS for timestamp color and bg to be formatted
    datetime_style = ("background-color:{0};color:{1};" + ts_style)

    # Order the elements according to stored preferences
    comments_body = ''
    for element in elements:
        if element == 'Text':
            comments_body += '{text}'
        elif element == 'Note':
            comments_body += '{note}'
        elif element == 'Timestamp':
            ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                            <tr>
                                <td class="location" style="text-align:left">{location}</td>
                                <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                            </tr>
                        </table>'''
            # Collapse inter-tag whitespace so the template is compact
            comments_body += re.sub(r'>\s+<', r'><', ts_css)

    if self.annotations:
        soup = BeautifulSoup(ANNOTATIONS_HEADER)
        dtc = 0
        # Add the annotations
        for i, agroup in enumerate(
                sorted(self.annotations, key=self._annotation_sorter)):
            location = agroup.location
            if location is None:
                location = ''
            friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

            text = ''
            if agroup.text:
                for agt in agroup.text:
                    text += '<p class="highlight" style="{0}">{1}</p>'.format(
                        text_style, agt)

            note = ''
            if agroup.note:
                for agn in agroup.note:
                    note += '<p class="note" style="{0}">{1}</p>'.format(
                        note_style, agn)

            try:
                dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
            except KeyError:
                # Unknown (or unset) highlight color: fall back to Default
                if agroup.highlightcolor is None:
                    msg = "No highlight color specified, using Default"
                else:
                    msg = "Unknown color '%s' specified" % agroup.highlightcolor
                self._log_location(msg)
                dt_bgcolor = COLOR_MAP['Default']['bg']
                dt_fgcolor = COLOR_MAP['Default']['fg']

            if agroup.hash is not None:
                # Use existing hash when re-rendering
                annotation_hash = agroup.hash
            else:
                m = hashlib.md5()
                m.update(text.encode('utf-8'))
                m.update(note.encode('utf-8'))
                annotation_hash = m.hexdigest()

            try:
                ka_soup = BeautifulSoup()
                divTag = ka_soup.new_tag('div')
            except Exception:
                # Older BeautifulSoup without new_tag(): build a Tag directly
                divTag = Tag(BeautifulSoup(), 'div')

            content_args = {
                'color': agroup.highlightcolor,
                'friendly_timestamp': friendly_timestamp,
                'location': location,
                'note': note,
                'text': text,
                'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                'unix_timestamp': agroup.timestamp,
            }
            comments_body_soup = BeautifulSoup(
                comments_body.format(**content_args))
            # Move the rendered children into divTag; append() reparents
            # each node, so contents[0] always points at the next one
            while len(comments_body_soup.body.contents) > 0:
                divTag.append(comments_body_soup.body.contents[0])

            divTag['class'] = "annotation"
            divTag['genre'] = ''
            if agroup.genre:
                divTag['genre'] = escape(agroup.genre)
            divTag['hash'] = annotation_hash
            divTag['location_sort'] = agroup.location_sort
            divTag['reader'] = agroup.reader_app
            divTag['style'] = ANNOTATION_DIV_STYLE
            soup.div.insert(dtc, divTag)
            dtc += 1

            # Optional horizontal rule between annotations
            if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                soup.div.insert(
                    dtc,
                    BeautifulSoup(
                        plugin_prefs.get('HORIZONTAL_RULE',
                                         '<hr width="80%" />')))
                dtc += 1
    else:
        soup = BeautifulSoup(ANNOTATIONS_HEADER)
    return unicode(soup)
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text'             returns as
        <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>'   returns as
        <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text'  returns as
        <p>A line of text</p>
        <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text'  returns as
        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...'  returns as
        <p>...end of a paragraph.</p>
        <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Plain text: escape and wrap paragraphs directly
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        # Potentially dangerous markup: best-effort sanitization
        try:
            return sanitize_comments_html(comments)
        except Exception:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2),
                            lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')

    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration,
                              ProcessingInstruction)):
            continue
        # Bare text and inline-level tags are gathered into an open <p>;
        # anything else closes the current <p> and is emitted as-is
        if isinstance(token, NavigableString) or token.name in [
                'br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr']:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    # Normalize BeautifulSoup's XHTML-style <br></br> serialization,
    # matching the behavior of the other comments_to_html() in this file
    return container.decode_contents().replace('<br></br>', '<br>')
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text'             returns as
        <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>'   returns as
        <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text'  returns as
        <p>A line of text</p>
        <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text'  returns as
        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...'  returns as
        <p>...end of a paragraph.</p>
        <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        # NOTE(review): assumes bytes input is in preferred_encoding — confirm
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Pure plain text: escape it and wrap paragraphs directly,
        # no soup needed
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        # Markup matched the sanitizer pattern: best-effort clean-up,
        # falling back to an empty paragraph on any failure
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')

    # Rebuild the content in a fresh soup, grouping loose text and
    # inline tags into <p> elements
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    # Snapshot the children first: inserting a token into pTag/container
    # below reparents it, which would disturb live iteration of soup.contents
    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration,
                              ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            # Loose text: accumulate into the currently open <p>
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font',
                            'a', 'hr']:
            # Inline-level tags also go into the open <p>
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Block-level tag: close the open <p> (if any) and emit the
            # tag at container level
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    # Flush a trailing open <p>
    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    # Normalize BeautifulSoup's XHTML-style <br></br> serialization
    return container.decode_contents().replace('<br></br>', '<br>')
def generate_annotation_html(self, bookmark):
    """Render a bookmark's last-read position and user notes as HTML.

    :param bookmark: object exposing last_read_location, timestamp,
        percent_read, book_format and user_notes (a dict keyed by
        location) — presumably a device bookmark; verify against caller
    :return: a BeautifulSoup document whose single child is
        <div class="user_annotations"> ... </div>
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    # Returns <div class="user_annotations"> ... </div>
    last_read_location = bookmark.last_read_location
    timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    percent_read = bookmark.percent_read
    ka_soup = BeautifulSoup()
    dtc = 0  # insertion index into divTag
    divTag = ka_soup.new_tag('div')
    divTag['class'] = 'user_annotations'

    # Add the last-read location; PDFs report pages, other formats
    # report Kindle-style Locations
    if bookmark.book_format == 'pdf':
        markup = _("%(time)s<br />Last page read: %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', timestamp.timetuple()),
            loc=last_read_location,
            pr=percent_read)
    else:
        markup = _("%(time)s<br />Last page read: Location %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', timestamp.timetuple()),
            loc=last_read_location,
            pr=percent_read)
    # Parse the markup through a throwaway soup so divTag receives a Tag
    spanTag = BeautifulSoup('<span style="font-weight:bold">' + markup + '</span>').find('span')
    divTag.insert(dtc, spanTag)
    dtc += 1
    divTag.insert(dtc, ka_soup.new_tag('br'))
    dtc += 1

    if bookmark.user_notes:
        user_notes = bookmark.user_notes
        annotations = []

        # Add the annotations sorted by location
        # Italicize highlighted text
        for location in sorted(user_notes):
            if user_notes[location]['text']:
                annotations.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                        dl=user_notes[location]['displayed_location'],
                        typ=user_notes[location]['type'],
                        text=(user_notes[location]['text'] if
                              user_notes[location]['type'] == 'Note' else
                              '<i>%s</i>' % user_notes[location]['text'])))
            else:
                # Entry with no text: emit the location/type header only
                if bookmark.book_format == 'pdf':
                    annotations.append(
                        _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))
                else:
                    annotations.append(
                        _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))

        for annotation in annotations:
            annot = BeautifulSoup('<span>' + annotation + '</span>').find('span')
            divTag.insert(dtc, annot)
            dtc += 1

    ka_soup.insert(0,divTag)
    return ka_soup