def get_soup(self, src, url=None):
    """Parse raw downloaded HTML into a cleaned-up BeautifulSoup object.

    Applies the recipe's configured preprocessing: the
    ``preprocess_regexps`` substitutions, comment stripping, the
    ``prepreprocess_html_ext`` hook, ``keep_only_tags`` extraction and
    the ``remove_tags`` / ``remove_tags_before`` / ``remove_tags_after``
    filters.

    :param src: raw HTML (bytes or unicode)
    :param url: optional URL of the page, forwarded to
        ``self.preprocess_raw_html()``
    :return: result of ``self.preprocess_html_ext()`` on the cleaned soup
    """
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # The hook returned replacement markup: re-run the regexp
        # cleanup pipeline on it and re-parse
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Remove all siblings of tag in the direction given by next
        # ('nextSibling' or 'previousSibling'), walking up the tree
        # until the <body> element is reached.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # Record the following sibling *before* extracting, as
                # extract() clears the sibling pointers of the removed
                # node. (was: getattr(tag, next), which lagged one node
                # behind and re-extracted already-detached nodes)
                ns = getattr(after, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(
            self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(
            self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def get_soup(self, src, url=None):
    """Parse raw downloaded HTML into a cleaned-up BeautifulSoup object.

    Runs the recipe's preprocessing pipeline over ``src``: regexp
    substitutions, HTML comment removal, the ``prepreprocess_html_ext``
    hook, ``keep_only_tags`` extraction and the various remove_tags*
    filters.

    :param src: raw HTML (bytes or unicode)
    :param url: optional page URL, forwarded to ``preprocess_raw_html()``
    :return: result of ``self.preprocess_html_ext()`` on the cleaned soup
    """
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        # Hook returned replacement markup: clean it and re-parse
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Strip every sibling of tag in the given direction
        # ('nextSibling'/'previousSibling'), repeating at each ancestor
        # level up to <body>.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # Must read the follower from `after` before extract()
                # detaches it and resets its sibling links. (Bug fix:
                # previously read from `tag`, which lagged one node and
                # re-extracted detached nodes.)
                ns = getattr(after, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def generate_annotation_html(self, bookmark):
    """Build the annotations block for a single bookmark.

    Returns a BeautifulSoup document whose single child is
    <div class="user_annotations"> ... </div>, containing the last-read
    position followed by the bookmark's notes/highlights.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    when = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    soup = BeautifulSoup()
    div = soup.new_tag('div')
    div['class'] = 'user_annotations'

    # Last-read location line: PDFs report pages, other formats Locations
    if bookmark.book_format == 'pdf':
        markup = _("%(time)s<br />Last page read: %(loc)d (%(pr)d%%)") % dict(
            time=strftime('%x', when.timetuple()),
            loc=bookmark.last_read_location,
            pr=bookmark.percent_read)
    else:
        markup = _("%(time)s<br />Last page read: Location %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', when.timetuple()),
            loc=bookmark.last_read_location,
            pr=bookmark.percent_read)
    div.append(BeautifulSoup('<span style="font-weight:bold">' + markup + '</span>').find('span'))
    div.append(soup.new_tag('br'))

    if bookmark.user_notes:
        notes = bookmark.user_notes
        rendered = []
        # Render annotations sorted by location; highlighted text is
        # italicized, note text is left as-is
        for loc in sorted(notes):
            entry = notes[loc]
            if entry['text']:
                body = (entry['text'] if entry['type'] == 'Note'
                        else '<i>%s</i>' % entry['text'])
                rendered.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type'],
                        text=body))
            elif bookmark.book_format == 'pdf':
                rendered.append(
                    _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type']))
            else:
                rendered.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                        dl=entry['displayed_location'],
                        typ=entry['type']))
        for item in rendered:
            div.append(BeautifulSoup('<span>' + item + '</span>').find('span'))

    soup.insert(0, div)
    return soup
def to_HTML(self, header=''):
    '''
    Generate HTML with user-specified CSS, element order

    Renders self.annotations (sorted by self._annotation_sorter) into
    the ANNOTATIONS_HEADER skeleton, one <div class="annotation"> per
    annotation group, with the Text/Note/Timestamp elements ordered and
    styled according to the 'appearance_css' plugin preference.

    :param header: unused; kept for interface compatibility
    :return: the rendered document as a unicode string
    '''
    # Retrieve CSS prefs
    from calibre_plugins.annotations.appearance import default_elements
    stored_css = plugin_prefs.get('appearance_css', default_elements)

    # Defaults guard against stored prefs missing an element, which
    # previously raised NameError below
    note_style = text_style = ts_style = ''
    elements = []
    for element in stored_css:
        elements.append(element['name'])
        if element['name'] == 'Note':
            note_style = re.sub('\n', '', element['css'])
        elif element['name'] == 'Text':
            text_style = re.sub('\n', '', element['css'])
        elif element['name'] == 'Timestamp':
            ts_style = re.sub('\n', '', element['css'])

    # Additional CSS for timestamp color and bg to be formatted
    datetime_style = ("background-color:{0};color:{1};" + ts_style)

    # Order the elements according to stored preferences
    comments_body = ''
    for element in elements:
        if element == 'Text':
            comments_body += '{text}'
        elif element == 'Note':
            comments_body += '{note}'
        elif element == 'Timestamp':
            ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                            <tr>
                                <td class="location" style="text-align:left">{location}</td>
                                <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                            </tr>
                        </table>'''
            # Collapse inter-tag whitespace so the template is compact
            comments_body += re.sub(r'>\s+<', r'><', ts_css)

    if self.annotations:
        soup = BeautifulSoup(ANNOTATIONS_HEADER)
        dtc = 0
        # Add the annotations
        for i, agroup in enumerate(
                sorted(self.annotations, key=self._annotation_sorter)):
            location = agroup.location
            if location is None:
                location = ''
            friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

            text = ''
            if agroup.text:
                for agt in agroup.text:
                    text += '<p class="highlight" style="{0}">{1}</p>'.format(
                        text_style, agt)

            note = ''
            if agroup.note:
                for agn in agroup.note:
                    note += '<p class="note" style="{0}">{1}</p>'.format(
                        note_style, agn)

            try:
                dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
            except KeyError:
                # Unknown (or unset) highlight color: fall back to Default
                if agroup.highlightcolor is None:
                    msg = "No highlight color specified, using Default"
                else:
                    msg = "Unknown color '%s' specified" % agroup.highlightcolor
                self._log_location(msg)
                dt_bgcolor = COLOR_MAP['Default']['bg']
                dt_fgcolor = COLOR_MAP['Default']['fg']

            if agroup.hash is not None:
                # Use existing hash when re-rendering
                annotation_hash = agroup.hash
            else:
                m = hashlib.md5()
                m.update(text.encode('utf-8'))
                m.update(note.encode('utf-8'))
                annotation_hash = m.hexdigest()

            try:
                ka_soup = BeautifulSoup()
                divTag = ka_soup.new_tag('div')
            except Exception:
                # Older BeautifulSoup without new_tag(): build a Tag directly
                divTag = Tag(BeautifulSoup(), 'div')

            content_args = {
                'color': agroup.highlightcolor,
                'friendly_timestamp': friendly_timestamp,
                'location': location,
                'note': note,
                'text': text,
                'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                'unix_timestamp': agroup.timestamp,
            }
            comments_body_soup = BeautifulSoup(
                comments_body.format(**content_args))
            # Move the rendered children into divTag; append() reparents
            # each node, so contents[0] always points at the next one
            while len(comments_body_soup.body.contents) > 0:
                divTag.append(comments_body_soup.body.contents[0])

            divTag['class'] = "annotation"
            divTag['genre'] = ''
            if agroup.genre:
                divTag['genre'] = escape(agroup.genre)
            divTag['hash'] = annotation_hash
            divTag['location_sort'] = agroup.location_sort
            divTag['reader'] = agroup.reader_app
            divTag['style'] = ANNOTATION_DIV_STYLE
            soup.div.insert(dtc, divTag)
            dtc += 1

            # Optional horizontal rule between annotations
            if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                soup.div.insert(
                    dtc,
                    BeautifulSoup(
                        plugin_prefs.get('HORIZONTAL_RULE',
                                         '<hr width="80%" />')))
                dtc += 1
    else:
        soup = BeautifulSoup(ANNOTATIONS_HEADER)
    return unicode(soup)
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text'             returns as
        <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>'   returns as
        <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text'  returns as
        <p>A line of text</p>
        <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text'  returns as
        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...'  returns as
        <p>...end of a paragraph.</p>
        <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Plain text: escape and wrap paragraphs directly
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        # Potentially dangerous markup: best-effort sanitization
        try:
            return sanitize_comments_html(comments)
        except Exception:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2),
                            lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')

    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration,
                              ProcessingInstruction)):
            continue
        # Bare text and inline-level tags are gathered into an open <p>;
        # anything else closes the current <p> and is emitted as-is
        if isinstance(token, NavigableString) or token.name in [
                'br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr']:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    # Normalize BeautifulSoup's XHTML-style <br></br> serialization,
    # matching the behavior of the other comments_to_html() in this file
    return container.decode_contents().replace('<br></br>', '<br>')
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text'             returns as
        <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>'   returns as
        <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text'  returns as
        <p>A line of text</p>
        <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text'  returns as
        <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...'  returns as
        <p>...end of a paragraph.</p>
        <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        # NOTE(review): assumes bytes input is in preferred_encoding — confirm
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Pure plain text: escape it and wrap paragraphs directly,
        # no soup needed
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        # Markup matched the sanitizer pattern: best-effort clean-up,
        # falling back to an empty paragraph on any failure
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')

    # Rebuild the content in a fresh soup, grouping loose text and
    # inline tags into <p> elements
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    # Snapshot the children first: inserting a token into pTag/container
    # below reparents it, which would disturb live iteration of soup.contents
    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration,
                              ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            # Loose text: accumulate into the currently open <p>
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font',
                            'a', 'hr']:
            # Inline-level tags also go into the open <p>
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Block-level tag: close the open <p> (if any) and emit the
            # tag at container level
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    # Flush a trailing open <p>
    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    # Normalize BeautifulSoup's XHTML-style <br></br> serialization
    return container.decode_contents().replace('<br></br>', '<br>')
def generate_annotation_html(self, bookmark):
    """Render a bookmark's last-read position and user notes as HTML.

    :param bookmark: object exposing last_read_location, timestamp,
        percent_read, book_format and user_notes (a dict keyed by
        location) — presumably a device bookmark; verify against caller
    :return: a BeautifulSoup document whose single child is
        <div class="user_annotations"> ... </div>
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    # Returns <div class="user_annotations"> ... </div>
    last_read_location = bookmark.last_read_location
    timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    percent_read = bookmark.percent_read
    ka_soup = BeautifulSoup()
    dtc = 0  # insertion index into divTag
    divTag = ka_soup.new_tag('div')
    divTag['class'] = 'user_annotations'

    # Add the last-read location; PDFs report pages, other formats
    # report Kindle-style Locations
    if bookmark.book_format == 'pdf':
        markup = _("%(time)s<br />Last page read: %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', timestamp.timetuple()),
            loc=last_read_location,
            pr=percent_read)
    else:
        markup = _("%(time)s<br />Last page read: Location %(loc)d (%(pr)d%%)") % dict(
            time=strftime(u'%x', timestamp.timetuple()),
            loc=last_read_location,
            pr=percent_read)
    # Parse the markup through a throwaway soup so divTag receives a Tag
    spanTag = BeautifulSoup('<span style="font-weight:bold">' + markup + '</span>').find('span')
    divTag.insert(dtc, spanTag)
    dtc += 1
    divTag.insert(dtc, ka_soup.new_tag('br'))
    dtc += 1

    if bookmark.user_notes:
        user_notes = bookmark.user_notes
        annotations = []

        # Add the annotations sorted by location
        # Italicize highlighted text
        for location in sorted(user_notes):
            if user_notes[location]['text']:
                annotations.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                        dl=user_notes[location]['displayed_location'],
                        typ=user_notes[location]['type'],
                        text=(user_notes[location]['text'] if
                              user_notes[location]['type'] == 'Note' else
                              '<i>%s</i>' % user_notes[location]['text'])))
            else:
                # Entry with no text: emit the location/type header only
                if bookmark.book_format == 'pdf':
                    annotations.append(
                        _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))
                else:
                    annotations.append(
                        _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                            dl=user_notes[location]['displayed_location'],
                            typ=user_notes[location]['type']))

        for annotation in annotations:
            annot = BeautifulSoup('<span>' + annotation + '</span>').find('span')
            divTag.insert(dtc, annot)
            dtc += 1

    ka_soup.insert(0,divTag)
    return ka_soup