def get_soup(self, src, url=None):
    """Parse ``src`` into a soup and prune it according to the tag settings.

    The markup is decoded with ``xml_to_unicode``, passed through
    ``self.preprocess_raw_html`` and parsed with a copy of the stock massage
    rules extended by ``self.preprocess_regexps`` plus two rules that blank
    out DOCTYPE declarations and comments.  When
    ``self.prepreprocess_html_ext(soup)`` returns something other than
    ``None``, that value is decoded and parsed in place of the first soup.
    ``keep_only_tags``, ``remove_tags_after``, ``remove_tags_before`` and
    ``remove_tags`` are then applied in that order.

    Note that a dict-valued ``self.keep_only_tags`` is replaced on ``self``
    by a one-element list.

    Returns whatever ``self.preprocess_html_ext(soup)`` returns.
    """
    massage_rules = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    massage_rules.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    massage_rules.append(
        (re.compile(r'<!DOCTYPE .+?>', re.DOTALL | re.IGNORECASE),
         lambda m: ''))
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    massage_rules.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))

    markup = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    markup = self.preprocess_raw_html(markup, url)
    soup = BeautifulSoup(markup, markupMassage=massage_rules)

    replacement = self.prepreprocess_html_ext(soup)
    if replacement is not None:
        decoded = xml_to_unicode(
            replacement, self.verbose, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(decoded, markupMassage=massage_rules)

    if self.keep_only_tags:
        new_body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for kept in soup.find('body').findAll(**spec):
                    new_body.insert(len(new_body.contents), kept)
            soup.find('body').replaceWith(new_body)
        except AttributeError:
            # soup has no body element
            pass

    def remove_beyond(node, sibling_attr):
        # Climb from ``node`` towards <body>; at each level extract every
        # sibling found through ``sibling_attr``.
        while node is not None and getattr(node, 'name', None) != 'body':
            pending = getattr(node, sibling_attr)
            while pending is not None:
                # The neighbour is re-read from ``node`` on every pass.
                upcoming = getattr(node, sibling_attr)
                pending.extract()
                pending = upcoming
            node = node.parent

    def as_spec_list(setting):
        # A single dict spec is treated as a one-element list of specs.
        if isinstance(setting, dict):
            return [setting]
        return setting

    for option, sibling_attr in (('remove_tags_after', 'nextSibling'),
                                 ('remove_tags_before', 'previousSibling')):
        if getattr(self, option) is None:
            continue
        for spec in as_spec_list(getattr(self, option)):
            remove_beyond(soup.find(**spec), sibling_attr)

    for kwds in self.remove_tags:
        for doomed in soup.findAll(**kwds):
            doomed.extract()

    return self.preprocess_html_ext(soup)
def construct(self, book_notes):
    '''
    Given a list of notes, render HTML

    Returns None when book_notes is empty or None. Otherwise returns a soup
    whose outer <div class="book_notes"> holds one <div class="book_note">
    per note, each wrapping a <p class="book_note"> styled with
    self._get_note_style().
    '''
    if not book_notes:
        return None

    soup = BeautifulSoup('''<div class="{0}"></div>'''.format('book_notes'))
    for note in book_notes:
        wrapper = Tag(soup, 'div', [('class', "book_note")])
        paragraph = Tag(
            soup, 'p',
            [('class', "book_note"),
             ('style', "{0}".format(self._get_note_style()))])
        paragraph.append(note)
        wrapper.append(paragraph)
        soup.div.append(wrapper)
    return soup
def preprocess_html(self, soup):
    """Clean up an article soup and turn its title blocks into headings.

    * Anchors whose ``href`` is exactly ``'/'`` are removed.  Anchors
      without an ``href`` attribute are left alone.
    * ``<div>`` elements whose ``style`` attribute equals one of two fixed
      values are removed.
    * ``div.doc_title`` becomes ``<h2 class="headline">``; when a
      ``div.doc_author`` exists its text is prefixed ("author - title") and
      the author div is removed.
    * ``div.doc_subtitle`` becomes ``<h3 class="headline">`` and
      ``div.lapszam`` becomes ``<h5 class="headline">``.

    Returns the same ``soup`` object, modified in place.
    """
    for link in soup.findAll('a'):
        # .get() rather than ['href']: an anchor without href must not
        # raise KeyError.
        if link.get('href') == '/':
            link.extract()

    for style_value in ['float: right;margin-left: 5px; margin-bottom: 5px;',
                        'doc_tags']:
        for item in soup.findAll('div', attrs={'style': style_value}):
            item.extract()

    mytitle = soup.find('div', attrs={'class': 'doc_title'})
    if mytitle:
        mytitstr = self.tag_to_string(mytitle)
        myauthor = soup.find('div', attrs={'class': 'doc_author'})
        if myauthor:
            myautstr = self.tag_to_string(myauthor)
            myauthor.extract()
            myntitle = myautstr + " - " + mytitstr
        else:
            # Use the title *text*; capwords() needs a string, not the Tag.
            myntitle = mytitstr
        tag = Tag(soup, "h2")
        tag['class'] = "headline"
        tag.insert(0, capwords(myntitle))
        mytitle.replaceWith(tag)

    mysubtitle = soup.find('div', attrs={'class': 'doc_subtitle'})
    if mysubtitle:
        mysubtitstr = self.tag_to_string(mysubtitle)
        tag = Tag(soup, "h3")
        tag['class'] = "headline"
        tag.insert(0, capwords(mysubtitstr))
        mysubtitle.replaceWith(tag)

    mylapszam = soup.find('div', attrs={'class': 'lapszam'})
    if mylapszam:
        mylapstr = self.tag_to_string(mylapszam)
        tag = Tag(soup, "h5")
        tag['class'] = "headline"
        tag.insert(0, mylapstr)
        mylapszam.replaceWith(tag)

    return soup
def add_annotation_to_library(self, db, db_id, annotation):
    '''
    Store a Kindle annotation in the library database.

    'kindle_bookmark': the annotation HTML from
    self.generate_annotation_html() is written to the book's comments.
    Existing comments are first cut at any earlier user_annotations div or
    annotations_divider rule, and an <hr class="annotations_divider"> is
    put in front when other comment text remains. The bookmark file is then
    added as a format. If the book already has comments and carries a
    'Catalog' or 'Clippings' tag, nothing is written.

    'kindle_clippings': the clippings file is added as TXT to the existing
    "My Clippings" book, or a new "My Clippings" book by 'Kindle' is
    created; its comments are set to a 'Last modified ...' line.

    Any other annotation type is ignored.
    '''
    from calibre.ebooks.BeautifulSoup import Tag
    from calibre.ebooks.metadata import MetaInformation

    skip_tags = {'Catalog', 'Clippings'}
    kind = annotation.type

    if kind == 'kindle_bookmark':
        mi = db.get_metadata(db_id, index_is_id=True)
        notes_soup = self.generate_annotation_html(annotation.value)
        if not mi.comments:
            mi.comments = unicode(notes_soup.prettify())
        else:
            # Both offsets are taken before either truncation happens.
            annotations_at = mi.comments.find(
                '<div class="user_annotations">')
            divider_at = mi.comments.find(
                '<hr class="annotations_divider" />')
            if annotations_at >= 0:
                mi.comments = mi.comments[:annotations_at]
            if divider_at >= 0:
                mi.comments = mi.comments[:divider_at]

            if set(mi.tags).intersection(skip_tags):
                return

            if mi.comments:
                divider = Tag(notes_soup, 'hr')
                divider['class'] = 'annotations_divider'
                notes_soup.insert(0, divider)
            mi.comments += unicode(notes_soup.prettify())

        # Update library comments
        db.set_comment(db_id, mi.comments)

        # Add bookmark file to db_id
        db.add_format_with_hooks(
            db_id, annotation.value.bookmark_extension,
            annotation.value.path, index_is_id=True)

    elif kind == 'kindle_clippings':
        # Find 'My Clippings' author=Kindle in database, or add
        stamp = strftime(u'%x %X', annotation.value['timestamp'].timetuple())
        last_update = 'Last modified %s' % stamp
        found_ids = list(db.data.search_getting_ids(
            'title:"My Clippings"', '', sort_results=False))
        if not found_ids:
            mi = MetaInformation('My Clippings', authors=['Kindle'])
            mi.tags = ['Clippings']
            mi.comments = last_update
            db.add_books([annotation.value['path']], ['txt'], [mi])
        else:
            clippings_id = found_ids[0]
            db.add_format_with_hooks(
                clippings_id, 'TXT', annotation.value['path'],
                index_is_id=True)
            mi = db.get_metadata(clippings_id, index_is_id=True)
            mi.comments = last_update
            db.set_metadata(clippings_id, mi)
def get_soup(self, src, url=None):
    """Parse ``src`` with ``parse`` and prune it per the tag settings.

    The markup is decoded with ``xml_to_unicode``, passed through
    ``self.preprocess_raw_html`` and then rewritten by every
    ``(pattern, replacement)`` pair in ``self.preprocess_regexps`` followed
    by a rule that blanks out comments.  ``set_soup_module`` is called with
    the module that defines ``BeautifulSoup`` before parsing.  When
    ``self.prepreprocess_html_ext(soup)`` returns something other than
    ``None``, that value is decoded, rewritten by the same rules and parsed
    in place of the first soup.  ``keep_only_tags``, ``remove_tags_after``,
    ``remove_tags_before`` and ``remove_tags`` are then applied in that
    order.

    Note that a dict-valued ``self.keep_only_tags`` is replaced on ``self``
    by a one-element list.

    Returns whatever ``self.preprocess_html_ext(soup)`` returns.
    """
    rules = list(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    rules.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))

    def apply_rules(text):
        # Run every substitution rule over ``text`` in order.
        for pattern, replacement in rules:
            text = pattern.sub(replacement, text)
        return text

    def decode(raw):
        return xml_to_unicode(raw, self.verbose, strip_encoding_pats=True)[0]

    markup = apply_rules(self.preprocess_raw_html(decode(src), url))
    set_soup_module(sys.modules[BeautifulSoup.__module__])
    soup = parse(markup, return_root=False)

    substitute = self.prepreprocess_html_ext(soup)
    if substitute is not None:
        soup = parse(apply_rules(decode(substitute)), return_root=False)

    if self.keep_only_tags:
        fresh_body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for match in soup.find('body').findAll(**spec):
                    fresh_body.insert(len(fresh_body.contents), match)
            soup.find('body').replaceWith(fresh_body)
        except AttributeError:
            # soup has no body element
            pass

    def remove_beyond(start, direction):
        # Walk up from ``start`` until <body>, extracting all siblings in
        # the given direction at each level.
        current = start
        while current is not None and getattr(current, 'name', None) != 'body':
            sibling = getattr(current, direction)
            while sibling is not None:
                # The neighbour is re-read from ``current`` on every pass.
                following = getattr(current, direction)
                sibling.extract()
                sibling = following
            current = current.parent

    if self.remove_tags_after is not None:
        after_specs = self.remove_tags_after
        if isinstance(after_specs, dict):
            after_specs = [after_specs]
        for spec in after_specs:
            remove_beyond(soup.find(**spec), 'nextSibling')

    if self.remove_tags_before is not None:
        before_specs = self.remove_tags_before
        if isinstance(before_specs, dict):
            before_specs = [before_specs]
        for spec in before_specs:
            remove_beyond(soup.find(**spec), 'previousSibling')

    for kwds in self.remove_tags:
        for unwanted in soup.findAll(**kwds):
            unwanted.extract()

    return self.preprocess_html_ext(soup)
def postprocess_html(self, soup, first):
    """Drop navigation divs and neutralise blank paragraphs and line breaks.

    * ``<div>`` elements with class ``almenu``, ``doc_author_docs`` or
      ``doc_print`` are removed.
    * Every ``<p align="left">`` whose text consists only of spaces and
      non-breaking spaces (or is empty) is replaced by an empty
      ``<div class="removable">``.
    * Every ``<br>`` is replaced by an empty ``<div class="removable">``.

    ``first`` is not used.  Returns the same ``soup``, modified in place.
    """
    def removable_div():
        # Empty placeholder that takes the place of a blank element.
        placeholder = Tag(soup, "div")
        placeholder['class'] = "removable"
        placeholder.insert(0, '')
        return placeholder

    for css_class in ['almenu', 'doc_author_docs', 'doc_print']:
        for item in soup.findAll('div', attrs={'class': css_class}):
            item.extract()

    # The blank characters are spelled out with an explicit escape so the
    # non-breaking space is visible in the source.
    blank_text = re.compile(u'^[ \xa0]*$')
    for paragraph in soup.findAll('p', attrs={'align': 'left'}):
        if blank_text.search(self.tag_to_string(paragraph)):
            paragraph.replaceWith(removable_div())

    for line_break in soup.findAll('br'):
        line_break.replaceWith(removable_div())

    return soup
def preview_css(self):
    '''
    Construct a dummy set of notes and annotation for preview purposes
    Modeled after book_status:_get_formatted_annotations()

    The preview is assembled from ANNOTATIONS_HTML_TEMPLATE, styled with
    css/annotations.css from the resources path, and shown in
    self.parent.wv.
    '''
    from calibre_plugins.marvin_manager.annotations import (
        ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations,
        BookNotes, BookmarkNotes)

    # Assemble the preview soup
    soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

    # Load the CSS from MXD resources
    css_path = os.path.join(
        self.parent.opts.resources_path, 'css', 'annotations.css')
    with open(css_path, 'rb') as css_file:
        css = css_file.read().decode('utf-8')
    style_tag = Tag(soup, 'style')
    style_tag.insert(0, css)
    soup.head.style.replaceWith(style_tag)

    body = soup.body

    # Assemble the sample Book notes
    body.append(BookNotes().construct(self.sample_book_notes))
    body.append(Tag(soup, 'div', [('class', "divider")]))

    # Assemble the sample Bookmark notes
    body.append(BookmarkNotes().construct(self.sample_bookmark_notes))
    body.append(Tag(soup, 'div', [('class', "divider")]))

    # Assemble the sample annotations
    pas = Annotations(None, title="Preview")
    for sample in (self.sample_ann_1, self.sample_ann_2, self.sample_ann_3):
        pas.annotations.append(Annotation(sample))
    body.append(pas.to_HTML(pas.create_soup()))

    self.parent.wv.setHtml(unicode(soup.renderContents()))
def inject_css(self, html):
    '''
    stick a <style> element into html
    Deep View content structured differently
    <html style=""><body style="">

    The CSS is read from self.css_pte. When it is empty, html is returned
    untouched. Otherwise the soup from self._remove_old_style(html) gets a
    <style type="text/css"> as the first child of <head>, is stored in
    self.styled_soup, and its rendered contents are returned.
    '''
    css = str(self.css_pte.toPlainText())
    if not css:
        return html

    soup = self._remove_old_style(html)
    style = Tag(soup, 'style')
    style['type'] = "text/css"
    style.insert(0, css)
    soup.find("head").insert(0, style)

    self.styled_soup = soup
    return self.styled_soup.renderContents()
def _inject_css(self, html): ''' stick a <style> element into html ''' css = self.prefs.get('injected_css', None) if css: try: styled_soup = BeautifulSoup(html) head = styled_soup.find("head") style_tag = Tag(styled_soup, 'style') style_tag['type'] = "text/css" style_tag.insert(0, css) head.insert(0, style_tag) html = styled_soup.renderContents() except: return html return (html)
def generate_annotation_html(self, bookmark):
    '''
    Build the annotation markup for a bookmark.

    Returns a soup holding <div class="user_annotations"> ... </div>: a bold
    last-read line, a <br>, then one entry per user note in sorted location
    order. Text of entries whose type is not 'Note' is italicized.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    last_read_location = bookmark.last_read_location
    timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
    percent_read = bookmark.percent_read
    is_pdf = bookmark.book_format == 'pdf'

    ka_soup = BeautifulSoup()
    divTag = Tag(ka_soup, 'div')
    divTag['class'] = 'user_annotations'

    # Add the last-read location
    spanTag = Tag(ka_soup, 'span')
    spanTag['style'] = 'font-weight:bold'
    if is_pdf:
        last_read = _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
    else:
        last_read = _(
            "%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
    spanTag.insert(0, NavigableString(
        last_read % dict(time=strftime(u'%x', timestamp.timetuple()),
                         loc=last_read_location,
                         pr=percent_read)))

    children = [spanTag, Tag(ka_soup, 'br')]

    if bookmark.user_notes:
        user_notes = bookmark.user_notes
        # Add the annotations sorted by location
        # Italicize highlighted text
        for location in sorted(user_notes):
            entry = user_notes[location]
            if entry['text']:
                if entry['type'] == 'Note':
                    shown_text = entry['text']
                else:
                    shown_text = '<i>%s</i>' % entry['text']
                children.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') %
                    dict(dl=entry['displayed_location'],
                         typ=entry['type'],
                         text=shown_text))
            elif is_pdf:
                children.append(
                    _('<b>Page %(dl)d • %(typ)s</b><br />') %
                    dict(dl=entry['displayed_location'],
                         typ=entry['type']))
            else:
                children.append(
                    _('<b>Location %(dl)d • %(typ)s</b><br />') %
                    dict(dl=entry['displayed_location'],
                         typ=entry['type']))

    for position, child in enumerate(children):
        divTag.insert(position, child)

    ka_soup.insert(0, divTag)
    return ka_soup
def to_HTML(self, header=''):
    '''
    Generate HTML with user-specified CSS, element order

    The element order and per-element CSS come from the 'appearance_css'
    plugin preference (default_elements when unset). Each annotation in
    self.annotations, sorted with self._annotation_sorter, becomes a
    <div class="annotation"> inserted into the first div of
    ANNOTATIONS_HEADER. The 'HORIZONTAL_RULE' preference is inserted between
    annotations when 'appearance_hr_checkbox' is set.

    header is accepted but not used.

    Returns unicode(soup.renderContents()); with no annotations this is
    just the rendered ANNOTATIONS_HEADER.
    '''
    # Retrieve CSS prefs
    from calibre_plugins.marvin_manager.appearance import default_elements
    stored_css = plugin_prefs.get('appearance_css', default_elements)

    # Every style starts out empty so that a stored preference list which
    # lacks one of the elements cannot leave a name unbound further down.
    styles = {'Note': '', 'Text': '', 'Timestamp': ''}
    elements = []
    for element in stored_css:
        elements.append(element['name'])
        if element['name'] in styles:
            styles[element['name']] = re.sub('\n', '', element['css'])
    note_style = styles['Note']
    text_style = styles['Text']

    # Additional CSS for timestamp color and bg to be formatted
    datetime_style = ("background-color:{0};color:{1};" + styles['Timestamp'])

    # Whitespace between the tags is stripped by the re.sub() below, so the
    # layout of this literal does not reach the output.
    ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                    <tr>
                        <td class="location" style="text-align:left">{location}</td>
                        <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                    </tr>
                </table>'''
    fragments = {
        'Text': '{text}',
        'Note': '{note}',
        'Timestamp': re.sub(r'>\s+<', r'><', ts_css),
    }

    # Order the elements according to stored preferences
    comments_body = ''.join(fragments.get(name, '') for name in elements)

    soup = BeautifulSoup(ANNOTATIONS_HEADER)
    if self.annotations:
        dtc = 0
        last_index = len(self.annotations) - 1

        # Add the annotations
        for i, agroup in enumerate(
                sorted(self.annotations, key=self._annotation_sorter)):
            location = agroup.location
            if location is None:
                location = ''

            friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

            text = ''
            if agroup.text:
                text = ''.join(
                    '<p class="highlight" style="{0}">{1}</p>'.format(
                        text_style, agt)
                    for agt in agroup.text)

            note = ''
            if agroup.note:
                note = ''.join(
                    '<p class="note" style="{0}">{1}</p>'.format(
                        note_style, agn)
                    for agn in agroup.note)

            try:
                dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
            except (KeyError, TypeError):
                # Missing or unusable colour name: log and use Default.
                if agroup.highlightcolor is None:
                    msg = "No highlight color specified, using Default"
                else:
                    msg = "Unknown color '%s' specified" % agroup.highlightcolor
                self._log_location(msg)
                dt_bgcolor = COLOR_MAP['Default']['bg']
                dt_fgcolor = COLOR_MAP['Default']['fg']

            if agroup.hash is not None:
                # Use existing hash when re-rendering
                annotation_hash = agroup.hash
            else:
                digest = hashlib.md5()
                for chunk in (text, note):
                    # md5 consumes bytes; text is encoded explicitly so
                    # non-ASCII content hashes instead of failing.
                    if not isinstance(chunk, bytes):
                        chunk = chunk.encode('utf-8')
                    digest.update(chunk)
                annotation_hash = digest.hexdigest()

            divTag = Tag(BeautifulSoup(), 'div')
            content_args = {
                'color': agroup.highlightcolor,
                'friendly_timestamp': friendly_timestamp,
                'location': location,
                'note': note,
                'text': text,
                'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                'unix_timestamp': agroup.timestamp,
            }
            divTag.insert(0, comments_body.format(**content_args))
            divTag['class'] = "annotation"
            divTag['genre'] = ''
            if agroup.genre:
                divTag['genre'] = escape(agroup.genre)
            divTag['hash'] = annotation_hash
            divTag['location_sort'] = agroup.location_sort
            divTag['reader'] = agroup.reader_app
            divTag['style'] = ANNOTATION_DIV_STYLE
            soup.div.insert(dtc, divTag)
            dtc += 1

            if i < last_index and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                soup.div.insert(
                    dtc,
                    plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />'))
                dtc += 1

    return unicode(soup.renderContents())
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    Empty or None input returns u'<p></p>'. If sanitizing fails, the
    traceback is printed and u'<p></p>' is returned.
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            # Best effort: report the failure and fall back to an empty
            # paragraph. ``Exception`` rather than a bare ``except`` so
            # KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2),
                            lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    # Snapshot the children: inserting a token elsewhere removes it from
    # soup.contents.
    all_tokens = list(soup.contents)
    for token in all_tokens:
        # Exact type comparison is intentional here; the types skipped in
        # the next branch are tested the same way.
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                             ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font',
                            'a', 'hr']:
            # Inline markup joins the currently open paragraph.
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Any other element closes the open paragraph and is copied to
            # the result as-is.
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
def rebuild_collections(self, booklist, oncard):
    '''
    For each book in the booklist for the card oncard, remove it from all
    its current collections, then add it to the collections specified in
    device_collections.

    oncard is None for the main memory, carda for card A, cardb for card B,
    etc.

    booklist is the object created by the :method:`books` call above.

    This is called after the user edits the 'Collections' field in the Device
    view when Metadata management is set to 'Manual'.

    Books whose device_collections differ from the cached value are added
    to a 'rebuildcollections' command file; when at least one book changed
    the file is staged and the method waits for the command to complete.
    '''
    self._log_location()

    command_name = "rebuild_collections"
    command_element = "rebuildcollections"
    command_soup = BeautifulStoneSoup(self.parent.COMMAND_XML.format(
        command_element, time.mktime(time.localtime())))

    LOCAL_DEBUG = False

    if not booklist:
        return

    changed = 0
    for book in booklist:
        if LOCAL_DEBUG:
            self._log("{0:7} {1}".format(book.in_library, book.title))

        filename = self.parent.path_template.format(book.uuid)
        if filename not in self.parent.cached_books:
            # Fall back to matching the cache on uuid, then title/authors.
            for fn in self.parent.cached_books:
                candidate = self.parent.cached_books[fn]
                if book.uuid and book.uuid == candidate['uuid']:
                    if LOCAL_DEBUG:
                        self._log("'%s' matched on uuid %s" %
                                  (book.title, book.uuid))
                    filename = fn
                    break
                elif (book.title == candidate['title'] and
                      book.authors == candidate['authors']):
                    if LOCAL_DEBUG:
                        self._log("'%s' matched on title/author" % book.title)
                    filename = fn
                    break
            else:
                self._log("ERROR: file %s not found in cached_books" %
                          repr(filename))
                continue

        cached_collections = \
            self.parent.cached_books[filename]['device_collections']
        if cached_collections == book.device_collections:
            continue

        # Append the changed book info to the command file
        book_tag = Tag(command_soup, 'book')
        for attr_name, attr_value in (('filename', filename),
                                      ('title', book.title),
                                      ('author', ', '.join(book.authors)),
                                      ('uuid', book.uuid)):
            book_tag[attr_name] = attr_value

        collections_tag = Tag(command_soup, 'collections')
        for collection_name in book.device_collections:
            c_tag = Tag(command_soup, 'collection')
            c_tag.insert(0, collection_name)
            collections_tag.insert(0, c_tag)
        book_tag.insert(0, collections_tag)

        command_soup.manifest.insert(0, book_tag)

        # Update cache
        self.parent.cached_books[filename]['device_collections'] = \
            book.device_collections

        changed += 1

    if not changed:
        self._log("no collection changes detected cached_books <=> device books")
        return

    # Stage the command file
    self.parent._stage_command_file(
        command_name, command_soup,
        show_command=self.parent.prefs.get('development_mode', False))

    # Wait for completion
    self.parent._wait_for_command_completion(command_name)
def postprocess_html(self, soup, first_fetch):
    """Reduce the page body to its navigation div and its main content.

    For an interview page (one with ``div#transcript``) every ``div.qa`` is
    pulled out, its ``a.question_link`` is turned into ``<strong>``, and
    the divs are re-attached to the cleaned ``presentation_full`` div.  For
    any other page the ``text_info`` div, stripped of nested divs, is
    appended to the cleaned ``div#content``.  Finally everything is removed
    from ``<body>`` except the navigation div (the first div in the body)
    and the content div.

    ``first_fetch`` is not used.  Returns the same ``soup``, modified in
    place.
    """
    author_general = soup.find('span', {'class': 'author_general'})
    # Tolerate pages without the author span or its <em> child.
    if author_general is not None and author_general.em is not None:
        author_general.em.extract()

    # the complete content
    full_div = None

    transcript_div = soup.find('div', {'id': 'transcript'})
    if transcript_div:
        # that's an interview
        # get all <div class="qa" />
        qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
        for qa_div in qa_div_list:
            qa_div.extract()
            # replace all <a class="question_link">...</a> with <strong>...</strong>
            question_link = qa_div.find('a', {'class': 'question_link'})
            question_strong = Tag(soup, 'strong')
            question_strong.append(question_link.string)
            question_link.replaceWith(question_strong)

        # next() instead of .next(): the builtin works on both Python 2
        # and Python 3 iterators.
        full_div = next(find_by_class(
            soup.find('div', {'id': 'content'}), 'div', 'presentation_full'))
        # clean the <h1 />
        full_div.h1.span.extract()
        title_div = full_div.h1.div
        title_div.replaceWith(title_div.string)
        # clear the presentation area
        for div in full_div.findAll('div'):
            div.extract()
        # add qa list back to presentation area
        for qa_div in qa_div_list:
            full_div.append(qa_div)
    else:
        # text only without title
        text_div = next(find_by_class(soup, 'div', 'text_info'))
        text_div.extract()
        for other in text_div.findAll('div'):
            other.extract()
        # full_div contains title
        full_div = soup.find('div', {'id': 'content'})
        for other in full_div.findAll('div'):
            other.extract()
        full_div.append(text_div)

    full_div.extract()

    nav_div = soup.body.div
    nav_div.extract()

    # keep nav_div and full_div in <body /> only
    # Iterate over a snapshot: extract() removes the node from
    # body.contents, and looping over the live list skips every other child.
    for other in list(soup.body.contents):
        other.extract()
    soup.body.append(nav_div)
    soup.body.append(full_div)

    return soup
def new_tag(soup, name, attrs=()):
    """Create a tag called ``name`` that belongs to ``soup``.

    When ``soup`` has a ``new_tag`` attribute that is not ``None`` it is
    called as ``soup.new_tag(name, attrs=dict(attrs))``.  Otherwise the
    ``Tag`` constructor is called directly, receiving ``attrs`` unchanged,
    or ``None`` when ``attrs`` is empty.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.

    Annotations are matched on the 'uts' attribute of their timestamp cell.
    For each timestamp the stored copy (re-rendered with the current CSS)
    is used when the annotation exists only in old_soup or when its stored
    and device hashes are equal; otherwise the copy from new_soup is used.
    The merged annotations are passed through sort_merged_annotations() and
    replace the user_annotations div of new_soup.

    Returns unicode(new_soup).
    '''
    TRANSIENT_DB = 'transient'

    # timestamp -> {'stored_hash': ..., 'device_hash': ...}
    timestamps = {}

    # Get the timestamps and hashes of the stored annotations
    for sua in old_soup.findAll('div', 'annotation'):
        timestamp = sua.find('td', 'timestamp')['uts']
        timestamps[timestamp] = {'stored_hash': sua['hash']}

    # Rerender stored annotations
    ouas = old_soup.find('div', 'user_annotations')
    if ouas:
        ouas.extract()

        # Capture existing annotations
        parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

        # Regurgitate annotations with current CSS
        rerendered_annotations = parent.opts.db.rerender_to_html(
            TRANSIENT_DB, cid)
        regurgitated_soup = BeautifulSoup(rerendered_annotations)

    # Add device annotation timestamps and hashes
    for dua in new_soup.findAll('div', 'annotation'):
        timestamp = dua.find('td', 'timestamp')['uts']
        timestamps.setdefault(timestamp, {})['device_hash'] = dua['hash']

    merged_annotations = Tag(BeautifulSoup(), 'div',
                             [('class', "user_annotations"),
                              ('style', 'margin:0')])

    for ts in sorted(timestamps):
        entry = timestamps[ts]
        has_stored = 'stored_hash' in entry
        has_device = 'device_hash' in entry
        if has_stored and (not has_device or
                           entry['stored_hash'] == entry['device_hash']):
            # Stored only, or stored matches device - add from
            # regurgitated_soup, as user may have modified
            annotation = regurgitated_soup.find(
                'div', {'hash': entry['stored_hash']})
        else:
            # Device only, or device has been updated since initial
            # capture - add from new_soup
            annotation = new_soup.find(
                'div', {'hash': entry['device_hash']})
        merged_annotations.append(annotation)

    merged_annotations = sort_merged_annotations(merged_annotations)

    # Update new_soup with merged_annotations
    new_soup_uas = new_soup.find('div', 'user_annotations')
    new_soup_uas.replaceWith(merged_annotations)

    return unicode(new_soup)