def highlight(html, highlights, show_tags=False):
    """Highlight parts of an HTML document.

    The document is parsed with html5lib, the text covered by each highlight
    is wrapped in ``<span class="highlight">`` elements, and the contents of
    the resulting ``<body>`` are serialized back to a string.

    :param html: HTML source to highlight
    :param highlights: Iterable of (start, end, tags) triples, which are
        computed over UTF-8 bytes and don't count HTML tags. (start, end)
        pairs are also accepted and are treated as having no tags
    :param show_tags: Whether to show the tag names within brackets after each
        highlight
    :returns: The highlighted HTML, without the enclosing ``<body>`` tag
    """
    # Build a list of starting points and ending points
    starts = []
    ends = []
    for hl in highlights:
        starts.append((hl[0], 'start', []))
        if len(hl) == 2:
            ends.append((hl[1], 'end', []))
        else:
            ends.append((hl[1], 'end', hl[2]))
    # This relies on the fact that 'end' < 'start': at a given position,
    # highlights are closed before new ones are opened
    events = iter(sorted(ends + starts))

    soup = BeautifulSoup(html, 'html5lib')

    def new_span(css_class):
        # Create a detached <span> carrying the given class
        return soup.new_tag('span', attrs={'class': css_class})

    def new_taglist(tag_names):
        # Create the ' [tag1, tag2]' label shown after a highlight
        label = new_span('taglist')
        label.string = ' [%s]' % ', '.join(tag_names)
        return label

    # Value taken by the current event once all events have been consumed;
    # only event_pos is ever tested, the other fields are cleared for clarity
    no_event = (None, None, None)

    pos = 0  # Byte offset (UTF-8) of the current text node in the document
    node = soup
    highlighting = 0  # Number of highlights currently open
    event_pos, event_type, tags = next(events, no_event)

    while node is not None:
        if getattr(node, 'contents', None):
            # Move down
            node = node.contents[0]
            continue

        if isinstance(node, NavigableString):
            # Move through text
            nb = len(node.string.encode('utf-8'))
            while event_pos is not None:
                if event_pos == pos and event_type == 'start':
                    # Start highlighting at beginning of text node
                    highlighting += 1
                    event_pos, event_type, tags = next(events, no_event)
                elif pos + nb > event_pos:
                    # Next event falls inside of this text node
                    if event_type == 'start' and highlighting:
                        # Keep highlighting (can't highlight *more*)
                        highlighting += 1
                    elif (
                        event_type == 'end'
                        and not show_tags
                        and highlighting > 1
                    ):
                        # Keep highlighting (no need to put labels)
                        highlighting -= 1
                    else:
                        # 'start' while not highlighting, or
                        # 'end' and (show_tags or highlighting becomes 0)
                        # Split it
                        char_idx = byte_to_str_index(
                            node.string,
                            event_pos - pos,
                        )
                        left = node.string[:char_idx]
                        right = node.string[char_idx:]

                        # Left part
                        newnode = NavigableString(left)
                        if highlighting:
                            # Optionally highlight left part
                            span = new_span('highlight')
                            span.append(newnode)
                            newnode = span
                        node.replace_with(newnode)
                        node = newnode

                        if event_type == 'start':
                            highlighting += 1
                        else:
                            highlighting -= 1
                            if show_tags:
                                # Add tag labels
                                comment = new_taglist(tags)
                                node.insert_after(comment)
                                node = comment

                        # Right part
                        newnode = NavigableString(right)
                        node.insert_after(newnode)
                        node = newnode
                        nb -= event_pos - pos
                        pos = event_pos
                        # Next loop will highlight right part if needed

                    event_pos, event_type, tags = next(events, no_event)
                elif highlighting:  # and pos + nb <= event_pos:
                    # Highlight whole text node
                    newnode = new_span('highlight')
                    node.replace_with(newnode)
                    newnode.append(node)
                    node = newnode
                    if pos + nb == event_pos and event_type == 'end':
                        # The highlight ends exactly at the end of this node
                        if show_tags:
                            comment = new_taglist(tags)
                            newnode.insert_after(comment)
                            node = comment
                        highlighting -= 1
                        event_pos, event_type, tags = next(events, no_event)
                    break
                else:  # not highlighting and pos + nb <= event_pos
                    # Skip whole text node
                    break

            pos += nb

        # Move up until there's a sibling. Compare against None explicitly:
        # an empty NavigableString is falsy, and a truthiness test would
        # wrongly skip it together with every sibling that follows it
        while node.next_sibling is None and node.parent is not None:
            node = node.parent
        if node.parent is None:
            # Back at the root: the whole tree has been visited
            break
        # Move to next node
        node = node.next_sibling

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
while True: needs_span = input("Does this same text need a span tag [Y/N]? ").upper() if needs_span in ("Y", "N"): break if needs_span == "Y": new_spans = [] if needs_splitting == "Y": for word in child.string.split(" "): new_span = soup.new_tag("span") new_span["title"] = get_title(word) new_span.string = word new_spans.append(new_span) for new_span in new_spans[::-1]: new_space = NavigableString(" ") child.insert_after(new_space) # insert a space between spans new_space.insert_after(new_span) child.extract() # it's been replaced, it's not needed anymore except KeyboardInterrupt: print("Breaking loop.") finally: print("\nNew HTML for this file:\n") print(soup.prettify()) # write code while True: needs_writing = input("Should I write this to a file [Y/N]? ").upper() if needs_writing in ("Y", "N"): break if needs_writing == "Y": try: with path.open("w", encoding="utf-8") as f: