Exemplo n.º 1
0
def highlight(html, highlights, show_tags=False):
    """Highlight parts of an HTML document.

    The document is parsed with BeautifulSoup (``html5lib`` parser), its text
    nodes are walked in document order, and the text covered by each
    highlight is wrapped in ``<span class="highlight">``. Text nodes are
    split where a highlight starts or ends in the middle of a node.

    :param html: HTML markup, passed as-is to ``BeautifulSoup``
    :param highlights: Iterable of ``(start, end)`` pairs or
        ``(start, end, tags)`` triples. Offsets are computed over the UTF-8
        bytes of the text and don't count HTML tags. ``tags`` is joined with
        ``', '`` when labels are shown, so it should be an iterable of
        strings.
    :param show_tags: Whether to show the tag names within brackets after each
        highlight (as a ``<span class="taglist">`` element)
    :returns: The contents of the resulting ``<body>`` element (without the
        ``<body>`` tag itself), serialized with ``str()``
    """
    # Build a list of starting points and ending points.
    # Only 'end' events carry the tag list; 'start' events get an empty one.
    starts = []
    ends = []
    for hl in highlights:
        starts.append((hl[0], 'start', []))
        if len(hl) == 2:
            # No tags supplied for this highlight
            ends.append((hl[1], 'end', []))
        else:
            ends.append((hl[1], 'end', hl[2]))
    # This relies on the fact that 'end' < 'start': at equal positions, the
    # tuple sort puts 'end' events before 'start' events
    events = sorted(ends + starts)

    events = iter(events)
    soup = BeautifulSoup(html, 'html5lib')

    # pos: number of UTF-8 text bytes consumed so far
    # highlighting: number of highlights currently open (nesting depth)
    pos = 0
    node = soup
    highlighting = 0
    try:
        event_pos, event_type, tags = next(events)
    except StopIteration:
        # No highlights at all; event_pos of None means "no more events"
        event_pos = event_type = tags = None

    # Depth-first, document-order walk over the tree. The tree is mutated
    # during the walk, so `node` is always re-pointed at the last node that
    # was inserted before moving on.
    while node is not None:
        if getattr(node, 'contents', None):
            # Move down
            node = node.contents[0]
            continue

        # NOTE(review): bs4 Comment/Doctype nodes are NavigableString
        # subclasses too, so they appear to be counted as text here --
        # confirm this matches how the offsets were computed.
        if isinstance(node, NavigableString):
            # Move through text
            # nb: UTF-8 bytes of this text node not yet accounted for in pos
            nb = len(node.string.encode('utf-8'))
            while event_pos is not None:
                if event_pos == pos and event_type == 'start':
                    # Start highlighting at beginning of text node
                    highlighting += 1
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif pos + nb > event_pos:
                    # Next event falls inside of this text node
                    if event_type == 'start' and highlighting:
                        # Keep highlighting (can't highlight *more*)
                        highlighting += 1
                    elif (
                        event_type == 'end'
                        and not show_tags
                        and highlighting > 1
                    ):
                        # Keep highlighting (no need to put labels)
                        highlighting -= 1
                    else:  # 'start' while not highlighting, or 'end' and (show_tags or highlighting becomes 0)
                        # Split it
                        # byte_to_str_index is defined outside this block;
                        # presumably it maps a UTF-8 byte offset to a
                        # character index -- TODO confirm
                        char_idx = byte_to_str_index(
                            node.string,
                            event_pos - pos,
                        )
                        left = node.string[:char_idx]
                        right = node.string[char_idx:]

                        # Left part
                        newnode = NavigableString(left)
                        if highlighting:
                            # Optionally highlight left part
                            span = soup.new_tag(
                                'span',
                                attrs={'class': 'highlight'},
                            )
                            span.append(newnode)
                            newnode = span
                        node.replace_with(newnode)
                        node = newnode

                        if event_type == 'start':
                            highlighting += 1
                        else:
                            highlighting -= 1
                            if show_tags:
                                # Add tag labels
                                comment = soup.new_tag(
                                    'span',
                                    attrs={'class': 'taglist'},
                                )
                                comment.string = ' [%s]' % ', '.join(tags)
                                node.insert_after(comment)
                                node = comment

                        # Right part
                        newnode = NavigableString(right)
                        node.insert_after(newnode)
                        node = newnode
                        # Only the right part remains to be processed: shrink
                        # the byte count and advance pos to the split point
                        nb -= event_pos - pos
                        pos = event_pos
                        # Next loop will highlight right part if needed

                    # The event was consumed by one of the three branches
                    # above; fetch the next one
                    try:
                        event_pos, event_type, tags = next(events)
                    except StopIteration:
                        event_pos = None
                elif highlighting:  # and pos + nb <= event_pos:
                    # Highlight whole text node
                    newnode = soup.new_tag(
                        'span',
                        attrs={'class': 'highlight'},
                    )
                    node.replace_with(newnode)
                    newnode.append(node)
                    node = newnode
                    if pos + nb == event_pos and event_type == 'end':
                        # A highlight ends exactly at the end of this text
                        # node: consume that 'end' event here
                        if show_tags:
                            comment = soup.new_tag(
                                'span',
                                attrs={'class': 'taglist'},
                            )
                            comment.string = ' [%s]' % ', '.join(tags)
                            newnode.insert_after(comment)
                            node = comment
                        highlighting -= 1
                        try:
                            event_pos, event_type, tags = next(events)
                        except StopIteration:
                            event_pos = None
                    # Done with this text node
                    break
                else:  # not highlighting and pos + nb <= event_pos
                    # Skip whole text node
                    break

            # Account for the bytes of this text node (or of what remained
            # of it after splitting)
            pos += nb

        # Move up until there's a sibling
        while not node.next_sibling and node.parent:
            node = node.parent
        if not node.parent:
            # Climbed back to the root: the whole tree has been visited
            break
        # Move to next node
        node = node.next_sibling

    # Remove everything but body
    body = soup.body
    soup.clear()
    soup.append(body)

    # Remove the body tag itself to only have the contents
    soup.body.unwrap()

    # Back to text
    return str(soup)
Exemplo n.º 2
0
                while True:
                    needs_span = input("Does this same text need a span tag [Y/N]? ").upper()
                    if needs_span in ("Y", "N"):
                        break
                if needs_span == "Y":
                    new_spans = []
                    if needs_splitting == "Y":
                        for word in child.string.split(" "):
                            new_span = soup.new_tag("span")
                            new_span["title"] = get_title(word)
                            new_span.string = word
                            new_spans.append(new_span)
                    for new_span in new_spans[::-1]:
                        new_space = NavigableString(" ")
                        child.insert_after(new_space) # insert a space between spans
                        new_space.insert_after(new_span)
                    child.extract() # it's been replaced, it's not needed anymore
except KeyboardInterrupt:
    print("Breaking loop.")
finally:
    print("\nNew HTML for this file:\n")
    print(soup.prettify())

    # write code 
    while True:
        needs_writing = input("Should I write this to a file [Y/N]? ").upper()
        if needs_writing in ("Y", "N"):
            break
    if needs_writing == "Y":
        try:
            with path.open("w", encoding="utf-8") as f: