Пример #1
0
def add_sections(old_bs: BeautifulSoup):
    """
    Groups the direct children of the old ``<body>`` into ``<section>`` tags.

    A new section is started whenever a header tag follows a non-header
    tag; consecutive headers therefore stay in the same section. The
    children are *moved* (not copied) into a fresh document built from
    ``html_template``, so ``old_bs`` is modified by this call.

    :param old_bs: the document whose body children are to be sectioned.
    :returns: the new document, after ``remove_empty_tags`` has been run on
              its body.
    """
    new_bs = BeautifulSoup(html_template)
    old_body = old_bs.html.body
    new_body = new_bs.html.body
    section = new_bs.new_tag('section')
    last_added = None
    # Appending a tag to a section detaches it from ``old_body``; doing that
    # while iterating the live children made the loop skip every second
    # tag. Iterating over a snapshot of the children avoids the problem.
    for tag in list(old_body.children):
        if not isinstance(tag, Tag):
            # Only tags are sectioned: report anything else and leave it
            # where it is instead of dereferencing ``tag.name`` on it.
            print(f'WARNING, not tag: {tag}')
            continue
        if headerp.match(tag.name):
            # A header right after body content closes the current section.
            if last_added and not headerp.match(last_added):
                if section.contents:
                    new_body.append(section)
                section = new_bs.new_tag('section')
        section.append(tag)
        last_added = tag.name
    # Flush the section that was still being filled when the input ran out.
    if section.contents:
        new_body.append(section)

    remove_empty_tags(new_bs.html.body)
    return new_bs
Пример #2
0
 def parse_div(self, old_div: Tag, new_section: Tag):
     """
     Handles a ``div`` that sits between a section and its lower-level
     content (``p``, headers, lists).

     The children yielded by :meth:`filter_tags` are dispatched by tag
     name and their simplified form is added to *new_section*. Nested
     ``div``s are processed recursively. Unlike :meth:`parse_section`, a
     ``details`` child is not descended into; it is only logged.

     :param old_div: the ``div`` tag in the DOM of the original page.
     :param new_section: the section in the simplified DOM that receives
                         the content.
     """
     for child in self.filter_tags(old_div):
         # Stray text is reported but otherwise ignored.
         if isinstance(child, NavigableString):
             logging.warning(f'NavigableString >{child}< in '
                             f'div in {self.title}.')
             continue

         tag_name = child.name
         if tag_name == 'details':
             # Sections are not allowed inside a div.
             logging.warning(f'section in div in {self.title}.')
             continue
         if tag_name == 'div':
             self.parse_div(child, new_section)
             continue

         # Paragraphs and headers share the generic handler.
         if tag_name == 'p' or headerp.match(tag_name):
             self.parse_generic(child, new_section)
         elif listp.match(tag_name):
             self.parse_list(child, new_section)
Пример #3
0
 def convert_section(self, section: Tag, out: StringIO):
     """
     Writes the content of *section* to *out*, child by child.

     Headers and paragraphs go through :meth:`print_sentences`, lists
     through :meth:`convert_list`, and nested sections are handled
     recursively.

     :param section: the section tag to convert.
     :param out: the :class:`StringIO` that collects the output.
     :raises ValueError: if a child is none of the above.
     """
     for child in section.children:
         is_text_block = headerp.match(child.name) or child.name == 'p'
         if is_text_block:
             self.print_sentences(child, child.get_text(), out)
             continue
         if listp.match(child.name):
             self.convert_list(child, out)
             continue
         if child.name != 'section':
             raise ValueError(f'Unexpected tag {child.name} in section')
         self.convert_section(child, out)
Пример #4
0
def is_empty(tag: Tag) -> bool:
    """
    Reports whether *tag* counts as "empty".

    A tag is empty when it has no contents at all. A ``section`` is also
    empty when every one of its contents is a header.
    """
    children = tag.contents
    if children:
        if tag.name != 'section':
            return False
        # A section made up of nothing but headers carries no content.
        return all(headerp.match(child.name) for child in children)
    return True
Пример #5
0
    def parse_section(self, old_section: Tag, new_parent: Tag):
        """
        Parses a section. Only adds the simplified section to the new DOM if
        it is not empty.

        The children yielded by :meth:`filter_tags` are dispatched by tag
        name: ``details`` is parsed recursively as a nested section, ``p``
        goes to :meth:`parse_generic`, ``div`` to :meth:`parse_div`, lists to
        :meth:`parse_list`, and the header tags found directly inside a
        ``summary`` of class ``section-heading`` go to :meth:`parse_generic`.

        :param old_section: the section tag in the DOM of the original page.
        :param new_parent: the to-be-parent of section tag in simplified DOM.
                           Mostly `<body>` or another `<section>`.
        """
        new_section = self.new_bs.new_tag('section')
        for child in self.filter_tags(old_section):
            if isinstance(child, NavigableString):
                # Stray text is reported but otherwise ignored.
                logging.warning(f'NavigableString >{child}< in '
                                f'{old_section.name} in {self.title}.')
            elif child.name == 'details':
                self.parse_section(child, new_section)
            elif child.name == 'p':
                self.parse_generic(child, new_section)
            elif child.name == 'div':
                self.parse_div(child, new_section)
            # ``get`` needs a default here: a <summary> without any class
            # attribute would otherwise yield None and break the ``in`` test.
            elif (child.name == 'summary'
                  and 'section-heading' in child.get('class', [])):
                for gc in child.children:
                    # Text nodes inside the summary are not tags and must
                    # not reach the header pattern.
                    if isinstance(gc, Tag) and headerp.match(gc.name):
                        self.parse_generic(gc, new_section)
            elif listp.match(child.name):
                self.parse_list(child, new_section)

        # Only append non-empty sections (a section that holds nothing but
        # headers still counts as empty).
        if any(not (c.name and headerp.match(c.name))
               for c in new_section.children):
            new_parent.append(new_section)
Пример #6
0
def pre_parse(old_bs: BeautifulSoup, keep_poems: bool = False) -> BeautifulSoup:
    """
    Builds a flat, simplified copy of the body of *old_bs*.

    Every ``p``, ``ul``, ``ol``, ``h1``-``h5`` and ``div`` found under the
    old ``body`` is visited. Paragraphs and headers are re-created with
    their whitespace-normalized text; lists keep only their direct ``li``
    children that have text. A ``div`` contributes nothing unless its class
    contains ``poem`` and *keep_poems* is set, in which case each stanza
    (or, without stanzas, the whole poem) becomes one ``p`` with one line
    per row.

    :param old_bs: the parsed original page.
    :param keep_poems: whether poems are carried over to the new document.
    :returns: a new document based on ``html_template`` holding the result.
    """
    def add_text(old_tag: Tag, new_tag: Tag, new_parent: Tag):
        # Collapse all whitespace runs; elements without text are dropped.
        normalized = ' '.join(old_tag.get_text().split())
        if normalized:
            new_tag.append(normalized)
            new_parent.append(new_tag)

    def add_poem_paragraph(lines):
        # One paragraph per group of lines, each line newline-terminated.
        paragraph = tmp_bs.new_tag('p')
        for line in lines:
            paragraph.append(line.get_text().strip() + '\n')
        if paragraph.contents:
            new_body.append(paragraph)

    old_body = old_bs.find('body')
    tmp_bs = BeautifulSoup(html_template)
    new_body = tmp_bs.html.body
    wanted = re.compile('^(?:p|ul|ol|h[1-5]|div)$')
    for tag in old_body.find_all(wanted):
        if tag.name == 'div':
            if keep_poems and 'poem' in tag.get('class', []):
                stanzas = tag.find_all('div', {'class': 'stanza'})
                if stanzas:
                    # One p per stanza
                    for stanza in stanzas:
                        add_poem_paragraph(stanza.find_all('span'))
                else:
                    # Unstructured poem with lines as paragraphs
                    add_poem_paragraph(tag.find_all('p'))
            continue

        new_tag = tmp_bs.new_tag(tag.name)
        if tag.name == 'p' or headerp.match(tag.name):
            add_text(tag, new_tag, new_body)
        elif listp.match(tag.name):
            for li in tag.children:
                if isinstance(li, Tag) and li.name == 'li':
                    add_text(li, tmp_bs.new_tag('li'), new_tag)
            # Lists that ended up without any item are not added.
            if new_tag.contents:
                new_body.append(new_tag)
    return tmp_bs
Пример #7
0
    def convert_section(self, section: Tag, out: StringIO):
        """
        Writes the text of *section* to *out*, child by child.

        Headers and paragraphs are passed through ``self.tokenizer.ssplit``
        and the result is printed to *out*; lists are delegated to
        :meth:`convert_list`; nested sections are converted recursively.

        :param section: the section tag.
        :param out: the :class:`StringIO` that collects the output.
        :raises ValueError: if a child is none of the above.
        """
        def write_sentences(node):
            print(self.tokenizer.ssplit(node.get_text()), file=out)

        for child in section.children:
            kind = child.name
            if headerp.match(kind):
                write_sentences(child)
                continue
            if listp.match(kind):
                self.convert_list(child, out)
                continue
            if kind == 'section':
                self.convert_section(child, out)
                continue
            if kind == 'p':
                write_sentences(child)
                continue
            raise ValueError(f'Unexpected tag {child.name} in section')