def add_sections(old_bs: BeautifulSoup):
    """
    Groups the top-level tags in the body of _old_bs_ into ``section``s.

    The tags are collected into a running section; a new section is started
    whenever a header tag follows a non-header tag. Non-empty sections are
    appended to the body of a new document created from ``html_template``,
    and :func:`remove_empty_tags` is called on that body before returning.

    :param old_bs: the document whose body children are sectioned. The tags
                   are appended to the new document, not copied.
    :return: the new document that contains the sections.
    """
    new_bs = BeautifulSoup(html_template)
    old_body = old_bs.html.body
    new_body = new_bs.html.body

    section = new_bs.new_tag('section')
    last_added = None
    # Adding a tag to a section disturbs a live iteration over
    # old_body.children (only every second tag ended up in a section), so
    # the loop runs over a snapshot of the children instead.
    for tag in list(old_body.children):
        if not isinstance(tag, Tag):
            # A non-tag node has no tag name to match against headerp;
            # report it and leave it where it is instead of failing.
            print(f'WARNING, not tag: {tag}')
            continue
        # A header right after non-header content closes the current section
        if (headerp.match(tag.name) and last_added
                and not headerp.match(last_added)):
            if section.contents:
                new_body.append(section)
            section = new_bs.new_tag('section')
        section.append(tag)
        last_added = tag.name
    # Flush the section that was still being filled when the loop ended
    if section.contents:
        new_body.append(section)
    remove_empty_tags(new_bs.html.body)
    return new_bs
def parse_div(self, old_div: Tag, new_section: Tag):
    """
    Parses a ``div`` that sits between a section and its lower-level tags
    (``p``, lists, headers) and adds the content to _new_section_.

    Works along the lines of :meth:`parse_section`, except that a section
    (``details``) inside the ``div`` is not parsed: it is only logged.

    :param old_div: the ``div`` tag in the DOM of the original page.
    :param new_section: the section in the simplified DOM that receives
                        the parsed content.
    """
    for child in self.filter_tags(old_div):
        # Text nodes are not expected here: log them and move on
        if isinstance(child, NavigableString):
            logging.warning(f'NavigableString >{child}< in '
                            f'div in {self.title}.')
            continue

        tag_name = child.name
        if tag_name == 'details':
            # Sections are not allowed inside a div
            logging.warning(f'section in div in {self.title}.')
            continue
        if tag_name == 'div':
            # Nested divs are flattened into the same section
            self.parse_div(child, new_section)
            continue

        if tag_name == 'p' or headerp.match(tag_name):
            self.parse_generic(child, new_section)
        elif listp.match(tag_name):
            self.parse_list(child, new_section)
def convert_section(self, section: Tag, out: StringIO):
    """
    Converts the children of _section_, passing _out_ on to the helpers.

    Headers and paragraphs are handed to :meth:`print_sentences` together
    with their text, lists to :meth:`convert_list`, and nested sections are
    converted recursively.

    :param section: the section tag whose children are converted.
    :param out: the :class:`StringIO` passed on to the helper methods.
    :raises ValueError: if a child is neither a header, a paragraph, a list
                        nor a section.
    """
    for child in section.children:
        tag_name = child.name
        if headerp.match(tag_name) or tag_name == 'p':
            self.print_sentences(child, child.get_text(), out)
            continue
        if listp.match(tag_name):
            self.convert_list(child, out)
            continue
        # Anything left must be a nested section
        if tag_name != 'section':
            raise ValueError(f'Unexpected tag {child.name} in section')
        self.convert_section(child, out)
def is_empty(tag: Tag) -> bool:
    """
    Checks whether _tag_ counts as "empty".

    A tag without any contents is empty. A ``section`` is also empty when
    all of its contents are headers.

    :param tag: the tag to check.
    :return: ``True`` if the tag is empty in the above sense.
    """
    contents = tag.contents
    if not contents:
        return True
    # Only sections get the "nothing but headers" treatment
    if tag.name != 'section':
        return False
    return all(headerp.match(child.name) for child in contents)
def parse_section(self, old_section: Tag, new_parent: Tag):
    """
    Parses a section. Only adds the simplified section to the new DOM if
    it is not empty.

    Nested ``details`` tags are parsed recursively as sub-sections,
    paragraphs go to :meth:`parse_generic`, ``div``s to :meth:`parse_div`
    and lists to :meth:`parse_list`. Of a ``summary`` with the
    ``section-heading`` class, only the header children are kept.

    :param old_section: the section tag in the DOM of the original page.
    :param new_parent: the to-be-parent of section tag in simplified DOM.
                       Mostly `<body>` or another `<section>`.
    """
    new_section = self.new_bs.new_tag('section')
    for child in self.filter_tags(old_section):
        if isinstance(child, NavigableString):
            logging.warning(f'NavigableString >{child}< in '
                            f'{old_section.name} in {self.title}.')
        elif child.name == 'details':
            self.parse_section(child, new_section)
        elif child.name == 'p':
            self.parse_generic(child, new_section)
        elif child.name == 'div':
            self.parse_div(child, new_section)
        # A summary without a class attribute must not break the membership
        # test, hence the empty list as default.
        elif (child.name == 'summary'
              and 'section-heading' in child.get('class', [])):
            for gc in child.children:
                # The summary may contain text nodes next to the header;
                # only real tags can be matched against headerp.
                if isinstance(gc, Tag) and headerp.match(gc.name):
                    self.parse_generic(gc, new_section)
        elif listp.match(child.name):
            self.parse_list(child, new_section)

    # Only append non-empty sections (having nothing but headers still
    # counts as empty)
    if any(not (c.name and headerp.match(c.name))
           for c in new_section.children):
        new_parent.append(new_section)
def pre_parse(old_bs: BeautifulSoup,
              keep_poems: bool = False) -> BeautifulSoup:
    """
    Pre-parses the old html: creates a new document from ``html_template``
    and adds the headers, lists, paragraphs and (optionally) poems of the
    old body directly to the new body.

    Text of paragraphs, headers and list items is whitespace-normalized;
    tags that end up without text are not added.

    :param old_bs: the original document.
    :param keep_poems: if ``True``, ``div``s with the ``poem`` class are
                       converted to paragraphs; other ``div``s are never
                       added themselves.
    :return: the new document.
    """
    def add_text(old_tag: Tag, new_tag: Tag, new_parent: Tag):
        # Collapse all whitespace runs; skip tags that have no text at all
        words = old_tag.get_text().split()
        text = ' '.join(words)
        if text:
            new_tag.append(text)
            new_parent.append(new_tag)

    def add_poem_paragraph(line_tags):
        # One paragraph per group of lines, each line followed by a newline
        paragraph = tmp_bs.new_tag('p')
        for line in line_tags:
            paragraph.append(line.get_text().strip() + '\n')
        if paragraph.contents:
            new_body.append(paragraph)

    old_body = old_bs.find('body')
    tmp_bs = BeautifulSoup(html_template)
    new_body = tmp_bs.html.body
    wanted = re.compile('^(?:p|ul|ol|h[1-5]|div)$')
    for tag in old_body.find_all(wanted):
        if tag.name == 'div':
            # Of the divs, only poems are of interest, and only on request
            if 'poem' in tag.get('class', []) and keep_poems:
                stanza_filter = {'class': 'stanza'}
                if tag.find('div', stanza_filter):
                    # One p per stanza
                    for stanza in tag.find_all('div', stanza_filter):
                        add_poem_paragraph(stanza.find_all('span'))
                else:
                    # Unstructured poem with lines as paragraphs
                    add_poem_paragraph(tag.find_all('p'))
            continue

        new_tag = tmp_bs.new_tag(tag.name)
        if tag.name == 'p' or headerp.match(tag.name):
            add_text(tag, new_tag, new_body)
        elif listp.match(tag.name):
            # Only the direct li children are taken over
            for li in tag.children:
                if isinstance(li, Tag) and li.name == 'li':
                    add_text(li, tmp_bs.new_tag('li'), new_tag)
            if new_tag.contents:
                new_body.append(new_tag)
    return tmp_bs
def convert_section(self, section: Tag, out: StringIO):
    """
    Converts a section to text and writes it to _out_.

    For headers and paragraphs, the result of ``self.tokenizer.ssplit`` on
    the text of the tag is printed to _out_. Lists are handed to
    :meth:`convert_list` and nested sections are converted recursively.

    :param section: the section tag.
    :param out: the :class:`StringIO` that collects the output.
    :raises ValueError: if a child is neither a header, a list, a section
                        nor a paragraph.
    """
    def emit_sentences(tag):
        # Shared by headers and paragraphs
        print(self.tokenizer.ssplit(tag.get_text()), file=out)

    for child in section.children:
        tag_name = child.name
        if headerp.match(tag_name):
            emit_sentences(child)
            continue
        if listp.match(tag_name):
            self.convert_list(child, out)
            continue
        if tag_name == 'section':
            self.convert_section(child, out)
            continue
        if tag_name != 'p':
            raise ValueError(f'Unexpected tag {child.name} in section')
        emit_sentences(child)