def parse_sections(self):
    """Group the top-level elements of the parse tree into sections.

    Elements are requested from ``self.stream.parse_tree`` with
    ``max_depth=1`` and handled in the order the helper returns them:

    * ``h2``-``h6`` start a new section titled with the stringified
      heading; ``hr`` starts a new untitled section.
    * ``ul``, ``ol`` and ``p`` are parsed with ``self.parse_parts`` and the
      result is added to the most recent section.  If no section exists
      yet, an untitled one is created first.

    Returns:
        The list of ``Section`` objects, in the order they were opened.
    """
    heading_tags = ('h2', 'h3', 'h4', 'h5', 'h6', 'hr')
    body_tags = ('ul', 'ol', 'p', 'a')
    # NOTE(review): 'a' is requested from the helper but is not handled
    # below, so top-level links are silently ignored - confirm intended.
    part_container_tags = ('ul', 'ol', 'p')

    elements = xmlutils.find_elements_by_tag(
        self.stream.parse_tree,
        tag_names=heading_tags + body_tags,
        max_depth=1)

    result = []
    for element in elements:
        tag = element.tag

        if tag in heading_tags:
            # A horizontal rule separates sections but carries no title.
            heading = None if tag == 'hr' else stringify_events(element)
            result.append(Section(title=heading))
            continue

        if tag not in part_container_tags:
            continue

        # Content that appears before any heading goes into an implicit,
        # untitled section.
        if not result:
            result.append(Section())
        result[-1].add_parts(self.parse_parts(element))

    return result
def parse_part(self, list_item, level=0):
    """Parse a single list item into a part, including nested sub-parts.

    The first ``a`` element found by ``xmlutils.find_first_element_by_tag``
    (called with ``skip=['ul', 'ol']``) is passed to ``self.create_part``.
    When that yields a truthy part, every ``ul``/``ol`` found at
    ``max_depth=1`` under ``list_item`` is parsed with ``self.parse_parts``
    at ``level + 1`` and the results are collected, in order, into
    ``part.children``.

    Args:
        list_item: The element to parse (presumably an ``li`` - confirm
            against the caller).
        level: Nesting depth of this item; used for log messages and passed
            on to ``create_part`` / ``parse_parts``.

    Returns:
        The value returned by ``self.create_part``, or ``None`` when the
        item contains no link element.
    """
    self.app.logger.debug(
        '(LEVEL {level}) Parsing part from : {list_item}'.format(
            level=level, list_item=list_item))

    part = None
    link_element = xmlutils.find_first_element_by_tag(
        list_item, 'a', skip=['ul', 'ol'])
    if link_element is not None:
        self.app.logger.debug(
            ('(LEVEL {level}) Found chapter link: '
             '{link_element} for: {link_element_text}').format(
                level=level, link_element=link_element,
                link_element_text=link_element.text))
        part = self.create_part(link_element, level=level)

    if part:
        children = []
        sublists = xmlutils.find_elements_by_tag(
            list_item, tag_names=('ul', 'ol'), max_depth=1)
        for sublist in sublists:
            if sublist.tag in ('ul', 'ol'):
                # Accumulate rather than reassign: an item may contain
                # several nested lists, and previously only the parts of
                # the last one survived.  A falsy result from parse_parts
                # is tolerated and contributes nothing.
                children.extend(
                    self.parse_parts(sublist, level=level + 1) or ())
        part.children = children

    return part
def test_find_elements_by_tag_should_only_return_first_element_when_max_depth_is_0(
        list_element_with_embedded_list):
    """With ``max_depth=0`` the search must yield only the root element.

    The fixture's root is expected to match the ``ul`` tag name; nothing
    nested below it may be returned.
    """
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names=['ul'], max_depth=0)

    assert len(found) == 1
    assert found == [root]
def test_find_elements_by_tag_should_match_any_element_when_tag_names_is_wildcard(
        list_element_with_embedded_list):
    """The ``'*'`` wildcard must match the element regardless of its tag.

    No ``max_depth`` is passed, so the helper's default applies; the test
    expects exactly the root element to come back.
    """
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names='*')

    assert len(found) == 1
    assert found == [root]
def test_find_elements_by_tag_should_only_return_matching_elements_until_max_depth_is_reached(
        list_element_with_embedded_list):
    """With ``max_depth=1`` only the root's first three children match ``li``.

    Order is not checked: the result is compared by size and as a set.
    """
    root = list_element_with_embedded_list
    wanted = [root[index] for index in range(3)]

    found = find_elements_by_tag(root, tag_names=['li'], max_depth=1)

    assert len(found) == len(wanted)
    assert set(found) == set(wanted)
def test_find_elements_by_tag_should_only_return_matching_elements_until_all_children_are_exhausted(
        list_element_with_embedded_list):
    """A ``max_depth`` of 10 must yield every ``li`` in the fixture tree.

    The expected set is the root's first three children plus the two
    nested items at ``root[0][0][0]`` and ``root[2][1][0]``.  Order is not
    checked: the result is compared by size and as a set.
    """
    root = list_element_with_embedded_list
    top_level_items = [root[0], root[1], root[2]]
    nested_items = [root[0][0][0], root[2][1][0]]
    wanted = top_level_items + nested_items

    found = find_elements_by_tag(root, tag_names=['li'], max_depth=10)

    assert len(found) == len(wanted)
    assert set(found) == set(wanted)