Exemplo n.º 1
0
    def parse_sections(self):
        """Group the elements of the stream's parse tree into sections.

        Elements are looked up in ``self.stream.parse_tree`` with
        ``max_depth=1``. An ``h2``-``h6`` element opens a new section titled
        with its stringified content; an ``hr`` opens a new untitled section.
        A ``ul``, ``ol`` or ``p`` element is parsed with ``parse_parts`` and
        the result is added to the most recent section (an untitled section
        is created first when none exists yet). ``a`` elements are requested
        from the search but are not processed here.

        Returns:
            The list of sections, in the order they were opened.
        """
        titled_heading_tags = ('h2', 'h3', 'h4', 'h5', 'h6')
        heading_tags = titled_heading_tags + ('hr',)
        part_container_tags = ('ul', 'ol', 'p')
        wanted_tags = heading_tags + part_container_tags + ('a',)

        found = xmlutils.find_elements_by_tag(self.stream.parse_tree,
                                              tag_names=wanted_tags,
                                              max_depth=1)

        result = []
        for element in found:
            tag = element.tag

            if tag in heading_tags:
                # Only real headings carry a title; 'hr' is a bare separator.
                if tag in titled_heading_tags:
                    heading = stringify_events(element)
                else:
                    heading = None
                result.append(Section(title=heading))
                continue

            if tag not in part_container_tags:
                continue

            # Body content seen before any heading goes into an implicit,
            # untitled section.
            if not result:
                result.append(Section())
            result[-1].add_parts(self.parse_parts(element))

        return result
Exemplo n.º 2
0
    def parse_part(self, list_item, level=0):
        """Parse one list item into a part, including its nested parts.

        The first ``a`` element found under ``list_item`` (searched with
        ``skip=['ul', 'ol']``) is passed to ``create_part``. Each ``ul``/``ol``
        element found under the item with ``max_depth=1`` is then parsed with
        ``parse_parts`` one level deeper, and the parts of all of those lists
        together become the children of the new part.

        Args:
            list_item: The element to parse.
            level: Nesting depth of ``list_item``. It is forwarded to
                ``create_part``, incremented for nested lists, and included
                in the debug log messages.

        Returns:
            The part returned by ``create_part``, with ``children`` set, or
            ``None`` when ``list_item`` contains no link element. A falsy
            result from ``create_part`` is returned unchanged.
        """
        self.app.logger.debug(
            '(LEVEL {level}) Parsing part from : {list_item}'.format(
                level=level, list_item=list_item))

        link_element = xmlutils.find_first_element_by_tag(list_item,
                                                          'a',
                                                          skip=['ul', 'ol'])
        if link_element is None:
            return None

        self.app.logger.debug(
            ('(LEVEL {level}) Found chapter link: '
             '{link_element} for: {link_element_text}').format(
                 level=level,
                 link_element=link_element,
                 link_element_text=link_element.text))
        part = self.create_part(link_element, level=level)
        if not part:
            return part

        children = []
        nested_lists = xmlutils.find_elements_by_tag(list_item,
                                                     tag_names=('ul', 'ol'),
                                                     max_depth=1)
        for nested in nested_lists:
            if nested.tag in ('ul', 'ol'):
                # Accumulate rather than overwrite, so an item that holds
                # more than one nested list keeps the parts of every list.
                # Assumes parse_parts returns an iterable of parts.
                children.extend(self.parse_parts(nested, level=level + 1))
        part.children = children
        return part
Exemplo n.º 3
0
def test_find_elements_by_tag_should_only_return_first_element_when_max_depth_is_0(
        list_element_with_embedded_list):
    """With ``max_depth=0`` a ``ul`` search yields only the fixture element."""
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names=['ul'], max_depth=0)

    # Nested lists below the root must not be reported.
    assert len(found) == 1
    assert found == [root]
Exemplo n.º 4
0
def test_find_elements_by_tag_should_match_any_element_when_tag_names_is_wildcard(
        list_element_with_embedded_list):
    """A ``'*'`` search with the default depth yields only the fixture element."""
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names='*')

    # No max_depth is passed, so the function's default depth applies.
    assert len(found) == 1
    assert found == [root]
Exemplo n.º 5
0
def test_find_elements_by_tag_should_only_return_matching_elements_until_max_depth_is_reached(
        list_element_with_embedded_list):
    """With ``max_depth=1`` an ``li`` search yields the three direct children."""
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names=['li'], max_depth=1)

    direct_items = [root[index] for index in range(3)]
    # Compared as sets: the result order is not part of what is checked.
    assert len(found) == len(direct_items)
    assert set(found) == set(direct_items)
Exemplo n.º 6
0
def test_find_elements_by_tag_should_only_return_matching_elements_until_all_children_are_exhausted(
        list_element_with_embedded_list):
    """With a generous ``max_depth`` an ``li`` search also yields nested items."""
    root = list_element_with_embedded_list

    found = find_elements_by_tag(root, tag_names=['li'], max_depth=10)

    direct_items = [root[0], root[1], root[2]]
    nested_items = [root[0][0][0], root[2][1][0]]
    wanted = direct_items + nested_items
    # Compared as sets: the result order is not part of what is checked.
    assert len(found) == len(wanted)
    assert set(found) == set(wanted)