Exemplo n.º 1
0
    def test_serialize_calls(self, MockContext):
        """A serializer must be looked up once per element, in input order."""
        context = MockContext.return_value
        # no serializer is available for any element
        context.get_serializer.return_value = None

        serialize_elements(self.doc, [1, 2, 3])

        expected_lookups = [call(element) for element in (1, 2, 3)]
        self.assertEqual(context.get_serializer.call_args_list,
                         expected_lookups)
Exemplo n.º 2
0
    def test_serialize(self, MockContext):
        """Without any serializer the output is just the empty root div."""
        context = MockContext.return_value
        context.get_serializer.return_value = None

        empty_root = six.b("<div/>\n")

        # same result whether or not there are elements to serialize
        with_elements = serialize_elements(self.doc, [1, 2, 3])
        self.assertEqual(with_elements, empty_root)

        without_elements = serialize_elements(self.doc, [])
        self.assertEqual(without_elements, empty_root)
Exemplo n.º 3
0
    def test_serialize_something(self, MockContext):
        """Whatever the serializer adds must end up inside the root div."""
        def _add_paragraph(ctx, document, elem, root):
            # fake serializer: appends an empty paragraph to the root
            return etree.SubElement(root, 'p')

        context = MockContext.return_value
        context.get_serializer.return_value = _add_paragraph

        output = serialize_elements(self.doc, [1])

        self.assertEqual(output, six.b("<div>\n  <p/>\n</div>\n"))
        context.get_serializer.assert_called_with(1)
Exemplo n.º 4
0
    def test_serialize_something(self, MockContext):
        """A serializer that appends a paragraph yields ``<div><p/></div>``."""
        def _paragraph_serializer(ctx, document, elem, root):
            return etree.SubElement(root, 'p')

        mocked_context = MockContext.return_value
        mocked_context.get_serializer.return_value = _paragraph_serializer

        expected_markup = six.b("<div>\n  <p/>\n</div>\n")
        serialized = serialize_elements(self.doc, [1])
        self.assertEqual(serialized, expected_markup)

        # the serializer must have been looked up for the given element
        mocked_context.get_serializer.assert_called_with(1)
Exemplo n.º 5
0
    def _parse_chapter(self, content):
        """
        Clean up the markup of one imported chapter and return it serialized.

        The content is parsed with lxml and then:

          - every ``h1`` whose text is ``Unknown`` gets the translated
            default title,
          - heading levels are renumbered (only the first ``h1`` stays
            an ``h1``),
          - the ``src`` of images listed in ``self.broken_images`` or
            ``self.converted_images`` is pointed to ``static/``,
          - the content of every referenced endnote/footnote is appended to
            the body inside an ``<ol class="endnotes">``.

        :Args:
          - content: HTML markup of the chapter

        :Returns:
          The whole document tree serialized with ``etree.tostring``
          (utf-8 encoded, pretty printed, without XML declaration).
        """
        from lxml import html, etree

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        for h1 in tree.xpath('.//h1'):
            if h1.text == 'Unknown':
                # Translators: Default chapter title when importing DOCX
                # files. In case title does not exist.
                h1.text = _('Title')

        # Collect h1..h5 before retagging anything, so that every heading is
        # classified by its original level.
        headers = [tree.xpath('//h{}'.format(n)) for n in range(1, 6)]

        level = 2

        # only the first h1 is kept as it is, the others are demoted
        if len(headers[0]) > 1:
            for header in headers[0][1:]:
                header.tag = 'h{}'.format(level)
            level += 1

        for levels in headers[1:]:
            for header in levels:
                header.tag = 'h{}'.format(level)

            # Advance only when this level was really used. The flag that
            # used to guard this was never set, so h2..h5 were all collapsed
            # into one single level.
            if levels and level < 6:
                level += 1

        for _img in tree.xpath('.//img'):
            image_name = _img.get('src')

            if image_name is None:
                # nothing to rewrite, and os.path.basename(None) would raise
                continue

            att_name = os.path.splitext(os.path.basename(image_name))[0]

            if image_name in self.broken_images:
                _img.set('src', 'static/{}.jpg'.format(att_name))

            if image_name in self.converted_images:
                _img.set('src', 'static/{}.png'.format(att_name))

        # container for the notes, created when the first note shows up
        endnotes = None
        idx_endnote = 1

        for sup in tree.xpath('.//sup[@class="endnote"]'):
            key = sup.get('data-id', '')
            if key == '':
                continue

            sup.text = '{}'.format(idx_endnote)
            idx_endnote += 1

            endnote_key = None
            footnote_key = None

            # items() is available on both Python 2 and 3 (iteritems() is
            # not). When several entries match, the last one wins.
            for k, v in self.endnotes.items():
                if v == key:
                    endnote_key = k

            for k, v in self.footnotes.items():
                if v == key:
                    footnote_key = k

            note_content = None

            if endnote_key:
                note_element = self.dfile.document.endnotes[endnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False, 'pretty_print': False,
                        'relationship': 'endnotes'
                    })

            # a matching footnote takes precedence over the endnote
            if footnote_key:
                note_element = self.dfile.document.footnotes[footnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False, 'pretty_print': False,
                        'relationship': 'footnotes'
                    })

            if note_content is None:
                continue

            if endnotes is None:
                endnotes = etree.SubElement(
                    tree.find('body'), 'ol', {'class': 'endnotes'})

            # uses the locally imported ``html`` module, so this does not
            # depend on a module level ``lxml`` name
            note_tree = html.fragment_fromstring(
                note_content, create_parent=True,
                parser=html.HTMLParser(
                    encoding='utf-8', remove_blank_text=True,
                    remove_comments=True)
            )
            li = etree.SubElement(
                endnotes, 'li', {'id': 'endnote-{}'.format(key)})

            # iterate over a copy, appending moves the child out of the div
            for child in list(note_tree.find('div')):
                li.append(child)

        return etree.tostring(
            tree, pretty_print=True, encoding='utf-8', xml_declaration=False)
Exemplo n.º 6
0
    def _parse_chapter(self, content):
        """
        Parse the chapter markup, tidy it up and return the serialized body.

        Besides delegating to the ``_fix_images_path``, ``_clean_span_tags``
        and ``_fix_p_styles`` helpers, this replaces the ``Unknown``
        placeholder title and appends the content of the referenced
        footnotes/endnotes to the body as an ``<ol class="endnotes">``.

        :Args:
          - content: HTML markup of the chapter

        :Returns:
          The ``body`` element serialized with ``etree.tostring``
          (utf-8 encoded, without XML declaration).
        """
        # TODO: improve logic

        tree = html.document_fromstring(
            content, parser=html.HTMLParser(encoding='utf-8'))

        for heading in tree.xpath('.//h1'):
            if heading.text != 'Unknown':
                continue
            # Translators: Default chapter title when importing DOCX
            # files. In case title does not exists.
            heading.text = _('Title')

        # NOTE: let's see how to handle this in a better way
        # self._fix_header_levels(tree)

        # adjust the src attribute of images
        self._fix_images_path(tree)

        # get rid of unnecessary tags, like span tags with no reason to be
        self._clean_span_tags(tree)

        # set body and body-first styles to paragraphs
        self._fix_p_styles(tree)

        # <ol> holding the notes, created lazily with the first note
        notes_list = None
        note_number = 1
        supported_relationships = ('footnotes', 'endnotes')

        for marker in tree.xpath('.//sup[@class="endnote"]'):
            key = marker.get('data-id', '')

            # these attributes were set in custom hooks endnotes and footnotes
            relation_id = marker.get('data-relation-id', '')
            relationship = marker.get('data-relationship', '')

            # skip markers with no key or with a relationship we don't handle
            if key == '' or relationship not in supported_relationships:
                continue

            marker.text = '{}'.format(note_number)
            note_number += 1

            # self.dfile.document.{footnotes|endnotes} dict
            notes_source = getattr(self.dfile.document, relationship)
            if relation_id not in notes_source:
                continue

            note_content = serialize.serialize_elements(
                self.dfile.document, notes_source[relation_id], {
                    'embed_styles': False,
                    'pretty_print': False,
                    'relationship': relationship
                })

            if note_content is None:
                continue

            if notes_list is None:
                notes_list = etree.SubElement(
                    tree.find('body'), 'ol', {'class': 'endnotes'})

            fragment_parser = lxml.html.HTMLParser(
                encoding='utf-8',
                remove_blank_text=True,
                remove_comments=True)
            note_tree = lxml.html.fragment_fromstring(
                note_content, create_parent=True, parser=fragment_parser)

            item = etree.SubElement(
                notes_list, 'li', {'id': 'endnote-{}'.format(key)})

            # copy the list first, appending moves the child out of the div
            for child in list(note_tree.find('div')):
                item.append(child)

            # children are normally just one element which inside has more
            # children, so we just drop the tag and keep the content
            for wrapper in list(item):
                wrapper.drop_tag()

        # clean out infoboxes a bit
        # TODO: implement of plugins or something else more organized that separate functions
        docutils.clean_infobox_content(tree)
        docutils.fix_citations(tree)

        body = tree.find('body')
        return etree.tostring(body, encoding='utf-8', xml_declaration=False)
Exemplo n.º 7
0
    def _parse_chapter(self, content):
        """
        Clean up the markup of one imported chapter and return it serialized.

        The content is parsed with lxml and then:

          - every ``h1`` whose text is ``Unknown`` gets the translated
            default title,
          - heading levels are renumbered (only the first ``h1`` stays
            an ``h1``),
          - the ``src`` of images listed in ``self.broken_images`` or
            ``self.converted_images`` is pointed to ``static/``,
          - the content of every referenced endnote/footnote is appended to
            the body inside an ``<ol class="endnotes">``,
          - ``span`` tags with no class, or whose class is contained in the
            class of the parent, are dropped (content is kept),
          - ``docutils.clean_infobox_content`` and ``docutils.fix_citations``
            are applied to the tree.

        :Args:
          - content: HTML markup of the chapter

        :Returns:
          The whole document tree serialized with ``etree.tostring``
          (utf-8 encoded, pretty printed, without XML declaration).
        """
        from lxml import html, etree

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        for h1 in tree.xpath('.//h1'):
            if h1.text == 'Unknown':
                # Translators: Default chapter title when importing DOCX
                # files. In case title does not exist.
                h1.text = _('Title')

        # Collect h1..h5 before retagging anything, so that every heading is
        # classified by its original level.
        headers = [tree.xpath('//h{}'.format(n)) for n in range(1, 6)]

        level = 2

        # only the first h1 is kept as it is, the others are demoted
        if len(headers[0]) > 1:
            for header in headers[0][1:]:
                header.tag = 'h{}'.format(level)
            level += 1

        for levels in headers[1:]:
            for header in levels:
                header.tag = 'h{}'.format(level)

            # Advance only when this level was really used. The flag that
            # used to guard this was never set, so h2..h5 were all collapsed
            # into one single level.
            if levels and level < 6:
                level += 1

        for _img in tree.xpath('.//img'):
            image_name = _img.get('src')

            if image_name is None:
                # nothing to rewrite, and os.path.basename(None) would raise
                continue

            att_name = os.path.splitext(os.path.basename(image_name))[0]

            if image_name in self.broken_images:
                _img.set('src', 'static/{}.jpg'.format(att_name))

            if image_name in self.converted_images:
                _img.set('src', 'static/{}.png'.format(att_name))

        # container for the notes, created when the first note shows up
        endnotes = None
        idx_endnote = 1

        for sup in tree.xpath('.//sup[@class="endnote"]'):
            key = sup.get('data-id', '')
            if key == '':
                continue

            sup.text = '{}'.format(idx_endnote)
            idx_endnote += 1

            endnote_key = None
            footnote_key = None

            # items() is available on both Python 2 and 3 (iteritems() is
            # not). When several entries match, the last one wins.
            for k, v in self.endnotes.items():
                if v == key:
                    endnote_key = k

            for k, v in self.footnotes.items():
                if v == key:
                    footnote_key = k

            note_content = None

            if endnote_key:
                note_element = self.dfile.document.endnotes[endnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'endnotes'
                    })

            # a matching footnote takes precedence over the endnote
            if footnote_key:
                note_element = self.dfile.document.footnotes[footnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'footnotes'
                    })

            if note_content is None:
                continue

            if endnotes is None:
                endnotes = etree.SubElement(tree.find('body'), 'ol',
                                            {'class': 'endnotes'})

            # uses the locally imported ``html`` module, so this does not
            # depend on a module level ``lxml`` name
            note_tree = html.fragment_fromstring(
                note_content,
                create_parent=True,
                parser=html.HTMLParser(encoding='utf-8',
                                       remove_blank_text=True,
                                       remove_comments=True))
            li = etree.SubElement(endnotes, 'li',
                                  {'id': 'endnote-{}'.format(key)})

            # iterate over a copy, appending moves the child out of the div
            for child in list(note_tree.find('div')):
                li.append(child)

            # children are normally just one element which inside has more
            # children, so in this case we just drop_tag and keep content
            for x in list(li):
                x.drop_tag()

        # let's do some clean out on the not necessary tags,
        # like span tags with no reason to be
        for tag in tree.xpath('.//span'):
            class_name = tag.get('class', None)
            parent_class = tag.getparent().get('class', '')

            # NOTE(review): this is a substring test on the parent's class
            # attribute, not a comparison of separate class names
            if not class_name or class_name in parent_class:
                tag.drop_tag()

        # let's cleanout infoboxes a bit
        # TODO: implement of plugins or something else more organized that separate functions
        docutils.clean_infobox_content(tree)
        docutils.fix_citations(tree)

        return etree.tostring(tree,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=False)
Exemplo n.º 8
0
    def _parse_chapter(self, content):
        """
        Clean up the markup of one imported chapter and return it serialized.

        The content is parsed with lxml and then:

          - every ``h1`` whose text is ``Unknown`` gets the translated
            default title,
          - heading levels are renumbered (only the first ``h1`` stays
            an ``h1``),
          - the ``src`` of images listed in ``self.broken_images`` or
            ``self.converted_images`` is pointed to ``static/``,
          - the content of every referenced endnote/footnote is appended to
            the body inside an ``<ol class="endnotes">``.

        :Args:
          - content: HTML markup of the chapter

        :Returns:
          The whole document tree serialized with ``etree.tostring``
          (utf-8 encoded, pretty printed, without XML declaration).
        """
        from lxml import html, etree

        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(content, parser=utf8_parser)

        for h1 in tree.xpath('.//h1'):
            if h1.text == 'Unknown':
                # Translators: Default chapter title when importing DOCX
                # files. In case title does not exist.
                h1.text = _('Title')

        # Collect h1..h5 before retagging anything, so that every heading is
        # classified by its original level.
        headers = [tree.xpath('//h{}'.format(n)) for n in range(1, 6)]

        level = 2

        # only the first h1 is kept as it is, the others are demoted
        if len(headers[0]) > 1:
            for header in headers[0][1:]:
                header.tag = 'h{}'.format(level)
            level += 1

        for levels in headers[1:]:
            for header in levels:
                header.tag = 'h{}'.format(level)

            # Advance only when this level was really used. The flag that
            # used to guard this was never set, so h2..h5 were all collapsed
            # into one single level.
            if levels and level < 6:
                level += 1

        for _img in tree.xpath('.//img'):
            image_name = _img.get('src')

            if image_name is None:
                # nothing to rewrite, and os.path.basename(None) would raise
                continue

            att_name = os.path.splitext(os.path.basename(image_name))[0]

            if image_name in self.broken_images:
                _img.set('src', 'static/{}.jpg'.format(att_name))

            if image_name in self.converted_images:
                _img.set('src', 'static/{}.png'.format(att_name))

        # container for the notes, created when the first note shows up
        endnotes = None
        idx_endnote = 1

        for sup in tree.xpath('.//sup[@class="endnote"]'):
            key = sup.get('data-id', '')
            if key == '':
                continue

            sup.text = '{}'.format(idx_endnote)
            idx_endnote += 1

            endnote_key = None
            footnote_key = None

            # items() is available on both Python 2 and 3 (iteritems() is
            # not). When several entries match, the last one wins.
            for k, v in self.endnotes.items():
                if v == key:
                    endnote_key = k

            for k, v in self.footnotes.items():
                if v == key:
                    footnote_key = k

            note_content = None

            if endnote_key:
                note_element = self.dfile.document.endnotes[endnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'endnotes'
                    })

            # a matching footnote takes precedence over the endnote
            if footnote_key:
                note_element = self.dfile.document.footnotes[footnote_key]
                note_content = serialize.serialize_elements(
                    self.dfile.document, note_element, {
                        'embed_styles': False,
                        'pretty_print': False,
                        'relationship': 'footnotes'
                    })

            if note_content is None:
                continue

            if endnotes is None:
                endnotes = etree.SubElement(tree.find('body'), 'ol',
                                            {'class': 'endnotes'})

            # uses the locally imported ``html`` module, so this does not
            # depend on a module level ``lxml`` name
            note_tree = html.fragment_fromstring(
                note_content,
                create_parent=True,
                parser=html.HTMLParser(encoding='utf-8',
                                       remove_blank_text=True,
                                       remove_comments=True))
            li = etree.SubElement(endnotes, 'li',
                                  {'id': 'endnote-{}'.format(key)})

            # iterate over a copy, appending moves the child out of the div
            for child in list(note_tree.find('div')):
                li.append(child)

        return etree.tostring(tree,
                              pretty_print=True,
                              encoding='utf-8',
                              xml_declaration=False)
Exemplo n.º 9
0
    def test_serialize_calls(self, MockContext):
        """The serializer lookup must happen for 1, 2 and 3, in that order."""
        mocked_context = MockContext.return_value
        mocked_context.get_serializer.return_value = None

        serialize_elements(self.doc, [1, 2, 3])

        recorded_lookups = mocked_context.get_serializer.call_args_list
        self.assertEqual(recorded_lookups, [call(1), call(2), call(3)])
Exemplo n.º 10
0
    def test_serialize(self, MockContext):
        """With no serializer available only the empty root div comes out."""
        mocked_context = MockContext.return_value
        mocked_context.get_serializer.return_value = None

        # a list with elements and an empty list must give the same result
        for elements in ([1, 2, 3], []):
            self.assertEqual(serialize_elements(self.doc, elements),
                             six.b("<div/>\n"))
Exemplo n.º 11
0
    def _handle_endnotes(self, tree):
        """
        Number the endnote markers and append the endnotes content to the body.

        Every ``<sup class="endnote">`` with a ``data-id`` and an ``endnotes``
        relationship gets a sequential number as text. The content of the
        matching entry in ``self.dfile.document.endnotes`` is serialized and
        added as ``<li id="endnote-KEY">`` to an ``<ol class="endnotes">``,
        which is created in the body when the first note is found.

        :Args:
          - tree: parsed document tree, it is modified in place
        """

        # <ol> holding the notes, created lazily with the first note
        notes_list = None
        next_number = 1

        for marker in tree.xpath('.//sup[@class="endnote"]'):
            key = marker.get('data-id', '')

            # these attributes were set in custom hooks endnotes and footnotes
            relation_id = marker.get('data-relation-id', '')
            relationship = marker.get('data-relationship', '')

            # skip markers with no key or with a relationship we don't handle
            if key == '' or relationship != 'endnotes':
                continue

            marker.text = '{}'.format(next_number)
            next_number += 1

            # notes_source = getattr(self.dfile.document, relationship)
            notes_source = self.dfile.document.endnotes
            if relation_id not in notes_source:
                continue

            note_content = serialize.serialize_elements(
                self.dfile.document, notes_source[relation_id], {
                    'embed_styles': False,
                    'pretty_print': False,
                    'relationship': relationship
                })

            if note_content is None:
                # FIXME: should we remove the sup tag?
                continue

            if notes_list is None:
                notes_list = etree.SubElement(
                    tree.find('body'), 'ol', {'class': 'endnotes'})

            fragment_parser = lxml.html.HTMLParser(
                encoding='utf-8',
                remove_blank_text=True,
                remove_comments=True)
            note_tree = lxml.html.fragment_fromstring(
                note_content, create_parent=True, parser=fragment_parser)

            item = etree.SubElement(
                notes_list, 'li', {'id': 'endnote-{}'.format(key)})

            # copy the list first, appending moves the child out of the div
            for child in list(note_tree.find('div')):
                item.append(child)

            # children are normally just one element which inside has more
            # children, so we just drop the tag and keep the content
            for wrapper in list(item):
                wrapper.drop_tag()