Python Tables примеры использования

Язык программирования: Python

Пространство имен/Пакет: calibre.ebooks.docx.tables

Класс/Тип: Tables

Примеров на hotexamples.com: 11

Python Tables - 11 примеров найдено. Это лучшие примеры Python кода для calibre.ebooks.docx.tables.Tables, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

apply_markup(3)

Tables(2)

add(1)

Пример #1

Показать файл

 def __init__(self,
              path_or_stream,
              dest_dir=None,
              log=None,
              notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.log = self.docx.log
     self.notes_text = notes_text or _('Notes')
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.styles = Styles()
     self.images = Images()
     self.tables = Tables()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ), self.body)
     self.html.text = '\n\t'
     self.html[0].text = '\n\t\t'
     self.html[0].tail = '\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != 'und':
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set('lang', lang)

Пример #2

Показать файл

Файл: to_html.py Проект: bjhemens/calibre

 def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.log = self.docx.log
     self.notes_text = notes_text or _("Notes")
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.styles = Styles()
     self.images = Images()
     self.tables = Tables()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset="utf-8"),
             TITLE(self.mi.title or _("Unknown")),
             LINK(rel="stylesheet", type="text/css", href="docx.css"),
         ),
         self.body,
     )
     self.html.text = "\n\t"
     self.html[0].text = "\n\t\t"
     self.html[0].tail = "\n"
     for child in self.html[0]:
         child.tail = "\n\t\t"
     self.html[0][-1].tail = "\n\t"
     self.html[1].text = self.html[1].tail = "\n"
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != "und":
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set("lang", lang)

Пример #3

Показать файл

Файл: to_html.py Проект: RealEnder/calibre

 def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
     self.docx = DOCX(path_or_stream, log=log)
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.notes_text = notes_text or _('Notes')
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.tables = Tables()
     self.styles = Styles(self.tables)
     self.images = Images()
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ),
         self.body
     )
     self.html.text='\n\t'
     self.html[0].text='\n\t\t'
     self.html[0].tail='\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = canonicalize_lang(self.mi.language)
     if lang and lang != 'und':
         lang = lang_as_iso639_1(lang)
         if lang:
             self.html.set('lang', lang)

Пример #4

Показать файл

 def __init__(self,
              path_or_stream,
              dest_dir=None,
              log=None,
              detect_cover=True,
              notes_text=None,
              notes_nopb=False,
              nosupsub=False):
     self.docx = DOCX(path_or_stream, log=log)
     self.namespace = self.docx.namespace
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.detect_cover = detect_cover
     self.notes_text = notes_text or _('Notes')
     self.notes_nopb = notes_nopb
     self.nosupsub = nosupsub
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.theme = Theme(self.namespace)
     self.settings = Settings(self.namespace)
     self.tables = Tables(self.namespace)
     self.fields = Fields(self.namespace)
     self.styles = Styles(self.namespace, self.tables)
     self.images = Images(self.namespace, self.log)
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ), self.body)
     self.html.text = '\n\t'
     self.html[0].text = '\n\t\t'
     self.html[0].tail = '\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = html_lang(self.mi.language)
     if lang:
         self.html.set('lang', lang)
         self.doc_lang = lang
     else:
         self.doc_lang = None

Пример #5

Показать файл

Файл: to_html.py Проект: miurahr/calibre

 def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
     self.docx = DOCX(path_or_stream, log=log)
     self.namespace = self.docx.namespace
     self.ms_pat = re.compile(r'\s{2,}')
     self.ws_pat = re.compile(r'[\n\r\t]')
     self.log = self.docx.log
     self.detect_cover = detect_cover
     self.notes_text = notes_text or _('Notes')
     self.notes_nopb = notes_nopb
     self.nosupsub = nosupsub
     self.dest_dir = dest_dir or os.getcwdu()
     self.mi = self.docx.metadata
     self.body = BODY()
     self.theme = Theme(self.namespace)
     self.settings = Settings(self.namespace)
     self.tables = Tables(self.namespace)
     self.fields = Fields(self.namespace)
     self.styles = Styles(self.namespace, self.tables)
     self.images = Images(self.namespace, self.log)
     self.object_map = OrderedDict()
     self.html = HTML(
         HEAD(
             META(charset='utf-8'),
             TITLE(self.mi.title or _('Unknown')),
             LINK(rel='stylesheet', type='text/css', href='docx.css'),
         ),
         self.body
     )
     self.html.text='\n\t'
     self.html[0].text='\n\t\t'
     self.html[0].tail='\n'
     for child in self.html[0]:
         child.tail = '\n\t\t'
     self.html[0][-1].tail = '\n\t'
     self.html[1].text = self.html[1].tail = '\n'
     lang = html_lang(self.mi.language)
     if lang:
         self.html.set('lang', lang)
         self.doc_lang = lang
     else:
         self.doc_lang = None

Пример #6

Показать файл

Файл: to_html.py Проект: yesplease/calibre

class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
        self.docx = DOCX(path_or_stream, log=log)
        self.namespace = self.docx.namespace
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.notes_nopb = notes_nopb
        self.nosupsub = nosupsub
        self.dest_dir = dest_dir or getcwd()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme(self.namespace)
        self.settings = Settings(self.namespace)
        self.tables = Tables(self.namespace)
        self.fields = Fields(self.namespace)
        self.styles = Styles(self.namespace, self.tables)
        self.images = Images(self.namespace, self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = html_lang(self.mi.language)
        if lang:
            self.html.set('lang', lang)
            self.doc_lang = lang
        else:
            self.doc_lang = None

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.resolve_alternate_content(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    list(map(parent.remove, tabs))

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(self.namespace, sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(self.namespace, last)
            for x in current:
                self.page_map[x] = pr

    def resolve_alternate_content(self, doc):
        # For proprietary extensions in Word documents use the fallback, spec
        # compliant form
        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
        for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
            choices = self.namespace.XPath('./mc:Choice')(ac)
            fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
            if fallbacks:
                for choice in choices:
                    ac.remove(choice)

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            if name and name.startswith('word/word') and not self.docx.exists(name):
                name = name.partition('/')[2]
            return name

        nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
        sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
        sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
        fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
        tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
        foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
        enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
        numbering = self.numbering = Numbering(self.namespace)
        footnotes = self.footnotes = Footnotes(self.namespace)
        fonts = self.fonts = Fonts(self.namespace)

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
                self.log.warn('Settings %s file missing' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        styles_loaded = False
        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)
                styles_loaded = True
        if not styles_loaded:
            self.styles(None, fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
        if css:
            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)

        def process_guide(E, guide):
            if self.toc_anchor is not None:
                guide.append(E.reference(
                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v:k for k, v in iteritems(self.object_map)}
            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = self.namespace.get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(iteritems(self.anchor_map)):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                old_anchor = current_anchor
                anchor = unicode_type(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            # This paragraph had no <w:r> descendants
            dest.set('id', current_anchor)
            current_anchor = None

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.bidi is True:
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            try:
                p.remove(elem)
            except ValueError:
                # Probably a hyperlink that spans multiple
                # paragraphs,theoretically we should break this up into
                # multiple hyperlinks, but I can't be bothered.
                elem.getparent().remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in iteritems(self.link_map):
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = self.namespace.get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = self.namespace.get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = self.namespace.get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if self.namespace.is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                ctext = child.text
                if space != 'preserve':
                    # Remove leading and trailing whitespace. Word ignores
                    # leading and trailing whitespace without preserve
                    ctext = ctext.strip(' \n\r\t')
                # Only use a <span> with white-space:pre-wrap if this element
                # actually needs it, i.e. if it has more than one
                # consecutive space or it has newlines or tabs.
                multi_spaces = self.ms_pat.search(ctext) is not None
                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
                if preserve:
                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(ctext)
            elif self.namespace.is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:br'):
                typ = self.namespace.get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = A(SUP(name, id='back_%s' % anchor), href='#' + anchor, title=name)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:tab'):
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
                text.buf.append('\u2011')
            elif self.namespace.is_tag(child, 'w:softHyphen'):
                text.buf.append('\u00ad')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            lang = html_lang(style.lang)
            if lang is not None and lang != self.doc_lang:
                ans.set('lang', lang)
        if style.rtl is True:
            ans.set('dir', 'rtl')
        if is_symbol_font(style.font_family):
            for elem in text:
                if elem.text:
                    elem.text = map_symbol_text(elem.text, style.font_family)
                if elem.tail:
                    elem.tail = map_symbol_text(elem.tail, style.font_family)
            style.font_family = 'sans-serif'
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed[-1].append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

        if not self.block_runs:
            return
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for border_style, blocks in self.block_runs:
            paras = tuple(rmap[p] for p in blocks)
            for p in paras:
                if p.tag == 'li':
                    has_li = True
                    break
            else:
                has_li = False
            parent = paras[0].getparent()
            if parent.tag in ('ul', 'ol'):
                ul = parent
                parent = ul.getparent()
                idx = parent.index(ul)
                frame = DIV(ul)
            elif has_li:
                def top_level_tag(x):
                    while True:
                        q = x.getparent()
                        if q is parent or q is None:
                            break
                        x = q
                    return x
                paras = tuple(map(top_level_tag, paras))
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            else:
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = border_style.css
            self.styles.register(css, 'frame')

    def mark_block_runs(self, paras):

        def process_run(run):
            max_left = max_right = 0
            has_visible_border = None
            for p in run:
                style = self.styles.resolve_paragraph(p)
                if has_visible_border is None:
                    has_visible_border = style.has_visible_border()
                if isinstance(style.margin_left, numbers.Number):
                    max_left = max(style.margin_left, max_left)
                if isinstance(style.margin_right, numbers.Number):
                    max_right = max(style.margin_right, max_right)
                if has_visible_border:
                    style.margin_left = style.margin_right = inherit
                if p is not run[0]:
                    style.padding_top = 0
                else:
                    border_style = style.clone_border_styles()
                    if has_visible_border:
                        border_style.margin_top, style.margin_top = style.margin_top, inherit
                if p is not run[-1]:
                    style.padding_bottom = 0
                else:
                    if has_visible_border:
                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
                style.clear_borders()
                if p is not run[-1]:
                    style.apply_between_border()
            if has_visible_border:
                border_style.margin_left, border_style.margin_right = max_left,max_right
                self.block_runs.append((border_style, run))

        run = []
        for p in paras:
            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
                style = self.styles.resolve_paragraph(p)
                last_style = self.styles.resolve_paragraph(run[-1])
                if style.has_identical_borders(last_style):
                    run.append(p)
                    continue
            if len(run) > 1:
                process_run(run)
            run = [p]
        if len(run) > 1:
            process_run(run)

Пример #7

Показать файл

Файл: to_html.py Проект: bjhemens/calibre

class Convert(object):
    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.notes_text = notes_text or _("Notes")
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.images = Images()
        self.tables = Tables()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset="utf-8"),
                TITLE(self.mi.title or _("Unknown")),
                LINK(rel="stylesheet", type="text/css", href="docx.css"),
            ),
            self.body,
        )
        self.html.text = "\n\t"
        self.html[0].text = "\n\t\t"
        self.html[0].tail = "\n"
        for child in self.html[0]:
            child.tail = "\n\t\t"
        self.html[0][-1].tail = "\n\t"
        self.html[1].text = self.html[1].tail = "\n"
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != "und":
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set("lang", lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set("class", "notes")
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set("class", "notes-header")
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT("[", A("←" + text, href="#back_%s" % anchor, title=text), id=anchor))
                dl[-1][0].tail = "]"
                dl.append(DD())
                in_table = False
                for wp in note:
                    if wp.tag.endswith("}tbl"):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, "w:tbl") is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get("calibre_num_id", None)
            if raw is not None:
                lvl, num_id = raw.partition(":")[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = "\n\t"
            for child in self.body:
                child.tail = "\n\t"
            self.body[-1].tail = "\n"

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set("class", cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set("class", cls)

        if notes_header is not None:
            for h in self.body.iterchildren("h1", "h2", "h3"):
                notes_header.tag = h.tag
                cls = h.get("class", None)
                if cls and cls != "notes-header":
                    notes_header.set("class", "%s notes-header" % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        in_table = False

        for p in descendants(doc, "w:p", "w:tbl"):
            if p.tag.endswith("}tbl"):
                in_table = True
                self.tables.register(p)
                continue
            sect = tuple(descendants(p, "w:sectPr"))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)
            if in_table:
                if ancestor(p, "w:tbl") is not None:
                    self.tables.add(p)
                else:
                    in_table = False
        if current:
            last = XPath("./w:body/w:sectPr")(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split("/")
                cname[-1] = defname
                if self.docx.exists("/".join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, "numbering.xml")
        sname = get_name(STYLES, "styles.xml")
        fname = get_name(FONTS, "fontTable.xml")
        foname = get_name(FOOTNOTES, "footnotes.xml")
        enname = get_name(ENDNOTES, "endnotes.xml")
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn("Footnotes %s do not exist" % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn("Endnotes %s do not exist" % enname)
        footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn("Fonts table %s does not exist" % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn("Styles %s do not exist" % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn("Numbering styles %s do not exist" % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        " Create a TOC from headings in the document "
        root = self.body
        headings = ("h1", "h2", "h3")
        tocroot = TOC()
        xpaths = [XPath("//%s" % x) for x in headings]
        level_prev = {i + 1: None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {i + 1: frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        item_level_map = {e: i for i, elems in level_item_map.iteritems() for e in elems}

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get("id", None)
            if not ans:
                self.idcount += 1
                ans = "toc_id_%d" % self.idcount
                elem.set("id", ans)
            return ans

        for item in root.iterdescendants(*headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item("index.html", elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html, encoding="utf-8", doctype="<!DOCTYPE html>")
        with open(os.path.join(self.dest_dir, "index.html"), "wb") as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, "docx.css"), "wb") as f:
                f.write(css.encode("utf-8"))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(["index.html"])
        with open(os.path.join(self.dest_dir, "metadata.opf"), "wb") as of, open(
            os.path.join(self.dest_dir, "toc.ncx"), "wb"
        ) as ncx:
            opf.render(of, ncx, "toc.ncx")
        return os.path.join(self.dest_dir, "metadata.opf")

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, "w:r", "w:bookmarkStart", "w:hyperlink"):
            if x.tag.endswith("}r"):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set("id", current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, "w:hyperlink")
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith("}bookmarkStart"):
                anchor = get(x, "w:name")
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues())
                    )
            elif x.tag.endswith("}hyperlink"):
                current_hyperlink = x

        m = re.match(r"heading\s+(\d+)$", style.style_name or "", re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = "h%d" % n

        if style.direction == "rtl":
            dest.set("dir", "rtl")

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, "text_border")
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set("class", cls)

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = "a"
            tgt = get(hyperlink, "w:tgtFrame")
            if tgt:
                span.set("target", tgt)
            tt = get(hyperlink, "w:tooltip")
            if tt:
                span.set("title", tt)
            rid = get(hyperlink, "r:id")
            if rid and rid in relationships_by_id:
                span.set("href", relationships_by_id[rid])
                continue
            anchor = get(hyperlink, "w:anchor")
            if anchor and anchor in self.anchor_map:
                span.set("href", "#" + self.anchor_map[anchor])
                continue
            self.log.warn("Hyperlink with unknown target (%s, %s), ignoring" % (rid, anchor))
            span.set("href", "#")

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, "text", [])

        for child in run:
            if is_tag(child, "w:t"):
                if not child.text:
                    continue
                space = child.get(XML("space"), None)
                if space == "preserve":
                    text.add_elem(SPAN(child.text, style="whitespace:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, "w:cr"):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, "w:br"):
                typ = child.get("type", None)
                if typ in {"column", "page"}:
                    br = BR(style="page-break-after:always")
                else:
                    clear = child.get("clear", None)
                    if clear in {"all", "left", "right"}:
                        br = BR(style="clear:%s" % ("both" if clear == "all" else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, "w:drawing") or is_tag(child, "w:pict"):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, "w:footnoteReference") or is_tag(child, "w:endnoteReference"):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href="#" + anchor, title=name), id="back_%s" % anchor)
                    l.set("class", "noteref")
                    text.add_elem(l)
                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, "".join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {"superscript", "subscript"}:
            ans.tag = "sub" if style.vert_align == "subscript" else "sup"
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, "frame")

Пример #8

Показать файл

Файл: to_html.py Проект: AtulKumar2/calibre

class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme()
        self.settings = Settings()
        self.tables = Tables()
        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images(self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)

        for p, wp in self.object_map.iteritems():
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        sename = get_name(SETTINGS, 'settings.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        tname = get_name(THEMES, 'theme1.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v:k for k, v in self.object_map.iteritems()}
            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.itervalues())))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.iteritems()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in self.object_map.iteritems()}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(child.text) is not None
                if preserve:
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:tab'):
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif is_tag(child, 'w:noBreakHyphen'):
                text.buf.append(u'\u2011')
            elif is_tag(child, 'w:softHyphen'):
                text.buf.append(u'\u00ad')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed[-1].append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

Пример #9

Показать файл

class Convert(object):
    def __init__(self,
                 path_or_stream,
                 dest_dir=None,
                 log=None,
                 notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.log = self.docx.log
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.styles = Styles()
        self.images = Images()
        self.tables = Tables()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ), self.body)
        self.html.text = '\n\t'
        self.html[0].text = '\n\t\t'
        self.html[0].tail = '\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            p = self.convert_p(wp)
            self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                in_table = False
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp)
                        in_table = True
                        continue
                    if in_table:
                        if ancestor(wp, 'w:tbl') is not None:
                            self.tables.add(wp)
                        else:
                            in_table = False
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in self.body.iterchildren('h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        in_table = False

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                in_table = True
                self.tables.register(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)
            if in_table:
                if ancestor(p, 'w:tbl') is not None:
                    self.tables.add(p)
                else:
                    in_table = False
        if current:
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
        footnotes(
            fromstring(foraw) if foraw else None,
            fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx,
                      self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        ' Create a TOC from headings in the document '
        root = self.body
        headings = ('h1', 'h2', 'h3')
        tocroot = TOC()
        xpaths = [XPath('//%s' % x) for x in headings]
        level_prev = {i + 1: None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {
            i + 1: frozenset(xp(root))
            for i, xp in enumerate(xpaths)
        }
        item_level_map = {
            e: i
            for i, elems in level_item_map.iteritems() for e in elems
        }

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get('id', None)
            if not ans:
                self.idcount += 1
                ans = 'toc_id_%d' % self.idcount
                elem.set('id', ans)
            return ans

        for item in root.iterdescendants(*headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item('index.html', elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html,
                            encoding='utf-8',
                            doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        with open(os.path.join(self.dest_dir, 'metadata.opf'),
                  'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'),
                                    'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set(
                        'id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    hl = ancestor(x, 'w:hyperlink')
                    if hl is not None:
                        self.link_map[hl].append(span)
                    else:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' %
                          (rid, anchor))
            span.set('href', '#')

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                if space == 'preserve':
                    text.add_elem(SPAN(child.text,
                                       style="whitespace:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = child.get('type', None)
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s' %
                                ('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page,
                                               self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(
                    child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name),
                            id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(
                self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

Пример #10

Показать файл

Файл: to_html.py Проект: RealEnder/calibre

class Convert(object):

    def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.tables = Tables()
        self.styles = Styles(self.tables)
        self.images = Images()
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)

        self.read_page_properties(doc)
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)

        notes_header = None
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text), id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    p = self.convert_p(wp)
                    dl[-1].append(p)

        self.resolve_links(relationships_by_id)

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        return self.write()

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                for x in current + [p]:
                    self.page_map[x] = pr
                current = []
            else:
                current.append(p)

        if current:
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):

        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
        footnotes(fromstring(foraw) if foraw else None, fromstring(enraw) if enraw else None)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles)

        self.styles.resolve_numbering(numbering)

    def create_toc(self):
        ' Create a TOC from headings in the document '
        root = self.body
        headings = ('h1', 'h2', 'h3')
        tocroot = TOC()
        xpaths = [XPath('//%s' % x) for x in headings]
        level_prev = {i+1:None for i in xrange(len(xpaths))}
        level_prev[0] = tocroot
        level_item_map = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        item_level_map = {e:i for i, elems in level_item_map.iteritems() for e in elems}

        self.idcount = 0

        def ensure_id(elem):
            ans = elem.get('id', None)
            if not ans:
                self.idcount += 1
                ans = 'toc_id_%d' % self.idcount
                elem.set('id', ans)
            return ans

        for item in descendants(root, *headings):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            toc = parent.add_item('index.html', elem_id, text)
            level_prev[lvl] = toc
            for i in xrange(lvl+1, len(xpaths)+1):
                level_prev[i] = None

        if len(tuple(tocroot.flat())) > 1:
            return tocroot

    def write(self):
        toc = self.create_toc()
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'), 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.itervalues()))
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = '\xa0'
        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self, relationships_by_id):
        for hyperlink, spans in self.link_map.iteritems():
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (%s, %s), ignoring' %
                          (rid, anchor))
            span.set('href', '#')

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(child.text) is not None
                if preserve:
                    text.add_elem(SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name), id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:fldChar') and get(child, 'w:fldCharType') == 'separate':
                text.buf.append('\xa0')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

Пример #11

Показать файл

Файл: to_html.py Проект: kmshi/calibre

class Convert(object):
    def __init__(self,
                 path_or_stream,
                 dest_dir=None,
                 log=None,
                 detect_cover=True,
                 notes_text=None):
        self.docx = DOCX(path_or_stream, log=log)
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.dest_dir = dest_dir or os.getcwdu()
        self.mi = self.docx.metadata
        self.body = BODY()
        self.theme = Theme()
        self.settings = Settings()
        self.tables = Tables()
        self.fields = Fields()
        self.styles = Styles(self.tables)
        self.images = Images(self.log)
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ), self.body)
        self.html.text = '\n\t'
        self.html[0].text = '\n\t\t'
        self.html[0].tail = '\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = canonicalize_lang(self.mi.language)
        if lang and lang != 'und':
            lang = lang_as_iso639_1(lang)
            if lang:
                self.html.set('lang', lang)

    def __call__(self):
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        self.layers = OrderedDict()
        self.framed = [[]]
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        paras = []

        self.log.debug('Converting Word markup to HTML')
        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        for wp, page_properties in self.page_map.iteritems():
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)
        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            dl = DL()
            dl.set('class', 'notes')
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            self.body.append(dl)
            for anchor, text, note in self.footnotes:
                dl.append(
                    DT('[',
                       A('←' + text, href='#back_%s' % anchor, title=text),
                       id=anchor))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)

        for p, wp in self.object_map.iteritems():
            if len(p) > 0 and not p.text and len(
                    p[0]) > 0 and not p[0].text and p[0][0].get('class',
                                                                None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (
                        hasattr(style.text_indent, 'endswith')
                        and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    map(parent.remove, tabs)

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        numbered = []
        for html_obj, obj in self.object_map.iteritems():
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles,
                                    self.object_map, self.images)
        self.apply_frames()

        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in self.object_map.iteritems():
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in self.framed_map.iteritems():
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        if notes_header is not None:
            for h in children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles,
                                          self.dest_dir, self.detect_cover)

        return self.write(doc)

    def read_page_properties(self, doc):
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(descendants(p, 'w:sectPr'))
            if sect:
                pr = PageProperties(sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            self.section_starts.append(current[0])
            last = XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(last)
            for x in current:
                self.page_map[x] = pr

    def read_styles(self, relationships_by_type):
        def get_name(rtype, defname):
            name = relationships_by_type.get(rtype, None)
            if name is None:
                cname = self.docx.document_name.split('/')
                cname[-1] = defname
                if self.docx.exists('/'.join(cname)):
                    name = name
            return name

        nname = get_name(NUMBERING, 'numbering.xml')
        sname = get_name(STYLES, 'styles.xml')
        sename = get_name(SETTINGS, 'settings.xml')
        fname = get_name(FONTS, 'fontTable.xml')
        tname = get_name(THEMES, 'theme1.xml')
        foname = get_name(FOOTNOTES, 'footnotes.xml')
        enname = get_name(ENDNOTES, 'endnotes.xml')
        numbering = self.numbering = Numbering()
        footnotes = self.footnotes = Footnotes()
        fonts = self.fonts = Fonts()

        foraw = enraw = None
        forel, enrel = ({}, {}), ({}, {})
        if sename is not None:
            try:
                seraw = self.docx.read(sename)
            except KeyError:
                self.log.warn('Settings %s do not exist' % sename)
            else:
                self.settings(fromstring(seraw))

        if foname is not None:
            try:
                foraw = self.docx.read(foname)
            except KeyError:
                self.log.warn('Footnotes %s do not exist' % foname)
            else:
                forel = self.docx.get_relationships(foname)
        if enname is not None:
            try:
                enraw = self.docx.read(enname)
            except KeyError:
                self.log.warn('Endnotes %s do not exist' % enname)
            else:
                enrel = self.docx.get_relationships(enname)
        footnotes(
            fromstring(foraw) if foraw else None, forel,
            fromstring(enraw) if enraw else None, enrel)

        if fname is not None:
            embed_relationships = self.docx.get_relationships(fname)[0]
            try:
                raw = self.docx.read(fname)
            except KeyError:
                self.log.warn('Fonts table %s does not exist' % fname)
            else:
                fonts(fromstring(raw), embed_relationships, self.docx,
                      self.dest_dir)

        if tname is not None:
            try:
                raw = self.docx.read(tname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.theme(fromstring(raw))

        if sname is not None:
            try:
                raw = self.docx.read(sname)
            except KeyError:
                self.log.warn('Styles %s do not exist' % sname)
            else:
                self.styles(fromstring(raw), fonts, self.theme)

        if nname is not None:
            try:
                raw = self.docx.read(nname)
            except KeyError:
                self.log.warn('Numbering styles %s do not exist' % nname)
            else:
                numbering(fromstring(raw), self.styles,
                          self.docx.get_relationships(nname)[0])

        self.styles.resolve_numbering(numbering)

    def write(self, doc):
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
                         self.object_map, self.log)
        raw = html.tostring(self.html,
                            encoding='utf-8',
                            doctype='<!DOCTYPE html>')
        with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx)
        if css:
            with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)
        with open(os.path.join(self.dest_dir, 'metadata.opf'),
                  'wb') as of, open(os.path.join(self.dest_dir, 'toc.ncx'),
                                    'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return os.path.join(self.dest_dir, 'metadata.opf')

    def read_block_anchors(self, doc):
        doc_anchors = frozenset(
            XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
        if doc_anchors:
            current_bm = set()
            rmap = {v: k for k, v in self.object_map.iteritems()}
            for p in descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
                if p.tag.endswith('}p'):
                    if current_bm and p in rmap:
                        para = rmap[p]
                        if 'id' not in para.attrib:
                            para.set(
                                'id',
                                generate_anchor(
                                    next(iter(current_bm)),
                                    frozenset(self.anchor_map.itervalues())))
                        for name in current_bm:
                            self.anchor_map[name] = para.get('id')
                        current_bm = set()
                elif p in doc_anchors:
                    anchor = get(p, 'w:name')
                    if anchor:
                        current_bm.add(anchor)

    def convert_p(self, p):
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.add_frame(dest, style.frame)

        current_anchor = None
        current_hyperlink = None
        hl_xpath = XPath('ancestor::w:hyperlink[1]')

        for x in descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink'):
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    (dest if len(dest) == 0 else span).set(
                        'id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = get(x, 'w:name')
                if anchor and anchor not in self.anchor_map:
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(
                        anchor, frozenset(self.anchor_map.itervalues()))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(self.anchor_map.iteritems()):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x

        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
                     re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n

        if style.direction == 'rtl':
            dest.set('dir', 'rtl')

        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0:
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][
                    -1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest

    def wrap_elems(self, elems, wrapper):
        p = elems[0].getparent()
        idx = p.index(elems[0])
        p.insert(idx, wrapper)
        wrapper.tail = elems[-1].tail
        elems[-1].tail = None
        for elem in elems:
            p.remove(elem)
            wrapper.append(elem)
        return wrapper

    def resolve_links(self):
        self.resolved_link_map = {}
        for hyperlink, spans in self.link_map.iteritems():
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                span.set('href', relationships_by_id[rid])
                continue
            anchor = get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn(
                'Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v: k for k, v in self.object_map.iteritems()}
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' %
                              anchor)
            else:
                if url in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        for img, link in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)

    def convert_run(self, run):
        ans = SPAN()
        self.object_map[ans] = run
        text = Text(ans, 'text', [])

        for child in run:
            if is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                if space == 'preserve':
                    # Only use a <span> with white-space:pre-wrap if this element
                    # actually needs it, i.e. if it has more than one
                    # consecutive space or it has newlines or tabs.
                    multi_spaces = self.ms_pat.search(child.text) is not None
                    preserve = multi_spaces or self.ws_pat.search(
                        child.text) is not None
                if preserve:
                    text.add_elem(
                        SPAN(child.text, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(child.text)
            elif is_tag(child, 'w:cr'):
                text.add_elem(BR())
                ans.append(text.elem)
            elif is_tag(child, 'w:br'):
                typ = get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s' %
                                ('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page,
                                               self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif is_tag(child, 'w:footnoteReference') or is_tag(
                    child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    l = SUP(A(name, href='#' + anchor, title=name),
                            id='back_%s' % anchor)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif is_tag(child, 'w:tab'):
                spaces = int(
                    math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
        if text.buf:
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            ans.tag = 'sub' if style.vert_align == 'subscript' else 'sup'
        if style.lang is not inherit:
            ans.lang = style.lang
        return ans

    def add_frame(self, html_obj, style):
        last_run = self.framed[-1]
        if style is inherit:
            if last_run:
                self.framed.append([])
            return

        if last_run:
            if last_run[-1][1] == style:
                last_run.append((html_obj, style))
            else:
                self.framed.append((html_obj, style))
        else:
            last_run.append((html_obj, style))

    def apply_frames(self):
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(
                self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')